Web crawling AJAX pagination in Python using Scrapy

I am scraping a website that has AJAX pagination, using Python and Scrapy. I am able to crawl the first page.

But once the second page is loaded by an AJAX call, I cannot get the links to the remaining pages.

Please guide me on how to get the links of the AJAX-loaded pages. I am using the BeautifulSoup library for the actual web scraping.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class SitenameSpider(CrawlSpider):
    name = 'sitename'  # every Scrapy spider needs a unique name
    start_urls = []
    rules = (
        Rule(SgmlLinkExtractor(allow=('/trends/', '/keynote/')), callback='parse_item'),
    )

    def parse_item(self, response):
        print('Hi, crawling this page! %s' % response.url)

        # site_product_crawl is the BeautifulSoup helper module shown below;
        # cat_code and date_created are defined elsewhere in the project.
        extract_tuple_list = site_product_crawl.parse_product_page('site url')
        items = []

        for extract_tuple in extract_tuple_list:
            item = SitenameItem()
            item['site_id'] = extract_tuple[0]
            item['name'] = extract_tuple[1]
            item['price'] = extract_tuple[2]
            item['rating'] = extract_tuple[3]
            item['num_reviews'] = extract_tuple[4]
            item['category'] = cat_code
            item['url'] = response.url
            item['date'] = date_created
            item['description'] = extract_tuple[6]
            items.append(item)
        return items
from bs4 import BeautifulSoup as bsoup
import requests
import pprint

def return_html(url):
    # Fetch the page and return its HTML, or None if the request fails.
    try:
        return requests.get(url).text
    except Exception as e:
        print(e)
        return None

def parse_product_page(prod_url):
    #print(prod_url)
    soup = bsoup(return_html(prod_url), 'html.parser')
    tuple_list = []
    avg_rating = None
    num_reviews = None
    prod_category = None
    prod_name = None
    prod_price = None
    prod_number = None

    prod_price = '0'  # the price is not available on the site, so it is set to 0
    #num_rev_div = soup.find('a', {'class' : 'bv-rating-label bv-text-link bv-focusable', 'href' : 'javascript:void(0)'})

    # Derive a product number from the path segments of the product URL.
    url_split_prod_number = prod_url.split('://')
    prod_number = url_split_prod_number[1].split('/')[1] + '_' + url_split_prod_number[1].split('/')[2].strip()
    print(prod_number)

    prod_description = soup.find('div', {'class': 'articleText'}).get_text().strip().replace('\n', '')
    print(prod_description)

    prod_name_div = soup.find('div', id='titleSection')
    prod_name = prod_name_div.h2.get_text().strip()
    print(prod_name)

    num_reviews = soup.find('span', itemprop='votes').get_text().strip().replace(',', '')
    avg_rating = soup.find('span', {'class': 'featuredstatbox'}).find('span', itemprop='rating').get_text().strip()

    #print(price_text)
    #if price_text != None:
    #    prod_price = price_text.get_text().strip().replace('$', '').replace(',', '').split('-')[0].strip()
    #    print(prod_price)

    # Escape single quotes so the description can be stored safely later.
    product_tuple = (prod_number, prod_name, prod_price, avg_rating, num_reviews,
                     prod_category, prod_description.replace("'", "''"))
    tuple_list.append(product_tuple)
    pprint.pprint(tuple_list)
    return tuple_list
def main():
    parse_product_page('sitename')

if __name__ == '__main__':
    main()

1 Answer

Even a page loaded via AJAX has to send its request to some URL. If you are using Chrome, look for that request in the Network tab of the Chrome Developer Tools.
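A minimal sketch of that approach, assuming a hypothetical endpoint http://example.com/products/ajax with a page query parameter and an a.product link selector (substitute whatever URL, parameters, and markup the Network tab actually shows for your site): once you know the URL the page hits for each batch of results, the spider can request it directly instead of extracting links from the rendered HTML.

import scrapy

class AjaxPaginationSpider(scrapy.Spider):
    # Sketch only: the endpoint URL, the 'page' parameter, and the
    # 'a.product' selector are stand-ins for whatever the Network tab
    # shows on the actual site.
    name = 'ajax_pagination'
    start_urls = ['http://example.com/products/ajax?page=1']

    def parse(self, response):
        # Many AJAX endpoints return an HTML fragment; here we assume one
        # that contains the product links for the current page.
        links = response.css('a.product::attr(href)').extract()
        for href in links:
            yield scrapy.Request(response.urljoin(href), callback=self.parse_product)

        # Keep incrementing the page parameter until a page comes back empty.
        if links:
            current = int(response.url.split('page=')[-1])
            next_url = 'http://example.com/products/ajax?page=%d' % (current + 1)
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_product(self, response):
        # Placeholder: extract fields here, or hand off to parse_product_page.
        yield {'url': response.url}

If the endpoint returns JSON instead of an HTML fragment, parse response.text with json.loads() and build the product requests and the next-page request from the fields in the payload.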
