I was trying to write the follwing scrapy script to scrape items from the follwing web site.
I was able to scrape first page items but there are more about 2000 page that i want scrape all.
There is a option "load more result" , I also try to scrape load more result's pages, but unable to do that.
please help me.
from scrapy.shell import open_in_browser
import scrapy
from scrapy import Selector
import math
import json
class MyItems(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
link = scrapy.Field()
class ProductSpider(scrapy.Spider):
name= 'reuters'
allowed_domains = ['reuters.com']
start_urls = ['https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.']
download_delay = 1.5
def parse(self,response):
for url in response.css('h3.search-result-title a ::attr(href)').extract():
url=response.urljoin(url)
yield scrapy.Request(url, callback=self.parse_article)
#"load more result"
job_count = 1970
job_per_page = 10
pages = math.ceil(job_count/job_per_page)
for page in range(2,pages):
headers = {
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,bn;q=0.8,af;q=0.7',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
'accept': '*/*',
'referer': 'https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.',
'authority': 'www.reuters.com',
'cookie': '_ga=GA1.2.592162541.1518081459; _gid=GA1.2.1478931362.1518081459; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22e58b8e9e-8674-49b4-aaff-0248b6976654%22; _cb_ls=1; OX_plg=pm; __gads=ID=3c74f81d13d6c1b4:T=1518081460:S=ALNI_MZsx67ijryijAj2JcD2YXXZw20zIA; _cb=sjG2aCNHffBaLnBl; AAMC_reuters_0=REGION%7C3; aam_uuid=06971314173867630360429126725673522696; _cb_svref=null; D_DUID=334503eb-dac8-49cd-babd-02081b0b6d24; D_TOKEN=1.0:a25bacf1dbb943e3ba1e93edb2093843:9841e8a348072081c4b770cfdd017d59831a31e6d41f368c89065cd08eec79bb34c9020669a0d8cbd7a670e4e11de2e762b5f67038115c02ba5fcbd9da8de4078116daf500471d1d6440734c181cb49859090467365cbf9d646c0d3fc7e7bb7e4e2643ea7a20bf00f9a695f9bf30b0df402746b31e429526a87ed7aa3c9da9bb:4b5290392fda7a6ff1f0f529cfad0d027a406ae35b6edb8e7cd3f6493ca8b99d; OX_sd=2; mnet_session_depth=2%7C1518104359854; _chartbeat2=.1518081466539.1518104385876.1.k_ivd8UuDjDegChcDsjhRBbcy9U',
}
data = {'blob':'National Health Investors, Inc.',
'bigOrSmall':'big',
'articleWithBlog':'true',
'sortBy':"",
'dateRange':"",
'numResultsToShow':'10',
'pn':str(page),
'callback':'addMoreNewsResults'}
url ='https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=10&pn={}&callback=addMoreNewsResults'.format(page)
yield scrapy.FormRequest(url,
headers=headers,callback=self.parse
)
def parse_article(self, response):
print('
')
print('***Heading:***',response.css('h1.ArticleHeader_headline_2zdFM ::text').extract_first())
print('***Url-Link:***',response.url)
print('***Date :***',response.css('div.ArticleHeader_date_V9eGk ::text').extract())
print('
')
See Question&Answers more detail:
os 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…