I've been going through the Scrapy documentation today, trying to get a working version of the first-spider tutorial (https://docs.scrapy.org/en/latest/intro/tutorial.html#our-first-spider) on a real-world example. My example differs slightly in that it has two levels of "next" pages, i.e.
start_url > city page > unit page
It is the unit pages I want to grab data from.
My code:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://www.unitestudents.com/',
    ]

    def parse(self, response):
        for quote in response.css('div.property-body'):
            yield {
                'name': quote.xpath('//span/a/text()').extract(),
                'type': quote.xpath('//div/h4/text()').extract(),
                'price_amens': quote.xpath('//div/p/text()').extract(),
                'distance_beds': quote.xpath('//li/p/text()').extract()
            }

        # Purpose is to crawl links of cities
        next_page = response.css('a.listing-item__link::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

        # Purpose is to crawl links of units
        next_unit_page = response.css('a.text-highlight__inner::attr(href)').extract_first()
        if next_unit_page is not None:
            next_unit_page = response.urljoin(next_unit_page)
            yield scrapy.Request(next_unit_page, callback=self.parse)
But when I run this I get:

INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

So I suspect my code isn't set up to follow the links in the flow described above, but I'm not sure of the best way to do that.
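My current thinking is that each level of the flow should get its own callback, instead of everything funnelling back through parse(). Here is a minimal sketch of the structure I have in mind, reusing the selectors from my code above (I haven't verified they actually match the live pages):

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['http://www.unitestudents.com/']

    def parse(self, response):
        # Level 1: the home page lists cities; follow each city link
        for href in response.css('a.listing-item__link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_city)

    def parse_city(self, response):
        # Level 2: each city page lists units; follow each unit link
        for href in response.css('a.text-highlight__inner::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_unit)

    def parse_unit(self, response):
        # Level 3: the unit page is where the data I want lives
        for quote in response.css('div.property-body'):
            yield {
                # './/' keeps the XPath relative to this element,
                # rather than matching across the whole document
                'name': quote.xpath('.//span/a/text()').extract(),
                'type': quote.xpath('.//div/h4/text()').extract(),
            }

Is that the right shape for this kind of multi-level crawl?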
Updated flow:
Main page > City page > Building page > Unit page
It's still the unit page I want to get the data from.
Updated code:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://www.unitestudents.com/',
    ]

    def parse(self, response):
        for quote in response.css('div.site-wrapper'):
            yield {
                'area_name': quote.xpath('//div/ul/li/a/span/text()').extract(),
                'type': quote.xpath('//div/div/div/h1/span/text()').extract(),
                'period': quote.xpath('/html/body/div/div/section/div/form/h4/span/text()').extract(),
                'duration_weekly': quote.xpath('//html/body/div/div/section/div/form/div/div/em/text()').extract(),
                'guide_total': quote.xpath('//html/body/div/div/section/div/form/div/div/p/text()').extract(),
                'amenities': quote.xpath('//div/div/div/ul/li/p/text()').extract(),
            }

        # Purpose is to crawl links of cities
        # (extract_first() rather than extract(), since urljoin needs a string, not a list)
        next_page = response.xpath('//html/body/div/footer/div/div/div/ul/li/a[@class="listing-item__link"]/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

        # Purpose is to crawl links of units
        next_unit_page = response.xpath('//li/div/h3/span/a/@href').extract_first()
        if next_unit_page is not None:
            next_unit_page = response.urljoin(next_unit_page)
            yield scrapy.Request(next_unit_page, callback=self.parse)

        # Purpose is to crawl pages with full unit info
        last_unit_page = response.xpath('//div/div/div[@class="content__btn"]/a/@href').extract_first()
        if last_unit_page is not None:
            last_unit_page = response.urljoin(last_unit_page)
            yield scrapy.Request(last_unit_page, callback=self.parse)
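As a first debugging step, I plan to use Scrapy's shell to check what my selectors actually match against the response Scrapy downloads (as opposed to what I see in the browser's inspector):

scrapy shell 'http://www.unitestudents.com/'
>>> response.css('a.listing-item__link::attr(href)').extract_first()  # does the city selector match anything?
>>> view(response)  # open the downloaded response in a browser

If the links are only added by JavaScript after the page loads, the selectors would come back empty here even though they look correct in the inspector.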