在 Scrapy 中处理多级页面跳转通常可以通过两种方式来实现:一是使用 CrawlSpider 配合 Rule 自动跟进链接;二是在普通 Spider 中用 response.follow 手动发起下一级请求:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class MyCrawlSpider(CrawlSpider):
    """Crawl spider that navigates multi-level pages declaratively.

    CrawlSpider applies each Rule's LinkExtractor to every response it
    receives, so link following across page levels is configured via
    ``rules`` instead of hand-written requests.
    """

    name = 'my_crawl_spider'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    # follow=True keeps extracting links from pages matched by this rule
    # as well. When a callback is given, ``follow`` defaults to False,
    # which would stop the crawl after a single level of navigation —
    # defeating the purpose of a multi-level example.
    rules = (
        Rule(LinkExtractor(allow='item'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract data from a matched page (placeholder)."""
        pass
import scrapy
class MySpider(scrapy.Spider):
    """Spider that follows next-level links manually via response.follow."""

    name = 'my_spider'
    start_urls = ['http://www.example.com']

    def parse(self, response):
        """Parse the first-level page and chain into the next level.

        Extract data here, then follow the "next page" link if present.
        """
        # .get() is the modern Scrapy selector API; it replaces the
        # deprecated legacy alias .extract_first() and likewise returns
        # None when no element matches.
        next_page_url = response.css('a.next_page::attr(href)').get()
        if next_page_url:
            # response.follow resolves relative URLs against the current
            # response, so raw hrefs can be passed directly.
            yield response.follow(next_page_url, callback=self.parse_next_page)

    def parse_next_page(self, response):
        """Extract data from the second-level page (placeholder)."""
        pass
使用以上两种方法中的任意一种,都可以方便地处理多级页面跳转并提取所需的数据。