32 lines
1.1 KiB
Python
32 lines
1.1 KiB
Python
import scrapy
|
||
from typing import Optional
|
||
|
||
|
||
class ExampleSpider(scrapy.Spider):
    """Example spider that yields quote text, author and tags from each page."""

    name = "example"

    def __init__(self, url: Optional[str] = None, *args, **kwargs):
        """Initialise the spider and pick its start URL.

        Args:
            url: Start URL; can be passed in through the API. When it is
                missing or empty, the quotes.toscrape.com demo site is used.
            *args: Forwarded unchanged to the base spider constructor.
            **kwargs: Forwarded unchanged to the base spider constructor.
        """
        super().__init__(*args, **kwargs)
        if url:
            self.start_urls = [url]
        else:
            self.start_urls = ["http://quotes.toscrape.com"]

    def parse(self, response):
        """Parse the data on one page.

        This is a sample parser that scrapes quotes and authors from
        quotes.toscrape.com. It yields one dict per ``div.quote`` element
        with the keys ``text``, ``author`` and ``tags``, and then a
        follow-up request for the next page when a ``li.next`` link exists.
        """
        for block in response.css('div.quote'):
            text = block.css('span.text::text').get()
            author = block.css('small.author::text').get()
            tags = block.css('div.tags a.tag::text').getall()
            yield {
                'text': text,
                'author': author,
                'tags': tags,
            }

        # Get the link to the next page; stop when there is none.
        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            return
        yield response.follow(next_href, self.parse)