查看: 961|回复: 16

[讨论] 起点跑不起来，换了一个网站练手 CrawlSpider；正则还是不会用，最后还是用 XPath 跑起来的

发表于 2021-5-31 19:40

[Asm] 纯文本查看 复制代码

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

class KsSpider(CrawlSpider):
    """Practice CrawlSpider that scrapes book pages linked from sxcnw.net.

    Crawling starts at the first "xuanhuan" list page. Two rules drive it:

    1. Links found under ``<ul class="listcon">`` are requested and handed
       to :meth:`parse_item`.
    2. Links found under ``<div class="showpage">`` (all but the first
       ``<a>``; presumably the pager -- confirm against the live markup)
       are followed so that the rules are applied again on those pages.
    """

    name = 'ks'
    allowed_domains = ['sxcnw.net']
    start_urls = ['http://www.sxcnw.net/xuanhuan/List_1.html']

    rules = (
        # Stop the XPath at the final element (here the <a> tag); do not go
        # on to select the URL (e.g. @href) yourself. The rule extracts the
        # URL from that last tag automatically -- this pitfall cost the
        # original author a lot of time.
        Rule(
            LinkExtractor(restrict_xpaths='//ul[@class="listcon"]/li/a'),
            callback='parse_item',
        ),
        Rule(
            LinkExtractor(
                restrict_xpaths='//div[@class="showpage"]/a[position()>1]'
            ),
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build one item dict from a page matched by the first rule.

        Args:
            response: The Scrapy response for the requested page.

        Returns:
            dict: with keys
                ``book_name`` -- list of text nodes of the ``<h1>`` inside
                ``div.book-title.clear``;
                ``content`` -- list of every text node under
                ``div.about-txt``;
                ``book_down`` -- first ``href`` under ``div.dl-to-pc``, or
                ``None`` when nothing matches.
        """
        item = {}
        item['book_name'] = response.xpath(
            '//div[@class="book-title clear"]/h1/text()'
        ).extract()
        item['content'] = response.xpath(
            '//div[@class="about-txt"]//text()'
        ).extract()
        item['book_down'] = response.xpath(
            '//div[@class="dl-to-pc"]/a/@href'
        ).extract_first()

        # Debug output kept from the original practice script.
        print(item)
        return item

起点跑不起来，所以换了一个网站练手 CrawlSpider。正则还是不会用，最后还是用 XPath（restrict_xpaths）跑起来的。