1. A Halo blog site using the Vno theme
The code crawls the title and URL of every article on the homepage.
Core code:
import scrapy


class itest(scrapy.Spider):
    name = "itest"
    start_urls = ['https://againriver.com']

    def parse(self, response):
        # Each article on the Vno theme homepage is an <li> under ol.post-list
        source = response.css("ol.post-list>li")
        items = self.getInfo(source)
        self.log("Result: %s" % items)

    def getInfo(self, source):
        items = []
        for i in source:
            item = {}
            # Title and link both sit on the <a> inside the post-title <h2>
            item["title"] = i.css("h2.post-list__post-title>a::attr(title)").extract()
            item["href"] = i.css("h2.post-list__post-title>a::attr(href)").extract()
            items.append(item)
        return items
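The selectors can be sanity-checked offline with scrapy.Selector before running the spider. The HTML below is a hypothetical reconstruction of the Vno theme's post-list markup, not copied from the live site:

from scrapy import Selector

# Hypothetical sample of the Vno theme's post-list markup
html = """
<ol class="post-list">
  <li><h2 class="post-list__post-title">
    <a title="Hello Halo" href="/archives/hello-halo">Hello Halo</a>
  </h2></li>
</ol>
"""

sel = Selector(text=html)
for li in sel.css("ol.post-list>li"):
    print(li.css("h2.post-list__post-title>a::attr(title)").extract())  # ['Hello Halo']
    print(li.css("h2.post-list__post-title>a::attr(href)").extract())   # ['/archives/hello-halo']

Inside a Scrapy project, the spider itself runs with scrapy crawl itest.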
2. A follow-up exercise that extends example 1
Adds pagination, fetches each article's detail page, and saves the article content to a local file.
import scrapy


class itest(scrapy.Spider):
    name = "itest"
    start_urls = ['https://againriver.com/#blog']

    def parse(self, response):
        source = response.css("ol.post-list>li")
        for i in source:
            item = {}
            # Grab the title
            item["title"] = i.css("h2.post-list__post-title>a::attr(title)").extract_first()
            # Grab the link
            item["href"] = i.css("h2.post-list__post-title>a::attr(href)").extract_first()
            # Follow each article link, passing the item along via meta
            yield scrapy.Request(url=response.urljoin(item["href"]),
                                 meta={"item": item}, callback=self.parseContent)
        # Pagination: follow the "older posts" link until there is none
        next_page = response.css('a.pagination__older::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    # Article detail page
    def parseContent(self, response):
        item = response.meta["item"]
        item["content"] = response.css("article.post-container>section.post").extract_first()
        self.log("Detail: %s" % item)
        # Use the article title as the file name
        filename = str(item["title"])
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(str(item["content"]))
        self.log("Saved file: %s" % filename)
        return item
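One caveat with the code above: it uses the raw title as a file name, which fails as soon as a title contains a path separator or another character the filesystem rejects, and the saved file gets no extension. A minimal sketch of a sanitizing helper (safe_filename is a hypothetical name, not part of the original code):

import re

def safe_filename(title, ext=".html"):
    # Replace characters that are illegal in common filesystems
    name = re.sub(r'[\\/:*?"<>|]', "_", str(title)).strip()
    return (name or "untitled") + ext

# In parseContent, write to safe_filename(item["title"]) instead of the raw title.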
3. Adding a proxy pool
Built on Scrapy's own downloader middleware mechanism.
# middlewares.py: add a proxy middleware
import requests


class HttpbinProxyMiddleware(object):
    def process_request(self, request, spider):
        # http://localhost:5010/get/ is an open-source proxy IP service
        # you have to deploy yourself; source:
        # https://github.com/jhao104/proxy_pool
        pro_addr = requests.get('http://localhost:5010/get/').json()["proxy"]
        # The pool hands out plain-HTTP proxy endpoints, so use the http:// scheme
        request.meta['proxy'] = 'http://' + pro_addr
# settings.py: register the middleware and give it a priority
DOWNLOADER_MIDDLEWARES = {
    'itest.middlewares.HttpbinProxyMiddleware': 543,
}
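The middleware above fetches a fresh proxy for every request but never reports dead ones back to the pool. As a sketch, an extra downloader-middleware hook can retire a failing proxy, assuming the proxy_pool service exposes its documented delete endpoint (http://localhost:5010/delete/?proxy=<ip:port>); ProxyRetireMiddleware is a made-up name, not from the original code:

import requests

class ProxyRetireMiddleware(object):
    def process_exception(self, request, exception, spider):
        proxy = request.meta.get('proxy', '')
        if proxy:
            addr = proxy.split('://', 1)[-1]  # strip the scheme prefix
            # Ask the pool to drop the proxy that just failed
            requests.get('http://localhost:5010/delete/', params={'proxy': addr})
            spider.logger.info("Retired bad proxy: %s" % addr)
        # Returning None lets Scrapy's normal retry handling continue

# settings.py: register it alongside the proxy middleware
# DOWNLOADER_MIDDLEWARES = {
#     'itest.middlewares.HttpbinProxyMiddleware': 543,
#     'itest.middlewares.ProxyRetireMiddleware': 544,
# }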