1. xiaohuar.com restricts crawlers by User-Agent; if you keep hitting it with repeated requests, it starts failing and the IP gets banned:
Connection was refused by other side: 10061: No connection could be made because the target machine actively refused it.
Fix: send browser-like headers. You can go further and rotate the User-Agent and proxy IP per request so that frequent crawling does not get the IP banned; a middleware sketch for this follows below.
Once an IP has been banned, it is usually unbanned automatically after about an hour.
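A minimal sketch of such a rotating setup, assuming a hypothetical RandomUserAgentMiddleware in middlewares.py (the class name, the User-Agent list, and the proxy address are illustrative placeholders, not part of this project):

import random

# Hypothetical pool of browser identities; extend it with any UA strings you like.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Give every outgoing request a randomly chosen browser identity.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        # To also rotate proxies, set one per request (placeholder address):
        # request.meta['proxy'] = 'http://127.0.0.1:8888'

Enable it by adding the class to DOWNLOADER_MIDDLEWARES in settings.py.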
2. Create xiaohua.py under the spiders directory (scrapy genspider xiaohua xiaohuar.com); its contents:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request

from spider1.items import XiaohuaItem


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    allowed_domains = ['xiaohuar.com']
    start_urls = ['http://www.xiaohuar.com/hua/']
    # Custom browser-like headers to keep the IP from being banned.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
    }
    url_set = set()  # detail-page URLs already scheduled, to skip duplicates

    def parse(self, response):
        # Uncomment to check which User-Agent was actually sent:
        # print(response.request.headers['User-Agent'])
        hxs = Selector(response).xpath("//div[@class='img']/a")
        for img in hxs:
            img_url = img.xpath(".//@href").extract_first()
            if img_url in self.url_set:
                continue
            self.url_set.add(img_url)
            # Rewrite the detail page (/p-...) into the slideshow page (/s-...),
            # which serves the full-size images.
            new_img_url = "%s#p1" % img_url.replace('/p', '/s')
            print(new_img_url)
            yield Request(url=new_img_url, callback=self.imgParse)

    def imgParse(self, response):
        hxs1 = Selector(response=response).xpath("//li/div[@class='inner']/a/@href").extract()
        img_url_dict = {}
        for item in hxs1:
            if item.startswith("http"):
                img_url = item
            else:
                img_url = "http://www.xiaohuar.com%s" % item
            print(img_url)
            img_name = img_url.rsplit("/", 1)[1]  # file name = last URL segment
            img_url_dict[img_name] = img_url
        folder_name = Selector(response).xpath("//h1/text()").extract_first()
        # Note: img_name ends up holding the name of the last image in the loop.
        obj = XiaohuaItem(folder_name=folder_name, img_url=img_url_dict, img_name=img_name)
        yield obj
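To make the URL rewrite in parse() concrete, here is a toy example (the URL is an assumed illustration of the site's detail-page pattern, not taken from a real crawl):

img_url = "http://www.xiaohuar.com/p-1-1994.html"   # assumed detail-page URL
new_img_url = "%s#p1" % img_url.replace('/p', '/s')
print(new_img_url)  # http://www.xiaohuar.com/s-1-1994.html#p1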
3. items.py
import scrapy

class XiaohuaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    folder_name = scrapy.Field()
    img_url = scrapy.Field()
    img_name = scrapy.Field()
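Scrapy Items behave like dicts, which is why the pipeline below reads fields with item['folder_name']; a quick illustration (the values are made up):

item = XiaohuaItem(folder_name='demo', img_url={'1.jpg': 'http://example.com/1.jpg'}, img_name='1.jpg')
print(item['folder_name'])  # demo
print(dict(item))           # the item converted to a plain dict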
4. pipelines.py
import os
from io import BytesIO

import requests
from PIL import Image

class XiaohuaPipeline(object):
    def process_item(self, item, spider):
        # Only handle items produced by the xiaohua spider.
        if spider.name == 'xiaohua':
            print(item['folder_name'], item["img_name"], item["img_url"])
            # One folder per gallery, named after the page's <h1> title.
            img_path = os.path.join(r"D:\xiaohua", item['folder_name'])
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            # Download every image in the item and save it locally.
            for img_name, img_url in item['img_url'].items():
                img_save = os.path.join(img_path, img_name)
                print(img_save)
                res = requests.get(img_url)
                img = Image.open(BytesIO(res.content))
                img.save(img_save)
        return item  # pass the item on to the next pipeline
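Note that going through PIL decodes and re-encodes every image. If you only want the original bytes on disk, a plain binary write inside the same loop is enough (a sketch using the res and img_save variables from above):

with open(img_save, 'wb') as f:
    f.write(res.content)  # save the raw downloaded bytes as-is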
5. settings.py
ITEM_PIPELINES = {
    'spider1.pipelines.XianPipeline': 300,     # the number is the pipeline priority; lower runs first
    'spider1.pipelines.XiaohuaPipeline': 200,  # 200 < 300, so XiaohuaPipeline runs first
}
DEPTH_LIMIT = 1  # recursion depth; 0 means no limit, so the crawl could loop endlessly
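Related to the ban problem in point 1, Scrapy's built-in throttling settings can also make the crawl gentler; these are standard Scrapy options (the concrete values are only suggestions):

DOWNLOAD_DELAY = 2               # wait about 2 seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True  # jitter the delay (0.5x-1.5x) so the timing looks less robotic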
6. Finally, a look at the images the spider fetched; they are all full-size versions: