```python
# -*- coding: utf-8 -*-
# img/spiders/demo.py (the "demo" spider file)
import scrapy   # needed by the commented-out scrapy.Spider version below
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from img.items import ImgItem
```
----------**Below: crawling with the scrapy.Spider class**----------------
```python
# class DemoSpider(scrapy.Spider):
#     name = "demo"
#     # allowed_domains = ["demo.com"]
#     start_urls = ['http://www.xiaohuar.com/list-1-1.html']
#     urls = 'http://www.xiaohuar.com/list-1-'
#
#     def start_requests(self):
#         # yield scrapy.Request(self.urls + '1.html')
#         # generate a request for each of the list pages 1-40
#         for i in range(1, 41):
#             url = self.urls + str(i) + '.html'
#             yield scrapy.Request(url)
#
#     def parse(self, response):
#         divs = response.xpath("//div[@class='img']")
#         for div in divs:
#             item = ImgItem()   # create a new item for each entry
#             item['url'] = "http://www.xiaohuar.com" + div.xpath('.//img/@src')[0].extract()
#             item['name'] = div.xpath("./span/text()").extract()
#             item['school'] = div.xpath("a/img/@alt").extract()
#             yield item
#             # res = requests.get(item['url'])
```
********************Below: crawling with the CrawlSpider class************
**This is the demo file under spiders**
```python
class DemoSpider(CrawlSpider):
    name = 'demo'
    start_urls = ['http://www.xiaohuar.com/list-1-2.html']
    # follow every pagination link inside div.page_num and parse each list page
    rules = [Rule(LinkExtractor(allow=('http://www.xiaohuar.com/list',),
                                restrict_xpaths=("//div[@class='page_num']",)),
                  callback="paser_item",
                  follow=True)]

    def paser_item(self, response):
        url = response.url
        print('url=%s' % url)
        divs = response.xpath("//div[@class='img']")
        for div in divs:
            item = ImgItem()   # create a new item for each entry
            item['url'] = "http://www.xiaohuar.com" + div.xpath('.//img/@src')[0].extract()
            item['name'] = div.xpath("./span/text()").extract()
            item['school'] = div.xpath("a/img/@alt").extract()
            yield item
```
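Which links the Rule above will actually follow can be checked interactively before a full crawl. A minimal sketch, assuming it is run inside `scrapy shell http://www.xiaohuar.com/list-1-2.html` (where `response` is predefined); the allow pattern and XPath are copied from the rule above:

```python
from scrapy.linkextractors import LinkExtractor

# rebuild the same link extractor used by the Rule and list the URLs it would follow
le = LinkExtractor(allow=('http://www.xiaohuar.com/list',),
                   restrict_xpaths=("//div[@class='page_num']",))
for link in le.extract_links(response):   # `response` comes from scrapy shell
    print(link.url)                       # pagination URLs CrawlSpider will request
```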
**Below: the items.py file of the Scrapy project**
```python
import scrapy

class ImgItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()     # image URL
    name = scrapy.Field()    # the girl's name
    school = scrapy.Field()  # the school she attends
```
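The commented-out `res = requests.get(item['url'])` line in the first spider hints at downloading the pictures themselves rather than only exporting their URLs. Below is a minimal pipelines.py sketch of that idea; it is not part of the original article, the class name `ImgDownloadPipeline` and the `images` directory are made up, and it assumes the `requests` library is installed:

```python
# img/pipelines.py -- illustrative only
import os
import requests

class ImgDownloadPipeline(object):
    def open_spider(self, spider):
        # directory where downloaded pictures are stored
        self.dir = 'images'
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

    def process_item(self, item, spider):
        resp = requests.get(item['url'], timeout=10)
        if resp.status_code == 200:
            # use the last segment of the image URL as the file name
            fname = item['url'].split('/')[-1]
            with open(os.path.join(self.dir, fname), 'wb') as f:
                f.write(resp.content)
        return item
```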
******After the code is finished, run scrapy crawl demo -o img.csv from the project's root directory to export the scraped items to img.csv**
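If the optional download pipeline sketched above is used, it has to be enabled before running the command. The snippet below is an illustrative sketch of the relevant lines in img/settings.py; the pipeline path matches the sketch above, and DOWNLOAD_DELAY is only an example politeness setting, not from the article:

```python
# img/settings.py -- only the lines relevant to the sketches above
ITEM_PIPELINES = {
    'img.pipelines.ImgDownloadPipeline': 300,   # enable the illustrative download pipeline
}
DOWNLOAD_DELAY = 0.5   # wait half a second between requests to avoid hammering the site
```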