scrapy框架爬取校花网站

# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from img.items import ImgItem


----------**下面是用scrapy.spider这个类爬取的**----------------
# class DemoSpider(scrapy.Spider):
#     name = "demo"
#     #allowed_domains = ["demo.com"]
#     start_urls = ['http://www.xiaohuar.com/list-1-1.html']
#     urls = 'http://www.xiaohuar.com/list-1-'
#     def start_requests(self):
#     #     yield scrapy.Request(self.urls+'1.html')
#
#         for i in range(1,41):
#             url=self.urls+str(i)+'.html'
#             yield scrapy.Request(url)
#
#     def parse(self, response):
#         item=ImgItem()
#         divs=response.xpath("//div[@class='img']")
#         for div in divs:
#             item['url']="http://www.xiaohuar.com"+div.xpath('.//img/@src')[0].extract()
#             item['name']=div.xpath("./span/text()").extract()
#             item['school']=div.xpath("a/img/@alt").extract()
#             yield item
#             # res=requests.get(item['url'])


********************下面使用CrawlSpider这个类爬取的************
                   **这是spider下的demo文件**

class DemoSpider(CrawlSpider):
    """Crawl xiaohuar.com listing pages and extract one item per image card.

    Starts from a single listing page and follows pagination links found
    inside the ``div.page_num`` element; every matched listing page is
    handed to :meth:`parse_item`.
    """
    name = 'demo'
    start_urls = ['http://www.xiaohuar.com/list-1-2.html']
    rules = [Rule(LinkExtractor(allow=('http://www.xiaohuar.com/list',),
                                restrict_xpaths=("//div[@class='page_num']",)),
                  # NOTE: callback renamed from the original typo "paser_item"
                  callback="parse_item",
                  follow=True)]

    def parse_item(self, response):
        """Yield an ``ImgItem`` for every ``div.img`` card on *response*.

        :param response: Scrapy HTML response for one listing page.
        :returns: generator of populated ``ImgItem`` objects.
        """
        print('url=%s' % response.url)  # was a Python 2 print statement
        divs = response.xpath("//div[@class='img']")
        for div in divs:
            # A fresh item per card: the original reused one ImgItem instance
            # across all yields, so every yielded reference was the same
            # (last-written) object.
            item = ImgItem()
            # Original string was "http ://..." with a stray space, which
            # produced invalid URLs for every scraped image.
            item['url'] = "http://www.xiaohuar.com" + div.xpath('.//img/@src')[0].extract()
            item['name'] = div.xpath("./span/text()").extract()
            item['school'] = div.xpath("a/img/@alt").extract()
            yield item

           **下面是scrapy框架下的items.py文件**
import scrapy


class ImgItem(scrapy.Item):
    """Container for one scraped image record (URL, name, school)."""
    url = scrapy.Field()     # absolute URL of the image
    name = scrapy.Field()    # person's name, as listed on the card
    school = scrapy.Field()  # school shown in the image's alt text


******编写完成后在工程的根目录下运行scrapy crawl demo -o img.csv**








评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

不才陈某

欢迎关注公众号【码猿技术专栏】

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值