Scrapy 如何发起 GET 和 POST 请求

最新推荐文章于 2024-06-12 22:52:28 发布

Are you ready

最新推荐文章于 2024-06-12 22:52:28 发布

阅读量2.2k

点赞数

分类专栏：爬虫与数据分析　文章标签：scrapy框架经典案例

爬虫与数据分析专栏收录该内容

24 篇文章

订阅专栏

post请求

class FanyiSpider(scrapy.Spider):
    """Spider that sends one POST request to the Baidu Translate ``sug`` endpoint.

    ``start_urls`` is deliberately left undefined. Scrapy's default start
    logic issues a GET for every entry in ``start_urls``; to begin the crawl
    with a POST instead, ``start_requests`` is overridden below.
    """

    name = 'fanyi'
    allowed_domains = ['baidu.com']

    def start_requests(self):
        """Yield the form POST that begins the crawl."""
        print("下载器开始请求网络数据...")
        # Form body submitted with the POST.
        form_fields = {"kw": 'a'}
        # The request object is handed back to the engine (yielded) instead of
        # being executed here, so that the scheduler decides when it is sent.
        sug_request = scrapy.FormRequest(
            url="https://fanyi.baidu.com/sug",
            formdata=form_fields,
            callback=self.parse_post,
        )
        yield sug_request

    def parse_post(self, response):
        """Callback for the POST request: print a marker, then the body text."""
        print(111111111111111111111111111)
        print(response.text)

GET加POST请求

# -*- coding: utf-8 -*-
import scrapy
import pytesseract
from PIL import Image

class GushiwenSpider(scrapy.Spider):
    """Log in to so.gushiwen.org with a GET followed by a POST.

    Flow:
      1. ``parse``       - GET the login page, read the two hidden form tokens
                           and the captcha image URL.
      2. ``parse_code``  - download the captcha, OCR it, POST the login form.
      3. ``parse_login`` - print the response of the login POST.
    """

    name = 'gushiwen'
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx']

    def parse(self, response):
        """Extract the hidden form tokens and request the captcha image.

        The tokens are stored on the spider instance because ``parse_code``
        needs them when it builds the login form.

        Raises:
            IndexError: if any of the three expected elements is missing
                from the login page.
        """
        # Scrapy's css() accepts "::attr(name)" to pull an attribute value and
        # "::text" to pull text content straight from a CSS selector.
        self.token1 = response.css("#__VIEWSTATE::attr(value)").extract()[0]
        self.token2 = response.css("#__VIEWSTATEGENERATOR::attr(value)").extract()[0]
        captcha_src = response.css("#imgCode::attr(src)").extract()[0]
        # urljoin resolves root-relative, relative and absolute src values
        # against the page URL; plain string concatenation only worked for
        # the root-relative form.
        yield scrapy.Request(url=response.urljoin(captcha_src), callback=self.parse_code)

    def parse_code(self, response):
        """Save and OCR the captcha image, then submit the login form.

        Args:
            response: the response whose ``body`` holds the raw image bytes.
        """
        with open("./code.png", 'wb') as fp:
            fp.write(response.body)
        # Use the image as a context manager so the underlying file handle is
        # released; convert() returns an independent in-memory greyscale copy.
        with Image.open("./code.png") as img:
            grey = img.convert("L")
        # Tesseract output ends with whitespace (newline / form feed); strip
        # it so the submitted captcha value contains only the recognised text.
        code = pytesseract.image_to_string(grey).strip()
        # Login endpoint.
        login_url = "https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx"
        # Form fields submitted with the login POST.
        # NOTE(review): account credentials are hard-coded here; consider
        # supplying them through settings or spider arguments instead.
        data = {
            "__VIEWSTATE": self.token1,
            "__VIEWSTATEGENERATOR": self.token2,
            "from": "http://so.gushiwen.org/user/collect.aspx",
            "email": "fanjianbo666@163.com",
            "pwd": "12345678",
            "code": code,
            "denglu": "登录",
        }
        yield scrapy.FormRequest(url=login_url, formdata=data, callback=self.parse_login)

    def parse_login(self, response):
        """Callback for the login POST: print the response text."""
        print(response.text)

两个相互关联的爬虫

案例：爬取“穷游网”上“中国”的所有旅游城市，要爬取的内容如下：
1）城市名、去过的人数、常见的景点、城市的概况；
2）进入每个城市的二级页面“旅行地”模块，提取出每个旅行地的如下字段：
地名、评分、评论数、推荐锦囊数、排名。

city的爬虫器
# -*- coding: utf-8 -*-
import scrapy
from Qiongyou.items import QiongyouItem

class CitySpider(scrapy.Spider):
    """Crawl the qyer.com China city list and each city's profile page.

    ``parse`` fills most item fields from the list page; ``parse_next``
    adds the overview text from the profile page and emits the item.
    """

    name = 'city'
    allowed_domains = ['qyer.com']
    start_urls = [
        'https://place.qyer.com/china/citylist-0-0-{}/'.format(page_no)
        for page_no in range(1, 9)
    ]

    def parse(self, response):
        """Build one partially-filled item per city and request its profile page."""
        for node in response.xpath("//ul[@class='plcCitylist']/li"):
            item = QiongyouItem()

            name_parts = node.xpath(".//h3//a//text()").extract()
            item["cityName"] = " ".join(name_parts)

            item["visitorNum"] = node.xpath(".//p[@class='beento']/text()").extract()[0]

            spot_parts = node.xpath(".//p[@class='pois']//text()").extract()
            item["scenicSpot"] = "".join(spot_parts)

            # The city code is the second-to-last "/"-separated piece of the link.
            city_href = node.xpath(".//h3//a/@href").extract()[0]
            item["cityNum"] = city_href.split("/")[-2]

            # The city's pid.
            item["cityPid"] = node.xpath(".//p[@class='addPlanBtn']/@data-pid").extract()[0]

            # The city overview lives on a second-level page, so build its URL.
            profile_url = "https://place.qyer.com/%s/profile/" % item["cityNum"]
            print(profile_url)

            # The item travels with the request through ``meta`` so that the
            # callback for the profile page can finish filling it in.
            yield scrapy.Request(
                url=profile_url,
                callback=self.parse_next,
                meta={"item": item},
            )

    def parse_next(self, response):
        """Add the overview text to the item carried in ``meta`` and yield it."""
        item = response.meta["item"]
        overview_lines = response.xpath("//div[@class='entry_main']//text()").extract()
        item["cityInfo"] = "\n".join(overview_lines)
        yield item

#travel爬虫器
# -*- coding: utf-8 -*-
import scrapy
import redis
import json
class TravelSpider(scrapy.Spider):
    """Crawl the attraction listings of every city stored in Redis.

    The city records are read from the Redis list ``Qiuyou:cityList``; each
    entry is a JSON object that provides at least ``cityPid`` and ``cityNum``.
    """

    name = 'travel'
    allowed_domains = ['qyer.com']

    def start_requests(self):
        """Yield one listing-page request per city read from Redis.

        Raises:
            KeyError: if a stored record lacks ``cityPid`` or ``cityNum``.
        """
        print("开始请求...")
        # 1) Load the city pid / city code records from Redis.
        rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=8)
        # LRANGE 0 -1 returns the whole list in a single round trip, so no
        # separate LLEN call is needed to find the end index.
        raw_cities = rds.lrange("Qiuyou:cityList", 0, -1)
        print(len(raw_cities))
        # Decode everything up front so a malformed record fails before any
        # request is scheduled.
        cities = [json.loads(raw) for raw in raw_cities]
        city_keys = [(city["cityPid"], city["cityNum"]) for city in cities]
        # 2) Build each city's attraction listing URL; the total page count
        #    is read from that page in parse_city.
        for pid, city_num in city_keys:
            city_url = "https://place.qyer.com/" + city_num + "/alltravel/"
            print("正在向：%s发起请求！"%city_url)
            yield scrapy.Request(url=city_url, callback=self.parse_city, meta={"pid": pid})

    def parse_city(self, response):
        """Request every page of a city's attraction list through the JSON endpoint.

        The page count is taken from the last pager link on the listing page.
        When the page has no pager links, a single page is requested.
        NOTE(review): this presumes a listing without a pager is a one-page
        listing - confirm against the site.
        """
        pid = response.meta["pid"]
        page_numbers = response.xpath("//a[@class='ui_page_item']/@data-page").extract()
        last_page = int(page_numbers[-1]) if page_numbers else 1
        post_url = "https://place.qyer.com/poi.php?action=list_json"
        for page in range(1, last_page + 1):
            data = {
                'page': str(page),
                "type": "city",
                "pid": str(pid),
                "sort": '0',
                "subsort": "all",
                "isnominate": '-1',
                "haslastm": "false",
                "rank": '0',
            }
            yield scrapy.FormRequest(url=post_url, formdata=data, callback=self.parse_post)

    def parse_post(self, response):
        """Callback for the listing POST; intentionally left unimplemented.

        Exercise: parse the JSON body and store the attraction data.
        """
        pass

管道文件

import redis
import json
class QiongyouPipeline(object):
    """Item pipeline that pushes items from the ``city`` spider onto a Redis list.

    Items from any other spider pass through untouched.
    """

    def open_spider(self, spider):
        """Create the Redis client, but only when the ``city`` spider is running."""
        if spider.name != "city":
            return
        self.rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=8)

    def process_item(self, item, spider):
        """Serialise a ``city`` item to JSON, push it to Redis, and return the item."""
        if spider.name != "city":
            return item
        payload = json.dumps(dict(item))
        self.rds.lpush("Qiuyou:cityList", payload)
        return item

    def close_spider(self, spider):
        """Nothing is done when the spider closes."""
        return None