Scraping Sogou WeChat Articles (Using a Free, Self-Built Proxy Pool)

       Because the proxies are free, the success rate is not high, so you need a fairly large pool; how large depends on your situation, and the more valid IPs you have, the more articles you can crawl. Also note that the result counts Sogou Weixin reports are fake unless they are very small: a search may claim 222,000 matches, but paging stops at page 100 at most, so any large count is inflated and querying 100 pages is enough. If the count is small, just read through to the end. It is also worth adding a check so the program ends when the results run out; but for that check to work your proxies have to be reliable, because you need to tell a proxy failure apart from the data genuinely being exhausted. Adapt this to your own situation.
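For example, one way to make that distinction is to treat an empty but successfully fetched page as "data exhausted" and a failed request as "proxy problem". Below is a minimal sketch of that idea, reusing the get_index and parse_index functions from the script further down; the two-empty-pages-in-a-row threshold is my own assumption, not something Sogou documents:

def crawl(keyword, max_pages=100):
    empty_pages = 0
    for page in range(1, max_pages + 1):
        html = get_index(keyword, page)
        if html is None:
            # the request itself failed (bad proxy, too many retries) -- skip, don't stop
            print('page %d failed, skipping' % page)
            continue
        urls = list(parse_index(html))
        if not urls:
            empty_pages += 1
            if empty_pages >= 2:  # two empty pages in a row: assume the data ran out
                print('no more results, stopping at page %d' % page)
                break
            continue
        empty_pages = 0
        for url in urls:
            print(url)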

 

# -*- coding: utf-8 -*-
# @Date    : 2018/11/19 17:17
# @Author  : huangtao!!
# @FileName: 微信文章.py
# @Software: PyCharm
# @Blog    :https://blog.csdn.net/Programmer_huangtao
import requests
from fake_useragent import UserAgent
from requests.exceptions import ConnectionError
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time
import re
base_url = 'http://weixin.sogou.com/weixin?'
headers = {
    'Cookie': 'the Cookie string from your own logged-in session',
    'Host': 'weixin.sogou.com',
    'Referer': 'https://weixin.sogou.com/antispider/?from=%2fweixin%3Foq%3d%26query%3d%E9%A3%8E%E6%99%AF%26_sug_type_%3d1%26sut%3d0%26lkt%3d0,0,0%26s_from%3dinput%26ri%3d2%26_sug_%3dn%26type%3d2%26sst0%3d1542627739511%26page%3d6%26ie%3dutf8%26p%3d40040108%26dp%3d1%26w%3d01015002%26dr%3d1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': UserAgent().random
}
proxy_pool_url = 'http://127.0.0.1:5000/get'  # local proxy-pool API: returns one ip:port as plain text
proxy = None  # proxy currently in use; None means request directly
max_count = 5  # maximum attempts per URL before giving up

def get_proxy():

    try:
        response = requests.get(proxy_pool_url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None

def get_html(url, count=1):
    # fetch one search-results page; rotate proxies on Sogou's anti-spider 302,
    # and give up once max_count attempts have been used
    print('crawling url:', url)
    print('attempt:', count)
    global proxy
    if count >= max_count:
        print('too many retries, giving up')
        return None
    try:
        if proxy:
            # base_url is plain http, so an http proxy entry is enough here
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # 302 = Sogou's anti-spider redirect; rotate to a fresh proxy and retry
            print('302 received, rotating proxy')
            proxy = get_proxy()
            if proxy:
                print('using proxy:', proxy)
                count += 1
                return get_html(url, count)
            else:
                print('get proxy failed')
                return None
    except ConnectionError as e:
        print('Error', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)  # pass count along so retries stay bounded

def get_index(keyword, page):
    data = {
        'query': keyword,
        'type': 2,  # 2 = article search on Sogou Weixin
        'page': page
    }
    queries = urlencode(data)
    url = base_url+queries
    html = get_html(url)
    return html

def parse_index(html):
    # regex alternative, kept for reference:
    # hrefs = re.findall(r'<div class="txt-box">.*?<h3><a target="_blank" href="(.*?)" id=".*?</a></h3>', html, re.DOTALL)
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')
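# note: if the hrefs Sogou returns here are relative (e.g. '/link?url=...'),
# resolve them with urllib.parse.urljoin('https://weixin.sogou.com/', href)
# before requesting them -- whether they come back relative is an assumption
# that depends on the page version you receive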

def main():
    keyword = input('Enter the keyword to search for: ')

    # Sogou serves at most 100 result pages, so cap the loop there
    for page in range(1, 101):
        html = get_index(keyword, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                print(article_url)

if __name__ == '__main__':
    main()
    time.sleep(3)
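The script assumes a local proxy-pool service listening on http://127.0.0.1:5000/get that returns a single ip:port as plain text, which is exactly what get_proxy() reads. If you are building the pool yourself, a minimal stand-in for that interface could look like the sketch below; the hardcoded PROXIES list is a placeholder, since a real pool would harvest, validate, and rotate free proxies continuously:

import random
from flask import Flask

app = Flask(__name__)

# placeholder list -- a real pool keeps a scored, continuously re-validated set
PROXIES = ['1.2.3.4:8080', '5.6.7.8:3128']

@app.route('/get')
def get():
    # one proxy per request, as plain text, matching what get_proxy() expects
    return random.choice(PROXIES)

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)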

 
