爬虫小练手_网络爬虫练手小项目-CSDN博客

本文链接：https://blog.csdn.net/gllvcpp/article/details/89739767

1.抓取西刺代理IP

import requests,random,time
from lxml import etree
# Pool of browser User-Agent strings; callers pick one with random.choice(UA)
# when building request headers.
UA=[
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    # NOTE(review): the next entry ends with ';' and has no closing ')' --
    # it looks truncated; confirm whether that is intentional.
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'
    ]
def get_html(url, reurl, timeout=10):
    """Download one proxy-listing page and extract its host and port columns.

    Args:
        url: Address of the listing page to fetch.
        reurl: Value sent as the ``Referer`` request header.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the crawl indefinitely.

    Returns:
        A ``(host_list, port_list)`` tuple containing the text nodes of the
        second and third ``<td>`` of every ``<tr>`` in the page. Both lists
        are empty when the response body cannot be parsed into a tree.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'Referer': reurl,
        'Host': 'www.xicidaili.com',
        'User-Agent': random.choice(UA),
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.encoding = 'utf-8'
    # The status code is only reported; a non-200 page is still parsed.
    print(r.status_code)
    html = etree.HTML(r.text)
    if html is None:
        # Nothing parseable came back; report "no proxies" instead of
        # failing with AttributeError on the xpath calls below.
        return [], []
    host_list = html.xpath("//tr/td[2]/text()")
    port_list = html.xpath("//tr/td[3]/text()")
    return host_list, port_list
# Header dict with a User-Agent picked once, when this line runs; save()
# passes the same dict on every proxy probe.
ua={'User-Agent': random.choice(UA)}
def save(host_list, port_list, timeout=5):
    """Probe each host/port pair as a proxy and append the working ones to ``ip.ini``.

    Each pair is used to request ``https://www.baidu.com``. A pair whose
    response status is 200 is written to ``ip.ini`` as ``host:port`` on its
    own line; a pair with any other status is printed; a pair whose request
    fails prints ``1``.

    Args:
        host_list: Iterable of proxy hosts.
        port_list: Iterable of proxy ports, paired positionally with
            ``host_list`` (extra items on the longer side are ignored).
        timeout: Seconds to wait for each probe so a dead proxy cannot
            stall the whole run.
    """
    with open('ip.ini', 'a') as f:
        for i, j in zip(host_list, port_list):
            address = str(i) + ':' + str(j)
            # requests selects the proxy by the scheme of the target URL.
            # The probe URL is HTTPS, so an 'https' entry is required;
            # with only an 'http' entry the request bypasses the proxy and
            # every address would appear to work.
            proxy_url = 'http://' + address
            proxies = {'http': proxy_url, 'https': proxy_url}
            try:
                t = requests.get(
                    'https://www.baidu.com',
                    proxies=proxies,
                    headers=ua,
                    timeout=timeout,
                )
            except requests.RequestException:
                # Only request failures are treated as "bad proxy";
                # KeyboardInterrupt and programming errors now propagate.
                print(1)
                continue
            if t.status_code == 200:
                f.write(address + '\n')
            else:
                print(i, j)
if __name__ == '__main__':
    # Crawl listing pages 2 through 29. Each request sends the previous
    # page's address as its Referer, then the scraped pairs are probed
    # and stored by save().
    url = 'https://www.xicidaili.com/nn/'
    for page in range(2, 30):
        current_page = url + str(page)
        previous_page = url + str(page - 1)
        hosts, ports = get_html(current_page, previous_page)
        save(hosts, ports)

2.抓取jd商品信息

 from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time,pymysql
def getid():
    """Start Chrome, open the JD home page and submit the hard-coded search.

    Waits up to 5 seconds (polling every 0.5 s) for the element with id
    ``key``, types the search keyword into it, then waits up to 10 seconds
    for the first element matching the CSS selector ``button`` to become
    clickable and clicks it.

    Returns:
        The ``webdriver.Chrome`` instance, left on the page reached after
        the click.

    Raises:
        Any exception raised by Selenium while loading the page or waiting
        for the elements; the browser is closed before it propagates.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.jd.com/')
        element = WebDriverWait(driver, 5, 0.5).until(
            EC.presence_of_element_located((By.ID, 'key'))
        )
        element.send_keys('连衣裙')
        click = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button'))
        )
        click.click()
    except BaseException:
        # The caller never receives the driver on failure, so nobody else
        # could close the browser; shut it down here and re-raise.
        driver.quit()
        raise
    return driver
def gethtml(driver):
    """Scrape titles and prices from the current page, then click "next".

    The page is scrolled to the bottom five times, one second apart, before
    its source is parsed. After the data is extracted the function waits up
    to 10 seconds for the element with class ``pn-next`` to become clickable
    and clicks it, so the driver has moved on by the time this returns.

    Args:
        driver: Selenium driver positioned on a search result page.

    Returns:
        ``(name, price, driver)``: the list of ``title`` attributes, the
        list of price text nodes, and the same driver that was passed in.
    """
    for _ in range(5):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(1)

    tree = etree.HTML(driver.page_source)
    name = tree.xpath(
        "//li[@class='gl-item']/div[@class='gl-i-wrap']/div[@class='p-name p-name-type-2']/a/@title"
    )
    price = tree.xpath(
        "//li[@class='gl-item']/div[@class='gl-i-wrap']/div[@class='p-price']/strong/i/text()"
    )

    next_is_clickable = EC.element_to_be_clickable((By.CLASS_NAME, 'pn-next'))
    WebDriverWait(driver, 10).until(next_is_clickable).click()
    return name, price, driver
def saveinfo(name, price):
    """Insert scraped name/price pairs into the ``jd`` table of the ``lianxi`` database.

    The table must already exist with ``name`` and ``price`` columns, e.g.
    ``create table jd(name varchar(32), price varchar(32))``.

    Args:
        name: Iterable of product titles. Only the first 5 characters of
            each are stored (the original author's note says overly long
            strings make the insert fail).
        price: Iterable of prices, paired positionally with ``name``.

    Returns:
        None. All rows are committed together; if any insert fails, nothing
        is committed and the error propagates.
    """
    # NOTE(review): connection credentials are hard-coded here; consider
    # moving them to configuration.
    conn = pymysql.connect(
        host='127.0.0.1', port=3306,
        user='root', password='123456',
        db='lianxi', charset='utf8MB4'
    )
    try:
        rows = [(str(i)[:5], str(j)) for i, j in zip(name, price)]
        with conn.cursor() as cursor:
            # Values are passed as query parameters so the driver escapes
            # them; scraped titles containing quotes can neither break the
            # statement nor inject SQL.
            cursor.executemany(
                "insert into jd(name,price) values (%s,%s)", rows
            )
        conn.commit()
    finally:
        # Release the connection even when an insert raises.
        conn.close()
def main(i=1):
    """Run the JD scrape: search, then store result pages one after another.

    The first result page is always scraped and stored. After that, one
    further page is processed per loop iteration while ``i <= 100``.

    Args:
        i: Starting value of the loop counter; with the default of 1 the
            loop handles 100 additional pages.

    Each ``gethtml`` call advances the browser to the next page, so it is
    called exactly once per page. Calling it twice and discarding the first
    result would silently skip every other page.
    """
    driver = getid()
    try:
        name, price, driver = gethtml(driver)
        saveinfo(name, price)
        while i <= 100:
            name, price = gethtml(driver)[:2]
            saveinfo(name, price)
            i += 1
    finally:
        # Close the browser whether the run finishes or fails part-way
        # (e.g. when no further "next" button becomes clickable).
        driver.quit()
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()