# 1. Scrape proxy IPs from Xici (xicidaili.com)
import requests,random,time
from lxml import etree
# Pool of browser User-Agent strings; callers pick one with random.choice(UA).
UA = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    # The original entry was truncated ("Trident/5.0;" with no closing
    # parenthesis), which is not a well-formed User-Agent value.
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]
def get_html(url, reurl, timeout=10):
    """Download one proxy-listing page and extract its host/port columns.

    Args:
        url: Address of the listing page to fetch.
        reurl: Value sent as the ``Referer`` header.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the script forever.

    Returns:
        A ``(host_list, port_list)`` pair of equally long lists of strings;
        entry ``k`` of both lists comes from the same table row.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    headers = {
        'Referer': reurl,
        'Host': 'www.xicidaili.com',
        'User-Agent': random.choice(UA),
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.encoding = 'utf-8'
    print(r.status_code)

    host_list, port_list = [], []
    # An empty body cannot be turned into a usable tree; report "no rows".
    if not r.text.strip():
        return host_list, port_list
    html = etree.HTML(r.text)
    if html is None:
        return host_list, port_list

    # Read both cells from the same <tr> so that a row with a missing or
    # empty cell is skipped instead of shifting every later host/port pair.
    for row in html.xpath("//tr"):
        host = row.xpath("string(td[2])").strip()
        port = row.xpath("string(td[3])").strip()
        if host and port:
            host_list.append(host)
            port_list.append(port)
    return host_list, port_list
# Request headers used when probing proxies; the User-Agent is drawn from
# the UA pool once, when this statement runs.
ua = {
    'User-Agent': random.choice(UA),
}
def save(host_list, port_list, timeout=5):
    """Probe each host/port pair as a proxy and append working ones to ip.ini.

    Args:
        host_list: Proxy host strings.
        port_list: Proxy port strings, paired positionally with ``host_list``.
        timeout: Seconds to wait for each probe request; dead proxies would
            otherwise stall the loop indefinitely.

    Each pair that yields HTTP 200 for the probe URL is written to
    ``ip.ini`` as ``host:port`` on its own line. Pairs that fail are
    printed and skipped.
    """
    # "with" guarantees the file is closed even if something unexpected
    # escapes the loop.
    with open('ip.ini', 'a', encoding='utf-8') as f:
        for host, port in zip(host_list, port_list):
            address = str(host) + ':' + str(port)
            # requests chooses the proxy by the scheme of the target URL.
            # The probe URL is https, so the proxy must be registered under
            # "https" too; with only an "http" key the request bypasses the
            # proxy and every address looks alive.
            proxies = {
                'http': 'http://' + address,
                'https': 'http://' + address,
            }
            try:
                t = requests.get(
                    'https://www.baidu.com',
                    proxies=proxies,
                    headers=ua,
                    timeout=timeout,
                )
            except requests.RequestException as exc:
                # Unreachable or misbehaving proxy: report and move on.
                print(host, port, exc)
                continue
            if t.status_code == 200:
                f.write(address + '\n')
            else:
                print(host, port)
if __name__ == '__main__':
    # Visit listing pages 2 through 29; every request presents the
    # preceding page number as its Referer.
    base = 'https://www.xicidaili.com/nn/'
    for page in range(2, 30):
        hosts, ports = get_html(base + str(page), base + str(page - 1))
        save(hosts, ports)
# 2. Scrape JD (jd.com) product information
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time,pymysql
def getid(keyword='连衣裙'):
    """Open jd.com in Chrome, type a search keyword and submit the search.

    Args:
        keyword: Text typed into the search box (element id ``key``).
            Defaults to the keyword the script originally hard-coded.

    Returns:
        The Chrome WebDriver, left on the page reached after the click.

    Raises:
        Any exception raised by Selenium while loading the page or waiting
        for the elements; the browser is closed before it propagates.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.jd.com/')
        element = WebDriverWait(driver, 5, 0.5).until(
            EC.presence_of_element_located((By.ID, 'key'))
        )
        element.send_keys(keyword)
        # The first clickable <button> on the page is used as the submit
        # control.
        click = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button'))
        )
        click.click()
    except Exception:
        # Do not leave an orphaned browser behind when setup fails.
        driver.quit()
        raise
    return driver
def gethtml(driver):
    """Scrape names and prices from the current result page, then go to the next.

    Args:
        driver: WebDriver currently showing a search-result page.

    Returns:
        ``(name, price, driver)`` where ``name`` and ``price`` are equally
        long lists; entry ``k`` of both comes from the same product item.

    Raises:
        selenium.common.exceptions.TimeoutException: If no clickable
            ``pn-next`` element appears within 10 seconds.
    """
    # Scroll to the bottom five times, one second apart, before reading
    # the page source.
    for _ in range(5):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(1)

    tree = etree.HTML(driver.page_source)
    name, price = [], []
    if tree is not None:
        items = tree.xpath("//li[@class='gl-item']/div[@class='gl-i-wrap']")
    else:
        items = []
    # Extract both fields per item so that a product lacking a title or a
    # price is skipped instead of shifting every later name/price pair.
    for item in items:
        titles = item.xpath("./div[@class='p-name p-name-type-2']/a/@title")
        prices = item.xpath("./div[@class='p-price']/strong/i/text()")
        if titles and prices:
            name.append(titles[0])
            price.append(prices[0])

    next_page = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'pn-next'))
    )
    next_page.click()
    return name, price, driver
def saveinfo(name, price, max_name_len=5):
    """Insert name/price pairs into the ``jd`` table of the ``lianxi`` database.

    Args:
        name: Iterable of product names.
        price: Iterable of prices, paired positionally with ``name``.
        max_name_len: Names are cut to this many characters before being
            stored. The original note warned that overly long strings make
            the insert fail; the default keeps the original limit of 5.

    Returns:
        None. Nothing is done (no connection is opened) when there are no
        pairs to store.

    NOTE(review): the table ``jd(name, price)`` must already exist — the
    script does not create it.
    """
    rows = [(str(n)[:max_name_len], str(p)) for n, p in zip(name, price)]
    if not rows:
        return

    conn = pymysql.connect(
        host='127.0.0.1', port=3306,
        user='root', password='123456',
        db='lianxi', charset='utf8MB4'
    )
    try:
        with conn.cursor() as cursor:
            # Let the driver quote the values: product titles come from a
            # web page and may contain quotes that would break (or inject
            # into) a statement built with string formatting.
            cursor.executemany(
                "insert into jd(name,price) values (%s,%s)", rows
            )
        conn.commit()
    finally:
        # Release the connection even when the insert fails.
        conn.close()
    return
def main(i=1):
    """Run the JD scrape: search, then save page after page of results.

    Args:
        i: Starting value of the page counter; after the first page is
            saved, further pages are processed while ``i <= 100``.

    The original called ``gethtml`` twice per step and threw the first
    result away. Because every ``gethtml`` call also clicks "next page",
    that silently discarded every other page. Each page is now scraped
    once and saved.
    """
    driver = getid()
    try:
        name, price, driver = gethtml(driver)
        saveinfo(name, price)
        while i <= 100:
            name, price = gethtml(driver)[:2]
            saveinfo(name, price)
            i += 1
    finally:
        # Always close the browser, including when a page fails to load
        # or the database insert raises.
        driver.quit()
# Script entry point: run the JD scrape only when executed directly.
if __name__ == "__main__":
    main()