First, the logic. The site being scraped is 情话网 (a Chinese love-quotes site).
1. Open the home page. The site loads synchronously: the HTML we need is already in the document response, so inspecting the Elements panel is enough. Use the element-picker arrow in DevTools to locate the tags.
Both the tag URL and the tag name sit under the [li a] elements (the entries further down too; check for yourself). Next comes the XPath matching, sketched right after the expressions below:
//ul[@class="tj_two"]/li/a/@href : tag URL
//ul[@class="tj_two"]/li/a/text() : tag name
I won't explain the expressions themselves; XPath tutorials are easy to find online.
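To make this step concrete, here is a minimal sketch that fetches the home page and applies the two expressions; it assumes only the URL and XPaths quoted in this article, everything else is illustrative:

import requests
from lxml import etree

# Fetch the home page (URL from this article) and parse the HTML
html = requests.get('http://www.ainicr.cn/tab/').text
doc = etree.HTML(html)

tag_urls = doc.xpath('//ul[@class="tj_two"]/li/a/@href')    # tag URLs
tag_names = doc.xpath('//ul[@class="tj_two"]/li/a/text()')  # tag names
for u, n in zip(tag_urls, tag_names):
    print(n, '->', u)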
2. Open a tag URL and, with the same method, locate each article on the page, then match it:
//div[@class="item"]/div/a/@href : article URL
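Continuing the step-1 sketch above (tag_urls comes from that snippet; index 0 is just an example choice), step 2 collects the article URLs on one tag page:

# Follow one tag URL from the step-1 sketch and grab its article links
tag_html = requests.get(tag_urls[0]).text
tag_doc = etree.HTML(tag_html)
article_urls = tag_doc.xpath('//div[@class="item"]/div/a/@href')  # article URLs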
3. Open an article URL and you will see the readers' comments. Find their location the same way and match:
//div[@class="stbody "]/a/p/text()|//div[@class="stbody first"]/a/p/text() : comment text
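And step 3, again continuing the sketch (article_urls comes from the step-2 snippet): the union operator | picks up both the first comment block (class "stbody first") and all later ones (class "stbody "):

# Pull every comment from one article page
article_doc = etree.HTML(requests.get(article_urls[0]).text)
comments = article_doc.xpath('//div[@class="stbody "]/a/p/text()'
                             '|//div[@class="stbody first"]/a/p/text()')
print(len(comments), 'comments found')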
Finally, loop through the levels, extract the data item by item, and write it into the corresponding file.
The code also does some input validation, so a user can run it interactively.
Full code:
loverPratter.py
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
import os

import requests
from lxml import etree
import pandas as pd
import numpy as np
'''-----------------------------'''
url = 'http://www.ainicr.cn/tab/'
path = 'myData'
'''-----------------------------'''


class SpiderLoverPratter():
    def get_data(self, url):
        # Fetch the home page; return its HTML, or None on a non-200 status
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            return None

    def get_type(self, urltyp):
        # Fetch one tag (category) page
        response = requests.get(urltyp)
        if response.status_code == 200:
            return response.text
        else:
            return None

    def get_character(self, urlCha):
        # Fetch one article page
        response = requests.get(urlCha)
        if response.status_code == 200:
            return response.text
        else:
            return None

    def get_xml(self, data):
        # Step 1: extract every tag URL and tag name from the home page
        xml = etree.HTML(data)
        xml_url = xml.xpath('//ul[@class="tj_two"]/li/a/@href')
        xml_typ = xml.xpath('//ul[@class="tj_two"]/li/a/text()')
        return xml_url, xml_typ

    def get_xmlTyp(self, data):
        # Step 2: extract every article URL from a tag page
        xmlTyp = etree.HTML(data)
        xmlTyp = xmlTyp.xpath('//div[@class="item"]/div/a/@href')
        return xmlTyp

    def get_xmlCha(self, data):
        # Step 3: extract the comment text from an article page; the first
        # comment uses class "stbody first", the rest use "stbody "
        xmlCha = etree.HTML(data)
        xmlCha = xmlCha.xpath('//div[@class="stbody "]/a/p/text()|//div[@class="stbody first"]/a/p/text()')
        return xmlCha

    def save_file(self, dataFile, pathFile):
        # Append one batch of comments to the CSV file for this tag
        dataFile = pd.DataFrame(dataFile)
        dataFile.to_csv(pathFile, mode='a', header=False, index=False, encoding='gb18030')

    def data_processing(self, ur, pathFile):
        # For one tag: fetch the tag page, then save the comments of every article on it
        res = self.get_type(ur)
        xmlTyp = self.get_xmlTyp(res)
        for ul in xmlTyp:
            urlCha = self.get_character(ul)
            xmlCha = self.get_xmlCha(urlCha)
            xmlCha = np.array(xmlCha)
            self.save_file(xmlCha, pathFile)

    def main(self, select):
        os.makedirs(path, exist_ok=True)  # make sure the output directory exists
        try:
            resData = self.get_data(url)
        except requests.RequestException:
            resData = self.get_data(url)  # retry once on a network error
        finally:
            xml_url, xml_typ = self.get_xml(resData)
        if select == '1':
            print('Downloading data......')
            for xu, xm in zip(xml_url, xml_typ):
                print('=====Type=====')
                # undo the mojibake in the tag name (see the encoding note below)
                xm = xm.encode('raw_unicode_escape').decode()
                print('---------------------- {} -------------------------'.format(xm))
                pathFile = path + '/' + xm + '.csv'
                # print(pathFile)
                self.data_processing(xu, pathFile)
            print('Download finished!!!')
        elif select == '0':
            xml_list = []
            for x in xml_typ:
                x = x.encode('raw_unicode_escape').decode()
                xml_list.append(x)
            print(xml_list)
            category = input('Please enter the category you want to download: ')
            print('Downloading data......')
            if category in xml_list:
                ca_index = xml_list.index(category)
                ca_url = xml_url[ca_index]
                print('---------------------- {} -------------------------'.format(category))
                pathFile = path + '/' + category + '.csv'
                # print(pathFile)
                self.data_processing(ca_url, pathFile)
                print('Download finished!!!')
            else:
                print('Please enter a valid category!!!')
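A note on the encode('raw_unicode_escape').decode() round-trips above: when a server does not declare a charset, requests falls back to ISO-8859-1, so the page's UTF-8 bytes come back as mojibake; re-encoding byte-for-byte and decoding again as UTF-8 undoes that. An alternative sketch (my assumption, not part of the original code) is to fix the encoding once on the response instead:

import requests

response = requests.get('http://www.ainicr.cn/tab/')
# Assumption: let requests sniff the real page encoding so that
# response.text decodes correctly and no later round-trip is needed
response.encoding = response.apparent_encoding
html = response.text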
myLoverPra.py
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
import sys

import loverPratter as lp

lovp = lp.SpiderLoverPratter()
print('----Enter 1 to download all categories----\n----Enter 0 to download a single category----')
sel = input('Please enter a number: ')
try:
    lovp.main(sel)
except Exception:
    sys.exit(1)  # exit with an error code if anything goes wrong
If anything is unclear, feel free to ask in the comments!!!