Following up on the previous post in this series (python: 爬虫系列-01, crawler series part 01), I went ahead with the idea described there and built a crawler that, starting from a single entry URL, crawls out all the links on that page and then all the links inside those linked pages.
- Based on python3
- Topics involved: HTTP requests / data encoding / string handling / traversal / recursion
Known issues:
- It is not very polished: the crawled links are not saved locally, only printed out.
- Some links cannot be accessed and come back with 403 or other status codes.
- Some pages are not UTF-8 encoded, so decoding them as utf-8 fails (see the sketch after this list for one possible workaround).
- I catch all of these exceptions and set a request timeout of timeout=10 seconds.
- As a result, the links collected here are not all of the links, only all of the links that this crawler can actually reach.
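For the encoding problem, one possible workaround (not part of the code below) is to read the charset declared in the response's Content-Type header and fall back to a lenient decode instead of assuming utf-8; this is only a minimal sketch, and read_html is a hypothetical helper name:

from urllib import request

def read_html(url, timeout=10):
    """Fetch a page and decode it with the declared charset, falling back to a lenient UTF-8 decode."""
    resp = request.urlopen(url, timeout=timeout)
    raw = resp.read()
    # get_content_charset() reads the charset from the Content-Type header, or returns None
    charset = resp.headers.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset)
    except (UnicodeDecodeError, LookupError):
        # last resort: keep the page instead of dropping it
        return raw.decode('utf-8', errors='replace')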
Since one entry page leads to many links, and each child link in turn contains many more links, a local test run of 3 hours still had not finished. There is also no way to verify whether any links were crawled more than once.
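Because the links are only printed and never saved, one simple way to keep partial results (and to check afterwards whether anything was crawled twice) would be to append every visited URL to a local file; save_url below is a hypothetical helper, not one of the three files listed next:

def save_url(url, path="crawled_urls.txt"):
    """Append one crawled URL per line so partial results survive an interrupted run."""
    with open(path, "a", encoding="utf-8") as f:
        f.write(url + "\n")

# afterwards, comparing the line count with the size of the de-duplicated set
# shows whether any URL was visited more than once:
# lines = open("crawled_urls.txt", encoding="utf-8").read().splitlines()
# print(len(lines), len(set(lines)))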
The crawler implementation is fairly simple and the collected data is not saved locally. It consists of only three files:
- common_var.py
- http_file.py
- url_collections.py

Among them, url_collections.py is the entry file of the crawler; the crawl is also started from there.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file : common_var.py
# @author : cat
# @date : 2017/6/25.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}
if __name__ == '__main__':
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file : http_file.py
# @author : cat
# @date : 2017/6/24.
from urllib import request
import ssl
from web.common_var import headers
import re
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
csdn = 'http://www.csdn.com'
def get_urls(url_in=csdn, key="href="):
"""
通过一个入口的URL爬取其中的全部的URL
:param url_in: 入口的URL
:param key: 'href='
:return: urls set !
"""
url_sets = set()
ssl_context = ssl._create_unverified_context()
req = request.Request(url_in, headers=headers)
resp_bytes = ""
try:
        # timeout: 10 seconds
resp_bytes = request.urlopen(req, timeout=10, context=ssl_context)
# print(resp_bytes.getcode())
        if resp_bytes.getcode() != 200:
            # return the empty set instead of None so callers can still union/iterate it
            return url_sets
for line in resp_bytes:
line_html = ""
try:
line_html = line.decode('utf-8')
            except UnicodeDecodeError:
                # line_html is still empty at this point, so report the raw bytes instead
                print("can not decode utf8: ", line)
# print(line_html)
if key in line_html:
# print(line_html)
index = line_html.index(key)
                # text after href= with double quotes turned into '#', so split[1] is the quoted URL
                split = line_html[index + len(key):].replace('"', "#").split('#')
sub_url = None
try:
sub_url = split[1] if len(split) > 1 else None
except IndexError:
print("error in : ", split, len(split))
pass
match = False
try:
match = regex.search(sub_url)
except TypeError:
print(sub_url, type(sub_url))
if match:
# print(match.group())
# yield match.group()
url_sets.add(match.group())
# print(url_sets)
    except Exception as e:
        # network, HTTP, and SSL errors for the whole request end up here
        print("urlopen error: ", url_in, e)
return url_sets
if __name__ == '__main__':
print(list(get_urls("http://news.baidu.com/?tn=news")))
# baidu_news = "http://news.baidu.com/?tn=news"
# urls_collected = set()
# urls = get_urls(baidu_news)
# # print(urls)
# for u in urls:
# print(" ", u)
#
# print("total url size in {} = {}"
# .format(baidu_news, len(urls)))
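As an aside, the href= string splitting above is fragile: it finds at most one link per line and depends on the attribute being wrapped in double quotes. The standard-library html.parser could extract href attributes more robustly; this is only a sketch of an alternative, not what the crawl below actually uses:

from html.parser import HTMLParser

class HrefCollector(HTMLParser):
    """Collect the href attribute of every <a> tag into a set."""

    def __init__(self):
        super().__init__()
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the start tag
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.add(value)

# usage: feed the decoded page text, then keep only absolute URLs with the same regex as above
# collector = HrefCollector()
# collector.feed(html_text)
# absolute_urls = {h for h in collector.hrefs if regex.search(h)}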
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file : url_collections.py
# @author : cat
# @date : 2017/6/25.
from web.http_file import get_urls
from web.http_file import csdn
class UrlCollection:
    def __init__(self, spidered_urls=None, collected_urls=None):
        # avoid mutable default arguments so every instance gets its own sets
        self.spidered_urls = spidered_urls if spidered_urls is not None else set()
        self.collected_urls = collected_urls if collected_urls is not None else set()

    def collect(self, url_in):
        if url_in not in self.spidered_urls:
            urls = get_urls(url_in) or set()
            self.collected_urls = self.collected_urls | urls
            self.spidered_urls.add(url_in)
            print(url_in, len(self.collected_urls))
            # recurse into every newly found URL; visited URLs are skipped by the check above
            for u in urls:
                self.collect(u)
if __name__ == '__main__':
spider = UrlCollection()
    spider.collect(csdn)
pass
Sample output (the crawl was not finished):
http://www.csdn.com 307
http://g.csdn.net/5272869 381
http://hardware.csdn.net/themes/zone/hardware/css/01mod-nav.css 381
http://huiyi.csdn.net/activity/product/goods_list?project_id=1628 392
can not decode utf8:
http://csdnimg.cn/public/favicon.ico 392
··· # there is a lot more output after this; it cannot all be copied here...
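One more note: because UrlCollection.collect calls itself for every newly found link, a deep enough link graph will eventually hit Python's recursion limit. A queue-based (breadth-first) variant avoids that; this is only a sketch reusing get_urls from http_file.py, and max_pages is a made-up parameter for capping a test run:

from collections import deque
from web.http_file import get_urls, csdn

def collect_iterative(start_url, max_pages=None):
    """Breadth-first crawl with an explicit queue instead of recursion."""
    spidered, collected = set(), set()
    queue = deque([start_url])
    while queue:
        if max_pages is not None and len(spidered) >= max_pages:
            break
        url = queue.popleft()
        if url in spidered:
            continue
        spidered.add(url)
        urls = get_urls(url) or set()
        collected |= urls
        print(url, len(collected))
        # only enqueue URLs that have not been visited yet
        queue.extend(u for u in urls if u not in spidered)
    return collected

# if __name__ == '__main__':
#     collect_iterative(csdn, max_pages=50)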
The original purpose of this exercise was simply to get some hands-on practice with python.