Following up on the previous post in this series (python: 爬虫系列-01, crawler series part 01), I went ahead with the idea described there and built a crawler that, starting from a single entry URL, crawls out all the links on that page and then all the links inside those linked pages.
- Based on python3
- Topics involved: HTTP requests / data encoding / string handling / traversal / recursion
Known issues:
- It is not very polished: the crawled links are not saved locally, only printed out.
- Some links cannot be accessed and come back with 403 or other status codes.
- Some pages are not UTF-8 encoded, so decoding them as utf-8 fails (see the sketch after this list for one possible workaround).
- I catch all of these exceptions and set a request timeout of timeout=10 seconds.
- As a result, the links collected here are not all of the links, only all of the links that this crawler can actually reach.
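For the encoding problem, one possible workaround (not part of the code below) is to read the charset declared in the response's Content-Type header and fall back to a lenient decode instead of assuming utf-8; this is only a minimal sketch, and read_html is a hypothetical helper name:

from urllib import request

def read_html(url, timeout=10):
    """Fetch a page and decode it with the declared charset, falling back to a lenient UTF-8 decode."""
    resp = request.urlopen(url, timeout=timeout)
    raw = resp.read()
    # get_content_charset() reads the charset from the Content-Type header, or returns None
    charset = resp.headers.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset)
    except (UnicodeDecodeError, LookupError):
        # last resort: keep the page instead of dropping it
        return raw.decode('utf-8', errors='replace')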
Since one entry page leads to many links, and each child link in turn contains many more links, a local test run of 3 hours still had not finished. There is also no way to verify whether any links were crawled more than once.
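Because the links are only printed and never saved, one simple way to keep partial results (and to check afterwards whether anything was crawled twice) would be to append every visited URL to a local file; save_url below is a hypothetical helper, not one of the three files listed next:

def save_url(url, path="crawled_urls.txt"):
    """Append one crawled URL per line so partial results survive an interrupted run."""
    with open(path, "a", encoding="utf-8") as f:
        f.write(url + "\n")

# afterwards, comparing the line count with the size of the de-duplicated set
# shows whether any URL was visited more than once:
# lines = open("crawled_urls.txt", encoding="utf-8").read().splitlines()
# print(len(lines), len(set(lines)))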
The crawler implementation is fairly simple and the collected data is not saved locally. It consists of only three files:
- common_var.py
- http_file.py
- url_collections.py

Among them, url_collections.py is the entry file of the crawler; the crawl is also started from there.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file : common_var.py
# @author : cat
# @date : 2017/6/25.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}
if __name__ == '__main__':
pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file : http_file.py
# @author : cat
# @date : 2017/6/24.
from urllib import request
import ssl
from web.common_var import headers
import re
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
csdn = 'http://www.csdn.com'
def get_urls(url_in=csdn, key="href="):
"""
通过一个入口的URL爬取其中的全部的URL
:param url_in: 入口的URL
:param key: 'href='
:return: urls set !
"""
url_sets = set()
ssl_context = ssl._create_unverified_context()
req = request.Request(url_in, headers=headers)
resp_bytes = ""
try:
        # timeout: 10 seconds
resp_bytes = request.urlopen(req, timeout=10, context=ssl_context)
# print(resp_bytes.getcode())
        if resp_bytes.getcode() != 200:
            # return the empty set instead of None so callers can still union/iterate it
            return url_sets
for line in resp_bytes:
line_html = ""
try:
line_html = line.decode('utf-8')
            except UnicodeDecodeError:
                # line_html is still empty at this point, so report the raw bytes instead
                print("can not decode utf8: ", line)
# print(line_html)
if key in line_html:
# print(line_html)
index = line_html.index(key)
                # text after href= with double quotes turned into '#', so split[1] is the quoted URL
                split = line_html[index + len(key):].replace('"', "#").split('#')
sub_url = None
try:
sub_url = split[1] if len(split) > 1 else None
except IndexError:
print("error in : ", split, len(split))
pass
match = False
try:
match = regex.search(sub_url)
except TypeError:
print(sub_url, type(sub_url))
if match:
# print(match.group())
# yield match.group()
url_sets.add(match.group())
# print(url_sets)
    except Exception as e:
        # network, HTTP, and SSL errors for the whole request end up here
        print("urlopen error: ", url_in, e)
return url_sets
if __name__ == '__main__':
print(list(get_urls("http://news.baidu.com/?tn=news")))
# baidu_news = "http://news.baidu.com/?tn=news"
# urls_collected = set()
# urls = get_urls(baidu_news)
# # print(urls)
# for u in urls:
# print(" ", u)
#
# print("total url size in {} = {}"
# .format(baidu_news, len(urls)))
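As an aside, the href= string splitting above is fragile: it finds at most one link per line and depends on the attribute being wrapped in double quotes. The standard-library html.parser could extract href attributes more robustly; this is only a sketch of an alternative, not what the crawl below actually uses:

from html.parser import HTMLParser

class HrefCollector(HTMLParser):
    """Collect the href attribute of every <a> tag into a set."""

    def __init__(self):
        super().__init__()
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the start tag
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hrefs.add(value)

# usage: feed the decoded page text, then keep only absolute URLs with the same regex as above
# collector = HrefCollector()
# collector.feed(html_text)
# absolute_urls = {h for h in collector.hrefs if regex.search(h)}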
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @file : url_collections.py
# @author : cat
# @date : 2017/6/25.
from web.http_file import get_urls
from web.http_file import csdn
class UrlCollection:
    def __init__(self, spidered_urls=None, collected_urls=None):
        # avoid mutable default arguments so every instance gets its own sets
        self.spidered_urls = spidered_urls if spidered_urls is not None else set()
        self.collected_urls = collected_urls if collected_urls is not None else set()

    def collect(self, url_in):
        if url_in not in self.spidered_urls:
            urls = get_urls(url_in) or set()
            self.collected_urls = self.collected_urls | urls
            self.spidered_urls.add(url_in)
            print(url_in, len(self.collected_urls))
            # recurse into every newly found URL; visited URLs are skipped by the check above
            for u in urls:
                self.collect(u)
if __name__ == '__main__':
spider = UrlCollection()
    spider.collect(csdn)
pass
Sample output (the crawl was not finished):
http://www.csdn.com 307
http://g.csdn.net/5272869 381
http://hardware.csdn.net/themes/zone/hardware/css/01mod-nav.css 381
http://huiyi.csdn.net/activity/product/goods_list?project_id=1628 392
can not decode utf8:
http://csdnimg.cn/public/favicon.ico 392
··· # there is a lot more output after this; it cannot all be copied here...
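One more note: because UrlCollection.collect calls itself for every newly found link, a deep enough link graph will eventually hit Python's recursion limit. A queue-based (breadth-first) variant avoids that; this is only a sketch reusing get_urls from http_file.py, and max_pages is a made-up parameter for capping a test run:

from collections import deque
from web.http_file import get_urls, csdn

def collect_iterative(start_url, max_pages=None):
    """Breadth-first crawl with an explicit queue instead of recursion."""
    spidered, collected = set(), set()
    queue = deque([start_url])
    while queue:
        if max_pages is not None and len(spidered) >= max_pages:
            break
        url = queue.popleft()
        if url in spidered:
            continue
        spidered.add(url)
        urls = get_urls(url) or set()
        collected |= urls
        print(url, len(collected))
        # only enqueue URLs that have not been visited yet
        queue.extend(u for u in urls if u not in spidered)
    return collected

# if __name__ == '__main__':
#     collect_iterative(csdn, max_pages=50)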
The original purpose of this exercise was simply to get some hands-on practice with python.