diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/get_html.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/get_html.json" index ef69031d00d166a077254d435a3dbd79fad67803..b876ddf4b546bfb9072eb373d83370f77464b1ac 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/get_html.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/get_html.json" @@ -1,17 +1,8 @@ { - "one_line": { - "urllib.request.urlopen": [ - "urllib.request" - ], - "response.read()": [ - "response.readline()" - ], - "buff.decode(\"utf8\")": [ - "buff.encode(\"utf8\")" - ] - }, - "source": "get_html.py", + "source": "get_html.md", "depends": [], "exercise_id": 198, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/get_html.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/get_html.md" new file mode 100644 index 0000000000000000000000000000000000000000..8a76313f72fbc70703178052f2e382c5df63e95f --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/get_html.md" @@ -0,0 +1,88 @@ +# urlib 获取网页(1) + +将 url 对应的网页下载到本地 + +```python +# -*- coding: UTF-8 -*- +import urllib.request + +def get_html(url): + # TODO(You): 请在此实现代码 + return html + +if __name__ == '__main__': + url = "http://www.baidu.com" + html = get_html(url) + print(html) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import urllib.request + +def get_html(url): + response = urllib.request.urlopen(url) + buff = response.read() + html = buff.decode("utf8") + return html + +if __name__ == '__main__': + url = "http://www.baidu.com" + html = get_html(url) + print(html) +``` + +## 答案 + +```python +def get_html(url): + response = urllib.request.urlopen(url) + buff = response.read() + html = buff.decode("utf8") + return html +``` + +## 选项 + +### A + +```python +def get_html(url): + response = urllib.request.urlopen(url) + buff = response.read() + html = buff.encode("utf8") + return html +``` + +### B + +```python +def get_html(url): + response = urllib.request.urlopen(url) + buff = response.readline() + html = buff.decode("utf8") + return html +``` + +### C + +```python +def get_html(url): + response = urllib.request(url) + buff = response.read() + html = buff.decode("utf8") + return html +``` + +### D + +```python +def get_html(url): + response = urllib.request.urlopen(url) + buff = response.read() + html = buff.decode() + return html +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/post.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/post.json" index 48ebed6348289798925f27130cdfe76a41282294..62918b06306bc5e7465bde924ec7b9d1326d20dd 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/post.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/post.json" @@ -1,13 +1,8 @@ { - "one_line": { - "bytes(urllib.parse.urlencode(data), encoding='utf8')": [ - "bytes(urllib.parse.urlencode(data))", - 
"bytes(data, encoding='utf8')", - "urllib.parse.urlencode(data)" - ] - }, - "source": "post.py", + "source": "post.md", "depends": [], "exercise_id": 202, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/post.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/post.md" new file mode 100644 index 0000000000000000000000000000000000000000..4b0036518b807c98e046ff1deabd0e38ce51ff10 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/post.md" @@ -0,0 +1,102 @@ +# urllib post请求 + +urllib post请求 + +```python +# -*- coding: UTF-8 -*- +import urllib.request +import urllib.parse + +def get_response(url, data): + # TODO(You): 请在此编写代码 + return result + +if __name__ == '__main__': + data = { + "key1": "value1", + "key2": "value2" + } + url = "http://httpbin.org/post" + html = get_response(url, data) + print(html) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import urllib.request +import urllib.parse + + +def get_response(url, data): + data = bytes(urllib.parse.urlencode(data), encoding='utf8') + response = urllib.request.urlopen(url, data=data) + buff = response.read() + result = buff.decode("utf8") + return result + +if __name__ == '__main__': + data = { + "key1": "value1", + "key2": "value2" + } + url = "http://httpbin.org/post" + html = get_response(url, data) + print(html) +``` + +## 答案 + +```python +def get_response(url, data): + data = bytes(urllib.parse.urlencode(data), encoding='utf8') + response = urllib.request.urlopen( + url, data=data + ) + buff = response.read() + result = buff.decode("utf8") + return result +``` + +## 选项 + +### A + +```python +def get_response(url, data): + data = bytes(urllib.parse.urlencode(data, encoding='utf8')) + response = urllib.request.urlopen( + url, data=data + ) + buff = response.read() + result = buff.decode("utf8") + return result +``` + +### B + +```python +def get_response(url, data): + data = bytes(urllib.parse.urlencode(data), encoding='utf8') + response = urllib.request.urlopen( + url, data + ) + buff = response.read() + result = buff.decode("utf8") + return result +``` + +### C + +```python +def get_response(url, data): + data = urllib.parse.urlencode(data, encoding='utf8') + response = urllib.request.urlopen( + url, data=data + ) + buff = response.read() + result = buff.decode("utf8") + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/with_headers.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/with_headers.json" index 71773ba6a8dbaae58f8177e86d397e9179c354fc..83da203ddb1c1134b76e9cd298ce8ac38accb18a 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/with_headers.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/with_headers.json" @@ -1,17 +1,8 @@ { - "one_line": { - "req.add_header(key, headers[key])": [ - "req.append(key, headers[key])" - ], - "urllib.request.urlopen(req)": [ - "urllib.request.urlopen(url)" - ], - "urllib.request.Request(url)": [ - "urllib.request.request(url)" - ] - }, - "source": "with_headers.py", + "source": "with_headers.md", "depends": [], 
"exercise_id": 247, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/with_headers.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/with_headers.md" new file mode 100644 index 0000000000000000000000000000000000000000..7d106a81bd37b6d96fd77e86bb7bb963e5485da6 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/1.urllib/with_headers.md" @@ -0,0 +1,100 @@ +# urlib 获取网页(2) with header + +将 url 对应的网页下载到本地 + +```python +# -*- coding: UTF-8 -*- +import urllib.request + +def get_html(url, headers): + # TODO(You): 请在此实现带头部信息的网页请求 + return html + +if __name__ == '__main__': + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" + } + url = "http://www.baidu.com" + html = get_html(url, headers) + print(html) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import urllib.request + +def get_html(url, headers=None): + req = urllib.request.Request(url) + if headers is not None: + for key in headers: + req.add_header(key, headers[key]) + response = urllib.request.urlopen(req) + buff = response.read() + html = buff.decode("utf8") + return html + +if __name__ == '__main__': + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" + } + url = "http://www.baidu.com" + html = get_html(url, headers) + print(html) +``` + +## 答案 + +```python +def get_html(url, headers): + req = urllib.request.Request(url) + for key in headers: + req.add_header(key, headers[key]) + response = urllib.request.urlopen(req) + buff = response.read() + html = buff.decode("utf8") + return html +``` + +## 选项 + +### A + +```python +def get_html(url, headers): + req = urllib.request.Request(url) + for key in headers: + urllib.request.add_header(key, headers[key]) + response = urllib.request.urlopen(req) + buff = response.read() + html = buff.decode("utf8") + return html +``` + +### B + +```python +def get_html(url, headers): + req = urllib.request.urlopen(url) + for key in headers: + req.add_header(key, headers[key]) + response = urllib.request.urlopen(req) + buff = response.read() + html = buff.decode("utf8") + return html +``` + +### C + +```python +def get_html(url, headers): + req = urllib.request.Request(url) + for key in headers: + req.set_header(key, headers[key]) + response = urllib.request.urlopen(req) + buff = response.read() + html = buff.decode("utf8") + return html +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/10.\345\212\250\346\200\201\346\270\262\346\237\223\351\241\265\351\235\242\347\210\254\345\217\226/dynamic_page.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/10.\345\212\250\346\200\201\346\270\262\346\237\223\351\241\265\351\235\242\347\210\254\345\217\226/dynamic_page.md" index a2c871ca0588c7d37e9e88d0aaf267c6de3a3435..e06ba22dc933fe3e256a8176ab4152718569c5f6 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/10.\345\212\250\346\200\201\346\270\262\346\237\223\351\241\265\351\235\242\347\210\254\345\217\226/dynamic_page.md" +++ 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/10.\345\212\250\346\200\201\346\270\262\346\237\223\351\241\265\351\235\242\347\210\254\345\217\226/dynamic_page.md" @@ -2,18 +2,17 @@ 现在想爬取一个url为下拉滚动的页面,下列选项可以爬取到下列页面内容的是: - - ## 答案 ```python +# -*- coding: UTF-8 -*- import time from selenium import webdriver from bs4 import BeautifulSoup driver = webdriver.Chrome() -driver.get(url); -Thread.sleep(1000); +driver.get(url) +Thread.sleep(1000) page_size = 10 for i in range(page_size): @@ -29,7 +28,7 @@ print(page.text) ### A -``` +```bash 以上均不正确 ``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/config.json" index 18ffd8561498a9ab51a1e00433918ce6d25153bc..ff1e1aed24a462dbbe47b1c46091482137c5058e 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/config.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/config.json" @@ -1,6 +1,7 @@ { "export": [ - "simulate_login.json" + "simulate_login.json", + "hello_simulate.json" ], "keywords": [], "children": [ diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.json" new file mode 100644 index 0000000000000000000000000000000000000000..b6fe8aff76e6fa240c348964e7349ce0b2f0627d --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.json" @@ -0,0 +1,8 @@ +{ + "author": "huanhuilong", + "source": "hello_simulate.md", + "depends": [], + "type": "code_options", + "notebook_enable": true, + "exercise_id": "237d7909392a48998437fdfe58ea3db4" +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.md" new file mode 100644 index 0000000000000000000000000000000000000000..8b5a0011e0e8becbc3ca38f1b5bf2d7b197d48ad --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.md" @@ -0,0 +1,70 @@ +# 模拟登陆例子 + +以下是一个使用 cookie 模拟登录请求页面的例子 + +```python +# -*- coding: UTF-8 -*- +import requests +import sys +import io + +if __name__ == "__main__": + # 登录后才能访问的网页 + url = 'http://www.csdn.net' + + # 浏览器登录后得到的cookie + cookie_str = r'xxx=yyy;zzz=mmm' + + # 把cookie字符串处理成字典,以便接下来使用 + # TODO(You): 请正确准备cookie数据 + + # 设置请求头 + headers = { + 'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36' + } + + # 在发送get请求时带上请求头和cookies + resp = requests.get( + url, + headers=headers, + cookies=cookies + ) + + 
print(resp.content.decode('utf-8')) +``` + +正确实现代码的是? + +## 答案 + +```python +cookies = {} +for line in cookie_str.split(';'): + key, value = line.split('=', 1) + cookies[key] = value +``` + +## 选项 + +### A + +```python +cookies = {} +for line in cookie_str.split(';'): + key, value = line.split('=', 1) + cookies[key] = line +``` + +### B + +```python +cookies = cookie_str.split(';') +``` + +### C + +```python +cookies = [] +for line in cookie_str.split(';'): + cookies.append([key,value]) +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.py" new file mode 100644 index 0000000000000000000000000000000000000000..1e7fe932fa530a92e1cb51cb967e0264b16f41e9 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/hello_simulate.py" @@ -0,0 +1,27 @@ +import requests +import sys +import io + + +if __name__ == "__main__": + # 登录后才能访问的网页 + url = 'csdn.net' + + # 浏览器登录后得到的cookie + cookie_str = r'xxx=yyy;zzz=mmm' + + # 把cookie字符串处理成字典,以便接下来使用 + cookies = {} + for line in cookie_str.split(';'): + key, value = line.split('=', 1) + cookies[key] = value + + # 设置请求头 + headers = { + 'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36' + } + + # 在发送get请求时带上请求头和cookies + # TODO(You): 请在此使用 cookie 登录请求页面 + + print(resp.content.decode('utf-8')) diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/simulate_login.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/simulate_login.md" index 7f5b7ce69d157ad61a6d041cc6707c1bad911ce4..702de539f83f240b6a316b4c2fe8c9dc0ca32a46 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/simulate_login.md" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/11.\346\250\241\346\213\237\347\231\273\345\275\225/simulate_login.md" @@ -2,11 +2,9 @@ 一些网站需要登录之后才能浏览网站的其他内容,爬虫需要拥有登录获取cookie/session的能力才能继续采集数据,以下关于说法错误的是: - - ## 答案 -``` +```bash 登录成功后获取的cookie一般来说永久有效 ``` @@ -14,18 +12,18 @@ ### A -``` +```bash 模拟登陆需要先注册网站的账号,或者多注册一些账号来维护一个cookies池 ``` ### B -``` +```bash 获取登录页面,可以从登录按钮处获取到登录的url ``` ### C -``` +```bash 登录成功后获取到cookie,其他请求带上cookie就可以获取到请求的页面资源 ``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese01.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese01.json" index 2b752addb02dd023540ed53694fd69481f1ae0eb..40674f6fdfdeda316f9bfa875e63a561f5e39d37 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese01.json" +++ 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese01.json" @@ -1,13 +1,8 @@ { - "one_line": { - "findall": [ - "find", - "finds", - "find_all" - ] - }, - "source": "chinese01.py", + "source": "chinese01.md", "depends": [], "exercise_id": 243, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese01.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese01.md" new file mode 100644 index 0000000000000000000000000000000000000000..a23bbc13a52ab533d01230675415ab089714498e --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese01.md" @@ -0,0 +1,68 @@ +# Python 中文处理(1) + +获取中文个数 + +```python +# -*- coding: UTF-8 -*- +import re + +def getnum_of_cn(inputdata): + '''计算字符串中 中文字符 数量''' + # TODO(You): 请编写正则查询代码 + return len(chi) + +def test(): + n = getnum_of_cn('你好,lajfldkjaklda123') + print(n) + +if __name__ == '__main__': + test() +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import re + + +def getnum_of_cn(inputdata): + '''计算字符串中 中文字符 数量''' + chi = re.findall(r'[\u4E00-\u9FFF]', inputdata) + return len(chi) + + +def test(): + n = getnum_of_cn('你好,lajfldkjaklda123') + print(n) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +chi = re.findall(r'[\u4E00-\u9FFF]', inputdata) +``` + +## 选项 + +### A + +```python +chi = re.find(r'[\u4E00-\u9FFF]', inputdata) +``` + +### B + +```python +chi = inputdata.findall(r'[\u4E00-\u9FFF]') +``` + +### C + +```python +chi = re.findall(r'\u4E00-\u9FFF', inputdata) +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese02.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese02.json" index ad50f37821f1c5f0b68705ad6fcd2b05b33ee20a..306626ace91effdce40d9fc6d8d1c301a85b78e5 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese02.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese02.json" @@ -1,13 +1,8 @@ { - "one_line": { - "search": [ - "searchall", - "match", - "find" - ] - }, - "source": "chinese02.py", + "source": "chinese02.md", "depends": [], "exercise_id": 219, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese02.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese02.md" new file mode 100644 index 
0000000000000000000000000000000000000000..5595b3f6453dff69554268521ba860b6bb247456 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/chinese02.md" @@ -0,0 +1,68 @@ +# Python 中文处理(2) + +获取中文个数 + +```python +# -*- coding: UTF-8 -*- +import re + +def search_text(inputdata): + '''search返回匹配到的一个''' + # TODO(You): 请在此实现代码 + return chi + +def test(): + n = search_text('你好,nlp先生!nlp先生!') + print(n) + +if __name__ == '__main__': + test() +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import re + + +def search_text(inputdata): + '''search返回匹配到的一个''' + chi = re.search('nlp', inputdata) + return chi + + +def test(): + n = search_text('你好,nlp先生!nlp先生!') + print(n) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +chi = re.search('nlp', inputdata) +``` + +## 选项 + +### A + +```python +chi = re.searchAll('nlp', inputdata) +``` + +### B + +```python +chi = re.search(inputdata, 'nlp') +``` + +### C + +```python +chi = inputdata.search('nlp') +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/find_ip_address.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/find_ip_address.json" index 08e0d8d0551082c9be86b9eb6319fc57b8209ee9..73b264be2d2658b2e8866218846c77914ee8c88a 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/find_ip_address.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/find_ip_address.json" @@ -1,13 +1,8 @@ { - "one_line": { - "findall": [ - "search", - "match", - "sub" - ] - }, - "source": "find_ip_address.py", + "source": "find_ip_address.md", "depends": [], "exercise_id": 181, - "type": "code_options" + "type": "code_options", + "author": "huanhuilong", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/find_ip_address.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/find_ip_address.md" new file mode 100644 index 0000000000000000000000000000000000000000..71b9596c31a789e7398bc629950587a210f7c5cf --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/find_ip_address.md" @@ -0,0 +1,127 @@ +# 正则表达式实战(2) + +查找字符串里含有的全部 IPV4 和 IPV6 地址 + +```python +# -*- coding: UTF-8 -*- +import re + +def find_all_ipv4(text): + result = [] + ipv4 = r"((\b25[0-5]|\b2[0-4][0-9]|\b[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3})" + + # TODO(You):请在此匹配ipv4 + + for m in ret: + result.append({'type': 'ipv4', 'value': m[0]}) + return result + +def find_all_ipv6(text): + result = [] + + ipv6 = 
r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))" + + # TODO(You): 请在此匹配ipv6 + + for m in ret: + result.append({'type': 'ipv6', 'value': m[0]}) + return result + +def find_all_ip(text): + result = find_all_ipv4(text) + find_all_ipv6(text) + return result + +if __name__ == '__main__': + input = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50' + results = find_all_ip(input) + for item in results: + print('type: {}, value: {}'.format(item['type'], item['value'])) +``` + +请选出下列能**正确**实现ipv4和ipv6正则匹配的选项。 + +## template + +```python +import re + + +def find_all_ip(text): + result = [] + + ipv4 = r"((\b25[0-5]|\b2[0-4][0-9]|\b[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3})" + ret = re.findall(ipv4, text) + for m in ret: + result.append({'type': 'ipv4', 'value': m[0]}) + + ipv6 = r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))" + ret = re.finditer(ipv6, text) + for m in ret: + result.append({'type': 'ipv6', 'value': m[0]}) + + return result + +if __name__ == '__main__': + input = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50' + results = find_all_ip(input) + for item in results: + print('type: {}, value: {}'.format(item['type'], item['value'])) +``` + +## 答案 + +```python +def find_all_ipv4(text): + ... + ret = re.findall(ipv4, text) + ... + +def find_all_ipv6(text): + ... + ret = re.finditer(ipv6, text) + ... +``` + +## 选项 + +### A + +```python +def find_all_ipv4(text): + ... + ret = re.findall(text, ipv4) + ... + +def find_all_ipv6(text): + ... + ret = re.finditer(text, ipv6) + ... +``` + +### B + +```python +def find_all_ipv4(text): + ... + ret = text.findall(ipv4) + ... + +def find_all_ipv6(text): + ... + ret = text.finditer(ipv6) + ... +``` + +### C + +```python +def find_all_ipv4(text): + ... + ret = re.search(ipv4, text) + ... + +def find_all_ipv6(text): + ... + ret = re.search(ipv6, text) + ... 
+``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/remove_html.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/remove_html.json" index c3847a2e316afc287031c972014a007eb86ca111..24f3bfe893f5b94216cc453f66e9fa10e6de2bb8 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/remove_html.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/remove_html.json" @@ -1,15 +1,8 @@ { - "one_line": { - "<[^>]+>": [ - "<.*>", - "<[^>]?>" - ], - ", re.S": [ - "" - ] - }, - "source": "remove_html.py", + "source": "remove_html.md", "depends": [], "exercise_id": 182, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/remove_html.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/remove_html.md" new file mode 100644 index 0000000000000000000000000000000000000000..240d1a099c08ec764f9286081364bbf1e8279f63 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/2.\346\255\243\345\210\231\350\241\250\350\276\276\345\274\217/remove_html.md" @@ -0,0 +1,100 @@ +# 正则表达式实战(1) + +去除html标签 + +```python +# -*- coding: UTF-8 -*- +import re +from typing import Text + +def remove_html(content): + # TODO(You): 请在此实现代码 + return result + +if __name__ == '__main__': + html = ''' + +
+<p>body 元素的内容会显示在浏览器中。</p>
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + Text = remove_html(html) + print(Text) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import re +from typing import Text + + +def remove_html(content): + pattern = re.compile(r'<[^>]+>', re.S) + result = pattern.sub('', content) + return result + + +def test(): + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + Text = remove_html(html) + print(Text) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def remove_html(content): + pattern = re.compile(r'<[^>]+>', re.S) + result = pattern.sub('', content) + return result +``` + +## 选项 + +### A + +```python +def remove_html(content): + pattern = re.compile(r'<[^>]+>') + result = pattern.sub('', content) + return result +``` + +### B + +```python +def remove_html(content): + pattern = re.compile(r'<[^>]+>', re.s) + result = pattern.sub('', content) + return result +``` + +### C + +```python +def remove_html(content): + pattern = re.compile(r'<[^>]+>', re.S) + result = re.sub(pattern, content) + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_p.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_p.json" index 9a12b54dcaa855facbf2951b9f9170fee1e1fac6..a7b862d45e3728ec0b0d84fcb3ed8ee6d6e67812 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_p.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_p.json" @@ -1,13 +1,8 @@ { - "one_line": { - "find_all": [ - "find", - "xpath", - "findall" - ] - }, - "source": "get_p.py", + "source": "get_p.md", "depends": [], "exercise_id": 204, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_p.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_p.md" new file mode 100644 index 0000000000000000000000000000000000000000..0c7c2297fb7459807fa2a139c102da9f20871f2c --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_p.md" @@ -0,0 +1,101 @@ +# BeautifulSoup 获取所有p标签 + +获取所有p标签里的文本 + +```python +# -*- coding: UTF-8 -*- +from bs4 import BeautifulSoup + +def fetch_p(html): + # TODO(You): 请在此实现代码 + return results + +if __name__ == '__main__': + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + p_text = fetch_p(html) + print(p_text) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +from bs4 import BeautifulSoup + + +def fetch_p(html): + soup = BeautifulSoup(html, 'lxml') + p_list = soup.find_all("p") + return [p.text for p in p_list] + + +def test(): + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + p_text = fetch_p(html) + print(p_text) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def fetch_p(html): + soup = BeautifulSoup(html, 'lxml') + p_list = soup.find_all("p") + results = [p.text for p in p_list] + return results +``` + +## 选项 + +### A + +```python +def fetch_p(html): + soup = BeautifulSoup(html, 'lxml') + p_list = soup.xpath("p") + results = [p.text for p in p_list] + return results +``` + +### B + +```python +def fetch_p(html): + soup = BeautifulSoup(html, 'lxml') + p_list = soup.findAll("p") + results = [p.text for p in p_list] + return results +``` + +### C + +```python +def fetch_p(html): + soup = BeautifulSoup(html, 'lxml') + results = soup.find_all("p") + return results +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_text.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_text.json" index 0176fb1485c72c916304c3a54860c6e4d74c07eb..19f499b09cb9890943d95b834c46b34e6fb5804f 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_text.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_text.json" @@ -1,13 +1,8 @@ { - "one_line": { - "text": [ - "text()", - "find_text()", - "all_text()" - ] - }, - "source": "get_text.py", + "source": "get_text.md", "depends": [], "exercise_id": 245, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_text.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_text.md" new file mode 100644 index 0000000000000000000000000000000000000000..55ac6dd25c76f39e236127c55d7bec3582e7ed39 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/get_text.md" @@ -0,0 +1,96 @@ +# BeautifulSoup 获取text + +获取网页的text + +```python +# -*- coding: UTF-8 -*- +from bs4 import BeautifulSoup + +def fetch_text(html): + # TODO(You): 请在此实现代码 + return result + +if __name__ == '__main__': + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + text = fetch_text(html) + print(text) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +from bs4 import BeautifulSoup + +def fetch_text(html): + soup = BeautifulSoup(html, 'lxml') + result = soup.text + return result + +def test(): + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + text = fetch_text(html) + print(text) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def fetch_text(html): + soup = BeautifulSoup(html, 'lxml') + result = soup.text + return result +``` + +## 选项 + +### A + +```python +def fetch_text(html): + soup = BeautifulSoup(html, 'lxml') + result = soup.find_all('text') + return result +``` + +### B + +```python +def fetch_text(html): + soup = BeautifulSoup(html, 'lxml') + result = soup.find_text() + return result +``` + +### C + +```python +def fetch_text(html): + soup = BeautifulSoup(html, 'lxml') + result = soup.text() + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/html_parer.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/html_parer.json" index 3d4d7e9091635f09a515269b4083cff0442195e8..d5bb8bd34d0fcdccbfaa270be6e7d0ee0b23f0bb 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/html_parer.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/html_parer.json" @@ -1,17 +1,8 @@ { - "one_line": { - "html.parser": [ - "html5" - ], - "'img'": [ - "'src'" - ], - "BeautifulSoup": [ - "beautifulsoup" - ] - }, - "source": "html_parer.py", + "source": "html_parer.md", "depends": [], "exercise_id": 226, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/html_parer.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/html_parer.md" new file mode 100644 index 0000000000000000000000000000000000000000..9fdc33200c4d9da8742cb2d9fe602e95a8a69289 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/3.Beautiful Soup/html_parer.md" @@ -0,0 +1,80 @@ +# BeautifulSoup + +查找网页里所有图片地址 + +```python +from bs4 import BeautifulSoup + +def fetch_imgs(html): + # TODO(You): 请在此实现代码 + return imgs + +def test(): + imgs = fetch_imgs( + 'body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + imgs = fetch_text(html) + print(imgs) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python + +from lxml import etree + + +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p[@class='item-1']/text()") + return result + + +def test(): + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + imgs = fetch_text(html) + print(imgs) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p[@class='item-1']/text()") + return result +``` + +## 选项 + +### A + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p[@class='item-2']/text()") + return result +``` + +### B + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p[class='item-1']/text()") + return result +``` + +### C + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p[@class='item-1']/text") + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_p.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_p.json" index 66e7ff99bae7ede987996cf416436f918604fa7c..c58b96312ac7b27e5d53d73d700feda961e8e9d4 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_p.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_p.json" @@ -1,13 +1,8 @@ { - "one_line": { - "//p/text()": [ - "p/text()", - "//p", - "p.text" - ] - }, - "source": "get_html_p.py", + "source": "get_html_p.md", "depends": [], "exercise_id": 191, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_p.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_p.md" new file mode 100644 index 0000000000000000000000000000000000000000..f636846e061127460710baed83433cbd9bf2eb29 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_p.md" @@ -0,0 +1,99 @@ +# lxml解析网页 + +使用xpath获取所有段落的文本 + +```python +# -*- coding: UTF-8 -*- +from lxml import etree + +def fetch_text(html): + # TODO(You): 请在此实现代码 + return result + +if __name__ == '__main__': + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + imgs = fetch_text(html) + print(imgs) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python + +from lxml import etree + + +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p/text()") + return result + + +def test(): + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + imgs = fetch_text(html) + print(imgs) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p/text()") + return result +``` + +## 选项 + +### A + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p/text") + return result +``` + +### B + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("/p/text()") + return result +``` + +### C + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//p.text()") + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_text.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_text.json" index e27888d143192ff367e5e724911c46d790eaa54f..f290da1519625d319427e4e55023b253fa61df84 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_text.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_text.json" @@ -1,17 +1,8 @@ { - "one_line": { - "etree": [ - "tree", - "btree" - ], - "//text()": [ - "text()", - "//text", - "/text()" - ] - }, - "source": "get_html_text.py", + "source": "get_html_text.md", "depends": [], "exercise_id": 220, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_text.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_text.md" new file mode 100644 index 0000000000000000000000000000000000000000..db21bce4bdb2b6d27c413fc8447515b550fb1a1f --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/4.lxml/get_html_text.md" @@ -0,0 +1,99 @@ +# lxml解析网页 + +使用xpath获取所有的文本 + +```python +# -*- coding: UTF-8 -*- +from lxml import etree + +def fetch_text(html): + # TODO(You): 请在此实现代码 + return result + +if __name__ == '__main__': + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + imgs = fetch_text(html) + print(imgs) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python + +from lxml import etree + + +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//text()") + return result + + +def test(): + html = ''' + + +body 元素的内容会显示在浏览器中。
+<p>title 元素的内容会显示在浏览器的标题栏中。</p>
+ + + ''' + imgs = fetch_text(html) + print(imgs) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//text()") + return result +``` + +## 选项 + +### A + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("/text()") + return result +``` + +### B + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("//text") + return result +``` + +### C + +```python +def fetch_text(html): + html = etree.HTML(html) + result = html.xpath("/text()") + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/get_html.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/get_html.json" index 15d25ac8e9131c88e937c699b36a76efbe2daab2..6937ace5f7e7a285a8b43657f343dc8e3cb6c6b2 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/get_html.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/get_html.json" @@ -1,13 +1,8 @@ { - "one_line": { - "get": [ - "post", - "gets", - "fetch" - ] - }, - "source": "get_html.py", + "source": "get_html.md", "depends": [], "exercise_id": 242, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/get_html.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/get_html.md" new file mode 100644 index 0000000000000000000000000000000000000000..6c36644f414529390df9200828de0a9becb24011 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/get_html.md" @@ -0,0 +1,76 @@ +# requests 获取网页(1) + +获取url对应的网页HTML + +```python +# -*- coding: UTF-8 -*- +import requests + +def get_html(url): + # TODO(You): 请在此实现代码 + return result + +if __name__ == '__main__': + url = "http://www.baidu.com" + html = get_html(url) + print(html) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import requests + + +def get_html(url): + response = requests.get(url=url) + return response.text + + +def test(): + url = "http://www.baidu.com" + html = get_html(url) + print(html) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def get_html(url): + response = requests.get(url=url) + result = response.text + return result +``` + +## 选项 + +### A + +```python +def get_html(url): + response = requests.get(url) + result = response.text + return result +``` + +### B + +```python +def get_html(url): + result = requests.get(url=url) + return result +``` + +### C + +```python +def get_html(url): + response = requests.get(url=url) + result = response.html + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/post.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/post.json" index 60af7a98c91b38754bfd8fa99344a6956873939d..ffdd38f843c3b46ea436a77c41fa467e8c36bb8b 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/post.json" +++ 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/post.json" @@ -1,16 +1,8 @@ { - "one_line": { - "post": [ - "get", - "posts" - ], - "response = requests.post(url, data, headers)": [ - "response = requests.post(url, headers, data)", - "response = requests.post(data, url, headers)" - ] - }, - "source": "post.py", + "source": "post.md", "depends": [], "exercise_id": 186, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/post.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/post.md" new file mode 100644 index 0000000000000000000000000000000000000000..b12c3c437d79b1b5e3434ba840416e8f5477d506 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/post.md" @@ -0,0 +1,90 @@ +# requests post 请求 + +requests post 请求 + +```python +# -*- coding: UTF-8 -*- +import requests + +def get_response(url, data, headers=None): + # TODO(You): 请在此实现代码 + return result + +if __name__ == '__main__': + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" + } + data = { + "key1": "value1", + "key2": "value2" + } + url = "http://httpbin.org/post" + html = get_response(url, data, headers) + print(html) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import requests + + +def get_response(url, data, headers=None): + response = requests.post(url, data, headers) + return response.text + + +def test(): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" + } + data = { + "key1": "value1", + "key2": "value2" + } + url = "http://httpbin.org/post" + html = get_response(url, data, headers) + print(html) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +def get_response(url, data, headers=None): + response = requests.post(url, data, headers) + result = response.text + return result +``` + +## 选项 + +### A + +```python +def get_response(url, data, headers=None): + response = requests.get(url, headers, data) + result = response.text + return result +``` + +### B + +```python +def get_response(url, data, headers=None): + result = requests.post(url, data, headers) + return result +``` + +### C + +```python +def get_response(url, data, headers=None): + response = requests.post(url, data, headers) + result = response.text() + return result +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/with_headers.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/with_headers.json" index 09ed80641dbe65c69228a51ecf42b09e6229784b..04c3d960309a21adbde500c2fce9b8b0776f3034 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/with_headers.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/with_headers.json" @@ -1,14 +1,8 @@ { - "one_line": { - "response.text": [ - "response.text()", - "response.gettext()", - "response.get_text()", - "response" - ] - }, - "source": "with_headers.py", + "source": "with_headers.md", "depends": [], 
"exercise_id": 210, - "type": "code_options" + "type": "code_options", + "author": "zxm2015", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/with_headers.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/with_headers.md" new file mode 100644 index 0000000000000000000000000000000000000000..91597f26b89e21a66a51526aef9a8dff712cd09b --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/5.requests/with_headers.md" @@ -0,0 +1,78 @@ +# requests 获取网页(2) with headers + +将url对应的网页下载到本地 + +```python +# -*- coding: UTF-8 -*- +import requests + +def get_html(url, headers=None): + response = requests.get(url=url) + return response.text + +if __name__ == '__main__': + # TODO(You): 请正确编写 headers + headers = ... + url = "http://www.baidu.com" + html = get_html(url, headers) + print(html) +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import requests + + +def get_html(url, headers=None): + response = requests.get(url=url) + return response.text + + +def test(): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" + } + url = "http://www.baidu.com" + html = get_html(url, headers) + print(html) + +if __name__ == '__main__': + test() +``` + +## 答案 + +```python +headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" +} +``` + +## 选项 + +### A + +```python +headers = { + "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" +} +``` + +### B + +```python +headers = { + "useragent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" +} +``` + +### C + +```python +headers = [ + "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36" +] +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/config.json" index 4eef6b903ceea7685452af378bae59f3f0a27f12..32ae6044b5246a40b5e7e5fae27e182fdc2767b1 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/config.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/config.json" @@ -1,6 +1,7 @@ { "export": [ - "selenium.json" + "selenium.json", + "hello_selenium.json" ], "keywords": [], "children": [ diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.json" new file mode 100644 index 0000000000000000000000000000000000000000..27f69d5a90d0b012f8792b7b17d7ad975cc9b270 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.json" @@ -0,0 +1,8 @@ +{ + "author": "huanhuilong", + "source": "hello_selenium.md", + "depends": [], + "type": "code_options", + "notebook_enable": false, + "exercise_id": 
"8b4b78b2b9f84b5f8cd6fbb7fe85c3d0" +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.md" new file mode 100644 index 0000000000000000000000000000000000000000..3dccca8c8195fd119481a008c267b8b2b5e96195 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.md" @@ -0,0 +1,138 @@ +# selenium 测试用例 + +Selenium 是web自动化测试工具集,爬虫可以利用其实现对页面动态资源的采集。请按顺序操作 + +1. 安装 Python Selenium 包:`pip install selenium` +2. 安装 Chrome 驱动:`https://npm.taobao.org/mirrors/chromedriver/`,如果使用别的浏览器需要下载对应浏览器的驱动 +3. 编写使用 python unittest 测试使用 selenium 完成自动化 + +selenium 自动化网页测试的操作: + +1. 使用 selenium 的Chrome 驱动,打开 CSDN 首页,此时会打开 Chrome 浏览器测试页面 +2. 验证字符串 "CSDN" 在页面标题 +3. 找到网页里的搜索框 +4. 输入"OpenCV技能树" +5. 输入回车,搜索结果 +6. 等待10秒退出 + +代码框架如下: + +```python +# -*- coding: UTF-8 -*- +import unittest +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +import time + +class PythonOrgSearch(unittest.TestCase): + + def setUp(self): + self.driver = webdriver.Chrome() + + def test_search_in_python_org(self): + # TODO(You): 请正确实现浏览器自动化测试需求 + time.sleep(10) + + def tearDown(self): + self.driver.close() + +if __name__ == "__main__": + unittest.main() +``` + +以下代码实现正确的是? + +## template + +```python +import unittest +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +import time + + +class PythonOrgSearch(unittest.TestCase): + + def setUp(self): + self.driver = webdriver.Chrome() + + def test_search_in_python_org(self): + driver = self.driver + + driver.get("https://www.csdn.net/") + self.assertIn("CSDN", driver.title) + + elem = driver.find_element_by_id("toolbar-search-input") + elem.send_keys("OpenCV 技能树") + elem.send_keys(Keys.RETURN) + assert "No results found." not in driver.page_source + time.sleep(10) + + def tearDown(self): + self.driver.close() + + +if __name__ == "__main__": + unittest.main() + +``` + +## 答案 + +```python +def test_search_in_python_org(self): + driver = self.driver + driver.get("https://www.csdn.net/") + self.assertIn("CSDN", driver.title) + + elem = driver.find_element_by_id("toolbar-search-input") + elem.send_keys("OpenCV 技能树") + elem.send_keys(Keys.RETURN) + assert "No results found." not in driver.page_source + time.sleep(10) +``` + +## 选项 + +### A + +```bash +def test_search_in_python_org(self): + driver = self.driver + driver.get("https://www.csdn.net/") + self.assertIn("CSDN", driver.title) + + elem = driver.find_element_by_name("toolbar-search-input") + elem.send_keys("OpenCV 技能树") + elem.send_keys(Keys.RETURN) + assert "No results found." not in driver.page_source + time.sleep(10) +``` + +### B + +```bash +def test_search_in_python_org(self): + driver = self.driver + driver.get("https://www.csdn.net/") + self.assertIn("CSDN", driver.title) + + elem = driver.find_element_by_id("toolbar-search-input") + elem.send_keys("OpenCV 技能树") + assert "No results found." not in driver.page_source + time.sleep(10) +``` + +### C + +```bash +def test_search_in_python_org(self): + driver = self.driver + driver.get("https://www.csdn.net/") + self.assertIn("CSDN", driver.title) + + elem = driver.find_element_by_id("toolbar-search-input") + elem.send_keys(Keys.RETURN) + assert "No results found." 
not in driver.page_source + time.sleep(10) +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.py" new file mode 100644 index 0000000000000000000000000000000000000000..b2795dbb3a662f25bddf45e6887eb9415148063c --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/hello_selenium.py" @@ -0,0 +1,29 @@ +import unittest +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +import time + + +class PythonOrgSearch(unittest.TestCase): + + def setUp(self): + self.driver = webdriver.Chrome() + + def test_search_in_python_org(self): + driver = self.driver + + driver.get("https://www.csdn.net/") + self.assertIn("CSDN", driver.title) + + elem = driver.find_element_by_id("toolbar-search-input") + elem.send_keys("OpenCV 技能树") + elem.send_keys(Keys.RETURN) + assert "No results found." not in driver.page_source + time.sleep(10) + + def tearDown(self): + self.driver.close() + + +if __name__ == "__main__": + unittest.main() diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/selenium.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/selenium.md" index d5af05ba47fa21a4afe7bbd9c3c08e284e9cf594..70ae3cc1f6dda8a130b9e6cc160b1593585c0609 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/selenium.md" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/6.Selenium/selenium.md" @@ -2,11 +2,9 @@ Selenium是web自动化测试工具集,爬虫可以利用其实现对页面动态资源的采集,对于其这种说法错误的是: - - ## 答案 -``` +```bash selenium和requests一样,都能用来采集数据,具有同等的速度 ``` @@ -14,18 +12,18 @@ selenium和requests一样,都能用来采集数据,具有同等的速度 ### A -``` -页面执行js才能呈现的内容,可以使用selenium来协助采集 +```bash +页面执行 js 才能呈现的内容,可以使用 selenium 来协助采集 ``` ### B -``` +```bash selenium本质是驱动浏览器来发送请求,模拟浏览器的行为 ``` ### C -``` +```bash 请求之后往往需要等待一段时间,等待资源加载渲染完成 ``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/so_tag_spider.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/so_tag_spider.json" index 8728c473262451b6cb3cbc2e246b7b1b722dba68..58750f6a28b740c5e9b3856e320034962c4e33c6 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/so_tag_spider.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/so_tag_spider.json" @@ -1,19 +1,10 @@ { - "one_line": { - "if self.page_count < self.totgal_pages:": [ - "if self.page_count <= self.totgal_pages:" - ], - "callback=self.parse": [ - "callback=parse" - ], - "yield": [ - "return" - ] - }, - "source": "so_tag_spider.py", + "source": "so_tag_spider.md", "depends": [ "tag_pipeline.py" ], "exercise_id": 206, - "type": "code_options" + "type": "code_options", + "author": "huanhuilong", + "notebook_enable": true } \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/so_tag_spider.md" 
"b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/so_tag_spider.md" new file mode 100644 index 0000000000000000000000000000000000000000..22b4bdbc43613f090b8732c2ce43985dd5c464d6 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/so_tag_spider.md" @@ -0,0 +1,165 @@ +# Python 爬虫 + +爬取 stackoverflow 标签 + +```python +# -*- coding: UTF-8 -*- +import scrapy +from scrapy.crawler import CrawlerProcess +from scrapy.settings import Settings + +BASE_DIR = __loader__.name + +class StackOverflowTagSpider(scrapy.Spider): + # 爬虫名字 + name = "stackoverflow_tags" + + # 爬虫运行的域名 + allowed_domains = ["stackoverflow.com"] + + # 爬虫开始爬取的第1个页面 + start_urls = ['https://stackoverflow.com/tags/synonyms?page=1'] + + # 爬虫配置,ITEM_PIPELINES指定每个条目的处理类 + custom_settings = { + 'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301}, + 'LOG_LEVEL': 'INFO' + } + + def __init__(self): + self.total_pages = 45 + self.page_count = 0 + + def parse(self, response): + # 访问的页面数+1,使用CSS查询页面内的标签文本 + self.page_count += 1 + tags = response.css('.post-tag::text') + for tag in tags: + yield {'name': tag.get()} + + # 找到页面底部的页码,访问下一页 + # TODO(You): 请正确实现访问下一页代码 + +if __name__ == "__main__": + settings = Settings() + process = CrawlerProcess() + process.crawl(StackOverflowTagSpider) + process.start() +``` + +请选出下列能**正确**实现这一功能的选项。 + +## template + +```python +import scrapy +from scrapy.crawler import CrawlerProcess +from scrapy.settings import Settings + +BASE_DIR = __loader__.name + + +class StackOverflowTagSpider(scrapy.Spider): + name = "stackoverflow_tags" + allowed_domains = ["stackoverflow.com"] + start_urls = ['https://stackoverflow.com/tags/synonyms?page=1'] + custom_settings = { + 'ITEM_PIPELINES': {f'{BASE_DIR}.TagPipeline': 301}, + 'LOG_LEVEL': 'INFO' + } + + def __init__(self): + self.totgal_pages = 45 + self.page_count = 0 + + def parse(self, response): + self.page_count += 1 + tags = response.css('.post-tag::text') + for tag in tags: + yield {'name': tag.get()} + + if self.page_count < self.totgal_pages: + next_page_list = response.css('a.js-pagination-item::attr(href)') + if len(next_page_list) > 0: + next_page_item = next_page_list[len(next_page_list)-1] + next_page = next_page_item.get() + print('next_page:', next_page) + yield response.follow(next_page, callback=self.parse, dont_filter=True) + +if __name__ == "__main__": + settings = Settings() + process = CrawlerProcess() + process.crawl(StackOverflowTagSpider) + process.start() +``` + +## 答案 + +```python +if self.page_count < self.total_pages: + next_page_list = response.css('a.js-pagination-item::attr(href)') + if len(next_page_list) > 0: + next_page = next_page_list[len(next_page_list)-1].get() + yield response.follow(next_page, callback=self.parse, dont_filter=True) +``` + +## 选项 + +### A + +```python +if self.page_count < self.total_pages: + next_page_list = response.css('a.js-pagination-item::attr(href)') + if len(next_page_list) > 0: + next_page = next_page_list[len(next_page_list)-1].get() + return response.follow(next_page, callback=self.parse, dont_filter=True) +``` + +### B + +```python +if self.page_count < self.total_pages: + next_page_list = response.css('a.js-pagination-item::attr(href)') + if len(next_page_list) > 0: + next_page = next_page_list[len(next_page_list)-1] + yield response.follow(next_page, callback=self.parse, dont_filter=True) +``` + +### C + +```python +if self.page_count <= 
+    next_page_list = response.css('a.js-pagination-item::attr(href)')
+    if len(next_page_list) > 0:
+        next_page = next_page_list[len(next_page_list)-1].get()
+        yield response.follow(next_page, callback=self.parse, dont_filter=True)
+```
+
+### D
+
+```python
+next_page_list = response.css('a.js-pagination-item::attr(href)')
+if len(next_page_list) > 0:
+    next_page = next_page_list[len(next_page_list)-1].get()
+    yield response.follow(next_page, callback=self.parse, dont_filter=True)
+```
+
+### E
+
+```python
+if self.page_count < self.total_pages:
+    next_page_list = response.xpath('a.js-pagination-item::attr(href)')
+    if len(next_page_list) > 0:
+        next_page = next_page_list[len(next_page_list)-1].get()
+        yield response.follow(next_page, callback=self.parse, dont_filter=True)
+```
+
+### F
+
+```python
+if self.page_count < self.total_pages:
+    next_page_list = response.css('a.js-pagination-item::attr(href)')
+    if len(next_page_list) > 0:
+        next_page = next_page_list[len(next_page_list)-1].get()
+        yield response.next(next_page, callback=self.parse, dont_filter=True)
+```
diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/tag_pipeline.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/tag_pipeline.json"
index 97a43e63d53481f132693ba19237983ea7c348f7..8df0f717f2a308e180d3767ab2d451daea78a0a6 100644
--- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/tag_pipeline.json"
+++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/tag_pipeline.json"
@@ -1,20 +1,8 @@
 {
-    "one_line": {
-        "if self.count > 0:": [
-            "if self.count >= 0:"
-        ],
-        "process_item(self, item, spider)": [
-            "process_item(self, spider, item)"
-        ],
-        "self.file.close()": [
-            ""
-        ],
-        ", 'w')": [
-            ", 'r')"
-        ]
-    },
-    "source": "tag_pipeline.py",
+    "source": "tag_pipeline.md",
     "depends": [],
     "exercise_id": 187,
-    "type": "code_options"
+    "type": "code_options",
+    "author": "huanhuilong",
+    "notebook_enable": true
 }
\ No newline at end of file
diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/tag_pipeline.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/tag_pipeline.md"
new file mode 100644
index 0000000000000000000000000000000000000000..7dffc830118d8cabbff1961ee289c103526f290a
--- /dev/null
+++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/7.Scrapy\346\241\206\346\236\266/tag_pipeline.md"
@@ -0,0 +1,137 @@
+# Python 爬虫(1)
+
+实现一个在 scrapy 爬虫中保存 stackoverflow 标签数据的管道处理类(pipeline),本例的管道类需要实现以下 3 个方法
+
+1. `open_spider(self, spider)`
+2. `process_item(self, item, spider)`
+3. `close_spider(self, spider)`
+
+本例最终输出的 json 文件格式请参考[stackoverflow.tag.json](https://codechina.csdn.net/csdn/csdn-tags/-/blob/master/src/dataset/stackoverflow.tag.json)
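+
+即写出的是一个 JSON 数组。下面是一段可独立运行的示意代码(标签名仅为举例),演示与本题 `'[\n'`、`'\n]'` 首尾写入相配合的目标文件内容:
+
+```python
+import json
+
+# 两个去重后的标签,按 "逗号+换行" 拼接后写出的最终文件内容
+tags = [{'name': 'c++'}, {'name': 'python'}]
+body = ',\n'.join(' ' + json.dumps(t, ensure_ascii=False) for t in tags)
+print('[\n' + body + '\n]')
+```
+
+```python
+# -*- coding: UTF-8 -*-
+import json
+
+class StackOverflowTagPipeline(object):
+    def open_spider(self, spider):
+        ''' 打开文件并写入'[\n' 到 json 文件'''
+        self.file = open('/tmp/stackoverflow.tag.json', 'w')
+        self.file.write('[\n')
+        self.count = 0
+        self.tags = {}
+
+    def process_item(self, item, spider):
+        ''' 写入一个 {"name":xxx} 格式的元素,注意逗号拼接 '''
+
+        # 去重
+        if self.tags.get(item['name']) is not None:
+            return
+        self.tags[item['name']] = True
+
+        # TODO:(You): 请正确实现拼接json写入的代码
+        result = ...
+
+        # 写入拼接文本
+        self.file.write(result)
+        self.count += 1
+
+    def close_spider(self, spider):
+        ''' 写入'\n]' 并关闭文件 '''
+        self.file.write('\n]')
+        self.file.close()
+
+if __name__ == "__main__":
+    t = StackOverflowTagPipeline()
+    t.open_spider(None)
+    t.process_item({'name': 'c++'}, None)
+    t.close_spider(None)
+```
+
+以下对 json 拼接写入处理正确的代码是?
+
+## template
+
+```python
+import json
+
+
+class StackOverflowTagPipeline(object):
+    def open_spider(self, spider):
+        ''' 打开文件并写入'[\n' 到 json 文件'''
+        self.file = open('/tmp/stackoverflow.tag.json', 'w')
+        self.file.write('[\n')
+        self.count = 0
+        self.tags = {}
+
+    def process_item(self, item, spider):
+        ''' 写入一个 {"name":xxx} 格式的元素,注意逗号拼接 '''
+        if self.tags.get(item['name']) is not None:
+            return
+        self.tags[item['name']] = True
+
+        words = []
+        if self.count > 0:
+            words.append(',\n')
+        words.append(' ')
+        words.append(json.dumps(item, ensure_ascii=False).strip())
+        result = ''.join(words)
+
+        self.file.write(result)
+        self.count += 1
+
+    def close_spider(self, spider):
+        ''' 写入'\n]' 并关闭文件 '''
+        self.file.write('\n]')
+        self.file.close()
+
+if __name__ == "__main__":
+    t = StackOverflowTagPipeline()
+    t.open_spider(None)
+    t.process_item({'name': 'c++'}, None)
+    t.close_spider(None)
+```
+
+## 答案
+
+```python
+words = []
+if self.count > 0:
+    words.append(',\n')
+words.append(' ')
+words.append(json.dumps(item, ensure_ascii=False).strip())
+result = ''.join(words)
+```
+
+## 选项
+
+### A
+
+```python
+words = []
+if self.count > 0:
+    words.append(',\n')
+words.append(' ')
+words.append(item)
+result = ''.join(words)
+```
+
+### B
+
+```python
+words = []
+words.append(',\n')
+words.append(' ')
+words.append(json.dumps(item, ensure_ascii=False).strip())
+result = ''.join(words)
+```
+
+### C
+
+```python
+words = []
+if self.count > 0:
+    words.append(',\n')
+words.append(' ')
+words.append(json.dumps(item, ensure_ascii=False).strip())
+result = words
+```
diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/config.json"
index d9326ffba6cfe3ec3167f5de67e1533e8c3bb69e..c8cb25594e3c851d8658a411eeb164b7c97bd2b2 100644
--- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/config.json"
+++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/config.json"
@@ -1,6 +1,7 @@
 {
     "export": [
-        "pyspider.json"
+        "pyspider.json",
+        "hello_pyspider.json"
     ],
     "keywords": [],
     "children": [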
"a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.json" new file mode 100644 index 0000000000000000000000000000000000000000..06f6ec16581e1b354d6121e3527a62c3c25b3aad --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.json" @@ -0,0 +1,8 @@ +{ + "author": "huanhuilong", + "source": "hello_pyspider.md", + "depends": [], + "type": "code_options", + "notebook_enable": false, + "exercise_id": "ed92d5e3360a4dabb6dfa3b408768083" +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.md" new file mode 100644 index 0000000000000000000000000000000000000000..9fd4485c1ad4030acb176680b5d488101fc282f5 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.md" @@ -0,0 +1,68 @@ +# pyspider 例子 + +以下是一个 PySpider 的示范例子代码 + +```python +# -*- coding: UTF-8 -*- +from pyspider.libs.base_handler import * + +class Handler(BaseHandler): + crawl_config = { + } + + @every(minutes=24 * 60) + def on_start(self): + self.crawl('http://scrapy.org/', callback=self.index_page) + + @config(age=10 * 24 * 60 * 60) + def index_page(self, response): + for each in response.doc('a[href^="http"]').items(): + self.crawl(each.attr.href, callback=self.detail_page) + + def detail_page(self, response): + return { + "url": response.url, + "title": response.doc('title').text(), + } +``` + +以下关于上述代码说法正确的是? + +## 答案 + +```bash +全部都正确 +``` + +## 选项 + +### A + +```python +def on_start(self): + '''该函数是入口函数,pyspider 命令启动 run 之后会调用该入口函数''' +``` + +### B + +```python +# 添加了一个爬虫任务到PySpider, +# 回调函数调用了 self.index_page 成员方法 +self.crawl(url, callback=self.index_page) +``` + +### C + +```python +def index_page(self, response): + '''该函数的 response 参数是一个 Response* 对象. + 它通过了一组类似jQuery 的 API 用来查询和提取网页数据。 + ''' +``` + +### D + +```python +def detail_page(self, response): + '''该函数返回一个字典对象. 返回值会被 resultdb 捕获. 
''' +``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.py" new file mode 100644 index 0000000000000000000000000000000000000000..fceb7c7a167fb0d1e6a3cb6c0b681eb6fe87812f --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/hello_pyspider.py" @@ -0,0 +1,21 @@ +from pyspider.libs.base_handler import * + + +class Handler(BaseHandler): + crawl_config = { + } + + @every(minutes=24 * 60) + def on_start(self): + self.crawl('http://scrapy.org/', callback=self.index_page) + + @config(age=10 * 24 * 60 * 60) + def index_page(self, response): + for each in response.doc('a[href^="http"]').items(): + self.crawl(each.attr.href, callback=self.detail_page) + + def detail_page(self, response): + return { + "url": response.url, + "title": response.doc('title').text(), + } diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/pyspider.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/pyspider.md" index 0f323a82887e2179cf117153cbd2e19d6658433e..d6447f43df0a66232430176e13cfa3f03e7e0068 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/pyspider.md" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/8.pyspider\346\241\206\346\236\266\347\232\204\344\275\277\347\224\250/pyspider.md" @@ -1,31 +1,29 @@ # pyspider -Pyspider与Scrapy都可以用来爬取数据,关于他们的说法错误的是: - - +Pyspider 与 Scrapy 都可以用来爬取数据,关于他们的说法错误的是: ## 答案 -``` -Scrapy提供了web界面,可以用来调试部署 +```bash +Scrapy 提供了 web 界面,可以用来调试部署 ``` ## 选项 ### A -``` -Pyspider提供了web界面,可以进行可视化调试 +```bash +Pyspider 提供了 web 界面,可以进行可视化调试 ``` ### B -``` -初学者如果想快速入门爬取一个新闻网站,推荐使用Pyspider +```bash +初学者如果想快速入门爬取一个新闻网站,推荐使用 Pyspider ``` ### C -``` -Scrapy的可扩展程度更高,主要用来应对一些复杂的爬取场景 +```bash +Scrapy 的可扩展程度更高,主要用来应对一些复杂的爬取场景 ``` diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/code.png" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/code.png" new file mode 100644 index 0000000000000000000000000000000000000000..066c69ed75efb28dfc3f6bf421dd51ec9e89e01d Binary files /dev/null and "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/code.png" differ diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/config.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/config.json" index 
879b98dfb27709927b984f1458b488fb2fbe1248..fbec9acfb149cf2d03ed7f00bfed10020fdd8a5a 100644 --- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/config.json" +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/config.json" @@ -1,6 +1,7 @@ { "export": [ - "verification_code.json" + "verification_code.json", + "hello_paddle.json" ], "keywords": [], "children": [ diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.json" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.json" new file mode 100644 index 0000000000000000000000000000000000000000..5ee34b60c38e7215f90d7cd3d316f523b4a779b8 --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.json" @@ -0,0 +1,8 @@ +{ + "author": "huanhuilong", + "source": "hello_paddle.md", + "depends": [], + "type": "code_options", + "notebook_enable": false, + "exercise_id": "d925c57963714c1da1268ab4e4680f98" +} \ No newline at end of file diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.md" new file mode 100644 index 0000000000000000000000000000000000000000..c01adaf463ce4b780ea83efe9c6e3607aa0987fa --- /dev/null +++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.md" @@ -0,0 +1,69 @@ +# 爬虫验证码识别 + +使用百度 paddle ocr 库可以识别验证码 + +1. 安装paddle:`pip install paddlepaddle==2.1.0` +2. 安装paddle ocr: `pip install paddleocr==2.0.6` +3. 编写代码 + +```python +# -*- coding: UTF-8 -*- +import re +from paddleocr import PaddleOCR + +if __name__ == "__main__": + ocr_client = PaddleOCR( + use_angle_cls=True, + lang="ch", + use_space_char=True, + use_zero_copy_run=True, + use_mp=True, + total_process_num=16, + ir_optim=True, + enable_mkldnn=True, + rec_batch_num=1, + max_batch_size=1 + ) + result = ocr_client.ocr('code.png', det=True, rec=True, cls=True) + code_text = [] + for line in result: + print(line) + # TODO(You): 请正确提取文本 + text = ... + code_text.append(text) + print(code_text) +``` + +其中 line 的打印例子是: + +```bash +[[[881.0, 77.0], [1128.0, 56.0], [1161.0, 439.0], [914.0, 460.0]], ('6', 0.97982866)] +``` + +以下正确提取`text`的是? 
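+
+补充说明(概括自 PaddleOCR 的常见用法,细节以官方文档为准):`ocr()` 调用里的三个开关分别对应识别流水线的三个阶段,如下示意:
+
+```python
+# det=True:检测图片中文本所在的区域,返回四个顶点坐标
+# rec=True:识别每个文本区域内的文字内容,并附带置信度
+# cls=True:启用方向分类器,校正旋转的文字
+result = ocr_client.ocr('code.png', det=True, rec=True, cls=True)
+```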
+
+## 答案
+
+```python
+text = line[1][0]
+```
+
+## 选项
+
+### A
+
+```python
+text = line[0][1]
+```
+
+### B
+
+```python
+text = line[0][0]
+```
+
+### C
+
+```python
+text = line[1][1]
+```
diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.py" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.py"
new file mode 100644
index 0000000000000000000000000000000000000000..c0f673d60729c0a248e4dc0847ecffa8433dc342
--- /dev/null
+++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/hello_paddle.py"
@@ -0,0 +1,23 @@
+
+import re
+from paddleocr import PaddleOCR
+
+if __name__ == "__main__":
+    ocr_client = PaddleOCR(
+        use_angle_cls=True,
+        lang="ch",
+        use_space_char=True,
+        use_zero_copy_run=True,
+        use_mp=True,
+        total_process_num=16,
+        ir_optim=True,
+        enable_mkldnn=True,
+        rec_batch_num=1,
+        max_batch_size=1
+    )
+    result = ocr_client.ocr('code.png', det=True, rec=True, cls=True)
+    code_text = []
+    for line in result:
+        print(line)
+        code_text.append(line[1][0])
+    print(code_text)
diff --git "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/verification_code.md" "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/verification_code.md"
index 81ea22fd8d2a9ce6091b91fb9fe95b321d2ffc16..710bcb7f794840847465fe5b9daef10170c074f9 100644
--- "a/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/verification_code.md"
+++ "b/data/2.python\344\270\255\351\230\266/3.\347\275\221\347\273\234\347\210\254\350\231\253/9.\351\252\214\350\257\201\347\240\201\345\244\204\347\220\206/verification_code.md"
@@ -2,11 +2,9 @@
 
 验证码是用来区分人和机器的一种方式,以下关于验证码的说法错误的是:
 
-
-
 ## 答案
 
-```
+```bash
 验证码的识别是一个老话题,已经做到了100%的识别率
 ```
 
@@ -14,18 +12,18 @@
 
 ### A
 
-```
+```bash
 验证码的种类繁多,包括中英混合,点选,滑动等等
 ```
 
 ### B
 
-```
+```bash
 验证码识别要使用到OCR(Optical Character Recognition)技术
 ```
 
 ### C
 
-```
+```bash
 对于有难度的验证码,可以对接打码平台或者第三方平台提供的识别服务
 ```
diff --git a/main.py b/main.py
index ea14d4644f66b2280e3e98e0390dc9fdb8ffbb9a..2e308f0ebb7765c1fa7be797772c66e6a69b2d3f 100644
--- a/main.py
+++ b/main.py
@@ -9,5 +9,5 @@ if __name__ == '__main__':
     walker = TreeWalker("data", "python", "python")
     walker.walk()
-    # md = MDWalker('data/2.python中阶/2.Web应用开发')
+    # md = MDWalker('data/2.python中阶/3.网络爬虫')
     #     md.walk()