Python+Flask实现全国、全球疫情大数据可视化(一):爬取疫情数据并保存至mysql数据库

相关文章

Python+Flask实现全国、全球疫情大数据可视化(二):网页页面布局+echarts可视化中国地图、世界地图、柱状图和折线图
Python+Flask实现全国、全球疫情大数据可视化(三):ajax读取mysql中的数据并将参数传递至echarts表格中


2021.8.1更新:之前使用的百度疫情数据接口现在已经不可用了,所以这里我将数据接口换成了腾讯的。

一、实现效果

最近简单学习了一下flask,决定来做一个疫情大数据的网页出来。
话不多说先上效果图。还是比较喜欢这样的排版的。
在这里插入图片描述

二、数据获取地址

数据来源于腾讯提供的api接口(早期版本使用的百度接口现已失效)。进入腾讯疫情数据页面,就能看到国内疫情与国外疫情两个内容,然后在网页中去找接口。接口我已经找到了,如下:
腾讯疫情数据页面

# Domestic (China) epidemic data endpoint
url='https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare'
# Overseas (foreign countries) epidemic data endpoint
url2='https://api.inews.qq.com/newsqa/v1/automation/modules/list?modules=FAutoforeignList'

在这里插入图片描述

我们爬虫的编写思路就是分别爬取国内和国外的数据。
关于国内的数据,从上图可以看到,有四个参数chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare分别是每天汇总的疫情总数据、每天新增的数据、现存确诊统计数据、各省份现存确诊数据。爬虫代码如下

# -*- coding: utf-8 -*-
# @Time    :2021/7/30 23:34
# @Author  :lzh
# @File    : new_spider.py
# @Software: PyCharm
import datetime

import pandas as pd
import requests
from sqlalchemy import create_engine

from translate import COUNTRIES_CH_EN_DICT


def traslate(word):
    """
    Convert a Chinese country name to its English name.

    The name is looked up in ``COUNTRIES_CH_EN_DICT``; a name that is not in
    the table yields the placeholder string "未知地区".
    """
    try:
        return COUNTRIES_CH_EN_DICT[word]
    except KeyError:
        return "未知地区"


# %%
def save_data(df, table_name, if_exists="append", need_translate=False):
    """
    Write a DataFrame to a table of the local MySQL database.

    :param df: DataFrame to store. When ``need_translate`` is true a ``name``
        column is added to this same object in place, so the caller sees the
        new column as well.
    :param table_name: name of the destination table.
    :param if_exists: forwarded unchanged to ``DataFrame.to_sql``.
    :param need_translate: when true, fill ``df['name']`` by applying
        ``traslate`` to the ``疫情地区`` column before writing.
    :return: None
    """
    if need_translate:
        df['name'] = df['疫情地区'].apply(traslate)
    # NOTE(review): host, database name and credentials are hard-coded in this
    # URL; edit it to match your own MySQL instance.
    engine = create_engine('mysql://root:123456@localhost:3306/myspider?charset=utf8')
    try:
        # index=False keeps the DataFrame index out of the table (the original
        # passed index=None, which was only falsy by accident).
        df.to_sql(table_name, con=engine, if_exists=if_exists, index=False)
    finally:
        # A fresh engine is built on every call; release its connection pool
        # so repeated calls do not accumulate open connections.
        engine.dispose()


def crawl_china_data():
    """
    Download the domestic epidemic modules from the Tencent news API.

    :return: tuple ``(provinceCompare, chinaDayList, chinaDayAddList)`` read
        from the ``data`` object of the JSON response. An element is ``None``
        when the response does not contain that module.
    :raises requests.RequestException: on timeout, connection failure or a
        non-success HTTP status.
    """
    url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare"
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, timeout=10)
    # Fail here with a clear HTTP error instead of later on a missing key.
    response.raise_for_status()
    # "data" can be absent or null; `or {}` covers both so .get() below is safe.
    data = response.json().get("data") or {}
    # Original author's notes on the modules:
    provinceCompare = data.get("provinceCompare")  # per-province figures (updated daily)
    chinaDayList = data.get("chinaDayList")  # nationwide cumulative figures, most recent month
    chinaDayAddList = data.get("chinaDayAddList")  # nationwide daily new figures, most recent month
    return provinceCompare, chinaDayList, chinaDayAddList


# %%
def crawl_countries_data():
    """
    Download the per-country epidemic list from the Tencent news API.

    :return: the ``FAutoforeignList`` value from the ``data`` object of the
        JSON response, or an empty list when it is missing.
    :raises requests.RequestException: on timeout, connection failure or a
        non-success HTTP status.
    """
    url = "https://api.inews.qq.com/newsqa/v1/automation/modules/list?modules=FAutoforeignList"
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # The original called .get() on the result of .get("data"), which raised
    # AttributeError whenever "data" was absent or null.
    data = response.json().get("data") or {}
    return data.get("FAutoforeignList", [])


def parse_china_daily_data(day_list, day_add_list):
    """
    Parse the nationwide daily cumulative and daily new records and store them.

    Each record list is turned into a DataFrame whose ``date`` column is
    rebuilt as a datetime from the ``y`` and ``date`` fields joined by ".".
    The cumulative frame is appended to table ``china_day`` and the daily-new
    frame to table ``china_day_add``.

    :param day_list: records of the daily cumulative figures.
    :param day_add_list: records of the daily new figures.
    :return: None
    """
    frames = []
    # Build both frames first so nothing is written if either input is bad.
    for records in (day_list, day_add_list):
        frame = pd.DataFrame(records)
        frame["date"] = pd.to_datetime(frame["y"] + "." + frame["date"])
        frames.append(frame)
    totals_df, additions_df = frames
    save_data(totals_df, "china_day", "append")
    save_data(additions_df, "china_day_add", "append")


def parse_countries_total_data(api_rtn_data):
    """
    Parse the per-country records and store them in ``world_epidemic``.

    :param api_rtn_data: iterable of dicts, one per country. The keys read
        are ``name``, ``y`` (year), ``date`` ("<month>.<day>"),
        ``nowConfirm``, ``heal``, ``dead`` and ``confirmAdd``.
    :return: the DataFrame that was saved. ``save_data`` is called with
        ``need_translate=True``, so it also carries the ``name`` column
        added there.
    :raises ValueError: if a record's year/month/day do not form a valid date.
    """
    dates = []
    countries = []
    dignose = []
    heal = []
    dead = []
    add = []
    for country in api_rtn_data:
        countries.append(country.get("name", ""))
        month, day = country.get("date").split(".")
        # int() already accepts zero-padded text such as "07". The previous
        # zero-stripping step turned "10" into "0", so datetime.date() raised
        # ValueError for every October record.
        date = datetime.date(int(country.get("y")), int(month), int(day))
        dates.append(date)
        dignose.append(country.get("nowConfirm", 0))
        heal.append(country.get("heal", 0))
        dead.append(country.get("dead", 0))
        add.append(country.get("confirmAdd", 0))
    df = pd.DataFrame({
        "疫情地区": countries,
        "日期": dates,
        "确诊": dignose,
        "治愈": heal,
        "死亡": dead,
        # NOTE(review): this column is labelled "new deaths" but is filled
        # from confirmAdd (new confirmed). The key is left unchanged because
        # readers of the table may depend on it; confirm before renaming.
        "新增死亡": add
    })

    # "replace" drops and recreates the table on every run.
    save_data(df, "world_epidemic", "replace", True)
    return df


def parse_provinces_total_data(api_rtn_data):
    """
    Parse the per-province records and append them to ``china_total_epidemic``.

    :param api_rtn_data: mapping of province name to a dict. The keys read
        from each dict are ``nowConfirm``, ``heal``, ``dead`` and
        ``confirmAdd``; a missing key counts as 0.
    :return: the DataFrame that was saved.
    """
    # Take the timestamp once so every row of this snapshot carries the same
    # value. The original called now() per row, giving each province a
    # slightly different time and making rows of one run hard to group.
    snapshot_time = datetime.datetime.now()
    dates = []
    provinces = []
    dignose = []
    heal = []
    dead = []
    add = []
    for province, total_data in api_rtn_data.items():
        provinces.append(province)
        dates.append(snapshot_time)
        dignose.append(total_data.get("nowConfirm", 0))
        heal.append(total_data.get("heal", 0))
        dead.append(total_data.get("dead", 0))
        add.append(total_data.get("confirmAdd", 0))
    df = pd.DataFrame({
        "疫情地区": provinces,
        "日期": dates,
        "确诊": dignose,
        "治愈": heal,
        "死亡": dead,
        # NOTE(review): this column is labelled "new deaths" but is filled
        # from confirmAdd (new confirmed). The key is left unchanged because
        # readers of the table may depend on it; confirm before renaming.
        "新增死亡": add
    })
    save_data(df, 'china_total_epidemic')
    return df


# %%
def main():
    """
    Run the whole crawl.

    Fetches the domestic data, stores the nationwide daily tables and the
    per-province table, then fetches and stores the per-country table.
    """
    province_compare, day_totals, day_additions = crawl_china_data()
    parse_china_daily_data(day_totals, day_additions)
    parse_provinces_total_data(province_compare)
    foreign_records = crawl_countries_data()
    parse_countries_total_data(foreign_records)


# Script entry point: run the crawl, then print a completion message.
if __name__ == '__main__':
    main()
    print("爬取完成")

注意:外国国家名称转换为英文

我们需要爬的数据是中国国内数据与全球国家的数据。由于可视化时需要用到echarts绘制世界地图,而爬取到的各国家名称是中文,下面需要将中文转换为英文
转换字典如下:

# Chinese country/region name -> English name used when drawing the world map.
# The English names follow the abbreviated map-region spelling (e.g.
# "Czech Rep.", "Dem. Rep. Congo"), so they must match the map data exactly.
COUNTRIES_CH_EN_DICT = {
    "索马里": "Somalia",
    "列支敦士登": "Liechtenstein",
    "摩洛哥": "Morocco",
    "西撒哈拉": "W. Sahara",
    "塞尔维亚": "Serbia",
    "阿富汗": "Afghanistan",
    "安哥拉": "Angola",
    "阿尔巴尼亚": "Albania",
    "安道尔共和国": "Andorra",
    "阿拉伯联合酋长国": "United Arab Emirates",
    "阿根廷": "Argentina",
    "亚美尼亚": "Armenia",
    "澳大利亚": "Australia",
    "奥地利": "Austria",
    "阿塞拜疆": "Azerbaijan",
    "布隆迪": "Burundi",
    "比利时": "Belgium",
    "贝宁": "Benin",
    "布基纳法索": "Burkina Faso",
    "孟加拉国": "Bangladesh",
    "保加利亚": "Bulgaria",
    "巴林": "Bahrain",
    "巴哈马": "Bahamas",
    "波斯尼亚和黑塞哥维那": "Bosnia and Herz.",
    "白俄罗斯": "Belarus",
    "伯利兹": "Belize",
    "百慕大": "Bermuda",
    "玻利维亚": "Bolivia",
    "巴西": "Brazil",
    "巴巴多斯": "Barbados",
    "文莱": "Brunei",
    "不丹": "Bhutan",
    "博茨瓦纳": "Botswana",
    "中非": "Central African Rep.",
    "加拿大": "Canada",
    "瑞士": "Switzerland",
    "智利": "Chile",
    "中国": "China",
    # Fixed: the apostrophe was missing ("Côte dIvoire").
    "科特迪瓦": "Côte d'Ivoire",
    "喀麦隆": "Cameroon",
    "刚果民主共和国": "Dem. Rep. Congo",
    "刚果": "Congo",
    "哥伦比亚": "Colombia",
    "佛得角": "Cape Verde",
    "哥斯达黎加": "Costa Rica",
    "古巴": "Cuba",
    "北塞浦路斯": "N. Cyprus",
    "塞浦路斯": "Cyprus",
    "捷克": "Czech Rep.",
    "德国": "Germany",
    "吉布提": "Djibouti",
    "丹麦": "Denmark",
    # Fixed: 多米尼加 is the Dominican Republic; Dominica is 多米尼克.
    "多米尼加": "Dominican Rep.",
    "多米尼克": "Dominica",
    "阿尔及利亚": "Algeria",
    "厄瓜多尔": "Ecuador",
    "埃及": "Egypt",
    "厄立特里亚": "Eritrea",
    "西班牙": "Spain",
    "爱沙尼亚": "Estonia",
    "埃塞俄比亚": "Ethiopia",
    "芬兰": "Finland",
    "斐济": "Fiji",
    "法国": "France",
    "加蓬": "Gabon",
    "英国": "United Kingdom",
    "格鲁吉亚": "Georgia",
    "加纳": "Ghana",
    "几内亚": "Guinea",
    "冈比亚": "Gambia",
    "几内亚比绍": "Guinea-Bissau",
    "赤道几内亚": "Eq. Guinea",
    "希腊": "Greece",
    "格林纳达": "Grenada",
    "格陵兰": "Greenland",
    "危地马拉": "Guatemala",
    "关岛": "Guam",
    "圭亚那": "Guyana",
    "洪都拉斯": "Honduras",
    "克罗地亚": "Croatia",
    "海地": "Haiti",
    "匈牙利": "Hungary",
    "印度尼西亚": "Indonesia",
    "印度": "India",
    "英属印度洋领土": "Br. Indian Ocean Ter.",
    "爱尔兰": "Ireland",
    "伊朗": "Iran",
    "伊拉克": "Iraq",
    "冰岛": "Iceland",
    "以色列": "Israel",
    "意大利": "Italy",
    "牙买加": "Jamaica",
    "约旦": "Jordan",
    "日本": "Japan",
    "锡亚琴冰川": "Siachen Glacier",
    "哈萨克斯坦": "Kazakhstan",
    "肯尼亚": "Kenya",
    # The misspelled key is kept for backward compatibility; the correctly
    # spelled name is added alongside it.
    "吉尔吉斯坦": "Kyrgyzstan",
    "吉尔吉斯斯坦": "Kyrgyzstan",
    "柬埔寨": "Cambodia",
    "韩国": "Korea",
    "科威特": "Kuwait",
    "老挝": "Lao PDR",
    "黎巴嫩": "Lebanon",
    "利比里亚": "Liberia",
    "利比亚": "Libya",
    "斯里兰卡": "Sri Lanka",
    "莱索托": "Lesotho",
    "立陶宛": "Lithuania",
    "卢森堡": "Luxembourg",
    "拉脱维亚": "Latvia",
    "摩尔多瓦": "Moldova",
    "马达加斯加": "Madagascar",
    "墨西哥": "Mexico",
    "马其顿": "Macedonia",
    "马里": "Mali",
    "马耳他": "Malta",
    "缅甸": "Myanmar",
    "黑山": "Montenegro",
    "蒙古": "Mongolia",
    "莫桑比克": "Mozambique",
    "毛里塔尼亚": "Mauritania",
    "毛里求斯": "Mauritius",
    "马拉维": "Malawi",
    "马来西亚": "Malaysia",
    "纳米比亚": "Namibia",
    "新喀里多尼亚": "New Caledonia",
    "尼日尔": "Niger",
    "尼日利亚": "Nigeria",
    "尼加拉瓜": "Nicaragua",
    "荷兰": "Netherlands",
    "挪威": "Norway",
    "尼泊尔": "Nepal",
    "新西兰": "New Zealand",
    "阿曼": "Oman",
    "巴基斯坦": "Pakistan",
    "巴拿马": "Panama",
    "秘鲁": "Peru",
    "菲律宾": "Philippines",
    "巴布亚新几内亚": "Papua New Guinea",
    "波兰": "Poland",
    "波多黎各": "Puerto Rico",
    "朝鲜": "Dem. Rep. Korea",
    "葡萄牙": "Portugal",
    "巴拉圭": "Paraguay",
    "巴勒斯坦": "Palestine",
    "卡塔尔": "Qatar",
    "罗马尼亚": "Romania",
    "俄罗斯": "Russia",
    "卢旺达": "Rwanda",
    "沙特阿拉伯": "Saudi Arabia",
    "苏丹": "Sudan",
    "南苏丹": "S. Sudan",
    "塞内加尔": "Senegal",
    "新加坡": "Singapore",
    "所罗门群岛": "Solomon Is.",
    "塞拉利昂": "Sierra Leone",
    "萨尔瓦多": "El Salvador",
    "苏里南": "Suriname",
    "斯洛伐克": "Slovakia",
    "斯洛文尼亚": "Slovenia",
    "瑞典": "Sweden",
    "斯威士兰": "Swaziland",
    "塞舌尔": "Seychelles",
    "叙利亚": "Syria",
    "乍得": "Chad",
    "多哥": "Togo",
    "泰国": "Thailand",
    "塔吉克斯坦": "Tajikistan",
    "土库曼斯坦": "Turkmenistan",
    "东帝汶": "Timor-Leste",
    "汤加": "Tonga",
    "特立尼达和多巴哥": "Trinidad and Tobago",
    "突尼斯": "Tunisia",
    "土耳其": "Turkey",
    "坦桑尼亚": "Tanzania",
    "乌干达": "Uganda",
    "乌克兰": "Ukraine",
    "乌拉圭": "Uruguay",
    "美国": "United States",
    "乌兹别克斯坦": "Uzbekistan",
    "委内瑞拉": "Venezuela",
    "越南": "Vietnam",
    "瓦努阿图": "Vanuatu",
    "也门": "Yemen",
    "南非": "South Africa",
    "赞比亚": "Zambia",
    "津巴布韦": "Zimbabwe",
    "奥兰群岛": "Aland",
    "美属萨摩亚": "American Samoa",
    "南极洲": "Fr. S. Antarctic Lands",
    "安提瓜和巴布达": "Antigua and Barb.",
    "科摩罗": "Comoros",
    "库拉索岛": "Curaçao",
    "开曼群岛": "Cayman Is.",
    "马尔维纳斯群岛(福克兰)": "Falkland Is.",
    "法罗群岛": "Faeroe Is.",
    "密克罗尼西亚": "Micronesia",
    "赫德岛和麦克唐纳群岛": "Heard I. and McDonald Is.",
    "曼岛": "Isle of Man",
    "泽西岛": "Jersey",
    "基里巴斯": "Kiribati",
    "圣卢西亚": "Saint Lucia",
    "北马里亚纳群岛": "N. Mariana Is.",
    "蒙特塞拉特": "Montserrat",
    "纽埃": "Niue",
    "帕劳": "Palau",
    "法属波利尼西亚": "Fr. Polynesia",
    "南乔治亚岛和南桑威奇群岛": "S. Geo. and S. Sandw. Is.",
    "圣赫勒拿": "Saint Helena",
    "圣皮埃尔和密克隆群岛": "St. Pierre and Miquelon",
    "圣多美和普林西比": "São Tomé and Principe",
    "特克斯和凯科斯群岛": "Turks and Caicos Is.",
    "圣文森特和格林纳丁斯": "St. Vin. and Gren.",
    "美属维尔京群岛": "U.S. Virgin Is.",
    "萨摩亚": "Samoa",
}

最后再加上如下语句,将各国家的中文名翻译为英文,并作为新的一列加入到DataFrame中

三、数据保存

def save_data(df, table_name, if_exists="append", need_translate=False):
    """
    Write a DataFrame to a table of the local MySQL database.

    :param df: DataFrame to store. When ``need_translate`` is true a ``name``
        column is added to this same object in place, so the caller sees the
        new column as well.
    :param table_name: name of the destination table.
    :param if_exists: forwarded unchanged to ``DataFrame.to_sql``.
    :param need_translate: when true, fill ``df['name']`` by applying
        ``traslate`` to the ``疫情地区`` column before writing.
    :return: None
    """
    if need_translate:
        df['name'] = df['疫情地区'].apply(traslate)
    # NOTE(review): host, database name and credentials are hard-coded in this
    # URL; edit it to match your own MySQL instance.
    engine = create_engine('mysql://root:123456@localhost:3306/myspider?charset=utf8')
    try:
        # index=False keeps the DataFrame index out of the table (the original
        # passed index=None, which was only falsy by accident).
        df.to_sql(table_name, con=engine, if_exists=if_exists, index=False)
    finally:
        # A fresh engine is built on every call; release its connection pool
        # so repeated calls do not accumulate open connections.
        engine.dispose()

 # Example call: `df` and `table_name` stand for your own DataFrame and
 # destination table. The database name, account and password are not
 # arguments; change them in the connection URL inside save_data.
 # (The old call passed db_name/user/password, which the current signature
 # would read as table_name/if_exists/need_translate.)
save_data(df, table_name)

数据库一共保存四张表,分别为最近一个月每天汇总的累计疫情数据china_day、最近一个月每天新增数据china_day_add、每天各省份的疫情数据china_total_epidemic、最近一个月全球各地区的疫情累计数据world_epidemic
最后保存到数据库中的数据格式如下
在这里插入图片描述

在这里插入图片描述

四、完整项目获取

关注一下公众号,回复"0007"即可get完整项目源码
在这里插入图片描述

评论 32
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Demonslzh6

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值