python打卡DAY5

复习:数据编码与缺失值填补
pip install pandas
import pandas as pd
# Load the raw dataset and inspect its structure and per-column missing counts.
data = pd.read_csv(r'data.csv')
data.info()
print(data.isnull().sum())
data.columns.tolist()  # notebook-style display of the column names; no effect in a script

# Fill missing values in text (object) columns with the most frequent value.
# The filled Series is assigned back to the frame rather than calling
# fillna(..., inplace=True) on data[column]: that form is chained assignment
# on a temporary Series, which triggers a FutureWarning in pandas 2.x and does
# not modify `data` once Copy-on-Write is enabled.
for column in data.columns:
    if data[column].dtype == 'object' and data[column].isnull().sum() > 0:
        data[column] = data[column].fillna(data[column].mode()[0])
print(data.isnull().sum())

# Fill missing values in every remaining (non-object) column with its mean.
for column in data.columns:
    if data[column].dtype != 'object' and data[column].isnull().sum() > 0:
        data[column] = data[column].fillna(data[column].mean())
print(data.isnull().sum())
# One-hot encode every text (object) column, dropping the first level of each
# to avoid a redundant indicator column.
list_dummies = [column for column in data.columns if data[column].dtype == 'object']

# Remember the column names that exist before encoding so the newly created
# dummy columns can be identified afterwards without re-reading data.csv.
columns_before_encoding = set(data.columns)

data = pd.get_dummies(data, columns=list_dummies, drop_first=True)
print(data.columns)

# Columns present now but not before encoding are the generated dummies.
list_change = [column for column in data.columns if column not in columns_before_encoding]
print(list_change)

# get_dummies can return boolean indicator columns (the default in pandas 2.x);
# cast them to 0/1 integers.
for column in list_change:
    data[column] = data[column].astype(int)
print(data.head())
import pandas as pd
# Reload the dataset from disk and collect the names of the columns stored as
# float64 or int64; these are treated as the continuous features.
data = pd.read_csv(r'data.csv')
numeric_frame = data.select_dtypes(include=['float64', 'int64'])
continuous_features = numeric_frame.columns.tolist()
continuous_features  # bare expression: displays the list in a notebook cell
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple seaborn matplotlib pandas
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Box plot of the 'Annual Income' column, titled and labelled through the
# Axes object that seaborn returns.
income_axes = sns.boxplot(x=data['Annual Income'])
income_axes.set_title('Annual Income 的箱线图')
income_axes.set_xlabel('Annual Income')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Select a sans-serif font that contains Chinese glyphs so the Chinese title
# and axis label render as text, and use the ASCII hyphen for minus signs
# because CJK fonts may lack the Unicode minus glyph.
plt.rcParams['font.sans-serif']=['STHeiti']
plt.rcParams['axes.unicode_minus']=False
sns.boxplot(x=data['Annual Income'])
plt.title('年收入箱线图')
plt.xlabel('年收入')
# plt.show must be called: the bare name `plt.show` is a no-op expression, so
# in a script the figure is never displayed and the next plot is drawn onto
# the same axes.
plt.show()
# Histogram of 'Years in current job'; the y axis shows the count per value.
sns.histplot(data['Years in current job'])
plt.title('在当前工作年限直方图')
plt.xlabel('在当前工作年限')
plt.ylabel('员工数量')
# Call plt.show(); the bare attribute reference `plt.show` does nothing.
plt.show()
import seaborn as sns
# Histogram of 'Years in current job' with the x tick labels rotated 45
# degrees and right-aligned so the category names do not overlap.
years_axes = sns.histplot(x=data['Years in current job'])
years_axes.set_title('在当前工作年限直方图')
years_axes.set_xlabel('在当前工作年限')
years_axes.set_ylabel('员工数量')
plt.xticks(rotation=45, ha='right')
# Recompute the layout after rotating so the slanted labels stay inside the figure.
plt.tight_layout()
plt.show()

即使没有明确指定 y 轴变量,sns.histplot 仍然会自动生成 y 轴,因为直方图需要用 y 轴来表示频率或计数。直方图的原理:直方图是一种用于展示数据分布的图表,其核心思想是将数据划分为若干个区间(称为“桶”或 bin),然后统计落在每个区间内的数据点数量;这些数量就是各个柱子的高度,显示在 y 轴上。
绘制特征和标签的关系
# Box plot of 'Annual Income' split by the 'Credit Default' label, to compare
# the income distribution of the two label groups.
plt.figure(figsize=(8,6))
sns.boxplot(x='Credit Default',y='Annual Income',data=data)

plt.title('Annual Income VS. CRedit Default')
plt.xlabel('Credit Default')
plt.ylabel('Annual Income')
# Call plt.show(); the bare attribute reference `plt.show` does nothing.
plt.show()

如何在不修改原始数据的情况下,把 Credit Default 中的 0、1 在图上分别显示为“否”“是”?
# Same box plot as before, but the x tick labels are replaced at plot time
# (0 -> 'no', 1 -> 'yes'); the underlying 'Credit Default' column is untouched.
plt.figure(figsize=(8,6))
sns.boxplot(x='Credit Default',y='Annual Income',data=data)
plt.xticks(ticks=[0,1],labels=['no','yes'])
plt.xlabel('credit default')
plt.ylabel('annual income')
# Call plt.show(); the bare attribute reference `plt.show` does nothing.
plt.show()
# Histogram of 'Annual Income' coloured by 'Credit Default', drawn as step
# outlines with a kernel density estimate curve for each label value.
plt.figure(figsize=(8, 6))
income_hist_axes = sns.histplot(
    data=data,
    x='Annual Income',
    hue='Credit Default',
    kde=True,
    element='step',
)
income_hist_axes.set_title('Annual Income vs. credit default')
income_hist_axes.set_xlabel('annual income')
income_hist_axes.set_ylabel('count')
plt.show()
# Count of rows per 'Number of Open Accounts' value, split by 'Credit Default'.
plt.figure(figsize=(12,5))
sns.countplot(x='Number of Open Accounts',hue='Credit Default',data=data)
plt.xticks(rotation=45,ha='right')
plt.xlabel('number of open accounts')
plt.ylabel('count')
# Run tight_layout after the labels are set so the layout accounts for them
# as well as for the rotated tick labels.
plt.tight_layout()
# Call plt.show(); the bare attribute reference `plt.show` does nothing.
plt.show()

# Bucket 'Number of Open Accounts' into five labelled ranges.
# include_lowest=True makes the first bin [0, 5] instead of (0, 5], so a value
# of exactly 0 lands in '0-5' (as the label promises) rather than becoming NaN.
data['Open Accounts Group']=pd.cut(
    data['Number of Open Accounts'],
    bins=[0,5,10,15,20,float('inf')],
    labels=['0-5','6-10','11-15','16-20','20+'],
    include_lowest=True,
)
# Count of rows per bucket, split by 'Credit Default'.
plt.figure(figsize=(6,4))
sns.countplot(x='Open Accounts Group',hue='Credit Default',data=data)
plt.title('number of open accounts(grouped) vs. credit defalt')
plt.xlabel('number of open accounts group')
plt.ylabel('count')
plt.show()
# Bucket 'Number of Open Accounts' into five labelled ranges.
# include_lowest=True makes the first bin [0, 5] instead of (0, 5], so a value
# of exactly 0 lands in '0-5' (as the label promises) rather than becoming NaN.
data['Open Accounts Group']=pd.cut(
    data['Number of Open Accounts'],
    bins=[0,5,10,15,20,float('inf')],
    labels=['0-5','6-10','11-15','16-20','20+'],
    include_lowest=True,
)
# Same grouped view drawn with histplot instead of countplot, for comparison.
plt.figure(figsize=(6,4))
sns.histplot(x='Open Accounts Group',hue='Credit Default',data=data)
plt.title('number of open accounts(grouped) vs. credit defalt')
plt.xlabel('number of open accounts group')
plt.ylabel('count')
plt.show()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值