import pandas as pd
# Load the raw dataset; the path is resolved against the current working directory.
data = pd.read_csv(r'data.csv')

# Inspect basic information and missing values.
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own line; interpolating it into the f-string would print a stray
# "None" in front of the missing-value counts.
data.info()
print(f'{data.isnull().sum()}\n{data.head()}')
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# Font setup so that Chinese text in titles, labels and legends can be drawn.
# The list is a priority order: 'STHeiti' stays first, the remaining entries
# are fallbacks for machines where that font is not installed.
plt.rcParams['font.sans-serif'] = ['STHeiti', 'SimHei', 'Microsoft YaHei', 'Arial Unicode MS']
# Draw negative numbers with the ASCII hyphen-minus rather than the Unicode
# minus sign (U+2212). True is matplotlib's default; False is the usual setting
# alongside CJK fonts, which may not provide a glyph for U+2212.
plt.rcParams['axes.unicode_minus'] = False
# plt.figure(figsize=(6,4))
# sns.boxplot(x=data['Annual Income'])
# plt.title('annual income photo')
# plt.xlabel('annual income')
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.boxplot(x=data['Annual Income'])
# plt.title('年收入箱线图')
# plt.xlabel('年收入')
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.boxplot(x='Credit Default',y='Annual Income',data=data)
# plt.title('annual income vs.credit default')
# plt.xlabel('credit default')
# plt.ylabel('annual income')
# plt.xticks([0,1],['否','是'])
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.histplot(x='Annual Income',
# hue='Credit Default',
# data=data,
# kde=True,
# element='step')
# # kde is short for "Kernel Density Estimation"
# plt.title('annual income')
# plt.xlabel('annual income')
# plt.ylabel('count')
# plt.legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.countplot(x='Number of Open Accounts',
# hue='Credit Default',
# data=data)
# plt.xticks(rotation=45,ha='right')
# plt.xlabel('number of open account')
# plt.ylabel('count')
# plt.legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
# print(data.info())
# data['Open Accounts Group']=pd.cut(data['Number of Open Accounts'],
# bins=[0,5,10,15,20,float('inf')],
# labels=['0-5','6-10','11-15','16-20','20+'])
# plt.figure(figsize=(6,4))
# sns.countplot(x='Open Accounts Group',hue='Credit Default',data=data)
# plt.title('number of open accounts(grouped) vs. Credit Default')
# plt.xlabel('number of open account group')
# plt.ylabel('count')
# plt.legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
# Fill missing values: numeric columns get the column mean, every other
# column gets its most frequent value (mode).
for column in data.columns:
    series = data[column]
    if not series.isnull().any():
        continue  # nothing to fill in this column
    if pd.api.types.is_numeric_dtype(series):
        fill_value = series.mean()
    else:
        modes = series.mode()
        if modes.empty:
            continue  # column is entirely missing, so there is no mode to use
        fill_value = modes.iloc[0]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # data[column]: the in-place call acts on an intermediate object, which
    # triggers a chained-assignment warning on recent pandas and leaves `data`
    # unchanged when Copy-on-Write is active.
    data[column] = series.fillna(fill_value)
# Data encoding.
# Ordinal encoding for 'Years in current job': the longer the tenure, the
# smaller the code. The table needs an entry for every label in the column,
# because Series.map turns any label without an entry into NaN.
# NOTE(review): '1 year' had no entry in the earlier table; it is added here and
# '< 1 year' moves from 9 to 10 so the ordering stays strictly monotonic.
# Confirm the exact label spellings against the CSV.
mapping = {
    '10+ years': 0,
    '9 years': 1,
    '8 years': 2,
    '7 years': 3,
    '6 years': 4,
    '5 years': 5,
    '4 years': 6,
    '3 years': 7,
    '2 years': 8,
    '1 year': 9,
    '< 1 year': 10,
}
data['Years in current job'] = data['Years in current job'].map(mapping)
# One-hot encode the remaining categorical columns.
# The column names are captured before encoding so the newly created dummy
# columns can be identified without reading the CSV from disk a second time.
columns_before_encoding = set(data.columns)
# dtype=int makes the dummy columns 0/1 integers directly, so no separate
# per-column cast is needed afterwards.
data = pd.get_dummies(data, drop_first=True, dtype=int)
# Names of the columns that get_dummies created.
dummies_list = [
    column for column in data.columns if column not in columns_before_encoding
]
print(data.head())
# # Draw the correlation heatmap & subplots
# print(f'{data.columns}\n{data.info()}\n{data.head()}')
# continuous_features=[
# 'Annual Income', 'Years in current job', 'Tax Liens',
# 'Number of Open Accounts', 'Years of Credit History',
# 'Maximum Open Credit', 'Number of Credit Problems',
# 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount',
# 'Current Credit Balance', 'Monthly Debt', 'Credit Score'
# ]
# correlation_matrix=data[continuous_features].corr()
# plt.rcParams['figure.dpi']=100
# plt.figure(figsize=(12,10))
# sns.heatmap(correlation_matrix,annot=True,cmap='coolwarm',vmin=-1,vmax=1)
# plt.title('相关热力图')
# plt.xticks(rotation=45,ha='right')
# plt.tight_layout()
# plt.show()
# features=['Annual Income','Years in current job','Tax Liens','Number of Open Accounts']
# plt.rcParams['figure.dpi']=100
# fig,axes=plt.subplots(2,2,figsize=(12,10))
# i=0
# feature=features[i]
# axes[0,0].boxplot(data[feature].dropna())
# axes[0,0].set_title(f'boxplot of {feature}')
# axes[0,0].set_ylabel(feature)
# i=1
# feature=features[i]
# axes[0,1].boxplot(data[feature].dropna())
# axes[0,1].set_title(f'boxplot of {feature}')
# axes[0,1].set_ylabel(feature)
# i=2
# feature=features[i]
# axes[1,0].boxplot(data[feature].dropna())
# axes[1,0].set_title(f'boxplot of {feature}')
# axes[1,0].set_ylabel(feature)
# i=3
# feature=features[i]
# axes[1,1].boxplot(data[feature].dropna())
# axes[1,1].set_title(f'boxplot of {feature}')
# axes[1,1].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# features=['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
# plt.rcParams['figure.dpi']=100
# fig,axes=plt.subplots(2,2,figsize=(12,8))
# for i in range(len(features)):
# row=i//2
# col=i%2
# feature=features[i]
# axes[row,col].boxplot(data[feature].dropna())
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# features=['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
# for i,feature in enumerate(features):
# print(f'索引{i}对应的特征是:{feature}')
# features=['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
# plt.rcParams['figure.dpi']=100
# fig,axes=plt.subplots(2,2,figsize=(12,8))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# axes[row,col].boxplot(data[feature].dropna())
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()