import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the raw dataset from data.csv in the current working directory.
data = pd.read_csv(r'data.csv')
# Quick overview of the data: schema, per-column missing counts, first rows.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; interpolating it into an f-string would print "None".
data.info()
print(data.isnull().sum())
print(data.head())
# Configure Matplotlib so Chinese labels and negative signs render correctly.
plt.rcParams['font.sans-serif'] = ['STHeiti']
# False makes Matplotlib draw negative numbers with the ASCII hyphen-minus
# instead of the Unicode minus sign (U+2212); CJK fonts often lack that glyph,
# in which case it would be rendered as an empty box.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100
# plt.figure(figsize=(6,4))
# sns.boxplot(x=data['Annual Income'])
# plt.title('annual income photo')
# plt.xlabel('annual income')
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.boxplot(x=data['Annual Income'])
# plt.title('年收入箱线图')
# plt.xlabel('年收入')
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.boxplot(x='Credit Default',y='Annual Income',data=data)
# plt.title('annual income vs, credit default')
# plt.xlabel('credit default')
# plt.ylabel('annual income')
# plt.xticks([0,1],['否','是'])
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.histplot(x='Annual Income',
# hue='Credit Default',
# data=data,
# kde=True,
# element='step')
# plt.title('annual income')
# plt.xlabel('annual income')
# plt.ylabel('count')
# plt.legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
# plt.figure(figsize=(6,4))
# sns.countplot(x='Number of Open Accounts',
# hue='Credit Default',
# data=data)
# plt.xticks(rotation=45,ha='right')
# plt.xlabel('number of open account')
# plt.ylabel('count')
# plt.legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
# print(data.info())
# data['Open Accounts Group']=pd.cut(data['Number of Open Accounts'],
# bins=[0,5,10,15,20,float('inf')],
# labels=['0-5','6-10','11-15','16-20','20+'])
# plt.figure(figsize=(6,4))
# sns.countplot(x='Open Accounts Group',
# hue='Credit Default',
# data=data)
# plt.title('number of open accounts(grouped) vs. credit default')
# plt.xlabel('number of open account group')
# plt.ylabel('count')
# plt.legend(labels=['否','是'])
# plt.tight_layout()
# plt.show()
# Fill missing values: numeric columns get the column mean, every other column
# gets its most frequent value (mode).
for col in data.columns:
    if not data[col].isnull().any():
        continue  # nothing to fill in this column
    if pd.api.types.is_numeric_dtype(data[col]):
        fill_value = data[col].mean()
    else:
        fill_value = data[col].mode()[0]
    # Assign the result back rather than calling fillna(..., inplace=True) on
    # data[col]: the in-place form is chained assignment, which pandas warns
    # about and which leaves the frame unchanged under Copy-on-Write.
    data[col] = data[col].fillna(fill_value)
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
print(data['Years in current job'].value_counts())
# Encode categorical data.
# First show the category frequencies of the four text columns.
print(f'{data["Home Ownership"].value_counts()}\n{data["Years in current job"].value_counts()}\n{data["Purpose"].value_counts()}\n{data["Term"].value_counts()}')
# Ordinal codes for job tenure: the longest tenure gets 0, the shortest gets 10.
mapping = {
    '10+ years': 0,
    '9 years': 1,
    '8 years': 2,
    '7 years': 3,
    '6 years': 4,
    '5 years': 5,
    '4 years': 6,
    '3 years': 7,
    '2 years': 8,
    '1 year': 9,
    '< 1 year': 10,
}
# Series.map turns any label that is not a key of `mapping` into NaN.
data['Years in current job'] = data['Years in current job'].map(mapping)
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
print("所有唯一值:", data['Years in current job'].unique())
print(data['Years in current job'].value_counts())
# One-hot encode the remaining text columns, dropping the first level of each.
# Remember the column names before encoding so the newly created dummy columns
# can be identified without reading data.csv from disk a second time.
original_columns = set(data.columns)
# dtype=int yields 0/1 integer dummy columns directly, so no per-column
# astype(int) conversion is needed afterwards.
data = pd.get_dummies(data, drop_first=True, dtype=int)
# Names of the columns that get_dummies added.
dummies_list = [col for col in data.columns if col not in original_columns]
print(data.columns)
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
print(data.head())
# continuous_features=[
# 'Annual Income', 'Years in current job', 'Tax Liens',
# 'Number of Open Accounts', 'Years of Credit History',
# 'Maximum Open Credit', 'Number of Credit Problems',
# 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount',
# 'Current Credit Balance', 'Monthly Debt', 'Credit Score'
# ]
# correlation_matrix=data[continuous_features].corr()
# plt.figure(figsize=(12,10))
# sns.heatmap(correlation_matrix,annot=True,cmap='coolwarm',vmin=-1,vmax=1)
# plt.title('相关热力图')
# plt.xticks(rotation=45,ha='right')
# plt.tight_layout()
# plt.show()
# Columns plotted by the (currently commented-out) 2x2 subplot sections nearby.
features = [
    'Annual Income',
    'Years in current job',
    'Tax Liens',
    'Number of Open Accounts',
]
# fig,axes=plt.subplots(2,2,figsize=(12,10))
# i=0
# feature=features[i]
# axes[0,0].boxplot(data[feature].dropna())
# axes[0,0].set_title(f'boxplot of {feature}')
# axes[0,0].set_ylabel(feature)
# i=1
# feature=features[i]
# axes[0,1].boxplot(data[feature].dropna())
# axes[0,1].set_title(f'boxplot of {feature}')
# axes[0,1].set_ylabel(feature)
# i=2
# feature=features[i]
# axes[1,0].boxplot(data[feature].dropna())
# axes[1,0].set_title(f'boxplot of {feature}')
# axes[1,0].set_ylabel(feature)
# i=3
# feature=features[i]
# axes[1,1].boxplot(data[feature].dropna())
# axes[1,1].set_title(f'boxplot of {feature}')
# axes[1,1].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i in range(len(features)):
# row=i//2
# col=i%2
# feature=features[i]
# axes[row,col].boxplot(data[feature].dropna())
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# axes[row,col].boxplot(data[feature].dropna())
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
#另一种表达方式:调用seaborn库
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# sns.boxplot(y=data[feature].dropna(),ax=axes[row,col])
# axes[row,col].set_title(f'boxplot of {feature}')
# axes[row,col].set_ylabel(feature)
# plt.tight_layout()
# plt.show()
# #这里是重点!!!!!!!!!!!!!!!!
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# for i,feature in enumerate(features):
# row=i//2
# col=i%2
# sns.histplot(
# x=feature,
# hue='Credit Default',
# data=data,
# kde=True,
# element='step',
# ax=axes[row,col]
# )
# axes[row,col].set_title(f'histplot of {feature}')
# axes[row,col].set_xlabel(feature)
# axes[row,col].set_ylabel(f'count')
# plt.tight_layout()
# plt.show()
from sklearn.model_selection import train_test_split
# Separate the target column from the predictors, then hold out 20% of the rows
# as a test set (fixed seed so the split is reproducible).
target_column = 'Credit Default'
y = data[target_column]
x = data.drop(columns=[target_column])
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(f'train:{x_train.shape}\ntest:{x_test.shape}')
from sklearn.svm import SVC #支持向量机分类器
from sklearn.neighbors import KNeighborsClassifier #K近邻分类器
from sklearn.linear_model import LogisticRegression #逻辑回归分类器
import xgboost as xgb #XGBoost分类器
import lightgbm as lgb #LightGBM分类器
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
from catboost import CatBoostClassifier #CatBoost分类器
from sklearn.tree import DecisionTreeClassifier #决策树分类器
from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
import warnings #用于忽略警告信息
warnings.filterwarnings("ignore")  # ignore all warnings from this point on
# SVM: fit on the training split, then report test-set metrics.
# NOTE(review): the features are passed to SVC unscaled; SVMs are usually
# sensitive to feature scale - consider standardizing. Left unchanged here.
svm_model = SVC(random_state=42)
svm_model.fit(x_train, y_train)
svm_pred = svm_model.predict(x_test)
print('\nSVM分类报告:')
print(classification_report(y_test, svm_pred))
print('SVM混淆矩阵:')
print(confusion_matrix(y_test, svm_pred))
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)
print('SVM模型评估指标')
# All four metrics are printed with four decimals; precision previously lacked
# the ':.4f' format spec and was printed at full float precision.
print(f'准确率:{svm_accuracy:.4f}\n精确率:{svm_precision:.4f}\n召回率:{svm_recall:.4f}\nF1值:{svm_f1:.4f}')
# Random forest: fit on the training split, then print the classification
# report and the confusion matrix for the test split.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)
rf_sections = (
    ('\n随机森林分类报告:', classification_report),
    ('\n随机森林 混淆矩阵', confusion_matrix),
)
for heading, summarize in rf_sections:
    print(heading)
    print(summarize(y_test, rf_pred))
# XGBoost: fit on the training split, then print the classification report and
# the confusion matrix for the test split.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(x_train, y_train)
xgb_pred = xgb_model.predict(x_test)
# NOTE(review): the second heading reads "XGBOOst" - likely a typo for
# "XGBoost"; kept as-is so the printed output is unchanged.
xgb_sections = {
    '\nXGBoost分类报告': classification_report,
    'XGBOOst混淆矩阵': confusion_matrix,
}
for heading, summarize in xgb_sections.items():
    print(heading)
    print(summarize(y_test, xgb_pred))
# LightGBM: fit on the training split, then print the classification report and
# the confusion matrix for the test split.
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(x_train, y_train)
lgb_pred = lgb_model.predict(x_test)
print('\nLightGBM分类报告')
lgb_report = classification_report(y_test, lgb_pred)
print(lgb_report)
print('LightGBM混淆矩阵')
lgb_matrix = confusion_matrix(y_test, lgb_pred)
print(lgb_matrix)