Home Credit Default Risk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import os

import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

import warnings

warnings.filterwarnings('ignore')

数据读取

1
2
3
4
5
# Load the training applications, report the frame's dimensions, and preview
# the first rows (the trailing expression is what the notebook cell displays).
app_train = pd.read_csv(
    './dataset/application_train.csv',
)
print('Training data shape: ', app_train.shape)
app_train.head()
Training data shape:  (307511, 122)

SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
1 100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3 100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
4 100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 122 columns

1
2
3
4
5
# Load the test applications, report the frame's dimensions, and preview the
# first rows (the trailing expression is what the notebook cell displays).
app_test = pd.read_csv(
    './dataset/application_test.csv',
)
print('Testing data shape: ', app_test.shape)
app_test.head()
Testing data shape:  (48744, 121)

SK_ID_CURR NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100001 Cash loans F N Y 0 135000.0 568800.0 20560.5 450000.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
1 100005 Cash loans M N Y 0 99000.0 222768.0 17370.0 180000.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 3.0
2 100013 Cash loans M Y Y 0 202500.0 663264.0 69777.0 630000.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 1.0 4.0
3 100028 Cash loans F N Y 2 315000.0 1575000.0 49018.5 1575000.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 3.0
4 100038 Cash loans M Y N 1 180000.0 625500.0 32067.0 625500.0 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN

5 rows × 121 columns

Exploratory Data Analysis (EDA):探索性数据分析

1
2
3
# Inspect how the samples are distributed over the TARGET classes.
target_column = app_train['TARGET']
target_column.value_counts()
0    282686
1     24825
Name: TARGET, dtype: int64
1
# Histogram of the TARGET label, cast to int before plotting.
target_as_int = app_train['TARGET'].astype(int)
target_as_int.plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x210807715f8>

fqvXfx.png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Missing-value inspection
def missing_values_table(df):
    """Summarize the columns of ``df`` that contain missing values.

    Prints a two-line summary (total number of columns in ``df`` and how
    many of them have missing values) and returns the per-column details.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns
        ``'Missing Values'`` (null count) and ``'% of Total Values'``
        (null percentage). Only columns with at least one missing value
        are included, sorted by percentage in descending order and rounded
        to one decimal place.
    """
    # Count the nulls once and derive the percentage from that count
    # instead of scanning the whole frame a second time.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # axis=1 places the two Series side by side (as columns 0 and 1);
    # axis=0 would stack them vertically.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Give the positional columns readable names.
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Filter on the count rather than the percentage: for a frame with zero
    # rows the percentage is 0/0 = NaN, and NaN != 0 would wrongly report
    # every column as having missing values.
    has_missing = mis_val_table_ren_columns['Missing Values'] != 0

    # Sort by missing rate (ascending=False means descending order) and keep
    # one decimal place.
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print the summary.
    print("Your selected dataframe has {} columns.\n"
          "There are {} columns that have missing values.".format(
              df.shape[1], mis_val_table_ren_columns.shape[0]))

    # Return the per-column missing-value details.
    return mis_val_table_ren_columns
1
2
3
4
5
# Missing-value overview for the training set. Missing values can be imputed
# and columns with a very high missing rate can be dropped; for now every
# column is kept.
missing_values = missing_values_table(df=app_train)
missing_values.iloc[:20]
Your selected dataframe has 122 columns.
There are 67 columns that have missing values.

Missing Values % of Total Values
COMMONAREA_MEDI 214865 69.9
COMMONAREA_AVG 214865 69.9
COMMONAREA_MODE 214865 69.9
NONLIVINGAPARTMENTS_MEDI 213514 69.4
NONLIVINGAPARTMENTS_MODE 213514 69.4
NONLIVINGAPARTMENTS_AVG 213514 69.4
FONDKAPREMONT_MODE 210295 68.4
LIVINGAPARTMENTS_MODE 210199 68.4
LIVINGAPARTMENTS_MEDI 210199 68.4
LIVINGAPARTMENTS_AVG 210199 68.4
FLOORSMIN_MODE 208642 67.8
FLOORSMIN_MEDI 208642 67.8
FLOORSMIN_AVG 208642 67.8
YEARS_BUILD_MODE 204488 66.5
YEARS_BUILD_MEDI 204488 66.5
YEARS_BUILD_AVG 204488 66.5
OWN_CAR_AGE 202929 66.0
LANDAREA_AVG 182590 59.4
LANDAREA_MEDI 182590 59.4
LANDAREA_MODE 182590 59.4
1
2
3
# Count how many columns there are of each dtype.
column_dtypes = app_train.dtypes
column_dtypes.value_counts()
float64    65
int64      41
object     16
dtype: int64
1
2
3
# Number of distinct (non-null) values in each object-typed column.
object_columns = app_train.select_dtypes('object')
object_columns.nunique()
NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Encode the object-typed (categorical) columns. One-hot encoding is the
# recommended default because label encoding assigns arbitrary integers whose
# magnitude could influence the predictions.
# Here, columns with at most two distinct values are label-encoded; columns
# with more than two categories are one-hot encoded instead.
# One-hot encoding raises the dimensionality noticeably, so a later reduction
# step may be needed to drop highly correlated or uninformative features.
# Label encoding is done with scikit-learn's LabelEncoder.
le = LabelEncoder()
le_count = 0

for col in app_train:
    # Skip everything that is not a categorical (object) column.
    if app_train[col].dtype != 'object':
        continue
    # NaN counts as a distinct value here, so a column with two categories
    # plus missing entries is not label-encoded.
    if app_train[col].nunique(dropna=False) > 2:
        continue

    # Fit the encoder on the training column only ...
    le.fit(app_train[col])
    # ... then apply the same mapping to both the training and test sets.
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # Keep track of how many columns were converted.
    le_count += 1

print('%d columns were label encoded.' % le_count)
3 columns were label encoded.
1
2
3
4
5
6
7
8
9
10
11
# One-hot encode the remaining categorical columns with pandas get_dummies.
app_train, app_test = pd.get_dummies(app_train), pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)
Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 239)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# The shapes printed after one-hot encoding show that the training and test
# sets have different numbers of features, so columns that exist in the
# training set but not in the test set have to be dropped.
# pandas can align the two frames directly.

# Set the label aside first; it is re-attached after the alignment.
train_labels = app_train['TARGET']

# axis=1 aligns on columns. join='inner' keeps only the columns present in
# both frames; join='outer' would keep every column and fill the gaps with NaN.
app_train, app_test = app_train.align(app_test, axis=1, join='inner')

# Re-attach the label saved above to the aligned training set.
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)
Training Features shape:  (307511, 240)
Testing Features shape:  (48744, 239)
  • 异常值处理

    • 本项目中大部分特征均为金额,不存在异常情况,因此该部分仅作为示例,不作为实际处理。

    • 本项目中异常值主要关注日期信息

1
2
3
# Names of every column whose dtype is not object.
non_object_columns = app_train.select_dtypes(exclude=['object']).columns
numerical_fea = non_object_columns.tolist()
numerical_fea
['SK_ID_CURR',
 'NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'CODE_GENDER_F',
 'CODE_GENDER_M',
 'NAME_TYPE_SUITE_Children',
 'NAME_TYPE_SUITE_Family',
 'NAME_TYPE_SUITE_Group of people',
 'NAME_TYPE_SUITE_Other_A',
 'NAME_TYPE_SUITE_Other_B',
 'NAME_TYPE_SUITE_Spouse, partner',
 'NAME_TYPE_SUITE_Unaccompanied',
 'NAME_INCOME_TYPE_Businessman',
 'NAME_INCOME_TYPE_Commercial associate',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_INCOME_TYPE_State servant',
 'NAME_INCOME_TYPE_Student',
 'NAME_INCOME_TYPE_Unemployed',
 'NAME_INCOME_TYPE_Working',
 'NAME_EDUCATION_TYPE_Academic degree',
 'NAME_EDUCATION_TYPE_Higher education',
 'NAME_EDUCATION_TYPE_Incomplete higher',
 'NAME_EDUCATION_TYPE_Lower secondary',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'NAME_FAMILY_STATUS_Civil marriage',
 'NAME_FAMILY_STATUS_Married',
 'NAME_FAMILY_STATUS_Separated',
 'NAME_FAMILY_STATUS_Single / not married',
 'NAME_FAMILY_STATUS_Widow',
 'NAME_HOUSING_TYPE_Co-op apartment',
 'NAME_HOUSING_TYPE_House / apartment',
 'NAME_HOUSING_TYPE_Municipal apartment',
 'NAME_HOUSING_TYPE_Office apartment',
 'NAME_HOUSING_TYPE_Rented apartment',
 'NAME_HOUSING_TYPE_With parents',
 'OCCUPATION_TYPE_Accountants',
 'OCCUPATION_TYPE_Cleaning staff',
 'OCCUPATION_TYPE_Cooking staff',
 'OCCUPATION_TYPE_Core staff',
 'OCCUPATION_TYPE_Drivers',
 'OCCUPATION_TYPE_HR staff',
 'OCCUPATION_TYPE_High skill tech staff',
 'OCCUPATION_TYPE_IT staff',
 'OCCUPATION_TYPE_Laborers',
 'OCCUPATION_TYPE_Low-skill Laborers',
 'OCCUPATION_TYPE_Managers',
 'OCCUPATION_TYPE_Medicine staff',
 'OCCUPATION_TYPE_Private service staff',
 'OCCUPATION_TYPE_Realty agents',
 'OCCUPATION_TYPE_Sales staff',
 'OCCUPATION_TYPE_Secretaries',
 'OCCUPATION_TYPE_Security staff',
 'OCCUPATION_TYPE_Waiters/barmen staff',
 'WEEKDAY_APPR_PROCESS_START_FRIDAY',
 'WEEKDAY_APPR_PROCESS_START_MONDAY',
 'WEEKDAY_APPR_PROCESS_START_SATURDAY',
 'WEEKDAY_APPR_PROCESS_START_SUNDAY',
 'WEEKDAY_APPR_PROCESS_START_THURSDAY',
 'WEEKDAY_APPR_PROCESS_START_TUESDAY',
 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY',
 'ORGANIZATION_TYPE_Advertising',
 'ORGANIZATION_TYPE_Agriculture',
 'ORGANIZATION_TYPE_Bank',
 'ORGANIZATION_TYPE_Business Entity Type 1',
 'ORGANIZATION_TYPE_Business Entity Type 2',
 'ORGANIZATION_TYPE_Business Entity Type 3',
 'ORGANIZATION_TYPE_Cleaning',
 'ORGANIZATION_TYPE_Construction',
 'ORGANIZATION_TYPE_Culture',
 'ORGANIZATION_TYPE_Electricity',
 'ORGANIZATION_TYPE_Emergency',
 'ORGANIZATION_TYPE_Government',
 'ORGANIZATION_TYPE_Hotel',
 'ORGANIZATION_TYPE_Housing',
 'ORGANIZATION_TYPE_Industry: type 1',
 'ORGANIZATION_TYPE_Industry: type 10',
 'ORGANIZATION_TYPE_Industry: type 11',
 'ORGANIZATION_TYPE_Industry: type 12',
 'ORGANIZATION_TYPE_Industry: type 13',
 'ORGANIZATION_TYPE_Industry: type 2',
 'ORGANIZATION_TYPE_Industry: type 3',
 'ORGANIZATION_TYPE_Industry: type 4',
 'ORGANIZATION_TYPE_Industry: type 5',
 'ORGANIZATION_TYPE_Industry: type 6',
 'ORGANIZATION_TYPE_Industry: type 7',
 'ORGANIZATION_TYPE_Industry: type 8',
 'ORGANIZATION_TYPE_Industry: type 9',
 'ORGANIZATION_TYPE_Insurance',
 'ORGANIZATION_TYPE_Kindergarten',
 'ORGANIZATION_TYPE_Legal Services',
 'ORGANIZATION_TYPE_Medicine',
 'ORGANIZATION_TYPE_Military',
 'ORGANIZATION_TYPE_Mobile',
 'ORGANIZATION_TYPE_Other',
 'ORGANIZATION_TYPE_Police',
 'ORGANIZATION_TYPE_Postal',
 'ORGANIZATION_TYPE_Realtor',
 'ORGANIZATION_TYPE_Religion',
 'ORGANIZATION_TYPE_Restaurant',
 'ORGANIZATION_TYPE_School',
 'ORGANIZATION_TYPE_Security',
 'ORGANIZATION_TYPE_Security Ministries',
 'ORGANIZATION_TYPE_Self-employed',
 'ORGANIZATION_TYPE_Services',
 'ORGANIZATION_TYPE_Telecom',
 'ORGANIZATION_TYPE_Trade: type 1',
 'ORGANIZATION_TYPE_Trade: type 2',
 'ORGANIZATION_TYPE_Trade: type 3',
 'ORGANIZATION_TYPE_Trade: type 4',
 'ORGANIZATION_TYPE_Trade: type 5',
 'ORGANIZATION_TYPE_Trade: type 6',
 'ORGANIZATION_TYPE_Trade: type 7',
 'ORGANIZATION_TYPE_Transport: type 1',
 'ORGANIZATION_TYPE_Transport: type 2',
 'ORGANIZATION_TYPE_Transport: type 3',
 'ORGANIZATION_TYPE_Transport: type 4',
 'ORGANIZATION_TYPE_University',
 'ORGANIZATION_TYPE_XNA',
 'FONDKAPREMONT_MODE_not specified',
 'FONDKAPREMONT_MODE_org spec account',
 'FONDKAPREMONT_MODE_reg oper account',
 'FONDKAPREMONT_MODE_reg oper spec account',
 'HOUSETYPE_MODE_block of flats',
 'HOUSETYPE_MODE_specific housing',
 'HOUSETYPE_MODE_terraced house',
 'WALLSMATERIAL_MODE_Block',
 'WALLSMATERIAL_MODE_Mixed',
 'WALLSMATERIAL_MODE_Monolithic',
 'WALLSMATERIAL_MODE_Others',
 'WALLSMATERIAL_MODE_Panel',
 'WALLSMATERIAL_MODE_Stone, brick',
 'WALLSMATERIAL_MODE_Wooden',
 'EMERGENCYSTATE_MODE_No',
 'EMERGENCYSTATE_MODE_Yes',
 'TARGET']
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 3-sigma outlier detection; adds an outlier-indicator column for the feature.
def find_outliers_by_3segama(data, fea, n_sigma=3):
    """Label each value of ``data[fea]`` as an outlier or a normal value.

    A value is an outlier when it lies more than ``n_sigma`` standard
    deviations away from the column mean. The labels are written to a new
    column named ``fea + '_outliers'``.

    Args:
        data: pandas DataFrame containing the column ``fea``. It is modified
            in place (the indicator column is added to it).
        fea: Name of the numeric column to check.
        n_sigma: Width of the accepted band in standard deviations.
            Defaults to 3, the classic 3-sigma rule.

    Returns:
        The same ``data`` object, now with the extra indicator column whose
        values are '异常值' (outlier) or '正常值' (normal).
    """
    column = data[fea]

    # np.std uses ddof=0 by default, i.e. the population standard deviation.
    data_std = np.std(column)
    data_mean = np.mean(column)

    outliers_cut_off = data_std * n_sigma
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off

    # Vectorized comparison instead of a per-row Python lambda. Comparisons
    # against NaN are False, so missing values are labelled '正常值'.
    is_outlier = (column > upper_rule) | (column < lower_rule)
    data[fea + '_outliers'] = np.where(is_outlier, '异常值', '正常值')

    return data
1
2
3
4
5
6
7
# Outlier details: add an indicator column for every feature in numerical_fea.
# find_outliers_by_3segama writes the new column into the frame it receives
# and returns that frame.
# NOTE(review): numerical_fea also contains SK_ID_CURR, TARGET and 0/1
# flag/dummy columns, for which a 3-sigma rule is probably not meaningful
# -- confirm whether they should be excluded.
for fea in numerical_fea:
    outlier_train = find_outliers_by_3segama(data=app_train, fea=fea)

outlier_train

SK_ID_CURR NAME_CONTRACT_TYPE FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE ... WALLSMATERIAL_MODE_Block_outliers WALLSMATERIAL_MODE_Mixed_outliers WALLSMATERIAL_MODE_Monolithic_outliers WALLSMATERIAL_MODE_Others_outliers WALLSMATERIAL_MODE_Panel_outliers WALLSMATERIAL_MODE_Stone, brick_outliers WALLSMATERIAL_MODE_Wooden_outliers EMERGENCYSTATE_MODE_No_outliers EMERGENCYSTATE_MODE_Yes_outliers TARGET_outliers
0 100002 0 0 1 0 202500.0 406597.5 24700.5 351000.0 0.018801 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 异常值
1 100003 0 0 0 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 ... 异常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值
2 100004 1 1 1 0 67500.0 135000.0 6750.0 135000.0 0.010032 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值
3 100006 0 0 1 0 135000.0 312682.5 29686.5 297000.0 0.008019 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值
4 100007 0 0 1 0 121500.0 513000.0 21865.5 513000.0 0.028663 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
307506 456251 0 0 0 0 157500.0 254700.0 27558.0 225000.0 0.032561 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值
307507 456252 0 0 1 0 72000.0 269550.0 12001.5 225000.0 0.025164 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值
307508 456253 0 0 1 0 153000.0 677664.0 29979.0 585000.0 0.005002 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值
307509 456254 0 0 1 0 171000.0 370107.0 20205.0 319500.0 0.005313 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 异常值
307510 456255 0 0 0 0 157500.0 675000.0 49117.5 675000.0 0.046220 ... 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值 正常值

307511 rows × 480 columns

1
2
3
4
5
6
7
# Share of normal values vs. outliers for each feature.
for fea in numerical_fea:
    # The first positional parameter of value_counts is `normalize`, not a
    # label to count, so request proportions explicitly by keyword instead of
    # relying on a non-empty string being truthy.
    radio = outlier_train[fea + '_outliers'].value_counts(normalize=True)
    print(radio)
正常值    1.0
Name: SK_ID_CURR_outliers, dtype: float64
正常值    0.904787
异常值    0.095213
Name: NAME_CONTRACT_TYPE_outliers, dtype: float64
正常值    1.0
Name: FLAG_OWN_CAR_outliers, dtype: float64
正常值    1.0
Name: FLAG_OWN_REALTY_outliers, dtype: float64
正常值    0.986108
异常值    0.013892
Name: CNT_CHILDREN_outliers, dtype: float64
正常值    0.998524
异常值    0.001476
Name: AMT_INCOME_TOTAL_outliers, dtype: float64
正常值    0.989415
异常值    0.010585
Name: AMT_CREDIT_outliers, dtype: float64
正常值    0.990378
异常值    0.009622
Name: AMT_ANNUITY_outliers, dtype: float64
正常值    0.98643
异常值    0.01357
Name: AMT_GOODS_PRICE_outliers, dtype: float64
正常值    0.972645
异常值    0.027355
Name: REGION_POPULATION_RELATIVE_outliers, dtype: float64
正常值    1.0
Name: DAYS_BIRTH_outliers, dtype: float64
正常值    1.0
Name: DAYS_EMPLOYED_outliers, dtype: float64
正常值    0.997564
异常值    0.002436
Name: DAYS_REGISTRATION_outliers, dtype: float64
正常值    1.0
Name: DAYS_ID_PUBLISH_outliers, dtype: float64
正常值    0.989044
异常值    0.010956
Name: OWN_CAR_AGE_outliers, dtype: float64
正常值    0.999997
异常值    0.000003
Name: FLAG_MOBIL_outliers, dtype: float64
正常值    1.0
Name: FLAG_EMP_PHONE_outliers, dtype: float64
正常值    1.0
Name: FLAG_WORK_PHONE_outliers, dtype: float64
正常值    0.998133
异常值    0.001867
Name: FLAG_CONT_MOBILE_outliers, dtype: float64
正常值    1.0
Name: FLAG_PHONE_outliers, dtype: float64
正常值    0.94328
异常值    0.05672
Name: FLAG_EMAIL_outliers, dtype: float64
正常值    0.98697
异常值    0.01303
Name: CNT_FAM_MEMBERS_outliers, dtype: float64
正常值    1.0
Name: REGION_RATING_CLIENT_outliers, dtype: float64
正常值    1.0
Name: REGION_RATING_CLIENT_W_CITY_outliers, dtype: float64
正常值    0.997977
异常值    0.002023
Name: HOUR_APPR_PROCESS_START_outliers, dtype: float64
正常值    0.984856
异常值    0.015144
Name: REG_REGION_NOT_LIVE_REGION_outliers, dtype: float64
正常值    0.949231
异常值    0.050769
Name: REG_REGION_NOT_WORK_REGION_outliers, dtype: float64
正常值    0.959341
异常值    0.040659
Name: LIVE_REGION_NOT_WORK_REGION_outliers, dtype: float64
正常值    0.921827
异常值    0.078173
Name: REG_CITY_NOT_LIVE_CITY_outliers, dtype: float64
正常值    1.0
Name: REG_CITY_NOT_WORK_CITY_outliers, dtype: float64
正常值    1.0
Name: LIVE_CITY_NOT_WORK_CITY_outliers, dtype: float64
正常值    1.0
Name: EXT_SOURCE_1_outliers, dtype: float64
正常值    1.0
Name: EXT_SOURCE_2_outliers, dtype: float64
正常值    1.0
Name: EXT_SOURCE_3_outliers, dtype: float64
正常值    0.990303
异常值    0.009697
Name: APARTMENTS_AVG_outliers, dtype: float64
正常值    0.993584
异常值    0.006416
Name: BASEMENTAREA_AVG_outliers, dtype: float64
正常值    0.997743
异常值    0.002257
Name: YEARS_BEGINEXPLUATATION_AVG_outliers, dtype: float64
正常值    0.99612
异常值    0.00388
Name: YEARS_BUILD_AVG_outliers, dtype: float64
正常值    0.994455
异常值    0.005545
Name: COMMONAREA_AVG_outliers, dtype: float64
正常值    0.992065
异常值    0.007935
Name: ELEVATORS_AVG_outliers, dtype: float64
正常值    0.9928
异常值    0.0072
Name: ENTRANCES_AVG_outliers, dtype: float64
正常值    0.991509
异常值    0.008491
Name: FLOORSMAX_AVG_outliers, dtype: float64
正常值    0.998078
异常值    0.001922
Name: FLOORSMIN_AVG_outliers, dtype: float64
正常值    0.993207
异常值    0.006793
Name: LANDAREA_AVG_outliers, dtype: float64
正常值    0.994322
异常值    0.005678
Name: LIVINGAPARTMENTS_AVG_outliers, dtype: float64
正常值    0.989623
异常值    0.010377
Name: LIVINGAREA_AVG_outliers, dtype: float64
正常值    0.997652
异常值    0.002348
Name: NONLIVINGAPARTMENTS_AVG_outliers, dtype: float64
正常值    0.992156
异常值    0.007844
Name: NONLIVINGAREA_AVG_outliers, dtype: float64
正常值    0.990212
异常值    0.009788
Name: APARTMENTS_MODE_outliers, dtype: float64
正常值    0.993291
异常值    0.006709
Name: BASEMENTAREA_MODE_outliers, dtype: float64
正常值    0.997789
异常值    0.002211
Name: YEARS_BEGINEXPLUATATION_MODE_outliers, dtype: float64
正常值    0.996072
异常值    0.003928
Name: YEARS_BUILD_MODE_outliers, dtype: float64
正常值    0.994495
异常值    0.005505
Name: COMMONAREA_MODE_outliers, dtype: float64
正常值    0.98909
异常值    0.01091
Name: ELEVATORS_MODE_outliers, dtype: float64
正常值    0.991454
异常值    0.008546
Name: ENTRANCES_MODE_outliers, dtype: float64
正常值    0.991421
异常值    0.008579
Name: FLOORSMAX_MODE_outliers, dtype: float64
正常值    0.998439
异常值    0.001561
Name: FLOORSMIN_MODE_outliers, dtype: float64
正常值    0.993041
异常值    0.006959
Name: LANDAREA_MODE_outliers, dtype: float64
正常值    0.994124
异常值    0.005876
Name: LIVINGAPARTMENTS_MODE_outliers, dtype: float64
正常值    0.989126
异常值    0.010874
Name: LIVINGAREA_MODE_outliers, dtype: float64
正常值    0.997795
异常值    0.002205
Name: NONLIVINGAPARTMENTS_MODE_outliers, dtype: float64
正常值    0.991994
异常值    0.008006
Name: NONLIVINGAREA_MODE_outliers, dtype: float64
正常值    0.99013
异常值    0.00987
Name: APARTMENTS_MEDI_outliers, dtype: float64
正常值    0.993525
异常值    0.006475
Name: BASEMENTAREA_MEDI_outliers, dtype: float64
正常值    0.99788
异常值    0.00212
Name: YEARS_BEGINEXPLUATATION_MEDI_outliers, dtype: float64
正常值    0.996078
异常值    0.003922
Name: YEARS_BUILD_MEDI_outliers, dtype: float64
正常值    0.994397
异常值    0.005603
Name: COMMONAREA_MEDI_outliers, dtype: float64
正常值    0.992101
异常值    0.007899
Name: ELEVATORS_MEDI_outliers, dtype: float64
正常值    0.992761
异常值    0.007239
Name: ENTRANCES_MEDI_outliers, dtype: float64
正常值    0.991109
异常值    0.008891
Name: FLOORSMAX_MEDI_outliers, dtype: float64
正常值    0.998202
异常值    0.001798
Name: FLOORSMIN_MEDI_outliers, dtype: float64
正常值    0.993064
异常值    0.006936
Name: LANDAREA_MEDI_outliers, dtype: float64
正常值    0.994247
异常值    0.005753
Name: LIVINGAPARTMENTS_MEDI_outliers, dtype: float64
正常值    0.989513
异常值    0.010487
Name: LIVINGAREA_MEDI_outliers, dtype: float64
正常值    0.997659
异常值    0.002341
Name: NONLIVINGAPARTMENTS_MEDI_outliers, dtype: float64
正常值    0.992091
异常值    0.007909
Name: NONLIVINGAREA_MEDI_outliers, dtype: float64
正常值    0.989194
异常值    0.010806
Name: TOTALAREA_MODE_outliers, dtype: float64
正常值    0.979965
异常值    0.020035
Name: OBS_30_CNT_SOCIAL_CIRCLE_outliers, dtype: float64
正常值    0.977763
异常值    0.022237
Name: DEF_30_CNT_SOCIAL_CIRCLE_outliers, dtype: float64
正常值    0.980537
异常值    0.019463
Name: OBS_60_CNT_SOCIAL_CIRCLE_outliers, dtype: float64
正常值    0.987226
异常值    0.012774
Name: DEF_60_CNT_SOCIAL_CIRCLE_outliers, dtype: float64
正常值    0.997919
异常值    0.002081
Name: DAYS_LAST_PHONE_CHANGE_outliers, dtype: float64
正常值    0.999958
异常值    0.000042
Name: FLAG_DOCUMENT_2_outliers, dtype: float64
正常值    1.0
Name: FLAG_DOCUMENT_3_outliers, dtype: float64
正常值    0.999919
异常值    0.000081
Name: FLAG_DOCUMENT_4_outliers, dtype: float64
正常值    0.984885
异常值    0.015115
Name: FLAG_DOCUMENT_5_outliers, dtype: float64
正常值    0.911945
异常值    0.088055
Name: FLAG_DOCUMENT_6_outliers, dtype: float64
正常值    0.999808
异常值    0.000192
Name: FLAG_DOCUMENT_7_outliers, dtype: float64
正常值    0.918624
异常值    0.081376
Name: FLAG_DOCUMENT_8_outliers, dtype: float64
正常值    0.996104
异常值    0.003896
Name: FLAG_DOCUMENT_9_outliers, dtype: float64
正常值    0.999977
异常值    0.000023
Name: FLAG_DOCUMENT_10_outliers, dtype: float64
正常值    0.996088
异常值    0.003912
Name: FLAG_DOCUMENT_11_outliers, dtype: float64
正常值    0.999993
异常值    0.000007
Name: FLAG_DOCUMENT_12_outliers, dtype: float64
正常值    0.996475
异常值    0.003525
Name: FLAG_DOCUMENT_13_outliers, dtype: float64
正常值    0.997064
异常值    0.002936
Name: FLAG_DOCUMENT_14_outliers, dtype: float64
正常值    0.99879
异常值    0.00121
Name: FLAG_DOCUMENT_15_outliers, dtype: float64
正常值    0.990072
异常值    0.009928
Name: FLAG_DOCUMENT_16_outliers, dtype: float64
正常值    0.999733
异常值    0.000267
Name: FLAG_DOCUMENT_17_outliers, dtype: float64
正常值    0.99187
异常值    0.00813
Name: FLAG_DOCUMENT_18_outliers, dtype: float64
正常值    0.999405
异常值    0.000595
Name: FLAG_DOCUMENT_19_outliers, dtype: float64
正常值    0.999493
异常值    0.000507
Name: FLAG_DOCUMENT_20_outliers, dtype: float64
正常值    0.999665
异常值    0.000335
Name: FLAG_DOCUMENT_21_outliers, dtype: float64
正常值    0.994712
异常值    0.005288
Name: AMT_REQ_CREDIT_BUREAU_HOUR_outliers, dtype: float64
正常值    0.995158
异常值    0.004842
Name: AMT_REQ_CREDIT_BUREAU_DAY_outliers, dtype: float64
正常值    0.972242
异常值    0.027758
Name: AMT_REQ_CREDIT_BUREAU_WEEK_outliers, dtype: float64
正常值    0.98948
异常值    0.01052
Name: AMT_REQ_CREDIT_BUREAU_MON_outliers, dtype: float64
正常值    0.992517
异常值    0.007483
Name: AMT_REQ_CREDIT_BUREAU_QRT_outliers, dtype: float64
正常值    0.989061
异常值    0.010939
Name: AMT_REQ_CREDIT_BUREAU_YEAR_outliers, dtype: float64
正常值    1.0
Name: CODE_GENDER_F_outliers, dtype: float64
正常值    1.0
Name: CODE_GENDER_M_outliers, dtype: float64
正常值    0.989376
异常值    0.010624
Name: NAME_TYPE_SUITE_Children_outliers, dtype: float64
正常值    1.0
Name: NAME_TYPE_SUITE_Family_outliers, dtype: float64
正常值    0.999119
异常值    0.000881
Name: NAME_TYPE_SUITE_Group of people_outliers, dtype: float64
正常值    0.997184
异常值    0.002816
Name: NAME_TYPE_SUITE_Other_A_outliers, dtype: float64
正常值    0.994244
异常值    0.005756
Name: NAME_TYPE_SUITE_Other_B_outliers, dtype: float64
正常值    0.963026
异常值    0.036974
Name: NAME_TYPE_SUITE_Spouse, partner_outliers, dtype: float64
正常值    1.0
Name: NAME_TYPE_SUITE_Unaccompanied_outliers, dtype: float64
正常值    0.999967
异常值    0.000033
Name: NAME_INCOME_TYPE_Businessman_outliers, dtype: float64
正常值    1.0
Name: NAME_INCOME_TYPE_Commercial associate_outliers, dtype: float64
正常值    1.0
Name: NAME_INCOME_TYPE_Pensioner_outliers, dtype: float64
正常值    0.929424
异常值    0.070576
Name: NAME_INCOME_TYPE_State servant_outliers, dtype: float64
正常值    0.999941
异常值    0.000059
Name: NAME_INCOME_TYPE_Student_outliers, dtype: float64
正常值    0.999928
异常值    0.000072
Name: NAME_INCOME_TYPE_Unemployed_outliers, dtype: float64
正常值    1.0
Name: NAME_INCOME_TYPE_Working_outliers, dtype: float64
正常值    0.999467
异常值    0.000533
Name: NAME_EDUCATION_TYPE_Academic degree_outliers, dtype: float64
正常值    1.0
Name: NAME_EDUCATION_TYPE_Higher education_outliers, dtype: float64
正常值    0.96658
异常值    0.03342
Name: NAME_EDUCATION_TYPE_Incomplete higher_outliers, dtype: float64
正常值    0.987591
异常值    0.012409
Name: NAME_EDUCATION_TYPE_Lower secondary_outliers, dtype: float64
正常值    1.0
Name: NAME_EDUCATION_TYPE_Secondary / secondary special_outliers, dtype: float64
正常值    0.903174
异常值    0.096826
Name: NAME_FAMILY_STATUS_Civil marriage_outliers, dtype: float64
正常值    1.0
Name: NAME_FAMILY_STATUS_Married_outliers, dtype: float64
正常值    0.93571
异常值    0.06429
Name: NAME_FAMILY_STATUS_Separated_outliers, dtype: float64
正常值    1.0
Name: NAME_FAMILY_STATUS_Single / not married_outliers, dtype: float64
正常值    0.947683
异常值    0.052317
Name: NAME_FAMILY_STATUS_Widow_outliers, dtype: float64
正常值    0.996351
异常值    0.003649
Name: NAME_HOUSING_TYPE_Co-op apartment_outliers, dtype: float64
正常值    1.0
Name: NAME_HOUSING_TYPE_House / apartment_outliers, dtype: float64
正常值    0.963634
异常值    0.036366
Name: NAME_HOUSING_TYPE_Municipal apartment_outliers, dtype: float64
正常值    0.99149
异常值    0.00851
Name: NAME_HOUSING_TYPE_Office apartment_outliers, dtype: float64
正常值    0.984127
异常值    0.015873
Name: NAME_HOUSING_TYPE_Rented apartment_outliers, dtype: float64
正常值    0.951742
异常值    0.048258
Name: NAME_HOUSING_TYPE_With parents_outliers, dtype: float64
正常值    0.968089
异常值    0.031911
Name: OCCUPATION_TYPE_Accountants_outliers, dtype: float64
正常值    0.984869
异常值    0.015131
Name: OCCUPATION_TYPE_Cleaning staff_outliers, dtype: float64
正常值    0.980664
异常值    0.019336
Name: OCCUPATION_TYPE_Cooking staff_outliers, dtype: float64
正常值    0.910345
异常值    0.089655
Name: OCCUPATION_TYPE_Core staff_outliers, dtype: float64
正常值    0.939505
异常值    0.060495
Name: OCCUPATION_TYPE_Drivers_outliers, dtype: float64
正常值    0.998169
异常值    0.001831
Name: OCCUPATION_TYPE_HR staff_outliers, dtype: float64
正常值    0.962993
异常值    0.037007
Name: OCCUPATION_TYPE_High skill tech staff_outliers, dtype: float64
正常值    0.998289
异常值    0.001711
Name: OCCUPATION_TYPE_IT staff_outliers, dtype: float64
正常值    1.0
Name: OCCUPATION_TYPE_Laborers_outliers, dtype: float64
正常值    0.993194
异常值    0.006806
Name: OCCUPATION_TYPE_Low-skill Laborers_outliers, dtype: float64
正常值    0.930503
异常值    0.069497
Name: OCCUPATION_TYPE_Managers_outliers, dtype: float64
正常值    0.972238
异常值    0.027762
Name: OCCUPATION_TYPE_Medicine staff_outliers, dtype: float64
正常值    0.991376
异常值    0.008624
Name: OCCUPATION_TYPE_Private service staff_outliers, dtype: float64
正常值    0.997558
异常值    0.002442
Name: OCCUPATION_TYPE_Realty agents_outliers, dtype: float64
正常值    1.0
Name: OCCUPATION_TYPE_Sales staff_outliers, dtype: float64
正常值    0.995756
异常值    0.004244
Name: OCCUPATION_TYPE_Secretaries_outliers, dtype: float64
正常值    0.978144
异常值    0.021856
Name: OCCUPATION_TYPE_Security staff_outliers, dtype: float64
正常值    0.995616
异常值    0.004384
Name: OCCUPATION_TYPE_Waiters/barmen staff_outliers, dtype: float64
正常值    1.0
Name: WEEKDAY_APPR_PROCESS_START_FRIDAY_outliers, dtype: float64
正常值    1.0
Name: WEEKDAY_APPR_PROCESS_START_MONDAY_outliers, dtype: float64
正常值    1.0
Name: WEEKDAY_APPR_PROCESS_START_SATURDAY_outliers, dtype: float64
正常值    0.947381
异常值    0.052619
Name: WEEKDAY_APPR_PROCESS_START_SUNDAY_outliers, dtype: float64
正常值    1.0
Name: WEEKDAY_APPR_PROCESS_START_THURSDAY_outliers, dtype: float64
正常值    1.0
Name: WEEKDAY_APPR_PROCESS_START_TUESDAY_outliers, dtype: float64
正常值    1.0
Name: WEEKDAY_APPR_PROCESS_START_WEDNESDAY_outliers, dtype: float64
正常值    0.998605
异常值    0.001395
Name: ORGANIZATION_TYPE_Advertising_outliers, dtype: float64
正常值    0.99202
异常值    0.00798
Name: ORGANIZATION_TYPE_Agriculture_outliers, dtype: float64
正常值    0.991847
异常值    0.008153
Name: ORGANIZATION_TYPE_Bank_outliers, dtype: float64
正常值    0.980541
异常值    0.019459
Name: ORGANIZATION_TYPE_Business Entity Type 1_outliers, dtype: float64
正常值    0.965683
异常值    0.034317
Name: ORGANIZATION_TYPE_Business Entity Type 2_outliers, dtype: float64
正常值    1.0
Name: ORGANIZATION_TYPE_Business Entity Type 3_outliers, dtype: float64
正常值    0.999155
异常值    0.000845
Name: ORGANIZATION_TYPE_Cleaning_outliers, dtype: float64
正常值    0.978144
异常值    0.021856
Name: ORGANIZATION_TYPE_Construction_outliers, dtype: float64
正常值    0.998768
异常值    0.001232
Name: ORGANIZATION_TYPE_Culture_outliers, dtype: float64
正常值    0.996911
异常值    0.003089
Name: ORGANIZATION_TYPE_Electricity_outliers, dtype: float64
正常值    0.998179
异常值    0.001821
Name: ORGANIZATION_TYPE_Emergency_outliers, dtype: float64
正常值    0.966167
异常值    0.033833
Name: ORGANIZATION_TYPE_Government_outliers, dtype: float64
正常值    0.996859
异常值    0.003141
Name: ORGANIZATION_TYPE_Hotel_outliers, dtype: float64
正常值    0.990381
异常值    0.009619
Name: ORGANIZATION_TYPE_Housing_outliers, dtype: float64
正常值    0.996621
异常值    0.003379
Name: ORGANIZATION_TYPE_Industry: type 1_outliers, dtype: float64
正常值    0.999646
异常值    0.000354
Name: ORGANIZATION_TYPE_Industry: type 10_outliers, dtype: float64
正常值    0.991207
异常值    0.008793
Name: ORGANIZATION_TYPE_Industry: type 11_outliers, dtype: float64
正常值    0.9988
异常值    0.0012
Name: ORGANIZATION_TYPE_Industry: type 12_outliers, dtype: float64
正常值    0.999782
异常值    0.000218
Name: ORGANIZATION_TYPE_Industry: type 13_outliers, dtype: float64
正常值    0.998511
异常值    0.001489
Name: ORGANIZATION_TYPE_Industry: type 2_outliers, dtype: float64
正常值    0.98934
异常值    0.01066
Name: ORGANIZATION_TYPE_Industry: type 3_outliers, dtype: float64
正常值    0.997148
异常值    0.002852
Name: ORGANIZATION_TYPE_Industry: type 4_outliers, dtype: float64
正常值    0.998052
异常值    0.001948
Name: ORGANIZATION_TYPE_Industry: type 5_outliers, dtype: float64
正常值    0.999636
异常值    0.000364
Name: ORGANIZATION_TYPE_Industry: type 6_outliers, dtype: float64
正常值    0.99575
异常值    0.00425
Name: ORGANIZATION_TYPE_Industry: type 7_outliers, dtype: float64
正常值    0.999922
异常值    0.000078
Name: ORGANIZATION_TYPE_Industry: type 8_outliers, dtype: float64
正常值    0.989048
异常值    0.010952
Name: ORGANIZATION_TYPE_Industry: type 9_outliers, dtype: float64
正常值    0.998059
异常值    0.001941
Name: ORGANIZATION_TYPE_Insurance_outliers, dtype: float64
正常值    0.977627
异常值    0.022373
Name: ORGANIZATION_TYPE_Kindergarten_outliers, dtype: float64
正常值    0.999008
异常值    0.000992
Name: ORGANIZATION_TYPE_Legal Services_outliers, dtype: float64
正常值    0.963601
异常值    0.036399
Name: ORGANIZATION_TYPE_Medicine_outliers, dtype: float64
正常值    0.991434
异常值    0.008566
Name: ORGANIZATION_TYPE_Military_outliers, dtype: float64
正常值    0.998969
异常值    0.001031
Name: ORGANIZATION_TYPE_Mobile_outliers, dtype: float64
正常值    0.945748
异常值    0.054252
Name: ORGANIZATION_TYPE_Other_outliers, dtype: float64
正常值    0.992387
异常值    0.007613
Name: ORGANIZATION_TYPE_Police_outliers, dtype: float64
正常值    0.992986
异常值    0.007014
Name: ORGANIZATION_TYPE_Postal_outliers, dtype: float64
正常值    0.998712
异常值    0.001288
Name: ORGANIZATION_TYPE_Realtor_outliers, dtype: float64
正常值    0.999724
异常值    0.000276
Name: ORGANIZATION_TYPE_Religion_outliers, dtype: float64
正常值    0.994111
异常值    0.005889
Name: ORGANIZATION_TYPE_Restaurant_outliers, dtype: float64
正常值    0.971081
异常值    0.028919
Name: ORGANIZATION_TYPE_School_outliers, dtype: float64
正常值    0.989441
异常值    0.010559
Name: ORGANIZATION_TYPE_Security_outliers, dtype: float64
正常值    0.993581
异常值    0.006419
Name: ORGANIZATION_TYPE_Security Ministries_outliers, dtype: float64
正常值    1.0
Name: ORGANIZATION_TYPE_Self-employed_outliers, dtype: float64
正常值    0.994878
异常值    0.005122
Name: ORGANIZATION_TYPE_Services_outliers, dtype: float64
正常值    0.998124
异常值    0.001876
Name: ORGANIZATION_TYPE_Telecom_outliers, dtype: float64
正常值    0.998868
异常值    0.001132
Name: ORGANIZATION_TYPE_Trade: type 1_outliers, dtype: float64
正常值    0.993821
异常值    0.006179
Name: ORGANIZATION_TYPE_Trade: type 2_outliers, dtype: float64
正常值    0.988644
异常值    0.011356
Name: ORGANIZATION_TYPE_Trade: type 3_outliers, dtype: float64
正常值    0.999792
异常值    0.000208
Name: ORGANIZATION_TYPE_Trade: type 4_outliers, dtype: float64
正常值    0.999841
异常值    0.000159
Name: ORGANIZATION_TYPE_Trade: type 5_outliers, dtype: float64
正常值    0.997948
异常值    0.002052
Name: ORGANIZATION_TYPE_Trade: type 6_outliers, dtype: float64
正常值    0.974534
异常值    0.025466
Name: ORGANIZATION_TYPE_Trade: type 7_outliers, dtype: float64
正常值    0.999346
异常值    0.000654
Name: ORGANIZATION_TYPE_Transport: type 1_outliers, dtype: float64
正常值    0.992833
异常值    0.007167
Name: ORGANIZATION_TYPE_Transport: type 2_outliers, dtype: float64
正常值    0.99614
异常值    0.00386
Name: ORGANIZATION_TYPE_Transport: type 3_outliers, dtype: float64
正常值    0.982446
异常值    0.017554
Name: ORGANIZATION_TYPE_Transport: type 4_outliers, dtype: float64
正常值    0.995685
异常值    0.004315
Name: ORGANIZATION_TYPE_University_outliers, dtype: float64
正常值    1.0
Name: ORGANIZATION_TYPE_XNA_outliers, dtype: float64
正常值    0.981506
异常值    0.018494
Name: FONDKAPREMONT_MODE_not specified_outliers, dtype: float64
正常值    0.981727
异常值    0.018273
Name: FONDKAPREMONT_MODE_org spec account_outliers, dtype: float64
正常值    1.0
Name: FONDKAPREMONT_MODE_reg oper account_outliers, dtype: float64
正常值    0.960717
异常值    0.039283
Name: FONDKAPREMONT_MODE_reg oper spec account_outliers, dtype: float64
正常值    1.0
Name: HOUSETYPE_MODE_block of flats_outliers, dtype: float64
正常值    0.995125
异常值    0.004875
Name: HOUSETYPE_MODE_specific housing_outliers, dtype: float64
正常值    0.996059
异常值    0.003941
Name: HOUSETYPE_MODE_terraced house_outliers, dtype: float64
正常值    0.96991
异常值    0.03009
Name: WALLSMATERIAL_MODE_Block_outliers, dtype: float64
正常值    0.992534
异常值    0.007466
Name: WALLSMATERIAL_MODE_Mixed_outliers, dtype: float64
正常值    0.994215
异常值    0.005785
Name: WALLSMATERIAL_MODE_Monolithic_outliers, dtype: float64
正常值    0.994716
异常值    0.005284
Name: WALLSMATERIAL_MODE_Others_outliers, dtype: float64
正常值    1.0
Name: WALLSMATERIAL_MODE_Panel_outliers, dtype: float64
正常值    1.0
Name: WALLSMATERIAL_MODE_Stone, brick_outliers, dtype: float64
正常值    0.982563
异常值    0.017437
Name: WALLSMATERIAL_MODE_Wooden_outliers, dtype: float64
正常值    1.0
Name: EMERGENCYSTATE_MODE_No_outliers, dtype: float64
正常值    0.99243
异常值    0.00757
Name: EMERGENCYSTATE_MODE_Yes_outliers, dtype: float64
正常值    0.919271
异常值    0.080729
Name: TARGET_outliers, dtype: float64
1
2
3
4
5
# Anomaly check: use describe() summary statistics (mean, min, max, ...)
# together with common sense and business knowledge to spot implausible values.
# DAYS_BIRTH is stored as a negative day count at this point, so dividing by
# -365 expresses it as an age in years.
app_train['DAYS_BIRTH'].div(-365).describe()
count    307511.000000
mean         43.936973
std          11.956133
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: DAYS_BIRTH, dtype: float64
1
2
3
# Years of employment; the minimum of about -1000 years in the summary is
# clearly anomalous.
app_train['DAYS_EMPLOYED'].div(-365).describe()
count    307511.000000
mean       -174.835742
std         387.056895
min       -1000.665753
25%           0.791781
50%           3.323288
75%           7.561644
max          49.073973
Name: DAYS_EMPLOYED, dtype: float64
1
2
3
4
5
# Day counts in this dataset are all recorded as negative numbers, so the
# positive value 365243 is probably a placeholder standing in for missing data.
app_train['DAYS_EMPLOYED'].plot(kind = 'hist', title = 'Days Employment Histogram')
plt.xlabel('Days Employment')
Text(0.5, 0, 'Days Employment')

fqvOt1.png

1
2
3
4
5
6
7
8
9
10
11
# Compare the default rate of rows carrying the 365243 placeholder with the
# rest of the data; the printed rates show the placeholder rows default less.
anom = app_train.loc[app_train['DAYS_EMPLOYED'].eq(365243)]
non_anom = app_train.loc[app_train['DAYS_EMPLOYED'].ne(365243)]

non_anom_rate = 100 * non_anom['TARGET'].mean()
anom_rate = 100 * anom['TARGET'].mean()

print('The non-anomalies default on %0.2f%% of loans' % non_anom_rate)
print('The anomalies default on %0.2f%% of loans' % anom_rate)
print('There are %d anomalous days of employment' % len(anom))
The non-anomalies default on 8.66% of loans
The anomalies default on 5.40% of loans
There are 55374 anomalous days of employment
1
2
3
4
5
6
7
8
9
10
11
12
13
# Add a boolean column flagging the rows that hold the 365243 placeholder.
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243

# Replace the placeholder with NaN. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on the selected Series: with
# pandas Copy-on-Write the in-place call only changes a temporary object and
# leaves app_train untouched.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

# Re-plot the distribution now that the placeholder is gone.
app_train['DAYS_EMPLOYED'].plot.hist(title = 'Days Employment Histogram')

plt.xlabel('Days Employment')
Text(0.5, 0, 'Days Employment')

fqvLkR.png

1
2
3
4
5
6
7
8
9
# Apply the same treatment to the test set: flag the placeholder rows first...
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243

# ...then replace the placeholder with NaN. Assigning the result back (rather
# than replace(..., inplace=True) on the selected Series) guarantees app_test
# is really updated, including under pandas Copy-on-Write.
app_test["DAYS_EMPLOYED"] = app_test["DAYS_EMPLOYED"].replace({365243: np.nan})

print('There are %d anomalies in the test data out of %d entries' % (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))
There are 9274 anomalies in the test data out of 48744 entries
1
2
3
4
5
6
7
8
9
# Correlation of every column with TARGET, sorted in ascending order so the
# most negative values come first and the most positive ones last.
correlations = app_train.corr().loc[:, 'TARGET'].sort_values()

print('Most Positive Correlations:\n', correlations.iloc[-15:])
print('\nMost Negative Correlations:\n', correlations.iloc[:15])
Most Positive Correlations:
 OCCUPATION_TYPE_Laborers                             0.043019
FLAG_DOCUMENT_3                                      0.044346
REG_CITY_NOT_LIVE_CITY                               0.044395
FLAG_EMP_PHONE                                       0.045982
NAME_EDUCATION_TYPE_Secondary / secondary special    0.049824
REG_CITY_NOT_WORK_CITY                               0.050994
DAYS_ID_PUBLISH                                      0.051457
CODE_GENDER_M                                        0.054713
DAYS_LAST_PHONE_CHANGE                               0.055218
NAME_INCOME_TYPE_Working                             0.057481
REGION_RATING_CLIENT                                 0.058899
REGION_RATING_CLIENT_W_CITY                          0.060893
DAYS_EMPLOYED                                        0.074958
DAYS_BIRTH                                           0.078239
TARGET                                               1.000000
Name: TARGET, dtype: float64

Most Negative Correlations:
 EXT_SOURCE_3                           -0.178919
EXT_SOURCE_2                           -0.160472
EXT_SOURCE_1                           -0.155317
NAME_EDUCATION_TYPE_Higher education   -0.056593
CODE_GENDER_F                          -0.054704
NAME_INCOME_TYPE_Pensioner             -0.046209
DAYS_EMPLOYED_ANOM                     -0.045987
ORGANIZATION_TYPE_XNA                  -0.045987
FLOORSMAX_AVG                          -0.044003
FLOORSMAX_MEDI                         -0.043768
FLOORSMAX_MODE                         -0.043226
EMERGENCYSTATE_MODE_No                 -0.042201
HOUSETYPE_MODE_block of flats          -0.040594
AMT_GOODS_PRICE                        -0.039645
REGION_POPULATION_RELATIVE             -0.037227
Name: TARGET, dtype: float64
1
2
3
4
5
# DAYS_BIRTH is negative in the raw data. Take the absolute value so that a
# larger number means an older client, then recompute the correlation with
# TARGET (older clients are expected to default less often).
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

# Apply the same sign flip to the test set. Every later step (imputer, scaler,
# polynomial features, DAYS_EMPLOYED / DAYS_BIRTH ratio) is fit on app_train
# and then applied to app_test, so the column must have the same sign in both.
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])

app_train['DAYS_BIRTH'].corr(app_train['TARGET'])
-0.07823930830982712
1
2
3
4
5
6
7
8
9
# Select the plotting style.
plt.style.use('fivethirtyeight')

# Histogram of client age in years (DAYS_BIRTH / 365).
plt.hist(app_train['DAYS_BIRTH'].div(365), bins = 25, edgecolor = 'k')
plt.title('Age of Client')
plt.xlabel('Age (years)')
plt.ylabel('Count')
Text(0, 0.5, 'Count')

fqvx1K.png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
plt.figure(figsize = (10, 8))

# One KDE curve of age (in years) per TARGET value: first the clients that
# repaid (TARGET == 0), then the clients that defaulted (TARGET == 1).
for target_value, curve_label in ((0, 'target == 0'), (1, 'target == 1')):
    ages_in_years = app_train.loc[app_train['TARGET'] == target_value, 'DAYS_BIRTH'] / 365
    sns.kdeplot(data = ages_in_years, label = curve_label)

# Axis labels and title.
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')

# Important: the labels passed to kdeplot are only drawn once legend() is called.
plt.legend()
<matplotlib.legend.Legend at 0x158876e93c8>

fqvvp6.png

1
2
3
4
5
6
7
8
9
10
11
12
13
# Bin the clients by age.
# Work on an explicit copy of the two needed columns: adding columns to a bare
# selection of app_train is chained assignment (SettingWithCopyWarning).
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()

age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365

# np.linspace(20, 70, num = 11) yields 11 evenly spaced edges between 20 and
# 70, i.e. ten 5-year bins.
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))

age_data.head(10)

TARGET DAYS_BIRTH YEARS_BIRTH YEARS_BINNED
0 1 9461 25.920548 (25.0, 30.0]
1 0 16765 45.931507 (45.0, 50.0]
2 0 19046 52.180822 (50.0, 55.0]
3 0 19005 52.068493 (50.0, 55.0]
4 0 19932 54.608219 (50.0, 55.0]
5 0 16941 46.413699 (45.0, 50.0]
6 0 13778 37.747945 (35.0, 40.0]
7 0 18850 51.643836 (50.0, 55.0]
8 0 20099 55.065753 (55.0, 60.0]
9 0 14469 39.641096 (35.0, 40.0]
1
2
3
4
5
# Group by age bin and take the mean of every column; the mean of TARGET is
# the default rate of the bin.
age_groups = age_data.groupby('YEARS_BINNED').agg('mean')

age_groups

TARGET DAYS_BIRTH YEARS_BIRTH
YEARS_BINNED
(20.0, 25.0] 0.123036 8532.795625 23.377522
(25.0, 30.0] 0.111436 10155.219250 27.822518
(30.0, 35.0] 0.102814 11854.848377 32.479037
(35.0, 40.0] 0.089414 13707.908253 37.555913
(40.0, 45.0] 0.078491 15497.661233 42.459346
(45.0, 50.0] 0.074171 17323.900441 47.462741
(50.0, 55.0] 0.066968 19196.494791 52.593136
(55.0, 60.0] 0.055314 20984.262742 57.491131
(60.0, 65.0] 0.052737 22780.547460 62.412459
(65.0, 70.0] 0.037270 24292.614340 66.555108
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
plt.figure(figsize = (8, 8))

# Bar chart of the mean default rate (in percent) for each age bin.
bin_labels = age_groups.index.astype(str)
default_rate_pct = age_groups['TARGET'].mul(100)
plt.bar(bin_labels, default_rate_pct)

# Axes.
plt.xticks(rotation = 75)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group')
Text(0.5, 1.0, 'Failure to Repay by Age Group')

fqvz6O.png

1
2
3
4
5
6
7
# Correlations among TARGET, the three EXT_SOURCE variables (the ones most
# negatively correlated with TARGET) and DAYS_BIRTH.
ext_data = app_train.loc[:, ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()

ext_data_corrs

TARGET EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH
TARGET 1.000000 -0.155317 -0.160472 -0.178919 -0.078239
EXT_SOURCE_1 -0.155317 1.000000 0.213982 0.186846 0.600610
EXT_SOURCE_2 -0.160472 0.213982 1.000000 0.109167 0.091996
EXT_SOURCE_3 -0.178919 0.186846 0.109167 1.000000 0.205478
DAYS_BIRTH -0.078239 0.600610 0.091996 0.205478 1.000000
1
2
3
4
5
6
7
plt.figure(figsize = (8, 6))

# Heatmap of the correlation matrix: annot=True writes the value into each
# cell, vmin / vmax fix the range covered by the colour map.
sns.heatmap(
    ext_data_corrs,
    annot = True,
    vmin = -0.25,
    vmax = 0.6,
    cmap = plt.cm.RdYlBu_r,
)
plt.title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')

fqxSXD.png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
plt.figure(figsize = (10, 12))

# One subplot per EXT_SOURCE variable, stacked in a 3-row, 1-column grid
# (subplot takes the number of rows, the number of columns and a 1-based index).
for position, source in enumerate(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'], start = 1):
    plt.subplot(3, 1, position)

    # KDE for clients that repaid, then for clients that defaulted.
    repaid = app_train['TARGET'] == 0
    defaulted = app_train['TARGET'] == 1
    sns.kdeplot(app_train.loc[repaid, source], label = 'target == 0')
    sns.kdeplot(app_train.loc[defaulted, source], label = 'target == 1')

    # Title, axis labels and legend.
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source)
    plt.ylabel('Density')
    plt.legend()

plt.tight_layout(h_pad = 2.5)

fqx9ne.png

特征工程

两种简单的特征构建方式

  • 多项式特征
  • 领域知识特征

构造的特征是否有用不能轻易断定,只有试了之后看效果才能知道。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Polynomial features are built with scikit-learn's PolynomialFeatures.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

# Dedicated frames holding only the columns used for the polynomial features.
poly_features = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']]
poly_features_test = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]

# Keep the label aside and remove it from the feature frame.
poly_target = poly_features['TARGET']
poly_features = poly_features.drop(columns = ['TARGET'])

# Fill missing values with the median. The imputer is fit on the training
# set only and the same transformation is then applied to the test set.
imputer = SimpleImputer(strategy = 'median')
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

# Degree-3 polynomial expansion, again fit on the training data only.
poly_transformer = PolynomialFeatures(degree = 3)
poly_features = poly_transformer.fit_transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)

print('Polynomial Features shape: ', poly_features.shape)
Polynomial Features shape:  (307511, 35)
1
2
3
# Many features were generated; ask the transformer for their names and
# preview the first 15.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so prefer the new method and fall back to the old one.
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_feature_names = poly_transformer.get_feature_names_out(poly_input_names)
else:
    poly_feature_names = poly_transformer.get_feature_names(input_features = poly_input_names)

# Convert to plain str so the preview is a list of strings with either API.
[str(name) for name in poly_feature_names][:15]
['1',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'DAYS_BIRTH',
 'EXT_SOURCE_1^2',
 'EXT_SOURCE_1 EXT_SOURCE_2',
 'EXT_SOURCE_1 EXT_SOURCE_3',
 'EXT_SOURCE_1 DAYS_BIRTH',
 'EXT_SOURCE_2^2',
 'EXT_SOURCE_2 EXT_SOURCE_3',
 'EXT_SOURCE_2 DAYS_BIRTH',
 'EXT_SOURCE_3^2',
 'EXT_SOURCE_3 DAYS_BIRTH',
 'DAYS_BIRTH^2']
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Check how strongly the new features correlate with the target.

# Resolve the generated column names. scikit-learn 1.0 added
# get_feature_names_out() and 1.2 removed get_feature_names(), so prefer the
# new method and fall back to the old one.
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_feature_names = poly_transformer.get_feature_names_out(poly_input_names)
else:
    poly_feature_names = poly_transformer.get_feature_names(poly_input_names)

# Wrap the generated feature array in a DataFrame with those names.
poly_features = pd.DataFrame(poly_features, columns = [str(name) for name in poly_feature_names])

# Add the target column.
poly_features['TARGET'] = poly_target

# Correlation of every column with the target, sorted ascending.
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Show the first ten and the last five entries.
print(poly_corrs.head(10))

print(poly_corrs.tail(5))
EXT_SOURCE_2 EXT_SOURCE_3                -0.193939
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3   -0.189605
EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH     -0.181283
EXT_SOURCE_2^2 EXT_SOURCE_3              -0.176428
EXT_SOURCE_2 EXT_SOURCE_3^2              -0.172282
EXT_SOURCE_1 EXT_SOURCE_2                -0.166625
EXT_SOURCE_1 EXT_SOURCE_3                -0.164065
EXT_SOURCE_2                             -0.160295
EXT_SOURCE_2 DAYS_BIRTH                  -0.156873
EXT_SOURCE_1 EXT_SOURCE_2^2              -0.156867
Name: TARGET, dtype: float64
DAYS_BIRTH     -0.078239
DAYS_BIRTH^2   -0.076672
DAYS_BIRTH^3   -0.074273
TARGET          1.000000
1                    NaN
Name: TARGET, dtype: float64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Resolve the generated column names. scikit-learn 1.0 added
# get_feature_names_out() and 1.2 removed get_feature_names(), so prefer the
# new method and fall back to the old one.
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_feature_names = poly_transformer.get_feature_names_out(poly_input_names)
else:
    poly_feature_names = poly_transformer.get_feature_names(poly_input_names)

# Wrap the test-set polynomial features in a DataFrame.
poly_features_test = pd.DataFrame(poly_features_test, columns = [str(name) for name in poly_feature_names])

# Merge the polynomial features into the training data on SK_ID_CURR.
# how = 'left' keeps every row of the left frame; rows without a match on the
# right are filled with NaN.
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']

app_train_poly = app_train.merge(poly_features, on = 'SK_ID_CURR', how = 'left')

# Same operation for the test set.
poly_features_test['SK_ID_CURR'] = app_test['SK_ID_CURR']

app_test_poly = app_test.merge(poly_features_test, on = 'SK_ID_CURR', how = 'left')

# Align the training and test sets so both keep only their common columns.
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)

# Report the resulting shapes.
print('Training data with polynomial features shape: ', app_train_poly.shape)

print('Testing data with polynomial features shape: ', app_test_poly.shape)
Training data with polynomial features shape:  (307511, 275)
Testing data with polynomial features shape:   (48744, 275)
  • 领域知识特征
    • CREDIT_INCOME_PERCENT: 信用金额相对于客户收入的百分比
    • ANNUITY_INCOME_PERCENT: 贷款年金相对于客户收入的百分比
    • CREDIT_TERM: 贷款年金与信用金额之比(AMT_ANNUITY / AMT_CREDIT),其倒数即以月为单位的支付期限(因为年金是每月到期的金额)
    • DAYS_EMPLOYED_PERCENT: 雇佣天数占客户年龄的百分比
1
2
3
4
5
6
7
8
9
10
11
12
13
# Work on copies so the domain features do not leak into app_train / app_test.
app_train_domain = app_train.copy()
app_test_domain = app_test.copy()

# Build the training-set domain features; each one is the ratio of two
# existing columns: (new column, numerator, denominator).
for new_col, numerator, denominator in (
    ('CREDIT_INCOME_PERCENT', 'AMT_CREDIT', 'AMT_INCOME_TOTAL'),
    ('ANNUITY_INCOME_PERCENT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    ('CREDIT_TERM', 'AMT_ANNUITY', 'AMT_CREDIT'),
    ('DAYS_EMPLOYED_PERCENT', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
):
    app_train_domain[new_col] = app_train_domain[numerator] / app_train_domain[denominator]
1
2
3
4
5
6
7
8
9
# Build the same domain features on the test set; each one is the ratio of
# two existing columns: (new column, numerator, denominator).
for new_col, numerator, denominator in (
    ('CREDIT_INCOME_PERCENT', 'AMT_CREDIT', 'AMT_INCOME_TOTAL'),
    ('ANNUITY_INCOME_PERCENT', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    ('CREDIT_TERM', 'AMT_ANNUITY', 'AMT_CREDIT'),
    ('DAYS_EMPLOYED_PERCENT', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
):
    app_test_domain[new_col] = app_test_domain[numerator] / app_test_domain[denominator]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Visualise the new domain features.
plt.figure(figsize = (12, 20))

# One subplot per feature in a 4-row, 1-column grid (1-based position).
domain_feature_list = ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']
for position, feature in enumerate(domain_feature_list, start = 1):
    plt.subplot(4, 1, position)

    # KDE for clients that repaid, then for clients that defaulted.
    repaid = app_train_domain['TARGET'] == 0
    defaulted = app_train_domain['TARGET'] == 1
    sns.kdeplot(app_train_domain.loc[repaid, feature], label = 'target == 0')
    sns.kdeplot(app_train_domain.loc[defaulted, feature], label = 'target == 1')

    # Title, axis labels and legend.
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')
    plt.legend()

plt.tight_layout(h_pad = 2.5)

fqxFAA.png

Baseline

首先对数据进行预处理:

  • 对象数据编码(encoding)

  • 填充缺失值(imputation)

  • 归一化特征范围(feature scaling)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Separate the features from the label: drop TARGET when it is present,
# otherwise just work on a copy of the training frame.
train = app_train.drop(columns = ['TARGET']) if 'TARGET' in app_train else app_train.copy()

# Remember the feature names (the arrays produced below carry none).
features = train.columns.tolist()

# Copy of the test set.
test = app_test.copy()

# Median imputation for missing values, then scaling of every feature to [0, 1].
imputer = SimpleImputer(strategy = 'median')
scaler = MinMaxScaler(feature_range = (0, 1))

# The imputer is fit on the training data only and applied to both sets.
train = imputer.fit_transform(train)
test = imputer.transform(test)

# Likewise the scaler is fit on the training data and applied to both sets.
train = scaler.fit_transform(train)
test = scaler.transform(test)

# Report the resulting shapes.
print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)
Training data shape:  (307511, 240)
Testing data shape:  (48744, 240)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# The baseline model is scikit-learn's LogisticRegression.
from sklearn.linear_model import LogisticRegression

# Create the model and train it on the preprocessed training data
# (fit returns the estimator itself).
log_reg = LogisticRegression(C = 0.0001).fit(train, train_labels)

# Predict. predict() would return one predicted label per row, whereas
# predict_proba() returns one probability per class, each row summing to 1.
# For this binary problem every row is [P(class 0), P(class 1)].
log_reg_pred = log_reg.predict_proba(test)

log_reg_pred
array([[0.92148542, 0.07851458],
       [0.8620737 , 0.1379263 ],
       [0.91780633, 0.08219367],
       ...,
       [0.92277534, 0.07722466],
       [0.91761109, 0.08238891],
       [0.89786466, 0.10213534]])
1
2
3
# Check which class each probability column belongs to; the displayed order
# [0, 1] means the first column is P(class 0) and the second is P(class 1).

log_reg.classes_
array([0, 1], dtype=int64)
1
2
3
4
5
# We need the probability of default, so keep only the second column (class 1).

log_reg_pred = log_reg.predict_proba(test)[:, 1]

log_reg_pred
array([0.07851458, 0.1379263 , 0.08219367, ..., 0.07722466, 0.08238891,
       0.10213534])
1
2
3
4
5
6
7
# Submission dataframe: client id plus predicted probability of default.
# Take an explicit copy of the id column so that adding TARGET does not write
# into a slice of app_test (chained assignment / SettingWithCopyWarning).
submit = app_test[['SK_ID_CURR']].copy()

submit['TARGET'] = log_reg_pred

submit.head()

SK_ID_CURR TARGET
0 100001 0.078515
1 100005 0.137926
2 100013 0.082194
3 100028 0.080921
4 100038 0.132618
1
2
3
# Save the result as CSV. Create the output directory first: to_csv raises
# if the target directory does not exist.
os.makedirs('./1_result', exist_ok = True)

submit.to_csv('./1_result/log_reg_baseline.csv', index = False)

随机森林模型

1
2
3
4
5
6
7
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline:
#   n_estimators - number of trees
#   n_jobs       - number of cores to use, -1 meaning all of them
#   verbose      - 0 prints no log output, larger values print progress
random_forest = RandomForestClassifier(
    n_estimators = 100,
    n_jobs = -1,
    verbose = 1,
    random_state = 50,
)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Train the random forest on the training set.
random_forest.fit(train, train_labels)

# Collect the feature importances next to their feature names.
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features}).assign(importance = feature_importance_values)

# Predict on the test set, keeping the probability of default (second column).
test_class_probabilities = random_forest.predict_proba(test)
predictions = test_class_probabilities[:, 1]
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished
1
2
3
4
5
6
7
8
9
# Prepare the submission: client id plus predicted probability of default.
# Take an explicit copy of the id column so that adding TARGET does not write
# into a slice of app_test (chained assignment / SettingWithCopyWarning).
submit = app_test[['SK_ID_CURR']].copy()

submit['TARGET'] = predictions

# Create the output directory first: to_csv raises if it does not exist.
os.makedirs('./1_result', exist_ok = True)

submit.to_csv('./1_result/random_forest_baseline.csv', index = False)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Check whether the engineered polynomial features help.

# Remember the feature names.
poly_features_names = app_train_poly.columns.tolist()

# Median imputation, fit on the training data and applied to both sets.
imputer = SimpleImputer(strategy = 'median')
imputer.fit(app_train_poly)
poly_features = imputer.transform(app_train_poly)
poly_features_test = imputer.transform(app_test_poly)

# Scale the polynomial features to [0, 1], again fit on the training data only.
scaler = MinMaxScaler(feature_range = (0, 1))
scaler.fit(poly_features)
poly_features = scaler.transform(poly_features)
poly_features_test = scaler.transform(poly_features_test)

# Build the model.
random_forest_poly = RandomForestClassifier(
    n_estimators = 100,
    n_jobs = -1,
    verbose = 1,
    random_state = 50,
)
1
2
3
4
5
6
7
8
9
# Train the model on the polynomial feature set

random_forest_poly.fit(poly_features, train_labels)



# Predict on the test set, keeping the second probability column (class 1)

predictions = random_forest_poly.predict_proba(poly_features_test)[:, 1]
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished
1
2
3
4
5
6
7
8
9
# Prepare the submission: client id plus predicted probability of default.
# Take an explicit copy of the id column so that adding TARGET does not write
# into a slice of app_test (chained assignment / SettingWithCopyWarning).
submit = app_test[['SK_ID_CURR']].copy()

submit['TARGET'] = predictions

# Create the output directory first: to_csv raises if it does not exist.
os.makedirs('./1_result', exist_ok = True)

submit.to_csv('./1_result/random_forest_baseline_engineered.csv', index = False)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Check whether the domain-knowledge features help.

# Remove the label from the feature frame. errors = 'ignore' keeps the cell
# re-runnable: on a second run TARGET is already gone and an unguarded drop
# would raise KeyError.
app_train_domain = app_train_domain.drop(columns = 'TARGET', errors = 'ignore')

# Remember the feature names for the importance table below.
domain_features_names = list(app_train_domain.columns)

# Median imputation, fit on the training data and applied to both sets.
imputer = SimpleImputer(strategy = 'median')

domain_features = imputer.fit_transform(app_train_domain)
domain_features_test = imputer.transform(app_test_domain)

# Scale the domain features to [0, 1], again fit on the training data only.
scaler = MinMaxScaler(feature_range = (0, 1))

domain_features = scaler.fit_transform(domain_features)
domain_features_test = scaler.transform(domain_features_test)

# Build the model.
random_forest_domain = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)

# Train the model.
random_forest_domain.fit(domain_features, train_labels)

# Collect the feature importances next to their feature names.
feature_importance_values_domain = random_forest_domain.feature_importances_
feature_importances_domain = pd.DataFrame({'feature': domain_features_names, 'importance': feature_importance_values_domain})

# Predict on the test set, keeping the probability of default (second column).
predictions = random_forest_domain.predict_proba(domain_features_test)[:, 1]
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
1
2
3
4
5
6
7
8
9
# Prepare the submission: client id plus predicted probability of default.
# Take an explicit copy of the id column so that adding TARGET does not write
# into a slice of app_test (chained assignment / SettingWithCopyWarning).
submit = app_test[['SK_ID_CURR']].copy()

submit['TARGET'] = predictions

# Create the output directory first: to_csv raises if it does not exist.
os.makedirs('./1_result', exist_ok = True)

submit.to_csv('./1_result/random_forest_baseline_domain.csv', index = False)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def plot_feature_importances(df):
    """Plot the 15 most important features as a horizontal bar chart.

    Args:
        df (pd.DataFrame): Feature importances. The code reads the
            ``feature`` and ``importance`` columns.

    Returns:
        pd.DataFrame: The rows of ``df`` sorted by descending ``importance``
        with the index reset (the previous index is kept as an ``index``
        column) and an added ``importance_normalized`` column holding each
        importance divided by the sum of all importances.
    """
    # Rank the features, most important first.
    ranked = df.sort_values('importance', ascending = False).reset_index()

    # Normalise the importances so that they sum to 1.
    total_importance = ranked['importance'].sum()
    ranked['importance_normalized'] = ranked['importance'] / total_importance

    # Only the top 15 rows are drawn. Reversing their positions puts the most
    # important feature at the top of the chart.
    top_features = ranked.head(15)
    bar_positions = list(top_features.index)[::-1]

    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(bar_positions, top_features['importance_normalized'],
            edgecolor = 'k', align = 'center')

    # Label each bar with its feature name.
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])

    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()

    return ranked



# Plot the feature importances of the default (baseline) feature set

feature_importances_sorted = plot_feature_importances(feature_importances)

fqxC0H.png

1
2
3
# Plot the feature importances of the domain-knowledge feature set

feature_importances_domain_sorted = plot_feature_importances(feature_importances_domain)

fqxP7d.png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
from sklearn.model_selection import KFold

from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import gc



def model(features, test_features, encoding = 'ohe', n_folds = 5):
    """Train and evaluate a LightGBM classifier with K-fold cross validation.

    Parameters
    --------
        features (pd.DataFrame):
            Training data. Must contain the `SK_ID_CURR` and `TARGET` columns.
            The caller's frame is not modified.
        test_features (pd.DataFrame):
            Testing data. Must contain the `SK_ID_CURR` column.
            The caller's frame is not modified.
        encoding (str, default = 'ohe'):
            How categorical columns are encoded: 'ohe' for one-hot encoding,
            'le' for integer label encoding.
        n_folds (int, default = 5):
            Number of cross-validation folds.

    Return
    --------
        submission (pd.DataFrame):
            `SK_ID_CURR` and `TARGET` columns, where `TARGET` is the predicted
            probability of class 1 averaged over the fold models.
        feature_importances (pd.DataFrame):
            `feature` and `importance` columns, where `importance` is the
            model feature importance averaged over the folds.
        valid_metrics (pd.DataFrame):
            Train and validation ROC AUC for every fold plus an 'overall' row
            (mean of the fold train scores, out-of-fold validation AUC).

    Raises
    --------
        ValueError:
            If `encoding` is neither 'ohe' nor 'le'.
    """

    # Fail fast on an unsupported encoding, before any data is copied
    if encoding not in ('ohe', 'le'):
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    # Keep the test ids for the submission frame
    test_ids = test_features['SK_ID_CURR']

    # Keep the labels as a plain array: the fold indices produced by KFold are
    # positional, and indexing a Series with them is label-based, which picks
    # the wrong rows (or raises) whenever the frame has a non-default index
    labels = np.array(features['TARGET'])

    # Keep only the feature columns (drop returns new frames)
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])

    if encoding == 'ohe':
        # One-hot encoding
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        # Keep only the columns present in both frames
        features, test_features = features.align(test_features, join = 'inner', axis = 1)

        # No explicit categorical indices to record
        cat_indices = 'auto'

    else:
        # Integer label encoding
        label_encoder = LabelEncoder()

        # Positions of the categorical columns
        cat_indices = []

        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                train_values = np.array(features[col].astype(str)).reshape((-1,))
                test_values = np.array(test_features[col].astype(str)).reshape((-1,))

                # Fit on train and test together so a category that only
                # appears in the test set does not make transform() fail;
                # classes are sorted, so the codes are unchanged whenever the
                # test set has no unseen category
                label_encoder.fit(np.concatenate([train_values, test_values]))
                features[col] = label_encoder.transform(train_values)
                test_features[col] = label_encoder.transform(test_values)

                cat_indices.append(i)

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Remember the column names before converting to arrays
    feature_names = list(features.columns)

    features = np.array(features)
    test_features = np.array(test_features)

    # Cross-validation splitter
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Accumulators: averaged importances, averaged test predictions and the
    # out-of-fold predictions for the training rows
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])
    out_of_fold = np.zeros(features.shape[0])

    # Per-fold scores
    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(features):

        # Training and validation data of this fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # NOTE(review): LightGBM documents that subsample only takes effect
        # when subsample_freq > 0 - confirm whether bagging was intended.
        # The estimator is named `clf` so it does not shadow this function.
        clf = lgb.LGBMClassifier(n_estimators = 10000, objective = 'binary',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1,
                                 subsample = 0.8, n_jobs = -1, random_state = 50)

        fit_params = {
            'eval_metric': 'auc',
            'eval_set': [(valid_features, valid_labels), (train_features, train_labels)],
            'eval_names': ['valid', 'train'],
            'categorical_feature': cat_indices,
        }

        # LightGBM 4.0 removed the early_stopping_rounds / verbose arguments of
        # fit() in favour of callbacks; fall back to the old keywords on
        # releases that predate lgb.log_evaluation
        if hasattr(lgb, 'early_stopping') and hasattr(lgb, 'log_evaluation'):
            fit_params['callbacks'] = [lgb.early_stopping(100), lgb.log_evaluation(200)]
        else:
            fit_params['early_stopping_rounds'] = 100
            fit_params['verbose'] = 200

        clf.fit(train_features, train_labels, **fit_params)

        # Best iteration found by early stopping
        best_iteration = clf.best_iteration_

        # Average the importances over the folds
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Average the test predictions over the folds
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Out-of-fold predictions for the validation rows
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Best scores of this fold
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Free the fold data before the next fold is built
        del clf, train_features, valid_features
        gc.collect()

    # Submission frame
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Feature importances
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score from the out-of-fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Append the overall scores after the per-fold scores
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # One row per fold plus the overall row
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics
1
2
3
4
5
# Run the cross-validated LightGBM baseline on the application data with the
# default settings (one-hot encoding, 5 folds) and print the per-fold metrics
submission, fi, metrics = model(app_train, app_test)

print('Baseline metrics')

print(metrics)
Training Data Shape:  (307511, 239)
Testing Data Shape:  (48744, 239)
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.798723    train's binary_logloss: 0.547797    valid's auc: 0.755039    valid's binary_logloss: 0.563266
[400]    train's auc: 0.82838    train's binary_logloss: 0.518334    valid's auc: 0.755107    valid's binary_logloss: 0.545575
Early stopping, best iteration is:
[315]    train's auc: 0.816657    train's binary_logloss: 0.530116    valid's auc: 0.755215    valid's binary_logloss: 0.552627
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.798409    train's binary_logloss: 0.548179    valid's auc: 0.758332    valid's binary_logloss: 0.563587
[400]    train's auc: 0.828244    train's binary_logloss: 0.518308    valid's auc: 0.758563    valid's binary_logloss: 0.545588
Early stopping, best iteration is:
[317]    train's auc: 0.8169    train's binary_logloss: 0.529878    valid's auc: 0.758754    valid's binary_logloss: 0.552413
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.797648    train's binary_logloss: 0.549331    valid's auc: 0.763246    valid's binary_logloss: 0.564236
Early stopping, best iteration is:
[264]    train's auc: 0.808111    train's binary_logloss: 0.539063    valid's auc: 0.76363    valid's binary_logloss: 0.557905
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.798855    train's binary_logloss: 0.547952    valid's auc: 0.757131    valid's binary_logloss: 0.562234
Early stopping, best iteration is:
[280]    train's auc: 0.811887    train's binary_logloss: 0.535139    valid's auc: 0.757583    valid's binary_logloss: 0.554287
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.797918    train's binary_logloss: 0.548584    valid's auc: 0.758065    valid's binary_logloss: 0.564721
Early stopping, best iteration is:
[287]    train's auc: 0.811617    train's binary_logloss: 0.535146    valid's auc: 0.758344    valid's binary_logloss: 0.556636
Baseline metrics
      fold     train     valid
0        0  0.816657  0.755215
1        1  0.816900  0.758754
2        2  0.808111  0.763630
3        3  0.811887  0.757583
4        4  0.811617  0.758344
5  overall  0.813034  0.758705
1
# Plot the feature importances returned by the baseline model() run
fi_sorted = plot_feature_importances(fi)

fqxktI.png

1
# Write the baseline predictions to CSV without the DataFrame index
submission.to_csv('./1_result/baseline_lgb.csv', index = False)
1
2
3
4
5
6
7
8
9
10
11
# Add the TARGET column that model() requires on the training frame
# NOTE(review): train_labels is defined earlier in the notebook - presumably
# the TARGET labels of app_train; confirm
app_train_domain['TARGET'] = train_labels



# Evaluate the domain-knowledge features

submission_domain, fi_domain, metrics_domain = model(app_train_domain, app_test_domain)

print('Baseline with domain knowledge features metrics')

print(metrics_domain)
Training Data Shape:  (307511, 243)
Testing Data Shape:  (48744, 243)
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.804779    train's binary_logloss: 0.541283    valid's auc: 0.762511    valid's binary_logloss: 0.557227
Early stopping, best iteration is:
[268]    train's auc: 0.815523    train's binary_logloss: 0.530413    valid's auc: 0.763069    valid's binary_logloss: 0.550276
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.804016    train's binary_logloss: 0.542318    valid's auc: 0.765768    valid's binary_logloss: 0.557819
Early stopping, best iteration is:
[218]    train's auc: 0.807075    train's binary_logloss: 0.539112    valid's auc: 0.766062    valid's binary_logloss: 0.555952
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.8038    train's binary_logloss: 0.542856    valid's auc: 0.7703    valid's binary_logloss: 0.557925
[400]    train's auc: 0.834559    train's binary_logloss: 0.511454    valid's auc: 0.770511    valid's binary_logloss: 0.538558
Early stopping, best iteration is:
[383]    train's auc: 0.832138    train's binary_logloss: 0.514008    valid's auc: 0.77073    valid's binary_logloss: 0.54009
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.804603    train's binary_logloss: 0.541718    valid's auc: 0.765497    valid's binary_logloss: 0.556274
Early stopping, best iteration is:
[238]    train's auc: 0.8111    train's binary_logloss: 0.535149    valid's auc: 0.765884    valid's binary_logloss: 0.552351
Training until validation scores don't improve for 100 rounds
[200]    train's auc: 0.804782    train's binary_logloss: 0.541397    valid's auc: 0.765076    valid's binary_logloss: 0.558641
Early stopping, best iteration is:
[290]    train's auc: 0.819404    train's binary_logloss: 0.526653    valid's auc: 0.765249    valid's binary_logloss: 0.549657
Baseline with domain knowledge features metrics
      fold     train     valid
0        0  0.815523  0.763069
1        1  0.807075  0.766062
2        2  0.832138  0.770730
3        3  0.811100  0.765884
4        4  0.819404  0.765249
5  overall  0.817048  0.766186
1
# Plot the feature importances returned by the domain-feature model() run
fi_sorted = plot_feature_importances(fi_domain)

fqxV9P.png

1
# Write the domain-feature predictions to CSV without the DataFrame index
submission_domain.to_csv('./1_result/baseline_lgb_domain_features.csv', index = False)

本文提及的数据集下载地址:
链接:https://pan.baidu.com/s/1Dlxp_C8H7Rjf0OKorSRQ0A
提取码:1111


Donate
  • Copyright: Copyright is owned by the author. For commercial reprints, please contact the author for authorization. For non-commercial reprints, please indicate the source.
  • Copyrights © 2019-2022 Woody
  • Visitors: | Views:

请我喝杯咖啡吧~

支付宝
微信