#Set up
knitr::opts_chunk$set(echo = TRUE)
library(forcats)
library(dplyr)
setwd("D:/Dropbox/1081-TBrain")
dta <- read.csv("train.csv")Data structure and overview
We could see that there were many missing values, so the first step we started to deal with the problem. (Here we do not want to show too many variables.)
## 'data.frame': 100000 obs. of 10 variables:
## $ CUS_ID : int 3418 4302 5545 7207 7213 8818 9681 9743 9839 10246 ...
## $ GENDER : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ AGE : Factor w/ 4 levels "中","中高","低",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ CHARGE_CITY_CD : Factor w/ 8 levels "A1","A2","B1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CONTACT_CITY_CD : Factor w/ 8 levels "A1","A2","B1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ EDUCATION_CD : int NA NA 1 NA 1 NA NA NA 1 NA ...
## $ MARRIAGE_CD : int NA NA 0 0 0 0 0 0 0 0 ...
## $ LAST_A_CCONTACT_DT: Factor w/ 2 levels "N","Y": 2 2 2 2 1 2 1 2 1 1 ...
## $ L1YR_A_ISSUE_CNT : int 0 0 1 0 0 0 0 0 0 0 ...
## $ LAST_A_ISSUE_DT : Factor w/ 2 levels "N","Y": 1 1 2 1 1 1 1 1 1 1 ...
## CUS_ID GENDER AGE CHARGE_CITY_CD
## Min. : 12 F :52944 中 :27148 B1 :51618
## 1st Qu.: 627540 M :46373 中高:21737 A1 :16513
## Median :1261358 NA's: 683 低 :27213 C2 : 8601
## Mean :1267299 高 :23902 A2 : 8431
## 3rd Qu.:1903561 B2 : 7574
## Max. :2551470 C1 : 4820
## (Other): 2443
## CONTACT_CITY_CD EDUCATION_CD MARRIAGE_CD LAST_A_CCONTACT_DT
## A1 :27066 Min. :1.000 Min. :0.000 N:64595
## B1 :18680 1st Qu.:1.000 1st Qu.:0.000 Y:35405
## A2 :14489 Median :2.000 Median :0.000
## C2 :14454 Mean :2.169 Mean :0.315
## B2 :13027 3rd Qu.:3.000 3rd Qu.:1.000
## C1 : 7961 Max. :4.000 Max. :2.000
## (Other): 4323 NA's :20562 NA's :7951
## L1YR_A_ISSUE_CNT LAST_A_ISSUE_DT
## Min. : 0.0000 N:88631
## 1st Qu.: 0.0000 Y:11369
## Median : 0.0000
## Mean : 0.1182
## 3rd Qu.: 0.0000
## Max. :22.0000
##
Missing Value
Data exploration
At first, we calculated how many missing values in each variables and the variable type.
Binary Variables
For variables with only binary outcome, we replace NA with third class and turn this variable into three binary variables without missing value.
For example:
##Gender
dta$GENDER<-fct_explicit_na(dta$GENDER, "other")
dta$GENDER_MALE<-ifelse(dta$GENDER=="M",1,0)
dta$GENDER_FEMALE<-ifelse(dta$GENDER=="F",1,0)
dta$GENDER_OTH<-ifelse(dta$GENDER=="other",1,0)
summary(dta$GENDER_FEMALE,dta$GENDER_MALE,dta$GENDER_OTH)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5294 1.0000 1.0000
##IF_ADD_INSD_G_IND
dta$IF_ADD_INSD_G_IND_Y <- ifelse(dta$IF_ADD_INSD_G_IND=="Y",1,0)
dta$IF_ADD_INSD_G_IND_Y <- ifelse(is.na(dta$IF_ADD_INSD_G_IND_Y),0,dta$IF_ADD_INSD_G_IND_Y)
dta$IF_ADD_INSD_G_IND_N <- ifelse(dta$IF_ADD_INSD_G_IND=="N",1,0)
dta$IF_ADD_INSD_G_IND_N <- ifelse(is.na(dta$IF_ADD_INSD_G_IND_N),0,dta$IF_ADD_INSD_G_IND_N)
dta$IF_ADD_INSD_G_IND_OTH <- ifelse(is.na(dta$IF_ADD_INSD_G_IND),1,0)
summary(dta$IF_ADD_INSD_G_IND_Y,dta$IF_ADD_INSD_G_IND_N,dta$IF_ADD_INSD_G_IND_OTH)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.00734 0.00000 1.00000
Nominal variables
As binary variables, we create a new category for missing value.
For example:
## .
## 1 2 3 4
## 24873 20120 30586 3859
dta$EDUCATION_CD <-ifelse(is.na(dta$EDUCATION_CD),5,dta$EDUCATION_CD)
dta$EDUCATION_1<-ifelse(dta$EDUCATION_CD==1,1,0)
dta$EDUCATION_2<-ifelse(dta$EDUCATION_CD==2,1,0)
dta$EDUCATION_3<-ifelse(dta$EDUCATION_CD==3,1,0)
dta$EDUCATION_4<-ifelse(dta$EDUCATION_CD==4,1,0)
dta$EDUCATION_5<-ifelse(dta$EDUCATION_CD==5,1,0)
##marriage
dta$MARRIAGE_CD %>% table()## .
## 0 1 2
## 64127 26822 1100
Ordinal variables
We replaced missing value with median because the distribution was not normal distribution. For variables with character level, we replace it with ordinal number and use the median value to replace the missing.
For example:
##OCCUPATION_CLASS_CD
dta$OCCUPATION_CLASS_CD <-dta$OCCUPATION_CLASS_CD %>% ifelse(is.na(.),median(na.omit(.)),.)
##APC_1ST_AGE NA--> median
dta$APC_1ST_AGE <- as.character(dta$APC_1ST_AGE)
dta$APC_1ST_AGE %>% table## .
## 中 中高 低 高
## 9010 16507 16443 14758
Continuous variables
We have two method to replace the missing value:
1. Cut into three categories
Because these variable were highly skewed, we set mean plus one stardand error to be the cutpoint, and seperate variables into two categories: above or below cutpoint. For missing value, we set it the third category.
For example:
##DIEACCIDENT_AMT
dta$DIEACCIDENT_AMT_CATE_NA<-ifelse(is.na(dta$DIEACCIDENT_AMT),1,0)
cutpoint<-mean(na.omit(dta$DIEACCIDENT_AMT))+sd(na.omit(dta$DIEACCIDENT_AMT))
dta$DIEACCIDENT_AMT_CATE_A<-ifelse(dta$DIEACCIDENT_AMT<cutpoint,1,0)
dta$DIEACCIDENT_AMT_CATE_A<-ifelse(is.na(dta$DIEACCIDENT_AMT_CATE_A),0,dta$DIEACCIDENT_AMT_CATE_A)
dta$DIEACCIDENT_AMT_CATE_B<-ifelse(dta$DIEACCIDENT_AMT>=cutpoint,1,0)
dta$DIEACCIDENT_AMT_CATE_B<-ifelse(is.na(dta$DIEACCIDENT_AMT_CATE_B),0,dta$DIEACCIDENT_AMT_CATE_B)2. Missing as median
We used the median value to replace the missing.
For example: