library(dplyr)
library(stringr)
library(readr)
library(mlbench)
library(caret)
# Location of the bank-marketing CSV. The BANKING_CSV environment variable
# overrides the default so the script is not tied to one machine; when the
# variable is unset, the original path is used unchanged.
fc <- Sys.getenv(
  "BANKING_CSV",
  unset = "C:\\Users\\wgao4\\Documents\\banking.csv"
)
# Treat the literal "unknown" as missing in addition to readr's default NA
# strings: supplying `na` replaces the default c("", "NA") rather than
# extending it, so the defaults are listed explicitly here.
data <- read_csv(fc, na = c("", "NA", "unknown"))
# Auto-print a preview of the loaded tibble.
data
## # A tibble: 41,188 × 21
## age job marital education default housing loan contact month day_of_week
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 44 blue… married basic.4y <NA> yes no cellul… aug thu
## 2 53 tech… married <NA> no no no cellul… nov fri
## 3 28 mana… single universi… no yes no cellul… jun thu
## 4 39 serv… married high.sch… no no no cellul… apr fri
## 5 55 reti… married basic.4y no yes no cellul… aug fri
## 6 30 mana… divorc… basic.4y no yes no cellul… jul tue
## 7 37 blue… married basic.4y no yes no cellul… may thu
## 8 39 blue… divorc… basic.9y no yes no cellul… may fri
## 9 36 admi… married universi… no no no cellul… jun mon
## 10 27 blue… single basic.4y no yes no cellul… apr thu
## # ℹ 41,178 more rows
## # ℹ 11 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
## # previous <dbl>, poutcome <chr>, emp_var_rate <dbl>, cons_price_idx <dbl>,
## # cons_conf_idx <dbl>, euribor3m <dbl>, nr_employed <dbl>, y <dbl>
# Shape of the raw data as (rows, columns).
c(nrow(data), ncol(data))
## [1] 41188 21
# Count of missing values in each column, returned as a named numeric vector.
vapply(data, function(column) sum(is.na(column)), numeric(1))
## age job marital education default
## 0 330 80 1731 8597
## housing loan contact month day_of_week
## 990 990 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed
## 0 0 0 0 0
## y
## 0
# Per-column summary of the raw data: quantiles and mean for numeric columns,
# length/class/mode for character columns (see the printed output below).
summary(data)
## age job marital education
## Min. :17.00 Length:41188 Length:41188 Length:41188
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.02
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:41188 Length:41188 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.3 Mean : 2.568
## 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp_var_rate
## Min. : 0.0 Min. :0.000 Length:41188 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.000 Mode :character Median : 1.10000
## Mean :962.5 Mean :0.173 Mean : 0.08189
## 3rd Qu.:999.0 3rd Qu.:0.000 3rd Qu.: 1.40000
## Max. :999.0 Max. :7.000 Max. : 1.40000
## cons_price_idx cons_conf_idx euribor3m nr_employed
## Min. :92.20 Min. :-50.8 Min. :0.634 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344 1st Qu.:5099
## Median :93.75 Median :-41.8 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.5 Mean :3.621 Mean :5167
## 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.9 Max. :5.045 Max. :5228
## y
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1127
## 3rd Qu.:0.0000
## Max. :1.0000
# Keep only the numeric columns as input for the PCA preprocessing.
# NOTE(review): this selection also keeps the numeric response `y`, so the
# target takes part in centering, scaling and the PCA itself. If the
# components are meant to feed a model of `y`, drop it first, e.g.
# select(data, where(is.numeric), -y) -- confirm intent before changing,
# since the recorded output below was produced with `y` included.
# NOTE(review): `pdays` is 999 at its median and maximum in the summary of
# `data`; presumably a "not previously contacted" sentinel -- verify, as it
# would distort centering and scaling.
data1 <- select(data, where(is.numeric))
# Estimate the centering, scaling and PCA transformation from data1. Nothing
# is transformed yet; the fitted parameters are applied later via predict().
preprocessParams <- preProcess(data1, method = c("center", "scale", "pca"))
# Show what was fitted, including how many components were retained.
print(preprocessParams)
## Created from 41188 samples and 11 variables
##
## Pre-processing:
## - centered (11)
## - ignored (0)
## - principal component signal extraction (11)
## - scaled (11)
##
## PCA needed 8 components to capture 95 percent of the variance
# Apply the fitted preprocessing to data1, yielding the principal-component
# scores (PC1, PC2, ...).
transformed <- predict(object = preprocessParams, newdata = data1)
# Distribution of each retained component.
summary(transformed)
## PC1 PC2 PC3 PC4
## Min. :-7.517 Min. :-2.495032 Min. :-6.9406 Min. :-6.13584
## 1st Qu.:-1.929 1st Qu.:-0.650925 1st Qu.:-0.5269 1st Qu.:-0.56315
## Median : 1.316 Median : 0.009709 Median :-0.1428 Median : 0.02973
## Mean : 0.000 Mean : 0.000000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.501 3rd Qu.: 0.387218 3rd Qu.: 0.3893 3rd Qu.: 0.63502
## Max. : 3.311 Max. : 8.184944 Max. :12.3001 Max. : 6.59367
## PC5 PC6 PC7 PC8
## Min. :-2.4418 Min. :-5.14817 Min. :-10.74549 Min. :-4.1531
## 1st Qu.:-0.5935 1st Qu.:-0.57506 1st Qu.: -0.32705 1st Qu.:-0.3897
## Median :-0.1803 Median :-0.03721 Median : 0.04126 Median : 0.1223
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.3355 3rd Qu.: 0.53036 3rd Qu.: 0.31804 3rd Qu.: 0.3836
## Max. :16.3478 Max. : 6.78199 Max. : 3.53356 Max. : 3.2032
# First ten rows of the numeric inputs, for comparison with the PCA scores.
head(data1, n = 10)
## # A tibble: 10 × 11
## age duration campaign pdays previous emp_var_rate cons_price_idx
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 44 210 1 999 0 1.4 93.4
## 2 53 138 1 999 0 -0.1 93.2
## 3 28 339 3 6 2 -1.7 94.1
## 4 39 185 2 999 0 -1.8 93.1
## 5 55 137 1 3 1 -2.9 92.2
## 6 30 68 8 999 0 1.4 93.9
## 7 37 204 1 999 0 -1.8 92.9
## 8 39 191 1 999 0 -1.8 92.9
## 9 36 174 1 3 1 -2.9 93.0
## 10 27 191 2 999 1 -1.8 93.1
## # ℹ 4 more variables: cons_conf_idx <dbl>, euribor3m <dbl>, nr_employed <dbl>,
## # y <dbl>
# First ten rows of the PCA scores for the same observations.
head(transformed, n = 10)
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 1.3427782 0.2426835 -0.27945909 -1.01045901 -0.60455973 0.2522292
## 2 0.1763565 -0.5972064 -0.26315669 -1.13490079 -0.02408861 -0.9049577
## 3 -5.0786948 4.3860971 -1.98843605 3.07103185 -0.55280523 0.2594472
## 4 -1.8128963 -1.7294175 0.44358572 0.20249279 0.14819086 -0.5858076
## 5 -5.2516814 3.5757205 -2.31905834 -1.24153445 -0.16747636 0.9164749
## 6 1.7841914 -0.5504122 -0.50288044 1.57576033 1.15698400 0.7081997
## 7 -1.9652512 -1.7500893 0.58063806 0.03088033 -0.24233826 -0.3809282
## 8 -1.9672491 -1.7405926 0.50418773 -0.09348202 -0.17290060 -0.4908857
## 9 -4.8648251 3.0424509 -1.42954055 1.35377872 -0.68408579 0.3047619
## 10 -2.4179374 -1.3443788 -0.02981516 1.50687789 -0.50522534 -0.2437168
## PC7 PC8
## 1 -0.24391472 0.47541228
## 2 0.01779039 0.72801907
## 3 0.81877382 -0.04021900
## 4 0.15997794 0.05897229
## 5 0.84231130 2.64670969
## 6 0.22745485 0.24280059
## 7 0.04386620 0.20124620
## 8 0.07406300 0.21141191
## 9 1.32093316 2.22332729
## 10 -0.56738189 -0.27577081