library(dplyr)
library(stringr)
library(readr)
library(mlbench)
library(caret)
# Location of the input CSV.
# NOTE(review): this is an absolute, user-specific Windows path, so the script
# only runs on the original author's machine as written.
fc <- "C:\\Users\\wgao4\\Documents\\banking.csv"

# Read the file, treating the literal string "unknown" as missing. Passing
# `na` replaces readr's default set of missing-value markers rather than adding
# to it, so the defaults ("" and "NA") are listed explicitly to keep empty
# fields and literal NA values recognised as missing as well.
data <- read_csv(fc, na = c("", "NA", "unknown"))

# Display the loaded tibble.
data
## # A tibble: 41,188 × 21
##      age job   marital education default housing loan  contact month day_of_week
##    <dbl> <chr> <chr>   <chr>     <chr>   <chr>   <chr> <chr>   <chr> <chr>      
##  1    44 blue… married basic.4y  <NA>    yes     no    cellul… aug   thu        
##  2    53 tech… married <NA>      no      no      no    cellul… nov   fri        
##  3    28 mana… single  universi… no      yes     no    cellul… jun   thu        
##  4    39 serv… married high.sch… no      no      no    cellul… apr   fri        
##  5    55 reti… married basic.4y  no      yes     no    cellul… aug   fri        
##  6    30 mana… divorc… basic.4y  no      yes     no    cellul… jul   tue        
##  7    37 blue… married basic.4y  no      yes     no    cellul… may   thu        
##  8    39 blue… divorc… basic.9y  no      yes     no    cellul… may   fri        
##  9    36 admi… married universi… no      no      no    cellul… jun   mon        
## 10    27 blue… single  basic.4y  no      yes     no    cellul… apr   thu        
## # ℹ 41,178 more rows
## # ℹ 11 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
## #   previous <dbl>, poutcome <chr>, emp_var_rate <dbl>, cons_price_idx <dbl>,
## #   cons_conf_idx <dbl>, euribor3m <dbl>, nr_employed <dbl>, y <dbl>
# Report the number of rows and columns of the loaded data.
dim(data)
## [1] 41188    21
# Count missing values per column: is.na() marks each NA cell and colSums()
# totals those marks column by column.
colSums(is.na(data))
##            age            job        marital      education        default 
##              0            330             80           1731           8597 
##        housing           loan        contact          month    day_of_week 
##            990            990              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp_var_rate cons_price_idx  cons_conf_idx      euribor3m    nr_employed 
##              0              0              0              0              0 
##              y 
##              0
# Print a per-column summary of the data (quartiles and mean for numeric
# columns; length, class and mode for character columns).
summary(data)
##       age            job              marital           education        
##  Min.   :17.00   Length:41188       Length:41188       Length:41188      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.02                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :98.00                                                           
##    default            housing              loan             contact         
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:41188       Length:41188       Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 102.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 180.0   Median : 2.000  
##                                        Mean   : 258.3   Mean   : 2.568  
##                                        3rd Qu.: 319.0   3rd Qu.: 3.000  
##                                        Max.   :4918.0   Max.   :56.000  
##      pdays          previous       poutcome          emp_var_rate     
##  Min.   :  0.0   Min.   :0.000   Length:41188       Min.   :-3.40000  
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   1st Qu.:-1.80000  
##  Median :999.0   Median :0.000   Mode  :character   Median : 1.10000  
##  Mean   :962.5   Mean   :0.173                      Mean   : 0.08189  
##  3rd Qu.:999.0   3rd Qu.:0.000                      3rd Qu.: 1.40000  
##  Max.   :999.0   Max.   :7.000                      Max.   : 1.40000  
##  cons_price_idx  cons_conf_idx     euribor3m      nr_employed  
##  Min.   :92.20   Min.   :-50.8   Min.   :0.634   Min.   :4964  
##  1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.344   1st Qu.:5099  
##  Median :93.75   Median :-41.8   Median :4.857   Median :5191  
##  Mean   :93.58   Mean   :-40.5   Mean   :3.621   Mean   :5167  
##  3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961   3rd Qu.:5228  
##  Max.   :94.77   Max.   :-26.9   Max.   :5.045   Max.   :5228  
##        y         
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1127  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
# Keep only the numeric columns as the input for PCA. `select_if()` is
# superseded in dplyr; `select(where(...))` is the current equivalent and
# selects the same columns.
# NOTE(review): the numeric columns include the 0/1 column `y`, so it is
# centered, scaled and folded into the principal components together with the
# other variables. If `y` is the response for a later model, it should be
# dropped before this step (e.g. `select(data, where(is.numeric), -y)`) --
# confirm intent before changing, since that alters every result below.
data1 <- select(data, where(is.numeric))

# Estimate the centering, scaling and PCA parameters from data1. No variance
# threshold is passed, so caret's default cutoff decides how many components
# are kept.
preprocessParams <- preProcess(data1, method = c("center", "scale", "pca"))

# Show what the fitted pre-processing will do and how many components it kept.
print(preprocessParams)
## Created from 41188 samples and 11 variables
## 
## Pre-processing:
##   - centered (11)
##   - ignored (0)
##   - principal component signal extraction (11)
##   - scaled (11)
## 
## PCA needed 8 components to capture 95 percent of the variance
# Apply the fitted centering/scaling/PCA to data1 to obtain the
# principal-component scores for every row.
transformed <- predict(object = preprocessParams, newdata = data1)

# Summarise the distribution of each retained component.
summary(object = transformed)
##       PC1              PC2                 PC3               PC4          
##  Min.   :-7.517   Min.   :-2.495032   Min.   :-6.9406   Min.   :-6.13584  
##  1st Qu.:-1.929   1st Qu.:-0.650925   1st Qu.:-0.5269   1st Qu.:-0.56315  
##  Median : 1.316   Median : 0.009709   Median :-0.1428   Median : 0.02973  
##  Mean   : 0.000   Mean   : 0.000000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.501   3rd Qu.: 0.387218   3rd Qu.: 0.3893   3rd Qu.: 0.63502  
##  Max.   : 3.311   Max.   : 8.184944   Max.   :12.3001   Max.   : 6.59367  
##       PC5               PC6                PC7                 PC8         
##  Min.   :-2.4418   Min.   :-5.14817   Min.   :-10.74549   Min.   :-4.1531  
##  1st Qu.:-0.5935   1st Qu.:-0.57506   1st Qu.: -0.32705   1st Qu.:-0.3897  
##  Median :-0.1803   Median :-0.03721   Median :  0.04126   Median : 0.1223  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   :  0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.3355   3rd Qu.: 0.53036   3rd Qu.:  0.31804   3rd Qu.: 0.3836  
##  Max.   :16.3478   Max.   : 6.78199   Max.   :  3.53356   Max.   : 3.2032
# Show the first 10 rows of the numeric input, for comparison with the
# transformed scores.
head(data1, 10)
## # A tibble: 10 × 11
##      age duration campaign pdays previous emp_var_rate cons_price_idx
##    <dbl>    <dbl>    <dbl> <dbl>    <dbl>        <dbl>          <dbl>
##  1    44      210        1   999        0          1.4           93.4
##  2    53      138        1   999        0         -0.1           93.2
##  3    28      339        3     6        2         -1.7           94.1
##  4    39      185        2   999        0         -1.8           93.1
##  5    55      137        1     3        1         -2.9           92.2
##  6    30       68        8   999        0          1.4           93.9
##  7    37      204        1   999        0         -1.8           92.9
##  8    39      191        1   999        0         -1.8           92.9
##  9    36      174        1     3        1         -2.9           93.0
## 10    27      191        2   999        1         -1.8           93.1
## # ℹ 4 more variables: cons_conf_idx <dbl>, euribor3m <dbl>, nr_employed <dbl>,
## #   y <dbl>
# Show the first 10 rows of the principal-component scores.
head(transformed, 10)
##           PC1        PC2         PC3         PC4         PC5        PC6
## 1   1.3427782  0.2426835 -0.27945909 -1.01045901 -0.60455973  0.2522292
## 2   0.1763565 -0.5972064 -0.26315669 -1.13490079 -0.02408861 -0.9049577
## 3  -5.0786948  4.3860971 -1.98843605  3.07103185 -0.55280523  0.2594472
## 4  -1.8128963 -1.7294175  0.44358572  0.20249279  0.14819086 -0.5858076
## 5  -5.2516814  3.5757205 -2.31905834 -1.24153445 -0.16747636  0.9164749
## 6   1.7841914 -0.5504122 -0.50288044  1.57576033  1.15698400  0.7081997
## 7  -1.9652512 -1.7500893  0.58063806  0.03088033 -0.24233826 -0.3809282
## 8  -1.9672491 -1.7405926  0.50418773 -0.09348202 -0.17290060 -0.4908857
## 9  -4.8648251  3.0424509 -1.42954055  1.35377872 -0.68408579  0.3047619
## 10 -2.4179374 -1.3443788 -0.02981516  1.50687789 -0.50522534 -0.2437168
##            PC7         PC8
## 1  -0.24391472  0.47541228
## 2   0.01779039  0.72801907
## 3   0.81877382 -0.04021900
## 4   0.15997794  0.05897229
## 5   0.84231130  2.64670969
## 6   0.22745485  0.24280059
## 7   0.04386620  0.20124620
## 8   0.07406300  0.21141191
## 9   1.32093316  2.22332729
## 10 -0.56738189 -0.27577081