Problem 1

Load Data

pacman::p_load(tidyverse, magrittr, GGally, DT, caret, e1071,
               install = FALSE, update = FALSE)

BankData <- read_csv("G:/My Drive/homework/Muhammad W/BankData.csv")
BankData %<>% select(!X1)

BankData %>% glimpse()
## Rows: 690
## Columns: 12
## $ cont1        <dbl> 30.83, 58.67, 24.50, 27.83, 20.17, 32.08, 33.17, 22.92...
## $ cont2        <dbl> 0.000, 4.460, 0.500, 1.540, 5.625, 4.000, 1.040, 11.58...
## $ cont3        <dbl> 1.250, 3.040, 1.500, 3.750, 1.710, 2.500, 6.500, 0.040...
## $ bool1        <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ...
## $ bool2        <lgl> TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, F...
## $ cont4        <dbl> 1, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 3, 10...
## $ bool3        <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, F...
## $ cont5        <dbl> 202, 43, 280, 100, 120, 360, 164, 80, 180, 52, 128, 26...
## $ cont6        <dbl> 0, 560, 824, 3, 0, 0, 31285, 1349, 314, 1442, 0, 200, ...
## $ approval     <chr> "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+",...
## $ credit.score <dbl> 664.60, 693.88, 621.82, 653.97, 670.26, 672.16, 693.53...
## $ ages         <dbl> 42, 54, 29, 58, 65, 61, 50, 41, 30, 35, 57, 58, 60, 43...
BankData %>% mutate(across(where(is_character), as_factor)) %>% summary()
##      cont1           cont2            cont3          bool1        
##  Min.   :13.75   Min.   : 0.000   Min.   : 0.000   Mode :logical  
##  1st Qu.:22.60   1st Qu.: 1.000   1st Qu.: 0.165   FALSE:329      
##  Median :28.46   Median : 2.750   Median : 1.000   TRUE :361      
##  Mean   :31.57   Mean   : 4.759   Mean   : 2.223                  
##  3rd Qu.:38.23   3rd Qu.: 7.207   3rd Qu.: 2.625                  
##  Max.   :80.25   Max.   :28.000   Max.   :28.500                  
##  NA's   :12                                                       
##    bool2             cont4        bool3             cont5     
##  Mode :logical   Min.   : 0.0   Mode :logical   Min.   :   0  
##  FALSE:395       1st Qu.: 0.0   FALSE:374       1st Qu.:  75  
##  TRUE :295       Median : 0.0   TRUE :316       Median : 160  
##                  Mean   : 2.4                   Mean   : 184  
##                  3rd Qu.: 3.0                   3rd Qu.: 276  
##                  Max.   :67.0                   Max.   :2000  
##                                                 NA's   :13    
##      cont6          approval  credit.score        ages      
##  Min.   :     0.0   +:307    Min.   :583.7   Min.   :11.00  
##  1st Qu.:     0.0   -:383    1st Qu.:666.7   1st Qu.:31.00  
##  Median :     5.0            Median :697.3   Median :38.00  
##  Mean   :  1017.4            Mean   :696.4   Mean   :39.67  
##  3rd Qu.:   395.5            3rd Qu.:726.4   3rd Qu.:48.00  
##  Max.   :100000.0            Max.   :806.0   Max.   :84.00  
## 
BankData %>% filter(!complete.cases(.)) %>% nrow() # omit, keep, replace?
## [1] 24
BankData %>% datatable(options = list(pageLength = 5)) # BankData %>% slice_sample(n = 10)
BankData %>% ggpairs(axisLabels = "none")

Part (a)

https://youtu.be/oN6D8_ztl04?t=148

BankData %>%
  select(where(is_double)) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = stat(density))) +
  geom_density(col = "red") +
  facet_wrap(vars(key), scales = "free")

BankData %>%
  select(where(is_double)) %>%
  gather() %>%
  ggplot(aes(y = value)) +
  geom_boxplot() +
  facet_wrap(vars(key), scales = "free")

BankData %>%
  select(where(is_logical) | where(is_character)) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap(vars(key), scales = "free")

Part (b)

BankData.norm <-
  BankData %>%
  select(credit.score, ages, cont2)

# z-score normalization
credit.norm <-
  BankData.norm %>% 
  select(credit.score) %>%
  preProcess(method = c("center", "scale"))
BankData.norm %<>% mutate(credit.score =
                            predict(credit.norm, BankData.norm)$credit.score)

# min-max normalization
ages.norm <-
  BankData.norm %>% 
  select(ages) %>%
  preProcess(method = c("range"), rangeBounds = c(20, 70))
BankData.norm %<>% mutate(ages = predict(ages.norm, BankData.norm)$ages)

# decimal normalization
cont2.norm <-
  BankData.norm %>% 
  select(cont2) %>%
  preProcess(method = c("range"))
BankData.norm %<>% mutate(cont2 = predict(cont2.norm, BankData.norm)$cont2)

Part (c)

summary(BankData.norm)
##   credit.score           ages           cont2        
##  Min.   :-2.68864   Min.   :20.00   Min.   :0.00000  
##  1st Qu.:-0.70690   1st Qu.:33.70   1st Qu.:0.03571  
##  Median : 0.02146   Median :38.49   Median :0.09821  
##  Mean   : 0.00000   Mean   :39.64   Mean   :0.16995  
##  3rd Qu.: 0.71482   3rd Qu.:45.34   3rd Qu.:0.25741  
##  Max.   : 2.61446   Max.   :70.00   Max.   :1.00000
BankData.norm %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram() +
  facet_wrap(vars(key), scales = "free")

BankData.norm %>%
  gather() %>%
  ggplot(aes(y = value)) +
  geom_boxplot() +
  facet_wrap(vars(key), scales = "free")

# ggpairs(BankData.norm)

Part (d)

# Equal Depth
sum(BankData$ages) / 3
## [1] 9125
which.max(cumsum(sort(BankData$ages)) > 9125)
## [1] 312
which.max(cumsum(sort(BankData$ages)) > 9125*2)
## [1] 528
# Equal Count
cuts <-
  BankData %>%
  select(ages) %>%
  pull() %>%
  quantile(probs = c(0, 1/3, 2/3, 1))
cuts[1] <- cuts[1] - 1 # include left data point
cuts
##        0% 33.33333% 66.66667%      100% 
##  10.00000  32.00000  44.33333  84.00000
BankData %<>%
  mutate(age_bins = 
           cut(ages,
               breaks = cuts, # = 3 gives equal width
               labels = c("low", "medium", "high") # titles?
               ),
         .after = "ages"
         )

BankData %>%
  group_by(age_bins) %>%
  summarize(Mean = mean(ages),
            StdDev = sd(ages),
            Size = NROW(ages)
            )
## # A tibble: 3 x 4
##   age_bins  Mean StdDev  Size
## * <fct>    <dbl>  <dbl> <int>
## 1 low       27.6   3.87   234
## 2 medium    38.3   3.37   226
## 3 high      53.4   6.91   230
BankData %>%
  select(ages, age_bins) %>%
  slice_sample(n = 10)
## # A tibble: 10 x 2
##     ages age_bins
##    <dbl> <fct>   
##  1    29 low     
##  2    29 low     
##  3    42 medium  
##  4    32 low     
##  5    34 medium  
##  6    20 low     
##  7    38 medium  
##  8    54 high    
##  9    56 high    
## 10    28 low

Part (e)

BankData %<>%
  group_by(age_bins) %>%
  mutate(ages = mean(ages)) %>%
  ungroup(age_bins)

BankData %>%
  select(ages, age_bins) %>%
  slice_sample(n = 10)
## # A tibble: 10 x 2
##     ages age_bins
##    <dbl> <fct>   
##  1  38.3 medium  
##  2  53.4 high    
##  3  27.6 low     
##  4  53.4 high    
##  5  53.4 high    
##  6  53.4 high    
##  7  27.6 low     
##  8  27.6 low     
##  9  53.4 high    
## 10  38.3 medium

Problem 2

Load Data

BankData <- read_csv("G:/My Drive/homework/Muhammad W/BankData.csv")
BankData %<>%
  select(!X1) %>%
  filter(complete.cases(.)) # drop_na, na.omit

Part (a)

train_control = trainControl(method = "cv",
                             number = 10)

BankData.svm <-
  BankData %>%
  train(approval ~.,
        data = .,
        method = "svmLinear",
        trControl = train_control
        )

BankData.svm
## Support Vector Machines with Linear Kernel 
## 
## 666 samples
##  11 predictor
##   2 classes: '-', '+' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 599, 599, 600, 599, 600, 599, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8633114  0.7284304
## 
## Tuning parameter 'C' was held constant at a value of 1

Part (b)

train_control = trainControl(method = "cv",
                             number = 10,
                             search = "random"
                             )

BankData.svm <-
  BankData %>%
  train(approval ~.,
        data = .,
        method = "svmLinear",
        trControl = train_control
        )

BankData.svm
## Support Vector Machines with Linear Kernel 
## 
## 666 samples
##  11 predictor
##   2 classes: '-', '+' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 600, 599, 599, 599, 599, 599, ... 
## Resampling results across tuning parameters:
## 
##   C          Accuracy   Kappa    
##   0.3209801  0.8634273  0.7286486
##   2.1126227  0.8634273  0.7286486
##   3.7125351  0.8634273  0.7286486
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.3209801.

Problem 3

Data

starwars %>% dim()
## [1] 87 14
starwars %>% drop_na() %>% dim()
## [1]  6 14
sw <-
  starwars %>%
  select(!c(name, films:starships)) %>%
  drop_na()

sw %>% glimpse()
## Rows: 29
## Columns: 10
## $ height     <int> 172, 202, 150, 178, 165, 183, 182, 188, 228, 180, 170, 1...
## $ mass       <dbl> 77.0, 136.0, 49.0, 120.0, 75.0, 84.0, 77.0, 84.0, 112.0,...
## $ hair_color <chr> "blond", "none", "brown", "brown, grey", "brown", "black...
## $ skin_color <chr> "fair", "white", "light", "light", "light", "light", "fa...
## $ eye_color  <chr> "blue", "yellow", "brown", "blue", "blue", "brown", "blu...
## $ birth_year <dbl> 19.0, 41.9, 19.0, 52.0, 47.0, 24.0, 57.0, 41.9, 200.0, 2...
## $ sex        <chr> "male", "male", "female", "male", "female", "male", "mal...
## $ gender     <chr> "masculine", "masculine", "feminine", "masculine", "femi...
## $ homeworld  <chr> "Tatooine", "Tatooine", "Alderaan", "Tatooine", "Tatooin...
## $ species    <chr> "Human", "Human", "Human", "Human", "Human", "Human", "H...
sw %>% mutate(across(where(is_character), as_factor)) %>% summary()
##      height         mass              hair_color   skin_color     eye_color 
##  Min.   : 88   Min.   : 20.00   none       :9    fair   :7    brown    :10  
##  1st Qu.:170   1st Qu.: 75.00   brown      :7    light  :6    blue     : 8  
##  Median :180   Median : 79.00   black      :6    pale   :2    yellow   : 4  
##  Mean   :178   Mean   : 77.77   blond      :2    dark   :2    hazel    : 2  
##  3rd Qu.:188   3rd Qu.: 83.00   white      :2    orange :2    orange   : 2  
##  Max.   :228   Max.   :136.00   brown, grey:1    yellow :2    blue-gray: 1  
##                                 (Other)    :2    (Other):8    (Other)  : 2  
##    birth_year         sex           gender      homeworld          species  
##  Min.   :  8.00   male  :23   masculine:23   Tatooine: 6   Human       :18  
##  1st Qu.: 31.00   female: 6   feminine : 6   Naboo   : 3   Mirialan    : 2  
##  Median : 46.00                              Corellia: 2   Wookiee     : 1  
##  Mean   : 51.29                              Mirial  : 2   Trandoshan  : 1  
##  3rd Qu.: 57.00                              Alderaan: 1   Mon Calamari: 1  
##  Max.   :200.00                              Stewjon : 1   Ewok        : 1  
##                                              (Other) :14   (Other)     : 5
sw %>% datatable(options = list(pageLength = 5))
sw %>% select(!homeworld) %>% ggpairs(axisLabels = "none")

Part (a)

dummy <- dummyVars(gender ~., data = sw) 
sw.dummy <- as_tibble(predict(dummy, newdata = sw))

sw %>% dim()
## [1] 29 10
sw.dummy %>% dim()
## [1] 29 66

Part (b)

sw.dummy %<>% mutate(gender = sw$gender)

train_control = trainControl(method = "cv",
                             number = 10,
                             search = "grid"
                             )

sw.svm <-
  sw.dummy %>%
  train(gender ~ .,
        data = .,
        method = "svmLinear",
        trControl = train_control
        )

sw.svm
## Support Vector Machines with Linear Kernel 
## 
## 29 samples
## 66 predictors
##  2 classes: 'feminine', 'masculine' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 26, 27, 25, 27, 27, 25, ... 
## Resampling results:
## 
##   Accuracy   Kappa
##   0.9333333  0.75 
## 
## Tuning parameter 'C' was held constant at a value of 1

Part (c)

dummy <- dummyVars(gender ~., data = sw) 
sw.dummy <- as_tibble(predict(dummy, newdata = sw))

sw.dummy %<>% select(!nearZeroVar(.))
sw.dummy %>% dim()
## [1] 29 27
sw.pca <-
  sw.dummy %>%
  prcomp()
sw.pca %>% summary()
## Importance of components:
##                            PC1     PC2      PC3     PC4     PC5     PC6     PC7
## Standard deviation     41.2487 22.9459 10.69652 0.73879 0.72440 0.56879 0.55673
## Proportion of Variance  0.7256  0.2245  0.04879 0.00023 0.00022 0.00014 0.00013
## Cumulative Proportion   0.7256  0.9501  0.99886 0.99909 0.99932 0.99946 0.99959
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.46260 0.42802 0.33819 0.30140 0.26756 0.25373 0.24398
## Proportion of Variance 0.00009 0.00008 0.00005 0.00004 0.00003 0.00003 0.00003
## Cumulative Proportion  0.99968 0.99976 0.99981 0.99984 0.99988 0.99990 0.99993
##                           PC15    PC16    PC17    PC18    PC19   PC20    PC21
## Standard deviation     0.20403 0.18732 0.17363 0.14595 0.11313 0.1039 0.08506
## Proportion of Variance 0.00002 0.00001 0.00001 0.00001 0.00001 0.0000 0.00000
## Cumulative Proportion  0.99995 0.99996 0.99997 0.99998 0.99999 1.0000 1.00000
##                           PC22   PC23    PC24      PC25      PC26      PC27
## Standard deviation     0.07144 0.0581 0.03561 2.748e-15 2.748e-15 2.748e-15
## Proportion of Variance 0.00000 0.0000 0.00000 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.00000 1.0000 1.00000 1.000e+00 1.000e+00 1.000e+00
screeplot(sw.pca, type = "l") + title(xlab = "PCs")

## integer(0)
preProc <- preProcess(sw.dummy, method = "pca", pcaComp = 4) 
sw.pc <- predict(preProc, sw.dummy) %>% as_tibble()
sw.pc$gender <- sw$gender
sw.pc %>% slice_sample(n = 10)
## # A tibble: 10 x 5
##       PC1     PC2    PC3    PC4 gender   
##     <dbl>   <dbl>  <dbl>  <dbl> <chr>    
##  1  0.691 -0.712  -2.36  -1.55  masculine
##  2  1.35   0.0710  0.560  0.648 masculine
##  3  0.697 -1.65   -3.37  -1.69  masculine
##  4  1.36   3.39    2.98  -2.81  masculine
##  5  1.57  -0.350  -0.696  2.71  masculine
##  6  1.04   2.07    1.37  -1.90  masculine
##  7  1.89   2.93   -0.347  3.81  masculine
##  8  0.996 -2.84    0.653  1.11  masculine
##  9  0.337 -0.114  -0.457 -1.54  masculine
## 10 -1.79  -2.07   -0.590 -1.79  feminine
train_control = trainControl(method = "cv",
                             number = 10,
                             search = "grid"
                             )

sw.pca <-
  sw.pc %>%
  train(gender ~ .,
        data = .,
        method = "svmLinear",
        trControl = train_control
        )

sw.pca
## Support Vector Machines with Linear Kernel 
## 
## 29 samples
##  4 predictor
##  2 classes: 'feminine', 'masculine' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 25, 26, 25, 27, 26, 27, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9166667  0.7142857
## 
## Tuning parameter 'C' was held constant at a value of 1

Part (d)

sw.dummy %<>% mutate(gender = sw$gender)

train_control = trainControl(method = "cv",
                             number = 10,
                             search = "grid"
                             )

sw.svm <-
  sw.dummy %>% # used reduced data from part (c)
  train(gender ~ .,
        data = .,
        method = "svmLinear",
        trControl = train_control
        )

sw.svm
## Support Vector Machines with Linear Kernel 
## 
## 29 samples
## 27 predictors
##  2 classes: 'feminine', 'masculine' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 27, 27, 26, 25, 26, 26, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9666667  0.8571429
## 
## Tuning parameter 'C' was held constant at a value of 1
sw.svm %>% confusionMatrix()
## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##            Reference
## Prediction  feminine masculine
##   feminine      20.7       3.4
##   masculine      0.0      75.9
##                             
##  Accuracy (average) : 0.9655