pacman::p_load(tidyverse, magrittr, GGally, DT, caret, e1071,
install = FALSE, update = FALSE)
BankData <- read_csv("G:/My Drive/homework/Muhammad W/BankData.csv")
BankData %<>% select(!X1)
BankData %>% glimpse()
## Rows: 690
## Columns: 12
## $ cont1 <dbl> 30.83, 58.67, 24.50, 27.83, 20.17, 32.08, 33.17, 22.92...
## $ cont2 <dbl> 0.000, 4.460, 0.500, 1.540, 5.625, 4.000, 1.040, 11.58...
## $ cont3 <dbl> 1.250, 3.040, 1.500, 3.750, 1.710, 2.500, 6.500, 0.040...
## $ bool1 <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ...
## $ bool2 <lgl> TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, F...
## $ cont4 <dbl> 1, 6, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 3, 10...
## $ bool3 <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, F...
## $ cont5 <dbl> 202, 43, 280, 100, 120, 360, 164, 80, 180, 52, 128, 26...
## $ cont6 <dbl> 0, 560, 824, 3, 0, 0, 31285, 1349, 314, 1442, 0, 200, ...
## $ approval <chr> "+", "+", "+", "+", "+", "+", "+", "+", "+", "+", "+",...
## $ credit.score <dbl> 664.60, 693.88, 621.82, 653.97, 670.26, 672.16, 693.53...
## $ ages <dbl> 42, 54, 29, 58, 65, 61, 50, 41, 30, 35, 57, 58, 60, 43...
BankData %>% mutate(across(where(is_character), as_factor)) %>% summary()
## cont1 cont2 cont3 bool1
## Min. :13.75 Min. : 0.000 Min. : 0.000 Mode :logical
## 1st Qu.:22.60 1st Qu.: 1.000 1st Qu.: 0.165 FALSE:329
## Median :28.46 Median : 2.750 Median : 1.000 TRUE :361
## Mean :31.57 Mean : 4.759 Mean : 2.223
## 3rd Qu.:38.23 3rd Qu.: 7.207 3rd Qu.: 2.625
## Max. :80.25 Max. :28.000 Max. :28.500
## NA's :12
## bool2 cont4 bool3 cont5
## Mode :logical Min. : 0.0 Mode :logical Min. : 0
## FALSE:395 1st Qu.: 0.0 FALSE:374 1st Qu.: 75
## TRUE :295 Median : 0.0 TRUE :316 Median : 160
## Mean : 2.4 Mean : 184
## 3rd Qu.: 3.0 3rd Qu.: 276
## Max. :67.0 Max. :2000
## NA's :13
## cont6 approval credit.score ages
## Min. : 0.0 +:307 Min. :583.7 Min. :11.00
## 1st Qu.: 0.0 -:383 1st Qu.:666.7 1st Qu.:31.00
## Median : 5.0 Median :697.3 Median :38.00
## Mean : 1017.4 Mean :696.4 Mean :39.67
## 3rd Qu.: 395.5 3rd Qu.:726.4 3rd Qu.:48.00
## Max. :100000.0 Max. :806.0 Max. :84.00
##
BankData %>% filter(!complete.cases(.)) %>% nrow() # omit, keep, replace?
## [1] 24
BankData %>% datatable(options = list(pageLength = 5)) # BankData %>% slice_sample(n = 10)
BankData %>% ggpairs(axisLabels = "none")
https://youtu.be/oN6D8_ztl04?t=148
BankData %>%
select(where(is_double)) %>%
gather() %>%
ggplot(aes(value)) +
geom_histogram(aes(y = stat(density))) +
geom_density(col = "red") +
facet_wrap(vars(key), scales = "free")
BankData %>%
select(where(is_double)) %>%
gather() %>%
ggplot(aes(y = value)) +
geom_boxplot() +
facet_wrap(vars(key), scales = "free")
BankData %>%
select(where(is_logical) | where(is_character)) %>%
gather() %>%
ggplot(aes(value)) +
geom_bar() +
facet_wrap(vars(key), scales = "free")
BankData.norm <-
BankData %>%
select(credit.score, ages, cont2)
# z-score normalization
credit.norm <-
BankData.norm %>%
select(credit.score) %>%
preProcess(method = c("center", "scale"))
BankData.norm %<>% mutate(credit.score =
predict(credit.norm, BankData.norm)$credit.score)
# min-max normalization
ages.norm <-
BankData.norm %>%
select(ages) %>%
preProcess(method = c("range"), rangeBounds = c(20, 70))
BankData.norm %<>% mutate(ages = predict(ages.norm, BankData.norm)$ages)
# decimal normalization
cont2.norm <-
BankData.norm %>%
select(cont2) %>%
preProcess(method = c("range"))
BankData.norm %<>% mutate(cont2 = predict(cont2.norm, BankData.norm)$cont2)
summary(BankData.norm)
## credit.score ages cont2
## Min. :-2.68864 Min. :20.00 Min. :0.00000
## 1st Qu.:-0.70690 1st Qu.:33.70 1st Qu.:0.03571
## Median : 0.02146 Median :38.49 Median :0.09821
## Mean : 0.00000 Mean :39.64 Mean :0.16995
## 3rd Qu.: 0.71482 3rd Qu.:45.34 3rd Qu.:0.25741
## Max. : 2.61446 Max. :70.00 Max. :1.00000
BankData.norm %>%
gather() %>%
ggplot(aes(value)) +
geom_histogram() +
facet_wrap(vars(key), scales = "free")
BankData.norm %>%
gather() %>%
ggplot(aes(y = value)) +
geom_boxplot() +
facet_wrap(vars(key), scales = "free")
# ggpairs(BankData.norm)
# Equal Depth
sum(BankData$ages) / 3
## [1] 9125
which.max(cumsum(sort(BankData$ages)) > 9125)
## [1] 312
which.max(cumsum(sort(BankData$ages)) > 9125*2)
## [1] 528
# Equal Count
cuts <-
BankData %>%
select(ages) %>%
pull() %>%
quantile(probs = c(0, 1/3, 2/3, 1))
cuts[1] <- cuts[1] - 1 # include left data point
cuts
## 0% 33.33333% 66.66667% 100%
## 10.00000 32.00000 44.33333 84.00000
BankData %<>%
mutate(age_bins =
cut(ages,
breaks = cuts, # = 3 gives equal width
labels = c("low", "medium", "high") # titles?
),
.after = "ages"
)
BankData %>%
group_by(age_bins) %>%
summarize(Mean = mean(ages),
StdDev = sd(ages),
Size = NROW(ages)
)
## # A tibble: 3 x 4
## age_bins Mean StdDev Size
## * <fct> <dbl> <dbl> <int>
## 1 low 27.6 3.87 234
## 2 medium 38.3 3.37 226
## 3 high 53.4 6.91 230
BankData %>%
select(ages, age_bins) %>%
slice_sample(n = 10)
## # A tibble: 10 x 2
## ages age_bins
## <dbl> <fct>
## 1 29 low
## 2 29 low
## 3 42 medium
## 4 32 low
## 5 34 medium
## 6 20 low
## 7 38 medium
## 8 54 high
## 9 56 high
## 10 28 low
BankData %<>%
group_by(age_bins) %>%
mutate(ages = mean(ages)) %>%
ungroup(age_bins)
BankData %>%
select(ages, age_bins) %>%
slice_sample(n = 10)
## # A tibble: 10 x 2
## ages age_bins
## <dbl> <fct>
## 1 38.3 medium
## 2 53.4 high
## 3 27.6 low
## 4 53.4 high
## 5 53.4 high
## 6 53.4 high
## 7 27.6 low
## 8 27.6 low
## 9 53.4 high
## 10 38.3 medium
BankData <- read_csv("G:/My Drive/homework/Muhammad W/BankData.csv")
BankData %<>%
select(!X1) %>%
filter(complete.cases(.)) # drop_na, na.omit
train_control = trainControl(method = "cv",
number = 10)
BankData.svm <-
BankData %>%
train(approval ~.,
data = .,
method = "svmLinear",
trControl = train_control
)
BankData.svm
## Support Vector Machines with Linear Kernel
##
## 666 samples
## 11 predictor
## 2 classes: '-', '+'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 599, 599, 600, 599, 600, 599, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8633114 0.7284304
##
## Tuning parameter 'C' was held constant at a value of 1
train_control = trainControl(method = "cv",
number = 10,
search = "random"
)
BankData.svm <-
BankData %>%
train(approval ~.,
data = .,
method = "svmLinear",
trControl = train_control
)
BankData.svm
## Support Vector Machines with Linear Kernel
##
## 666 samples
## 11 predictor
## 2 classes: '-', '+'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 600, 599, 599, 599, 599, 599, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.3209801 0.8634273 0.7286486
## 2.1126227 0.8634273 0.7286486
## 3.7125351 0.8634273 0.7286486
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.3209801.
starwars %>% dim()
## [1] 87 14
starwars %>% drop_na() %>% dim()
## [1] 6 14
sw <-
starwars %>%
select(!c(name, films:starships)) %>%
drop_na()
sw %>% glimpse()
## Rows: 29
## Columns: 10
## $ height <int> 172, 202, 150, 178, 165, 183, 182, 188, 228, 180, 170, 1...
## $ mass <dbl> 77.0, 136.0, 49.0, 120.0, 75.0, 84.0, 77.0, 84.0, 112.0,...
## $ hair_color <chr> "blond", "none", "brown", "brown, grey", "brown", "black...
## $ skin_color <chr> "fair", "white", "light", "light", "light", "light", "fa...
## $ eye_color <chr> "blue", "yellow", "brown", "blue", "blue", "brown", "blu...
## $ birth_year <dbl> 19.0, 41.9, 19.0, 52.0, 47.0, 24.0, 57.0, 41.9, 200.0, 2...
## $ sex <chr> "male", "male", "female", "male", "female", "male", "mal...
## $ gender <chr> "masculine", "masculine", "feminine", "masculine", "femi...
## $ homeworld <chr> "Tatooine", "Tatooine", "Alderaan", "Tatooine", "Tatooin...
## $ species <chr> "Human", "Human", "Human", "Human", "Human", "Human", "H...
sw %>% mutate(across(where(is_character), as_factor)) %>% summary()
## height mass hair_color skin_color eye_color
## Min. : 88 Min. : 20.00 none :9 fair :7 brown :10
## 1st Qu.:170 1st Qu.: 75.00 brown :7 light :6 blue : 8
## Median :180 Median : 79.00 black :6 pale :2 yellow : 4
## Mean :178 Mean : 77.77 blond :2 dark :2 hazel : 2
## 3rd Qu.:188 3rd Qu.: 83.00 white :2 orange :2 orange : 2
## Max. :228 Max. :136.00 brown, grey:1 yellow :2 blue-gray: 1
## (Other) :2 (Other):8 (Other) : 2
## birth_year sex gender homeworld species
## Min. : 8.00 male :23 masculine:23 Tatooine: 6 Human :18
## 1st Qu.: 31.00 female: 6 feminine : 6 Naboo : 3 Mirialan : 2
## Median : 46.00 Corellia: 2 Wookiee : 1
## Mean : 51.29 Mirial : 2 Trandoshan : 1
## 3rd Qu.: 57.00 Alderaan: 1 Mon Calamari: 1
## Max. :200.00 Stewjon : 1 Ewok : 1
## (Other) :14 (Other) : 5
sw %>% datatable(options = list(pageLength = 5))
sw %>% select(!homeworld) %>% ggpairs(axisLabels = "none")
dummy <- dummyVars(gender ~., data = sw)
sw.dummy <- as_tibble(predict(dummy, newdata = sw))
sw %>% dim()
## [1] 29 10
sw.dummy %>% dim()
## [1] 29 66
sw.dummy %<>% mutate(gender = sw$gender)
train_control = trainControl(method = "cv",
number = 10,
search = "grid"
)
sw.svm <-
sw.dummy %>%
train(gender ~ .,
data = .,
method = "svmLinear",
trControl = train_control
)
sw.svm
## Support Vector Machines with Linear Kernel
##
## 29 samples
## 66 predictors
## 2 classes: 'feminine', 'masculine'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 26, 27, 25, 27, 27, 25, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9333333 0.75
##
## Tuning parameter 'C' was held constant at a value of 1
dummy <- dummyVars(gender ~., data = sw)
sw.dummy <- as_tibble(predict(dummy, newdata = sw))
sw.dummy %<>% select(!nearZeroVar(.))
sw.dummy %>% dim()
## [1] 29 27
sw.pca <-
sw.dummy %>%
prcomp()
sw.pca %>% summary()
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 41.2487 22.9459 10.69652 0.73879 0.72440 0.56879 0.55673
## Proportion of Variance 0.7256 0.2245 0.04879 0.00023 0.00022 0.00014 0.00013
## Cumulative Proportion 0.7256 0.9501 0.99886 0.99909 0.99932 0.99946 0.99959
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.46260 0.42802 0.33819 0.30140 0.26756 0.25373 0.24398
## Proportion of Variance 0.00009 0.00008 0.00005 0.00004 0.00003 0.00003 0.00003
## Cumulative Proportion 0.99968 0.99976 0.99981 0.99984 0.99988 0.99990 0.99993
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.20403 0.18732 0.17363 0.14595 0.11313 0.1039 0.08506
## Proportion of Variance 0.00002 0.00001 0.00001 0.00001 0.00001 0.0000 0.00000
## Cumulative Proportion 0.99995 0.99996 0.99997 0.99998 0.99999 1.0000 1.00000
## PC22 PC23 PC24 PC25 PC26 PC27
## Standard deviation 0.07144 0.0581 0.03561 2.748e-15 2.748e-15 2.748e-15
## Proportion of Variance 0.00000 0.0000 0.00000 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.00000 1.0000 1.00000 1.000e+00 1.000e+00 1.000e+00
screeplot(sw.pca, type = "l") + title(xlab = "PCs")
## integer(0)
preProc <- preProcess(sw.dummy, method = "pca", pcaComp = 4)
sw.pc <- predict(preProc, sw.dummy) %>% as_tibble()
sw.pc$gender <- sw$gender
sw.pc %>% slice_sample(n = 10)
## # A tibble: 10 x 5
## PC1 PC2 PC3 PC4 gender
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0.691 -0.712 -2.36 -1.55 masculine
## 2 1.35 0.0710 0.560 0.648 masculine
## 3 0.697 -1.65 -3.37 -1.69 masculine
## 4 1.36 3.39 2.98 -2.81 masculine
## 5 1.57 -0.350 -0.696 2.71 masculine
## 6 1.04 2.07 1.37 -1.90 masculine
## 7 1.89 2.93 -0.347 3.81 masculine
## 8 0.996 -2.84 0.653 1.11 masculine
## 9 0.337 -0.114 -0.457 -1.54 masculine
## 10 -1.79 -2.07 -0.590 -1.79 feminine
train_control = trainControl(method = "cv",
number = 10,
search = "grid"
)
sw.pca <-
sw.pc %>%
train(gender ~ .,
data = .,
method = "svmLinear",
trControl = train_control
)
sw.pca
## Support Vector Machines with Linear Kernel
##
## 29 samples
## 4 predictor
## 2 classes: 'feminine', 'masculine'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 25, 26, 25, 27, 26, 27, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9166667 0.7142857
##
## Tuning parameter 'C' was held constant at a value of 1
sw.dummy %<>% mutate(gender = sw$gender)
train_control = trainControl(method = "cv",
number = 10,
search = "grid"
)
sw.svm <-
sw.dummy %>% # used reduced data from part (c)
train(gender ~ .,
data = .,
method = "svmLinear",
trControl = train_control
)
sw.svm
## Support Vector Machines with Linear Kernel
##
## 29 samples
## 27 predictors
## 2 classes: 'feminine', 'masculine'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 27, 27, 26, 25, 26, 26, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9666667 0.8571429
##
## Tuning parameter 'C' was held constant at a value of 1
sw.svm %>% confusionMatrix()
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction feminine masculine
## feminine 20.7 3.4
## masculine 0.0 75.9
##
## Accuracy (average) : 0.9655