Predict whether a person is married or divorced, based on the person’s self-reported responses regarding their relationships.
https://archive.ics.uci.edu/ml/datasets/Divorce+Predictors+data+set#
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## corrplot 0.84 loaded
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## atr1 atr2 atr3 atr4 atr5 atr6 atr7 atr8 atr9 atr10 atr11 atr12 atr13
## 1 2 2 4 1 0 0 0 0 0 0 1 0 1
## 2 4 4 4 4 4 0 0 4 4 4 4 3 4
## 3 2 2 2 2 1 3 2 1 1 2 3 4 2
## 4 3 2 3 2 3 3 3 3 3 3 4 3 3
## 5 2 2 1 1 1 1 0 0 0 0 0 1 0
## 6 0 0 1 0 0 2 0 0 0 1 0 2 1
## atr14 atr15 atr16 atr17 atr18 atr19 atr20 atr21 atr22 atr23 atr24 atr25
## 1 1 0 1 0 0 0 1 0 0 0 0 0
## 2 0 4 4 4 4 3 2 1 1 0 2 2
## 3 3 3 3 3 3 3 2 1 0 1 2 2
## 4 4 3 3 3 3 3 4 1 1 1 1 2
## 5 1 1 1 1 1 2 1 1 0 0 0 0
## 6 0 2 0 2 1 0 1 0 0 0 0 2
## atr26 atr27 atr28 atr29 atr30 atr31 atr32 atr33 atr34 atr35 atr36 atr37
## 1 0 0 0 0 1 1 2 1 2 0 1 2
## 2 1 2 0 1 1 0 4 2 3 0 2 3
## 3 2 2 2 3 2 3 3 1 1 1 1 2
## 4 1 1 1 1 3 2 3 2 2 1 1 3
## 5 2 1 2 1 1 1 1 1 1 0 0 0
## 6 2 0 0 0 0 4 1 1 1 1 1 1
## atr38 atr39 atr40 atr41 atr42 atr43 atr44 atr45 atr46 atr47 atr48 atr49
## 1 1 3 3 2 1 1 2 3 2 1 3 3
## 2 4 2 4 2 2 3 4 2 2 2 3 4
## 3 1 3 3 3 3 2 3 2 3 2 3 1
## 4 3 4 4 2 2 3 2 3 2 2 3 3
## 5 0 2 1 0 2 3 0 2 2 1 2 3
## 6 2 0 2 2 1 2 3 0 2 2 1 2
## atr50 atr51 atr52 atr53 atr54 class
## 1 3 2 3 2 1 1
## 2 4 4 4 2 2 1
## 3 1 1 2 2 2 1
## 4 3 3 2 2 2 1
## 5 2 2 2 1 0 1
## 6 1 1 1 2 0 1
## 'data.frame': 170 obs. of 55 variables:
## $ atr1 : int 2 4 2 3 2 0 3 2 2 1 ...
## $ atr2 : int 2 4 2 2 2 0 3 1 2 1 ...
## $ atr3 : int 4 4 2 3 1 1 3 2 1 1 ...
## $ atr4 : int 1 4 2 2 1 0 2 2 0 1 ...
## $ atr5 : int 0 4 1 3 1 0 1 2 0 1 ...
## $ atr6 : int 0 0 3 3 1 2 3 1 4 2 ...
## $ atr7 : int 0 0 2 3 0 0 4 0 1 0 ...
## $ atr8 : int 0 4 1 3 0 0 3 3 3 2 ...
## $ atr9 : int 0 4 1 3 0 0 2 3 3 2 ...
## $ atr10: int 0 4 2 3 0 1 2 2 3 2 ...
## $ atr11: int 1 4 3 4 0 0 2 4 3 3 ...
## $ atr12: int 0 3 4 3 1 2 2 3 3 0 ...
## $ atr13: int 1 4 2 3 0 1 2 2 3 0 ...
## $ atr14: int 1 0 3 4 1 0 3 3 3 2 ...
## $ atr15: int 0 4 3 3 1 2 2 4 3 1 ...
## $ atr16: int 1 4 3 3 1 0 3 3 3 0 ...
## $ atr17: int 0 4 3 3 1 2 3 2 3 1 ...
## $ atr18: int 0 4 3 3 1 1 3 3 3 2 ...
## $ atr19: int 0 3 3 3 2 0 3 2 3 1 ...
## $ atr20: int 1 2 2 4 1 1 2 1 3 0 ...
## $ atr21: int 0 1 1 1 1 0 3 2 2 0 ...
## $ atr22: int 0 1 0 1 0 0 3 1 2 0 ...
## $ atr23: int 0 0 1 1 0 0 3 1 2 0 ...
## $ atr24: int 0 2 2 1 0 0 3 2 3 1 ...
## $ atr25: int 0 2 2 2 0 2 2 3 2 1 ...
## $ atr26: int 0 1 2 1 2 2 3 3 3 1 ...
## $ atr27: int 0 2 2 1 1 0 3 2 2 1 ...
## $ atr28: int 0 0 2 1 2 0 2 2 3 1 ...
## $ atr29: int 0 1 3 1 1 0 2 2 2 1 ...
## $ atr30: int 1 1 2 3 1 0 2 3 3 1 ...
## $ atr31: int 1 0 3 2 1 4 1 1 1 1 ...
## $ atr32: int 2 4 3 3 1 1 2 1 1 1 ...
## $ atr33: int 1 2 1 2 1 1 2 0 1 0 ...
## $ atr34: int 2 3 1 2 1 1 1 2 1 1 ...
## $ atr35: int 0 0 1 1 0 1 1 2 1 0 ...
## $ atr36: int 1 2 1 1 0 1 2 1 1 0 ...
## $ atr37: int 2 3 2 3 0 1 3 4 1 1 ...
## $ atr38: int 1 4 1 3 0 2 2 4 2 1 ...
## $ atr39: int 3 2 3 4 2 0 2 4 2 2 ...
## $ atr40: int 3 4 3 4 1 2 3 4 2 2 ...
## $ atr41: int 2 2 3 2 0 2 3 4 2 1 ...
## $ atr42: int 1 2 3 2 2 1 3 4 2 2 ...
## $ atr43: int 1 3 2 3 3 2 3 3 2 3 ...
## $ atr44: int 2 4 3 2 0 3 4 2 2 2 ...
## $ atr45: int 3 2 2 3 2 0 3 0 2 2 ...
## $ atr46: int 2 2 3 2 2 2 3 0 1 2 ...
## $ atr47: int 1 2 2 2 1 2 2 1 1 0 ...
## $ atr48: int 3 3 3 3 2 1 3 2 1 2 ...
## $ atr49: int 3 4 1 3 3 2 2 2 1 2 ...
## $ atr50: int 3 4 1 3 2 1 3 2 1 2 ...
## $ atr51: int 2 4 1 3 2 1 3 1 1 2 ...
## $ atr52: int 3 4 2 2 2 1 2 1 1 4 ...
## $ atr53: int 2 2 2 2 1 2 2 1 1 3 ...
## $ atr54: int 1 2 2 2 0 0 2 0 1 3 ...
## $ class: int 1 1 1 1 1 1 1 1 1 1 ...
##
## 0 1
## 86 84
## mutate outcome variable
# Relabel the integer outcome (0 = married, 1 = divorced) and convert it to a
# factor in a single step. The levels are listed explicitly so that both
# always exist, in the order "divorced", "married" (the same order the
# alphabetical default produced), even if a class happens to be absent.
# Any value other than 0 or 1 becomes NA.
data <- data %>%
  mutate(
    class = factor(
      class,
      levels = c(1, 0),
      labels = c("divorced", "married")
    )
  )
## see modified data structure
str(data)
## 'data.frame': 170 obs. of 55 variables:
## $ atr1 : int 2 4 2 3 2 0 3 2 2 1 ...
## $ atr2 : int 2 4 2 2 2 0 3 1 2 1 ...
## $ atr3 : int 4 4 2 3 1 1 3 2 1 1 ...
## $ atr4 : int 1 4 2 2 1 0 2 2 0 1 ...
## $ atr5 : int 0 4 1 3 1 0 1 2 0 1 ...
## $ atr6 : int 0 0 3 3 1 2 3 1 4 2 ...
## $ atr7 : int 0 0 2 3 0 0 4 0 1 0 ...
## $ atr8 : int 0 4 1 3 0 0 3 3 3 2 ...
## $ atr9 : int 0 4 1 3 0 0 2 3 3 2 ...
## $ atr10: int 0 4 2 3 0 1 2 2 3 2 ...
## $ atr11: int 1 4 3 4 0 0 2 4 3 3 ...
## $ atr12: int 0 3 4 3 1 2 2 3 3 0 ...
## $ atr13: int 1 4 2 3 0 1 2 2 3 0 ...
## $ atr14: int 1 0 3 4 1 0 3 3 3 2 ...
## $ atr15: int 0 4 3 3 1 2 2 4 3 1 ...
## $ atr16: int 1 4 3 3 1 0 3 3 3 0 ...
## $ atr17: int 0 4 3 3 1 2 3 2 3 1 ...
## $ atr18: int 0 4 3 3 1 1 3 3 3 2 ...
## $ atr19: int 0 3 3 3 2 0 3 2 3 1 ...
## $ atr20: int 1 2 2 4 1 1 2 1 3 0 ...
## $ atr21: int 0 1 1 1 1 0 3 2 2 0 ...
## $ atr22: int 0 1 0 1 0 0 3 1 2 0 ...
## $ atr23: int 0 0 1 1 0 0 3 1 2 0 ...
## $ atr24: int 0 2 2 1 0 0 3 2 3 1 ...
## $ atr25: int 0 2 2 2 0 2 2 3 2 1 ...
## $ atr26: int 0 1 2 1 2 2 3 3 3 1 ...
## $ atr27: int 0 2 2 1 1 0 3 2 2 1 ...
## $ atr28: int 0 0 2 1 2 0 2 2 3 1 ...
## $ atr29: int 0 1 3 1 1 0 2 2 2 1 ...
## $ atr30: int 1 1 2 3 1 0 2 3 3 1 ...
## $ atr31: int 1 0 3 2 1 4 1 1 1 1 ...
## $ atr32: int 2 4 3 3 1 1 2 1 1 1 ...
## $ atr33: int 1 2 1 2 1 1 2 0 1 0 ...
## $ atr34: int 2 3 1 2 1 1 1 2 1 1 ...
## $ atr35: int 0 0 1 1 0 1 1 2 1 0 ...
## $ atr36: int 1 2 1 1 0 1 2 1 1 0 ...
## $ atr37: int 2 3 2 3 0 1 3 4 1 1 ...
## $ atr38: int 1 4 1 3 0 2 2 4 2 1 ...
## $ atr39: int 3 2 3 4 2 0 2 4 2 2 ...
## $ atr40: int 3 4 3 4 1 2 3 4 2 2 ...
## $ atr41: int 2 2 3 2 0 2 3 4 2 1 ...
## $ atr42: int 1 2 3 2 2 1 3 4 2 2 ...
## $ atr43: int 1 3 2 3 3 2 3 3 2 3 ...
## $ atr44: int 2 4 3 2 0 3 4 2 2 2 ...
## $ atr45: int 3 2 2 3 2 0 3 0 2 2 ...
## $ atr46: int 2 2 3 2 2 2 3 0 1 2 ...
## $ atr47: int 1 2 2 2 1 2 2 1 1 0 ...
## $ atr48: int 3 3 3 3 2 1 3 2 1 2 ...
## $ atr49: int 3 4 1 3 3 2 2 2 1 2 ...
## $ atr50: int 3 4 1 3 2 1 3 2 1 2 ...
## $ atr51: int 2 4 1 3 2 1 3 1 1 2 ...
## $ atr52: int 3 4 2 2 2 1 2 1 1 4 ...
## $ atr53: int 2 2 2 2 1 2 2 1 1 3 ...
## $ atr54: int 1 2 2 2 0 0 2 0 1 3 ...
## $ class: Factor w/ 2 levels "divorced","married": 1 1 1 1 1 1 1 1 1 1 ...
# number of observations available for splitting
nrows <- nrow(data)
# fix the RNG state so that the split is reproducible
set.seed(1)
# draw the row positions that make up the training set (about 70% of rows)
# NOTE(review): 0.7 * nrows is not guaranteed to be a whole number; the
# confusion matrices further down show 118 training and 52 test rows for the
# 170 observations, so the size is evidently truncated. Wrap it in floor() or
# round() if a specific rounding is intended.
indices <- sample(seq_len(nrows), 0.7 * nrows)
# the sampled rows form the training set, all remaining rows the test set
train <- data[indices, ]
test <- data[-indices, ]
## proportion of outcome variable values in training dataset
prop.table(table(train$class))
##
## divorced married
## 0.4745763 0.5254237
##
## divorced married
## 0.5384615 0.4615385
## # A tibble: 2 x 55
## class atr1 atr2 atr3 atr4 atr5 atr6 atr7 atr8 atr9 atr10
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 divo… 3.20 2.89 2.91 2.84 3.04 1.11 1.07 2.79 2.88 2.84
## 2 marr… 0.452 0.371 0.629 0.226 0.0968 0.435 0.0161 0.129 0.0645 0.387
## # … with 44 more variables: atr11 <dbl>, atr12 <dbl>, atr13 <dbl>,
## # atr14 <dbl>, atr15 <dbl>, atr16 <dbl>, atr17 <dbl>, atr18 <dbl>,
## # atr19 <dbl>, atr20 <dbl>, atr21 <dbl>, atr22 <dbl>, atr23 <dbl>,
## # atr24 <dbl>, atr25 <dbl>, atr26 <dbl>, atr27 <dbl>, atr28 <dbl>,
## # atr29 <dbl>, atr30 <dbl>, atr31 <dbl>, atr32 <dbl>, atr33 <dbl>,
## # atr34 <dbl>, atr35 <dbl>, atr36 <dbl>, atr37 <dbl>, atr38 <dbl>,
## # atr39 <dbl>, atr40 <dbl>, atr41 <dbl>, atr42 <dbl>, atr43 <dbl>,
## # atr44 <dbl>, atr45 <dbl>, atr46 <dbl>, atr47 <dbl>, atr48 <dbl>,
## # atr49 <dbl>, atr50 <dbl>, atr51 <dbl>, atr52 <dbl>, atr53 <dbl>,
## # atr54 <dbl>
## define predictor variables
# every column except the outcome is a predictor
predictor_variables <- setdiff(colnames(data), "class")
## create one density plot per predictor, split by outcome class
# lapply() builds the whole list in one go: no manual index, no list that is
# grown element by element, and no global variable named `plot`.
# aes_string() is kept because it resolves the column name immediately, so
# each plot is bound to its own predictor.
plot_list <- lapply(predictor_variables, function(predictor_variable) {
  ggplot(train) +
    geom_density(
      aes_string(x = predictor_variable, fill = "class"),
      alpha = 0.5
    )
})
## display plots
# arrange all plots on a grid with three columns
do.call("grid.arrange", c(plot_list, ncol = 3))
# Because we have many redundant, highly-correlated features in our dataset, we would like to reduce the number of feature variables. One method we can apply is principal component analysis (PCA), a very popular dimension reduction technique.
## create pca object using train dataset
# Predictors are centred and scaled to unit variance before the rotation.
# The argument is spelled `scale.` (with the trailing dot) as prcomp() defines
# it; the shorter `scale` only worked through partial argument matching.
train.pca <- prcomp(
  train[, predictor_variables],
  center = TRUE,
  scale. = TRUE
)
## view pca summary
summary(train.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 6.3107 1.43265 1.28558 1.10536 0.95632 0.8940
## Proportion of Variance 0.7375 0.03801 0.03061 0.02263 0.01694 0.0148
## Cumulative Proportion 0.7375 0.77550 0.80611 0.82874 0.84567 0.8605
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.87436 0.78410 0.76637 0.72687 0.68727 0.65491
## Proportion of Variance 0.01416 0.01139 0.01088 0.00978 0.00875 0.00794
## Cumulative Proportion 0.87463 0.88602 0.89689 0.90668 0.91542 0.92337
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.63608 0.58895 0.53559 0.52314 0.51719 0.48026
## Proportion of Variance 0.00749 0.00642 0.00531 0.00507 0.00495 0.00427
## Cumulative Proportion 0.93086 0.93728 0.94259 0.94766 0.95262 0.95689
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.45829 0.42975 0.42119 0.4027 0.39126 0.36180
## Proportion of Variance 0.00389 0.00342 0.00329 0.0030 0.00283 0.00242
## Cumulative Proportion 0.96078 0.96420 0.96748 0.9705 0.97332 0.97574
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.35333 0.34048 0.32257 0.31750 0.29176 0.27841
## Proportion of Variance 0.00231 0.00215 0.00193 0.00187 0.00158 0.00144
## Cumulative Proportion 0.97806 0.98020 0.98213 0.98400 0.98557 0.98701
## PC31 PC32 PC33 PC34 PC35 PC36
## Standard deviation 0.27353 0.2546 0.25269 0.2434 0.23082 0.21535
## Proportion of Variance 0.00139 0.0012 0.00118 0.0011 0.00099 0.00086
## Cumulative Proportion 0.98839 0.9896 0.99078 0.9919 0.99286 0.99372
## PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.20213 0.19589 0.18960 0.17684 0.16804 0.15455
## Proportion of Variance 0.00076 0.00071 0.00067 0.00058 0.00052 0.00044
## Cumulative Proportion 0.99447 0.99519 0.99585 0.99643 0.99695 0.99740
## PC43 PC44 PC45 PC46 PC47 PC48
## Standard deviation 0.1462 0.13633 0.13259 0.12337 0.11604 0.10787
## Proportion of Variance 0.0004 0.00034 0.00033 0.00028 0.00025 0.00022
## Cumulative Proportion 0.9978 0.99814 0.99846 0.99874 0.99899 0.99921
## PC49 PC50 PC51 PC52 PC53 PC54
## Standard deviation 0.1031 0.08839 0.08762 0.08051 0.07336 0.06928
## Proportion of Variance 0.0002 0.00014 0.00014 0.00012 0.00010 0.00009
## Cumulative Proportion 0.9994 0.99955 0.99969 0.99981 0.99991 1.00000
## save PC components for train dataset
# component scores of every training row, stored as a data frame
train.pc <- as.data.frame(train.pca[["x"]])
# attach the known outcome so the scores can be plotted and modelled by class
train.pc[["class"]] <- train[["class"]]
head(train.pc)
## PC1 PC2 PC3 PC4 PC5 PC6
## 68 -6.259996 -1.3344072 0.9615677 1.0220966 -1.5267867 1.0619955
## 167 5.161627 -2.0799412 0.4831553 -0.2867672 0.9425143 -0.8157273
## 129 5.485848 1.2755808 3.4167542 -2.2705010 -1.0460598 -1.3683011
## 162 4.846028 2.0288484 -0.7191337 -0.0974272 -0.9512514 2.1142603
## 43 -7.480399 0.4698435 -0.9137240 -1.3869813 0.6703389 -0.2796404
## 14 -8.995516 -0.2129881 -0.8438006 -1.5367287 0.6111641 -0.6286081
## PC7 PC8 PC9 PC10 PC11 PC12
## 68 -0.5616945 0.8662736 0.25187992 0.4959995 0.04979634 -0.363392924
## 167 0.0370225 -0.1461199 0.59544086 0.7775325 -0.24669965 0.759597405
## 129 -0.2422135 0.4964586 1.83894388 -0.6583357 -0.02918070 -1.117562886
## 162 -0.1072448 0.7273421 0.82690967 0.5469413 -0.64251858 -0.001581089
## 43 0.2915755 -1.1083297 -0.53706334 -0.5314219 0.01032799 0.017828368
## 14 0.2859603 -1.0277410 -0.07126251 0.2776853 0.73190191 -0.101421162
## PC13 PC14 PC15 PC16 PC17 PC18
## 68 -0.59046229 -0.8145455 -0.34694570 0.05142315 0.8219933 -0.5709258
## 167 -0.47583194 0.1298447 0.06280186 0.04275281 -0.1942569 -0.1017798
## 129 1.61103907 0.3230529 -0.24534365 1.31410663 -0.4367401 -0.3730870
## 162 1.21382002 0.3446329 -0.22929098 -0.91320578 -0.0591777 -0.5052457
## 43 0.29537589 1.2483133 0.20880418 -0.32031367 -0.3361535 0.5385272
## 14 0.03909493 0.2212094 0.17733723 -0.11567159 0.2066434 -0.3181193
## PC19 PC20 PC21 PC22 PC23
## 68 0.41757059 -0.57180456 0.362900223 0.13162007 0.003875525
## 167 0.07062134 -0.35746537 -0.128668049 -0.07550253 0.034094705
## 129 0.67130324 -0.40420446 -0.429534775 -0.38964109 -0.665138140
## 162 -0.80717947 0.09591556 -1.051969370 -0.75010046 -0.482219411
## 43 -0.28282620 -0.35634619 -0.227045648 -0.54371901 0.309271200
## 14 -0.17667712 -0.04335860 -0.005298769 -0.08699008 -0.131140976
## PC24 PC25 PC26 PC27 PC28
## 68 0.185311922 -0.32654577 -0.25914581 -0.12776065 -0.13632314
## 167 -0.009135711 0.33333576 0.14606989 0.20440681 0.11193984
## 129 0.207134743 -0.36289397 -0.36796969 -0.62659512 0.28814981
## 162 0.163056091 0.47359407 -0.28859398 0.02014161 -0.75680426
## 43 0.153347184 0.04225617 -0.17405453 -0.22220531 -0.06159581
## 14 0.225202744 -0.11202381 -0.04091147 0.05660232 -0.01148325
## PC29 PC30 PC31 PC32 PC33
## 68 0.09092058 0.17433000 0.21207672 0.023361127 0.02178616
## 167 -0.19620689 0.13932269 0.02814292 0.181911274 0.25050307
## 129 -0.49861228 -0.03663898 -0.21306284 0.051909498 0.08763653
## 162 0.54149325 -0.21865603 0.14615596 0.216909429 0.44371978
## 43 0.05219967 -0.20314112 -0.04733446 -0.097082597 0.08989949
## 14 0.03723305 -0.03124671 -0.01583253 0.004310976 0.12110513
## PC34 PC35 PC36 PC37 PC38
## 68 -0.051397134 0.053920964 0.07812843 -0.035183298 0.1295634763
## 167 0.452066641 -0.044172252 -0.15487503 0.243716888 0.0931599076
## 129 -0.248445076 -0.106630694 0.01893183 -0.113123059 -0.0372000835
## 162 0.207782423 0.314142855 0.06067841 0.401655432 -0.2107595783
## 43 0.006576239 0.031901045 -0.16872989 0.017669834 -0.1670168772
## 14 -0.001670217 -0.009402096 -0.05139885 0.005162092 0.0009918929
## PC39 PC40 PC41 PC42 PC43
## 68 0.01623038 0.08257751 0.028400291 -0.009031755 0.01970152
## 167 -0.01238713 -0.13661605 0.052926715 -0.124593234 0.17341487
## 129 0.13849899 0.12908410 0.095942637 0.021991969 0.24009812
## 162 0.03041638 0.19675623 0.261745098 0.143272794 -0.01013033
## 43 -0.02388117 0.05639081 -0.037846821 0.052624695 -0.13495473
## 14 0.02633626 -0.06628373 -0.003393285 -0.013377218 -0.01985946
## PC44 PC45 PC46 PC47 PC48
## 68 0.0033521843 -0.03749666 0.038289147 -0.024556551 -0.05361309
## 167 -0.0651303451 -0.01818043 -0.139926729 -0.005122791 -0.07706994
## 129 0.0263172523 -0.04345311 0.128703831 -0.017007282 -0.14981586
## 162 0.0233832761 0.39793099 0.196991816 -0.103414318 0.14139603
## 43 -0.1111785230 0.07883866 -0.077827430 -0.067599260 0.06722406
## 14 0.0006843765 0.01406667 -0.006444003 -0.021034721 0.04764730
## PC49 PC50 PC51 PC52 PC53
## 68 -0.053705774 -0.038328325 0.011451309 -0.005623588 -0.053125593
## 167 0.029167150 -0.027777368 0.031618514 0.011142500 0.015185277
## 129 -0.150516245 0.031539213 -0.089553873 -0.048958280 -0.069717344
## 162 -0.168703325 0.167553815 -0.072958706 0.102416013 0.048393311
## 43 0.060653177 -0.025027865 0.005867316 0.025421316 0.053846789
## 14 0.004883426 -0.000954366 -0.011922953 0.011424081 -0.005920278
## PC54 class
## 68 0.013772926 divorced
## 167 -0.039490820 married
## 129 -0.001300610 married
## 162 -0.006493680 married
## 43 -0.026524123 divorced
## 14 -0.001586914 divorced
# scree plot: percentage of variance for the first ten principal components
fviz_eig(
  train.pca,
  addlabels = TRUE,
  ylim = c(0, 80),
  geom = c("bar", "line"),
  barfill = "pink",
  barcolor = "grey",
  linecolor = "red",
  ncp = 10
) +
  labs(
    title = "Variance Explained By Each Principal Component",
    x = "Principal Components",
    y = "% of Variance"
  )
## first and second principal components
ggplot(train.pc, aes(x = PC1, y = PC2, color = class)) +
  geom_point()
## second and third principal components
ggplot(train.pc, aes(x = PC2, y = PC3, color = class)) +
  geom_point()
# biplot of the training observations, coloured by outcome, with ellipses
# NOTE(review): `col` is not a documented argument name here; it presumably
# partial-matches `col.var` -- confirm and spell it out.
fviz_pca_biplot(
  train.pca,
  col.ind = as.factor(train$class),
  col = "black",
  palette = "jco",
  geom = "point",
  repel = TRUE,
  legend.title = "Outcome",
  addEllipses = TRUE
)
## create model
# logistic regression of the outcome on the first principal component only
# NOTE(review): the fitting warnings printed after this call are presumably
# caused by PC1 separating the two classes perfectly (the training confusion
# matrix below shows no errors) -- confirm before interpreting coefficients.
glm_pca_slim_model <- glm(
  class ~ PC1,
  family = "binomial",
  data = train.pc
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## create prediction probabilities (on train dataset)
# without `newdata`, predict() returns the fitted values for the training rows
glm_pca_slim_train_pred_probs <- predict(glm_pca_slim_model, type = "response")
## create predictions (on train dataset)
# The response factor has levels "divorced", "married", so the fitted
# probability refers to the second level ("married"). The predicted factor is
# given the reference's levels explicitly: it then always carries both levels
# in the same order, even if every prediction falls into one class.
glm_pca_slim_train_preds <- factor(
  ifelse(glm_pca_slim_train_pred_probs > 0.5, "married", "divorced"),
  levels = levels(train.pc$class)
)
## evaluate performance (on train dataset)
confusionMatrix(glm_pca_slim_train_preds, train.pc$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction divorced married
## divorced 56 0
## married 0 62
##
## Accuracy : 1
## 95% CI : (0.9692, 1)
## No Information Rate : 0.5254
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4746
## Detection Rate : 0.4746
## Detection Prevalence : 0.4746
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : divorced
##
## create prediction probabilities (on test dataset)
# NOTE(review): `test.pc` is created outside the code shown here; it is
# presumably the test set projected with the PCA fitted on the training set
# -- confirm.
glm_pca_slim_test_pred_probs <- predict(
  glm_pca_slim_model,
  type = "response",
  newdata = test.pc
)
## create predictions (on test dataset)
# Probabilities above 0.5 are labelled "married", all others "divorced". The
# predicted factor is given the reference's levels explicitly so that it
# always carries both levels in the same order, even if every prediction
# falls into one class.
glm_pca_slim_test_preds <- factor(
  ifelse(glm_pca_slim_test_pred_probs > 0.5, "married", "divorced"),
  levels = levels(test.pc$class)
)
## evaluate performance (on test dataset)
confusionMatrix(glm_pca_slim_test_preds, test.pc$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction divorced married
## divorced 28 0
## married 0 24
##
## Accuracy : 1
## 95% CI : (0.9315, 1)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : 1.047e-14
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5385
## Detection Rate : 0.5385
## Detection Prevalence : 0.5385
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : divorced
##
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## create prediction probabilities (on train dataset)
# without `newdata`, predict() returns the fitted values for the training rows
glm_train_pred_probs <- predict(glm_model, type = "response")
## create predictions (on train dataset)
# Probabilities above 0.5 are labelled "married", all others "divorced". The
# predicted factor is given the reference's levels explicitly so that it
# always carries both levels in the same order, even if every prediction
# falls into one class.
glm_train_preds <- factor(
  ifelse(glm_train_pred_probs > 0.5, "married", "divorced"),
  levels = levels(train$class)
)
## evaluate performance (on train dataset)
# The reference is taken from `train`, the data this model was fitted on
# (see the model call printed further down), rather than from `train.pc`.
confusionMatrix(glm_train_preds, train$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction divorced married
## divorced 56 0
## married 0 62
##
## Accuracy : 1
## 95% CI : (0.9692, 1)
## No Information Rate : 0.5254
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4746
## Detection Rate : 0.4746
## Detection Prevalence : 0.4746
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : divorced
##
## create prediction probabilities (on test dataset)
glm_test_pred_probs <- predict(glm_model, type = "response", newdata = test)
## create predictions (on test dataset)
# Probabilities above 0.5 are labelled "married", all others "divorced". The
# predicted factor is given the reference's levels explicitly so that it
# always carries both levels in the same order, even if every prediction
# falls into one class.
glm_test_preds <- factor(
  ifelse(glm_test_pred_probs > 0.5, "married", "divorced"),
  levels = levels(test$class)
)
## evaluate performance (on test dataset)
confusionMatrix(glm_test_preds, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction divorced married
## divorced 27 0
## married 1 24
##
## Accuracy : 0.9808
## 95% CI : (0.8974, 0.9995)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : 4.772e-13
##
## Kappa : 0.9614
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9643
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9600
## Prevalence : 0.5385
## Detection Rate : 0.5192
## Detection Prevalence : 0.5192
## Balanced Accuracy : 0.9821
##
## 'Positive' Class : divorced
##
##
## Call:
## glm(formula = class ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -9.537e-06 -1.261e-06 2.110e-08 2.078e-06 5.527e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.563e+01 4.034e+05 0 1
## atr1 -1.403e+00 2.249e+05 0 1
## atr2 -1.195e+00 2.309e+05 0 1
## atr3 4.305e-01 1.604e+05 0 1
## atr4 -5.261e+00 1.711e+05 0 1
## atr5 1.627e+00 2.884e+05 0 1
## atr6 -4.401e+00 1.439e+05 0 1
## atr7 -2.183e+00 1.921e+05 0 1
## atr8 -5.297e+00 2.581e+05 0 1
## atr9 2.317e+00 3.710e+05 0 1
## atr10 1.507e+00 3.766e+05 0 1
## atr11 -8.501e+00 3.639e+05 0 1
## atr12 4.628e+00 4.715e+05 0 1
## atr13 3.578e+00 2.585e+05 0 1
## atr14 -1.112e+01 3.138e+05 0 1
## atr15 -6.412e+00 2.695e+05 0 1
## atr16 3.631e-01 5.054e+05 0 1
## atr17 -6.988e+00 3.205e+05 0 1
## atr18 1.314e+01 3.851e+05 0 1
## atr19 8.184e+00 3.069e+05 0 1
## atr20 1.408e+00 5.364e+05 0 1
## atr21 -5.290e-01 2.751e+05 0 1
## atr22 2.892e+00 3.322e+05 0 1
## atr23 2.705e+00 2.428e+05 0 1
## atr24 6.190e+00 2.335e+05 0 1
## atr25 1.645e+00 3.054e+05 0 1
## atr26 -8.749e+00 2.998e+05 0 1
## atr27 2.803e+00 2.106e+05 0 1
## atr28 -2.371e-02 2.487e+05 0 1
## atr29 -2.999e+00 4.093e+05 0 1
## atr30 -3.976e+00 2.632e+05 0 1
## atr31 -5.472e-01 1.210e+05 0 1
## atr32 -2.859e+00 2.362e+05 0 1
## atr33 3.807e+00 2.465e+05 0 1
## atr34 2.421e+00 2.068e+05 0 1
## atr35 -4.430e+00 2.733e+05 0 1
## atr36 1.568e-01 3.224e+05 0 1
## atr37 -1.456e+00 1.599e+05 0 1
## atr38 3.244e+00 2.700e+05 0 1
## atr39 3.553e+00 2.437e+05 0 1
## atr40 -1.212e+01 2.284e+05 0 1
## atr41 -1.311e+00 1.566e+05 0 1
## atr42 2.774e-01 1.770e+05 0 1
## atr43 3.273e+00 1.009e+05 0 1
## atr44 -1.810e+00 1.341e+05 0 1
## atr45 -1.770e+00 1.375e+05 0 1
## atr46 2.583e+00 1.118e+05 0 1
## atr47 -3.112e+00 1.025e+05 0 1
## atr48 6.350e+00 1.609e+05 0 1
## atr49 -4.755e+00 1.984e+05 0 1
## atr50 1.734e+00 1.628e+05 0 1
## atr51 8.476e-01 1.343e+05 0 1
## atr52 -2.961e+00 1.351e+05 0 1
## atr53 -1.034e+00 1.839e+05 0 1
## atr54 4.666e+00 2.271e+05 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.6328e+02 on 117 degrees of freedom
## Residual deviance: 8.8385e-10 on 63 degrees of freedom
## AIC: 110
##
## Number of Fisher Scoring iterations: 25
## create model
# classification tree on all predictors; minsplit = 2 lets a node with as few
# as two observations be split further
rpart_model <- rpart(
  class ~ .,
  data = train[, c("class", predictor_variables)],
  method = "class",
  control = rpart.control(minsplit = 2),
  model = TRUE
)
## create prediction probabilities (on train dataset)
# for a classification tree this is a matrix with one column per class
rpart_train_pred_probs <- predict(rpart_model)
## create predictions (on train dataset)
# The "married" column is selected by name rather than by position, and the
# predicted factor is given the reference's levels explicitly, so the result
# does not depend on column order or on both classes being predicted.
rpart_train_preds <- factor(
  ifelse(rpart_train_pred_probs[, "married"] > 0.5, "married", "divorced"),
  levels = levels(train$class)
)
## evaluate performance (on train dataset)
confusionMatrix(rpart_train_preds, train$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction divorced married
## divorced 56 0
## married 0 62
##
## Accuracy : 1
## 95% CI : (0.9692, 1)
## No Information Rate : 0.5254
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4746
## Detection Rate : 0.4746
## Detection Prevalence : 0.4746
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : divorced
##
## create prediction probabilities (on test dataset)
# for a classification tree this is a matrix with one column per class
rpart_test_pred_probs <- predict(rpart_model, newdata = test)
## create predictions (on test dataset)
# The "married" column is selected by name rather than by position, and the
# predicted factor is given the reference's levels explicitly, so the result
# does not depend on column order or on both classes being predicted.
rpart_test_preds <- factor(
  ifelse(rpart_test_pred_probs[, "married"] > 0.5, "married", "divorced"),
  levels = levels(test$class)
)
## evaluate performance (on test dataset)
confusionMatrix(rpart_test_preds, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction divorced married
## divorced 28 1
## married 0 23
##
## Accuracy : 0.9808
## 95% CI : (0.8974, 0.9995)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : 4.772e-13
##
## Kappa : 0.9612
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.9583
## Pos Pred Value : 0.9655
## Neg Pred Value : 1.0000
## Prevalence : 0.5385
## Detection Rate : 0.5385
## Detection Prevalence : 0.5577
## Balanced Accuracy : 0.9792
##
## 'Positive' Class : divorced
##
## perform aggregation: get mean of each metric
# Per-class mean of every predictor, computed on the full dataset.
# summarise_all() replaces the deprecated summarize_each(); it applies the
# function to every non-grouping column and keeps the column names.
mean.agg <- data %>%
  dplyr::group_by(class) %>%
  dplyr::summarise_all(mean)
head(mean.agg)
## # A tibble: 2 x 55
## class atr1 atr2 atr3 atr4 atr5 atr6 atr7 atr8 atr9 atr10 atr11
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 divo… 3.19 2.87 2.92 2.73 3.01 1.13 0.988 2.81 2.89 2.77 3.21
## 2 marr… 0.395 0.465 0.640 0.267 0.105 0.372 0.0116 0.128 0.0581 0.407 0.198
## # … with 43 more variables: atr12 <dbl>, atr13 <dbl>, atr14 <dbl>,
## # atr15 <dbl>, atr16 <dbl>, atr17 <dbl>, atr18 <dbl>, atr19 <dbl>,
## # atr20 <dbl>, atr21 <dbl>, atr22 <dbl>, atr23 <dbl>, atr24 <dbl>,
## # atr25 <dbl>, atr26 <dbl>, atr27 <dbl>, atr28 <dbl>, atr29 <dbl>,
## # atr30 <dbl>, atr31 <dbl>, atr32 <dbl>, atr33 <dbl>, atr34 <dbl>,
## # atr35 <dbl>, atr36 <dbl>, atr37 <dbl>, atr38 <dbl>, atr39 <dbl>,
## # atr40 <dbl>, atr41 <dbl>, atr42 <dbl>, atr43 <dbl>, atr44 <dbl>,
## # atr45 <dbl>, atr46 <dbl>, atr47 <dbl>, atr48 <dbl>, atr49 <dbl>,
## # atr50 <dbl>, atr51 <dbl>, atr52 <dbl>, atr53 <dbl>, atr54 <dbl>
## perform aggregation: get max of each metric
# Overall maximum of every predictor (the outcome column is dropped first).
# summarise_all() replaces the deprecated summarize_each().
max.agg <- data %>%
  dplyr::select(-class) %>%
  dplyr::summarise_all(max)
head(max.agg)
## atr1 atr2 atr3 atr4 atr5 atr6 atr7 atr8 atr9 atr10 atr11 atr12 atr13
## 1 4 4 4 4 4 4 4 4 4 4 4 4 4
## atr14 atr15 atr16 atr17 atr18 atr19 atr20 atr21 atr22 atr23 atr24 atr25
## 1 4 4 4 4 4 4 4 4 4 4 4 4
## atr26 atr27 atr28 atr29 atr30 atr31 atr32 atr33 atr34 atr35 atr36 atr37
## 1 4 4 4 4 4 4 4 4 4 4 4 4
## atr38 atr39 atr40 atr41 atr42 atr43 atr44 atr45 atr46 atr47 atr48 atr49
## 1 4 4 4 4 4 4 4 4 4 4 4 4
## atr50 atr51 atr52 atr53 atr54
## 1 4 4 4 4 4
## create the data frame that is passed to radarchart()
# Row layout: row 1 holds the maximum of each axis, row 2 the minimum, and
# the remaining rows hold the values that are drawn.
radiochart_df <- rbind(
  # values for outer radarchart edges (maximum values)
  max.agg[, predictor_variables],
  # values for inner radarchart edges (0); one zero per predictor, derived
  # from the predictor list instead of a hard-coded count
  rep(0, length(predictor_variables)),
  # values for radarchart lines (one row per class: divorced, then married)
  mean.agg[, predictor_variables]
)
# label the two range rows "1" and "2" and the data rows with their class
rownames(radiochart_df) <- c(1, 2, as.character(mean.agg$class))
head(radiochart_df)
## atr1 atr2 atr3 atr4 atr5 atr6
## 1 4.0000000 4.0000000 4.0000000 4.0000000 4.0000000 4.000000
## 2 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## divorced 3.1904762 2.8690476 2.9166667 2.7261905 3.0119048 1.130952
## married 0.3953488 0.4651163 0.6395349 0.2674419 0.1046512 0.372093
## atr7 atr8 atr9 atr10 atr11 atr12
## 1 4.00000000 4.000000 4.00000000 4.0000000 4.0000000 4.0000000
## 2 0.00000000 0.000000 0.00000000 0.0000000 0.0000000 0.0000000
## divorced 0.98809524 2.809524 2.89285714 2.7738095 3.2142857 2.9404762
## married 0.01162791 0.127907 0.05813953 0.4069767 0.1976744 0.3953488
## atr13 atr14 atr15 atr16 atr17 atr18
## 1 4.0000000 4.0000000 4.0000000 4.0000000 4.0000000 4.00000000
## 2 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000
## divorced 3.0952381 2.8809524 2.9404762 2.8214286 3.1666667 2.97619048
## married 0.6046512 0.2906977 0.2325581 0.1627907 0.1744186 0.09302326
## atr19 atr20 atr21 atr22 atr23 atr24
## 1 4.0000000 4.00000000 4.0000000 4.00000000 4.00000000 4.0000000
## 2 0.0000000 0.00000000 0.0000000 0.00000000 0.00000000 0.0000000
## divorced 3.1785714 2.88095238 2.6547619 2.45238095 2.77380952 2.7857143
## married 0.1395349 0.06976744 0.1511628 0.06976744 0.08139535 0.2674419
## atr25 atr26 atr27 atr28 atr29 atr30
## 1 4.0000000 4.0000000 4.0000000 4.00000000 4.00000000 4.0000000
## 2 0.0000000 0.0000000 0.0000000 0.00000000 0.00000000 0.0000000
## divorced 2.9523810 2.8095238 2.6785714 2.55952381 2.92857143 2.8214286
## married 0.3372093 0.1976744 0.1511628 0.08139535 0.09302326 0.1976744
## atr31 atr32 atr33 atr34 atr35 atr36
## 1 4.0000000 4.0000000 4.0000000 4.0000000 4.0000000 4.00000000
## 2 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.00000000
## divorced 3.4404762 3.4166667 3.3571429 3.2738095 3.2738095 3.21428571
## married 0.8372093 0.7325581 0.2906977 0.5581395 0.1046512 0.03488372
## atr37 atr38 atr39 atr40 atr41 atr42
## 1 4.000000 4.0000000 4.0000000 4.0000000 4.0000000 4.000000
## 2 0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.000000
## divorced 3.583333 3.4047619 3.6428571 3.5714286 3.5476190 3.333333
## married 0.627907 0.3488372 0.5697674 0.2093023 0.4767442 1.011628
## atr43 atr44 atr45 atr46 atr47 atr48 atr49
## 1 4.000000 4.0000000 4.000000 4.000000 4.000000 4.000000 4.000000
## 2 0.000000 0.0000000 0.000000 0.000000 0.000000 0.000000 0.000000
## divorced 3.476190 3.3809524 3.285714 3.166667 3.321429 3.452381 3.511905
## married 1.953488 0.5348837 1.651163 1.953488 1.244186 2.046512 1.279070
## atr50 atr51 atr52 atr53 atr54
## 1 4.000000 4.000000 4.000000 4.000000 4.0000000
## 2 0.000000 0.000000 0.000000 0.000000 0.0000000
## divorced 3.500000 3.357143 3.488095 3.321429 3.3690476
## married 1.383721 1.616279 1.569767 1.186047 0.6860465
## default radar chart
# one polygon per class, drawn from the per-class means of every predictor
radarchart(
  radiochart_df,
  axistype = 2,
  title = "Mean Metrics"
)
# legend with one entry per class
# NOTE(review): the legend colours come from the integer codes of the class
# factor; presumably these match radarchart()'s default line colours --
# verify against the rendered chart.
legend(
  x = 1,
  y = 1,
  legend = mean.agg$class,
  bty = "n",
  pch = 20,
  col = mean.agg$class,
  cex = 1.2,
  pt.cex = 3
)