Discriminant Analysis
QUESTION 1
Library
## Loading required package: MASS
## ---
## biotools version 4.3
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
Load Dataset
## 'data.frame': 395 obs. of 33 variables:
## $ school : chr "GP" "GP" "GP" "GP" ...
## $ sex : chr "F" "F" "F" "F" ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : chr "U" "U" "U" "U" ...
## $ famsize : chr "GT3" "GT3" "LE3" "GT3" ...
## $ Pstatus : chr "A" "T" "T" "T" ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : chr "at_home" "at_home" "at_home" "health" ...
## $ Fjob : chr "teacher" "other" "other" "services" ...
## $ reason : chr "course" "course" "other" "home" ...
## $ guardian : chr "mother" "father" "mother" "mother" ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : chr "yes" "no" "yes" "no" ...
## $ famsup : chr "no" "yes" "no" "yes" ...
## $ paid : chr "no" "no" "yes" "yes" ...
## $ activities: chr "no" "no" "no" "yes" ...
## $ nursery : chr "yes" "no" "yes" "yes" ...
## $ higher : chr "yes" "yes" "yes" "yes" ...
## $ internet : chr "no" "yes" "yes" "yes" ...
## $ romantic : chr "no" "no" "no" "yes" ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : int 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : int 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : int 6 6 10 15 10 15 11 6 19 15 ...
Summary Statistics
## school sex age address
## Length:395 Length:395 Min. :15.0 Length:395
## Class :character Class :character 1st Qu.:16.0 Class :character
## Mode :character Mode :character Median :17.0 Mode :character
## Mean :16.7
## 3rd Qu.:18.0
## Max. :22.0
## famsize Pstatus Medu Fedu
## Length:395 Length:395 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Mode :character Median :3.000 Median :2.000
## Mean :2.749 Mean :2.522
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :4.000 Max. :4.000
## Mjob Fjob reason guardian
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## traveltime studytime failures schoolsup
## Min. :1.000 Min. :1.000 Min. :0.0000 Length:395
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :1.000 Median :2.000 Median :0.0000 Mode :character
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## higher internet romantic famrel
## Length:395 Length:395 Length:395 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.944
## 3rd Qu.:5.000
## Max. :5.000
## freetime goout Dalc Walc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :1.000 Median :2.000
## Mean :3.235 Mean :3.109 Mean :1.481 Mean :2.291
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## health absences G1 G2
## Min. :1.000 Min. : 0.000 Min. : 3.00 Min. : 0.00
## 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00 1st Qu.: 9.00
## Median :4.000 Median : 4.000 Median :11.00 Median :11.00
## Mean :3.554 Mean : 5.709 Mean :10.91 Mean :10.71
## 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00 3rd Qu.:13.00
## Max. :5.000 Max. :75.000 Max. :19.00 Max. :19.00
## G3
## Min. : 0.00
## 1st Qu.: 8.00
## Median :11.00
## Mean :10.42
## 3rd Qu.:14.00
## Max. :20.00
## vars n mean sd median trimmed mad min max range skew
## school* 1 395 1.12 0.32 1 1.02 0.00 1 2 1 2.38
## sex* 2 395 1.47 0.50 1 1.47 0.00 1 2 1 0.11
## age 3 395 16.70 1.28 17 16.63 1.48 15 22 7 0.46
## address* 4 395 1.78 0.42 2 1.85 0.00 1 2 1 -1.33
## famsize* 5 395 1.29 0.45 1 1.24 0.00 1 2 1 0.93
## Pstatus* 6 395 1.90 0.31 2 1.99 0.00 1 2 1 -2.59
## Medu 7 395 2.75 1.09 3 2.82 1.48 0 4 4 -0.32
## Fedu 8 395 2.52 1.09 2 2.53 1.48 0 4 4 -0.03
## Mjob* 9 395 3.17 1.23 3 3.21 1.48 1 5 4 -0.33
## Fjob* 10 395 3.28 0.86 3 3.32 0.00 1 5 4 -0.36
## reason* 11 395 2.26 1.21 2 2.20 1.48 1 4 3 0.41
## guardian* 12 395 1.85 0.54 2 1.84 0.00 1 3 2 -0.11
## traveltime 13 395 1.45 0.70 1 1.31 0.00 1 4 3 1.59
## studytime 14 395 2.04 0.84 2 1.96 0.00 1 4 3 0.63
## failures 15 395 0.33 0.74 0 0.14 0.00 0 3 3 2.37
## schoolsup* 16 395 1.13 0.34 1 1.04 0.00 1 2 1 2.20
## famsup* 17 395 1.61 0.49 2 1.64 0.00 1 2 1 -0.46
## paid* 18 395 1.46 0.50 1 1.45 0.00 1 2 1 0.17
## activities* 19 395 1.51 0.50 2 1.51 0.00 1 2 1 -0.04
## nursery* 20 395 1.79 0.40 2 1.87 0.00 1 2 1 -1.46
## higher* 21 395 1.95 0.22 2 2.00 0.00 1 2 1 -4.08
## internet* 22 395 1.83 0.37 2 1.91 0.00 1 2 1 -1.78
## romantic* 23 395 1.33 0.47 1 1.29 0.00 1 2 1 0.70
## famrel 24 395 3.94 0.90 4 4.04 1.48 1 5 4 -0.94
## freetime 25 395 3.24 1.00 3 3.23 1.48 1 5 4 -0.16
## goout 26 395 3.11 1.11 3 3.09 1.48 1 5 4 0.12
## Dalc 27 395 1.48 0.89 1 1.27 0.00 1 5 4 2.17
## Walc 28 395 2.29 1.29 2 2.15 1.48 1 5 4 0.61
## health 29 395 3.55 1.39 4 3.69 1.48 1 5 4 -0.49
## absences 30 395 5.71 8.00 4 4.24 5.93 0 75 75 3.64
## G1 31 395 10.91 3.32 11 10.80 4.45 3 19 16 0.24
## G2 32 395 10.71 3.76 11 10.84 2.97 0 19 19 -0.43
## G3 33 395 10.42 4.58 11 10.84 4.45 0 20 20 -0.73
## kurtosis se
## school* 3.68 0.02
## sex* -1.99 0.03
## age -0.03 0.06
## address* -0.24 0.02
## famsize* -1.14 0.02
## Pstatus* 4.71 0.02
## Medu -1.10 0.06
## Fedu -1.21 0.05
## Mjob* -0.69 0.06
## Fjob* 0.98 0.04
## reason* -1.40 0.06
## guardian* 0.15 0.03
## traveltime 2.27 0.04
## studytime -0.04 0.04
## failures 4.89 0.04
## schoolsup* 2.86 0.02
## famsup* -1.79 0.02
## paid* -1.98 0.03
## activities* -2.00 0.03
## nursery* 0.12 0.02
## higher* 14.71 0.01
## internet* 1.16 0.02
## romantic* -1.51 0.02
## famrel 1.09 0.05
## freetime -0.33 0.05
## goout -0.79 0.06
## Dalc 4.65 0.04
## Walc -0.81 0.06
## health -1.03 0.07
## absences 21.31 0.40
## G1 -0.71 0.17
## G2 0.59 0.19
## G3 0.37 0.23
Preprocessing
Check Missing and Duplicate Values
## school sex age address famsize Pstatus Medu
## 0 0 0 0 0 0 0
## Fedu Mjob Fjob reason guardian traveltime studytime
## 0 0 0 0 0 0 0
## failures schoolsup famsup paid activities nursery higher
## 0 0 0 0 0 0 0
## internet romantic famrel freetime goout Dalc Walc
## 0 0 0 0 0 0 0
## health absences G1 G2 G3
## 0 0 0 0 0
## [1] 0
Make Binary Target
Split Train Test
Assumption Test
Quadratic Discriminant Analysis (QDA)
Summarize Class Code
Resubstitution (prior = sample)
DA1 <- qda(formula = G3_pass ~ ., data = train, CV = FALSE)
pred_DA1 <- predict(object = DA1, newdata = test)
head(pred_DA1$posterior)## fail pass
## 3 4.119394e-09 1.0000000
## 4 4.681687e-08 1.0000000
## 7 7.219246e-08 0.9999999
## 11 2.677493e-01 0.7322507
## 12 3.628454e-06 0.9999964
## 20 1.564386e-08 1.0000000
## [1] pass pass pass pass pass pass
## Levels: fail pass
## $class.table
## classify
## original fail pass
## fail 13 13
## pass 2 51
##
## $prop
## classify
## original fail pass
## fail 0.5000 0.5000
## pass 0.0377 0.9623
##
## $overall.correct
## [1] 0.8101
Cross-Validation (prior = sample)
## fail pass
## 1 2.130383e-01 0.7869617
## 2 5.051233e-01 0.4948767
## 3 1.135985e-02 0.9886402
## 4 3.165301e-09 1.0000000
## 5 8.638383e-01 0.1361617
## 6 4.298512e-16 1.0000000
## [1] pass fail pass pass fail pass
## Levels: fail pass
## $class.table
## classify
## original fail pass
## fail 66 64
## pass 21 244
##
## $prop
## classify
## original fail pass
## fail 0.5077 0.4923
## pass 0.0792 0.9208
##
## $overall.correct
## [1] 0.7848
QUESTION 3
Library
library(biotools)
library(caret)
library(dplyr)
library(ggplot2)
library(GGally)
library(haven)
library(MASS)
library(MVN)##
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
##
## mardia
Load Dataset
## tibble [244 × 5] (S3: tbl_df/tbl/data.frame)
## $ OUTDOOR : num [1:244] 10 14 19 14 14 20 6 13 18 16 ...
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "display_width")= int 9
## $ SOCIAL : num [1:244] 22 17 33 29 25 25 18 27 31 35 ...
## ..- attr(*, "format.spss")= chr "F2.0"
## $ CONSERVATIVE: num [1:244] 5 6 7 12 7 12 4 7 9 13 ...
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "display_width")= int 14
## $ JOB : num [1:244] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 5
## $ JID : num [1:244] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "display_width")= int 5
Summary Statistics
## OUTDOOR SOCIAL CONSERVATIVE JOB
## Min. : 0.00 Min. : 7.00 Min. : 0.00 Min. :1.000
## 1st Qu.:13.00 1st Qu.:17.00 1st Qu.: 8.00 1st Qu.:1.000
## Median :16.00 Median :21.00 Median :11.00 Median :2.000
## Mean :15.64 Mean :20.68 Mean :10.59 Mean :1.922
## 3rd Qu.:19.00 3rd Qu.:25.00 3rd Qu.:13.00 3rd Qu.:3.000
## Max. :28.00 Max. :35.00 Max. :20.00 Max. :3.000
## JID
## Min. : 1.00
## 1st Qu.:21.00
## Median :41.00
## Mean :41.95
## 3rd Qu.:61.25
## Max. :93.00
## vars n mean sd median trimmed mad min max range skew
## OUTDOOR 1 244 15.64 4.84 16 15.84 4.45 0 28 28 -0.41
## SOCIAL 2 244 20.68 5.48 21 20.76 5.93 7 35 28 -0.09
## CONSERVATIVE 3 244 10.59 3.73 11 10.53 4.45 0 20 20 0.05
## JOB 4 244 1.92 0.78 2 1.90 1.48 1 3 2 0.14
## JID 5 244 41.95 24.79 41 41.37 29.65 1 93 92 0.15
## kurtosis se
## OUTDOOR 0.22 0.31
## SOCIAL -0.69 0.35
## CONSERVATIVE -0.36 0.24
## JOB -1.37 0.05
## JID -1.05 1.59
Preprocessing
Check Missing and Duplicate Values
## OUTDOOR SOCIAL CONSERVATIVE JOB JID
## 0 0 0 0 0
## [1] 0
Assumption Test
Normality Multivariate Testing
group1 <- subset(quest3, JOB == "1")
group2 <- subset(quest3, JOB == "2")
group3 <- subset(quest3, JOB == "3")
result1 <- mvn(data = group1[, 1:3], mvn_test = "hz")
result2 <- mvn(data = group2[, 1:3], mvn_test = "hz")
result3 <- mvn(data = group3[, 1:3], mvn_test = "hz")
result1## $multivariate_normality
## Test Statistic p.value Method MVN
## 1 Henze-Zirkler 0.69 0.511 asymptotic ✓ Normal
##
## $univariate_normality
## Test Variable Statistic p.value Normality
## 1 Anderson-Darling OUTDOOR 0.430 0.301 ✓ Normal
## 2 Anderson-Darling SOCIAL 0.583 0.125 ✓ Normal
## 3 Anderson-Darling CONSERVATIVE 0.836 0.030 ✗ Not normal
##
## $descriptives
## Variable n Mean Std.Dev Median Min Max 25th 75th Skew Kurtosis
## 1 OUTDOOR 85 12.518 4.649 13 0 22 10 15 -0.252 2.988
## 2 SOCIAL 85 24.224 4.335 25 12 35 22 27 -0.360 3.241
## 3 CONSERVATIVE 85 9.024 3.143 9 2 17 7 12 0.175 2.319
##
## $data
## # A tibble: 85 × 3
## OUTDOOR SOCIAL CONSERVATIVE
## <dbl> <dbl> <dbl>
## 1 10 22 5
## 2 14 17 6
## 3 19 33 7
## 4 14 29 12
## 5 14 25 7
## 6 20 25 12
## 7 6 18 4
## 8 13 27 7
## 9 18 31 9
## 10 16 35 13
## # ℹ 75 more rows
##
## $subset
## NULL
##
## $outlierMethod
## [1] "none"
##
## attr(,"class")
## [1] "mvn"
## $multivariate_normality
## Test Statistic p.value Method MVN
## 1 Henze-Zirkler 0.782 0.296 asymptotic ✓ Normal
##
## $univariate_normality
## Test Variable Statistic p.value Normality
## 1 Anderson-Darling OUTDOOR 0.552 0.151 ✓ Normal
## 2 Anderson-Darling SOCIAL 0.468 0.244 ✓ Normal
## 3 Anderson-Darling CONSERVATIVE 0.659 0.083 ✓ Normal
##
## $descriptives
## Variable n Mean Std.Dev Median Min Max 25th 75th Skew Kurtosis
## 1 OUTDOOR 93 18.538 3.565 18 11 28 16 21 0.123 2.512
## 2 SOCIAL 93 21.140 4.551 21 9 29 18 24 -0.158 2.474
## 3 CONSERVATIVE 93 10.140 3.242 11 0 17 8 12 -0.426 3.344
##
## $data
## # A tibble: 93 × 3
## OUTDOOR SOCIAL CONSERVATIVE
## <dbl> <dbl> <dbl>
## 1 20 27 6
## 2 21 15 10
## 3 15 27 12
## 4 15 29 8
## 5 11 25 11
## 6 24 9 17
## 7 18 21 13
## 8 14 18 4
## 9 13 22 12
## 10 17 21 9
## # ℹ 83 more rows
##
## $subset
## NULL
##
## $outlierMethod
## [1] "none"
##
## attr(,"class")
## [1] "mvn"
## $multivariate_normality
## Test Statistic p.value Method MVN
## 1 Henze-Zirkler 0.712 0.392 asymptotic ✓ Normal
##
## $univariate_normality
## Test Variable Statistic p.value Normality
## 1 Anderson-Darling OUTDOOR 0.672 0.076 ✓ Normal
## 2 Anderson-Darling SOCIAL 0.360 0.438 ✓ Normal
## 3 Anderson-Darling CONSERVATIVE 0.614 0.106 ✓ Normal
##
## $descriptives
## Variable n Mean Std.Dev Median Min Max 25th 75th Skew Kurtosis
## 1 OUTDOOR 66 15.576 4.110 16.0 4 25 13.25 18.75 -0.546 3.631
## 2 SOCIAL 66 15.455 3.767 15.5 7 26 13.00 18.00 0.255 2.884
## 3 CONSERVATIVE 66 13.242 3.692 13.0 4 20 11.25 16.00 -0.390 2.611
##
## $data
## # A tibble: 66 × 3
## OUTDOOR SOCIAL CONSERVATIVE
## <dbl> <dbl> <dbl>
## 1 19 19 16
## 2 17 17 12
## 3 8 17 14
## 4 13 20 16
## 5 14 18 4
## 6 17 12 13
## 7 17 12 17
## 8 14 21 16
## 9 19 18 12
## 10 18 16 15
## # ℹ 56 more rows
##
## $subset
## NULL
##
## $outlierMethod
## [1] "none"
##
## attr(,"class")
## [1] "mvn"
Box’s M Test
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X
## Chi-Sq (approx.) = 25.642, df = 12, p-value = 0.01206
Equality of Mean Vectors Testing
mod.fit <- manova(cbind(OUTDOOR, SOCIAL, CONSERVATIVE)~JOB,data = quest3)
summary(mod.fit,test="Wilks")## Df Wilks approx F num Df den Df Pr(>F)
## JOB 2 0.36399 52.382 6 478 < 2.2e-16 ***
## Residuals 241
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Quadratic Discriminant Analysis (QDA)
Summarize Class Code
Resubstitution (prior = sample)
DA1 <- qda(formula = JOB ~ OUTDOOR + SOCIAL + CONSERVATIVE, data = train, CV = FALSE)
pred_DA1 <- predict(object = DA1, newdata = test)
head(pred_DA1$posterior)## 1 2 3
## 1 0.6360093 3.639634e-01 2.736341e-05
## 2 0.6166691 3.831007e-01 2.301680e-04
## 3 0.9467934 5.234756e-02 8.590894e-04
## 4 0.4835552 5.117689e-01 4.675837e-03
## 5 0.7657953 9.156177e-02 1.426429e-01
## 6 0.9997881 8.808656e-05 1.237689e-04
## [1] 1 1 1 2 1 1
## Levels: 1 2 3
## $class.table
## classify
## original 1 2 3
## 1 15 1 0
## 2 2 17 0
## 3 2 1 10
##
## $prop
## classify
## original 1 2 3
## 1 0.9375 0.0625 0.0000
## 2 0.1053 0.8947 0.0000
## 3 0.1538 0.0769 0.7692
##
## $overall.correct
## [1] 0.875
Cross-Validation (prior = sample)
DA2 <- qda(formula = JOB ~ OUTDOOR + SOCIAL + CONSERVATIVE, data = quest3, CV = TRUE)
head(DA2$posterior)## 1 2 3
## 1 0.8729977 0.1130229 1.397933e-02
## 2 0.3368544 0.4413754 2.217703e-01
## 3 0.6780235 0.3219278 4.865724e-05
## 4 0.7743745 0.2238340 1.791512e-03
## 5 0.6906375 0.3035745 5.788004e-03
## 6 0.1186082 0.8549055 2.648623e-02
## [1] 1 2 1 1 1 2
## Levels: 1 2 3
## $class.table
## classify
## original 1 2 3
## 1 66 15 4
## 2 17 66 10
## 3 3 14 49
##
## $prop
## classify
## original 1 2 3
## 1 0.7765 0.1765 0.0471
## 2 0.1828 0.7097 0.1075
## 3 0.0455 0.2121 0.7424
##
## $overall.correct
## [1] 0.7418