Discriminant Analysis

QUESTION 1

Library

library(biotools)
## Loading required package: MASS
## ---
## biotools version 4.3
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
library(MASS)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Load Dataset

quest1 <- read.csv("C:/Users/user/Downloads/student-mat.csv", sep=";")
str(quest1)
## 'data.frame':    395 obs. of  33 variables:
##  $ school    : chr  "GP" "GP" "GP" "GP" ...
##  $ sex       : chr  "F" "F" "F" "F" ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : chr  "U" "U" "U" "U" ...
##  $ famsize   : chr  "GT3" "GT3" "LE3" "GT3" ...
##  $ Pstatus   : chr  "A" "T" "T" "T" ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : chr  "at_home" "at_home" "at_home" "health" ...
##  $ Fjob      : chr  "teacher" "other" "other" "services" ...
##  $ reason    : chr  "course" "course" "other" "home" ...
##  $ guardian  : chr  "mother" "father" "mother" "mother" ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : chr  "yes" "no" "yes" "no" ...
##  $ famsup    : chr  "no" "yes" "no" "yes" ...
##  $ paid      : chr  "no" "no" "yes" "yes" ...
##  $ activities: chr  "no" "no" "no" "yes" ...
##  $ nursery   : chr  "yes" "no" "yes" "yes" ...
##  $ higher    : chr  "yes" "yes" "yes" "yes" ...
##  $ internet  : chr  "no" "yes" "yes" "yes" ...
##  $ romantic  : chr  "no" "no" "no" "yes" ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ G1        : int  5 5 7 15 6 15 12 6 16 14 ...
##  $ G2        : int  6 5 8 14 10 15 12 5 18 15 ...
##  $ G3        : int  6 6 10 15 10 15 11 6 19 15 ...

Summary Statistics

summary(quest1)
##     school              sex                 age         address         
##  Length:395         Length:395         Min.   :15.0   Length:395        
##  Class :character   Class :character   1st Qu.:16.0   Class :character  
##  Mode  :character   Mode  :character   Median :17.0   Mode  :character  
##                                        Mean   :16.7                     
##                                        3rd Qu.:18.0                     
##                                        Max.   :22.0                     
##    famsize            Pstatus               Medu            Fedu      
##  Length:395         Length:395         Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:2.000   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Median :3.000   Median :2.000  
##                                        Mean   :2.749   Mean   :2.522  
##                                        3rd Qu.:4.000   3rd Qu.:3.000  
##                                        Max.   :4.000   Max.   :4.000  
##      Mjob               Fjob              reason            guardian        
##  Length:395         Length:395         Length:395         Length:395        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    traveltime      studytime        failures       schoolsup        
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000   Length:395        
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   Class :character  
##  Median :1.000   Median :2.000   Median :0.0000   Mode  :character  
##  Mean   :1.448   Mean   :2.035   Mean   :0.3342                     
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000                     
##  Max.   :4.000   Max.   :4.000   Max.   :3.0000                     
##     famsup              paid            activities          nursery         
##  Length:395         Length:395         Length:395         Length:395        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     higher            internet           romantic             famrel     
##  Length:395         Length:395         Length:395         Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:4.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.944  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :5.000  
##     freetime         goout            Dalc            Walc      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :3.000   Median :3.000   Median :1.000   Median :2.000  
##  Mean   :3.235   Mean   :3.109   Mean   :1.481   Mean   :2.291  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##      health         absences            G1              G2       
##  Min.   :1.000   Min.   : 0.000   Min.   : 3.00   Min.   : 0.00  
##  1st Qu.:3.000   1st Qu.: 0.000   1st Qu.: 8.00   1st Qu.: 9.00  
##  Median :4.000   Median : 4.000   Median :11.00   Median :11.00  
##  Mean   :3.554   Mean   : 5.709   Mean   :10.91   Mean   :10.71  
##  3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:13.00   3rd Qu.:13.00  
##  Max.   :5.000   Max.   :75.000   Max.   :19.00   Max.   :19.00  
##        G3       
##  Min.   : 0.00  
##  1st Qu.: 8.00  
##  Median :11.00  
##  Mean   :10.42  
##  3rd Qu.:14.00  
##  Max.   :20.00
describe(quest1)
##             vars   n  mean   sd median trimmed  mad min max range  skew
## school*        1 395  1.12 0.32      1    1.02 0.00   1   2     1  2.38
## sex*           2 395  1.47 0.50      1    1.47 0.00   1   2     1  0.11
## age            3 395 16.70 1.28     17   16.63 1.48  15  22     7  0.46
## address*       4 395  1.78 0.42      2    1.85 0.00   1   2     1 -1.33
## famsize*       5 395  1.29 0.45      1    1.24 0.00   1   2     1  0.93
## Pstatus*       6 395  1.90 0.31      2    1.99 0.00   1   2     1 -2.59
## Medu           7 395  2.75 1.09      3    2.82 1.48   0   4     4 -0.32
## Fedu           8 395  2.52 1.09      2    2.53 1.48   0   4     4 -0.03
## Mjob*          9 395  3.17 1.23      3    3.21 1.48   1   5     4 -0.33
## Fjob*         10 395  3.28 0.86      3    3.32 0.00   1   5     4 -0.36
## reason*       11 395  2.26 1.21      2    2.20 1.48   1   4     3  0.41
## guardian*     12 395  1.85 0.54      2    1.84 0.00   1   3     2 -0.11
## traveltime    13 395  1.45 0.70      1    1.31 0.00   1   4     3  1.59
## studytime     14 395  2.04 0.84      2    1.96 0.00   1   4     3  0.63
## failures      15 395  0.33 0.74      0    0.14 0.00   0   3     3  2.37
## schoolsup*    16 395  1.13 0.34      1    1.04 0.00   1   2     1  2.20
## famsup*       17 395  1.61 0.49      2    1.64 0.00   1   2     1 -0.46
## paid*         18 395  1.46 0.50      1    1.45 0.00   1   2     1  0.17
## activities*   19 395  1.51 0.50      2    1.51 0.00   1   2     1 -0.04
## nursery*      20 395  1.79 0.40      2    1.87 0.00   1   2     1 -1.46
## higher*       21 395  1.95 0.22      2    2.00 0.00   1   2     1 -4.08
## internet*     22 395  1.83 0.37      2    1.91 0.00   1   2     1 -1.78
## romantic*     23 395  1.33 0.47      1    1.29 0.00   1   2     1  0.70
## famrel        24 395  3.94 0.90      4    4.04 1.48   1   5     4 -0.94
## freetime      25 395  3.24 1.00      3    3.23 1.48   1   5     4 -0.16
## goout         26 395  3.11 1.11      3    3.09 1.48   1   5     4  0.12
## Dalc          27 395  1.48 0.89      1    1.27 0.00   1   5     4  2.17
## Walc          28 395  2.29 1.29      2    2.15 1.48   1   5     4  0.61
## health        29 395  3.55 1.39      4    3.69 1.48   1   5     4 -0.49
## absences      30 395  5.71 8.00      4    4.24 5.93   0  75    75  3.64
## G1            31 395 10.91 3.32     11   10.80 4.45   3  19    16  0.24
## G2            32 395 10.71 3.76     11   10.84 2.97   0  19    19 -0.43
## G3            33 395 10.42 4.58     11   10.84 4.45   0  20    20 -0.73
##             kurtosis   se
## school*         3.68 0.02
## sex*           -1.99 0.03
## age            -0.03 0.06
## address*       -0.24 0.02
## famsize*       -1.14 0.02
## Pstatus*        4.71 0.02
## Medu           -1.10 0.06
## Fedu           -1.21 0.05
## Mjob*          -0.69 0.06
## Fjob*           0.98 0.04
## reason*        -1.40 0.06
## guardian*       0.15 0.03
## traveltime      2.27 0.04
## studytime      -0.04 0.04
## failures        4.89 0.04
## schoolsup*      2.86 0.02
## famsup*        -1.79 0.02
## paid*          -1.98 0.03
## activities*    -2.00 0.03
## nursery*        0.12 0.02
## higher*        14.71 0.01
## internet*       1.16 0.02
## romantic*      -1.51 0.02
## famrel          1.09 0.05
## freetime       -0.33 0.05
## goout          -0.79 0.06
## Dalc            4.65 0.04
## Walc           -0.81 0.06
## health         -1.03 0.07
## absences       21.31 0.40
## G1             -0.71 0.17
## G2              0.59 0.19
## G3              0.37 0.23

Preprocessing

Check Missing and Duplicate Values

colSums(is.na(quest1))
##     school        sex        age    address    famsize    Pstatus       Medu 
##          0          0          0          0          0          0          0 
##       Fedu       Mjob       Fjob     reason   guardian traveltime  studytime 
##          0          0          0          0          0          0          0 
##   failures  schoolsup     famsup       paid activities    nursery     higher 
##          0          0          0          0          0          0          0 
##   internet   romantic     famrel   freetime      goout       Dalc       Walc 
##          0          0          0          0          0          0          0 
##     health   absences         G1         G2         G3 
##          0          0          0          0          0
sum(duplicated(quest1))
## [1] 0

Make Binary Target

quest1$G3_pass <- ifelse(quest1$G3 >= 10, "pass", "fail")
quest1$G3_pass <- factor(quest1$G3_pass, levels = c("fail", "pass"))

Drop Variabel G3

quest1 <- quest1 %>% select(-G3)

Check Distribution

table(quest1$G3_pass)
## 
## fail pass 
##  130  265

Split Train Test

set.seed(2000)
index <- createDataPartition(quest1$G3_pass, p = 0.8, list = FALSE)
train <- quest1[index, ]
test  <- quest1[-index, ]

Scaling Numeric

numeric_vars <- c("age","absences","G1","G2")
train[, numeric_vars] <- scale(train[, numeric_vars])
test[, numeric_vars]  <- scale(test[, numeric_vars])

Visualization

ggpairs(train, columns = numeric_vars, aes(color=G3_pass))

Assumption Test

Box’s M Test

X <- as.matrix(quest1[, numeric_vars])
boxM(X, quest1$G3_pass)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X
## Chi-Sq (approx.) = 226, df = 10, p-value < 2.2e-16

Quadratic Discriminant Analysis (QDA)

Summarize Class Code

summarize.class<-function(original, classify) {
 class.table<-table(original, classify)
 numb<-rowSums(class.table)
 prop<-round(class.table/numb,4)
 overall<-round(sum(diag(class.table))/sum(class.table), 4)
 list(class.table = class.table, prop = prop, 
overall.correct = overall)  
}

Resubstitution (prior = sample)

DA1 <- qda(formula = G3_pass ~ ., data = train, CV = FALSE)
pred_DA1 <- predict(object = DA1, newdata = test)
head(pred_DA1$posterior)
##            fail      pass
## 3  4.119394e-09 1.0000000
## 4  4.681687e-08 1.0000000
## 7  7.219246e-08 0.9999999
## 11 2.677493e-01 0.7322507
## 12 3.628454e-06 0.9999964
## 20 1.564386e-08 1.0000000
head(pred_DA1$class)
## [1] pass pass pass pass pass pass
## Levels: fail pass
summarize.class(original = test$G3_pass, classify = pred_DA1$class)
## $class.table
##         classify
## original fail pass
##     fail   13   13
##     pass    2   51
## 
## $prop
##         classify
## original   fail   pass
##     fail 0.5000 0.5000
##     pass 0.0377 0.9623
## 
## $overall.correct
## [1] 0.8101

Cross-Validation (prior = sample)

DA2 <- qda(formula = G3_pass ~ ., data = quest1, CV = TRUE)
head(DA2$posterior)
##           fail      pass
## 1 2.130383e-01 0.7869617
## 2 5.051233e-01 0.4948767
## 3 1.135985e-02 0.9886402
## 4 3.165301e-09 1.0000000
## 5 8.638383e-01 0.1361617
## 6 4.298512e-16 1.0000000
head(DA2$class)
## [1] pass fail pass pass fail pass
## Levels: fail pass
summarize.class(original = quest1$G3_pass, classify = DA2$class)
## $class.table
##         classify
## original fail pass
##     fail   66   64
##     pass   21  244
## 
## $prop
##         classify
## original   fail   pass
##     fail 0.5077 0.4923
##     pass 0.0792 0.9208
## 
## $overall.correct
## [1] 0.7848

QUESTION 3

Library

library(biotools)
library(caret)
library(dplyr)
library(ggplot2)
library(GGally)
library(haven)
library(MASS)
library(MVN)
## 
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
## 
##     mardia
library(psych)

Load Dataset

quest3 <- read_sav("C:/Users/user/Downloads/discrim.sav")
str(quest3)
## tibble [244 × 5] (S3: tbl_df/tbl/data.frame)
##  $ OUTDOOR     : num [1:244] 10 14 19 14 14 20 6 13 18 16 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 9
##  $ SOCIAL      : num [1:244] 22 17 33 29 25 25 18 27 31 35 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##  $ CONSERVATIVE: num [1:244] 5 6 7 12 7 12 4 7 9 13 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 14
##  $ JOB         : num [1:244] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 5
##  $ JID         : num [1:244] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 5

Summary Statistics

summary(quest3)
##     OUTDOOR          SOCIAL       CONSERVATIVE        JOB       
##  Min.   : 0.00   Min.   : 7.00   Min.   : 0.00   Min.   :1.000  
##  1st Qu.:13.00   1st Qu.:17.00   1st Qu.: 8.00   1st Qu.:1.000  
##  Median :16.00   Median :21.00   Median :11.00   Median :2.000  
##  Mean   :15.64   Mean   :20.68   Mean   :10.59   Mean   :1.922  
##  3rd Qu.:19.00   3rd Qu.:25.00   3rd Qu.:13.00   3rd Qu.:3.000  
##  Max.   :28.00   Max.   :35.00   Max.   :20.00   Max.   :3.000  
##       JID       
##  Min.   : 1.00  
##  1st Qu.:21.00  
##  Median :41.00  
##  Mean   :41.95  
##  3rd Qu.:61.25  
##  Max.   :93.00
describe(quest3)
##              vars   n  mean    sd median trimmed   mad min max range  skew
## OUTDOOR         1 244 15.64  4.84     16   15.84  4.45   0  28    28 -0.41
## SOCIAL          2 244 20.68  5.48     21   20.76  5.93   7  35    28 -0.09
## CONSERVATIVE    3 244 10.59  3.73     11   10.53  4.45   0  20    20  0.05
## JOB             4 244  1.92  0.78      2    1.90  1.48   1   3     2  0.14
## JID             5 244 41.95 24.79     41   41.37 29.65   1  93    92  0.15
##              kurtosis   se
## OUTDOOR          0.22 0.31
## SOCIAL          -0.69 0.35
## CONSERVATIVE    -0.36 0.24
## JOB             -1.37 0.05
## JID             -1.05 1.59

Preprocessing

Check Missing and Duplicate Values

colSums(is.na(quest3))
##      OUTDOOR       SOCIAL CONSERVATIVE          JOB          JID 
##            0            0            0            0            0
sum(duplicated(quest3))
## [1] 0

Split Train Test

index <- createDataPartition(quest3$JOB, p = 0.8, list = FALSE)
train <- quest3[index, ]
test  <- quest3[-index, ]

Visualization

quest3$JOB <- as.factor(quest3$JOB) 
ggpairs(quest3, columns = 1:3, aes(color = JOB)) 

Assumption Test

Normality Multivariate Testing

group1 <- subset(quest3, JOB == "1") 
group2 <- subset(quest3, JOB == "2") 
group3 <- subset(quest3, JOB == "3") 

result1 <- mvn(data = group1[, 1:3], mvn_test = "hz") 
result2 <- mvn(data = group2[, 1:3], mvn_test = "hz") 
result3 <- mvn(data = group3[, 1:3], mvn_test = "hz") 

result1
## $multivariate_normality
##            Test Statistic p.value     Method      MVN
## 1 Henze-Zirkler      0.69   0.511 asymptotic ✓ Normal
## 
## $univariate_normality
##               Test     Variable Statistic p.value    Normality
## 1 Anderson-Darling      OUTDOOR     0.430   0.301     ✓ Normal
## 2 Anderson-Darling       SOCIAL     0.583   0.125     ✓ Normal
## 3 Anderson-Darling CONSERVATIVE     0.836   0.030 ✗ Not normal
## 
## $descriptives
##       Variable  n   Mean Std.Dev Median Min Max 25th 75th   Skew Kurtosis
## 1      OUTDOOR 85 12.518   4.649     13   0  22   10   15 -0.252    2.988
## 2       SOCIAL 85 24.224   4.335     25  12  35   22   27 -0.360    3.241
## 3 CONSERVATIVE 85  9.024   3.143      9   2  17    7   12  0.175    2.319
## 
## $data
## # A tibble: 85 × 3
##    OUTDOOR SOCIAL CONSERVATIVE
##      <dbl>  <dbl>        <dbl>
##  1      10     22            5
##  2      14     17            6
##  3      19     33            7
##  4      14     29           12
##  5      14     25            7
##  6      20     25           12
##  7       6     18            4
##  8      13     27            7
##  9      18     31            9
## 10      16     35           13
## # ℹ 75 more rows
## 
## $subset
## NULL
## 
## $outlierMethod
## [1] "none"
## 
## attr(,"class")
## [1] "mvn"
result2
## $multivariate_normality
##            Test Statistic p.value     Method      MVN
## 1 Henze-Zirkler     0.782   0.296 asymptotic ✓ Normal
## 
## $univariate_normality
##               Test     Variable Statistic p.value Normality
## 1 Anderson-Darling      OUTDOOR     0.552   0.151  ✓ Normal
## 2 Anderson-Darling       SOCIAL     0.468   0.244  ✓ Normal
## 3 Anderson-Darling CONSERVATIVE     0.659   0.083  ✓ Normal
## 
## $descriptives
##       Variable  n   Mean Std.Dev Median Min Max 25th 75th   Skew Kurtosis
## 1      OUTDOOR 93 18.538   3.565     18  11  28   16   21  0.123    2.512
## 2       SOCIAL 93 21.140   4.551     21   9  29   18   24 -0.158    2.474
## 3 CONSERVATIVE 93 10.140   3.242     11   0  17    8   12 -0.426    3.344
## 
## $data
## # A tibble: 93 × 3
##    OUTDOOR SOCIAL CONSERVATIVE
##      <dbl>  <dbl>        <dbl>
##  1      20     27            6
##  2      21     15           10
##  3      15     27           12
##  4      15     29            8
##  5      11     25           11
##  6      24      9           17
##  7      18     21           13
##  8      14     18            4
##  9      13     22           12
## 10      17     21            9
## # ℹ 83 more rows
## 
## $subset
## NULL
## 
## $outlierMethod
## [1] "none"
## 
## attr(,"class")
## [1] "mvn"
result3
## $multivariate_normality
##            Test Statistic p.value     Method      MVN
## 1 Henze-Zirkler     0.712   0.392 asymptotic ✓ Normal
## 
## $univariate_normality
##               Test     Variable Statistic p.value Normality
## 1 Anderson-Darling      OUTDOOR     0.672   0.076  ✓ Normal
## 2 Anderson-Darling       SOCIAL     0.360   0.438  ✓ Normal
## 3 Anderson-Darling CONSERVATIVE     0.614   0.106  ✓ Normal
## 
## $descriptives
##       Variable  n   Mean Std.Dev Median Min Max  25th  75th   Skew Kurtosis
## 1      OUTDOOR 66 15.576   4.110   16.0   4  25 13.25 18.75 -0.546    3.631
## 2       SOCIAL 66 15.455   3.767   15.5   7  26 13.00 18.00  0.255    2.884
## 3 CONSERVATIVE 66 13.242   3.692   13.0   4  20 11.25 16.00 -0.390    2.611
## 
## $data
## # A tibble: 66 × 3
##    OUTDOOR SOCIAL CONSERVATIVE
##      <dbl>  <dbl>        <dbl>
##  1      19     19           16
##  2      17     17           12
##  3       8     17           14
##  4      13     20           16
##  5      14     18            4
##  6      17     12           13
##  7      17     12           17
##  8      14     21           16
##  9      19     18           12
## 10      18     16           15
## # ℹ 56 more rows
## 
## $subset
## NULL
## 
## $outlierMethod
## [1] "none"
## 
## attr(,"class")
## [1] "mvn"

Box’s M Test

X <- as.matrix(quest3[, 1:3])
boxM(X, quest3$JOB)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  X
## Chi-Sq (approx.) = 25.642, df = 12, p-value = 0.01206

Equality of Mean Vectors Testing

mod.fit <- manova(cbind(OUTDOOR, SOCIAL, CONSERVATIVE)~JOB,data = quest3)   
summary(mod.fit,test="Wilks")
##            Df   Wilks approx F num Df den Df    Pr(>F)    
## JOB         2 0.36399   52.382      6    478 < 2.2e-16 ***
## Residuals 241                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Quadratic Discriminant Analysis (QDA)

Summarize Class Code

summarize.class<-function(original, classify) {
 class.table<-table(original, classify)
 numb<-rowSums(class.table)
 prop<-round(class.table/numb,4)
 overall<-round(sum(diag(class.table))/sum(class.table), 4)
 list(class.table = class.table, prop = prop, 
overall.correct = overall)  
}

Resubstitution (prior = sample)

DA1 <- qda(formula = JOB ~ OUTDOOR + SOCIAL + CONSERVATIVE, data = train, CV = FALSE)
pred_DA1 <- predict(object = DA1, newdata = test)
head(pred_DA1$posterior)
##           1            2            3
## 1 0.6360093 3.639634e-01 2.736341e-05
## 2 0.6166691 3.831007e-01 2.301680e-04
## 3 0.9467934 5.234756e-02 8.590894e-04
## 4 0.4835552 5.117689e-01 4.675837e-03
## 5 0.7657953 9.156177e-02 1.426429e-01
## 6 0.9997881 8.808656e-05 1.237689e-04
head(pred_DA1$class)
## [1] 1 1 1 2 1 1
## Levels: 1 2 3
summarize.class(original = test$JOB, classify = pred_DA1$class)
## $class.table
##         classify
## original  1  2  3
##        1 15  1  0
##        2  2 17  0
##        3  2  1 10
## 
## $prop
##         classify
## original      1      2      3
##        1 0.9375 0.0625 0.0000
##        2 0.1053 0.8947 0.0000
##        3 0.1538 0.0769 0.7692
## 
## $overall.correct
## [1] 0.875

Cross-Validation (prior = sample)

DA2 <- qda(formula = JOB ~ OUTDOOR + SOCIAL + CONSERVATIVE, data = quest3, CV = TRUE)
head(DA2$posterior)
##           1         2            3
## 1 0.8729977 0.1130229 1.397933e-02
## 2 0.3368544 0.4413754 2.217703e-01
## 3 0.6780235 0.3219278 4.865724e-05
## 4 0.7743745 0.2238340 1.791512e-03
## 5 0.6906375 0.3035745 5.788004e-03
## 6 0.1186082 0.8549055 2.648623e-02
head(DA2$class)
## [1] 1 2 1 1 1 2
## Levels: 1 2 3
summarize.class(original = quest3$JOB, classify = DA2$class)
## $class.table
##         classify
## original  1  2  3
##        1 66 15  4
##        2 17 66 10
##        3  3 14 49
## 
## $prop
##         classify
## original      1      2      3
##        1 0.7765 0.1765 0.0471
##        2 0.1828 0.7097 0.1075
##        3 0.0455 0.2121 0.7424
## 
## $overall.correct
## [1] 0.7418