submitted by: Navona
due date: 2019-11-12
last ran: 2019-11-12
website: http://rpubs.com/navona/PSY2002_assignment03


#load libraries
libraries <- c('klaR','MASS', 'knitr', 'kableExtra', 'GGally', 'mvnormtest', 'car', 'ggrepel')
lapply(libraries, require, character.only = T)

#set seed for reproducable randomeness
set.seed(100) 

#specify size of variables
k=3 #3 groups
n=3 #3 observations/participants
p=3 #3 features/variables

#create our dataset
cov_mat <- cov(matrix(rnorm(p*n), ncol=p, nrow=n)) #first, generate a random covariance matrix

#write a function to create group matrices (identical covariance but different means)
groupMat_fn <- function(mu1, mu2, mu3){
  groupMat <- MASS::mvrnorm(n=n, mu=c(mu1, mu2, mu3), Sigma=cov_mat)
  return(groupMat)
}

#apply the function for each of 3 groups (specifying independent means for each variable)
matrix_1 <- groupMat_fn(4, 5, 6); matrix_2 <- groupMat_fn(3, 5, 5); matrix_3 <- groupMat_fn(2, 4, 1) 

#bind together into a single matrix - our dataset
matrix <- rbind(matrix_1, matrix_2, matrix_3)

#add create row names
row.names(matrix) <- paste('par_', seq(1, 9), sep='')
#create formatted table
kable(matrix, 
      col.names=c('V1', 'V2', 'V3'),
      digits=3) %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F,position='float_left') %>%
  pack_rows('group 1', 1, 3, label_row_css = "background-color: #F8766D;") %>%
  pack_rows('group 2', 4, 6, label_row_css = "background-color: #7CAE00;") %>%
  pack_rows('group 3', 7, 9, label_row_css = "background-color: #00BFC4;")
V1 V2 V3
group 1
par_1 3.869 5.160 5.740
par_2 4.171 4.773 5.954
par_3 4.049 4.938 6.058
group 2
par_4 2.921 5.061 4.136
par_5 3.638 4.251 6.834
par_6 3.044 4.919 4.520
group 3
par_7 2.040 3.957 1.202
par_8 2.058 3.864 -0.222
par_9 2.035 3.965 1.223
#add group to our matrix
group <- paste0(c(rep('group 1', 3), rep('group 2', 3), rep('group 3', 3)))

#add group names to matrix
matrix_vis <- as.data.frame(cbind(matrix, group))
                            
#make plot (uses ggalley)
ggpairs(as.data.frame(matrix_vis[1:3]), 
        aes(colour = group)) + 
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())




Problem 1: Compute the within-group scatter matrix.

\(S_{within} = \sum_{i=1}^k \sum_{j=1}^{N_i} (x_{i,j} - \bar{x_i})(x_{i,j} - \bar{x_i})^T\)

#calculate the column means of each groups' matrix
colMeans_1 <- colMeans(matrix_1); colMeans_2 <- colMeans(matrix_2); colMeans_3 <- colMeans(matrix_3)

#initialize scatter within 0 matrix for each group
sw_fn <- function(p=3, k=3){matrix(0, p, k)}
sw_1 <- sw_fn(); sw_2 <- sw_fn(); sw_3 <- sw_fn()

#for loop to create matrices
for (i in 1:n){
  sw_1 <- sw_1 + (matrix_1[i,] - colMeans_1) %*% t(matrix_1[i,] - colMeans_1)
  sw_2 <- sw_2 + (matrix_2[i,] - colMeans_2) %*% t(matrix_2[i,] - colMeans_2)
  sw_3 <- sw_3 + (matrix_3[i,] - colMeans_3) %*% t(matrix_3[i,] - colMeans_3)
}

#pool the matrices
scatterWithin <- sw_1 + sw_2 + sw_3

#create row names
rownames(scatterWithin) <- groups

#print table
kableTable_fn(scatterWithin)
group 1 group 2 group 3
group 1 0.340 -0.392 1.136
group 2 -0.392 0.456 -1.216
group 3 1.136 -1.216 5.685



Problem 2: Compute the between-group scatter matrix.

\(S_{between} = \sum_{i=1}^k N_i (\bar{x_i} - \bar{x})(\bar{x_i} - \bar{x})^T\)

#calculate grand mean (across all groups)
grandMean <- colMeans(matrix)

#for loop to create matrices
scatterBetween <- 
  nrow(matrix_1) * (colMeans_1 - grandMean) %*% t(colMeans_1 - grandMean) +
  nrow(matrix_2) * (colMeans_2 - grandMean) %*% t(colMeans_2 - grandMean) +
  nrow(matrix_3) * (colMeans_3 - grandMean) %*% t(colMeans_3 - grandMean)

#'pool' the matrices
#scatterBetween <- sb_1 + sb_2 + sb_3

#create row names
rownames(scatterBetween) <- groups

#print table
kableTable_fn(scatterBetween)
group 1 group 2 group 3
group 1 5.964 3.161 16.035
group 2 3.161 1.768 9.104
group 3 16.035 9.104 47.051



Problem 3: Compute the total scatter matrix.

\(S_{total} = \sum_{i=1}^n (\bar{x_i} - \bar{x})(\bar{x_i} - \bar{x})^T\)

#make scatterTotal matrix
scatterTotal <- scatterBetween + scatterWithin

#print table
kableTable_fn(scatterTotal)
group 1 group 2 group 3
group 1 6.304 2.769 17.171
group 2 2.769 2.224 7.888
group 3 17.171 7.888 52.736

Problem 4: Show that \(S_{total}\) = \(S_{between}\) + \(S_{within}\).

all.equal(scatterTotal, scatterBetween + scatterWithin)

## [1] TRUE

Problem 5: Compute the inverse of \(S_{within}\), i.e., \(S_{within}^{-1}\).

#compute inverse- need special package as not full rank
scatterWithinInverse <- MASS::ginv(scatterWithin) 

#create row names
rownames(scatterWithinInverse) <- groups

#print table
kableTable_fn(scatterWithinInverse)
group 1 group 2 group 3
group 1 1.158 -1.549 -0.558
group 2 -1.549 2.072 0.756
group 3 -0.558 0.756 0.449

Problem 6: Compute the eigendecomposition of \(S_{within}^{-1}S_{between}\).

lda_result <- eigen(scatterWithinInverse %*% scatterBetween) 

#separate eigenvalues and eivengectors
lda_eigenvalues  <- as.data.frame(lda_result[[1]])
lda_eigenvectors <- as.data.frame(lda_result[[2]])

#print table of eigenvectors
rownames(lda_eigenvectors) <- groups
kableTable_fn(lda_eigenvectors, caption='eigenvectors') 
eigenvectors
group 1 group 2 group 3
group 1 0.528 -0.604 0.116
group 2 -0.716 0.796 -0.982
group 3 -0.456 0.046 0.150
#create table of eigenvalues
lda_eigenvalues <- t(lda_eigenvalues)

#remove rownames
rownames(lda_eigenvalues) <- c()

#make the table pretty
kable(lda_eigenvalues, 
      digits=3,
      align = 'c',
      caption='eigenvalues') %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left') 
eigenvalues
17.546 0.202 0

Problem 7: Project X onto the first discriminant function.

#multiply points by weights
lda_df <- as.data.frame(matrix %*% as.matrix(lda_eigenvectors[,1])) 

#change variable names
names(lda_df) <- 'LDA_1'

#add group information to df
lda_df$group <- as.factor(group)

#make table
kable(lda_df[1],
      digits=3,
      col.names='LDA 1') %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left') %>%
  pack_rows('group 1', 1, 3, label_row_css = "background-color: #F8766D;") %>%
  pack_rows('group 2', 4, 6, label_row_css = "background-color: #7CAE00;") %>%
  pack_rows('group 3', 7, 9, label_row_css = "background-color: #00BFC4;")
LDA 1
group 1
par_1 -4.273
par_2 -3.934
par_3 -4.164
group 2
par_4 -3.970
par_5 -4.243
par_6 -3.979
group 3
par_7 -2.305
par_8 -1.580
par_9 -2.323

Problem 8: Plot discriminant scores derived above.

#plot
ggplot(lda_df, aes(x=row.names(lda_df), y=LDA_1, color=group, l)) +
  geom_point(size=5) +
  labs(x='participant',
       y='first discriminant score (LDA 1)') +
  theme_bw() +
  theme(legend.position = 'top', legend.title=element_blank(),
    panel.grid.major = element_blank(), panel.grid.minor = element_blank())



Problem 9: Fit a MANOVA to iris data with species as independent variable using the lm function.

#fit model with lm
manovaModel <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data=iris)

#summarize with manova
(manovaFit <- summary(manova(manovaModel)))
##            Df Pillai approx F num Df den Df                Pr(>F)    
## Species     2 1.1919   53.466      8    290 < 0.00000000000000022 ***
## Residuals 147                                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 10: Test for significance of Species factor using car::Anova.

#test for significance
(manovaSig <- Anova(manovaModel))
## 
## Type II MANOVA Tests: Pillai test statistic
##         Df test stat approx F num Df den Df                Pr(>F)    
## Species  2    1.1919   53.466      8    290 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 11: Report all test statistics using car::summary.

#report all test statistics
summary(manovaSig)
## 
## Type II MANOVA Tests:
## 
## Sum of squares and products for error:
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      38.9562     13.6300      24.6246      5.6450
## Sepal.Width       13.6300     16.9620       8.1208      4.8084
## Petal.Length      24.6246      8.1208      27.2226      6.2718
## Petal.Width        5.6450      4.8084       6.2718      6.1566
## 
## ------------------------------------------
##  
## Term: Species 
## 
## Sum of squares and products for the hypothesis:
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length     63.21213   -19.95267     165.2484    71.27933
## Sepal.Width     -19.95267    11.34493     -57.2396   -22.93267
## Petal.Length    165.24840   -57.23960     437.1028   186.77400
## Petal.Width      71.27933   -22.93267     186.7740    80.41333
## 
## Multivariate Tests: Species
##                  Df test stat  approx F num Df den Df
## Pillai            2   1.19190   53.4665      8    290
## Wilks             2   0.02344  199.1453      8    288
## Hotelling-Lawley  2  32.47732  580.5321      8    286
## Roy               2  32.19193 1166.9574      4    145
##                                  Pr(>F)    
## Pillai           < 0.000000000000000222 ***
## Wilks            < 0.000000000000000222 ***
## Hotelling-Lawley < 0.000000000000000222 ***
## Roy              < 0.000000000000000222 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 12: extract the SSPE and SSPH from car::summary.

#SSPE
SSPE <- as.data.frame(manovaSig[2])

#SSPH
SSPH <- as.data.frame(manovaSig[1])

#create row names
rownames(SSPE) <- names(iris[1:4]); rownames(SSPH) <- names(iris[1:4])

#fn for nice tables
irisTables_fn <- function(df, caption){
  kable(df, digits=3, caption=caption, col.names = colnames(iris[1:4])) %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left') %>%
  column_spec(1, bold = T)
}

#make nice tables
irisTables_fn(SSPE, 'SSPE')
SSPE
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 38.956 13.630 24.625 5.645
Sepal.Width 13.630 16.962 8.121 4.808
Petal.Length 24.625 8.121 27.223 6.272
Petal.Width 5.645 4.808 6.272 6.157
irisTables_fn(SSPH, 'SSPH')
SSPH
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 63.212 -19.953 165.248 71.279
Sepal.Width -19.953 11.345 -57.240 -22.933
Petal.Length 165.248 -57.240 437.103 186.774
Petal.Width 71.279 -22.933 186.774 80.413

Problem 13: Compute discriminant functions from SSPH and SSPE.

#take the inverse of both matrices
matrixInv <- solve(as.matrix(SSPE), as.matrix(SSPH))

#then, take the eigenvectors -- these are our discriminant functions
discriminant <- as.data.frame(eigen(matrixInv)[2])

#create row names
rownames(discriminant) <- names(iris[1:4])

#make nice tables
irisTables_fn(discriminant, 'Discriminant functions')
Discriminant functions
Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length 0.209 -0.007 0.826 -0.803
Sepal.Width 0.386 -0.587 -0.151 0.406
Petal.Length -0.554 0.253 -0.105 0.415
Petal.Width -0.707 -0.769 -0.532 -0.135

Problem 14: Compute Roy’s largest root from SSPE and SSPH.

#obtain eigenvalues
eigenValues <- as.data.frame(eigen(matrixInv)[1])

#select max value -- this is Roy's root
(roysRoot <- max(eigenValues))
## [1] 32.19193

Problem 15: Create 100 random permutations of iris dataset.

#initialize table to store Roy's root values
roy_df <- setNames(data.frame(matrix(ncol = 1, nrow = 100)), 'roys_root')

#set number of iterations
iterations = 100

#for loop
for (i in 1:iterations){
    set.seed=123
    iris$Species <- sample(iris$Species) #permute
    manovaModel <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data=iris) 
    manovaSig <- Anova(manovaModel) 
    SSPE <- as.data.frame(manovaSig[2])
    SSPH <- as.data.frame(manovaSig[1])
    matrixInv <- solve(as.matrix(SSPE), as.matrix(SSPH))
    eigenValues <- as.data.frame(eigen(matrixInv)[1])
    roysRoot <- max(abs(eigenValues)) #takes abs to avoid complex values on negative value rounding errors
    roy_df[i,] <- roysRoot
}

#make annotation for plot
roysRootMean <- mean(roy_df$roys_root)

#histogram
ggplot(roy_df, aes(x=roys_root)) +
 geom_histogram(aes(y=..density..),color='black', fill='lightgrey') +
 geom_density(alpha=.4, fill='pink') +
 geom_vline(aes(xintercept=mean(roys_root)), linetype='dashed', size=1) +
 labs(x='Roy\'s root') +
 theme_bw() +
 theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
 geom_text(x=.075, y=20, label=paste('permuted mean Roy\'s largest root = ', roysRootMean))