submitted by: Navona
due date: 2019-11-12
last ran: 2019-11-12
website: http://rpubs.com/navona/PSY2002_assignment03

#load libraries
libraries <- c('klaR','MASS', 'knitr', 'kableExtra', 'GGally', 'mvnormtest', 'car', 'ggrepel')
lapply(libraries, require, character.only = T)

#set seed for reproducable randomeness
set.seed(100) 

#specify size of variables
k=3 #3 groups
n=3 #3 observations/participants
p=3 #3 features/variables

#create our dataset
cov_mat <- cov(matrix(rnorm(p*n), ncol=p, nrow=n)) #first, generate a random covariance matrix

#write a function to create group matrices (identical covariance but different means)
groupMat_fn <- function(mu1, mu2, mu3){
  groupMat <- MASS::mvrnorm(n=n, mu=c(mu1, mu2, mu3), Sigma=cov_mat)
  return(groupMat)
}

#apply the function for each of 3 groups (specifying independent means for each variable)
matrix_1 <- groupMat_fn(4, 5, 6); matrix_2 <- groupMat_fn(3, 5, 5); matrix_3 <- groupMat_fn(2, 4, 1) 

#bind together into a single matrix - our dataset
matrix <- rbind(matrix_1, matrix_2, matrix_3)

#add create row names
row.names(matrix) <- paste('par_', seq(1, 9), sep='')

#create formatted table
kable(matrix, 
      col.names=c('V1', 'V2', 'V3'),
      digits=3) %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F,position='float_left') %>%
  pack_rows('group 1', 1, 3, label_row_css = "background-color: #F8766D;") %>%
  pack_rows('group 2', 4, 6, label_row_css = "background-color: #7CAE00;") %>%
  pack_rows('group 3', 7, 9, label_row_css = "background-color: #00BFC4;")

	V1	V2	V3
group 1
par_1	3.869	5.160	5.740
par_2	4.171	4.773	5.954
par_3	4.049	4.938	6.058
group 2
par_4	2.921	5.061	4.136
par_5	3.638	4.251	6.834
par_6	3.044	4.919	4.520
group 3
par_7	2.040	3.957	1.202
par_8	2.058	3.864	-0.222
par_9	2.035	3.965	1.223

#add group to our matrix
group <- paste0(c(rep('group 1', 3), rep('group 2', 3), rep('group 3', 3)))

#add group names to matrix
matrix_vis <- as.data.frame(cbind(matrix, group))
                            
#make plot (uses ggalley)
ggpairs(as.data.frame(matrix_vis[1:3]), 
        aes(colour = group)) + 
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

Problem 1: Compute the within-group scatter matrix.

\(S_{within} = \sum_{i=1}^k \sum_{j=1}^{N_i} (x_{i,j} - \bar{x_i})(x_{i,j} - \bar{x_i})^T\)

#calculate the column means of each groups' matrix
colMeans_1 <- colMeans(matrix_1); colMeans_2 <- colMeans(matrix_2); colMeans_3 <- colMeans(matrix_3)

#initialize scatter within 0 matrix for each group
sw_fn <- function(p=3, k=3){matrix(0, p, k)}
sw_1 <- sw_fn(); sw_2 <- sw_fn(); sw_3 <- sw_fn()

#for loop to create matrices
for (i in 1:n){
  sw_1 <- sw_1 + (matrix_1[i,] - colMeans_1) %*% t(matrix_1[i,] - colMeans_1)
  sw_2 <- sw_2 + (matrix_2[i,] - colMeans_2) %*% t(matrix_2[i,] - colMeans_2)
  sw_3 <- sw_3 + (matrix_3[i,] - colMeans_3) %*% t(matrix_3[i,] - colMeans_3)
}

#pool the matrices
scatterWithin <- sw_1 + sw_2 + sw_3

#create row names
rownames(scatterWithin) <- groups

#print table
kableTable_fn(scatterWithin)


	group 1	group 2	group 3
group 1	0.340	-0.392	1.136
group 2	-0.392	0.456	-1.216
group 3	1.136	-1.216	5.685

Problem 2: Compute the between-group scatter matrix.

\(S_{between} = \sum_{i=1}^k N_i (\bar{x_i} - \bar{x})(\bar{x_i} - \bar{x})^T\)

#calculate grand mean (across all groups)
grandMean <- colMeans(matrix)

#for loop to create matrices
scatterBetween <- 
  nrow(matrix_1) * (colMeans_1 - grandMean) %*% t(colMeans_1 - grandMean) +
  nrow(matrix_2) * (colMeans_2 - grandMean) %*% t(colMeans_2 - grandMean) +
  nrow(matrix_3) * (colMeans_3 - grandMean) %*% t(colMeans_3 - grandMean)

#'pool' the matrices
#scatterBetween <- sb_1 + sb_2 + sb_3

#create row names
rownames(scatterBetween) <- groups

#print table
kableTable_fn(scatterBetween)


	group 1	group 2	group 3
group 1	5.964	3.161	16.035
group 2	3.161	1.768	9.104
group 3	16.035	9.104	47.051

Problem 3: Compute the total scatter matrix.

\(S_{total} = \sum_{i=1}^n (\bar{x_i} - \bar{x})(\bar{x_i} - \bar{x})^T\)

#make scatterTotal matrix
scatterTotal <- scatterBetween + scatterWithin

#print table
kableTable_fn(scatterTotal)


	group 1	group 2	group 3
group 1	6.304	2.769	17.171
group 2	2.769	2.224	7.888
group 3	17.171	7.888	52.736

Problem 4: Show that \(S_{total}\) = \(S_{between}\) + \(S_{within}\).

all.equal(scatterTotal, scatterBetween + scatterWithin)

## [1] TRUE

Problem 5: Compute the inverse of \(S_{within}\), i.e., \(S_{within}^{-1}\).

#compute inverse- need special package as not full rank
scatterWithinInverse <- MASS::ginv(scatterWithin) 

#create row names
rownames(scatterWithinInverse) <- groups

#print table
kableTable_fn(scatterWithinInverse)


	group 1	group 2	group 3
group 1	1.158	-1.549	-0.558
group 2	-1.549	2.072	0.756
group 3	-0.558	0.756	0.449

Problem 6: Compute the eigendecomposition of \(S_{within}^{-1}S_{between}\).

lda_result <- eigen(scatterWithinInverse %*% scatterBetween) 

#separate eigenvalues and eivengectors
lda_eigenvalues  <- as.data.frame(lda_result[[1]])
lda_eigenvectors <- as.data.frame(lda_result[[2]])

#print table of eigenvectors
rownames(lda_eigenvectors) <- groups
kableTable_fn(lda_eigenvectors, caption='eigenvectors')

eigenvectors
	group 1	group 2	group 3
group 1	0.528	-0.604	0.116
group 2	-0.716	0.796	-0.982
group 3	-0.456	0.046	0.150

#create table of eigenvalues
lda_eigenvalues <- t(lda_eigenvalues)

#remove rownames
rownames(lda_eigenvalues) <- c()

#make the table pretty
kable(lda_eigenvalues, 
      digits=3,
      align = 'c',
      caption='eigenvalues') %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left')

eigenvalues
17.546	0.202	0

Problem 7: Project X onto the first discriminant function.

#multiply points by weights
lda_df <- as.data.frame(matrix %*% as.matrix(lda_eigenvectors[,1])) 

#change variable names
names(lda_df) <- 'LDA_1'

#add group information to df
lda_df$group <- as.factor(group)

#make table
kable(lda_df[1],
      digits=3,
      col.names='LDA 1') %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left') %>%
  pack_rows('group 1', 1, 3, label_row_css = "background-color: #F8766D;") %>%
  pack_rows('group 2', 4, 6, label_row_css = "background-color: #7CAE00;") %>%
  pack_rows('group 3', 7, 9, label_row_css = "background-color: #00BFC4;")

	LDA 1
group 1
par_1	-4.273
par_2	-3.934
par_3	-4.164
group 2
par_4	-3.970
par_5	-4.243
par_6	-3.979
group 3
par_7	-2.305
par_8	-1.580
par_9	-2.323

Problem 8: Plot discriminant scores derived above.

#plot
ggplot(lda_df, aes(x=row.names(lda_df), y=LDA_1, color=group, l)) +
  geom_point(size=5) +
  labs(x='participant',
       y='first discriminant score (LDA 1)') +
  theme_bw() +
  theme(legend.position = 'top', legend.title=element_blank(),
    panel.grid.major = element_blank(), panel.grid.minor = element_blank())

Problem 9: Fit a MANOVA to iris data with species as independent variable using the lm function.

#fit model with lm
manovaModel <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data=iris)

#summarize with manova
(manovaFit <- summary(manova(manovaModel)))

##            Df Pillai approx F num Df den Df                Pr(>F)    
## Species     2 1.1919   53.466      8    290 < 0.00000000000000022 ***
## Residuals 147                                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 10: Test for significance of Species factor using car::Anova.

#test for significance
(manovaSig <- Anova(manovaModel))

## 
## Type II MANOVA Tests: Pillai test statistic
##         Df test stat approx F num Df den Df                Pr(>F)    
## Species  2    1.1919   53.466      8    290 < 0.00000000000000022 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 11: Report all test statistics using car::summary.

#report all test statistics
summary(manovaSig)

## 
## Type II MANOVA Tests:
## 
## Sum of squares and products for error:
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      38.9562     13.6300      24.6246      5.6450
## Sepal.Width       13.6300     16.9620       8.1208      4.8084
## Petal.Length      24.6246      8.1208      27.2226      6.2718
## Petal.Width        5.6450      4.8084       6.2718      6.1566
## 
## ------------------------------------------
##  
## Term: Species 
## 
## Sum of squares and products for the hypothesis:
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length     63.21213   -19.95267     165.2484    71.27933
## Sepal.Width     -19.95267    11.34493     -57.2396   -22.93267
## Petal.Length    165.24840   -57.23960     437.1028   186.77400
## Petal.Width      71.27933   -22.93267     186.7740    80.41333
## 
## Multivariate Tests: Species
##                  Df test stat  approx F num Df den Df
## Pillai            2   1.19190   53.4665      8    290
## Wilks             2   0.02344  199.1453      8    288
## Hotelling-Lawley  2  32.47732  580.5321      8    286
## Roy               2  32.19193 1166.9574      4    145
##                                  Pr(>F)    
## Pillai           < 0.000000000000000222 ***
## Wilks            < 0.000000000000000222 ***
## Hotelling-Lawley < 0.000000000000000222 ***
## Roy              < 0.000000000000000222 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Problem 12: extract the SSPE and SSPH from car::summary.

#SSPE
SSPE <- as.data.frame(manovaSig[2])

#SSPH
SSPH <- as.data.frame(manovaSig[1])

#create row names
rownames(SSPE) <- names(iris[1:4]); rownames(SSPH) <- names(iris[1:4])

#fn for nice tables
irisTables_fn <- function(df, caption){
  kable(df, digits=3, caption=caption, col.names = colnames(iris[1:4])) %>%
  kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left') %>%
  column_spec(1, bold = T)
}

#make nice tables
irisTables_fn(SSPE, 'SSPE')

SSPE
	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
Sepal.Length	38.956	13.630	24.625	5.645
Sepal.Width	13.630	16.962	8.121	4.808
Petal.Length	24.625	8.121	27.223	6.272
Petal.Width	5.645	4.808	6.272	6.157

irisTables_fn(SSPH, 'SSPH')

SSPH
	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
Sepal.Length	63.212	-19.953	165.248	71.279
Sepal.Width	-19.953	11.345	-57.240	-22.933
Petal.Length	165.248	-57.240	437.103	186.774
Petal.Width	71.279	-22.933	186.774	80.413

Problem 13: Compute discriminant functions from SSPH and SSPE.

#take the inverse of both matrices
matrixInv <- solve(as.matrix(SSPE), as.matrix(SSPH))

#then, take the eigenvectors -- these are our discriminant functions
discriminant <- as.data.frame(eigen(matrixInv)[2])

#create row names
rownames(discriminant) <- names(iris[1:4])

#make nice tables
irisTables_fn(discriminant, 'Discriminant functions')

Discriminant functions
	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
Sepal.Length	0.209	-0.007	0.826	-0.803
Sepal.Width	0.386	-0.587	-0.151	0.406
Petal.Length	-0.554	0.253	-0.105	0.415
Petal.Width	-0.707	-0.769	-0.532	-0.135

Problem 14: Compute Roy’s largest root from SSPE and SSPH.

#obtain eigenvalues
eigenValues <- as.data.frame(eigen(matrixInv)[1])

#select max value -- this is Roy's root
(roysRoot <- max(eigenValues))

## [1] 32.19193

Problem 15: Create 100 random permutations of iris dataset.

#initialize table to store Roy's root values
roy_df <- setNames(data.frame(matrix(ncol = 1, nrow = 100)), 'roys_root')

#set number of iterations
iterations = 100

#for loop
for (i in 1:iterations){
    set.seed=123
    iris$Species <- sample(iris$Species) #permute
    manovaModel <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data=iris) 
    manovaSig <- Anova(manovaModel) 
    SSPE <- as.data.frame(manovaSig[2])
    SSPH <- as.data.frame(manovaSig[1])
    matrixInv <- solve(as.matrix(SSPE), as.matrix(SSPH))
    eigenValues <- as.data.frame(eigen(matrixInv)[1])
    roysRoot <- max(abs(eigenValues)) #takes abs to avoid complex values on negative value rounding errors
    roy_df[i,] <- roysRoot
}

#make annotation for plot
roysRootMean <- mean(roy_df$roys_root)

#histogram
ggplot(roy_df, aes(x=roys_root)) +
 geom_histogram(aes(y=..density..),color='black', fill='lightgrey') +
 geom_density(alpha=.4, fill='pink') +
 geom_vline(aes(xintercept=mean(roys_root)), linetype='dashed', size=1) +
 labs(x='Roy\'s root') +
 theme_bw() +
 theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
 geom_text(x=.075, y=20, label=paste('permuted mean Roy\'s largest root = ', roysRootMean))

PSY2002 Assignment 3