submitted by: Navona
due date: 2019-11-12
last ran: 2019-11-12
website: http://rpubs.com/navona/PSY2002_assignment03
#load libraries
libraries <- c('klaR','MASS', 'knitr', 'kableExtra', 'GGally', 'mvnormtest', 'car', 'ggrepel')
lapply(libraries, require, character.only = T)
#set seed for reproducable randomeness
set.seed(100)
#specify size of variables
k=3 #3 groups
n=3 #3 observations/participants
p=3 #3 features/variables
#create our dataset
cov_mat <- cov(matrix(rnorm(p*n), ncol=p, nrow=n)) #first, generate a random covariance matrix
#write a function to create group matrices (identical covariance but different means)
groupMat_fn <- function(mu1, mu2, mu3){
groupMat <- MASS::mvrnorm(n=n, mu=c(mu1, mu2, mu3), Sigma=cov_mat)
return(groupMat)
}
#apply the function for each of 3 groups (specifying independent means for each variable)
matrix_1 <- groupMat_fn(4, 5, 6); matrix_2 <- groupMat_fn(3, 5, 5); matrix_3 <- groupMat_fn(2, 4, 1)
#bind together into a single matrix - our dataset
matrix <- rbind(matrix_1, matrix_2, matrix_3)
#add create row names
row.names(matrix) <- paste('par_', seq(1, 9), sep='')
#create formatted table
kable(matrix,
col.names=c('V1', 'V2', 'V3'),
digits=3) %>%
kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F,position='float_left') %>%
pack_rows('group 1', 1, 3, label_row_css = "background-color: #F8766D;") %>%
pack_rows('group 2', 4, 6, label_row_css = "background-color: #7CAE00;") %>%
pack_rows('group 3', 7, 9, label_row_css = "background-color: #00BFC4;")
V1 | V2 | V3 | |
---|---|---|---|
group 1 | |||
par_1 | 3.869 | 5.160 | 5.740 |
par_2 | 4.171 | 4.773 | 5.954 |
par_3 | 4.049 | 4.938 | 6.058 |
group 2 | |||
par_4 | 2.921 | 5.061 | 4.136 |
par_5 | 3.638 | 4.251 | 6.834 |
par_6 | 3.044 | 4.919 | 4.520 |
group 3 | |||
par_7 | 2.040 | 3.957 | 1.202 |
par_8 | 2.058 | 3.864 | -0.222 |
par_9 | 2.035 | 3.965 | 1.223 |
#add group to our matrix
group <- paste0(c(rep('group 1', 3), rep('group 2', 3), rep('group 3', 3)))
#add group names to matrix
matrix_vis <- as.data.frame(cbind(matrix, group))
#make plot (uses ggalley)
ggpairs(as.data.frame(matrix_vis[1:3]),
aes(colour = group)) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
Problem 1: Compute the within-group scatter matrix.
\(S_{within} = \sum_{i=1}^k \sum_{j=1}^{N_i} (x_{i,j} - \bar{x_i})(x_{i,j} - \bar{x_i})^T\)
#calculate the column means of each groups' matrix
colMeans_1 <- colMeans(matrix_1); colMeans_2 <- colMeans(matrix_2); colMeans_3 <- colMeans(matrix_3)
#initialize scatter within 0 matrix for each group
sw_fn <- function(p=3, k=3){matrix(0, p, k)}
sw_1 <- sw_fn(); sw_2 <- sw_fn(); sw_3 <- sw_fn()
#for loop to create matrices
for (i in 1:n){
sw_1 <- sw_1 + (matrix_1[i,] - colMeans_1) %*% t(matrix_1[i,] - colMeans_1)
sw_2 <- sw_2 + (matrix_2[i,] - colMeans_2) %*% t(matrix_2[i,] - colMeans_2)
sw_3 <- sw_3 + (matrix_3[i,] - colMeans_3) %*% t(matrix_3[i,] - colMeans_3)
}
#pool the matrices
scatterWithin <- sw_1 + sw_2 + sw_3
#create row names
rownames(scatterWithin) <- groups
#print table
kableTable_fn(scatterWithin)
group 1 | group 2 | group 3 | |
---|---|---|---|
group 1 | 0.340 | -0.392 | 1.136 |
group 2 | -0.392 | 0.456 | -1.216 |
group 3 | 1.136 | -1.216 | 5.685 |
Problem 2: Compute the between-group scatter matrix.
\(S_{between} = \sum_{i=1}^k N_i (\bar{x_i} - \bar{x})(\bar{x_i} - \bar{x})^T\)
#calculate grand mean (across all groups)
grandMean <- colMeans(matrix)
#for loop to create matrices
scatterBetween <-
nrow(matrix_1) * (colMeans_1 - grandMean) %*% t(colMeans_1 - grandMean) +
nrow(matrix_2) * (colMeans_2 - grandMean) %*% t(colMeans_2 - grandMean) +
nrow(matrix_3) * (colMeans_3 - grandMean) %*% t(colMeans_3 - grandMean)
#'pool' the matrices
#scatterBetween <- sb_1 + sb_2 + sb_3
#create row names
rownames(scatterBetween) <- groups
#print table
kableTable_fn(scatterBetween)
group 1 | group 2 | group 3 | |
---|---|---|---|
group 1 | 5.964 | 3.161 | 16.035 |
group 2 | 3.161 | 1.768 | 9.104 |
group 3 | 16.035 | 9.104 | 47.051 |
Problem 3: Compute the total scatter matrix.
\(S_{total} = \sum_{i=1}^n (\bar{x_i} - \bar{x})(\bar{x_i} - \bar{x})^T\)
#make scatterTotal matrix
scatterTotal <- scatterBetween + scatterWithin
#print table
kableTable_fn(scatterTotal)
group 1 | group 2 | group 3 | |
---|---|---|---|
group 1 | 6.304 | 2.769 | 17.171 |
group 2 | 2.769 | 2.224 | 7.888 |
group 3 | 17.171 | 7.888 | 52.736 |
Problem 4: Show that \(S_{total}\) = \(S_{between}\) + \(S_{within}\).
all.equal(scatterTotal, scatterBetween + scatterWithin)
## [1] TRUE
Problem 5: Compute the inverse of \(S_{within}\), i.e., \(S_{within}^{-1}\).
#compute inverse- need special package as not full rank
scatterWithinInverse <- MASS::ginv(scatterWithin)
#create row names
rownames(scatterWithinInverse) <- groups
#print table
kableTable_fn(scatterWithinInverse)
group 1 | group 2 | group 3 | |
---|---|---|---|
group 1 | 1.158 | -1.549 | -0.558 |
group 2 | -1.549 | 2.072 | 0.756 |
group 3 | -0.558 | 0.756 | 0.449 |
Problem 6: Compute the eigendecomposition of \(S_{within}^{-1}S_{between}\).
lda_result <- eigen(scatterWithinInverse %*% scatterBetween)
#separate eigenvalues and eivengectors
lda_eigenvalues <- as.data.frame(lda_result[[1]])
lda_eigenvectors <- as.data.frame(lda_result[[2]])
#print table of eigenvectors
rownames(lda_eigenvectors) <- groups
kableTable_fn(lda_eigenvectors, caption='eigenvectors')
group 1 | group 2 | group 3 | |
---|---|---|---|
group 1 | 0.528 | -0.604 | 0.116 |
group 2 | -0.716 | 0.796 | -0.982 |
group 3 | -0.456 | 0.046 | 0.150 |
#create table of eigenvalues
lda_eigenvalues <- t(lda_eigenvalues)
#remove rownames
rownames(lda_eigenvalues) <- c()
#make the table pretty
kable(lda_eigenvalues,
digits=3,
align = 'c',
caption='eigenvalues') %>%
kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left')
17.546 | 0.202 | 0 |
Problem 7: Project X onto the first discriminant function.
#multiply points by weights
lda_df <- as.data.frame(matrix %*% as.matrix(lda_eigenvectors[,1]))
#change variable names
names(lda_df) <- 'LDA_1'
#add group information to df
lda_df$group <- as.factor(group)
#make table
kable(lda_df[1],
digits=3,
col.names='LDA 1') %>%
kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left') %>%
pack_rows('group 1', 1, 3, label_row_css = "background-color: #F8766D;") %>%
pack_rows('group 2', 4, 6, label_row_css = "background-color: #7CAE00;") %>%
pack_rows('group 3', 7, 9, label_row_css = "background-color: #00BFC4;")
LDA 1 | |
---|---|
group 1 | |
par_1 | -4.273 |
par_2 | -3.934 |
par_3 | -4.164 |
group 2 | |
par_4 | -3.970 |
par_5 | -4.243 |
par_6 | -3.979 |
group 3 | |
par_7 | -2.305 |
par_8 | -1.580 |
par_9 | -2.323 |
Problem 8: Plot discriminant scores derived above.
#plot
ggplot(lda_df, aes(x=row.names(lda_df), y=LDA_1, color=group, l)) +
geom_point(size=5) +
labs(x='participant',
y='first discriminant score (LDA 1)') +
theme_bw() +
theme(legend.position = 'top', legend.title=element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())
Problem 9: Fit a MANOVA to iris data with species as independent variable using the lm
function.
#fit model with lm
manovaModel <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data=iris)
#summarize with manova
(manovaFit <- summary(manova(manovaModel)))
## Df Pillai approx F num Df den Df Pr(>F)
## Species 2 1.1919 53.466 8 290 < 0.00000000000000022 ***
## Residuals 147
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Problem 10: Test for significance of Species factor using car::Anova
.
#test for significance
(manovaSig <- Anova(manovaModel))
##
## Type II MANOVA Tests: Pillai test statistic
## Df test stat approx F num Df den Df Pr(>F)
## Species 2 1.1919 53.466 8 290 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Problem 11: Report all test statistics using car::summary
.
#report all test statistics
summary(manovaSig)
##
## Type II MANOVA Tests:
##
## Sum of squares and products for error:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 38.9562 13.6300 24.6246 5.6450
## Sepal.Width 13.6300 16.9620 8.1208 4.8084
## Petal.Length 24.6246 8.1208 27.2226 6.2718
## Petal.Width 5.6450 4.8084 6.2718 6.1566
##
## ------------------------------------------
##
## Term: Species
##
## Sum of squares and products for the hypothesis:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 63.21213 -19.95267 165.2484 71.27933
## Sepal.Width -19.95267 11.34493 -57.2396 -22.93267
## Petal.Length 165.24840 -57.23960 437.1028 186.77400
## Petal.Width 71.27933 -22.93267 186.7740 80.41333
##
## Multivariate Tests: Species
## Df test stat approx F num Df den Df
## Pillai 2 1.19190 53.4665 8 290
## Wilks 2 0.02344 199.1453 8 288
## Hotelling-Lawley 2 32.47732 580.5321 8 286
## Roy 2 32.19193 1166.9574 4 145
## Pr(>F)
## Pillai < 0.000000000000000222 ***
## Wilks < 0.000000000000000222 ***
## Hotelling-Lawley < 0.000000000000000222 ***
## Roy < 0.000000000000000222 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Problem 12: extract the SSPE and SSPH from car::summary
.
#SSPE
SSPE <- as.data.frame(manovaSig[2])
#SSPH
SSPH <- as.data.frame(manovaSig[1])
#create row names
rownames(SSPE) <- names(iris[1:4]); rownames(SSPH) <- names(iris[1:4])
#fn for nice tables
irisTables_fn <- function(df, caption){
kable(df, digits=3, caption=caption, col.names = colnames(iris[1:4])) %>%
kable_styling(bootstrap_options=c('striped', 'hover'), full_width = F, position='left') %>%
column_spec(1, bold = T)
}
#make nice tables
irisTables_fn(SSPE, 'SSPE')
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
---|---|---|---|---|
Sepal.Length | 38.956 | 13.630 | 24.625 | 5.645 |
Sepal.Width | 13.630 | 16.962 | 8.121 | 4.808 |
Petal.Length | 24.625 | 8.121 | 27.223 | 6.272 |
Petal.Width | 5.645 | 4.808 | 6.272 | 6.157 |
irisTables_fn(SSPH, 'SSPH')
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
---|---|---|---|---|
Sepal.Length | 63.212 | -19.953 | 165.248 | 71.279 |
Sepal.Width | -19.953 | 11.345 | -57.240 | -22.933 |
Petal.Length | 165.248 | -57.240 | 437.103 | 186.774 |
Petal.Width | 71.279 | -22.933 | 186.774 | 80.413 |
Problem 13: Compute discriminant functions from SSPH and SSPE.
#take the inverse of both matrices
matrixInv <- solve(as.matrix(SSPE), as.matrix(SSPH))
#then, take the eigenvectors -- these are our discriminant functions
discriminant <- as.data.frame(eigen(matrixInv)[2])
#create row names
rownames(discriminant) <- names(iris[1:4])
#make nice tables
irisTables_fn(discriminant, 'Discriminant functions')
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
---|---|---|---|---|
Sepal.Length | 0.209 | -0.007 | 0.826 | -0.803 |
Sepal.Width | 0.386 | -0.587 | -0.151 | 0.406 |
Petal.Length | -0.554 | 0.253 | -0.105 | 0.415 |
Petal.Width | -0.707 | -0.769 | -0.532 | -0.135 |
Problem 14: Compute Roy’s largest root from SSPE and SSPH.
#obtain eigenvalues
eigenValues <- as.data.frame(eigen(matrixInv)[1])
#select max value -- this is Roy's root
(roysRoot <- max(eigenValues))
## [1] 32.19193
Problem 15: Create 100 random permutations of iris
dataset.
#initialize table to store Roy's root values
roy_df <- setNames(data.frame(matrix(ncol = 1, nrow = 100)), 'roys_root')
#set number of iterations
iterations = 100
#for loop
for (i in 1:iterations){
set.seed=123
iris$Species <- sample(iris$Species) #permute
manovaModel <- lm(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data=iris)
manovaSig <- Anova(manovaModel)
SSPE <- as.data.frame(manovaSig[2])
SSPH <- as.data.frame(manovaSig[1])
matrixInv <- solve(as.matrix(SSPE), as.matrix(SSPH))
eigenValues <- as.data.frame(eigen(matrixInv)[1])
roysRoot <- max(abs(eigenValues)) #takes abs to avoid complex values on negative value rounding errors
roy_df[i,] <- roysRoot
}
#make annotation for plot
roysRootMean <- mean(roy_df$roys_root)
#histogram
ggplot(roy_df, aes(x=roys_root)) +
geom_histogram(aes(y=..density..),color='black', fill='lightgrey') +
geom_density(alpha=.4, fill='pink') +
geom_vline(aes(xintercept=mean(roys_root)), linetype='dashed', size=1) +
labs(x='Roy\'s root') +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
geom_text(x=.075, y=20, label=paste('permuted mean Roy\'s largest root = ', roysRootMean))