MultiVariate Project 2

Import Data and creating a new data frame that only contains the needed variables

library(rio)
library(ggplot2)
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(CCA)

## Loading required package: fda

## Warning: package 'fda' was built under R version 3.6.2

## Loading required package: Matrix

## 
## Attaching package: 'fda'

## The following object is masked from 'package:graphics':
## 
##     matplot

## Loading required package: fields

## Warning: package 'fields' was built under R version 3.6.2

## Loading required package: spam

## Loading required package: dotCall64

## Loading required package: grid

## Spam version 2.5-1 (2019-12-12) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.

## 
## Attaching package: 'spam'

## The following object is masked from 'package:Matrix':
## 
##     det

## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve

## See https://github.com/NCAR/Fields for
##  an extensive vignette, other supplements and source code

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:GGally':
## 
##     nasa

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(CCP)
library(candisc)

## Warning: package 'candisc' was built under R version 3.6.2

## Loading required package: car

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## Loading required package: heplots

## 
## Attaching package: 'candisc'

## The following object is masked from 'package:stats':
## 
##     cancor

library(lessR)

## 
## lessR 3.8.9     feedback: gerbing@pdx.edu     web: lessRstats.com/new
## ---------------------------------------------------------------------
## 1. d <- Read("")           Read text, Excel, SPSS, SAS or R data file
##                            d: default data frame, no need for data=
## 2. l <- Read("", var_labels=TRUE)   Read variable labels into l,
##                            required name for data frame of labels
## 3. Help()                  Get help, and, e.g., Help(Read)
## 4. hs(), bc(), or ca()     All histograms, all bar charts, or both
## 5. Plot(X) or Plot(X,Y)    For continuous and categorical variables
## 6. by1= , by2=             Trellis graphics, a plot for each by1, by2
## 7. reg(Y ~ X, Rmd="eg")    Regression with full interpretative output
## 8. style("gray")           Grayscale theme, + many others available
##    style(show=TRUE)        all color/style options and current values
## 9. getColors()             create many styles of color palettes
## 
## lessR parameter names now use _'s. Names with a period are deprecated.
## Ex:  bin_width  instead of  bin.width

## 
## Attaching package: 'lessR'

## The following objects are masked from 'package:car':
## 
##     bc, Recode, sp

FullData <- read.csv("/Users/Lorraine/Desktop/Project 2.csv")
PJ2 <- subset(FullData, select=c(Rigorous.Instruction.., Collaborative.Teachers.., Supportive.Environment.., Effective.School.Leadership.., Strong.Family.Community.Ties.., Trust.., Average.ELA.Proficiency, Average.Math.Proficiency))

Change the variables from factors to numerics. Let’s change percentages to decimals. In order to change factors to numerics, we need to change them to characters first.

PJ2$Rigorous.Instruction.. <- as.numeric(sub("%", "",PJ2$Rigorous.Instruction..,fixed=TRUE))/100
PJ2$Collaborative.Teachers.. <- as.numeric(sub("%", "",PJ2$Collaborative.Teachers..,fixed=TRUE))/100
PJ2$Supportive.Environment.. <- as.numeric(sub("%", "",PJ2$Supportive.Environment..,fixed=TRUE))/100
PJ2$Effective.School.Leadership.. <- as.numeric(sub("%", "",PJ2$Effective.School.Leadership..,fixed=TRUE))/100
PJ2$Strong.Family.Community.Ties.. <- as.numeric(sub("%", "",PJ2$Strong.Family.Community.Ties..,fixed=TRUE))/100
PJ2$Trust.. <- as.numeric(sub("%", "",PJ2$Trust.. ,fixed=TRUE))/100
PJ2$Average.ELA.Proficiency <-as.character(PJ2$Average.ELA.Proficiency)
PJ2$Average.ELA.Proficiency <- as.numeric(PJ2$Average.ELA.Proficiency)
PJ2$Average.Math.Proficiency <- as.character(PJ2$Average.Math.Proficiency)
PJ2$Average.Math.Proficiency <- as.numeric(PJ2$Average.Math.Proficiency)

Running basic descriptives on the data

summary(PJ2)

##  Rigorous.Instruction.. Collaborative.Teachers.. Supportive.Environment..
##  Min.   :0.0000         Min.   :0.0000           Min.   :0.0000          
##  1st Qu.:0.8600         1st Qu.:0.8500           1st Qu.:0.8400          
##  Median :0.9000         Median :0.9000           Median :0.8900          
##  Mean   :0.8948         Mean   :0.8844           Mean   :0.8875          
##  3rd Qu.:0.9400         3rd Qu.:0.9400           3rd Qu.:0.9400          
##  Max.   :1.0000         Max.   :1.0000           Max.   :1.0000          
##  NA's   :25             NA's   :25               NA's   :25              
##  Effective.School.Leadership.. Strong.Family.Community.Ties..
##  Min.   :0.0000                Min.   :0.0000                
##  1st Qu.:0.7600                1st Qu.:0.8000                
##  Median :0.8300                Median :0.8300                
##  Mean   :0.8162                Mean   :0.8309                
##  3rd Qu.:0.8900                3rd Qu.:0.8700                
##  Max.   :0.9900                Max.   :0.9900                
##  NA's   :25                    NA's   :25                    
##     Trust..       Average.ELA.Proficiency Average.Math.Proficiency
##  Min.   :0.0000   Min.   :1.810           Min.   :1.830           
##  1st Qu.:0.8700   1st Qu.:2.250           1st Qu.:2.300           
##  Median :0.9200   Median :2.450           Median :2.580           
##  Mean   :0.9042   Mean   :2.534           Mean   :2.669           
##  3rd Qu.:0.9400   3rd Qu.:2.760           3rd Qu.:2.980           
##  Max.   :1.0000   Max.   :3.930           Max.   :4.200           
##  NA's   :25       NA's   :55              NA's   :55

sum(is.na(PJ2)) #260 missing data, less than 5% of the total data, run lisewise deletion

## [1] 260

PJ2 <- na.omit(PJ2)
summary(PJ2)

##  Rigorous.Instruction.. Collaborative.Teachers.. Supportive.Environment..
##  Min.   :0.0000         Min.   :0.0000           Min.   :0.0000          
##  1st Qu.:0.8600         1st Qu.:0.8500           1st Qu.:0.8400          
##  Median :0.9000         Median :0.9000           Median :0.8900          
##  Mean   :0.8939         Mean   :0.8832           Mean   :0.8854          
##  3rd Qu.:0.9400         3rd Qu.:0.9300           3rd Qu.:0.9300          
##  Max.   :1.0000         Max.   :1.0000           Max.   :1.0000          
##  Effective.School.Leadership.. Strong.Family.Community.Ties..
##  Min.   :0.0000                Min.   :0.0000                
##  1st Qu.:0.7600                1st Qu.:0.8000                
##  Median :0.8300                Median :0.8300                
##  Mean   :0.8146                Mean   :0.8296                
##  3rd Qu.:0.8800                3rd Qu.:0.8700                
##  Max.   :0.9900                Max.   :0.9900                
##     Trust..       Average.ELA.Proficiency Average.Math.Proficiency
##  Min.   :0.0000   Min.   :1.810           Min.   :1.830           
##  1st Qu.:0.8700   1st Qu.:2.250           1st Qu.:2.300           
##  Median :0.9100   Median :2.450           Median :2.580           
##  Mean   :0.9031   Mean   :2.534           Mean   :2.669           
##  3rd Qu.:0.9400   3rd Qu.:2.760           3rd Qu.:2.980           
##  Max.   :1.0000   Max.   :3.930           Max.   :4.200

boxplot(PJ2$Average.Math.Proficiency)

boxplot.stats(PJ2$Average.Math.Proficiency)$out

## [1] 4.03 4.15 4.19 4.07 4.16 4.20

#Only Average Math Proficiency has outliers(statistically defined). However, I do not have more information to determine whether they are actual outliers given its practical context. Also, there are only 6 of them. Therefore, I am keeping them in my data.

Test Kurtosis and Skewness

library(moments)
skewness(PJ2)

##         Rigorous.Instruction..       Collaborative.Teachers.. 
##                     -5.3958196                     -2.2033102 
##       Supportive.Environment..  Effective.School.Leadership.. 
##                     -2.2939757                     -1.2900541 
## Strong.Family.Community.Ties..                        Trust.. 
##                     -2.0293937                     -3.4608423 
##        Average.ELA.Proficiency       Average.Math.Proficiency 
##                      0.9040115                      0.6586586

kurtosis(PJ2)

##         Rigorous.Instruction..       Collaborative.Teachers.. 
##                      66.967941                      19.500977 
##       Supportive.Environment..  Effective.School.Leadership.. 
##                      29.949557                       7.561101 
## Strong.Family.Community.Ties..                        Trust.. 
##                      27.772212                      42.435323 
##        Average.ELA.Proficiency       Average.Math.Proficiency 
##                       3.488466                       2.806103

agostino.test(PJ2$Rigorous.Instruction.., alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Rigorous.Instruction..
## skew = -5.3958, z = -30.4398, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

agostino.test(PJ2$Collaborative.Teachers.., alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Collaborative.Teachers..
## skew = -2.2033, z = -20.1892, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

agostino.test(PJ2$Supportive.Environment.., alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Supportive.Environment..
## skew = -2.294, z = -20.636, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

agostino.test(PJ2$Effective.School.Leadership.., alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Effective.School.Leadership..
## skew = -1.2901, z = -14.5295, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

agostino.test(PJ2$Strong.Family.Community.Ties.., alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Strong.Family.Community.Ties..
## skew = -2.0294, z = -19.2846, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

agostino.test(PJ2$Trust.., alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Trust..
## skew = -3.4608, z = -25.2918, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

agostino.test(PJ2$Average.ELA.Proficiency, alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Average.ELA.Proficiency
## skew = 0.90401, z = 11.19746, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

agostino.test(PJ2$Average.Math.Proficiency, alternative = c("two.sided", "less", "greater"))

## 
##  D'Agostino skewness test
## 
## data:  PJ2$Average.Math.Proficiency
## skew = 0.65866, z = 8.64434, p-value < 0.00000000000000022
## alternative hypothesis: data have a skewness

# D'Agostino skewness test indicates that all variables are significantly skewed, with two criterion variable moderately skewed and the predictor variables highly skewed. Since the criterion variables are moderately skewed, I would not transform any of the variables.

Mutiple regression analysis with Average ELA Proficiency as the criterion variable

#renaming all variables so the codes could be ran without a problem
library(reshape) #detached package dplyr

## 
## Attaching package: 'reshape'

## The following object is masked from 'package:dplyr':
## 
##     rename

## The following object is masked from 'package:Matrix':
## 
##     expand

PJ2 <- rename(PJ2,c(Rigorous.Instruction..="RI",Collaborative.Teachers..="CT",Supportive.Environment..="SE", Effective.School.Leadership..="ESL",Strong.Family.Community.Ties.. ="FCT",Trust..="Trust", Average.ELA.Proficiency = "AEP", Average.Math.Proficiency = "AMP"))
#Run regression using the lm function
Reg1 <- lm(AEP ~ RI + CT + SE + ESL + FCT + Trust, data = PJ2)
summary(Reg1)

## 
## Call:
## lm(formula = AEP ~ RI + CT + SE + ESL + FCT + Trust, data = PJ2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8227 -0.2367 -0.0515  0.1830  1.4328 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)
## (Intercept)   1.1712     0.1787   6.553      0.0000000000831
## RI            0.5632     0.1891   2.978              0.00296
## CT           -0.1324     0.3721  -0.356              0.72202
## SE            2.1311     0.2264   9.413 < 0.0000000000000002
## ESL           1.3929     0.2513   5.543      0.0000000364993
## FCT          -0.2718     0.1903  -1.428              0.15361
## Trust        -2.0147     0.3856  -5.225      0.0000002044152
## 
## Residual standard error: 0.3309 on 1210 degrees of freedom
## Multiple R-squared:  0.1756, Adjusted R-squared:  0.1715 
## F-statistic: 42.97 on 6 and 1210 DF,  p-value: < 0.00000000000000022

#Hierarchical Regression
Model1 <- lm(AEP ~ RI, data = PJ2)
summary(Model1)

## 
## Call:
## lm(formula = AEP ~ RI, data = PJ2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68372 -0.26241 -0.06811  0.20628  1.42750 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)
## (Intercept)   1.1307     0.1274   8.875 <0.0000000000000002
## RI            1.5701     0.1421  11.049 <0.0000000000000002
## 
## Residual standard error: 0.3467 on 1215 degrees of freedom
## Multiple R-squared:  0.09131,    Adjusted R-squared:  0.09056 
## F-statistic: 122.1 on 1 and 1215 DF,  p-value: < 0.00000000000000022

Model2 <- lm(AEP ~ RI + CT, data = PJ2)
Model3 <- lm(AEP ~ RI + CT + SE, data = PJ2)
Model4 <- lm(AEP ~ RI + CT + SE + ESL, data = PJ2)
Model5 <- lm(AEP ~ RI + CT + SE + ESL + FCT, data = PJ2)
#Testing change in R-suqare
rsq.delta1 <- summary(Model2)$r.sqiared - summary(Model1)$r.squared
rsq.delta1

## numeric(0)

anova(Model1, Model2, test = "F")

## Analysis of Variance Table
## 
## Model 1: AEP ~ RI
## Model 2: AEP ~ RI + CT
##   Res.Df    RSS Df Sum of Sq      F      Pr(>F)
## 1   1215 146.07                                
## 2   1214 143.43  1    2.6457 22.394 0.000002482

rsq.delta2 <- summary(Model3)$r.sqiared - summary(Model2)$r.squared
rsq.delta2

## numeric(0)

anova(Model2, Model3, test = "F")

## Analysis of Variance Table
## 
## Model 1: AEP ~ RI + CT
## Model 2: AEP ~ RI + CT + SE
##   Res.Df    RSS Df Sum of Sq     F             Pr(>F)
## 1   1214 143.43                                      
## 2   1213 137.52  1    5.9056 52.09 0.0000000000009325

rsq.delta3 <- summary(Model4)$r.sqiared - summary(Model3)$r.squared
rsq.delta3

## numeric(0)

anova(Model3, Model4, test = "F")

## Analysis of Variance Table
## 
## Model 1: AEP ~ RI + CT + SE
## Model 2: AEP ~ RI + CT + SE + ESL
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1   1213 137.52                              
## 2   1212 136.08  1    1.4396 12.822 0.0003562

rsq.delta4 <- summary(Model5)$r.sqiared - summary(Model4)$r.squared
rsq.delta4

## numeric(0)

anova(Model4, Model5, test = "F")

## Analysis of Variance Table
## 
## Model 1: AEP ~ RI + CT + SE + ESL
## Model 2: AEP ~ RI + CT + SE + ESL + FCT
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)
## 1   1212 136.08                            
## 2   1211 135.51  1   0.57427 5.1321 0.02366

rsq.delta5 <- summary(Reg1)$r.sqiared - summary(Model5)$r.squared
rsq.delta5

## numeric(0)

anova(Model5, Reg1, test = "F")

## Analysis of Variance Table
## 
## Model 1: AEP ~ RI + CT + SE + ESL + FCT
## Model 2: AEP ~ RI + CT + SE + ESL + FCT + Trust
##   Res.Df    RSS Df Sum of Sq      F       Pr(>F)
## 1   1211 135.51                                 
## 2   1210 132.52  1    2.9905 27.306 0.0000002044

#checking for multicollinearity 
vif(Reg1)

##       RI       CT       SE      ESL      FCT    Trust 
## 1.943860 8.560109 2.414204 6.769008 1.571616 6.170379

#multiple regression without CT and FCT
New <- lm(AEP ~ RI + SE + ESL + Trust, data = PJ2)
summary(New)

## 
## Call:
## lm(formula = AEP ~ RI + SE + ESL + Trust, data = PJ2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81437 -0.23605 -0.04935  0.18320  1.42567 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)
## (Intercept)   1.1098     0.1736   6.392      0.0000000002331
## RI            0.5202     0.1772   2.936              0.00339
## SE            2.0304     0.2149   9.447 < 0.0000000000000002
## ESL           1.3450     0.1969   6.831      0.0000000000133
## Trust        -2.1415     0.3599  -5.950      0.0000000035089
## 
## Residual standard error: 0.331 on 1212 degrees of freedom
## Multiple R-squared:  0.1742, Adjusted R-squared:  0.1715 
## F-statistic: 63.91 on 4 and 1212 DF,  p-value: < 0.00000000000000022

Mutiple regression analysis with Average Math Proficiency as the criterion variable

#Run regression using the lm function
Reg2 <- lm(AMP ~ RI + CT + SE + ESL + FCT + Trust, data = PJ2)
summary(Reg2)

## 
## Call:
## lm(formula = AMP ~ RI + CT + SE + ESL + FCT + Trust, data = PJ2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9596 -0.3114 -0.0492  0.2806  2.3432 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)
## (Intercept)   0.5068     0.2264   2.239              0.02534
## RI            0.6233     0.2395   2.602              0.00938
## CT           -0.1100     0.4713  -0.233              0.81548
## SE            3.2908     0.2868  11.475 < 0.0000000000000002
## ESL           1.4970     0.3183   4.703          0.000002860
## FCT          -0.1472     0.2411  -0.610              0.54175
## Trust        -2.5566     0.4884  -5.235          0.000000195
## 
## Residual standard error: 0.4192 on 1210 degrees of freedom
## Multiple R-squared:   0.21,  Adjusted R-squared:  0.2061 
## F-statistic: 53.61 on 6 and 1210 DF,  p-value: < 0.00000000000000022

#Hierarchical Regression
Model6 <- lm(AMP ~ RI, data = PJ2)
summary(Model6)

## 
## Call:
## lm(formula = AMP ~ RI, data = PJ2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86372 -0.35219 -0.07913  0.27616  2.11605 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)
## (Intercept)   0.7339     0.1637   4.482           0.00000808
## RI            2.1647     0.1826  11.854 < 0.0000000000000002
## 
## Residual standard error: 0.4456 on 1215 degrees of freedom
## Multiple R-squared:  0.1037, Adjusted R-squared:  0.1029 
## F-statistic: 140.5 on 1 and 1215 DF,  p-value: < 0.00000000000000022

Model7 <- lm(AMP ~ RI + CT, data = PJ2)
Model8 <- lm(AMP ~ RI + CT + SE, data = PJ2)
Model9 <- lm(AMP ~ RI + CT + SE + ESL, data = PJ2)
Model10 <- lm(AMP ~ RI + CT + SE + ESL + FCT, data = PJ2)
#Testing change in R-suqare
rsq.delta6 <- summary(Model7)$r.sqiared - summary(Model6)$r.squared
rsq.delta6

## numeric(0)

anova(Model6, Model7, test = "F")

## Analysis of Variance Table
## 
## Model 1: AMP ~ RI
## Model 2: AMP ~ RI + CT
##   Res.Df    RSS Df Sum of Sq      F      Pr(>F)
## 1   1215 241.25                                
## 2   1214 237.36  1    3.8916 19.904 0.000008899

rsq.delta7 <- summary(Model8)$r.sqiared - summary(Model7)$r.squared
rsq.delta7

## numeric(0)

anova(Model7, Model8, test = "F")

## Analysis of Variance Table
## 
## Model 1: AMP ~ RI + CT
## Model 2: AMP ~ RI + CT + SE
##   Res.Df    RSS Df Sum of Sq      F                Pr(>F)
## 1   1214 237.36                                          
## 2   1213 219.16  1    18.203 100.75 < 0.00000000000000022

rsq.delta8 <- summary(Model9)$r.sqiared - summary(Model8)$r.squared
rsq.delta8

## numeric(0)

anova(Model8, Model9, test = "F")

## Analysis of Variance Table
## 
## Model 1: AMP ~ RI + CT + SE
## Model 2: AMP ~ RI + CT + SE + ESL
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)
## 1   1213 219.16                             
## 2   1212 217.82  1    1.3369 7.4385 0.006476

rsq.delta9 <- summary(Model10)$r.sqiared - summary(Model9)$r.squared
rsq.delta9

## numeric(0)

anova(Model9, Model10, test = "F")

## Analysis of Variance Table
## 
## Model 1: AMP ~ RI + CT + SE + ESL
## Model 2: AMP ~ RI + CT + SE + ESL + FCT
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1212 217.82                           
## 2   1211 217.44  1   0.37631 2.0957  0.148

rsq.delta10 <- summary(Reg2)$r.sqiared - summary(Model10)$r.squared
rsq.delta10

## numeric(0)

anova(Model10, Reg2, test = "F")

## Analysis of Variance Table
## 
## Model 1: AMP ~ RI + CT + SE + ESL + FCT
## Model 2: AMP ~ RI + CT + SE + ESL + FCT + Trust
##   Res.Df    RSS Df Sum of Sq      F       Pr(>F)
## 1   1211 217.44                                 
## 2   1210 212.63  1    4.8153 27.402 0.0000001946

#multiple regression without FCT
New1 <- lm(AMP ~ RI + CT + SE + ESL + Trust, data = PJ2)
summary(New1)

## 
## Call:
## lm(formula = AMP ~ RI + CT + SE + ESL + Trust, data = PJ2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97225 -0.31364 -0.04711  0.27974  2.37584 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)
## (Intercept)   0.4742     0.2199   2.156               0.0313
## RI            0.6109     0.2386   2.560               0.0106
## CT           -0.1017     0.4710  -0.216               0.8291
## SE            3.2418     0.2752  11.778 < 0.0000000000000002
## ESL           1.4977     0.3182   4.706         0.0000028118
## Trust        -2.6041     0.4820  -5.402         0.0000000791
## 
## Residual standard error: 0.4191 on 1211 degrees of freedom
## Multiple R-squared:  0.2098, Adjusted R-squared:  0.2065 
## F-statistic: 64.29 on 5 and 1211 DF,  p-value: < 0.00000000000000022

New2 <- lm(AMP ~ RI + SE + ESL + Trust, data = PJ2)
summary(New2)

## 
## Call:
## lm(formula = AMP ~ RI + SE + ESL + Trust, data = PJ2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97766 -0.31238 -0.04729  0.27784  2.37673 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)
## (Intercept)   0.4733     0.2198   2.153              0.03148
## RI            0.5934     0.2243   2.646              0.00825
## SE            3.2329     0.2720  11.884 < 0.0000000000000002
## ESL           1.4550     0.2492   5.838        0.00000000677
## Trust        -2.6380     0.4556  -5.790        0.00000000895
## 
## Residual standard error: 0.4189 on 1212 degrees of freedom
## Multiple R-squared:  0.2097, Adjusted R-squared:  0.2071 
## F-statistic: 80.41 on 4 and 1212 DF,  p-value: < 0.00000000000000022

Canonical Regression

#Identifying predictor and criterion variables, library dplyr
pred <- PJ2 %>% select(RI, CT, SE, ESL, FCT, Trust)
crit <- PJ2 %>% select(AEP, AMP)
ggpairs(pred) #some high correlations

ggpairs(crit) #high correlation

matcor(pred, crit)

## $Xcor
##              RI        CT        SE       ESL       FCT     Trust
## RI    1.0000000 0.6130197 0.6048823 0.4726465 0.4252463 0.5269134
## CT    0.6130197 1.0000000 0.6019965 0.8992480 0.4707950 0.8824691
## SE    0.6048823 0.6019965 1.0000000 0.4597990 0.5605202 0.6362909
## ESL   0.4726465 0.8992480 0.4597990 1.0000000 0.4073891 0.8605750
## FCT   0.4252463 0.4707950 0.5605202 0.4073891 1.0000000 0.5199311
## Trust 0.5269134 0.8824691 0.6362909 0.8605750 0.5199311 1.0000000
## 
## $Ycor
##           AEP       AMP
## AEP 1.0000000 0.9359562
## AMP 0.9359562 1.0000000
## 
## $XYcor
##              RI        CT        SE       ESL       FCT     Trust
## RI    1.0000000 0.6130197 0.6048823 0.4726465 0.4252463 0.5269134
## CT    0.6130197 1.0000000 0.6019965 0.8992480 0.4707950 0.8824691
## SE    0.6048823 0.6019965 1.0000000 0.4597990 0.5605202 0.6362909
## ESL   0.4726465 0.8992480 0.4597990 1.0000000 0.4073891 0.8605750
## FCT   0.4252463 0.4707950 0.5605202 0.4073891 1.0000000 0.5199311
## Trust 0.5269134 0.8824691 0.6362909 0.8605750 0.5199311 1.0000000
## AEP   0.3021701 0.2865938 0.3622688 0.2681362 0.1777522 0.2368743
## AMP   0.3219612 0.2923694 0.4225323 0.2563458 0.2216340 0.2499606
##             AEP       AMP
## RI    0.3021701 0.3219612
## CT    0.2865938 0.2923694
## SE    0.3622688 0.4225323
## ESL   0.2681362 0.2563458
## FCT   0.1777522 0.2216340
## Trust 0.2368743 0.2499606
## AEP   1.0000000 0.9359562
## AMP   0.9359562 1.0000000

CC1 <- cc(pred, crit)
CC1$cor

## [1] 0.4605990 0.1844779

CC1[3:4]

## $xcoef
##              [,1]       [,2]
## RI     -2.6494684  -5.518006
## CT      0.4061550   2.362393
## SE    -15.5582870   6.324098
## ESL    -6.2887145 -14.943308
## FCT     0.3670306   7.143441
## Trust  11.4182399  10.200002
## 
## $ycoef
##          [,1]      [,2]
## AEP  0.857947 -7.763662
## AMP -2.733250  5.382202

CC2 <- comput(pred, crit, CC1)
CC2[3:6]

## $corr.X.xscores
##             [,1]       [,2]
## RI    -0.6942150 -0.2043748
## CT    -0.6221488 -0.3722150
## SE    -0.9342900  0.2564751
## ESL   -0.5340775 -0.5842506
## FCT   -0.4983808  0.3223009
## Trust -0.5374234 -0.1935407
## 
## $corr.Y.xscores
##           [,1]        [,2]
## AEP -0.4106778 -0.08352988
## AMP -0.4578121 -0.02026293
## 
## $corr.X.yscores
##             [,1]        [,2]
## RI    -0.3197547 -0.03770262
## CT    -0.2865611 -0.06866542
## SE    -0.4303331  0.04731399
## ESL   -0.2459956 -0.10778131
## FCT   -0.2295537  0.05945739
## Trust -0.2475367 -0.03570397
## 
## $corr.Y.yscores
##           [,1]       [,2]
## AEP -0.8916168 -0.4527908
## AMP -0.9939494 -0.1098394

rho <- CC1$cor
n <- dim(pred)[1]
p <- length(pred)
q <- length(crit)
p.asym(rho, n, p, q, tstat = "Wilks")

## Wilks' Lambda, using F-approximation (Rao's F):
##               stat    approx df1  df2          p.value
## 1 to 2:  0.7610364 29.478912  12 2418 0.00000000000000
## 2 to 2:  0.9659679  8.525919   5 1210 0.00000006035999

#standard coefficient for canonical regression
s1 <- diag(sqrt(diag(cov(pred))))
s1 %*% CC1$xcoef

##            [,1]       [,2]
## [1,] -0.1853930 -0.3861150
## [2,]  0.0303112  0.1763045
## [3,] -1.0133588  0.4119078
## [4,] -0.6179166 -1.4682998
## [5,]  0.0229423  0.4465212
## [6,]  0.6981402  0.6236540

s2 <- diag(sqrt(diag(cov(crit))))
s2 %*% CC1$ycoef

##            [,1]      [,2]
## [1,]  0.3119404 -2.822786
## [2,] -1.2859119  2.532164

MultiVariate Project 2

YanwenXu

10/6/2020

Import Data and creating a new data frame that only contains the needed variables

Change the variables from factors to numerics. Let’s change percentages to decimals. In order to change factors to numerics, we need to change them to characters first.

Running basic descriptives on the data

Test Kurtosis and Skewness

Mutiple regression analysis with Average ELA Proficiency as the criterion variable

Mutiple regression analysis with Average Math Proficiency as the criterion variable

Canonical Regression