part1 <- read.csv("~/Box/Courses/2020 Fall/BER 643 Multivariate/HW/part1.csv")
View(part1)
data <- part1[,2:4]
data
## x1 x2 x3
## 1 2 3 15
## 2 6 8 9
## 3 5 2 7
## 4 9 4 3
## 5 11 10 2
## 6 12 15 1
## 7 1 4 12
## 8 7 3 4
A <- data.frame(data)
A
## x1 x2 x3
## 1 2 3 15
## 2 6 8 9
## 3 5 2 7
## 4 9 4 3
## 5 11 10 2
## 6 12 15 1
## 7 1 4 12
## 8 7 3 4
A <- part1[,2:4]
a_matrix <- as.matrix(A)
meanA <- matrix(c(mean(A$x1), mean(A$x2),mean(A$x3)),3,1)
meanA
## [,1]
## [1,] 6.625
## [2,] 6.125
## [3,] 6.625
x1 <- A$x1
x1
## [1] 2 6 5 9 11 12 1 7
x2 <- A$x2
x2t <- t(x2)
x2t
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 3 8 2 4 10 15 4 3
x32 <- A[3,2]
x32
## [1] 2
x1+x2
## [1] 5 14 7 13 21 27 5 10
n <- matrix(c(1, 1, 1, 1, 1, 1, 1, 1),nrow=8, byrow = FALSE)
xd2 <- matrix(data.frame(x2 - n*mean(x2))[,],8,1)
txd2 <- t(xd2)
var2 <- 1/(8-1)*(txd2%*%xd2)
var2
## [,1]
## [1,] 20.41071
xd <- a_matrix - n%*%t(meanA)
xd
## x1 x2 x3
## [1,] -4.625 -3.125 8.375
## [2,] -0.625 1.875 2.375
## [3,] -1.625 -4.125 0.375
## [4,] 2.375 -2.125 -3.625
## [5,] 4.375 3.875 -4.625
## [6,] 5.375 8.875 -5.625
## [7,] -5.625 -2.125 5.375
## [8,] 0.375 -3.125 -2.625
ss <- t(xd)%*%xd
ss
## x1 x2 x3
## x1 109.875 90.375 -131.125
## x2 90.375 142.875 -86.625
## x3 -131.125 -86.625 177.875
S <- 1/(8-1)*ss
S
## x1 x2 x3
## x1 15.69643 12.91071 -18.73214
## x2 12.91071 20.41071 -12.37500
## x3 -18.73214 -12.37500 25.41071
x1x2 <- part1[,2:3]
x1x2 <- as.matrix(x1x2)
meanx1x2 <- matrix(c(mean(A$x1), mean(A$x2)))
xd12 <- x1x2 - n%*%t(meanx1x2)
cov12 <- 1/(8-1)*t(xd12)%*%xd12
cov12
## x1 x2
## x1 15.69643 12.91071
## x2 12.91071 20.41071
diag(S)
## x1 x2 x3
## 15.69643 20.41071 25.41071
MatrixA <- matrix(c(3,1,0,-2,0,1,4,2,0),3)
MatrixA
## [,1] [,2] [,3]
## [1,] 3 -2 4
## [2,] 1 0 2
## [3,] 0 1 0
MatrixB <- matrix(c(1,0,-1/2,-2,0,3/2,2,1,-1),3)
MatrixB
## [,1] [,2] [,3]
## [1,] 1.0 -2.0 2
## [2,] 0.0 0.0 1
## [3,] -0.5 1.5 -1
solve(MatrixA)
## [,1] [,2] [,3]
## [1,] 1.0 -2.0 2
## [2,] 0.0 0.0 1
## [3,] -0.5 1.5 -1
solve(MatrixB)
## [,1] [,2] [,3]
## [1,] 3 -2 4
## [2,] 1 0 2
## [3,] 0 1 0
solve(MatrixA)-MatrixB
## [,1] [,2] [,3]
## [1,] 0.000000e+00 2.220446e-16 0.000000e+00
## [2,] 0.000000e+00 0.000000e+00 0.000000e+00
## [3,] 5.551115e-17 -2.220446e-16 1.110223e-16
solve(MatrixB)-MatrixA
## [,1] [,2] [,3]
## [1,] 0 0 0
## [2,] 0 0 0
## [3,] 0 0 0
MatrixA%*%MatrixB
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
MatrixB%*%MatrixA
## [,1] [,2] [,3]
## [1,] 1 0 0
## [2,] 0 1 0
## [3,] 0 0 1
`HSB1(1)` <- read.csv("~/Box/Courses/2020 Fall/BER 643 Multivariate/HW/HSB1(1).csv")
library(psych)
data2 = `HSB1(1)`
describe(data2)
## vars n mean sd median trimmed mad min max range skew
## id 1 350 304.04 174.11 302.00 305.52 223.87 1.00 600.00 599.00 -0.05
## female 2 350 0.56 0.50 1.00 0.58 0.00 0.00 1.00 1.00 -0.25
## race 3 350 3.44 1.03 4.00 3.67 0.00 1.00 4.00 3.00 -1.59
## ses 4 350 2.10 0.73 2.00 2.12 1.48 1.00 3.00 2.00 -0.15
## schtype 5 350 1.16 0.36 1.00 1.07 0.00 1.00 2.00 1.00 1.88
## program 6 350 2.01 0.70 2.00 2.01 0.00 1.00 3.00 2.00 -0.02
## locus 7 350 0.09 0.66 0.20 0.12 0.62 -2.23 1.36 3.59 -0.57
## concept 8 350 0.05 0.68 0.03 0.09 0.47 -2.54 1.19 3.73 -0.70
## mot 9 350 0.66 0.35 0.67 0.70 0.49 0.00 1.00 1.00 -0.59
## career 10 350 8.65 4.33 9.00 8.67 2.97 1.00 17.00 16.00 -0.16
## read 11 350 51.93 9.91 52.10 51.73 11.71 33.60 76.00 42.40 0.19
## write 12 350 52.51 9.58 54.10 53.03 11.56 25.50 67.10 41.60 -0.38
## math 13 350 52.34 9.40 52.30 52.13 10.82 32.70 75.50 42.80 0.15
## science 14 350 51.90 9.87 52.60 52.08 12.01 26.00 74.20 48.20 -0.18
## socst 15 350 51.89 9.97 50.60 52.20 12.01 25.70 70.50 44.80 -0.19
## kurtosis se
## id -1.23 9.31
## female -1.94 0.03
## race 0.94 0.05
## ses -1.10 0.04
## schtype 1.52 0.02
## program -0.95 0.04
## locus 0.22 0.04
## concept 1.01 0.04
## mot -0.93 0.02
## career -0.57 0.23
## read -0.76 0.53
## write -0.86 0.51
## math -0.80 0.50
## science -0.67 0.53
## socst -0.65 0.53
sim_reg <- lm(math~mot,data2)
summary(sim_reg)
##
## Call:
## lm(formula = math ~ mot, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.5802 -7.6584 0.1182 6.9190 25.0166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.613 1.054 46.138 < 2e-16 ***
## mot 5.667 1.415 4.004 7.63e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.199 on 348 degrees of freedom
## Multiple R-squared: 0.04403, Adjusted R-squared: 0.04129
## F-statistic: 16.03 on 1 and 348 DF, p-value: 7.627e-05
anova(sim_reg)
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## mot 1 1356.5 1356.47 16.029 7.627e-05 ***
## Residuals 348 29449.3 84.62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qt(0.05, 348)
## [1] -1.649244
confint(sim_reg,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 46.541067 50.685700
## mot 2.883009 8.450722
library(robustHD)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
hist(standardize(sim_reg$residuals), breaks = 20)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dwtest(math~mot, data=data2)
##
## Durbin-Watson test
##
## data: math ~ mot
## DW = 1.6503, p-value = 0.0005133
## alternative hypothesis: true autocorrelation is greater than 0
plot(sim_reg)
Run a multiple regression (Dependent variable: read (reading score); Independent variable: locus (locus of control), concept (self-concept), and ses (social economic status). Write down the population model and the estimated model (once you have estimated it)
Obtain the table of parameter estimates, the ANOVA table, and the adjusted R2 for the linear regression models from R software and provide the output (screenshot of the results).
data2$ses.f = factor(data2$ses, labels=c("Low", "Middle", "High"))
data2$ses.ct <- C(data2$ses.f, contr.treatment)
attributes(data$ses.ct)
## NULL
mul_reg1 = lm(read ~ locus + concept + ses.ct, data2)
summary(mul_reg1)
##
## Call:
## lm(formula = read ~ locus + concept + ses.ct, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.4565 -7.0965 -0.1387 6.4192 23.1501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.1824 1.0512 45.838 < 2e-16 ***
## locus 4.6425 0.7679 6.045 3.87e-09 ***
## concept -0.1559 0.7290 -0.214 0.83074
## ses.ct2 3.4336 1.2747 2.694 0.00741 **
## ses.ct3 5.5288 1.3925 3.970 8.73e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.133 on 345 degrees of freedom
## Multiple R-squared: 0.1608, Adjusted R-squared: 0.1511
## F-statistic: 16.53 on 4 and 345 DF, p-value: 2.112e-12
anova(mul_reg1)
## Analysis of Variance Table
##
## Response: read
## Df Sum Sq Mean Sq F value Pr(>F)
## locus 1 4195.6 4195.6 50.2982 7.524e-12 ***
## concept 1 0.4 0.4 0.0045 0.9463439
## ses.ct 2 1318.4 659.2 7.9029 0.0004407 ***
## Residuals 345 28777.9 83.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mul_reg1)
hist(mul_reg1$residuals,breaks = 20)
6. Rerun the model without the concept (self-concept) predictor, and evaluate whether adding concept could improve the overall model fit. Provide R output.
mul_reg2 = lm(read ~ locus + ses.ct, data2)
summary(mul_reg2)
##
## Call:
## lm(formula = read ~ locus + ses.ct, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.575 -7.112 -0.183 6.484 23.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.1804 1.0497 45.901 < 2e-16 ***
## locus 4.6160 0.7568 6.099 2.85e-09 ***
## ses.ct2 3.4335 1.2729 2.697 0.00733 **
## ses.ct3 5.5173 1.3895 3.971 8.72e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.121 on 346 degrees of freedom
## Multiple R-squared: 0.1607, Adjusted R-squared: 0.1534
## F-statistic: 22.08 on 3 and 346 DF, p-value: 4.169e-13
anova(mul_reg2, mul_reg1)
## Analysis of Variance Table
##
## Model 1: read ~ locus + ses.ct
## Model 2: read ~ locus + concept + ses.ct
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 346 28782
## 2 345 28778 1 3.8169 0.0458 0.8307