PART 1

part1 <- read.csv("~/Box/Courses/2020 Fall/BER 643 Multivariate/HW/part1.csv")
View(part1)
data <- part1[,2:4]
data
##   x1 x2 x3
## 1  2  3 15
## 2  6  8  9
## 3  5  2  7
## 4  9  4  3
## 5 11 10  2
## 6 12 15  1
## 7  1  4 12
## 8  7  3  4
A <- data.frame(data)
A
##   x1 x2 x3
## 1  2  3 15
## 2  6  8  9
## 3  5  2  7
## 4  9  4  3
## 5 11 10  2
## 6 12 15  1
## 7  1  4 12
## 8  7  3  4
A <- part1[,2:4]
a_matrix <- as.matrix(A)
  1. The mean vector
meanA <- matrix(c(mean(A$x1), mean(A$x2),mean(A$x3)),3,1)
meanA
##       [,1]
## [1,] 6.625
## [2,] 6.125
## [3,] 6.625
  1. The vector x1 and the vector \(x_2^T\)
x1 <- A$x1
x1
## [1]  2  6  5  9 11 12  1  7
x2 <- A$x2 
x2t <- t(x2)
x2t
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    3    8    2    4   10   15    4    3
  1. The value of x32
x32 <- A[3,2]
x32
## [1] 2
  1. x1+x2
x1+x2
## [1]  5 14  7 13 21 27  5 10
  1. The variance of x2
n <- matrix(c(1, 1, 1, 1, 1, 1, 1, 1),nrow=8, byrow = FALSE)
xd2 <- matrix(data.frame(x2 - n*mean(x2))[,],8,1)
txd2 <- t(xd2)
var2 <- 1/(8-1)*(txd2%*%xd2)
var2
##          [,1]
## [1,] 20.41071
  1. The mean-differenced matrix Xd
xd <- a_matrix - n%*%t(meanA)
xd
##          x1     x2     x3
## [1,] -4.625 -3.125  8.375
## [2,] -0.625  1.875  2.375
## [3,] -1.625 -4.125  0.375
## [4,]  2.375 -2.125 -3.625
## [5,]  4.375  3.875 -4.625
## [6,]  5.375  8.875 -5.625
## [7,] -5.625 -2.125  5.375
## [8,]  0.375 -3.125 -2.625
  1. The sum of squares matrix X′d Xd
ss <- t(xd)%*%xd
ss
##          x1      x2       x3
## x1  109.875  90.375 -131.125
## x2   90.375 142.875  -86.625
## x3 -131.125 -86.625  177.875
  1. The covariance matrix \(S = (n −1)^{−1}X′_dX_d\)
S <- 1/(8-1)*ss
S
##           x1        x2        x3
## x1  15.69643  12.91071 -18.73214
## x2  12.91071  20.41071 -12.37500
## x3 -18.73214 -12.37500  25.41071
  1. The covariance between x1 and x2
x1x2 <- part1[,2:3]
x1x2 <- as.matrix(x1x2)
meanx1x2 <- matrix(c(mean(A$x1), mean(A$x2)))
xd12 <- x1x2 - n%*%t(meanx1x2)
cov12 <- 1/(8-1)*t(xd12)%*%xd12
cov12
##          x1       x2
## x1 15.69643 12.91071
## x2 12.91071 20.41071
  1. The variance components of S (using a vector expression: diag(S) = )
diag(S)
##       x1       x2       x3 
## 15.69643 20.41071 25.41071
  1. Proof matrix A is the inverse of the matrix B, and vice versa.
MatrixA <- matrix(c(3,1,0,-2,0,1,4,2,0),3)
MatrixA
##      [,1] [,2] [,3]
## [1,]    3   -2    4
## [2,]    1    0    2
## [3,]    0    1    0
MatrixB <- matrix(c(1,0,-1/2,-2,0,3/2,2,1,-1),3)
MatrixB
##      [,1] [,2] [,3]
## [1,]  1.0 -2.0    2
## [2,]  0.0  0.0    1
## [3,] -0.5  1.5   -1
solve(MatrixA)
##      [,1] [,2] [,3]
## [1,]  1.0 -2.0    2
## [2,]  0.0  0.0    1
## [3,] -0.5  1.5   -1
solve(MatrixB)
##      [,1] [,2] [,3]
## [1,]    3   -2    4
## [2,]    1    0    2
## [3,]    0    1    0
solve(MatrixA)-MatrixB
##              [,1]          [,2]         [,3]
## [1,] 0.000000e+00  2.220446e-16 0.000000e+00
## [2,] 0.000000e+00  0.000000e+00 0.000000e+00
## [3,] 5.551115e-17 -2.220446e-16 1.110223e-16
solve(MatrixB)-MatrixA
##      [,1] [,2] [,3]
## [1,]    0    0    0
## [2,]    0    0    0
## [3,]    0    0    0
MatrixA%*%MatrixB
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1
MatrixB%*%MatrixA
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

PART 2 : Initial model: Use data set HSB1

Task 1

  1. Run a simple regression (Dependent variable: math; Independent variable: mot). Write down the population model and the estimated model (once you have estimated it).
`HSB1(1)` <- read.csv("~/Box/Courses/2020 Fall/BER 643 Multivariate/HW/HSB1(1).csv")
library(psych)
data2 = `HSB1(1)`
describe(data2)
##         vars   n   mean     sd median trimmed    mad   min    max  range  skew
## id         1 350 304.04 174.11 302.00  305.52 223.87  1.00 600.00 599.00 -0.05
## female     2 350   0.56   0.50   1.00    0.58   0.00  0.00   1.00   1.00 -0.25
## race       3 350   3.44   1.03   4.00    3.67   0.00  1.00   4.00   3.00 -1.59
## ses        4 350   2.10   0.73   2.00    2.12   1.48  1.00   3.00   2.00 -0.15
## schtype    5 350   1.16   0.36   1.00    1.07   0.00  1.00   2.00   1.00  1.88
## program    6 350   2.01   0.70   2.00    2.01   0.00  1.00   3.00   2.00 -0.02
## locus      7 350   0.09   0.66   0.20    0.12   0.62 -2.23   1.36   3.59 -0.57
## concept    8 350   0.05   0.68   0.03    0.09   0.47 -2.54   1.19   3.73 -0.70
## mot        9 350   0.66   0.35   0.67    0.70   0.49  0.00   1.00   1.00 -0.59
## career    10 350   8.65   4.33   9.00    8.67   2.97  1.00  17.00  16.00 -0.16
## read      11 350  51.93   9.91  52.10   51.73  11.71 33.60  76.00  42.40  0.19
## write     12 350  52.51   9.58  54.10   53.03  11.56 25.50  67.10  41.60 -0.38
## math      13 350  52.34   9.40  52.30   52.13  10.82 32.70  75.50  42.80  0.15
## science   14 350  51.90   9.87  52.60   52.08  12.01 26.00  74.20  48.20 -0.18
## socst     15 350  51.89   9.97  50.60   52.20  12.01 25.70  70.50  44.80 -0.19
##         kurtosis   se
## id         -1.23 9.31
## female     -1.94 0.03
## race        0.94 0.05
## ses        -1.10 0.04
## schtype     1.52 0.02
## program    -0.95 0.04
## locus       0.22 0.04
## concept     1.01 0.04
## mot        -0.93 0.02
## career     -0.57 0.23
## read       -0.76 0.53
## write      -0.86 0.51
## math       -0.80 0.50
## science    -0.67 0.53
## socst      -0.65 0.53
sim_reg <- lm(math~mot,data2)
summary(sim_reg)
## 
## Call:
## lm(formula = math ~ mot, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.5802  -7.6584   0.1182   6.9190  25.0166 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   48.613      1.054  46.138  < 2e-16 ***
## mot            5.667      1.415   4.004 7.63e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.199 on 348 degrees of freedom
## Multiple R-squared:  0.04403,    Adjusted R-squared:  0.04129 
## F-statistic: 16.03 on 1 and 348 DF,  p-value: 7.627e-05
anova(sim_reg)
## Analysis of Variance Table
## 
## Response: math
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## mot         1  1356.5 1356.47  16.029 7.627e-05 ***
## Residuals 348 29449.3   84.62                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qt(0.05, 348)
## [1] -1.649244
confint(sim_reg,level=0.95)
##                 2.5 %    97.5 %
## (Intercept) 46.541067 50.685700
## mot          2.883009  8.450722
  1. Obtain diagnostic plots to determine whether any of the key model assumptions have been violated to some degree (as much as that is possible with such a small data set). Discuss the implications of your findings
library(robustHD)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
hist(standardize(sim_reg$residuals), breaks = 20)

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
dwtest(math~mot, data=data2)
## 
##  Durbin-Watson test
## 
## data:  math ~ mot
## DW = 1.6503, p-value = 0.0005133
## alternative hypothesis: true autocorrelation is greater than 0
plot(sim_reg)

Task 2

  1. Run a multiple regression (Dependent variable: read (reading score); Independent variable: locus (locus of control), concept (self-concept), and ses (social economic status). Write down the population model and the estimated model (once you have estimated it)

  2. Obtain the table of parameter estimates, the ANOVA table, and the adjusted R2 for the linear regression models from R software and provide the output (screenshot of the results).

data2$ses.f = factor(data2$ses, labels=c("Low", "Middle", "High"))
data2$ses.ct <- C(data2$ses.f, contr.treatment) 
attributes(data$ses.ct)
## NULL
mul_reg1 = lm(read ~ locus + concept + ses.ct, data2)
summary(mul_reg1)
## 
## Call:
## lm(formula = read ~ locus + concept + ses.ct, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.4565  -7.0965  -0.1387   6.4192  23.1501 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  48.1824     1.0512  45.838  < 2e-16 ***
## locus         4.6425     0.7679   6.045 3.87e-09 ***
## concept      -0.1559     0.7290  -0.214  0.83074    
## ses.ct2       3.4336     1.2747   2.694  0.00741 ** 
## ses.ct3       5.5288     1.3925   3.970 8.73e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.133 on 345 degrees of freedom
## Multiple R-squared:  0.1608, Adjusted R-squared:  0.1511 
## F-statistic: 16.53 on 4 and 345 DF,  p-value: 2.112e-12
anova(mul_reg1)
## Analysis of Variance Table
## 
## Response: read
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## locus       1  4195.6  4195.6 50.2982 7.524e-12 ***
## concept     1     0.4     0.4  0.0045 0.9463439    
## ses.ct      2  1318.4   659.2  7.9029 0.0004407 ***
## Residuals 345 28777.9    83.4                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. Obtain diagnostic plots to determine whether any of the key model assumptions have been violated to some degree (as much as that is possible with such a small data set). Discuss the implications of your findings.
plot(mul_reg1)

hist(mul_reg1$residuals,breaks = 20)

6. Rerun the model without the concept (self-concept) predictor, and evaluate whether adding concept could improve the overall model fit. Provide R output.

mul_reg2 = lm(read ~ locus + ses.ct, data2)
summary(mul_reg2)
## 
## Call:
## lm(formula = read ~ locus + ses.ct, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.575  -7.112  -0.183   6.484  23.232 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  48.1804     1.0497  45.901  < 2e-16 ***
## locus         4.6160     0.7568   6.099 2.85e-09 ***
## ses.ct2       3.4335     1.2729   2.697  0.00733 ** 
## ses.ct3       5.5173     1.3895   3.971 8.72e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.121 on 346 degrees of freedom
## Multiple R-squared:  0.1607, Adjusted R-squared:  0.1534 
## F-statistic: 22.08 on 3 and 346 DF,  p-value: 4.169e-13
anova(mul_reg2, mul_reg1)
## Analysis of Variance Table
## 
## Model 1: read ~ locus + ses.ct
## Model 2: read ~ locus + concept + ses.ct
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    346 28782                           
## 2    345 28778  1    3.8169 0.0458 0.8307