suppressPackageStartupMessages(library("tidyr"))
suppressPackageStartupMessages(library("ggplot2"))
suppressPackageStartupMessages(library("dplyr"))

Problem 1

prob1 <- read.csv("prob1.csv", header=TRUE)

y = as.vector(prob1$y)
X = as.matrix(cbind(1, prob1[,c("x1", "x2")]))
XTranspose.X <- t(X) %*% X
XTranspose.X.Inverse <- round(solve(t(X) %*% X), digits=2)
XTranspose.y <- t(X) %*% y
BetaHat <- XTranspose.X.Inverse %*% XTranspose.y
e <- y - predict(lm(y~x1+x2, data=prob1))

y
## [1] 2 2 6 1 6 9 8 1
X
##      1 x1 x2
## [1,] 1  0 -3
## [2,] 1  0 -1
## [3,] 1  0  2
## [4,] 1  0  2
## [5,] 1  1  1
## [6,] 1  1  3
## [7,] 1  1  0
## [8,] 1  1 -4
XTranspose.X
##    1 x1 x2
## 1  8  4  0
## x1 4  4  0
## x2 0  0 44
XTranspose.X.Inverse
##        1    x1   x2
## 1   0.25 -0.25 0.00
## x1 -0.25  0.50 0.00
## x2  0.00  0.00 0.02
XTranspose.y
##    [,1]
## 1    35
## x1   24
## x2   35
BetaHat
##    [,1]
## 1  2.75
## x1 3.25
## x2 0.70
e
##           1           2           3           4           5           6 
##  1.63636364  0.04545455  1.65909091 -3.34090909 -0.79545455  0.61363636 
##           7           8 
##  2.00000000 -1.81818182
rm(list=ls())

Problem 2

Beta0Hat <- 5.62
Beta1Hat <- 0.82
Beta2Hat <- -1.327

V <- matrix(c(.160,-.110,-.022,-.110, .170, -.009,-.022,-.009,0.014), nrow=3,ncol=3)
s.squared <- 2.82
s <- sqrt(s.squared)

Part A

YHatStar <- Beta0Hat + Beta1Hat*1 + Beta2Hat*2
YHatStar
## [1] 3.786
xstar <- c(1, 1, 2)
xstar.transpose.V.xstar <- t(xstar) %*% V %*% xstar
standard.error.yhat <- round(s * sqrt(xstar.transpose.V.xstar), digits=3)
standard.error.yhat
##       [,1]
## [1,] 0.344

Part B

lower.ci <- round(YHatStar + qt(.025, 34-3) * standard.error.yhat, digits=3)
upper.ci <- round(YHatStar - qt(.025, 34-3) * standard.error.yhat, digits=3)
c(lower.ci, upper.ci)
## [1] 3.084 4.488

Part C

lower.pi <- round(YHatStar + qt(.025, 34-3)*s*sqrt(1 + xstar.transpose.V.xstar), digits=3)
upper.pi <- round(YHatStar - qt(.025, 34-3)*s*sqrt(1 + xstar.transpose.V.xstar), digits=3)
c(lower.pi, upper.pi)
## [1] 0.290 7.282
rm(list=ls())

Problem 3

prob3 <- read.csv("prob3.csv", header=TRUE)
prob3 <- mutate(prob3, time = Year - 1920)

Part A

model3a <- lm(Male~poly(time, 2, raw = TRUE), data=prob3)
summary(model3a)
## 
## Call:
## lm(formula = Male ~ poly(time, 2, raw = TRUE), data = prob3)
## 
## Residuals:
##        1        2        3        4        5        6        7        8 
## -0.36667  0.28571 -0.40000  1.47619  0.01429 -1.48571 -0.12381  0.60000 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                53.9666667  0.8531687  63.254 1.87e-08 ***
## poly(time, 2, raw = TRUE)1  0.4078571  0.0569376   7.163 0.000824 ***
## poly(time, 2, raw = TRUE)2 -0.0023095  0.0007821  -2.953 0.031776 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.014 on 5 degrees of freedom
## Multiple R-squared:  0.9809, Adjusted R-squared:  0.9732 
## F-statistic: 128.2 on 2 and 5 DF,  p-value: 5.058e-05

Part B

Null Hypothesis: Beta2 or the coefficient of the time^2 term = 0

Alternative Hypothesis: Beta2 or the coefficient of the time^2 term != 0

P Value = 0.031776 which is greater than alpha 0.01 -> fail to reject the null hypothesis.

Thus, the quadratic term is not signficant at alpha = 0.01.

Part C

model3b <- lm(Female~poly(time, 2, raw = TRUE), data=prob3)
summary(model3b)
## 
## Call:
## lm(formula = Female ~ poly(time, 2, raw = TRUE), data = prob3)
## 
## Residuals:
##       1       2       3       4       5       6       7       8 
## -0.5083  0.7440 -0.5964  1.1702 -0.1560 -1.0750  0.0131  0.4083 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                55.108333   0.741782  74.292 8.37e-09 ***
## poly(time, 2, raw = TRUE)1  0.615119   0.049504  12.426 5.98e-05 ***
## poly(time, 2, raw = TRUE)2 -0.004036   0.000680  -5.935  0.00194 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8814 on 5 degrees of freedom
## Multiple R-squared:  0.9922, Adjusted R-squared:  0.989 
## F-statistic: 316.7 on 2 and 5 DF,  p-value: 5.429e-06

Null Hypothesis: Beta2 or the coefficient of the time^2 term = 0

Alternative Hypothesis: Beta2 or the coefficient of the time^2 term != 0

P Value = 0.00194 which is less than alpha = 0.01 -> reject null hypothesis.

Thus, the quadratic term IS significant at alpha = 0.01.

rm(list=ls())

Problem 4

prob4 <- read.csv("prob4.csv", header=TRUE)

model4 <- lm(y~x1+x2,data=prob4)
summary(model4)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = prob4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10980 -0.20134  0.05034  0.22738  0.74313 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.570537   0.493749  -3.181  0.00297 ** 
## x1           0.025732   0.004024   6.395 1.83e-07 ***
## x2           0.033615   0.004928   6.822 4.90e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4023 on 37 degrees of freedom
## Multiple R-squared:  0.6811, Adjusted R-squared:  0.6638 
## F-statistic: 39.51 on 2 and 37 DF,  p-value: 6.585e-10

95% CI for the Verbal Coefficient is given by

round(confint(model4, 'x1', level=0.95), digits=3)
##    2.5 % 97.5 %
## x1 0.018  0.034

95% CI for the Math Coefficient is given by

round(confint(model4, 'x2', level=0.95), digits=3)
##    2.5 % 97.5 %
## x2 0.024  0.044

Problem 5

#model5 <- lm(y~poly(x1, 2, raw = TRUE)+poly(x2, 2, raw = TRUE), data=prob4)
# model5 <- lm(y~poly(x1+x2, 2, raw = TRUE), data=prob4)
model5 <- lm(y~x1+x2+I(x1^2)+I(x2^2)+(x1*x2),data=prob4) 

summary(model5)
## 
## Call:
## lm(formula = y ~ x1 + x2 + I(x1^2) + I(x2^2) + (x1 * x2), data = prob4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.39203 -0.10935 -0.04432  0.09508  0.42601 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.9167631  1.3544134  -7.322 1.75e-08 ***
## x1           0.1668098  0.0212447   7.852 3.85e-09 ***
## x2           0.1375972  0.0267340   5.147 1.11e-05 ***
## I(x1^2)     -0.0011082  0.0001173  -9.449 4.88e-11 ***
## I(x2^2)     -0.0008433  0.0001594  -5.290 7.23e-06 ***
## x1:x2        0.0002411  0.0001440   1.675    0.103    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1871 on 34 degrees of freedom
## Multiple R-squared:  0.9366, Adjusted R-squared:  0.9272 
## F-statistic: 100.4 on 5 and 34 DF,  p-value: < 2.2e-16
# http://www.unesco.org/webworld/idams/advguide/Chapt5_2.htm

SSE <- function(input.lm) { return(sum(input.lm$residuals^2)) } # SST - SSR
SST <- function(input.lm) {
  y <- input.lm$fitted.values + input.lm$residuals
  return(sum((y - mean(y))^2)) 
}
SSR <- function(input.lm) { 
  y <- input.lm$fitted.values + input.lm$residuals
  return(sum((predict(input.lm) - mean(y))^2)) 
}
MSE <- function(input.lm) { return (SSE(input.lm)/model5$df) }
MSR <- function(input.lm) {
  p <- length(model5$coefficients)
  return (SSR(input.lm)/(p-1))
}

SSE.linear <- SSE(model4)
SSR.linear <- SSR(model4)
SST.linear <- SST(model4)
c(SSE.linear, SSR.linear, SST.linear)
## [1]  5.987552 12.785946 18.773497
SSE.quad <- SSE(model5)
SSR.quad <- SSR(model5)
SST.quad <- SST(model5)
c(SSE.quad, SSR.quad, SST.quad)
## [1]  1.190756 17.582741 18.773498
Ftest <- function(F.lm, R.lm) {
  SSE.F <- SSE(F.lm)
  SSE.R <- SSE(R.lm)
  df.N <- R.lm$df - F.lm$df
  df.D <- F.lm$df
  SSE.N <- SSE.R - SSE.F
  SSE.D <- SSE.F 
  F <- (SSE.N / df.N) / (SSE.D / df.D)
  pval <- 1 - pf(F, df.N, df.D)
  return(data.frame(F, df.N, df.D, pval))
}

c(SSE.linear, SSE.quad)
## [1] 5.987552 1.190756

Alt

Ftest(model5, model4)
##          F df.N df.D         pval
## 1 45.65476    3   34 5.100809e-12
anova(model5, model4)
## Analysis of Variance Table
## 
## Model 1: y ~ x1 + x2 + I(x1^2) + I(x2^2) + (x1 * x2)
## Model 2: y ~ x1 + x2
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     34 1.1908                                  
## 2     37 5.9876 -3   -4.7968 45.655 5.101e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MSR(model5)/MSE(model5)
## [1] 100.409

Null Hypothesis: B3 = B4 = B5 = 0

Alternative Hypothesis: At least one of B3, B4, B5 != 0

Since F > f_3,34,0.05 = 2.92, we reject H_0 and conclude that the quadratic fit is signficantly better.

rm(list=ls())

Problem 6

prob6 <- read.csv("prob6.csv", header=TRUE)

Part A

model6a <- lm(y~.,data=prob6)
summary(model6a)
## 
## Call:
## lm(formula = y ~ ., data = prob6)
## 
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (3 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.5361         NA      NA       NA
## x1            0.8962         NA      NA       NA
## x2           -0.4413         NA      NA       NA
## x3            0.5443         NA      NA       NA
## x4           -0.2494         NA      NA       NA
## x5           -0.2364         NA      NA       NA
## x6           -0.5523         NA      NA       NA
## x7            0.2164         NA      NA       NA
## x8           -0.7080         NA      NA       NA
## x9           -0.4114         NA      NA       NA
## x10               NA         NA      NA       NA
## x11               NA         NA      NA       NA
## x12               NA         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 9 and 0 DF,  p-value: NA

Because there are more parameters than the number of observations (k+1 > 10), the X’X matrix is not invertible. This means the least squares solution cannot be calculated. Part B

for (i in 11:9) {
  f <- (fmla <- as.formula(paste("y ~ ", paste(paste0("x", 1:i), collapse= "+"))))
  print(f)
  print(summary(lm(f, data=prob6)))
}
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11
## 
## Call:
## lm(formula = f, data = prob6)
## 
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (2 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.5361         NA      NA       NA
## x1            0.8962         NA      NA       NA
## x2           -0.4413         NA      NA       NA
## x3            0.5443         NA      NA       NA
## x4           -0.2494         NA      NA       NA
## x5           -0.2364         NA      NA       NA
## x6           -0.5523         NA      NA       NA
## x7            0.2164         NA      NA       NA
## x8           -0.7080         NA      NA       NA
## x9           -0.4114         NA      NA       NA
## x10               NA         NA      NA       NA
## x11               NA         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 9 and 0 DF,  p-value: NA
## 
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10
## 
## Call:
## lm(formula = f, data = prob6)
## 
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.5361         NA      NA       NA
## x1            0.8962         NA      NA       NA
## x2           -0.4413         NA      NA       NA
## x3            0.5443         NA      NA       NA
## x4           -0.2494         NA      NA       NA
## x5           -0.2364         NA      NA       NA
## x6           -0.5523         NA      NA       NA
## x7            0.2164         NA      NA       NA
## x8           -0.7080         NA      NA       NA
## x9           -0.4114         NA      NA       NA
## x10               NA         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 9 and 0 DF,  p-value: NA
## 
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9
## 
## Call:
## lm(formula = f, data = prob6)
## 
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.5361         NA      NA       NA
## x1            0.8962         NA      NA       NA
## x2           -0.4413         NA      NA       NA
## x3            0.5443         NA      NA       NA
## x4           -0.2494         NA      NA       NA
## x5           -0.2364         NA      NA       NA
## x6           -0.5523         NA      NA       NA
## x7            0.2164         NA      NA       NA
## x8           -0.7080         NA      NA       NA
## x9           -0.4114         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 9 and 0 DF,  p-value: NA

When k=9 predictors, k+1 == 10, thus X`X becomes invertible and you can calculate a least squares solution. However, some quantaties still can’t be calculated such as t-statistics and P-values. This is because with the same number of paramaters to estimate (k+1) and observations, you will be able to get a perfect fit with SSE = 0. The model will be overfit and one should have no confidence in the regression model.

rm(list=ls())