suppressPackageStartupMessages(library("tidyr"))
suppressPackageStartupMessages(library("ggplot2"))
suppressPackageStartupMessages(library("dplyr"))
prob1 <- read.csv("prob1.csv", header=TRUE)
y = as.vector(prob1$y)
X = as.matrix(cbind(1, prob1[,c("x1", "x2")]))
XTranspose.X <- t(X) %*% X
XTranspose.X.Inverse <- round(solve(t(X) %*% X), digits=2)
XTranspose.y <- t(X) %*% y
BetaHat <- XTranspose.X.Inverse %*% XTranspose.y
e <- y - predict(lm(y~x1+x2, data=prob1))
y
## [1] 2 2 6 1 6 9 8 1
X
## 1 x1 x2
## [1,] 1 0 -3
## [2,] 1 0 -1
## [3,] 1 0 2
## [4,] 1 0 2
## [5,] 1 1 1
## [6,] 1 1 3
## [7,] 1 1 0
## [8,] 1 1 -4
XTranspose.X
## 1 x1 x2
## 1 8 4 0
## x1 4 4 0
## x2 0 0 44
XTranspose.X.Inverse
## 1 x1 x2
## 1 0.25 -0.25 0.00
## x1 -0.25 0.50 0.00
## x2 0.00 0.00 0.02
XTranspose.y
## [,1]
## 1 35
## x1 24
## x2 35
BetaHat
## [,1]
## 1 2.75
## x1 3.25
## x2 0.70
e
## 1 2 3 4 5 6
## 1.63636364 0.04545455 1.65909091 -3.34090909 -0.79545455 0.61363636
## 7 8
## 2.00000000 -1.81818182
rm(list=ls())
Beta0Hat <- 5.62
Beta1Hat <- 0.82
Beta2Hat <- -1.327
V <- matrix(c(.160,-.110,-.022,-.110, .170, -.009,-.022,-.009,0.014), nrow=3,ncol=3)
s.squared <- 2.82
s <- sqrt(s.squared)
Part A
YHatStar <- Beta0Hat + Beta1Hat*1 + Beta2Hat*2
YHatStar
## [1] 3.786
xstar <- c(1, 1, 2)
xstar.transpose.V.xstar <- t(xstar) %*% V %*% xstar
standard.error.yhat <- round(s * sqrt(xstar.transpose.V.xstar), digits=3)
standard.error.yhat
## [,1]
## [1,] 0.344
Part B
lower.ci <- round(YHatStar + qt(.025, 34-3) * standard.error.yhat, digits=3)
upper.ci <- round(YHatStar - qt(.025, 34-3) * standard.error.yhat, digits=3)
c(lower.ci, upper.ci)
## [1] 3.084 4.488
Part C
lower.pi <- round(YHatStar + qt(.025, 34-3)*s*sqrt(1 + xstar.transpose.V.xstar), digits=3)
upper.pi <- round(YHatStar - qt(.025, 34-3)*s*sqrt(1 + xstar.transpose.V.xstar), digits=3)
c(lower.pi, upper.pi)
## [1] 0.290 7.282
rm(list=ls())
prob3 <- read.csv("prob3.csv", header=TRUE)
prob3 <- mutate(prob3, time = Year - 1920)
Part A
model3a <- lm(Male~poly(time, 2, raw = TRUE), data=prob3)
summary(model3a)
##
## Call:
## lm(formula = Male ~ poly(time, 2, raw = TRUE), data = prob3)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -0.36667 0.28571 -0.40000 1.47619 0.01429 -1.48571 -0.12381 0.60000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.9666667 0.8531687 63.254 1.87e-08 ***
## poly(time, 2, raw = TRUE)1 0.4078571 0.0569376 7.163 0.000824 ***
## poly(time, 2, raw = TRUE)2 -0.0023095 0.0007821 -2.953 0.031776 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.014 on 5 degrees of freedom
## Multiple R-squared: 0.9809, Adjusted R-squared: 0.9732
## F-statistic: 128.2 on 2 and 5 DF, p-value: 5.058e-05
Part B
Null Hypothesis: Beta2 or the coefficient of the time^2 term = 0
Alternative Hypothesis: Beta2 or the coefficient of the time^2 term != 0
P Value = 0.031776 which is greater than alpha 0.01 -> fail to reject the null hypothesis.
Thus, the quadratic term is not signficant at alpha = 0.01.
Part C
model3b <- lm(Female~poly(time, 2, raw = TRUE), data=prob3)
summary(model3b)
##
## Call:
## lm(formula = Female ~ poly(time, 2, raw = TRUE), data = prob3)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -0.5083 0.7440 -0.5964 1.1702 -0.1560 -1.0750 0.0131 0.4083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.108333 0.741782 74.292 8.37e-09 ***
## poly(time, 2, raw = TRUE)1 0.615119 0.049504 12.426 5.98e-05 ***
## poly(time, 2, raw = TRUE)2 -0.004036 0.000680 -5.935 0.00194 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8814 on 5 degrees of freedom
## Multiple R-squared: 0.9922, Adjusted R-squared: 0.989
## F-statistic: 316.7 on 2 and 5 DF, p-value: 5.429e-06
Null Hypothesis: Beta2 or the coefficient of the time^2 term = 0
Alternative Hypothesis: Beta2 or the coefficient of the time^2 term != 0
P Value = 0.00194 which is less than alpha = 0.01 -> reject null hypothesis.
Thus, the quadratic term IS significant at alpha = 0.01.
rm(list=ls())
prob4 <- read.csv("prob4.csv", header=TRUE)
model4 <- lm(y~x1+x2,data=prob4)
summary(model4)
##
## Call:
## lm(formula = y ~ x1 + x2, data = prob4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10980 -0.20134 0.05034 0.22738 0.74313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.570537 0.493749 -3.181 0.00297 **
## x1 0.025732 0.004024 6.395 1.83e-07 ***
## x2 0.033615 0.004928 6.822 4.90e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4023 on 37 degrees of freedom
## Multiple R-squared: 0.6811, Adjusted R-squared: 0.6638
## F-statistic: 39.51 on 2 and 37 DF, p-value: 6.585e-10
95% CI for the Verbal Coefficient is given by
round(confint(model4, 'x1', level=0.95), digits=3)
## 2.5 % 97.5 %
## x1 0.018 0.034
95% CI for the Math Coefficient is given by
round(confint(model4, 'x2', level=0.95), digits=3)
## 2.5 % 97.5 %
## x2 0.024 0.044
#model5 <- lm(y~poly(x1, 2, raw = TRUE)+poly(x2, 2, raw = TRUE), data=prob4)
# model5 <- lm(y~poly(x1+x2, 2, raw = TRUE), data=prob4)
model5 <- lm(y~x1+x2+I(x1^2)+I(x2^2)+(x1*x2),data=prob4)
summary(model5)
##
## Call:
## lm(formula = y ~ x1 + x2 + I(x1^2) + I(x2^2) + (x1 * x2), data = prob4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39203 -0.10935 -0.04432 0.09508 0.42601
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.9167631 1.3544134 -7.322 1.75e-08 ***
## x1 0.1668098 0.0212447 7.852 3.85e-09 ***
## x2 0.1375972 0.0267340 5.147 1.11e-05 ***
## I(x1^2) -0.0011082 0.0001173 -9.449 4.88e-11 ***
## I(x2^2) -0.0008433 0.0001594 -5.290 7.23e-06 ***
## x1:x2 0.0002411 0.0001440 1.675 0.103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1871 on 34 degrees of freedom
## Multiple R-squared: 0.9366, Adjusted R-squared: 0.9272
## F-statistic: 100.4 on 5 and 34 DF, p-value: < 2.2e-16
# http://www.unesco.org/webworld/idams/advguide/Chapt5_2.htm
SSE <- function(input.lm) { return(sum(input.lm$residuals^2)) } # SST - SSR
SST <- function(input.lm) {
y <- input.lm$fitted.values + input.lm$residuals
return(sum((y - mean(y))^2))
}
SSR <- function(input.lm) {
y <- input.lm$fitted.values + input.lm$residuals
return(sum((predict(input.lm) - mean(y))^2))
}
MSE <- function(input.lm) { return (SSE(input.lm)/model5$df) }
MSR <- function(input.lm) {
p <- length(model5$coefficients)
return (SSR(input.lm)/(p-1))
}
SSE.linear <- SSE(model4)
SSR.linear <- SSR(model4)
SST.linear <- SST(model4)
c(SSE.linear, SSR.linear, SST.linear)
## [1] 5.987552 12.785946 18.773497
SSE.quad <- SSE(model5)
SSR.quad <- SSR(model5)
SST.quad <- SST(model5)
c(SSE.quad, SSR.quad, SST.quad)
## [1] 1.190756 17.582741 18.773498
Ftest <- function(F.lm, R.lm) {
SSE.F <- SSE(F.lm)
SSE.R <- SSE(R.lm)
df.N <- R.lm$df - F.lm$df
df.D <- F.lm$df
SSE.N <- SSE.R - SSE.F
SSE.D <- SSE.F
F <- (SSE.N / df.N) / (SSE.D / df.D)
pval <- 1 - pf(F, df.N, df.D)
return(data.frame(F, df.N, df.D, pval))
}
c(SSE.linear, SSE.quad)
## [1] 5.987552 1.190756
Ftest(model5, model4)
## F df.N df.D pval
## 1 45.65476 3 34 5.100809e-12
anova(model5, model4)
## Analysis of Variance Table
##
## Model 1: y ~ x1 + x2 + I(x1^2) + I(x2^2) + (x1 * x2)
## Model 2: y ~ x1 + x2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 34 1.1908
## 2 37 5.9876 -3 -4.7968 45.655 5.101e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MSR(model5)/MSE(model5)
## [1] 100.409
Null Hypothesis: B3 = B4 = B5 = 0
Alternative Hypothesis: At least one of B3, B4, B5 != 0
Since F > f_3,34,0.05 = 2.92, we reject H_0 and conclude that the quadratic fit is signficantly better.
rm(list=ls())
prob6 <- read.csv("prob6.csv", header=TRUE)
Part A
model6a <- lm(y~.,data=prob6)
summary(model6a)
##
## Call:
## lm(formula = y ~ ., data = prob6)
##
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5361 NA NA NA
## x1 0.8962 NA NA NA
## x2 -0.4413 NA NA NA
## x3 0.5443 NA NA NA
## x4 -0.2494 NA NA NA
## x5 -0.2364 NA NA NA
## x6 -0.5523 NA NA NA
## x7 0.2164 NA NA NA
## x8 -0.7080 NA NA NA
## x9 -0.4114 NA NA NA
## x10 NA NA NA NA
## x11 NA NA NA NA
## x12 NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 9 and 0 DF, p-value: NA
Because there are more parameters than the number of observations (k+1 > 10), the X’X matrix is not invertible. This means the least squares solution cannot be calculated. Part B
for (i in 11:9) {
f <- (fmla <- as.formula(paste("y ~ ", paste(paste0("x", 1:i), collapse= "+"))))
print(f)
print(summary(lm(f, data=prob6)))
}
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11
##
## Call:
## lm(formula = f, data = prob6)
##
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5361 NA NA NA
## x1 0.8962 NA NA NA
## x2 -0.4413 NA NA NA
## x3 0.5443 NA NA NA
## x4 -0.2494 NA NA NA
## x5 -0.2364 NA NA NA
## x6 -0.5523 NA NA NA
## x7 0.2164 NA NA NA
## x8 -0.7080 NA NA NA
## x9 -0.4114 NA NA NA
## x10 NA NA NA NA
## x11 NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 9 and 0 DF, p-value: NA
##
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10
##
## Call:
## lm(formula = f, data = prob6)
##
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5361 NA NA NA
## x1 0.8962 NA NA NA
## x2 -0.4413 NA NA NA
## x3 0.5443 NA NA NA
## x4 -0.2494 NA NA NA
## x5 -0.2364 NA NA NA
## x6 -0.5523 NA NA NA
## x7 0.2164 NA NA NA
## x8 -0.7080 NA NA NA
## x9 -0.4114 NA NA NA
## x10 NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 9 and 0 DF, p-value: NA
##
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9
##
## Call:
## lm(formula = f, data = prob6)
##
## Residuals:
## ALL 10 residuals are 0: no residual degrees of freedom!
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5361 NA NA NA
## x1 0.8962 NA NA NA
## x2 -0.4413 NA NA NA
## x3 0.5443 NA NA NA
## x4 -0.2494 NA NA NA
## x5 -0.2364 NA NA NA
## x6 -0.5523 NA NA NA
## x7 0.2164 NA NA NA
## x8 -0.7080 NA NA NA
## x9 -0.4114 NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 9 and 0 DF, p-value: NA
When k=9 predictors, k+1 == 10, thus X`X becomes invertible and you can calculate a least squares solution. However, some quantaties still can’t be calculated such as t-statistics and P-values. This is because with the same number of paramaters to estimate (k+1) and observations, you will be able to get a perfect fit with SSE = 0. The model will be overfit and one should have no confidence in the regression model.
rm(list=ls())