fuel <- read.csv("C:/Users/Sabuj Ganguly/OneDrive/Documents/PhD 2nd sem/Linear Model Theory/HW/Fuel.csv")
#Response and predictors
y <- as.matrix(fuel$Fuel)
X <- as.matrix(cbind(1, fuel$Tax, fuel$License, fuel$Inc, fuel$Road))
colnames(X) <- c("Intercept", "Tax", "License", "Inc", "Road")
n <- nrow(X)
p <- ncol(X)
#Least squares
XtX <- t(X) %*% X
Xty <- t(X) %*% y
beta_hat <- solve(XtX, Xty)
# Predicted values and residuals
y_hat <- X %*% beta_hat
e<- y - y_hat
# Covariance matrix
sigma2_hat <- as.numeric(t(e) %*% e) / (n - p)
cov_beta <- sigma2_hat * solve(XtX)
# Sums of squares
SST <- as.numeric(sum((y - mean(y))^2))
SSR <- as.numeric(sum((y_hat - mean(y))^2))
SSE <- as.numeric(sum(e^2))
#Overall F
F_stat <- (SSR / (p - 1)) / (SSE / (n - p))
pf(F_stat, p - 1, n - p, lower.tail = FALSE)
## [1] 3.907167e-10
list(beta_hat = beta_hat,
y_hat = y_hat,
residuals = e,
cov_beta = cov_beta,
SST = SST,
SSR = SSR,
SSE = SSE,
F_stat = F_stat)
## $beta_hat
## [,1]
## Intercept 377.291146
## Tax -34.790149
## License 13.364494
## Inc -66.588752
## Road -2.425889
##
## $y_hat
## [,1]
## [1,] 523.2337
## [2,] 553.1153
## [3,] 578.1074
## [4,] 493.3563
## [5,] 532.0289
## [6,] 433.5514
## [7,] 318.7326
## [8,] 491.5060
## [9,] 489.0246
## [10,] 550.3947
## [11,] 500.4895
## [12,] 442.2533
## [13,] 563.3126
## [14,] 566.0238
## [15,] 638.0660
## [16,] 604.3055
## [17,] 597.4974
## [18,] 596.4035
## [19,] 772.9678
## [20,] 682.7097
## [21,] 694.9795
## [22,] 570.2403
## [23,] 415.0793
## [24,] 460.2215
## [25,] 507.0281
## [26,] 531.9182
## [27,] 588.6488
## [28,] 612.0879
## [29,] 558.0226
## [30,] 471.9829
## [31,] 566.9071
## [32,] 581.4220
## [33,] 651.6498
## [34,] 613.8673
## [35,] 506.4172
## [36,] 716.8238
## [37,] 647.2821
## [38,] 641.9338
## [39,] 717.6483
## [40,] 733.0528
## [41,] 662.8703
## [42,] 633.0654
## [43,] 644.4893
## [44,] 556.9675
## [45,] 713.7995
## [46,] 519.6783
## [47,] 670.3979
## [48,] 569.4385
##
## $residuals
## [,1]
## [1,] 17.766273
## [2,] -29.115303
## [3,] -17.107446
## [4,] -79.356252
## [5,] -122.028926
## [6,] 23.448584
## [7,] 25.267407
## [8,] -24.505956
## [9,] -25.024635
## [10,] -52.394663
## [11,] 79.510451
## [12,] 28.746661
## [13,] -38.312606
## [14,] -58.023774
## [15,] -72.066011
## [16,] 30.694496
## [17,] 5.502618
## [18,] 117.596549
## [19,] 92.032249
## [20,] -42.709730
## [21,] -45.979476
## [22,] -30.240331
## [23,] 48.920694
## [24,] 86.778499
## [25,] -47.028121
## [26,] 34.081760
## [27,] -11.648811
## [28,] 18.912113
## [29,] 15.977437
## [30,] 62.017141
## [31,] 4.092950
## [32,] -27.422002
## [33,] -74.649836
## [34,] 14.132702
## [35,] -19.417192
## [36,] -72.823763
## [37,] -7.282080
## [38,] 62.066241
## [39,] -69.648329
## [40,] 234.947152
## [41,] -75.870345
## [42,] 65.934553
## [43,] -12.489326
## [44,] 34.032496
## [45,] 68.200517
## [46,] -9.678280
## [47,] -60.397870
## [48,] -45.438479
##
## $cov_beta
## Intercept Tax License Inc Road
## Intercept 34425.5335 -1875.170150 -268.660429 -657.318144 -331.273652
## Tax -1875.1701 168.226131 9.741839 -25.999720 25.125345
## License -268.6604 9.741839 3.697858 -6.375132 1.814394
## Inc -657.3181 -25.999720 -6.375132 296.588601 -6.790462
## Road -331.2737 25.125345 1.814394 -6.790462 11.486503
##
## $SST
## [1] 588366.5
##
## $SSR
## [1] 399316.5
##
## $SSE
## [1] 189050
##
## $F_stat
## [1] 22.70644
# Interpretation:
#beta_hat gives the fitted regression coefficients.
# SSR/SST = 0.679, so about 67.9% of the variation in Fuel is explained by the model.
# The overall F-statistic is 22.70644, so the regression is significant overall;
#we reject H0: beta1 = beta2 = beta3 = beta4 = 0.
# (b) H0: 2 beta1 = beta3
C <- matrix(c(0, 2, 0, -1, 0), nrow = 1)
t0 <- matrix(0, nrow = 1, ncol = 1)
SSH_b <- as.numeric(
t(C %*% beta_hat - t0) %*%
solve(C %*% solve(XtX) %*% t(C)) %*%
(C %*% beta_hat - t0)
)
SSH_b
## [1] 36.65227
# Interpretation:
#SSH_b = 36.65227 is the sum of squares due to testing the restriction 2*beta1 = beta3
# (c) Reduced model for 2 beta1 = beta3
X_reduced <- cbind(1, fuel$Tax + 2 * fuel$Inc, fuel$License, fuel$Road)
colnames(X_reduced) <- c("Intercept", "Tax_plus_2Inc", "License", "Road")
XtX_r <- t(X_reduced) %*% X_reduced
Xty_r <- t(X_reduced) %*% y
beta_r <- solve(XtX_r, Xty_r)
y_hat_r <- X_reduced %*% beta_r
SSE_r <- as.numeric(sum((y - y_hat_r)^2))
SSR_r <- as.numeric(sum((y_hat_r - mean(y))^2))
SSH_c1 <- SSE_r - SSE
SSH_c2 <- SSR - SSR_r
list(SSE_r = SSE_r, SSR_r = SSR_r, SSH_c1 = SSH_c1, SSH_c2 = SSH_c2)
## $SSE_r
## [1] 189086.6
##
## $SSR_r
## [1] 399279.9
##
## $SSH_c1
## [1] 36.65227
##
## $SSH_c2
## [1] 36.65227
# Interpretation:
#Imposing 2*beta1 = beta3 slightly worsens the fit.
#Both SSE(reduced) - SSE(full) and SSR(full) - SSR(reduced) equal 36.65227,
# confirming the same SSH as in part (b).
#(d) H0: 2 beta1 - 1 = beta3 or 2 beta1- beta3 = 1
y_star <- y + fuel$Inc
beta_r_d <- solve(t(X_reduced) %*% X_reduced, t(X_reduced) %*% y_star)
y_hat_r_d <- X_reduced %*% beta_r_d
SSE_r_d <- as.numeric(sum((y_star - y_hat_r_d)^2))
t1 <- matrix(1, nrow = 1, ncol = 1)
SSH_d_formula <- as.numeric(
t(C %*% beta_hat - t1) %*%
solve(C %*% solve(XtX) %*% t(C)) %*%
(C %*% beta_hat - t1)
)
SSH_d_diff <- SSE_r_d - SSE
SST_r_d <- as.numeric(sum((y_star - mean(y_star))^2))
SSR_r_d <- as.numeric(sum((y_hat_r_d - mean(y_star))^2))
SSH_d_SSRdiff <- SSR - SSR_r_d
list(SSH_formula = SSH_d_formula,
SSH_diffSSE = SSH_d_diff,
SSH_diffSSR = SSH_d_SSRdiff)
## $SSH_formula
## [1] 65.25169
##
## $SSH_diffSSE
## [1] 65.25169
##
## $SSH_diffSSR
## [1] 1527.028
#Interpretation:
# SSH from the formula and from the SSE difference both equal 65.25169.
# But SSR(full) - SSR(reduced) does not match, because this reduced model uses
# the transformed response y*
# (e) and (f)
P <- function(A) A %*% solve(t(A) %*% A) %*% t(A)
P0 <- P(X[, 1, drop = FALSE])
P01 <- P(X[, 1:2])
P012 <- P(X[, 1:3])
PX <- P(X)
SS0 <- as.numeric(t(y) %*% P0 %*% y)
SS1_given_0 <- as.numeric(t(y) %*% (P01 - P0) %*% y)
SS2_given_01 <- as.numeric(t(y) %*% (P012 - P01) %*% y)
SS34_given_012 <- as.numeric(t(y) %*% (PX - P012) %*% y)
list(SS0 = SS0,
SS1_given_0 = SS1_given_0,
SS2_given_01 = SS2_given_01,
SS34_given_012 = SS34_given_012)
## $SS0
## [1] 15967901
##
## $SS1_given_0
## [1] 119823.1
##
## $SS2_given_01
## [1] 207709.4
##
## $SS34_given_012
## [1] 71784.02
#Interpretation:
#These are sequential sums of squares.
# After the intercept, Tax explains 119823.1.
# After intercept and Tax, License explains 207709.4.
# After intercept, Tax, and License, the pair (Inc, Road) explains 71784.02.
#(g) 95% CI for 2 beta1 - beta3
cvec <- matrix(c(0, 2, 0, -1, 0), ncol = 1)
theta_hat <- as.numeric(t(cvec) %*% beta_hat)
se_theta <- sqrt(sigma2_hat * as.numeric(t(cvec) %*% solve(XtX) %*% cvec))
t_crit <- qt(0.975, n - p)
CI <- c(theta_hat - t_crit * se_theta,
theta_hat + t_crit * se_theta)
list(theta_hat = theta_hat, se = se_theta, CI = CI)
## $theta_hat
## [1] -2.991547
##
## $se
## [1] 32.76419
##
## $CI
## [1] -69.06683 63.08374
#Interpretation:
# The estimate of 2*beta1 - beta3 is -2.991547.
# The 95% confidence interval is (-69.06683, 63.08374).
# Since 0 lies in this interval, there is no evidence at the 5% level that
# 2*beta1 - beta3 differs from 0.