fuel <- read.csv("C:/Users/Sabuj Ganguly/OneDrive/Documents/PhD 2nd sem/Linear Model Theory/HW/Fuel.csv")

#Response and predictors
y <- as.matrix(fuel$Fuel)
X <- as.matrix(cbind(1, fuel$Tax, fuel$License, fuel$Inc, fuel$Road))
colnames(X) <- c("Intercept", "Tax", "License", "Inc", "Road")

n <- nrow(X)
p <- ncol(X)

#Least squares
XtX <- t(X) %*% X
Xty <- t(X) %*% y
beta_hat <- solve(XtX, Xty)

# Predicted values and residuals
y_hat <- X %*% beta_hat
e<- y - y_hat

# Covariance matrix
sigma2_hat <- as.numeric(t(e) %*% e) / (n - p)
cov_beta <- sigma2_hat * solve(XtX)

# Sums of squares
SST <- as.numeric(sum((y - mean(y))^2))
 SSR <- as.numeric(sum((y_hat - mean(y))^2))
SSE  <- as.numeric(sum(e^2))

#Overall F
F_stat <- (SSR / (p - 1)) / (SSE / (n - p))
pf(F_stat, p - 1, n - p, lower.tail = FALSE)
## [1] 3.907167e-10
   list(beta_hat = beta_hat,
     y_hat = y_hat,
     residuals = e,
     cov_beta = cov_beta,
     SST = SST,
     SSR = SSR,
     SSE = SSE,
       F_stat = F_stat)
## $beta_hat
##                 [,1]
## Intercept 377.291146
## Tax       -34.790149
## License    13.364494
## Inc       -66.588752
## Road       -2.425889
## 
## $y_hat
##           [,1]
##  [1,] 523.2337
##  [2,] 553.1153
##  [3,] 578.1074
##  [4,] 493.3563
##  [5,] 532.0289
##  [6,] 433.5514
##  [7,] 318.7326
##  [8,] 491.5060
##  [9,] 489.0246
## [10,] 550.3947
## [11,] 500.4895
## [12,] 442.2533
## [13,] 563.3126
## [14,] 566.0238
## [15,] 638.0660
## [16,] 604.3055
## [17,] 597.4974
## [18,] 596.4035
## [19,] 772.9678
## [20,] 682.7097
## [21,] 694.9795
## [22,] 570.2403
## [23,] 415.0793
## [24,] 460.2215
## [25,] 507.0281
## [26,] 531.9182
## [27,] 588.6488
## [28,] 612.0879
## [29,] 558.0226
## [30,] 471.9829
## [31,] 566.9071
## [32,] 581.4220
## [33,] 651.6498
## [34,] 613.8673
## [35,] 506.4172
## [36,] 716.8238
## [37,] 647.2821
## [38,] 641.9338
## [39,] 717.6483
## [40,] 733.0528
## [41,] 662.8703
## [42,] 633.0654
## [43,] 644.4893
## [44,] 556.9675
## [45,] 713.7995
## [46,] 519.6783
## [47,] 670.3979
## [48,] 569.4385
## 
## $residuals
##              [,1]
##  [1,]   17.766273
##  [2,]  -29.115303
##  [3,]  -17.107446
##  [4,]  -79.356252
##  [5,] -122.028926
##  [6,]   23.448584
##  [7,]   25.267407
##  [8,]  -24.505956
##  [9,]  -25.024635
## [10,]  -52.394663
## [11,]   79.510451
## [12,]   28.746661
## [13,]  -38.312606
## [14,]  -58.023774
## [15,]  -72.066011
## [16,]   30.694496
## [17,]    5.502618
## [18,]  117.596549
## [19,]   92.032249
## [20,]  -42.709730
## [21,]  -45.979476
## [22,]  -30.240331
## [23,]   48.920694
## [24,]   86.778499
## [25,]  -47.028121
## [26,]   34.081760
## [27,]  -11.648811
## [28,]   18.912113
## [29,]   15.977437
## [30,]   62.017141
## [31,]    4.092950
## [32,]  -27.422002
## [33,]  -74.649836
## [34,]   14.132702
## [35,]  -19.417192
## [36,]  -72.823763
## [37,]   -7.282080
## [38,]   62.066241
## [39,]  -69.648329
## [40,]  234.947152
## [41,]  -75.870345
## [42,]   65.934553
## [43,]  -12.489326
## [44,]   34.032496
## [45,]   68.200517
## [46,]   -9.678280
## [47,]  -60.397870
## [48,]  -45.438479
## 
## $cov_beta
##            Intercept          Tax     License         Inc        Road
## Intercept 34425.5335 -1875.170150 -268.660429 -657.318144 -331.273652
## Tax       -1875.1701   168.226131    9.741839  -25.999720   25.125345
## License    -268.6604     9.741839    3.697858   -6.375132    1.814394
## Inc        -657.3181   -25.999720   -6.375132  296.588601   -6.790462
## Road       -331.2737    25.125345    1.814394   -6.790462   11.486503
## 
## $SST
## [1] 588366.5
## 
## $SSR
## [1] 399316.5
## 
## $SSE
## [1] 189050
## 
## $F_stat
## [1] 22.70644
# Interpretation:
#beta_hat gives the fitted regression coefficients.
# SSR/SST = 0.679, so about 67.9% of the variation in Fuel is explained by the model.
# The overall F-statistic is 22.70644, so the regression is significant overall;
#we reject H0: beta1 = beta2 = beta3 = beta4 = 0.


# (b) H0: 2 beta1 = beta3
C <- matrix(c(0, 2, 0, -1, 0), nrow = 1)
t0 <- matrix(0, nrow = 1, ncol = 1)

SSH_b <- as.numeric(
  t(C %*% beta_hat - t0) %*%
    solve(C %*% solve(XtX) %*% t(C)) %*%
    (C %*% beta_hat - t0)
)

SSH_b
## [1] 36.65227
# Interpretation:
#SSH_b = 36.65227 is the sum of squares due to testing the restriction 2*beta1 = beta3


# (c) Reduced model for 2 beta1 = beta3
X_reduced <- cbind(1, fuel$Tax + 2 * fuel$Inc, fuel$License, fuel$Road)
colnames(X_reduced) <- c("Intercept", "Tax_plus_2Inc", "License", "Road")

XtX_r <- t(X_reduced) %*% X_reduced
Xty_r <- t(X_reduced) %*% y
beta_r <- solve(XtX_r, Xty_r)

y_hat_r <- X_reduced %*% beta_r
SSE_r <- as.numeric(sum((y - y_hat_r)^2))
SSR_r <- as.numeric(sum((y_hat_r - mean(y))^2))

SSH_c1 <- SSE_r - SSE
SSH_c2 <- SSR - SSR_r

list(SSE_r = SSE_r, SSR_r = SSR_r, SSH_c1 = SSH_c1, SSH_c2 = SSH_c2)
## $SSE_r
## [1] 189086.6
## 
## $SSR_r
## [1] 399279.9
## 
## $SSH_c1
## [1] 36.65227
## 
## $SSH_c2
## [1] 36.65227
# Interpretation:
#Imposing 2*beta1 = beta3 slightly worsens the fit.
#Both SSE(reduced) - SSE(full) and SSR(full) - SSR(reduced) equal 36.65227,
# confirming the same SSH as in part (b).


#(d) H0: 2 beta1 - 1 = beta3 or 2 beta1- beta3 = 1
y_star <- y + fuel$Inc

beta_r_d <- solve(t(X_reduced) %*% X_reduced, t(X_reduced) %*% y_star)
y_hat_r_d <- X_reduced %*% beta_r_d
SSE_r_d <- as.numeric(sum((y_star - y_hat_r_d)^2))

t1 <- matrix(1, nrow = 1, ncol = 1)

SSH_d_formula <- as.numeric(
  t(C %*% beta_hat - t1) %*%
    solve(C %*% solve(XtX) %*% t(C)) %*%
    (C %*% beta_hat - t1)
)

SSH_d_diff <- SSE_r_d - SSE

SST_r_d <- as.numeric(sum((y_star - mean(y_star))^2))
SSR_r_d <- as.numeric(sum((y_hat_r_d - mean(y_star))^2))

SSH_d_SSRdiff <- SSR - SSR_r_d

list(SSH_formula = SSH_d_formula,
     SSH_diffSSE = SSH_d_diff,
     SSH_diffSSR = SSH_d_SSRdiff)
## $SSH_formula
## [1] 65.25169
## 
## $SSH_diffSSE
## [1] 65.25169
## 
## $SSH_diffSSR
## [1] 1527.028
#Interpretation:
# SSH from the formula and from the SSE difference both equal 65.25169.
# But SSR(full) - SSR(reduced) does not match, because this reduced model uses
# the transformed response y*


# (e) and (f)
P <- function(A) A %*% solve(t(A) %*% A) %*% t(A)

P0   <- P(X[, 1, drop = FALSE])
P01  <- P(X[, 1:2])
P012 <- P(X[, 1:3])
PX   <- P(X)

SS0 <- as.numeric(t(y) %*% P0 %*% y)
SS1_given_0 <- as.numeric(t(y) %*% (P01 - P0) %*% y)
SS2_given_01 <- as.numeric(t(y) %*% (P012 - P01) %*% y)
SS34_given_012 <- as.numeric(t(y) %*% (PX - P012) %*% y)

list(SS0 = SS0,
     SS1_given_0 = SS1_given_0,
     SS2_given_01 = SS2_given_01,
     SS34_given_012 = SS34_given_012)
## $SS0
## [1] 15967901
## 
## $SS1_given_0
## [1] 119823.1
## 
## $SS2_given_01
## [1] 207709.4
## 
## $SS34_given_012
## [1] 71784.02
#Interpretation:
#These are sequential sums of squares.
# After the intercept, Tax explains 119823.1.
# After intercept and Tax, License explains 207709.4.
#  After intercept, Tax, and License, the pair (Inc, Road) explains 71784.02.


#(g) 95% CI for 2 beta1 - beta3
cvec <- matrix(c(0, 2, 0, -1, 0), ncol = 1)
theta_hat <- as.numeric(t(cvec) %*% beta_hat)
se_theta <- sqrt(sigma2_hat * as.numeric(t(cvec) %*% solve(XtX) %*% cvec))

t_crit <- qt(0.975, n - p)
CI <- c(theta_hat - t_crit * se_theta,
        theta_hat + t_crit * se_theta)

list(theta_hat = theta_hat, se = se_theta, CI = CI)
## $theta_hat
## [1] -2.991547
## 
## $se
## [1] 32.76419
## 
## $CI
## [1] -69.06683  63.08374
#Interpretation:
# The estimate of 2*beta1 - beta3 is -2.991547.
#  The 95% confidence interval is (-69.06683, 63.08374).
#   Since 0 lies in this interval, there is no evidence at the 5% level that
# 2*beta1 - beta3 differs from 0.