1. Work Problem 6.1 with the last observation deleted.

# Perform a thorough influence analysis of the solar thermal energy test data 
# given in Table B.2. Discuss your results

# Variables explanation (for reference):
# y: Total heat flux (kwatts)
# x1: Insolation (watts/m²)
# x2: Position of focal point in east direction (inches)
# x3: Position of focal point in south direction (inches)
# x4: Position of focal point in north direction (inches)
# x5: Time of day
# Load packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(car)
## Loading required package: carData
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
# Step 1: Read data
solar <- read.csv("SolarThermalEnergy.csv")
solar <- solar[-nrow(solar), ] # Remove the last observation
# Pairwise scatter plot to identify relationships
pairs(solar)

# The plot above suggests the following relationships:
# Insolation (x1) appears to have a positive linear relationship with total heat flux.
# x2 appears to have, at most, a weak nonlinear relationship with total heat flux.
# x3 (position of focal point in the south direction, inches) shows no strong or clear
# linear pattern with total heat flux and does not appear to explain much variation in y.
# Total heat flux appears to have a negative linear relationship with the position
# of the focal point in the north direction (x4).
# x5 (time of day) likewise shows no clear linear relationship or trend
# with total heat flux.
# Fit the regression model
model <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = solar) 
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = solar)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6188  -2.7896   0.4168   4.3807  16.1877 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 345.15900   97.42412   3.543  0.00183 ** 
## x1            0.07105    0.02905   2.445  0.02293 *  
## x2            2.12624    1.30292   1.632  0.11693    
## x3            3.50171    1.48064   2.365  0.02727 *  
## x4          -22.92065    2.69262  -8.512 2.07e-08 ***
## x5            2.59559    1.80825   1.435  0.16523    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.006 on 22 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.8804 
## F-statistic: 40.76 on 5 and 22 DF,  p-value: 2.102e-10
# Model: A multiple linear regression model is used to predict y (total heat
# flux) from the five predictors (x1, x2, x3, x4, x5):
# Total Heat Flux = 345.159 + 0.071(x1) + 2.126(x2) + 3.502(x3) - 22.921(x4) + 2.596(x5)

# Significant predictors: x1, x3, and x4 are statistically significant,
# with p-values less than 0.05.

# Non-significant predictors: x2 and x5 are not statistically significant
# (p-values greater than 0.05).

# R-squared: 90.26% of the variance in y is explained by the model,
# indicating a good fit.

# F-statistic: The model is significant overall; the p-value (2.102e-10)
# is well below the 0.05 significance level.
# Influence analysis of the linear regression model
print(influence.measures(model))
## Influence measures of
##   lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = solar) :
## 
##      dfb.1_    dfb.x1   dfb.x2    dfb.x3   dfb.x4    dfb.x5    dffit cov.r
## 1  -0.91252 -0.613581  0.33482  1.917149  0.50576 -1.497227  2.03355 3.038
## 2  -0.02900  0.008054  0.05682  0.012634 -0.02246  0.020080  0.09156 1.462
## 3  -0.02252 -0.030876 -0.00770  0.032000  0.02422  0.026760  0.20588 1.320
## 4   0.11807  1.003053 -1.11582 -0.936690  1.78944 -0.527107  3.06483 0.195
## 5   0.03757 -0.055752 -0.03002  0.009995 -0.04293  0.011786 -0.08754 1.590
## 6  -0.00440  0.061718 -0.01241 -0.030432  0.02340  0.010436  0.08909 1.554
## 7   0.03152 -0.279612  0.01425  0.121947 -0.02863 -0.121432 -0.33007 1.527
## 8   0.31270 -0.743619 -0.25178  0.095571 -0.03395 -0.279790 -0.90860 0.795
## 9   0.18854  0.178511 -0.20501 -0.214718 -0.31188  0.498905 -0.68269 0.871
## 10  0.07161  0.079154 -0.08946 -0.095170 -0.11940  0.228990 -0.33854 1.206
## 11 -0.00516 -0.003207  0.00631  0.009481  0.00394 -0.015204  0.02680 1.428
## 12 -0.03437  0.024187 -0.00168 -0.053038  0.08973  0.055520 -0.28640 1.019
## 13 -0.01571 -0.011129  0.02562 -0.012179  0.02942 -0.006496 -0.08245 1.423
## 14  0.00376 -0.000542 -0.00568  0.000403 -0.00305  0.001031  0.00791 1.733
## 15  0.02182  0.003405 -0.00168 -0.056261  0.00889  0.014537 -0.09867 1.445
## 16  0.37543 -0.101990 -0.52467 -0.135834 -0.16128  0.178586  0.59767 2.004
## 17 -0.05028  0.020238  0.03736  0.013465  0.04520  0.002779  0.08267 1.574
## 18 -0.05411  0.084591  0.27473  0.176169 -0.31268 -0.208430 -1.02527 0.562
## 19  0.33753 -0.015749 -0.18770 -0.267330 -0.37788  0.307271  0.44859 1.476
## 20 -0.06605  0.010843  0.00497  0.049317  0.11806 -0.084982 -0.14223 1.570
## 21 -0.02556  0.064805  0.10794 -0.052464 -0.09477  0.140478  0.25053 1.376
## 22 -1.20997  0.457219  1.43700  0.269503  0.58920  0.105091  1.74425 0.327
## 23 -0.01498  0.026125 -0.00166 -0.062870  0.05213  0.087230  0.21668 1.441
## 24  0.06932  0.268008  0.00536  0.187162 -0.46320 -0.105972 -1.24294 0.795
## 25 -0.01009  0.015645  0.00441  0.002778  0.00206  0.007917 -0.02294 2.032
## 26  0.13422 -0.177776 -0.03164 -0.031789 -0.10188 -0.054080  0.26841 1.465
## 27  0.08938 -0.163520  0.04987 -0.007202 -0.13662 -0.027987  0.27907 1.405
## 28 -0.00403  0.007026 -0.00309 -0.000568  0.00920 -0.000673 -0.01445 1.529
##      cook.d    hat inf
## 1  6.71e-01 0.7203   *
## 2  1.46e-03 0.1128    
## 3  7.27e-03 0.1004    
## 4  1.08e+00 0.4601   *
## 5  1.34e-03 0.1769    
## 6  1.38e-03 0.1595    
## 7  1.87e-02 0.2227    
## 8  1.27e-01 0.2247    
## 9  7.35e-02 0.1725    
## 10 1.93e-02 0.1249    
## 11 1.25e-04 0.0767    
## 12 1.36e-02 0.0651    
## 13 1.18e-03 0.0894    
## 14 1.09e-05 0.2372    
## 15 1.69e-03 0.1062    
## 16 6.10e-02 0.4246   *
## 17 1.19e-03 0.1683    
## 18 1.53e-01 0.2023    
## 19 3.41e-02 0.2468    
## 20 3.52e-03 0.1800    
## 21 1.08e-02 0.1391    
## 22 3.97e-01 0.2991   *
## 23 8.09e-03 0.1496    
## 24 2.33e-01 0.3156    
## 25 9.19e-05 0.3497   *
## 26 1.24e-02 0.1786    
## 27 1.33e-02 0.1611    
## 28 3.65e-05 0.1358
# This output gives detailed influence measures for each observation (a rough
# manual screen using common cutoffs is sketched below):
# DFBETAS for each coefficient (dfb.1_, dfb.x1, dfb.x2, ..., dfb.x5)
# DFFITS (dffit)
# COVRATIO (cov.r)
# Cook's distance (cook.d)
# Hat values / leverage (hat)
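# As a rough manual screen (a sketch using common textbook cutoffs, which need not
# match the flagging rules influence.measures() applies internally):
p_par <- length(coef(model))                        # parameters, including the intercept
n_obs <- nrow(solar)                                # observations used in the fit
which(hatvalues(model) > 2 * p_par / n_obs)         # high-leverage points
which(abs(dffits(model)) > 2 * sqrt(p_par / n_obs)) # large DFFITS
which(cooks.distance(model) > 1)                    # large Cook's distance
which(apply(abs(dfbetas(model)), 1, max) > 2 / sqrt(n_obs))  # large DFBETAS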
# Detailed Summary of Influence Analysis Results
# The analysis identified 5 observations as potentially influential (marked with asterisks):

# Observation 1: Very high leverage (0.7203) and Cook's distance (0.671),
# suggesting it strongly affects the model fit. Shows extreme DFFIT (2.03)
# and affects multiple coefficients, particularly x3 (1.917) and x5 (-1.497).

# Observation 4: Highest Cook's distance (1.08) and very high DFFIT (3.06),
# indicating substantial influence on the overall model. Particularly
# affects coefficients for x1 (1.003), x2 (-1.116), and x4 (1.789).

# Observation 16: High leverage (0.4246) with moderate influence on
# coefficient estimates, especially the intercept (0.375).

# Observation 22: High Cook's distance (0.397) with large DFFIT (1.74).
# Strongly influences the intercept (-1.21) and x2 coefficient (1.437).

# Observation 25: High leverage (0.3497) but relatively low Cook's distance,
# suggesting it has unusual predictor values but doesn't severely distort the model.

# Additional Points of Interest
# Observation 24: Not flagged, but shows high influence with a Cook's distance
# of 0.233 and negative DFFIT (-1.24), particularly affecting x4 (-0.463).

# Observation 18: Moderately high Cook's distance (0.153) with substantial
# DFFIT (-1.03), influencing multiple coefficients.

# Observation 8: Notable influence on x1 coefficient (-0.744) with DFFIT of -0.909.

# This analysis indicates that the model is sensitive to several
# observations, particularly observations 1 and 4, which have a
# disproportionate influence on the coefficient estimates.
# Further investigation is needed to determine whether the influence of the
# identified observations is of practical concern.
# Next, the model is refit without observation 4 to see whether this improves the fit.

# Original data and model
solar_data <- data.frame(
  y = c(271.8, 264.0, 238.8, 230.7, 251.6, 257.9, 263.9, 266.5, 229.1, 239.3, 258.0, 
        257.6, 267.3, 267.0, 259.6, 240.4, 227.2, 196.0, 278.7, 272.3, 267.4, 254.5, 
        224.7, 181.5, 227.5, 253.6, 263.0, 265.8, 263.8),
  x1 = c(783.35, 748.45, 684.45, 827.80, 860.45, 875.15, 909.45, 905.55, 756.00, 769.35, 
         793.50, 801.65, 819.65, 808.55, 774.95, 711.85, 694.85, 638.10, 774.55, 757.90, 
         753.35, 704.70, 666.80, 568.55, 653.10, 704.05, 709.60, 726.90, 697.15),
  x2 = c(33.53, 36.50, 34.66, 33.13, 35.75, 34.46, 34.60, 35.38, 35.85, 35.68, 35.35, 
         35.04, 34.07, 32.20, 34.32, 31.08, 35.73, 34.11, 34.79, 35.77, 36.44, 37.82, 
         35.07, 35.26, 35.56, 35.73, 36.46, 36.26, 37.20),
  x3 = c(40.55, 36.19, 37.31, 32.52, 33.71, 34.14, 34.85, 35.89, 33.53, 33.79, 34.72, 
         35.22, 36.50, 37.60, 37.89, 37.71, 37.00, 36.76, 34.62, 35.40, 35.96, 36.26, 
         36.34, 35.90, 31.84, 33.16, 33.83, 34.89, 36.27),
  x4 = c(16.66, 16.46, 17.66, 17.50, 16.40, 16.28, 16.06, 15.93, 16.60, 16.41, 16.17, 
         15.92, 16.04, 16.19, 16.62, 17.37, 18.12, 18.53, 15.54, 15.70, 16.45, 17.62, 
         18.12, 19.05, 16.51, 16.02, 15.89, 15.83, 16.71),
  x5 = c(13.20, 14.11, 15.68, 10.53, 11.00, 11.31, 11.96, 12.58, 10.66, 10.85, 11.41, 
         11.91, 12.85, 13.58, 14.21, 15.56, 15.83, 16.41, 13.10, 13.63, 14.51, 15.38, 
         16.10, 16.73, 10.58, 11.28, 11.91, 12.65, 14.06)
)

# Original model with all observations
model_full <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = solar_data)
summary_full <- summary(model_full)

# Model without observations 4 and 25
solar_d1 <- solar_data[-c(4, 25), ]
model_2 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = solar_d1)
summary_d1 <- summary(model_2)

# Compare the two models
cat("Full Model R-squared:", summary_full$r.squared, "\n")
## Full Model R-squared: 0.8987602
cat("Model without Obs 4 R-squared:", summary_d1$r.squared, "\n\n")
## Model without Obs 4 R-squared: 0.9325809
cat("Full Model Adjusted R-squared:", summary_full$adj.r.squared, "\n")
## Full Model Adjusted R-squared: 0.8767516
cat("Model without Obs 4 Adjusted R-squared:", summary_d1$adj.r.squared, "\n\n")
## Model without Obs 4 Adjusted R-squared: 0.9165287
cat("Full Model Residual Standard Error:", summary_full$sigma, "\n")
## Full Model Residual Standard Error: 8.039022
cat("Model without Obs 4 Residual Standard Error:", summary_d1$sigma, "\n\n")
## Model without Obs 4 Residual Standard Error: 6.649055
# Compare coefficients
coef_comparison <- data.frame(
  Full_Model = coefficients(model_full),
  Without_Obs4 = coefficients(model_2),
  Difference = coefficients(model_full) - coefficients(model_2),
  Percent_Change = 100 * abs((coefficients(model_full) - coefficients(model_2)) / coefficients(model_full))
)

print("Coefficient Comparison:")
## [1] "Coefficient Comparison:"
print(coef_comparison)
##              Full_Model Without_Obs4  Difference Percent_Change
## (Intercept) 325.4361167 308.57870539 16.85741133       5.179945
## x1            0.0675257   0.05086003  0.01666567      24.680483
## x2            2.5519813   3.73218524 -1.18020391      46.246573
## x3            3.8001944   4.95645600 -1.15626162      30.426381
## x4          -22.9494678 -27.13409443  4.18462660      18.234090
## x5            2.4174843   3.59180849 -1.17432419      48.576290
# Check the change in the influential observations in the new model
new_influence <- influence.measures(model_2)
# Model Fit Metrics
# R-squared: increased from 0.899 to 0.933 (about 3.4 percentage points)
# Adjusted R-squared: increased from 0.877 to 0.917 (about 4 percentage points)
# Residual Standard Error: decreased from 8.039 to 6.649 (a 17.3% reduction)

# Coefficient Changes
# Removing the flagged observations led to substantial changes in the estimated coefficients:

# x1 (Insolation): Decreased by 24.7% (from 0.068 to 0.051)
# x2 (East direction): Increased by 46.2% (from 2.55 to 3.73)
# x3 (South direction): Increased by 30.4% (from 3.80 to 4.96)
# x4 (North direction): More negative by 18.2% (from -22.95 to -27.13)
# x5 (Time of day): Increased by 48.6% (from 2.42 to 3.59)

# Interpretation
# The dramatic changes in the coefficients indicate that observation 4
# was indeed highly influential and was distorting the original model.
# The improved model fit metrics suggest that the refit model provides a more
# accurate representation of the relationships between the predictors and
# total heat flux.

# The positioning variables (x2, x3, x4) show particularly large changes,
# suggesting that the focal point positioning is more critical than initially estimated.

# The substantial decrease in residual standard error means the model's predictions are now more precise.

# These results confirm that removing observation 4 was justified and has
# led to a more reliable model for understanding the factors affecting solar
# thermal energy performance.

# Even after removing the influential point (observation 4), the model still has
# several observations with substantial influence on the parameter estimates.
# The persistence of these influential points suggests there may be
# inherent variability in the solar thermal system.

# Observations 16 and 22 particularly affect the east-direction focal point
# (x2) coefficient, suggesting this parameter's estimate is still somewhat unstable.
# Influence Plot (Leverage vs. Studentized Residuals with Cook's Distance)

influencePlot(model_2, id = list(method = "identify"), main = "Influence Plot")

# The influence plot confirms that even after removing observation 4, several
# influential points remain:
# Observation 1: High leverage (~0.7) with moderate studentized residual
# Observation 16: High leverage (~0.43) with large positive studentized residual
# Observation 22: High leverage (~0.28) with large positive studentized residual
# Observation 8: Moderate leverage with negative studentized residual
# Check if multicollinearity is present
# Initial model with influential observation 4
vif(model)
##       x1       x2       x3       x4       x5 
## 2.303242 1.367654 3.268810 2.612310 5.377488
# new model without influential observation
vif(model_2)
##       x1       x2       x3       x4       x5 
## 2.712728 1.419960 2.569879 3.244517 5.120087
# x5 is the only variable at or slightly above the usual threshold of concern (VIF > 5),
# and its VIF improves slightly in model_2 after removing the influential observation.
# All other variables have VIFs below 5, suggesting no serious multicollinearity.
# Removing the influential observation reduced the VIFs for x3 and x5, which slightly
# helped stabilize the model.
# Further investigation of the remaining influential points identified above is still
# needed to determine whether they, too, have a meaningful influence.
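# For reference, VIF_j = 1 / (1 - R_j²), where R_j² comes from regressing predictor
# x_j on the other predictors. A minimal check for x5, the largest VIF (it should
# match car::vif(model_2) above):
r2_x5 <- summary(lm(x5 ~ x1 + x2 + x3 + x4, data = solar_d1))$r.squared
1 / (1 - r2_x5)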

2. Work Problem 7.4.

# Create the dataset from the problem
data <- data.frame(
  x = c(4.00, 4.00, 4.00, 5.00, 5.00, 6.00, 6.50, 6.50, 6.75, 7.00, 7.10, 7.30),
  y = c(24.60, 24.71, 23.90, 39.50, 39.60, 57.12, 67.11, 67.24, 67.15, 77.87, 80.11, 84.67)
)

# Display the data
data
# a. Fit a second-order polynomial model to these data
# A second-order polynomial model has the form: y = β₀ + β₁x + β₂x²

# Create the squared term
data$x_squared <- data$x^2

# Fit the second-order polynomial model
model2 <- lm(y ~ x + x_squared, data = data)

# Display the model summary
summary(model2)
## 
## Call:
## lm(formula = y ~ x + x_squared, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5729  0.1344  0.4641  0.7315  0.8495 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -4.4595    14.6343  -0.305   0.7675  
## x             1.3837     5.4971   0.252   0.8069  
## x_squared     1.4670     0.4936   2.972   0.0156 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.657 on 9 degrees of freedom
## Multiple R-squared:  0.9957, Adjusted R-squared:  0.9948 
## F-statistic:  1045 on 2 and 9 DF,  p-value: 2.213e-11
# Extract model coefficients for the equation
model_coef <- coef(model2)
cat("\nThe fitted second-order polynomial model is:\n")
## 
## The fitted second-order polynomial model is:
cat(sprintf("y = %.4f + %.4f·x + %.4f·x²\n", 
            model_coef[1], model_coef[2], model_coef[3]))
## y = -4.4595 + 1.3837·x + 1.4670·x²
# The regression model examines the relationship between the response
# variable y and the predictors x and x_squared.
# The output shows that the quadratic term x_squared is statistically
# significant at the 0.05 level, with a p-value of 0.0156.
# This provides evidence of a nonlinear relationship between x and y.
# The linear term x and the intercept are not statistically significant,
# with p-values of 0.8069 and 0.7675, respectively.

# The overall fit of the model is excellent, with an R-squared of 0.9957,
# indicating that approximately 99.6% of the variability in y is
# explained by the model.
# The F-statistic is very large (1045) and highly significant
# (p-value < 0.05), so the model as a whole is statistically significant.

# In conclusion, the model strongly supports a quadratic relationship between
# x and y and provides an excellent fit to the data (a quick visual check follows).
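# Quick visual check of the fitted quadratic against the raw data (a base-R sketch)
plot(data$x, data$y, pch = 19, xlab = "x", ylab = "y",
     main = "Second-Order Polynomial Fit")
x_grid <- seq(min(data$x), max(data$x), length.out = 100)
lines(x_grid,
      predict(model2, newdata = data.frame(x = x_grid, x_squared = x_grid^2)),
      col = "blue")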
# Part b: Test for significance of regression
# The F-test in the summary output, tests if the model is significantly
# better than a model with no predictors
# But we can also conduct this test explicitly

# Calculate F-statistic for the overall model
anova_result <- anova(lm(y ~ 1, data = data), model2)
cat("\n--- Test for Significance of Regression ---\n")
## 
## --- Test for Significance of Regression ---
print(anova_result)
## Analysis of Variance Table
## 
## Model 1: y ~ 1
## Model 2: y ~ x + x_squared
##   Res.Df    RSS Df Sum of Sq    F    Pr(>F)    
## 1     11 5765.3                                
## 2      9   24.7  2    5740.6 1045 2.213e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F-test: tests whether the overall regression model is significant.
# Null hypothesis (H₀): the regression coefficients of all predictors are
# equal to zero.
#      H₀: β₁ = β₂ = 0
# Alternative hypothesis (H₁): at least one predictor is significantly
# different from zero.
#      H₁: at least one βj ≠ 0

# Since the p-value (2.213e-11) is much smaller than 0.05, we reject the null
# hypothesis. This means the regression model with x and x_squared provides a
# significantly better fit than a model with no predictors.
# In other words, at least one of the predictors significantly explains
# variation in y.

# Based on summary(model2) above:
# t-test: tests whether each individual regression coefficient is significantly
# different from zero.
# Null hypothesis: there is no linear relationship between xj and y (βj = 0).
# Alternative hypothesis: there is a significant linear relationship between xj and y (βj ≠ 0).
# The t-tests show that only the quadratic term (x_squared) is individually significant.

# Conclusion:
# The F-test shows that the regression as a whole is significant.
# The t-test shows that only the quadratic term (x_squared) is individually significant.
# Even though x is not significant alone, we keep it because it's part of a
# hierarchical polynomial model.
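# The non-significant t-test for x is largely a collinearity artifact: over this
# narrow range, x and x^2 are highly correlated. Refitting in an orthogonal
# polynomial basis (a sketch; the fitted values are identical, only the
# parameterization changes) typically shows a clearly significant linear component:
model2_orth <- lm(y ~ poly(x, 2), data = data)
summary(model2_orth)$coefficients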
# Part c: Test for lack of fit and comment on the adequacy of the second-order model. 
# For this, we need to identify replicated x-values and calculate pure error
# Step 1: Identify points with the same x-values
x_values <- unique(data$x)
pure_error_ss <- 0
pure_error_df <- 0

for (x_val in x_values) {
  y_subset <- data$y[data$x == x_val]
  if (length(y_subset) > 1) {
    pure_error_ss <- pure_error_ss + sum((y_subset - mean(y_subset))^2)
    pure_error_df <- pure_error_df + (length(y_subset) - 1)
  }
}

# Step 2: Calculate lack of fit statistics
residual_ss <- sum(residuals(model2)^2)
residual_df <- df.residual(model2)

lack_of_fit_ss <- residual_ss - pure_error_ss
lack_of_fit_df <- residual_df - pure_error_df

# Step 3: Calculate F-statistic for lack of fit
if (pure_error_df > 0) {
  F_lack_of_fit <- (lack_of_fit_ss / lack_of_fit_df) / (pure_error_ss / pure_error_df)
  p_lack_of_fit <- pf(F_lack_of_fit, lack_of_fit_df, pure_error_df, lower.tail = FALSE)
  
  cat("\n--- Test for Lack of Fit ---\n")
  cat("Lack of Fit SS:", lack_of_fit_ss, "\n")
  cat("Lack of Fit DF:", lack_of_fit_df, "\n")
  cat("Pure Error SS:", pure_error_ss, "\n")
  cat("Pure Error DF:", pure_error_df, "\n")
  cat("F-statistic:", F_lack_of_fit, "\n")
  cat("p-value:", p_lack_of_fit, "\n")
  
  if (p_lack_of_fit > 0.05) {
    cat("The model shows no significant lack of fit (p > 0.05).\n")
  } else {
    cat("The model shows significant lack of fit (p < 0.05).\n")
  }
} else {
  cat("\nCannot test for lack of fit as there are no replicated x values with more than one observation.\n")
}
## 
## --- Test for Lack of Fit ---
## Lack of Fit SS: 24.32069 
## Lack of Fit DF: 5 
## Pure Error SS: 0.3995167 
## Pure Error DF: 4 
## F-statistic: 48.70023 
## p-value: 0.001124318 
## The model shows significant lack of fit (p < 0.05).
# The lack-of-fit test indicates that, despite the high R² value of 0.9957,
# there is statistically significant evidence (F = 48.70, p = 0.0011) that the
# second-order polynomial model does not capture all of the systematic variation in the data.
# This suggests that while the quadratic model explains most of the variability,
# higher-order terms or a different functional form might provide an even better fit.
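# Cross-check: the same lack-of-fit F-test can be obtained by comparing the quadratic
# model with the saturated "means" model that fits a separate mean at each distinct x
# (this relies on the replicate x values identified above).
pure_error_model <- lm(y ~ factor(x), data = data)
anova(model2, pure_error_model)  # the F-test here is the lack-of-fit test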
# Part d: Test the hypothesis H₀: β₂ = 0.  Can the quadratic term be deleted from this equation?
# This tests whether the quadratic term can be removed from the model

# The t-test for β₂ is already in the summary output
# We can extract the relevant statistics
t_value_beta2 <- summary(model2)$coefficients["x_squared", "t value"]
p_value_beta2 <- summary(model2)$coefficients["x_squared", "Pr(>|t|)"]

cat("\n--- Test for H₀: β₂ = 0 ---\n")
## 
## --- Test for H₀: β₂ = 0 ---
cat("t-value for β₂:", t_value_beta2, "\n")
## t-value for β₂: 2.972083
cat("p-value for β₂:", p_value_beta2, "\n")
## p-value for β₂: 0.01564942
if (p_value_beta2 < 0.05) {
  cat("We reject H₀ and conclude that the quadratic term is statistically significant.\n")
  cat("The quadratic term should be kept in the model.\n")
} else {
  cat("We fail to reject H₀. The quadratic term is not statistically significant.\n")
  cat("The quadratic term could potentially be removed from the model.\n")
}
## We reject H₀ and conclude that the quadratic term is statistically significant.
## The quadratic term should be kept in the model.
# We can also fit a linear model (without the quadratic term) and compare the two models
linear_model <- lm(y ~ x, data = data)
cat("\nComparison between linear and quadratic models:\n")
## 
## Comparison between linear and quadratic models:
anova_comparison <- anova(linear_model, model2)
print(anova_comparison)
## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ x + x_squared
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     10 48.982                              
## 2      9 24.720  1    24.262 8.8333 0.01565 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extract F-statistic and p-value from the model comparison
F_compare <- anova_comparison$F[2]
p_compare <- anova_comparison$`Pr(>F)`[2]

if (p_compare < 0.05) {
  cat("\nThe quadratic model is significantly better than the linear model (p < 0.05).\n")
  cat("Therefore, the quadratic term should not be deleted from the equation.\n")
} else {
  cat("\nThe quadratic model is not significantly better than the linear model (p > 0.05).\n")
  cat("Therefore, the quadratic term could be deleted from the equation for parsimony.\n")
}
## 
## The quadratic model is significantly better than the linear model (p < 0.05).
## Therefore, the quadratic term should not be deleted from the equation.

3. Work Problem 10.4, Parts (a) through (e), BUT use the solar thermal data from Problem 6.1 after deleting the last observation. Interpret results.

#  a. Use forward selection to specify a subset regression model. 
# forward selection
d1 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = solar)
model.forward<-ols_step_forward_p(d1, penter = 0.10, details = F) # penter: threshold p-value for entry
model.forward   
## 
## 
##                              Stepwise Summary                              
## -------------------------------------------------------------------------
## Step    Variable        AIC        SBC       SBIC        R2       Adj. R2 
## -------------------------------------------------------------------------
##  0      Base Model    258.402    261.067    175.655    0.00000    0.00000 
##  1      x4            223.615    227.612    141.419    0.73121    0.72087 
##  2      x3            206.314    211.643    126.355    0.86509    0.85430 
##  3      x2            206.139    212.800    126.625    0.87518    0.85957 
##  4      x1            203.705    211.699    125.829    0.89345    0.87492 
##  5      x5            203.199    212.524    126.862    0.90258    0.88044 
## -------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                          
## ---------------------------------------------------------------
## R                       0.950       RMSE                 7.097 
## R-Squared               0.903       MSE                 50.361 
## Adj. R-Squared          0.880       Coef. Var            3.214 
## Pred R-Squared          0.781       AIC                203.199 
## MAE                     5.364       SBC                212.524 
## ---------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                  Sum of                                              
##                 Squares        DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression    13064.095         5       2612.819    40.764    0.0000 
## Residual       1410.106        22         64.096                     
## Total         14474.201        27                                    
## ---------------------------------------------------------------------
## 
##                                     Parameter Estimates                                     
## -------------------------------------------------------------------------------------------
##       model       Beta    Std. Error    Std. Beta      t        Sig       lower      upper 
## -------------------------------------------------------------------------------------------
## (Intercept)    345.159        97.424                  3.543    0.002    143.114    547.204 
##          x4    -22.921         2.693       -0.916    -8.512    0.000    -28.505    -17.336 
##          x3      3.502         1.481        0.285     2.365    0.027      0.431      6.572 
##          x2      2.126         1.303        0.127     1.632    0.117     -0.576      4.828 
##          x1      0.071         0.029        0.247     2.445    0.023      0.011      0.131 
##          x5      2.596         1.808        0.222     1.435    0.165     -1.154      6.346 
## -------------------------------------------------------------------------------------------
# Forward Selection Analysis Results
# Key Results
# Variables added in order: x4, x3, x2, x1, x5
# Final model performance: R² = 0.903, Adjusted R² = 0.880
# Significant predictors at α = 0.10:
# x4 (p < 0.001)
# x3 (p = 0.027)
# x1 (p = 0.023)
# Predictors not significant at α = 0.10 in final model:
# x2 (p = 0.117)
# x5 (p = 0.165)

# Interpretation
# With the entry threshold set to α = 0.10, this forward selection run added all five
# variables to the model.
# The full model explains approximately 90.3% of the variation in the response
# variable (R² = 0.903), with an adjusted R² of 0.880 accounting for the number of predictors.
# Note, however, that x2 (p = 0.117) and x5 (p = 0.165) do not meet the 0.10
# criterion in the final fit, and x5's p-value at its entry step also exceeds 0.10.
# A variable's apparent contribution can change as other predictors enter, and the
# entry threshold may not have been enforced as specified (see the related note under
# backward elimination), so the inclusion of x2 and x5 should be viewed with caution.
# b. Use backward elimination to specify a subset regression model.
# backward selection
model.backward<-ols_step_backward_p(d1, prem = 0.10, details = F) # prem: threshold p-value for removal
model.backward
## [1] "No variables have been removed from the model."
# The result "No variables have been removed from the model." indicates that all
# variables in the full model (y ~ x1 + x2 + x3 + x4 + x5) have p-values less than the
# removal threshold of 0.10, so none were eliminated.

# Interpretation of Results
# Backward Elimination Analysis
# Starting model: Full model with all five predictors (x1, x2, x3, x4, x5)
# Removal threshold: α = 0.10
# Result: No variables were removed from the model
# Final model: Includes all five predictors (x1, x2, x3, x4, x5)

# Explanation
# The individual p-values for x2 and x5 in the full model are above 0.10, so a strict
# α = 0.10 removal rule would ordinarily eliminate at least one of them. The fact that
# nothing was removed suggests the removal threshold may not have been applied as
# intended (the name of the threshold argument has changed across olsrr releases),
# so this result should be interpreted with caution.

# Conclusion
# As run, the backward elimination procedure retained the full five-predictor model
# (x1, x2, x3, x4, x5), matching the forward selection result. Given the caveat above
# about the removal threshold, this agreement should be confirmed before treating the
# full model as the preferred subset.
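# Sanity check on the backward-elimination result: base R's drop1() gives the partial
# F-test for deleting each predictor from the full model; terms with p > 0.10 would be
# removal candidates under the stated threshold.
drop1(d1, test = "F")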
# c. Use stepwise regression to specify a subset regression model. 
# stepwise selection
model.stepwise<-ols_step_both_p(d1, pent = 0.10, prem = 0.10, details = FALSE)
model.stepwise
## 
## 
##                              Stepwise Summary                              
## -------------------------------------------------------------------------
## Step    Variable        AIC        SBC       SBIC        R2       Adj. R2 
## -------------------------------------------------------------------------
##  0      Base Model    258.402    261.067    175.655    0.00000    0.00000 
##  1      x4 (+)        223.615    227.612    141.419    0.73121    0.72087 
##  2      x3 (+)        206.314    211.643    126.355    0.86509    0.85430 
## -------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                          
## ---------------------------------------------------------------
## R                       0.930       RMSE                 8.351 
## R-Squared               0.865       MSE                 69.740 
## Adj. R-Squared          0.854       Coef. Var            3.547 
## Pred R-Squared          0.821       AIC                206.314 
## MAE                     6.351       SBC                211.643 
## ---------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                 ANOVA                                 
## ---------------------------------------------------------------------
##                  Sum of                                              
##                 Squares        DF    Mean Square      F         Sig. 
## ---------------------------------------------------------------------
## Regression    12521.486         2       6260.743    80.154    0.0000 
## Residual       1952.715        25         78.109                     
## Total         14474.201        27                                    
## ---------------------------------------------------------------------
## 
##                                     Parameter Estimates                                      
## --------------------------------------------------------------------------------------------
##       model       Beta    Std. Error    Std. Beta       t        Sig       lower      upper 
## --------------------------------------------------------------------------------------------
## (Intercept)    485.765        39.186                  12.396    0.000    405.060    566.470 
##          x4    -24.164         1.921       -0.965    -12.582    0.000    -28.119    -20.208 
##          x3      4.702         0.944        0.382      4.981    0.000      2.758      6.647 
## --------------------------------------------------------------------------------------------
# Stepwise Regression Analysis Results
# Key Results
# Variables included in final model: x4, x3
# Variables added in order:
# x4 (added first)
# x3 (added second)

# Final model performance:
# R² = 0.865
# Adjusted R² = 0.854
# AIC = 206.314
# SBC = 211.643

# Significance of predictors:
# x4 (p < 0.001)
# x3 (p < 0.001)

# Interpretation
# The stepwise regression procedure (combining forward selection and
# backward elimination with α = 0.10) resulted in a more parsimonious model
# than both the forward selection and backward elimination methods. 
# Only two variables, x4 and x3, were selected for the final model.
# These two predictors explain approximately 86.5% of the variation in the
# response variable (R² = 0.865), with an adjusted R² of 0.854. 
# Both predictors are highly significant with p-values < 0.001.
# Unlike the forward selection and backward elimination procedures that
# included all five variables, the stepwise approach systematically
# evaluated both addition and removal at each step, resulting in a more
# streamlined model. 
# The stepwise algorithm stopped after adding x3, suggesting that the
# remaining variables (x1, x2, and x5) did not meet the entry criteria when
# evaluated in the context of a model already containing x4 and x3.
# The final model from stepwise regression offers a good balance between
# parsimony and fit, with strong statistical significance for both included
# predictors and a high R² value of 0.865.
# d. Apply all possible regressions to the data. Evaluate R²_p, C_p, and MS_Res
# for each model. Which subset model do you recommend?
# best subset selection (MSRes,  R-square, C(p) )
model.best.subset<-ols_step_best_subset(d1) 
model.best.subset
##    Best Subsets Regression   
## -----------------------------
## Model Index    Predictors
## -----------------------------
##      1         x4             
##      2         x3 x4          
##      3         x1 x4 x5       
##      4         x1 x2 x3 x4    
##      5         x1 x2 x3 x4 x5 
## -----------------------------
## 
##                                                      Subsets Regression Summary                                                      
## -------------------------------------------------------------------------------------------------------------------------------------
##                        Adj.        Pred                                                                                               
## Model    R-Square    R-Square    R-Square     C(p)        AIC         SBIC        SBC         MSEP         FPE        HSP       APC  
## -------------------------------------------------------------------------------------------------------------------------------------
##   1        0.7312      0.7209      0.6805    36.6984    223.6151    141.4193    227.6117    4190.6313    160.3231    5.9854    0.3101 
##   2        0.8651      0.8543      0.8207     8.4656    206.3142    126.3549    211.6430    2190.9932     86.4774    3.2545    0.1673 
##   3        0.8753      0.8597      0.8191     8.1624    206.1131    126.6063    212.7741    2113.4157     85.9568    3.2701    0.1663 
##   4        0.8935      0.8749      0.8058     6.0604    203.7053    125.8285    211.6986    1887.6568     79.0243    3.0478    0.1529 
##   5        0.9026      0.8804      0.7807     6.0000    203.1986    126.8620    212.5240    1808.1973     77.8305    3.0522    0.1506 
## -------------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria 
##  SBIC: Sawa's Bayesian Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
##  MSEP: Estimated error of prediction, assuming multivariate normality 
##  FPE: Final Prediction Error 
##  HSP: Hocking's Sp 
##  APC: Amemiya Prediction Criteria
# Analysis of Results
# Looking at the key metrics requested in the question:
# 1. R²p (Predicted R-Square):
# Model 1 (x4): 0.6805
# Model 2 (x3, x4): 0.8207 <- Highest
# Model 3 (x1, x4, x5): 0.8191
# Model 4 (x1, x2, x3, x4): 0.8058
# Model 5 (x1, x2, x3, x4, x5): 0.7807

# 2. Cp (Mallows' Cp):
# Model 1 (x4): 36.6984
# Model 2 (x3, x4): 8.4656
# Model 3 (x1, x4, x5): 8.1624
# Model 4 (x1, x2, x3, x4): 6.0604
# Model 5 (x1, x2, x3, x4, x5): 6.0000 (equals its parameter count of 6, as Cp always does for the full model)

# 3. MSRes: ols_step_best_subset() does not report MS_Res directly, so the MSEP
#    column is listed below as a stand-in; MS_Res itself is computed in the sketch
#    after this list:
# Model 1 (x4): 4190.6313
# Model 2 (x3, x4): 2190.9932
# Model 3 (x1, x4, x5): 2113.4157
# Model 4 (x1, x2, x3, x4): 1887.6568
# Model 5 (x1, x2, x3, x4, x5): 1808.1973 <- Lowest
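# A quick way to obtain MS_Res directly for each candidate subset (a base-R sketch;
# the subsets are taken from the Model Index table above)
subsets <- list(c("x4"),
                c("x3", "x4"),
                c("x1", "x4", "x5"),
                c("x1", "x2", "x3", "x4"),
                c("x1", "x2", "x3", "x4", "x5"))
sapply(subsets, function(vars) {
  fit <- lm(reformulate(vars, response = "y"), data = solar)
  sum(residuals(fit)^2) / df.residual(fit)  # MS_Res = SS_Res / (n - p)
})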

# Recommendation
# Based on these metrics:
# Model 2 (x3, x4) has the highest predicted R-square (0.8207), indicating
# strong predictive ability with just two variables.
# Model 5 (full model) has the lowest MSEP; its Cp of 6 simply equals its
# parameter count, which holds for the full model by construction, so Cp is
# not informative here.
# Model 4 (x1, x2, x3, x4) has Cp = 6.06 against a parameter count of 5, a
# strong adjusted R-square (0.8749), and a lower MSEP than models 1-3.

# Recommendation: Model 2 (x3, x4) is the best subset model to recommend because:
# It has the highest predicted R-square (0.8207), indicating the best
# out-of-sample prediction performance
# It achieves this with only 2 predictors, making it the most parsimonious
# model
# Its Cp value (8.4656) exceeds its parameter count of 3, indicating some bias
# relative to the larger models, but the gain in parsimony and prediction offsets this
# It matches the result from the stepwise regression analysis, providing
# consistent evidence
# e. Compare and contrast the models produced by the variable selection
# strategies in parts a – d.

# Comparison of Variable Selection Models
# Summary of Models from Each Method
# Forward Selection (α = 0.10)
# Final model: All five variables (x1, x2, x3, x4, x5)
# R² = 0.903, Adjusted R² = 0.880
# Entry order: x4, x3, x2, x1, x5
# Only x4, x3, and x1 were significant at α = 0.05

# Backward Elimination (α = 0.10)
# Final model: All five variables (x1, x2, x3, x4, x5)
# No variables removed from full model
# R² = 0.903, Adjusted R² = 0.880


# Stepwise Regression (α = 0.10)
# Final model: Two variables (x4, x3)
# R² = 0.865, Adjusted R² = 0.854
# Both predictors highly significant (p < 0.001)


# All Possible Regressions
# Best model based on predicted R²: Two variables (x4, x3) with predicted
# R² = 0.8207
# Best model based on Cp and MSEP: the full model (x1, x2, x3, x4, x5), although
# its Cp of 6.0000 equals its parameter count by construction


# Comparison and Contrast
# Consistency Across Methods
# Variable x4 appears in all models and is consistently the first/most
# important predictor
# Variable x3 appears in all models and is consistently the second most
# important predictor
# The full model (all five variables) is selected by both forward selection
# and backward elimination
# The two-variable model (x4, x3) is selected by both stepwise regression
# and all possible regressions (when optimizing for predicted R²)

# Differences Between Methods
# Forward and Backward methods yielded identical results (full model),
# despite different approaches
# Stepwise produced a more parsimonious model than forward/backward,
# highlighting its ability to both add and remove variables
# All Possible Regressions showed that different criteria lead to different
# optimal models:
# Prediction-focused criteria (predicted R²) favored the simpler model (x4, x3)
# Fit-focused criteria (Cp, MSEP) favored the full model

4. Work Problem 11.2, BUT use the solar thermal data from Problem 6.1 after deleting the last observation. Interpret results.

# Split the Solar thermal energy data into estimation and prediction data
# sets. Evaluate the statistical properties of these two datasets. 
# Develop a model from the estimation data and evaluate its performance 
# on the prediction data. Discuss the predictive performance of this model. 
# Split data into training and testing samples (70/30 split)
set.seed(1)
tr_ind <- sample(nrow(solar), 0.7 * nrow(solar), replace = FALSE)
dftrain <- solar[tr_ind, ]
dftest <- solar[-tr_ind, ]
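# Quick look at the statistical properties of the two subsets (a minimal sketch;
# boxplots or two-sample tests could be added for a fuller comparison)
summary(dftrain)
summary(dftest)
rbind(train_mean = colMeans(dftrain), test_mean = colMeans(dftest))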
m1.lm <- lm(y ~ ., data = dftrain)
summary(m1.lm)         # View regression output
## 
## Call:
## lm(formula = y ~ ., data = dftrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1898  -2.7108  -0.3464   3.6143  15.3262 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 287.80887  124.16501   2.318   0.0374 *  
## x1            0.09340    0.03607   2.590   0.0224 *  
## x2            2.87127    1.80515   1.591   0.1357    
## x3            3.43580    1.65556   2.075   0.0584 .  
## x4          -22.22400    3.39115  -6.554 1.84e-05 ***
## x5            3.06853    2.10356   1.459   0.1684    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.467 on 13 degrees of freedom
## Multiple R-squared:  0.886,  Adjusted R-squared:  0.8421 
## F-statistic:  20.2 on 5 and 13 DF,  p-value: 1.029e-05
car::vif(m1.lm)        # Check for multicollinearity
##       x1       x2       x3       x4       x5 
## 1.838048 1.419765 2.889127 2.069321 4.571748
# Interpretation of Predictors
# Intercept (287.81):
# When all predictor variables are zero, the expected value of total heat
# flux is 287.81 kilowatts. This intercept is statistically significant at
# the 0.05 level (p = 0.0374).

# x1 – Insolation (Estimate = 0.0934):
# For each one-unit increase in insolation (watts/m²), the total heat
# flux is expected to increase by 0.0934 kilowatts, holding all other
# variables constant. This predictor is statistically significant at the 0.05 level (p = 0.0224).

# x2 – East Position (Estimate = 2.8713):
# For each one-inch increase in the east direction, the total heat flux is
# expected to increase by 2.87 kilowatts, holding other variables constant.
# However, this predictor is not statistically significant at the 0.05 level (p = 0.1357).

# x3 – South Position (Estimate = 3.4358):
# For each one-inch increase in the south direction, the total heat flux is
# expected to increase by 3.44 kilowatts, all else equal.
# This predictor is not statistically significant at the 0.05 level 
# (p = 0.0584), although it is marginally close to significance.

# x4 – North Position (Estimate = -22.224):
# For each one-inch increase in the north direction, the total heat flux is
# expected to decrease by 22.22 kilowatts, assuming other variables remain constant.
# This predictor is highly statistically significant (p = 1.84e-05, far below 0.05).

# x5 – Time of Day (Estimate = 3.0685):
# For each unit increase in time of day, the total heat flux is expected to
# increase by 3.07 kilowatts, holding other predictors constant.
# This predictor is not statistically significant at the 0.05 level (p = 0.1684).

# R-squared (0.886): the model explains 88.6% of the variation in heat flux, a very strong fit.
# F-statistic (20.2, p = 1.03e-05): the overall model is statistically
# significant, so at least one predictor is related to the response variable.
# No major multicollinearity issues are present in the model, since all VIFs
# are below 5.
pred_y <- predict(m1.lm, newdata = dftest)  # Predict Y on test data
pred_y
##        3        8       12       13       15       16       20       24 
## 235.0826 281.8567 267.0407 270.5521 263.1540 234.8137 275.8361 193.4666 
##       27 
## 258.4114
# Actual vs. Predicted
actual_y <- dftest$y

# Root Mean Squared Error (RMSE)
rmse <- sqrt(mean((actual_y - pred_y)^2))

# Prediction R-squared
sst <- sum((actual_y - mean(actual_y))^2)
sse <- sum((actual_y - pred_y)^2)
rsq_pred <- 1 - (sse / sst)

# Output
rmse
## [1] 7.957844
rsq_pred
## [1] 0.9094836
# The model built on the estimation (training) data exhibits strong predictive
# performance when applied to the prediction (test) data. The high prediction
# R-squared (≈ 0.91) and relatively low RMSE (≈ 7.96) suggest that the model
# generalizes well and makes accurate predictions on unseen data.
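# Complementary check using only the estimation data (a sketch): the PRESS statistic
# and the PRESS-based prediction R-squared from the leave-one-out residuals
press_res <- residuals(m1.lm) / (1 - hatvalues(m1.lm))
PRESS <- sum(press_res^2)
PRESS
1 - PRESS / sum((dftrain$y - mean(dftrain$y))^2)  # prediction R-squared from PRESS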

5. Work Problem 14.3, Parts (a) through (d), BUT delete observation #1 in the data set given in the book problem. You will be responsible for constructing the correct data set.

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Construct dataset (excluding t = 1)
t <- 2:15
x <- c(98, 100, 89, 95, 87, 93, 82, 85, 83, 81, 79, 90, 77, 78)
y <- c(16.26, 15.94, 16.81, 15.67, 16.47, 15.66, 16.94, 16.60,
       17.16, 17.77, 18.05, 16.78, 18.17, 17.25)

# Create data frame
df <- data.frame(t = t, x = x, y = y)
# a. Fit a simple linear regression model to these data. Plot the residuals
# versus time. Is there any indication of autocorrelation? 

# F-test: tests all coefficients simultaneously:
#   H0: all the slopes βj are zero vs. Ha: at least one βj is nonzero
#   (H0: "the model is not useful" vs. Ha: "the model is useful").
# t-test: tests an individual coefficient: H0: βj = 0 vs. Ha: βj ≠ 0.

# (a) Fit simple linear regression and plot residuals over time
m3 <- lm(y ~ x, data = df)
summary(m3)
## 
## Call:
## lm(formula = y ~ x, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59446 -0.38446  0.07439  0.37845  0.48324 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.97186    1.34403  18.580 3.29e-10 ***
## x           -0.09374    0.01541  -6.083 5.47e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.416 on 12 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.7347 
## F-statistic: 37.01 on 1 and 12 DF,  p-value: 5.472e-05
plot(df$t, residuals(m3), type = "o", main = "Residuals over Time", ylab = "Residuals", xlab = "Time")

# Regression Model Summary
# The fitted model is: ŷ_t = 24.97 − 0.09374 x_t
# where:
# y_t: price of canned peaches at time t
# x_t: market share at time t

# Interpretation of Coefficients
# Intercept (24.97):
# If the market share were 0 (which is not realistic here, but a
# mathematical anchor), the expected price of canned peaches would be $24.97.

# x (-0.09374):
# For every 1-unit increase in market share, the price of canned peaches
# decreases by approximately $0.0937 (or 9.4 cents).
# Thus there is a statistically significant negative linear relationship between market share and price.

# Model Strength
# R-squared = 0.7551:
# About 75.5% of the variation in price is explained by changes in market
# share, indicating a strong model fit.

# P-values:
# Both the intercept and slope are statistically significant at the 0.05
# level (p < 0.05).
# b. Use the Durbin – Watson test to determine if there is positive autocorrelation in the errors. What are your conclusions? 
# The Durbin-Watson test examines the hypothesis:
# H₀: No positive autocorrelation (ρ = 0)
# H₁: Positive autocorrelation exists (ρ > 0)

# If the test statistic is well below 2, there is evidence of positive
# autocorrelation; the formal decision is based on the test output below
# (computed both by hand and with dwtest()).
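# For reference, the DW statistic can be computed directly from the residuals,
# d = sum((e_t - e_{t-1})^2) / sum(e_t^2); it should match dwtest() below
res_m3 <- residuals(m3)
sum(diff(res_m3)^2) / sum(res_m3^2)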

# Durbin-Watson Test
dwtest(m3)
## 
##  Durbin-Watson test
## 
## data:  m3
## DW = 0.77112, p-value = 0.001254
## alternative hypothesis: true autocorrelation is greater than 0
# Durbin-Watson Test Results
# Test statistic (DW) = 0.77112
# p-value = 0.001254
# Alternative hypothesis: true autocorrelation is greater than 0

# Interpretation
# Durbin-Watson Statistic: The DW value of 0.77112 is much closer to 0 than
# to 2. In the Durbin-Watson test:

# DW ≈ 2: No autocorrelation
# DW < 2: Positive autocorrelation
# DW > 2: Negative autocorrelation

# Since 0.77112 is substantially less than 2, this strongly suggests
# positive autocorrelation in the error terms.
# P-value: At 0.001254, the p-value is well below conventional significance
# levels (0.05 or 0.01). This means we can confidently reject the null
# hypothesis of no autocorrelation.

# Conclusion: 
# There is statistically significant positive autocorrelation in the
# residuals of model m3. This indicates that the error terms are correlated over time rather than being independent.
# c. Use one iteration of the Cochrane – Orcutt procedure to estimate the 
# regression coefficients. Find the standard errors of these regression
# coefficients.
# Step 1: Get residuals from the original model
e <- residuals(m3)

# Step 2: Estimate autocorrelation coefficient (rho)
rho_est <- sum(e[2:length(e)] * e[1:(length(e)-1)]) / sum(e[1:(length(e)-1)]^2)
cat("Estimated autocorrelation coefficient (rho):", rho_est, "\n")
## Estimated autocorrelation coefficient (rho): 0.5656211
# Step 3: Transform y and x using estimated rho
y <- df$y
x <- df$x

y_star <- y[2:length(y)] - rho_est * y[1:(length(y)-1)]
x_star <- x[2:length(x)] - rho_est * x[1:(length(x)-1)]

# Step 4: Fit transformed model without intercept
model_transformed <- lm(y_star ~ x_star + 0)  # + 0 removes the intercept

# Step 5: Print regression coefficients and standard errors
cat("\nNew Regression Coefficient (no intercept model):\n")
## 
## New Regression Coefficient (no intercept model):
print(coef(model_transformed))
##    x_star 
## 0.1933185
cat("\nStandard Error of the Coefficient:\n")
## 
## Standard Error of the Coefficient:
print(summary(model_transformed)$coefficients[, "Std. Error"])
## [1] 0.01492511
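# Note: the transformed fit above drops the intercept, but the usual Cochrane-Orcutt
# transformation keeps an intercept equal to beta0 * (1 - rho). A sketch of that
# version, for comparison with the original coefficients:
model_co <- lm(y_star ~ x_star)       # transformed model with intercept retained
summary(model_co)$coefficients        # slope estimate and its standard error
coef(model_co)[1] / (1 - rho_est)     # recover beta0 on the original scale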
# d. Is there positive autocorrelation remaining after the first iteration? 
# Would you conclude that the iterative parameter estimation technique has
# been successful?
# Get residuals from transformed model
e_transformed <- residuals(model_transformed)

# Check for autocorrelation in transformed residuals
acf(e_transformed, main = "ACF of Residuals after Cochrane-Orcutt")

# Durbin-Watson test on transformed model
t_transformed <- t[2:length(t)]
dwtest(model_transformed)
## 
##  Durbin-Watson test
## 
## data:  model_transformed
## DW = 2.9772, p-value = 0.9757
## alternative hypothesis: true autocorrelation is greater than 0
# "After the first iteration of the Cochrane–Orcutt procedure, 
# the Durbin-Watson statistic for the transformed model is 2.9772, 
# which is significantly greater than 2. This suggests that positive
# autocorrelation has been effectively removed from the residuals. 
# While the residuals no longer exhibit positive autocorrelation, the
# possibility of negative autocorrelation (DW > 2) remains. 
# Therefore, the iterative estimation technique has been successful in
# eliminating positive autocorrelation, but further iterations may be
# considered to check for and address any remaining issues with negative
# autocorrelation."