# load the packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(car)
## Loading required package: carData
library(MASS)
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     rivers
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Load the data
solar <- read.csv("SolarThermalEnergy.csv")
#Remove the last observation
solar <- solar[-nrow(solar), ]

Problem 1

Work Problem 4.4, BUT use the solar energy data that you used in Problem 1. For parts (a) and (b), interpret each plot.

#Consider the multiple regression model relating total heat flux to all
#five predictor variables

# Fit the multiple linear regression model
model <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = solar)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = solar)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6188  -2.7896   0.4168   4.3807  16.1877 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 345.15900   97.42412   3.543  0.00183 ** 
## x1            0.07105    0.02905   2.445  0.02293 *  
## x2            2.12624    1.30292   1.632  0.11693    
## x3            3.50171    1.48064   2.365  0.02727 *  
## x4          -22.92065    2.69262  -8.512 2.07e-08 ***
## x5            2.59559    1.80825   1.435  0.16523    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.006 on 22 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.8804 
## F-statistic: 40.76 on 5 and 22 DF,  p-value: 2.102e-10
#Model: A multiple regression model is used to predict y (total heat
#flux) using five predictors (x1, x2, x3, x4, x5).
#Total Heat Flux=345.159+0.071(x1)+2.126(x2)+3.502(x3)-22.921(x4)+2.596(x5)

#Significant Predictors: x1, x3, and x4 are statistically significant
#with p-values less than 0.05.

#Non-significant Predictors: x2 and x5 are not statistically significant
#(p-values greater than 0.05).

#R-squared: 90.26% of the variance in y is explained by the model,
#which suggests a good fit.

#F-statistic: The model is significant overall; the p-value (2.102e-10) is
#less than the 0.05 significance level.

#x1: Estimate = 0.07105, t-value = 2.445, p-value = 0.02293
#This predictor is statistically significant (p = 0.02293) at the 5%
#significance level. Holding the other predictors fixed, a one-unit increase
#in x1 is associated with an estimated 0.071-unit increase in total heat flux.

#x2: Estimate = 2.12624, t-value = 1.632, p-value = 0.11693
#This predictor is not statistically significant (p=0.11693). 
#The evidence suggests that x2 does not contribute significantly to
#explaining the variation in total heat flux at the 5% significance level.

#x3: Estimate = 3.50171, t-value = 2.365, p-value = 0.02727
#This predictor is statistically significant (p = 0.02727). Holding the other
#predictors fixed, a one-unit increase in x3 is associated with an estimated
#3.50-unit increase in total heat flux.

#x4: Estimate = -22.92065, t-value = -8.512, p-value = 2.07e-08
#This predictor is highly statistically significant, with a very small
#p-value. Holding the other predictors fixed, a one-unit increase in x4 is
#associated with an estimated 22.92-unit decrease in total heat flux.

#x5: Estimate = 2.59559, t-value = 1.435, p-value = 0.16523
#This predictor is not statistically significant (p=0.16523), meaning it
#does not contribute significantly to explaining the variation in total heat flux.

#In conclusion, x1, x3, and x4 are important predictors in the model
#while x2 and x5 do not appear to provide significant additional explanatory value.
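# Optional supplement (a sketch, not required by the problem): 95% confidence
# intervals for the coefficients convey the same information as the t-tests
# above in interval form; intervals that exclude 0 correspond to p < 0.05.
confint(model, level = 0.95)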
#a. Construct a normal probability plot of the residuals. Does there seem to
#be any problem with the normality assumption?

# Calculate residuals
residuals <- resid(model)

# Create Q-Q plot for normality
qqnorm(residuals, main = "Normal Q-Q Plot of Residuals")
qqline(residuals)

# Based on the normality plot above, we can see most of the data
# points fall along the straight line, particularly in the middle range of
# the distribution. 
#However, there are notable deviations at both tails.
# These patterns suggest the residuals have heavier tails than a normal
# distribution would predict, particularly in the upper tail. 
# While the deviations aren't extreme throughout the entire plot, 
# they are substantial enough to raise some concerns about the strict
# validity of the normality assumption, which could potentially affect the
#  hypothesis tests in the regression analysis.
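# Optional numeric check of normality (a sketch to supplement the Q-Q plot):
# Shapiro-Wilk test with H0: residuals are normally distributed.
shapiro.test(residuals)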
#b.Construct and interpret a plot of the residuals versus the predicted response. 
# Get residuals and fitted values
predicted <- fitted(model)

# Create plot of residuals vs fitted values
plot(predicted, residuals, 
     xlab = "Fitted Values", 
     ylab = "Residuals",
     main = "Residuals vs Predicted Values")
abline(h = 0, lty = 2)

# Overall pattern: The residuals appear to be randomly scattered around the
# zero line across the range of fitted values (from about 200 to 280),
# which generally supports the linear relationship assumption.

#Potential outliers: There are two noticeable outliers with residual values
# greater than 10 (at fitted values around 220 and 240), which correspond 
# to the extreme points we observed in the upper tail of the Q-Q plot.

# The spread of residuals doesn't show a clear funnel shape across the
# range of fitted values, suggesting the homoscedasticity 
# (constant variance) assumption is reasonably met.

# Residuals appear relatively balanced above and below the zero
# line, indicating that the model isn't systematically over- or under-predicting in certain ranges.

# Overall, the residual plot suggests that the linear model assumptions 
# are generally reasonable, with the main concern being the two potential
# outliers that might warrant further investigation. 
# c. Construct and interpret the partial regression plots for this model.
# partial regression plots for each predictor
avPlots(model, main="Partial Regression Plots")

# Interpretation of Partial Regression Plots

# x1 | others: Shows a positive linear relationship with y after
# controlling for other predictors.
# The slope is moderate and the pattern is relatively clear, indicating x1
# has a meaningful positive effect on the response. 
# The points follow the line fairly well, suggesting this relationship is consistent.

# x2 | others: Displays a positive relationship with y. 
# The slope is less steep than that of x1, suggesting a weaker effect. 
# There is some scatter around the line, but the trend is still visible, 
# indicating x2 contributes to the model but with less impact than x1.

# x3 | others: Exhibits a positive linear relationship with y, with a
# steeper slope than x2.
# The association appears relatively strong with most points following the
# trend.

# x4 | others: Shows a clear negative relationship with y; this is the only
# predictor with a negative slope. The relationship appears strong and
# well defined with relatively little scatter, suggesting x4 is an
# important predictor that inversely affects the response variable.

# x5 | others: Presents a positive relationship with y, with an upward
# slope similar in direction to x1, suggesting some positive contribution,
# although the earlier t-test indicates x5 is not statistically significant.

# Overall observations:
# All partial relationships appear approximately linear, supporting the
# linear regression approach. The partial regression plots show the clearest
# and strongest patterns for x1, x3, and x4, consistent with the t-test
# results, while the relationships for x2 and x5 are weaker.
# d. Compute the studentized residuals and the R-student residuals for
# this model. What information is conveyed by these scaled residuals? 

# Compute studentized residuals (internally studentized residuals)
stud_resid <- rstandard(model)

# Compute R-student residuals (externally studentized residuals)
r_stud_resid <- rstudent(model)

# View both sets of residuals
cbind(studentized = stud_resid, Rstudent = r_stud_resid)
##    studentized    Rstudent
## 1   1.25010918  1.26720421
## 2   0.26244992  0.25681814
## 3   0.62507482  0.61619971
## 4   2.75180072  3.31993582
## 5  -0.19309601 -0.18881650
## 6   0.20913743  0.20453245
## 7  -0.62556004 -0.61668670
## 8  -1.62107383 -1.68781122
## 9  -1.45505318 -1.49537295
## 10 -0.90027645 -0.89624173
## 11  0.09516998  0.09300102
## 12 -1.08082669 -1.08518097
## 13 -0.26893583 -0.26318555
## 14  0.01452182  0.01418801
## 15 -0.29245928 -0.28629225
## 16  0.70402216  0.69571728
## 17  0.18793131  0.18375804
## 18 -1.90455655 -2.03618601
## 19  0.79056376  0.78359814
## 20 -0.31002388 -0.30355978
## 21  0.63222959  0.62338266
## 22  2.36137293  2.67015114
## 23  0.52537463  0.51654603
## 24 -1.73977314 -1.83034138
## 25 -0.03201153 -0.03127626
## 26  0.58456986  0.57561766
## 27  0.64560120  0.63681908
## 28 -0.03730699 -0.03645040
# Plot these residuals
par(mfrow=c(1,2))
plot(stud_resid, main="Studentized Residuals", 
     ylab="Studentized Residuals", xlab="Observation Number")
abline(h=c(-2,0,2), lty=c(2,1,2), col=c("red","black","red"))

plot(r_stud_resid, main="R-Student Residuals", 
     ylab="R-Student Residuals", xlab="Observation Number")
abline(h=c(-2,0,2), lty=c(2,1,2), col=c("red","black","red"))

par(mfrow=c(1,1))
# Studentized residuals range from approximately -1.90 to 2.75, with most
# values falling within the -2 to 2 range. However, a few points exceed ±2,
# such as observation 4 (≈ 2.75) and observation 22 (≈ 2.36), indicating
# potential outliers.

# R-student residuals range from approximately -2.04 to 3.32, confirming
#the possibility of potential outliers. 
# The most extreme value, observation 4 (≈ 3.32), suggests that this 
# observation may be influential and warrants further investigation.

# Only one residual slightly exceeds the ±3 threshold (observation 4),
# making it a candidate for an extreme outlier, while others remain within
# acceptable limits.
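# Optional sketch: list the observations whose scaled residuals exceed the
# usual ±2 cutoff, making the candidate outliers explicit.
which(abs(stud_resid) > 2)
which(abs(r_stud_resid) > 2)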

Problem 2 Work Problem 4.16.

(Note that the 10th observation of y should be 38.5, instead of 385.) In Problem 3.12, you were asked to fit a model to the clathrate formation data in Table B.8.

# Load clathrate dataset (correcting observation 10 from 385 to 38.5)
clathrate_data <- data.frame(
  x1 = c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05),
  x2 = c(10,50,85,110,140,170,200,230,260,290,10,30,62,90,150,210,270,10,30,60,90,120,210,30,60,120,20,40,130,190,250,60,90,120,150),
  y = c(7.5,15,22,28.6,31.6,34,35,35.5,36.5,38.5,12.3,18,20.8,25.7,32.5,34,35,14.4,19,26.4,28.5,29,35,15.1,26.4,27,21,27.3,48.5,50.4,52.5,34.4,46.5,50,51.9)
)
#2a. Construct a normality plot of the residuals from the full model. 
# Does there seem to be any problem with the normality assumption?

# Full model: y ~ X1 + X2
full_model <- lm(y ~ x1 + x2, data = clathrate_data)
summary(full_model)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = clathrate_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8868 -3.9698  0.0728  3.8991  8.1424 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.113e+01  1.661e+00   6.701 1.45e-07 ***
## x1          3.513e+02  3.949e+01   8.895 3.67e-10 ***
## x2          1.097e-01  9.954e-03  11.022 1.99e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.757 on 32 degrees of freedom
## Multiple R-squared:  0.8478, Adjusted R-squared:  0.8383 
## F-statistic: 89.15 on 2 and 32 DF,  p-value: 8.262e-14
# The fitted regression equation is
# y-hat = 11.13 + 351.3(x1) + 0.1097(x2)
# This multiple regression model is highly statistically significant
# (p < 0.05), with both x1 and x2 being strong predictors of y. 
# The model explains about 84.8% of the variation in y (R-squared = 0.8478),
# and the residuals appear reasonably centered and symmetric.
# Based on the R-squared value, we can conclude that the model fits the data
# well and is appropriate for prediction and inference.
# a. Construct a normality plot of residuals
residuals_full <- residuals(full_model)
qqnorm(residuals_full, main = "Normal Q-Q Plot of Residuals (Full Model)")
qqline(residuals_full)

# Based on the Q-Q plot, most of the points fall close to the straight line,
# with only slight deviations at both tails. There is no clear pattern to
# suggest that the normality assumption has been violated, so the normality
# assumption appears to be satisfied.
#b. Construct and interpret a plot of the residuals versus the predicted response.

# Compute fitted (predicted) values from the full model
fitted_values_full <- fitted(full_model)

# Plot of residuals vs predicted response
plot(fitted_values_full, residuals_full, main = "Residuals vs. Predicted Response",
     xlab = "Predicted Response", ylab = "Residuals", pch = 19)
abline(h = 0, col = "red", lty = 2)

# The plot shows a fairly random scatter of residuals around the zero line
# across the range of fitted values (from approximately 10 to 55). 
# There's no obvious pattern such as a curve, funnel shape, or systematic
# trend that would indicate problems with the model.
# The residuals appear to be reasonably evenly distributed above and below
# zero throughout the range of fitted values, suggesting that the linearity
# assumption is adequately satisfied.

# Overall, this residuals plot supports the appropriateness of the full
# model, as it shows no clear violations of the key regression assumptions
# regarding linearity and homoscedasticity.
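# Optional numeric check of constant variance (a sketch; the lmtest package
# is already loaded above). Breusch-Pagan test with H0: homoscedasticity.
bptest(full_model)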
#c. In Problem 3.12, you were asked to fit a second model. Compute the 
# PRESS statistic for both models. Based on this statistic, which model is 
# most likely to provide better predictions of new data?

reduced_model <- lm(y ~ x2, data = clathrate_data)

# Calculate PRESS statistic for both models
calculate_press <- function(model) {
  residuals <- residuals(model)
  h <- hatvalues(model)
  press_residuals <- residuals/(1-h)
  press <- sum(press_residuals^2)
  return(press)
}

press_full <- calculate_press(full_model)
press_reduced <- calculate_press(reduced_model)

# Print PRESS statistics for comparison
cat("PRESS statistic for full model:", press_full, "\n")
## PRESS statistic for full model: 887.5867
cat("PRESS statistic for reduced model:", press_reduced, "\n")
## PRESS statistic for reduced model: 2813.665
cat("Better model based on PRESS statistic:", 
    ifelse(press_full < press_reduced, "Full model", "Reduced model"))
## Better model based on PRESS statistic: Full model
# Results:
# Full Model PRESS Statistic: 887.5867
# Reduced Model PRESS Statistic: 2813.665

# Interpretation:
# Full Model has a lower PRESS statistic compared to the Reduced Model,
# indicating that the full model is more accurate in predicting new data points.

# The reduced model, having a higher PRESS value, suggests it may not
# generalize as well to new data.

# Conclusion:
# Based on the PRESS statistic, the Full Model is more likely to provide
# better predictions of new data, as it has a smaller PRESS value.
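# Optional extension (a sketch): PRESS can be converted to a predicted
# R-squared, R2_pred = 1 - PRESS/SST, which estimates how much variation in
# new observations each model would explain.
sst <- sum((clathrate_data$y - mean(clathrate_data$y))^2)
cat("Predicted R-squared, full model:", 1 - press_full / sst, "\n")
cat("Predicted R-squared, reduced model:", 1 - press_reduced / sst, "\n")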

Problem 3 Work Problem 4.19.

# Load the dataset
data <- data.frame(
  x1 = c(-1, 1, -1, 1, -1, 1, -1, 1, 0, 0, 0, 0, 0, 0),
  x2 = c(-1, -1, 1, 1, -1, -1, 1, 1, 0, 0, 0, 0, 0, 0),
  x3 = c(1, -1, -1, 1, -1, 1, 1, -1, 0, 0, 0, 0, 0, 0),
  y = c(102, 120, 117, 198, 103, 132, 132, 139, 133, 133, 140, 142, 145, 142)
)
# a. Perform a thorough analysis of the results including residual plots.

# Step1: Fit linear model
model3 <- lm(y ~ x1 + x2 + x3, data = data)
summary(model3)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.518  -8.768  -1.143   7.857  20.232 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  134.143      3.406  39.382 2.66e-12 ***
## x1            16.875      4.506   3.745  0.00382 ** 
## x2            16.125      4.506   3.579  0.00502 ** 
## x3            10.625      4.506   2.358  0.04009 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.74 on 10 degrees of freedom
## Multiple R-squared:  0.7641, Adjusted R-squared:  0.6933 
## F-statistic:  10.8 on 3 and 10 DF,  p-value: 0.001772
# Interpretation of Multiple Linear Regression Results
# Hypothesis Testing Framework
# For each coefficient in the model, we can state the hypotheses as:

# For the intercept:
# H₀: β₀ = 0 (The intercept is zero)
# H₁: β₀ ≠ 0 (The intercept is not zero)

# For hydrated silica level (x1):
# H₀: β₁ = 0 (x1 has no effect on abrasion index)
# H₁: β₁ ≠ 0 (x1 has a significant effect on abrasion index)

# For silane coupling agent level (x2):
# H₀: β₂ = 0 (x2 has no effect on abrasion index)
# H₁: β₂ ≠ 0 (x2 has a significant effect on abrasion index)

# For sulfur level (x3):
# H₀: β₃ = 0 (x3 has no effect on abrasion index)
# H₁: β₃ ≠ 0 (x3 has a significant effect on abrasion index)

# For overall model significance:
# H₀: β₁ = β₂ = β₃ = 0 (The model doesn't explain any variance)
# H₁: At least one βⱼ ≠ 0 (The model explains some variance)

# Results Interpretation
# Overall Model
# F-statistic: 10.8 with 3 and 10 degrees of freedom
# p-value: 0.001772 (< 0.05)
# Decision: Reject H₀
#Interpretation: The model as a whole is statistically significant at the 0.05 level

# Coefficient Interpretation

# Intercept (β₀ = 134.143)
# p-value: 2.66e-12 (< 0.05)
# Decision: Reject H₀
# Interpretation: The expected abrasion index is 134.143 when all factors
# are at their center points (0,0,0)


# Hydrated silica level (β₁ = 16.875)
# p-value: 0.00382 (< 0.05)
# Decision: Reject H₀
# Interpretation: A one-unit increase in hydrated silica level is
# associated with a 16.875 unit increase in abrasion index, holding other
# factors constant

# Silane coupling agent level (β₂ = 16.125)
# p-value: 0.00502 (< 0.05)
# Decision: Reject H₀
# Interpretation: A one-unit increase in silane coupling agent level is 
# associated with a 16.125 unit increase in abrasion index, holding other
# factors constant


# Sulfur level (β₃ = 10.625)
# p-value: 0.04009 (< 0.05)
# Decision: Reject H₀
# Interpretation: A one-unit increase in sulfur level is associated with a
# 10.625 unit increase in abrasion index, holding other factors constant

# Model Fit
# R-squared: 0.7641, about 76.41% of variance is explained by the model
# Adjusted R-squared: 0.6933 (69.33% of variance explained, adjusted for
# model complexity)


# Conclusion
# At the 0.05 significance level, all three factors (hydrated silica,
# silane coupling agent, and sulfur) have statistically significant
# positive effects on the abrasion index. The model explains approximately
# 76% of the variance in the abrasion index, indicating good predictive
# power. The final regression equation is:
# Abrasion Index = 134.143 + 16.875(x1) + 16.125(x2) + 10.625(x3)
# Normality plot
qqnorm(residuals(model3))
qqline(residuals(model3))

# Looking at the Q-Q plot, it is clear that the normality assumption is
# reasonable, since most of the points fall close to the straight reference line.
# Construct the ANOVA Table and test for significance of the model

  #H0: The regression model is not significant
  #H1: The regression model is significant

#Full F test
anova_result <- anova(model3)
print(anova_result)
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value   Pr(>F)   
## x1         1 2278.12 2278.12  14.025 0.003815 **
## x2         1 2080.12 2080.12  12.806 0.005024 **
## x3         1  903.13  903.13   5.560 0.040093 * 
## Residuals 10 1624.34  162.43                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The sequential sums of squares show that each predictor (x1, x2, x3)
# contributes significantly, with individual p-values less than 0.05.
# Together with the overall F-statistic from the model summary
# (F = 10.8 on 3 and 10 DF, p = 0.001772), we reject the null hypothesis
# and conclude that the regression model is significant.
# Create partial regression plots for each predictor
avPlots(model3, main="Partial Regression Plots")

# All predictors (x1, x2, x3) show positive partial relationships with the
# response variable y.
# The blue regression lines indicate a positive slope in each case.
# The plots reinforce the regression results, that all predictors are
# contributing meaningfully to the model.
# A few data points (for example, observation 4) stand out and may deserve
# further attention.
# Check For Multicollinearity
vif(model3)
## x1 x2 x3 
##  1  1  1
# All three predictors (x1, x2, x3) have VIF = 1, meaning:
# There is no multicollinearity among them.
# Each predictor provides unique, independent information about the 
# response variable. 
# The estimates of the regression coefficients are stable and reliable.
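# Sketch: the coded design columns are mutually orthogonal, which is exactly
# why every VIF equals 1; the pairwise correlations below should all be 0.
cor(data[, c("x1", "x2", "x3")])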
# Model Diagnostic plots
par(mfrow = c(2, 2))
plot(model3)

# Model Diagnostic plots interpretation

# Residuals vs Fitted Plot
# Residuals are roughly centered around zero with slight curvature,
# suggesting minor non-linearity; overall, the linearity and constant
# variance assumptions appear acceptable.

# Q-Q Plot
# Most points lie close to the reference line, with slight deviation at the
# tails (not severe).
# Thus the normality assumption for the residuals appears satisfied.

# Scale-Location Plot: residuals have a fairly even spread across fitted
# values, with no clear pattern or funnel shape, so the constant variance
# assumption holds.

# Residuals vs Leverage
# No points exceed the Cook's distance threshold (dashed lines), meaning no
# observations are excessively influential. Point 4 has the highest influence
# but is not critical. The plot indicates no serious problems with influential
# observations.
# Generate predicted values and residuals from model3
predicted_Y <- predict(model3)               
residuals <- residuals(model3)              
# Create a data frame with Observation number, predicted values, and residuals
residual_output <- data.frame(
  Observation = 1:length(predicted_Y),
  Predicted_Y = round(predicted_Y, 7),
  Residuals = round(residuals, 5)
)


print(residual_output, row.names = FALSE)
##  Observation Predicted_Y Residuals
##            1   111.76786  -9.76786
##            2   124.26786  -4.26786
##            3   122.76786  -5.76786
##            4   177.76786  20.23214
##            5    90.51786  12.48214
##            6   145.51786 -13.51786
##            7   144.01786 -12.01786
##            8   156.51786 -17.51786
##            9   134.14286  -1.14286
##           10   134.14286  -1.14286
##           11   134.14286   5.85714
##           12   134.14286   7.85714
##           13   134.14286  10.85714
##           14   134.14286   7.85714
#Interpretation of Residual Output 
# Small residuals (close to 0) indicate good prediction.
# Large residuals (positive or negative) indicate poor fit for those observations.
# Most predictions are reasonable; observations 4 and 8 have the largest
# residuals, but based on the model diagnostics this is not critical. 
# Compute the studentized residuals and the R-student residuals for
# this model. 
# Compute studentized residuals 
stud <- rstandard(model3)

# Compute R-student residuals 
r_stud <- rstudent(model3)

# View both sets of residuals
cbind(studentized = stud, Rstudent = r_stud)
##    studentized    Rstudent
## 1  -1.03008744 -1.03360077
## 2  -0.45007477 -0.43136984
## 3  -0.60826004 -0.58802658
## 4   2.13361804  2.74240993
## 5   1.31632746  1.37342143
## 6  -1.42555063 -1.51507642
## 7  -1.26736535 -1.31233334
## 8  -1.84737802 -2.15936862
## 9  -0.09305633 -0.08831923
## 10 -0.09305633 -0.08831923
## 11  0.47691368  0.45767481
## 12  0.63976226  0.61974727
## 13  0.88403512  0.87349630
## 14  0.63976226  0.61974727
# b. Perform the appropriate test for lack of fit

# Test for lack of fit
# For this, we need to identify replicate observations (center points)
# The center points (0,0,0) appear 6 times

# Calculate pure error sum of squares
center_points <- subset(data, x1 == 0 & x2 == 0 & x3 == 0)
SSpe <- sum((center_points$y - mean(center_points$y))^2)
df_pe <- nrow(center_points) - 1

# Calculate lack of fit sum of squares
SSE <- sum(residuals(model3)^2)
SSlof <- SSE - SSpe
df_lof <- model3$df.residual - df_pe

# F-test for lack of fit
F_lof <- (SSlof/df_lof)/(SSpe/df_pe)
p_value <- 1 - pf(F_lof, df_lof, df_pe)

# Display lack of fit test results
cat("Lack of Fit Test:\n")
## Lack of Fit Test:
cat("F value:", F_lof, "\n")
## F value: 11.80688
cat("p-value:", p_value, "\n")
## p-value: 0.008492882
cat("Degrees of freedom:", df_lof, "and", df_pe, "\n")
## Degrees of freedom: 5 and 5
# Conclusion:
# Since the p-value (0.0085) < significance level (0.05), we reject the
# null hypothesis that the model fits the data adequately; the first-order
# model therefore shows significant lack of fit.
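# Cross-check (a sketch): the same lack-of-fit test can be obtained by
# comparing the first-order model against a "pure error" model that fits a
# separate mean at each unique design point; this comparison should reproduce
# the F value of about 11.8 on 5 and 5 degrees of freedom computed above.
pure_error_model <- lm(y ~ interaction(x1, x2, x3), data = data)
anova(model3, pure_error_model)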

Problem 4

Work Problem 5.3, BUT delete the 7th observation in the data set given in the problem.

# Load the dataset (excluding the 7th observation)
bacteria_data <- data.frame(
  bacteria = c(175, 108, 95, 82, 71, 50, 31, 28, 17, 16, 11),
  minutes = c(1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12)
)
# a. Plot a scatter diagram. Does it seem likely that a straight-line 
# model will be adequate?

# Create a scatter plot
plot(bacteria_data$minutes, bacteria_data$bacteria, 
     main = "Bacteria Survival vs. Exposure Time",
     xlab = "Minutes of Exposure", 
     ylab = "Number of Bacteria",
     pch = 19)

# The scatter plot shows a negative, roughly linear relationship between
# bacteria survival and heat exposure time: the points follow a downward
# trend, and the bacteria count decreases steadily as exposure time
# increases, falling roughly along a straight line.

# Conclusion
# The scatter plot of the number of bacteria against minutes of exposure
# does appear to follow a negatively sloped straight line, so a
# straight-line model may be adequate, but this requires further investigation.
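# Optional sketch: the sample correlation gives a quick numeric summary of
# the strength of the (negative) linear association seen in the plot.
cor(bacteria_data$minutes, bacteria_data$bacteria)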
# b. Fit the straight-line model. Compute the summary statistics and the 
# residual plots. What are your conclusions regarding model adequacy? 
#  Fit linear model
linear_model <- lm(bacteria ~ minutes, data = bacteria_data)
summary(linear_model)
## 
## Call:
## lm(formula = bacteria ~ minutes, data = bacteria_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.844 -10.486  -9.301   4.711  44.873 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  142.584     11.867  12.015 7.62e-07 ***
## minutes      -12.457      1.605  -7.759 2.82e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.18 on 9 degrees of freedom
## Multiple R-squared:  0.8699, Adjusted R-squared:  0.8555 
## F-statistic:  60.2 on 1 and 9 DF,  p-value: 2.824e-05
# Results
# The fitted simple linear regression model is:
# y-hat = 142.584 − 12.457×minutes
# Minutes (-12.457): There is a statistically significant negative
# relationship between exposure time and bacteria count (p < 0.05). 
# Each additional minute of exposure decreases the bacteria count by
# approximately 12.5 bacteria.

# Intercept (142.584): The estimated initial bacteria count at time zero is
# approximately 143, and this estimate is statistically significant
# (p < 0.05), although the intercept itself is of limited practical interest here.
# R-squared (0.8699): About 87% of the variation in bacteria counts is
# explained by exposure time.
# Adjusted R-squared (0.8555): Accounts for the number of predictors, still
# showing strong explanatory power.
# F-statistic (60.2) with a very low p-value confirms the model as a whole 
# is statistically significant.
# Fitting the straight line
# Create a scatter plot
plot(bacteria_data$minutes, bacteria_data$bacteria, 
     main = "Bacteria Survival vs. Exposure Time",
     xlab = "Minutes of Exposure", 
     ylab = "Number of Bacteria",
     pch = 19)

# Add the fitted regression line
abline(linear_model, col = "blue", lwd = 2)

# The data points lie fairly close to the fitted line,
# so the straight-line fit appears reasonable on this plot.
# Diagnostic plots 
par(mfrow = c(2, 2))  
plot(linear_model)

# Residuals vs Fitted: While there is some pattern with a few labeled
# outliers (such as observations 1 and 11), the red smoothed line is
# relatively flat through most of the range, though there is some curvature
# at the extremes.

# Q-Q Plot: The points follow the diagonal line reasonably well in the
# middle but deviate somewhat at the tails (especially points 1 and 11),
# suggesting slight departures from normality but not severe.

# Scale-Location: There does appear to be a slight upward trend in the
# spread of standardized residuals across fitted values, suggesting
# potential mild heteroscedasticity.

# Residuals vs Leverage: Points such as 1 and 11 stand out, but none appear to
# exceed Cook's distance thresholds (dotted lines) that would indicate truly problematic influence.

Model Adequacy

# F-test
# H₀: β₁ = 0 
# H₁: β₁ ≠ 0

#Full F test
anova_result <- anova(linear_model)
print(anova_result)
## Analysis of Variance Table
## 
## Response: bacteria
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## minutes    1 22146.9 22146.9  60.204 2.824e-05 ***
## Residuals  9  3310.8   367.9                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model adequacy
# Multiple R-squared:  0.8699,  Adjusted R-squared:  0.8555 
# F-statistic:  60.2 on 1 and 9 DF,  p-value: 2.824e-05
# The F-test confirms that the model is statistically significant.
# The predictor minutes significantly explains the variation in the number of surviving bacteria.
# This is supported by the Multiple R-squared of 0.8699, indicating a good fit.
# c. Identify an appropriate transformed model for these data. 
# Fit this model to the data and conduct the usual tests of model adequacy. 

# Here we use a log transformation of the response variable (bacteria),
# which is natural when the response decays multiplicatively over time.
# Log-transform the response
bacteria_data$log_bacteria <- log(bacteria_data$bacteria)

# Fit linear model on the transformed response
log_model <- lm(log_bacteria ~ minutes, data = bacteria_data)

# Summary of the transformed model
summary(log_model)
## 
## Call:
## lm(formula = log_bacteria ~ minutes, data = bacteria_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.169064 -0.072029  0.008668  0.065332  0.139730 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.325116   0.064708   82.30 2.93e-14 ***
## minutes     -0.236960   0.008754  -27.07 6.21e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1046 on 9 degrees of freedom
## Multiple R-squared:  0.9879, Adjusted R-squared:  0.9865 
## F-statistic: 732.7 on 1 and 9 DF,  p-value: 6.209e-10
# The transformed model is: log(bacteria) = 5.3251 – 0.2370 × minutes
# Both coefficients are statistically significant at the 0.05 level (p < 0.001).
# Each additional minute reduces bacteria by about 21.1%.
# The model explains 98.8% of the variation in the data (R² = 0.9879).
# Residuals show no major issues, indicating good model assumptions.
#Conclusion: The log-transformed model is statistically adequate and
# appropriate at the 0.05 significance level.
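# Sketch: the "about 21.1% per minute" figure quoted above comes from
# back-transforming the slope of the log model, and fitted counts on the
# original scale are obtained by exponentiating the fitted log values.
1 - exp(coef(log_model)["minutes"])   # proportional decrease per minute
head(exp(fitted(log_model)))          # fitted bacteria counts, original scale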
# F-test
# H₀: β₁ = 0 
# H₁: β₁ ≠ 0
# Full F-test
summary(aov(log_model))
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## minutes      1  8.014   8.014   732.7 6.21e-10 ***
## Residuals    9  0.098   0.011                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# For model adequacy, R-squared is a useful measure: R-squared = 0.9879
# means the model explains about 98.8% of the variance in the log bacteria
# counts, so the model has a very good fit.
# F-statistic: 732.7 on 1 and 9 DF, p-value: 6.209e-10
# The F-test indicates that the overall model is statistically significant,
# and the residual standard error (0.1046) is small.
# Conclusion: The model is adequate and fits the data very well.
# Visual check if the log transform model has a better fit
resd1 = residuals(log_model)
qqnorm(resd1)
qqline(resd1)

# The above Q-Q plot of the residuals approximately follows a straight 
# line. Hence, the assumption of normality of the residuals is 
# satisfied. Therefore, this transformed model fits the data better.

Problem 5

Work Problem 5.24, BUT delete the two observations at Locations H and I in the data set given in the problem. Instead of obtaining a thorough analysis of the data (as asked in the problem), only produce an analysis to determine if the following regression model assumptions are valid:

# Load the dataset without locations H and I
wheat_data <- data.frame(
  Location = c("A", "B", "B", "C", "C", "D", "E", "F", "G", "J", "K"),
  x1 = c(227, 243, 254, 296, 296, 327, 441, 356, 419, 363, 253),
  x2 = c(145, 194, 183, 179, 181, 196, 189, 186, 195, 194, 189),
  x3 = c(196, 193, 195, 239, 239, 358, 363, 340, 340, 316, 255),
  x4 = c(203, 226, 268, 327, 304, 388, 465, 441, 387, 370, 340),
  x5 = c(727, 810, 790, 711, 705, 641, 663, 846, 713, 766, 778),
  y = c(810, 1500, 1340, 2750, 3240, 2860, 4970, 3780, 2740, 4440, 2110)
)
# Hypotheses

# F-test Hypothesis:

# H₀: β₁ = β₂ = β₃ = β₄ = β₅ = 0 (All regression coefficients are zero)
# H₁: At least one βᵢ ≠ 0 (At least one predictor has a non-zero effect)

# t-test Hypotheses (for each coefficient):For each predictor variable xᵢ:

# H₀: βᵢ = 0 (The predictor has no significant effect on yield)
# H₁: βᵢ ≠ 0 (The predictor has a significant effect on yield)

# Fit the multiple regression model
model4 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = wheat_data)

# Summary of the model
summary(model4)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = wheat_data)
## 
## Residuals:
##        1        2        3        4        5        6        7        8 
##   -53.50   159.12  -612.20   -28.06   749.30  -134.39   143.09   -47.02 
##        9       10       11 
## -1044.34  1191.33  -323.34 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1655.090   5017.631  -0.330    0.755
## x1              8.001      8.437   0.948    0.387
## x2              2.257     23.114   0.098    0.926
## x3             -7.811     13.010  -0.600    0.574
## x4             12.949     10.207   1.269    0.260
## x5             -0.994      5.111  -0.194    0.853
## 
## Residual standard error: 850.9 on 5 degrees of freedom
## Multiple R-squared:  0.7847, Adjusted R-squared:  0.5694 
## F-statistic: 3.644 on 5 and 5 DF,  p-value: 0.09108
# Results
# Based on the regression output at a significance level of 0.05:

# x1 (rain amount Oct-Apr): Positive relationship (coefficient = 8.001), not
# statistically significant (p = 0.387 > 0.05)

# x2 (number of days in growing season): Positive relationship 
# (coefficient = 2.257), not statistically significant (p = 0.926 > 0.05)

# x3 (rain amount during growing season): Negative relationship 
# (coefficient = -7.811), not statistically significant (p = 0.574 > 0.05)

# x4 (water use during growing season): Positive relationship (coefficient =
# 12.949), not statistically significant (p = 0.260 > 0.05)

# x5 (pan evaporation during growing season): Negative relationship
# (coefficient = -0.994), not statistically significant (p = 0.853 > 0.05)

# None of the predictors show a statistically significant relationship with
# wheat yield at the 0.05 significance level. The overall model is also not
# statistically significant (F-statistic p-value = 0.09108 > 0.05).

# Multiple R-squared: 0.7847 - This indicates that approximately 78.47% of
# the variability in wheat yield is explained by the five predictor
# variables in the model.

# Adjusted R-squared: 0.5694, adjusts for the number of predictors. 
# The substantial drop from the Multiple R-squared suggests that some
# predictors may not be contributing meaningfully to the model.

# F-statistic: 3.644 on 5 and 5 DF, p-value: 0.09108 - This tests whether
# any of the predictor variables have a significant relationship with the
# response variable. Since the p-value (0.09108) is greater than the
# significance level of 0.05, we fail to reject the null hypothesis that all
# regression coefficients are zero. 
# This means that the overall model is not statistically significant at the
# 0.05 level.
#  (a) constant error variance
# Hypothesis for Breusch-Pagan test:
# H0: Homoscedasticity (constant error variance)
# H1: Heteroscedasticity (non-constant error variance)
bp_test <- bptest(model4)
print("Breusch-Pagan Test for Heteroscedasticity:")
## [1] "Breusch-Pagan Test for Heteroscedasticity:"
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model4
## BP = 5.0764, df = 5, p-value = 0.4066
# Based on the Breusch-Pagan test results:
# The test statistic BP = 5.0764 with 5 degrees of freedom yields a p-value
# of 0.4066.Since the p-value (0.4066) is greater than the significance 
# level of 0.05,we fail to reject the null hypothesis of homoscedasticity. 
# This means there is insufficient evidence to conclude that the error
# variance is non-constant.
# Therefore, the assumption of constant error variance (homoscedasticity)
# appears to be valid for this regression model based on the Breusch-Pagan test.
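# Visual complement (a sketch): residuals versus fitted values for model4,
# to inspect the constant-variance assumption graphically as well.
plot(fitted(model4), residuals(model4),
     xlab = "Fitted Values", ylab = "Residuals",
     main = "Residuals vs Fitted (Wheat Yield Model)")
abline(h = 0, lty = 2)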
# (b) normality for the error terms
# Hypothesis for Shapiro-Wilk test:
# H0: Residuals are normally distributed
# H1: Residuals are not normally distributed
sw_test <- shapiro.test(residuals(model4))
print("Shapiro-Wilk Test for Normality:")
## [1] "Shapiro-Wilk Test for Normality:"
print(sw_test)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model4)
## W = 0.94937, p-value = 0.6359
# Based on the Shapiro-Wilk test results:
# The test statistic W = 0.94937 with a p-value of 0.6359.
# Since the p-value (0.6359) is greater than the significance level of 0.05,
# we fail to reject the null hypothesis that the residuals are normally distributed.
# Therefore, the assumption of normality for the error terms appears to be
# valid for this regression model based on the Shapiro-Wilk test.
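# Visual complement (a sketch): normal Q-Q plot of the model4 residuals,
# to back up the Shapiro-Wilk result graphically.
qqnorm(residuals(model4), main = "Normal Q-Q Plot of Residuals (model4)")
qqline(residuals(model4))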