# load the packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(car)
## Loading required package: carData
library(MASS)
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## rivers
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the data
solar <- read.csv("SolarThermalEnergy.csv")
#Remove the last observation
solar <- solar[-nrow(solar), ]
Work Problem 4.4, BUT use the solar energy data that you used in Problem 1. For parts (a) and (b), interpret each plot.
#Consider the multiple regression model relating total heat flux to all
#five predictor variables
# Fit the multiple linear regression model
model <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = solar)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6188 -2.7896 0.4168 4.3807 16.1877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.15900 97.42412 3.543 0.00183 **
## x1 0.07105 0.02905 2.445 0.02293 *
## x2 2.12624 1.30292 1.632 0.11693
## x3 3.50171 1.48064 2.365 0.02727 *
## x4 -22.92065 2.69262 -8.512 2.07e-08 ***
## x5 2.59559 1.80825 1.435 0.16523
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.006 on 22 degrees of freedom
## Multiple R-squared: 0.9026, Adjusted R-squared: 0.8804
## F-statistic: 40.76 on 5 and 22 DF, p-value: 2.102e-10
#Model: A multiple regression model is used to predict y (total heat
#flux) using five predictors (x1, x2, x3, x4, x5).
#Total Heat Flux=345.159+0.071(x1)+2.126(x2)+3.502(x3)-22.921(x4)+2.596(x5)
#Significant Predictors: x1, x3, and x4 are statistically significant
#with p-values less than 0.05.
#Non-significant Predictors: x2 and x5 are not statistically significant
#(p-values greater than 0.05).
#R-squared: 90.26% of the variance in y is explained by the model.
#This implies that the model has a good fit.
#F-statistic: The model is significant overall, since the p-value (2.102e-10)
#is less than the significance level of 0.05.
#x1: Estimate = 0.07105, t-value = 2.445, p-value = 0.02293
#This predictor is statistically significant (p=0.02293) at the 5%
#significance level. A unit increase in x1 is associated with a
#significant increase in total heat flux.
#x2: Estimate = 2.12624, t-value = 1.632, p-value = 0.11693
#This predictor is not statistically significant (p=0.11693).
#The evidence suggests that x2 does not contribute significantly to
#explaining the variation in total heat flux at the 5% significance level.
#x3: Estimate = 3.50171, t-value = 2.365, p-value = 0.02727
#This predictor is statistically significant (p=0.02727).
#A unit increase in x3 leads to a significant increase in total heat flux.
#x4: Estimate = -22.92065, t-value = -8.512, p-value = 2.07e-08
#This predictor is highly statistically significant (p=2.07e-08),
#with a very small p-value. A unit increase in x4 leads to a significant
#decrease in total heat flux.
#x5: Estimate = 2.59559, t-value = 1.435, p-value = 0.16523
#This predictor is not statistically significant (p=0.16523), meaning it
#does not contribute significantly to explaining the variation in total heat flux.
#In conclusion, x1, x3, and x4 are important predictors in the model
#while x2 and x5 do not appear to provide significant additional explanatory value.
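# As an optional cross-check (not part of the original problem), 95% confidence
# intervals for the coefficients could be inspected; intervals that exclude zero
# correspond to the predictors flagged as significant above.
confint(model, level = 0.95)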
#a. Construct a normal probability plot of the residuals. Does there seem to be any problem with the normality assumption?
# Calculate residuals
residuals <- resid(model)
# Create Q-Q plot for normality
qqnorm(residuals, main = "Normal Q-Q Plot of Residuals")
qqline(residuals)
# Based on the normality plot above, we can see most of the data
# points fall along the straight line, particularly in the middle range of
# the distribution.
#However, there are notable deviations at both tails.
# These patterns suggest the residuals have heavier tails than a normal
# distribution would predict, particularly in the upper tail.
# While the deviations aren't extreme throughout the entire plot,
# they are substantial enough to raise some concerns about the strict
# validity of the normality assumption, which could potentially affect the
# hypothesis tests in the regression analysis.
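# As an optional, supplementary check on the tail deviations noted above, a
# Shapiro-Wilk test could be applied to the residuals; a p-value above 0.05
# would indicate no strong evidence against normality.
shapiro.test(residuals)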
#b. Construct and interpret a plot of the residuals versus the predicted response.
# Get residuals and fitted values
predicted <- fitted(model)
# Create plot of residuals vs fitted values
plot(predicted, residuals,
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residuals vs Predicted Values")
abline(h = 0, lty = 2)
# Overall pattern: The residuals appear to be randomly scattered around the
# zero line across the range of fitted values (from about 200 to 280),
# which generally supports the linear relationship assumption.
#Potential outliers: There are two noticeable outliers with residual values
# greater than 10 (at fitted values around 220 and 240), which correspond
# to the extreme points we observed in the upper tail of the Q-Q plot.
# The spread of residuals doesn't show a clear funnel shape across the
# range of fitted values, suggesting the homoscedasticity
# (constant variance) assumption is reasonably met.
# Residuals appear relatively balanced above and below the zero
# line, indicating that the model isn't systematically over- or under-predicting in certain ranges.
# Overall, the residual plot suggests that the linear model assumptions
# are generally reasonable, with the main concern being the two potential
# outliers, which might warrant further investigation. A formal check of
# constant variance is sketched below.
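# Optional sketch: the visual assessment of constant variance can be
# supplemented with a Breusch-Pagan test from the lmtest package (already
# loaded); failing to reject H0 would support homoscedasticity.
bptest(model)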
# c. Construct and interpret the partial regression plots for this model.
# partial regression plots for each predictor
avPlots(model, main="Partial Regression Plots")
# Interpretation of Partial Regression Plots
# x1 | others: Shows a positive linear relationship with y after
# controlling for other predictors.
# The slope is moderate and the pattern is relatively clear, indicating x1
# has a meaningful positive effect on the response.
# The points follow the line fairly well, suggesting this relationship is consistent.
# x2 | others: Displays a positive relationship with y.
# The slope is less steep than x1, suggesting a weaker effect.
# There's some scatter around the line but the trend is still visible,
# indicating x2 contributes to the model but with less impact than x1.
# x3 | others: Exhibits a positive linear relationship with y, with a
# steeper slope than x2.
# The association appears relatively strong with most points following the
# trend.
# x4 | others: Shows a clear negative relationship with y; this is the only
# predictor with a negative slope. The relationship appears strong and
# well defined with relatively little scatter, suggesting x4 is an
# important predictor that inversely affects the response variable.
# x5 | others: Presents a positive relationship with y, with an upward
# slope similar to x1, suggesting some contribution to the model, although its
# coefficient is not statistically significant in the t-tests above.
# Overall observations:
# All partial relationships appear approximately linear, supporting the linear
# regression specification.
# The visual trends for x2 and x5 should, however, be weighed against their
# non-significant t-tests (p > 0.05): the plots suggest some association, but
# the evidence that they add explanatory value beyond the other predictors is weak.
# d. Compute the studentized residuals and the R - student residuals for
# this model. What information is conveyed by these scaled residuals?
# Compute studentized residuals (internally studentized residuals)
stud_resid <- rstandard(model)
# Compute R-student residuals (externally studentized residuals)
r_stud_resid <- rstudent(model)
# View both sets of residuals
cbind(studentized = stud_resid, Rstudent = r_stud_resid)
## studentized Rstudent
## 1 1.25010918 1.26720421
## 2 0.26244992 0.25681814
## 3 0.62507482 0.61619971
## 4 2.75180072 3.31993582
## 5 -0.19309601 -0.18881650
## 6 0.20913743 0.20453245
## 7 -0.62556004 -0.61668670
## 8 -1.62107383 -1.68781122
## 9 -1.45505318 -1.49537295
## 10 -0.90027645 -0.89624173
## 11 0.09516998 0.09300102
## 12 -1.08082669 -1.08518097
## 13 -0.26893583 -0.26318555
## 14 0.01452182 0.01418801
## 15 -0.29245928 -0.28629225
## 16 0.70402216 0.69571728
## 17 0.18793131 0.18375804
## 18 -1.90455655 -2.03618601
## 19 0.79056376 0.78359814
## 20 -0.31002388 -0.30355978
## 21 0.63222959 0.62338266
## 22 2.36137293 2.67015114
## 23 0.52537463 0.51654603
## 24 -1.73977314 -1.83034138
## 25 -0.03201153 -0.03127626
## 26 0.58456986 0.57561766
## 27 0.64560120 0.63681908
## 28 -0.03730699 -0.03645040
# Plot these residuals
par(mfrow=c(1,2))
plot(stud_resid, main="Studentized Residuals",
ylab="Studentized Residuals", xlab="Observation Number")
abline(h=c(-2,0,2), lty=c(2,1,2), col=c("red","black","red"))
plot(r_stud_resid, main="R-Student Residuals",
ylab="R-Student Residuals", xlab="Observation Number")
abline(h=c(-2,0,2), lty=c(2,1,2), col=c("red","black","red"))
par(mfrow=c(1,1))
# Studentized residuals range from approximately -1.90 to 2.75, with most
# values falling within the -2 to 2 range. However, two points exceed +2,
# observation 4 (≈ 2.75) and observation 22 (≈ 2.36), indicating
# potential outliers.
# R-student residuals range from approximately -2.04 to 3.32, confirming
# the possibility of potential outliers.
# The most extreme value, observation 4 (≈ 3.32), suggests that this
# observation may be influential and warrants further investigation.
# Only one residual slightly exceeds the ±3 threshold (observation 4),
# making it a candidate for an extreme outlier, while the others remain within
# acceptable limits.
# Information conveyed: studentized residuals rescale each ordinary residual by
# its estimated standard deviation, which accounts for that observation's
# leverage, so they are directly comparable across observations. R-student
# residuals use a leave-one-out estimate of the error variance, so an outlying
# point cannot inflate its own denominator; this makes them more sensitive for
# detecting outliers, as seen for observations 4 and 22.
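# A quick way (not required by the problem) to list the observations exceeding
# the usual cutoffs is to index the scaled residuals directly.
which(abs(stud_resid) > 2)    # studentized residuals beyond +/- 2
which(abs(r_stud_resid) > 3)  # R-student residuals beyond +/- 3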
(Note that the 10th observation of y should be 38.5, instead of 385.) In Problem 3.12, you were asked to fit a model to the clathrate formation data in Table B.8.
# Load clathrate dataset (correcting observation 10 from 385 to 38.5)
clathrate_data <- data.frame(
x1 = c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05),
x2 = c(10,50,85,110,140,170,200,230,260,290,10,30,62,90,150,210,270,10,30,60,90,120,210,30,60,120,20,40,130,190,250,60,90,120,150),
y = c(7.5,15,22,28.6,31.6,34,35,35.5,36.5,38.5,12.3,18,20.8,25.7,32.5,34,35,14.4,19,26.4,28.5,29,35,15.1,26.4,27,21,27.3,48.5,50.4,52.5,34.4,46.5,50,51.9)
)
#2a. Construct a normality plot of the residuals from the full model.
# Does there seem to be any problem with the normality assumption?
# Full model: y ~ X1 + X2
full_model <- lm(y ~ x1 + x2, data = clathrate_data)
summary(full_model)
##
## Call:
## lm(formula = y ~ x1 + x2, data = clathrate_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8868 -3.9698 0.0728 3.8991 8.1424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.113e+01 1.661e+00 6.701 1.45e-07 ***
## x1 3.513e+02 3.949e+01 8.895 3.67e-10 ***
## x2 1.097e-01 9.954e-03 11.022 1.99e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.757 on 32 degrees of freedom
## Multiple R-squared: 0.8478, Adjusted R-squared: 0.8383
## F-statistic: 89.15 on 2 and 32 DF, p-value: 8.262e-14
# The regression equation is
# y = 11.13 + 351.3(x1) + 0.1097(x2)
# This multiple regression model is highly statistically significant
# (p < 0.05), with both x1 and x2 being strong predictors of y.
# The model explains about 85% of the variation in y, and residuals appear
# reasonably centered and symmetric.
# Based on R-squared, we can conclude that the model fits the data
# well and is appropriate for prediction and inference.
# a. Construct a normality plot of residuals
residuals_full <- residuals(full_model)
qqnorm(residuals_full, main = "Normal Q-Q Plot of Residuals (Full Model)")
qqline(residuals_full)
# Based on the Q-Q plot, most of the points fall along the straight line,
# with only slight deviations at both tails. There is no clear pattern to
# suggest that the normality assumption has been violated, so the normality
# assumption appears to be satisfied.
#b. Construct and interpret a plot of the residuals versus the predicted response.
# Compute fitted (predicted) values from the full model
fitted_values_full <- fitted(full_model)
# Plot of residuals vs predicted response
plot(fitted_values_full, residuals_full, main = "Residuals vs. Predicted Response",
xlab = "Predicted Response", ylab = "Residuals", pch = 19)
abline(h = 0, col = "red", lty = 2)
# The plot shows a fairly random scatter of residuals around the zero line
# across the range of fitted values (from approximately 10 to 55).
# There's no obvious pattern such as a curve, funnel shape, or systematic
# trend that would indicate problems with the model.
# The residuals appear to be reasonably evenly distributed above and below
# zero throughout the range of fitted values, suggesting that the linearity
# assumption is adequately satisfied.
# Overall, this residuals plot supports the appropriateness of the full
# model, as it shows no clear violations of the key regression assumptions
# regarding linearity and homoscedasticity.
#c. In Problem 3.12, you were asked to fit a second model. Compute the
# PRESS statistic for both models. Based on this statistic, which model is
# most likely to provide better predictions of new data?
reduced_model <- lm(y ~ x2, data = clathrate_data)
# Calculate PRESS statistic for both models
calculate_press <- function(model) {
  residuals <- residuals(model)
  h <- hatvalues(model)
  press_residuals <- residuals / (1 - h)
  press <- sum(press_residuals^2)
  return(press)
}
press_full <- calculate_press(full_model)
press_reduced <- calculate_press(reduced_model)
# Print PRESS statistics for comparison
cat("PRESS statistic for full model:", press_full, "\n")
## PRESS statistic for full model: 887.5867
cat("PRESS statistic for reduced model:", press_reduced, "\n")
## PRESS statistic for reduced model: 2813.665
cat("Better model based on PRESS statistic:",
ifelse(press_full < press_reduced, "Full model", "Reduced model"))
## Better model based on PRESS statistic: Full model
# Results:
# Full Model PRESS Statistic: 887.5867
# Reduced Model PRESS Statistic: 2813.665
# Interpretation:
# Full Model has a lower PRESS statistic compared to the Reduced Model,
# indicating that the full model is more accurate in predicting new data points.
# The reduced model, having a higher PRESS value, suggests it may not
# generalize as well to new data.
# Conclusion:
# Based on the PRESS statistic, the Full Model is more likely to provide
# better predictions of new data, as it has a smaller PRESS value.
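# As an optional cross-check, the olsrr package (already loaded) provides
# ols_press(), which should reproduce the manual PRESS values computed above.
ols_press(full_model)
ols_press(reduced_model)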
# Load the dataset
data <- data.frame(
x1 = c(-1, 1, -1, 1, -1, 1, -1, 1, 0, 0, 0, 0, 0, 0),
x2 = c(-1, -1, 1, 1, -1, -1, 1, 1, 0, 0, 0, 0, 0, 0),
x3 = c(1, -1, -1, 1, -1, 1, 1, -1, 0, 0, 0, 0, 0, 0),
y = c(102, 120, 117, 198, 103, 132, 132, 139, 133, 133, 140, 142, 145, 142)
)
# a. Perform a thorough analysis of the results including residual plots.
# Step1: Fit linear model
model3 <- lm(y ~ x1 + x2 + x3, data = data)
summary(model3)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.518 -8.768 -1.143 7.857 20.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 134.143 3.406 39.382 2.66e-12 ***
## x1 16.875 4.506 3.745 0.00382 **
## x2 16.125 4.506 3.579 0.00502 **
## x3 10.625 4.506 2.358 0.04009 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.74 on 10 degrees of freedom
## Multiple R-squared: 0.7641, Adjusted R-squared: 0.6933
## F-statistic: 10.8 on 3 and 10 DF, p-value: 0.001772
# Interpretation of Multiple Linear Regression Results
# Hypothesis Testing Framework
# For each coefficient in the model, we can state the hypotheses as:
# For the intercept:
# H₀: β₀ = 0 (The intercept is zero)
# H₁: β₀ ≠ 0 (The intercept is not zero)
# For hydrated silica level (x1):
# H₀: β₁ = 0 (x1 has no effect on abrasion index)
# H₁: β₁ ≠ 0 (x1 has a significant effect on abrasion index)
# For silane coupling agent level (x2):
# H₀: β₂ = 0 (x2 has no effect on abrasion index)
# H₁: β₂ ≠ 0 (x2 has a significant effect on abrasion index)
# For sulfur level (x3):
# H₀: β₃ = 0 (x3 has no effect on abrasion index)
# H₁: β₃ ≠ 0 (x3 has a significant effect on abrasion index)
# For overall model significance:
# H₀: β₁ = β₂ = β₃ = 0 (The model doesn't explain any variance)
# H₁: At least one βⱼ ≠ 0 (The model explains some variance)
# Results Interpretation
# Overall Model
# F-statistic: 10.8 with 3 and 10 degrees of freedom
# p-value: 0.001772 (< 0.05)
# Decision: Reject H₀
#Interpretation: The model as a whole is statistically significant at the 0.05 level
# Coefficient Interpretation
# Intercept (β₀ = 134.143)
# p-value: 2.66e-12 (< 0.05)
# Decision: Reject H₀
# Interpretation: The expected abrasion index is 134.143 when all factors
# are at their center points (0,0,0)
# Hydrated silica level (β₁ = 16.875)
# p-value: 0.00382 (< 0.05)
# Decision: Reject H₀
# Interpretation: A one-unit increase in hydrated silica level is
# associated with a 16.875 unit increase in abrasion index, holding other
# factors constant
# Silane coupling agent level (β₂ = 16.125)
# p-value: 0.00502 (< 0.05)
# Decision: Reject H₀
# Interpretation: A one-unit increase in silane coupling agent level is
# associated with a 16.125 unit increase in abrasion index, holding other
# factors constant
# Sulfur level (β₃ = 10.625)
# p-value: 0.04009 (< 0.05)
# Decision: Reject H₀
# Interpretation: A one-unit increase in sulfur level is associated with a
# 10.625 unit increase in abrasion index, holding other factors constant
# Model Fit
# R-squared: 0.7641, about 76.41% of variance is explained by the model
# Adjusted R-squared: 0.6933 (69.33% of variance explained, adjusted for
# model complexity)
# Conclusion
# At the 0.05 significance level, all three factors (hydrated silica,
# silane coupling agent, and sulfur) have statistically significant
# positive effects on the abrasion index. The model explains approximately
# 76% of the variance in the abrasion index, indicating good predictive
# power. The final regression equation is:
# Abrasion Index = 134.143 + 16.875(x1) + 16.125(x2) + 10.625(x3)
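# As an illustration only (the coded levels below are hypothetical, not taken
# from the data), the fitted equation can be used for prediction, here with a
# 95% prediction interval.
predict(model3, newdata = data.frame(x1 = 0.5, x2 = 0.5, x3 = 0.5),
        interval = "prediction", level = 0.95)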
# Normality plot
qqnorm(residuals(model3))
qqline(residuals(model3))
# Looking at the Q-Q plot, it is clear that the normality assumption is
# reasonable, since most of the data points fall on the straight line.
# Construct the ANOVA Table and test for significance of the model
#H0: The regression model is not significant
#H1: The regression model is significant
#Full F test
anova_result <- anova(model3)
print(anova_result)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 2278.12 2278.12 14.025 0.003815 **
## x2 1 2080.12 2080.12 12.806 0.005024 **
## x3 1 903.13 903.13 5.560 0.040093 *
## Residuals 10 1624.34 162.43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The individual p-values for all predictors (x1, x2, x3) are less than
# 0.05, indicating that each variable contributes significantly to
# explaining the variation in the response.
# So, the overall regression model is statistically significant.
# Since all three predictors have significant F values with p < 0.05,
# we reject the null hypothesis and conclude that the model is significant
# Create partial regression plots for each predictor
avPlots(model3, main="Partial Regression Plots")
# All predictors (x1, x2, x3) show positive partial relationships with the
# response variable y.
# The blue regression lines indicate a positive slope in each case.
# The plots reinforce the regression results, that all predictors are
# contributing meaningfully to the model.
# A few data points, such as observations 4 and 8, stand out and may deserve
# further attention.
# Check For Multicollinearity
vif(model3)
## x1 x2 x3
## 1 1 1
# All three predictors (x1, x2, x3) have VIF = 1, meaning;
# There is no multicollinearity among them.
# Each predictor provides unique, independent information about the
# response variable.
# The estimates of the regression coefficients are stable and reliable.
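# The VIFs of exactly 1 reflect the orthogonal coded factorial design; as a
# quick optional check, the pairwise correlations among the predictors should
# be zero off the diagonal.
cor(data[, c("x1", "x2", "x3")])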
# Model Diagnostic plots
par(mfrow = c(2, 2))
plot(model3)
# Model Diagnostic plots interpretation
# Residuals vs Fitted Plot
# Linearity and constant variance are satisfied since
# Residuals are roughly centered around zero with slight curvature,
# suggesting minor non-linearity but overall acceptable.
# Q-Q Plot
# Most points lie close to the reference line, with slight deviation at the
# tails (not severe)
# Thus Normality of residuals are satisfied
# Scale-Location Plot, residuals have a fairly even spread across fitted
# values, no clear pattern or funnel shape thus there is constant variance.
# Residuals vs Leverage
# No points exceed the Cook's distance threshold (dashed lines), meaning no
# observations are excessively influential. Point 4 has the highest influence
# but is not critical. The plot indicates no serious problems with influential observations.
# Generate predicted values and residuals from model3
predicted_Y <- predict(model3)
residuals <- residuals(model3)
# Create a data frame with Observation number, predicted values, and residuals
residual_output <- data.frame(
Observation = 1:length(predicted_Y),
Predicted_Y = round(predicted_Y, 7),
Residuals = round(residuals, 5)
)
print(residual_output, row.names = FALSE)
## Observation Predicted_Y Residuals
## 1 111.76786 -9.76786
## 2 124.26786 -4.26786
## 3 122.76786 -5.76786
## 4 177.76786 20.23214
## 5 90.51786 12.48214
## 6 145.51786 -13.51786
## 7 144.01786 -12.01786
## 8 156.51786 -17.51786
## 9 134.14286 -1.14286
## 10 134.14286 -1.14286
## 11 134.14286 5.85714
## 12 134.14286 7.85714
## 13 134.14286 10.85714
## 14 134.14286 7.85714
#Interpretation of Residual Output
# Small residuals (close to 0) indicate good prediction.
# Large residuals (positive or negative) indicate poor fit for those observations
# Most predictions are reasonable; observations 4 and 8 have the largest residuals, but based on our model diagnostics this is not critical.
# Compute the studentized residuals and the R - student residuals for
# this model.
# Compute studentized residuals
stud <- rstandard(model3)
# Compute R-student residuals
r_stud <- rstudent(model3)
# View both sets of residuals
cbind(studentized = stud, Rstudent = r_stud)
## studentized Rstudent
## 1 -1.03008744 -1.03360077
## 2 -0.45007477 -0.43136984
## 3 -0.60826004 -0.58802658
## 4 2.13361804 2.74240993
## 5 1.31632746 1.37342143
## 6 -1.42555063 -1.51507642
## 7 -1.26736535 -1.31233334
## 8 -1.84737802 -2.15936862
## 9 -0.09305633 -0.08831923
## 10 -0.09305633 -0.08831923
## 11 0.47691368 0.45767481
## 12 0.63976226 0.61974727
## 13 0.88403512 0.87349630
## 14 0.63976226 0.61974727
# b. Perform the appropriate test for lack of fit
# Test for lack of fit
# For this, we need to identify replicate observations (center points)
# The center points (0,0,0) appear 6 times
# Calculate pure error sum of squares
center_points <- subset(data, x1 == 0 & x2 == 0 & x3 == 0)
SSpe <- sum((center_points$y - mean(center_points$y))^2)
df_pe <- nrow(center_points) - 1
# Calculate lack of fit sum of squares
SSE <- sum(residuals(model3)^2)
SSlof <- SSE - SSpe
df_lof <- model3$df.residual - df_pe
# F-test for lack of fit
F_lof <- (SSlof/df_lof)/(SSpe/df_pe)
p_value <- 1 - pf(F_lof, df_lof, df_pe)
# Display lack of fit test results
cat("Lack of Fit Test:\n")
## Lack of Fit Test:
cat("F value:", F_lof, "\n")
## F value: 11.80688
cat("p-value:", p_value, "\n")
## p-value: 0.008492882
cat("Degrees of freedom:", df_lof, "and", df_pe, "\n")
## Degrees of freedom: 5 and 5
# Conclusion:
# Since the p-value (0.0085) is less than the 0.05 significance level, we reject
# the null hypothesis that the first-order model fits the data adequately.
# There is significant lack of fit, suggesting that higher-order terms
# (e.g., interaction or quadratic terms) may be needed.
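# Optional cross-check: the same lack-of-fit test can be obtained by comparing
# the fitted model against a model that treats each distinct design point as
# its own factor level (its residual SS is the pure error SS); the F value and
# p-value should match those computed above.
pure_error_model <- lm(y ~ interaction(x1, x2, x3), data = data)
anova(model3, pure_error_model)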
Work Problem 5.3, BUT delete the 7th observation in the data set given in the problem.
# Load the dataset (excluding the 7th observation)
bacteria_data <- data.frame(
bacteria = c(175, 108, 95, 82, 71, 50, 31, 28, 17, 16, 11),
minutes = c(1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12)
)
# a.Plot a scatter diagram. Does it seem likely that a straight - line
# model will be adequate?
# Create a scatter plot
plot(bacteria_data$minutes, bacteria_data$bacteria,
main = "Bacteria Survival vs. Exposure Time",
xlab = "Minutes of Exposure",
ylab = "Number of Bacteria",
pch = 19)
# The scatter plot shows a negative linear relationship between bacteria
# survival and heat exposure time, the points do follow a downward linear
# trend. The bacteria count decreases steadily as exposure time increases,
# with points falling roughly along a straight line.
# Conclusion
# The scatter plot of the number of bacteria against minutes of exposure does
# seem to follow a straight line with a negative slope, so a straight-line
# model may be adequate, but this requires further investigation (see the
# quick numeric check below and the residual analysis in part (b)).
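# A quick numeric companion to the scatter plot (not required by the problem):
# the sample correlation between exposure time and bacteria count; a value
# close to -1 would support a strong negative linear association.
cor(bacteria_data$minutes, bacteria_data$bacteria)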
# b. Fit the straight - line model. Compute the summary statistics and the
# residual plots. What are your conclusions regarding model adequacy?
# Fit linear model
linear_model <- lm(bacteria ~ minutes, data = bacteria_data)
summary(linear_model)
##
## Call:
## lm(formula = bacteria ~ minutes, data = bacteria_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.844 -10.486 -9.301 4.711 44.873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 142.584 11.867 12.015 7.62e-07 ***
## minutes -12.457 1.605 -7.759 2.82e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.18 on 9 degrees of freedom
## Multiple R-squared: 0.8699, Adjusted R-squared: 0.8555
## F-statistic: 60.2 on 1 and 9 DF, p-value: 2.824e-05
# Results
# The fitted simple linear regression model is:
# y-hat = 142.584 − 12.457×minutes
# Minutes (-12.457): There is a statistically significant negative
# relationship between exposure time and bacteria count (p < 0.05).
# Each additional minute of exposure decreases the bacteria count by
# approximately 12.5 bacteria.
# Intercept (142.584): The estimated initial bacteria count is
# approximately 143 at time zero, and this estimate is statistically
# significant (p < 0.05), although the intercept itself is of limited practical interest here.
# R-squared (0.8699): About 87% of the variation in bacteria counts is
# explained by exposure time.
# Adjusted R-squared (0.8555): Accounts for the number of predictors, still
# showing strong explanatory power.
# F-statistic (60.2) with a very low p-value confirms the model as a whole
# is statistically significant.
# Fitting the straight line
# Create a scatter plot
plot(bacteria_data$minutes, bacteria_data$bacteria,
main = "Bacteria Survival vs. Exposure Time",
xlab = "Minutes of Exposure",
ylab = "Number of Bacteria",
pch = 19)
# Add the fitted regression line
abline(linear_model, col = "blue", lwd = 2)
# The data points lie fairly close to the fitted line,
# suggesting that the straight-line fit is reasonable at first glance.
# Diagnostic plots
par(mfrow = c(2, 2))
plot(linear_model)
# Residuals vs Fitted: While there's some pattern with a few labeled outliers
# (e.g., observations 1 and 11), the red smoothed line is relatively flat
# through most of the range, though there's some curvature at the extremes.
# Q-Q Plot: The points follow the diagonal line reasonably well in the
# middle but deviate somewhat at the tails (especially points 1 and 11),
# suggesting slight departures from normality but not severe.
# Scale-Location: There does appear to be a slight upward trend in the
# spread of standardized residuals across fitted values, suggesting
# potential mild heteroscedasticity.
# Residuals vs Leverage: Points 1 and 11 stand out, but none appear to
# exceed the Cook's distance thresholds (dotted lines) that would indicate truly problematic influence.
# F-test
# H₀: β₁ = 0
# H₁: β₁ ≠ 0
#Full F test
anova_result <- anova(linear_model)
print(anova_result)
## Analysis of Variance Table
##
## Response: bacteria
## Df Sum Sq Mean Sq F value Pr(>F)
## minutes 1 22146.9 22146.9 60.204 2.824e-05 ***
## Residuals 9 3310.8 367.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model adequacy
# Multiple R-squared: 0.8699, Adjusted R-squared: 0.8555
# F-statistic: 60.2 on 1 and 9 DF, p-value: 2.824e-05
# The F-test confirms that the model is statistically significant.
# The predictor minutes significantly explains the variation in the number of surviving bacteria.
# This is supported by the multiple R-squared of 0.8699, indicating a good fit.
# c.Identify an appropriate transformed model for these data.
# Fit this model to the data and conduct the usual tests of model adequacy.
# Here we use the log transformation, so we transform the response
# variable (bacteria count).
# Log-transform the response
bacteria_data$log_bacteria <- log(bacteria_data$bacteria)
# Fit linear model on the transformed response
log_model <- lm(log_bacteria ~ minutes, data = bacteria_data)
# Summary of the transformed model
summary(log_model)
##
## Call:
## lm(formula = log_bacteria ~ minutes, data = bacteria_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.169064 -0.072029 0.008668 0.065332 0.139730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.325116 0.064708 82.30 2.93e-14 ***
## minutes -0.236960 0.008754 -27.07 6.21e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1046 on 9 degrees of freedom
## Multiple R-squared: 0.9879, Adjusted R-squared: 0.9865
## F-statistic: 732.7 on 1 and 9 DF, p-value: 6.209e-10
# The transformed model is: log(bacteria) = 5.3251 – 0.2370 × minutes
# Both coefficients are statistically significant at the 0.05 level (p < 0.001).
# Each additional minute reduces bacteria by about 21.1%.
# The model explains 98.8% of the variation in the data (R² = 0.9879).
# Residuals show no major issues, indicating good model assumptions.
#Conclusion: The log-transformed model is statistically adequate and
# appropriate at the 0.05 significance level.
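# Optional justification for choosing the log transformation: a Box-Cox
# analysis of the original straight-line model using the MASS package (already
# loaded). A lambda value near 0 in the resulting plot would point to the log
# transformation used here.
boxcox(linear_model, lambda = seq(-2, 2, by = 0.1))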
# F-test
# H₀: β₁ = 0
# H₁: β₁ ≠ 0
# Full F-test
summary(aov(log_model))
## Df Sum Sq Mean Sq F value Pr(>F)
## minutes 1 8.014 8.014 732.7 6.21e-10 ***
## Residuals 9 0.098 0.011
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# For the model adequacy, R-square is a good measure, based on
# R-squared = 0.9879, meaning the model explains 98.8% of the variance in
# the log bacteria counts thus the model has a very good fit.
# F-statistic: 732.7 on 1 and 9 DF, p-value: 6.209e-10
# F-statistics and F-test indicates that the over all model is
#statistically significant. Residual standard error = 0.1046 is small.
# Conclusion: The model is adequate and fits the data very well.
# Visual check if the log transform model has a better fit
resd1 = residuals(log_model)
qqnorm(resd1)
qqline(resd1)
# The above Q-Q plot of the residuals approximately follows a straight
# line. Hence, the assumption of normality of the residuals is
# satisfied. Therefore, the transformed model fits the data better.
BUT delete the two observations at Locations H and I in the data set given in the problem. Instead of obtaining a thorough analysis of the data (as asked in the problem), only produce an analysis to determine if the following regression model assumptions are valid:
# Load the dataset without locations H and I
wheat_data <- data.frame(
Location = c("A", "B", "B", "C", "C", "D", "E", "F", "G", "J", "K"),
x1 = c(227, 243, 254, 296, 296, 327, 441, 356, 419, 363, 253),
x2 = c(145, 194, 183, 179, 181, 196, 189, 186, 195, 194, 189),
x3 = c(196, 193, 195, 239, 239, 358, 363, 340, 340, 316, 255),
x4 = c(203, 226, 268, 327, 304, 388, 465, 441, 387, 370, 340),
x5 = c(727, 810, 790, 711, 705, 641, 663, 846, 713, 766, 778),
y = c(810, 1500, 1340, 2750, 3240, 2860, 4970, 3780, 2740, 4440, 2110)
)
# Hypotheses
# F-test Hypothesis:
# H₀: β₁ = β₂ = β₃ = β₄ = β₅ = 0 (All regression coefficients are zero)
# H₁: At least one βᵢ ≠ 0 (At least one predictor has a non-zero effect)
# t-test Hypotheses (for each coefficient):For each predictor variable xᵢ:
# H₀: βᵢ = 0 (The predictor has no significant effect on yield)
# H₁: βᵢ ≠ 0 (The predictor has a significant effect on yield)
# Fit the multiple regression model
model4 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = wheat_data)
# Summary of the model
summary(model4)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = wheat_data)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -53.50 159.12 -612.20 -28.06 749.30 -134.39 143.09 -47.02
## 9 10 11
## -1044.34 1191.33 -323.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1655.090 5017.631 -0.330 0.755
## x1 8.001 8.437 0.948 0.387
## x2 2.257 23.114 0.098 0.926
## x3 -7.811 13.010 -0.600 0.574
## x4 12.949 10.207 1.269 0.260
## x5 -0.994 5.111 -0.194 0.853
##
## Residual standard error: 850.9 on 5 degrees of freedom
## Multiple R-squared: 0.7847, Adjusted R-squared: 0.5694
## F-statistic: 3.644 on 5 and 5 DF, p-value: 0.09108
# Results
# Based on the regression output at a significance level of 0.05:
# x1 (rain amount Oct-Apr): Positive relationship (coefficient = 8.001), not
# statistically significant (p = 0.387 > 0.05)
# x2 (number of days in growing season): Positive relationship
# (coefficient = 2.257), not statistically significant (p = 0.926 > 0.05)
# x3 (rain amount during growing season): Negative relationship
# (coefficient = -7.811), not statistically significant (p = 0.574 > 0.05)
# x4 (water use during growing season): Positive relationship (coefficient =
# 12.949), not statistically significant (p = 0.260 > 0.05)
# x5 (pan evaporation during growing season): Negative relationship
# (coefficient = -0.994), not statistically significant (p = 0.853 > 0.05)
# None of the predictors show a statistically significant relationship with
# wheat yield at the 0.05 significance level. The overall model is also not
# statistically significant (F-statistic p-value = 0.09108 > 0.05).
# Multiple R-squared: 0.7847 - This indicates that approximately 78.47% of
# the variability in wheat yield is explained by the five predictor
# variables in the model.
# Adjusted R-squared: 0.5694, adjusts for the number of predictors.
# The substantial drop from the Multiple R-squared suggests that some
# predictors may not be contributing meaningfully to the model.
# F-statistic: 3.644 on 5 and 5 DF, p-value: 0.09108 - This tests whether
# any of the predictor variables have a significant relationship with the
# response variable. Since the p-value (0.09108) is greater than the
# significance level of 0.05, we fail to reject the null hypothesis that all
# regression coefficients are zero.
# This means that the overall model is not statistically significant at the
# 0.05 level.
# (a) constant error variance
# Hypothesis for Breusch-Pagan test:
# H0: Homoscedasticity (constant error variance)
# H1: Heteroscedasticity (non-constant error variance)
bp_test <- bptest(model4)
print("Breusch-Pagan Test for Heteroscedasticity:")
## [1] "Breusch-Pagan Test for Heteroscedasticity:"
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: model4
## BP = 5.0764, df = 5, p-value = 0.4066
# Based on the Breusch-Pagan test results:
# The test statistic BP = 5.0764 with 5 degrees of freedom yields a p-value
# of 0.4066.Since the p-value (0.4066) is greater than the significance
# level of 0.05,we fail to reject the null hypothesis of homoscedasticity.
# This means there is insufficient evidence to conclude that the error
# variance is non-constant.
# Therefore, the assumption of constant error variance (homoscedasticity)
# appears to be valid for this regression model based on the Breusch-Pagan test.
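# As an optional visual complement to the Breusch-Pagan test, the residuals can
# be plotted against the fitted values; a roughly constant spread around zero
# would agree with the formal test above.
plot(fitted(model4), residuals(model4),
     xlab = "Fitted Values", ylab = "Residuals",
     main = "Residuals vs Fitted (Wheat Yield Model)")
abline(h = 0, lty = 2)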
# (b) normality for the error terms
# Hypothesis for Shapiro-Wilk test:
# H0: Residuals are normally distributed
# H1: Residuals are not normally distributed
sw_test <- shapiro.test(residuals(model4))
print("Shapiro-Wilk Test for Normality:")
## [1] "Shapiro-Wilk Test for Normality:"
print(sw_test)
##
## Shapiro-Wilk normality test
##
## data: residuals(model4)
## W = 0.94937, p-value = 0.6359
# Based on the Shapiro-Wilk test results:
# The test statistic W = 0.94937 with a p-value of 0.6359.
# Since the p-value (0.6359) is greater than the significance level of 0.05,
# we fail to reject the null hypothesis that the residuals are normally distributed.
# Therefore, the assumption of normality for the error terms appears to be
# valid for this regression model based on the Shapiro-Wilk test.
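# As an optional visual complement to the Shapiro-Wilk test, a normal Q-Q plot
# of the residuals can be inspected; points lying close to the reference line
# would agree with the formal test above.
qqnorm(residuals(model4), main = "Normal Q-Q Plot of Residuals (Wheat Yield Model)")
qqline(residuals(model4))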