# Loading the necessary packages
library(MASS)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:MASS':
##
## cement
##
## The following object is masked from 'package:datasets':
##
## rivers
# Loading in the Data
Solar <- data.frame(
Y = c(271.80, 264.00, 238.80, 230.70, 251.60, 257.90, 263.90, 266.50, 229.10, 239.30, 258.00, 257.60, 267.30, 267.00, 259.60, 240.40, 227.20, 196.00, 278.70, 272.30, 267.40, 254.50, 224.70, 181.50, 227.50, 253.60, 263.00, 265.80),
x1 = c(783.35, 748.45, 684.45, 827.00, 860.45, 875.15, 909.45, 905.55, 756.00, 769.35, 793.50, 801.65, 819.65, 808.55, 774.95, 711.85, 694.85, 638.10, 774.55, 757.90, 753.35, 704.70, 666.00, 568.55, 653.10, 704.05, 709.60, 726.90),
x2 = c(33.53, 36.50, 34.66, 33.13, 35.75, 34.46, 34.60, 35.38, 35.85, 35.68, 35.35, 35.04, 34.07, 32.20, 34.32, 31.08, 35.73, 34.11, 34.79, 35.77, 36.44, 37.82, 35.07, 35.26, 35.56, 35.73, 36.46, 36.26),
x3 = c(40.55, 36.19, 37.31, 32.52, 33.71, 34.14, 34.85, 35.89, 33.53, 33.79, 34.72, 35.22, 36.50, 37.60, 37.89, 37.71, 37.00, 36.76, 34.62, 35.40, 35.96, 36.26, 36.34, 35.90, 31.84, 33.16, 33.83, 34.89),
x4 = c(16.66, 16.46, 17.66, 17.50, 16.40, 16.28, 16.06, 15.93, 16.60, 16.41, 16.17, 15.92, 16.04, 16.19, 16.62, 17.37, 18.12, 18.53, 15.54, 15.70, 16.45, 17.62, 18.12, 19.05, 16.51, 16.02, 15.89, 15.83),
x5 = c(13.20, 14.11, 15.68, 10.53, 11.00, 11.31, 11.96, 12.58, 10.66, 10.85, 11.41, 11.91, 12.85, 13.58, 14.21, 15.56, 15.83, 16.41, 13.10, 13.63, 14.51, 15.38, 16.10, 16.73, 10.58, 11.28, 11.91, 12.65)
)
# Store descriptive variable labels for reference; the column names used for
# modeling remain Y, x1, ..., x5
Solar_labels <- c(
  Y  = "Total Heat Flux (kilowatts)",
  x1 = "Insolation (watt/sq meter)",
  x2 = "Position-East Direction (inches)",
  x3 = "Position-South Direction (inches)",
  x4 = "Position-North Direction (inches)",
  x5 = "Time of the Day"
)
# Check for missing values
colSums(is.na(Solar))
## Y x1 x2 x3 x4 x5
## 0 0 0 0 0 0
# See the composition and data structure
str(Solar)
## 'data.frame': 28 obs. of 6 variables:
## $ Y : num 272 264 239 231 252 ...
## $ x1: num 783 748 684 827 860 ...
## $ x2: num 33.5 36.5 34.7 33.1 35.8 ...
## $ x3: num 40.5 36.2 37.3 32.5 33.7 ...
## $ x4: num 16.7 16.5 17.7 17.5 16.4 ...
## $ x5: num 13.2 14.1 15.7 10.5 11 ...
# The Solar data set is clean, there are no missing values
# Scatter plot to visualize the relationship
plot(Solar$x1, Solar$Y, pch = 19,
     xlab = "Insolation (watt/sq meter)", ylab = "Total heat flux (kilowatts)")
# There is a positive linear relationship between insolation and total heat flux,
# shown by the upward trend: as insolation increases, total heat flux tends to
# increase as well.
##Problem 1a
# a. Fit a simple linear regression model relating total heat flux to
# Insolation (watt/sq meter). Write the fitted regression model along with
# the interpretation of the slope coefficient.
# F-test: testing (simultaneously) all slope coefficients:
# H0: all slopes βj are zero vs. Ha: at least one βj is nonzero
# (H0: "the model is not useful" vs. Ha: "the model is useful").
# t-test: testing an individual coefficient: H0: βj = 0 vs. Ha: βj ≠ 0.
#Fit the Linear Regression Model
model <- lm(Y ~ x1, data = Solar)
summary(model)
##
## Call:
## lm(formula = Y ~ x1, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.194 -12.600 0.915 15.442 26.157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.4726 32.4006 3.286 0.002908 **
## x1 0.1886 0.0426 4.427 0.000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.82 on 26 degrees of freedom
## Multiple R-squared: 0.4298, Adjusted R-squared: 0.4079
## F-statistic: 19.6 on 1 and 26 DF, p-value: 0.0001525
# The fitted regression model is: Ŷ = 106.4726 + 0.1886·x1
# where:
# Y  = Total Heat Flux (kilowatts)
# x1 = Insolation (watt/sq meter)
# 106.47 is the intercept: the estimated Total Heat Flux when Insolation is zero.
# 0.1886 is the slope coefficient: the estimated change in Total Heat Flux for each additional unit of Insolation.
# Interpretation of the Slope Coefficient
# The estimated slope coefficient is 0.1886, meaning that for every
# additional 1 watt/sq meter increase in Insolation, the Total Heat Flux
# increases by 0.1886 kilowatts, on average.
# p-value for x1 = 0.000153: the slope coefficient is highly statistically
# significant (p < 0.001), indicating a statistically significant linear association
# between Insolation and Total Heat Flux.
# R-squared = 0.4298: the model explains about 42.98% of the variability in Total Heat Flux.
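# Supplementary sketch (not part of the original output): a 95% confidence interval
# for the slope can be obtained directly with confint(); it should agree with
# estimate ± t(0.975, 26) × standard error from the summary above.
confint(model, parm = "x1", level = 0.95)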
##Problem 1b
# b. Test the hypothesis of linear dependence between total heat flux and
# Insolation (watt/sq meter) at 5% level of significance (25 points- Each
# element of the test structure will take 5 points).
summary(model)
##
## Call:
## lm(formula = Y ~ x1, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.194 -12.600 0.915 15.442 26.157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.4726 32.4006 3.286 0.002908 **
## x1 0.1886 0.0426 4.427 0.000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.82 on 26 degrees of freedom
## Multiple R-squared: 0.4298, Adjusted R-squared: 0.4079
## F-statistic: 19.6 on 1 and 26 DF, p-value: 0.0001525
# Analysis of variance
summary(aov(model))
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 6221 6221 19.6 0.000153 ***
## Residuals 26 8253 317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Hypothesis Testing Structure
# F-test
# Define the Hypotheses
# Null Hypothesis (H0):β1=0 (No linear relationship between total heat flux and insolation)
# Alternative Hypothesis (Ha):β1 ≠ 0 (There is a significant linear relationship)
# Statistic: Fobs ~ F1,26 under H0
# The chosen significance level is α=0.05.
# To test the slope:
# t-test for the individual coefficient: H0: β1 = 0 vs. Ha: β1 ≠ 0
# Interpretation of results
#Decision:Since P(F1,26>19.598)=0.0001525<0.05, we reject the null hypothesis.
#Conclusion:At a significance level of α=0.05, there is a significant
#relationship between total heat flux (Y) and Insolation (x₁).
#Therefore, we conclude that Insolation (x₁) significantly impacts total heat flux (Y).
# Equivalently, the two-sided t-test gives 2·P(t26 > 4.427) = 0.000153 < 0.05, leading to the same conclusion.
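# Supplementary hand-check of the test statistics reported above (values taken from
# the summary()/aov() output; note that F = t^2 here, 4.427^2 ≈ 19.6, so the two
# tests are equivalent in simple linear regression):
pf(19.598, df1 = 1, df2 = 26, lower.tail = FALSE)   # overall F-test p-value, ~ 0.00015
2 * pt(4.427, df = 26, lower.tail = FALSE)          # two-sided t-test p-value for the slope, ~ 0.00015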
# To test the intercept:
# H0: β0 = 0 vs. Ha: β0 ≠ 0
# α = 0.05
# Statistic: tobs ~ t26 under H0
# Decision: Since the two-sided p-value 2·P(t26 > 3.286) = 0.002908 < 0.05, we reject
# the null hypothesis at α = 0.05; the intercept is significantly different from zero.
# Interpretation of R-squared and Coefficients
# R-squared = 0.4298: approximately 42.98% of the sample variation in total heat flux (Y) is explained by Insolation (x₁).
# Adjusted R-squared = 0.4079: after adjusting for the number of predictors in the model, approximately 40.79% of the variation in total heat flux (Y) is explained by Insolation (x₁).
# The hypothesis test will examine whether there's a statistically
#significant linear relationship between the variables:
# Hypotheses:
#Null Hypothesis (H₀): β₁ = 0 (no linear relationship between total heat flux and insolation)
#Alternative Hypothesis (H₁): β₁ ≠ 0 (there is a linear relationship between total heat flux and insolation)
#Test Statistic:t = β̂₁ / SE(β̂₁)
#where β̂₁ is the estimated slope coefficient and SE(β̂₁) is its standard error
# The regression model has already been fitted above as:
# model <- lm(Y ~ x1, data = Solar)
# Extract key statistics from the model summary
model_summary <- summary(model)
# Extract the slope coefficient, standard error, t-statistic and p-value
slope_estimate <- model_summary$coefficients[2, 1] # 0.1886
se_slope <- model_summary$coefficients[2, 2] # 0.0426
t_statistic <- model_summary$coefficients[2, 3] # 4.427
p_value <- model_summary$coefficients[2, 4] # 0.000153
# Extract degrees of freedom
df <- model_summary$df[2] # 26
# Calculate critical t-value for two-tailed test at 5% significance
t_critical <- qt(0.975, df) # 0.975 for two-tailed at 5% level
# Display results
cat("Hypothesis Test for Linear Dependence\n")
## Hypothesis Test for Linear Dependence
cat("--------------------------------------\n")
## --------------------------------------
cat("Slope coefficient (β₁):", slope_estimate, "\n")
## Slope coefficient (β₁): 0.1885873
cat("Standard error:", se_slope, "\n")
## Standard error: 0.04259976
cat("t-statistic:", t_statistic, "\n")
## t-statistic: 4.426956
cat("Degrees of freedom:", df, "\n")
## Degrees of freedom: 26
cat("Critical t-value (two-tailed, 5%):", t_critical, "\n")
## Critical t-value (two-tailed, 5%): 2.055529
cat("p-value:", p_value, "\n\n")
## p-value: 0.0001525174
# Decision
if (abs(t_statistic) > t_critical) {
cat("Decision: Reject the null hypothesis (|t| > t_critical)\n")
} else {
cat("Decision: Fail to reject the null hypothesis (|t| ≤ t_critical)\n")
}
## Decision: Reject the null hypothesis (|t| > t_critical)
if (p_value < 0.05) {
cat("Decision: Reject the null hypothesis (p < 0.05)\n")
} else {
cat("Decision: Fail to reject the null hypothesis (p ≥ 0.05)\n")
}
## Decision: Reject the null hypothesis (p < 0.05)
cat("\nConclusion: There is", ifelse(p_value < 0.05, "sufficient", "insufficient"),
"evidence at the 5% significance level to conclude that there is a linear relationship\n",
"between total heat flux and insolation.\n")
##
## Conclusion: There is sufficient evidence at the 5% significance level to conclude that there is a linear relationship
## between total heat flux and insolation.
##Problem 1c
# c. Test the hypothesis that the intercept is greater than 100 at the 5% level of significance.
# State the Hypotheses:
# Null Hypothesis (H0): β0 ≤ 100
# Alternative Hypothesis (H1): β0 > 100
# Test statistic: t = (β̂₀ - 100) / SE(β̂₀), with n - 2 = 26 degrees of freedom under H0
#Calculate the Test Statistic and p-value:
# Get the summary of the model
model_summary <- summary(model)
# Extract the intercept and its standard error
intercept_estimate <- model_summary$coefficients[1, 1] # Intercept coefficient
se_intercept <- model_summary$coefficients[1, 2] # Standard error of the intercept
# Calculate the t-statistic
t_statistic <- (intercept_estimate - 100) / se_intercept
# Extract degrees of freedom
df <- model_summary$df[2]
# Find the critical value for a one-tailed t-test at 5% significance
t_critical <- qt(0.95, df)
# Calculate the p-value for the one-tailed test
p_value <- 1 - pt(t_statistic, df)
# Display results
t_statistic
## [1] 0.1997681
p_value
## [1] 0.4216081
t_critical
## [1] 1.705618
# Decision: Compare the t-statistic to the critical value
if (t_statistic > t_critical) {
cat("Decision: Reject the null hypothesis. The intercept is greater than 100.\n")
} else {
cat("Decision: Fail to reject the null hypothesis. The intercept is not greater than 100.\n")
}
## Decision: Fail to reject the null hypothesis. The intercept is not greater than 100.
# Decision rule: Reject H0, if t_statistic > t_critical
#Test Statistic, Test critical and p-value Results:
#Test Statistic = 0.1998
# T_critical = 1.7056
#p-value: 0.4216
# Since t_statistic=0.1998 is less than t_critical=1.7056, fail to reject H0
#Conclusion:Since the p-value (0.4216) > 0.05, we fail to reject the null hypothesis.
#Interpretation: There is insufficient evidence to conclude that the
#intercept is significantly greater than 100 at the 5% significance level.
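# Supplementary cross-check using the car package loaded above (a sketch, not part of
# the original analysis): linearHypothesis() reports a two-sided F-test of
# H0: intercept = 100, so its p-value is roughly twice the one-sided p-value computed
# above when the estimate lies on the alternative's side.
linearHypothesis(model, "(Intercept) = 100")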
##Problem 1d
# d. Interpret the estimated coefficient of determination from 1(a) (2 points).
# Interpretation of the Coefficient of Determination (R-squared):
#R-squared : 0.4298
#This value represents the proportion of the variance in the dependent
#variable (total heat flux) that is explained by the independent variable
#(insolation). In this case:
#Interpretation: Approximately 42.98% of the variation in total heat flux
#can be explained by insolation (watt/sq meter). The remaining 57.02% of
#the variation is unexplained by the model and could be due to other
#factors not included in the model.
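# Supplementary check (not in the original output): in simple linear regression,
# R-squared equals the squared sample correlation between Y and x1.
cor(Solar$Y, Solar$x1)^2   # ~ 0.4298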
##Problem 1e
# e. Obtain the prediction, 95% Confidence and 95% Prediction intervals at
# x1 = 1000, using the fitted model in 1(a), and interpret the interval
# results (8 points).
# To predict the responses on new values
# Create a new data frame with x1 = 1000
x0 <- data.frame(x1 = 1000)
# Get the 95% Confidence Interval prediction
predict(model, x0, interval = "confidence", level = 0.95, type = "response")
## fit lwr upr
## 1 295.0599 272.6397 317.4801
# Get the 95% Prediction Interval prediction
predict(model, x0, interval = "prediction", level = 0.95, type = "response")
## fit lwr upr
## 1 295.0599 252.1195 338.0002
# 95% Confidence Interval (for the mean response at x1=1000)
# Prediction: 295.06 kW
#Lower Bound: 272.64 kW
#Upper Bound: 317.48 kW
#Interpretation:
#We are 95% confident that the mean total heat flux at an insolation of 1000 W/m²
#lies between 272.64 and 317.48 kW; under repeated sampling, intervals constructed
#this way would cover the true mean response about 95% of the time.
#95% Prediction Interval (for a single new observation at x1=1000)
#Prediction: 295.06 kW
#Lower Bound: 252.12 kW
#Upper Bound: 338.00 kW
#Interpretation:
#For a single new observation with an insolation of 1000 W/m², the total heat flux is
#expected to fall within (252.12, 338.00) kW with 95% confidence. The prediction
#interval is wider than the confidence interval because it also accounts for the
#variability of an individual observation around the mean response.
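# A cautionary note (supplementary): x1 = 1000 lies slightly above the largest observed
# insolation value, so both intervals involve a mild extrapolation beyond the data.
range(Solar$x1)   # observed insolation runs from about 568.55 to 909.45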
##Problem 1f
# f. Fit a third order polynomial regression relating total heat flux to
# Insolation (watt/sq meter) and write the fitted model (5 points).
# Fit a third-order polynomial regression model
model_poly <- lm(Y ~ poly(x1, 3), data = Solar)
# Display the model summary
summary(model_poly)
##
## Call:
## lm(formula = Y ~ poly(x1, 3), data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.950 -7.313 3.051 8.920 19.815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 249.132 2.699 92.314 < 2e-16 ***
## poly(x1, 3)1 78.873 14.280 5.523 1.11e-05 ***
## poly(x1, 3)2 -57.071 14.280 -3.996 0.000532 ***
## poly(x1, 3)3 10.088 14.280 0.706 0.486737
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.28 on 24 degrees of freedom
## Multiple R-squared: 0.6619, Adjusted R-squared: 0.6196
## F-statistic: 15.66 on 3 and 24 DF, p-value: 7.467e-06
# Fitted Third-Order Polynomial Regression Model:
# Note that poly(x1, 3) generates orthogonal polynomial terms, so the reported
# coefficients apply to those orthogonal terms rather than to the raw powers of x1.
# The fitted model is:
# Ŷ = 249.132 + 78.873·P1(x1) − 57.071·P2(x1) + 10.088·P3(x1)
# Where:
# Y is the total heat flux (response variable),
# x1 is the insolation (predictor variable),
# P1, P2, P3 are the first-, second- and third-degree orthogonal polynomial terms in x1,
# 249.132 is the intercept,
# 78.873 is the coefficient of the linear (first-degree) term,
# -57.071 is the coefficient of the quadratic (second-degree) term,
# 10.088 is the coefficient of the cubic (third-degree) term.
# Interpretation of Coefficients:
# Intercept (β0 = 249.132): because the orthogonal polynomial terms are centered, the
# intercept equals the sample mean of total heat flux (about 249.13 kW), not the
# predicted value at zero insolation.
# Linear term (β1 = 78.873, p = 1.11e-05): a strong, statistically significant positive
# linear component in the relationship between insolation and total heat flux.
# Quadratic term (β2 = -57.071, p = 0.000532): a statistically significant negative
# quadratic component, indicating curvature: the increase in heat flux levels off at
# higher insolation values.
# Cubic term (β3 = 10.088, p = 0.487): the cubic component is not statistically
# significant, so there is little evidence that a third-order term is needed beyond
# the quadratic.
# Model Significance:
# The overall model is significant, with a p-value of 7.467e-06, indicating that at
# least one of the coefficients is significantly different from zero.
# The R² value of 0.6619 suggests that about 66.2% of the variability in total heat
# flux is explained by the third-order polynomial model.
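# Supplementary sketch (output not shown): refitting with raw = TRUE gives coefficients
# on the raw powers x1, x1^2, x1^3 instead of on the orthogonal polynomial terms; the
# fitted values and R-squared are identical either way. model_poly_raw is a name
# introduced here for illustration only.
model_poly_raw <- lm(Y ~ poly(x1, 3, raw = TRUE), data = Solar)
coef(model_poly_raw)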
##Problem 2
#2. Again, use the solar energy data to work on the following problems below.
# a. Fit a multiple linear regression model using all independent
# variables. Write the fitted regression model along with the interpretation
# of the partial slope coefficients (5 points).
# Fit the multiple linear regression model using all independent variables
model2 <- lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
# Display the model summary
summary(model2)
##
## Call:
## lm(formula = Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x1 0.07089 0.02910 2.436 0.02340 *
## x2 2.12235 1.30382 1.628 0.11781
## x3 3.50194 1.48229 2.363 0.02741 *
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Fitted Regression Model:
# Ŷ = 345.423 + 0.0709x₁ + 2.122x₂ + 3.502x₃ - 22.917x₄ + 2.591x₅
#Where:
#Y = Total Heat Flux (kilowatts)
#x1= Insolation (watt/sq meter)
#x2= Position-East Direction (inches)
#x3= Position-South Direction (inches)
#x4= Position-North Direction (inches)
#x5= Time of the Day
#Interpretation of the partial slope coefficients (brief summary; a more detailed,
#unit-based interpretation follows below):
#β₁ = 0.071: For each unit increase in x₁, total heat flux (Y) is expected to
#increase by 0.071 units, holding all other variables constant.
#This coefficient is statistically significant (p-value = 0.023).
#β₂ = 2.122: For each unit increase in x₂, Y is expected to increase by 2.122
#units, holding all other variables constant. This coefficient is not
#statistically significant (p-value = 0.118).
#β₃ = 3.502: For each unit increase in x₃, Y is expected to increase by 3.502
#units, holding all other variables constant. This coefficient is statistically
#significant (p-value = 0.027).
#β₄ = -22.917: For each unit increase in x₄, Y is expected to decrease by
#22.917 units, holding all other variables constant. This coefficient is highly
#significant (p-value = 2.12e-08).
#β₅ = 2.591: For each unit increase in x₅, Y is expected to increase by 2.591
#units, holding all other variables constant. This coefficient is not
#statistically significant (p-value = 0.166).
#The model has a high R-squared value of 0.9024, indicating that approximately
#90.24% of the variability in total heat flux is explained by these five
#predictor variables. The overall model is statistically significant, with an
#F-statistic of 40.69 and a p-value of 2.139e-10.
#Interpretation of the Partial Slope Coefficients:
#Intercept (345.42):
#When all predictors (x1,x2,x3,x4,x5) are zero, the estimated total heat flux
#is 345.42 kW. However, this may not have a practical interpretation if zero values of the predictors are unrealistic.
#Insolation (x1, coefficient β1 = 0.07089, p = 0.0234):
#For each 1 unit increase in Insolation (watt/sq meter), the Total Heat Flux
#(kilowatts) increases by 0.07089, holding other variables constant.
#This relationship is statistically significant (p-value = 0.02340 < 0.05).
#Position-East Direction (x2 coefficient = 2.12235):
#For each 1 inch increase in the Eastward position, the Total Heat Flux
#(kilowatts) increases by 2.12235, keeping other variables constant.
#However, this relationship is not statistically significant (p-value = 0.11781).
#Position-South Direction (x3,coefficient = 3.50194):
#For each 1 inch increase in the Southward position, the Total Heat Flux
#(kilowatts) increases by 3.50194, holding other factors constant.
#This relationship is statistically significant (p-value = 0.02741).
#Position-North Direction (x4,coefficient = -22.91744):
#For each 1 inch increase in the Northward position, the Total Heat Flux
#(kilowatts) decreases by 22.91744, keeping other variables constant. This
#relationship is highly statistically significant (p-value < 0.001).
#Time of the Day (x5,coefficient = 2.59057):
#For each 1 unit increase in Time of the Day (the time variable), the Total
#Heat Flux (kilowatts) increases by 2.59057, holding other variables
#constant. This relationship is not statistically significant (p-value = 0.16636).
#R-squared = 0.9024 (90.24%): the five predictors jointly explain 90.24% of the
#variability in total heat flux.
#Adjusted R-squared = 0.8802 (88.02%): after adjusting for the number of
#predictors, the model still explains about 88.02% of the variability in total
#heat flux.
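# Supplementary sketch (not part of the original output): since the car package is
# loaded above, variance inflation factors can be checked to gauge multicollinearity
# among the five predictors before interpreting the partial slopes.
vif(model2)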
#2b. Conduct a full F-test for your fitted model at 5% level of significance (25 points- Each element of the test structure will take 5 points).
# To test the full model in 2a, we have the following test structure:
# H0: β1 = β2 = β3 = β4 = β5 = 0
# Ha: At least one of the coefficients is different from zero.
# α = 0.05
# Statistic: Fobs = MSR/MSE ~ F(5, 22) under H0
# 5 is the numerator degrees of freedom: the number of regressors (k = 5).
# 22 is the denominator (error) degrees of freedom: n - k - 1 = 28 - 5 - 1.
# Full F-test
summary(aov(model2))
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 6221 6221 96.901 1.61e-09 ***
## x2 1 511 511 7.952 0.009974 **
## x3 1 999 999 15.555 0.000691 ***
## x4 1 5200 5200 81.000 7.92e-09 ***
## x5 1 132 132 2.049 0.166364
## Residuals 22 1412 64
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# SSR = 6221 + 511 + 999 + 5200 + 132 = 13063 and SST = SSR + SSE = 13063 + 1412 = 14475
# MSR = 13063/5 = 2612.6
# SSE = 1412 and MSE = 1412/22 = 64.18
# Fobs = MSR/MSE = 2612.6/64.18 ≈ 40.7, matching the F-statistic of 40.69 reported by summary(model2)
# Decision: Since the p-value := P(F(5,22) > 40.69) = 2.139e-10 is less
# than α = 0.05, we reject the null hypothesis,
# indicating that at least one of the regression coefficients is significantly different from zero.
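# Supplementary hand computation of the overall F statistic (a sketch; sums of squares
# taken from the aov() output above, results not shown here):
SSR <- 6221 + 511 + 999 + 5200 + 132   # regression sum of squares = 13063
SSE <- 1412                             # residual sum of squares
F_obs <- (SSR / 5) / (SSE / 22)         # ~ 40.7, matching the summary(model2) F-statistic
pf(F_obs, df1 = 5, df2 = 22, lower.tail = FALSE)   # p-value ~ 2e-10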
# 2c. Conduct a partial F-test to test the hypothesis that H0: β2=β5=0 at 5%
#level of significance (25 points- Each element of the test structure will
#take 5 points).
#Partial F-Test
#First Run full model
full_model=lm(Y ~ x1 + x2 + x3 + x4 + x5, data=Solar)
# Fit the reduced model with x2 and x5 excluded
exc_model=lm(Y ~ x1 + x3 + x4, data=Solar)
#Run ANOVA of the two
anova(exc_model, full_model)
# The test structure:
# H0: β2 = β5 = 0 (x2 and x5 add nothing beyond x1, x3 and x4)
# Alternative Hypothesis: Ha: at least one of β2, β5 is different from zero
# α = 0.05
# Statistic: Fobs = [(SSE_reduced - SSE_full)/2] / MSE_full ~ F(2, 22) under H0
# Interpretation:
# Decision: If p-value := P(F(2,22) > Fobs) < α, we reject H0.
# The F-statistic is 3.1473, and the corresponding p-value is 0.06279.
# Since the p-value is greater than 0.05 (our significance level),
# we fail to reject the null hypothesis at the 5% significance level.
# Conclusion: There is no strong evidence that the predictors x2 and
# x5 significantly improve the model. Therefore, we conclude that these
# variables do not have a significant contribution at the 5% significance level.
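# Supplementary hand computation of the partial F statistic (a sketch; the RSS values
# come from the model outputs shown elsewhere in this document):
RSS_full    <- 1412.4   # Y ~ x1 + x2 + x3 + x4 + x5, 22 residual df
RSS_reduced <- 1816.5   # Y ~ x1 + x3 + x4, 24 residual df
F_partial <- ((RSS_reduced - RSS_full) / 2) / (RSS_full / 22)
F_partial                                             # ~ 3.15
pf(F_partial, df1 = 2, df2 = 22, lower.tail = FALSE)  # ~ 0.063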
# 2d. Conduct Forward Selection, Backward Elimination and Stepwise Regression.
# Write the fitted regression models from each of these variable selection
# methods and write your observations (15 Points).
#Forward Selection Method
minModel=lm(Y ~ 1, data = Solar)
maxModel=formula(lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar))
forward=step(minModel, direction='forward', scope=maxModel)
## Start: AIC=176.94
## Y ~ 1
##
## Df Sum of Sq RSS AIC
## + x4 1 10583.7 3890.5 142.16
## + x1 1 6221.0 8253.2 163.21
## + x5 1 1925.5 12548.7 174.94
## <none> 14474.2 176.94
## + x3 1 156.5 14317.7 178.64
## + x2 1 74.8 14399.4 178.80
##
## Step: AIC=142.15
## Y ~ x4
##
## Df Sum of Sq RSS AIC
## + x3 1 1937.79 1952.7 124.85
## + x5 1 1358.72 2531.8 132.12
## + x1 1 283.88 3606.6 142.03
## <none> 3890.5 142.16
## + x2 1 16.21 3874.3 144.04
##
## Step: AIC=124.85
## Y ~ x4 + x3
##
## Df Sum of Sq RSS AIC
## + x2 1 145.982 1806.7 124.68
## + x1 1 136.217 1816.5 124.83
## <none> 1952.7 124.85
## + x5 1 60.396 1892.3 125.97
##
## Step: AIC=124.68
## Y ~ x4 + x3 + x2
##
## Df Sum of Sq RSS AIC
## + x1 1 262.801 1543.9 122.28
## <none> 1806.7 124.68
## + x5 1 13.318 1793.4 126.47
##
## Step: AIC=122.28
## Y ~ x4 + x3 + x2 + x1
##
## Df Sum of Sq RSS AIC
## + x5 1 131.54 1412.4 121.78
## <none> 1543.9 122.28
##
## Step: AIC=121.78
## Y ~ x4 + x3 + x2 + x1 + x5
summary(forward)
##
## Call:
## lm(formula = Y ~ x4 + x3 + x2 + x1 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x3 3.50194 1.48229 2.363 0.02741 *
## x2 2.12235 1.30382 1.628 0.11781
## x1 0.07089 0.02910 2.436 0.02340 *
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Backward Elimination Method
model=lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
backward=step(model, direction='backward')
## Start: AIC=121.78
## Y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 1412.4 121.78
## - x5 1 131.5 1543.9 122.28
## - x2 1 170.1 1582.5 122.97
## - x3 1 358.3 1770.7 126.11
## - x1 1 381.0 1793.4 126.47
## - x4 1 4639.0 6051.4 160.52
summary(backward)
##
## Call:
## lm(formula = Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x1 0.07089 0.02910 2.436 0.02340 *
## x2 2.12235 1.30382 1.628 0.11781
## x3 3.50194 1.48229 2.363 0.02741 *
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Stepwise Regression Method
model=lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
stepwise=step(model, direction='both')
## Start: AIC=121.78
## Y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 1412.4 121.78
## - x5 1 131.5 1543.9 122.28
## - x2 1 170.1 1582.5 122.97
## - x3 1 358.3 1770.7 126.11
## - x1 1 381.0 1793.4 126.47
## - x4 1 4639.0 6051.4 160.52
summary(stepwise)
##
## Call:
## lm(formula = Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x1 0.07089 0.02910 2.436 0.02340 *
## x2 2.12235 1.30382 1.628 0.11781
## x3 3.50194 1.48229 2.363 0.02741 *
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Forward Selection
# Fitted Model:
#Y=345.42+(0.0709 × x1)+(2.122 × x2)+(3.502 × x3)+(−22.917 × x4)+(2.590 × x5)
# Model Summary:
#Multiple R-Squared: 0.9024
#Adjusted R-squared: 0.8802
#Residual Standard Error: 8.012
#F-statistic: 40.69 on 5 and 22 DF (p-value: 2.139e-10)
#Selection Process:
#Added x4 first (largest improvement in AIC).
#Added x3, then x2, x1, and finally x5 based on AIC reduction
# Backward Elimination
#Fitted Model:
# Y=345.42+(0.0709 × x1)+(2.122 × x2)+(3.502 × x3)+(−22.917 × x4)+(2.591 × x5)
# Model Summary:
#Multiple R-squared: 0.9024
#Adjusted R-squared: 0.8802
#Residual Standard Error: 8.012
#F-statistic: 40.69 on 5 and 22 DF (p-value: 2.139e-10)
#Selection Process:
#No variables were removed: dropping any single predictor would increase the AIC,
#so the full model was retained.
#Stepwise Regression (Both Directions)
#Fitted Model:
# Y=345.42+(0.0709 × x1)+(2.122 × x2)+(3.502 × x3)+(−22.917 × x4)+(2.590 × x5)
#Model Summary:
#Multiple R-squared: 0.9024
#Adjusted R2: 0.8802
#Residual Standard Error: 8.012
#F-statistic: 40.69 on 5 and 22 DF (p-value: 2.139e-10)
#Selection Process:
#Since the full model was optimal, no variables were removed during backward or
#added during forward selection.
#Observations and Conclusion
#All three methods arrive at the same final model containing all five predictors,
#because dropping any one of them would increase the AIC.
#Backward elimination retained all variables from the start, meaning no predictor
#could be removed without worsening the AIC.
#Forward selection added variables incrementally (x4, x3, x2, x1, then x5) but ultimately included all predictors.
#Stepwise regression confirmed that the full model was preferred by AIC, reinforcing the results from forward and backward selection.
#The model has a high R-squared (0.9024), indicating strong explanatory power.
#Final Decision: Based on AIC, no variable should be excluded. Note, however, that x2
#and x5 are not individually significant at the 5% level, and the partial F-test in
#2(c) did not find them jointly significant, so AIC-based and significance-based
#criteria disagree slightly here.
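# Supplementary comparison (not part of the original output): the AIC of the full model
# versus the reduced model from 2(c) without x2 and x5, to illustrate why AIC-based
# selection keeps all five predictors.
AIC(full_model, exc_model)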