# Loading the necessary packages
library(MASS)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:MASS':
##
## cement
##
## The following object is masked from 'package:datasets':
##
## rivers
# Loading in the Data
Solar <- data.frame(
Y = c(271.80, 264.00, 238.80, 230.70, 251.60, 257.90, 263.90, 266.50, 229.10, 239.30, 258.00, 257.60, 267.30, 267.00, 259.60, 240.40, 227.20, 196.00, 278.70, 272.30, 267.40, 254.50, 224.70, 181.50, 227.50, 253.60, 263.00, 265.80),
x1 = c(783.35, 748.45, 684.45, 827.00, 860.45, 875.15, 909.45, 905.55, 756.00, 769.35, 793.50, 801.65, 819.65, 808.55, 774.95, 711.85, 694.85, 638.10, 774.55, 757.90, 753.35, 704.70, 666.00, 568.55, 653.10, 704.05, 709.60, 726.90),
x2 = c(33.53, 36.50, 34.66, 33.13, 35.75, 34.46, 34.60, 35.38, 35.85, 35.68, 35.35, 35.04, 34.07, 32.20, 34.32, 31.08, 35.73, 34.11, 34.79, 35.77, 36.44, 37.82, 35.07, 35.26, 35.56, 35.73, 36.46, 36.26),
x3 = c(40.55, 36.19, 37.31, 32.52, 33.71, 34.14, 34.85, 35.89, 33.53, 33.79, 34.72, 35.22, 36.50, 37.60, 37.89, 37.71, 37.00, 36.76, 34.62, 35.40, 35.96, 36.26, 36.34, 35.90, 31.84, 33.16, 33.83, 34.89),
x4 = c(16.66, 16.46, 17.66, 17.50, 16.40, 16.28, 16.06, 15.93, 16.60, 16.41, 16.17, 15.92, 16.04, 16.19, 16.62, 17.37, 18.12, 18.53, 15.54, 15.70, 16.45, 17.62, 18.12, 19.05, 16.51, 16.02, 15.89, 15.83),
x5 = c(13.20, 14.11, 15.68, 10.53, 11.00, 11.31, 11.96, 12.58, 10.66, 10.85, 11.41, 11.91, 12.85, 13.58, 14.21, 15.56, 15.83, 16.41, 13.10, 13.63, 14.51, 15.38, 16.10, 16.73, 10.58, 11.28, 11.91, 12.65)
)
# Store descriptive variable labels for reference; the column names used for
# modeling remain Y, x1, ..., x5
Solar_labels <- c(
  Y  = "Total Heat Flux (kilowatts)",
  x1 = "Insolation (watt/sq meter)",
  x2 = "Position-East Direction (inches)",
  x3 = "Position-South Direction (inches)",
  x4 = "Position-North Direction (inches)",
  x5 = "Time of the Day"
)
# Check for missing values
colSums(is.na(Solar))
## Y x1 x2 x3 x4 x5
## 0 0 0 0 0 0
# See the composition and data structure
str(Solar)
## 'data.frame': 28 obs. of 6 variables:
## $ Y : num 272 264 239 231 252 ...
## $ x1: num 783 748 684 827 860 ...
## $ x2: num 33.5 36.5 34.7 33.1 35.8 ...
## $ x3: num 40.5 36.2 37.3 32.5 33.7 ...
## $ x4: num 16.7 16.5 17.7 17.5 16.4 ...
## $ x5: num 13.2 14.1 15.7 10.5 11 ...
# The Solar data set is clean, there are no missing values
# Scatter plot to visualize the relationship
plot(Solar$x1, Solar$Y, pch = 19,
     xlab = "Insolation (watt/sq meter)", ylab = "Total heat flux (kilowatts)")
# There is a positive linear relationship between insolation and total heat flux,
# shown by the upward trend: as insolation increases, total heat flux tends to
# increase as well.
##Problem 1a
# a. Fit a simple linear regression model relating total heat flux to
# Insolation (watt/sq meter). Write the fitted regression model along with
# the interpretation of the slope coefficient.
# F-test: testing (simultaneously) all slope coefficients:
# H0: all slopes βj are zero vs. Ha: at least one βj is nonzero
# (H0: "the model is not useful" vs. Ha: "the model is useful").
# t-test: testing an individual coefficient: H0: βj = 0 vs. Ha: βj ≠ 0.
#Fit the Linear Regression Model
model <- lm(Y ~ x1, data = Solar)
summary(model)
##
## Call:
## lm(formula = Y ~ x1, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.194 -12.600 0.915 15.442 26.157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.4726 32.4006 3.286 0.002908 **
## x1 0.1886 0.0426 4.427 0.000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.82 on 26 degrees of freedom
## Multiple R-squared: 0.4298, Adjusted R-squared: 0.4079
## F-statistic: 19.6 on 1 and 26 DF, p-value: 0.0001525
# The fitted regression model is: Ŷ = 106.4726 + 0.1886·x1
# where:
# Y  = Total Heat Flux (kilowatts)
# x1 = Insolation (watt/sq meter)
# 106.47 is the intercept: the estimated Total Heat Flux when Insolation is zero.
# 0.1886 is the slope coefficient: the estimated change in Total Heat Flux for each additional unit of Insolation.
# Interpretation of the Slope Coefficient
# The estimated slope coefficient is 0.1886, meaning that for every
# additional 1 watt/sq meter increase in Insolation, the Total Heat Flux
# increases by 0.1886 kilowatts, on average.
# p-value for x1 = 0.000153: the slope coefficient is highly statistically
# significant (p < 0.001), indicating a statistically significant linear association
# between Insolation and Total Heat Flux.
# R-squared = 0.4298: the model explains about 42.98% of the variability in Total Heat Flux.
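# Supplementary sketch (not part of the original output): a 95% confidence interval
# for the slope can be obtained directly with confint(); it should agree with
# estimate ± t(0.975, 26) × standard error from the summary above.
confint(model, parm = "x1", level = 0.95)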
##Problem 1b
# b. Test the hypothesis of linear dependence between total heat flux and
# Insolation (watt/sq meter) at 5% level of significance (25 points- Each
# element of the test structure will take 5 points).
summary(model)
##
## Call:
## lm(formula = Y ~ x1, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.194 -12.600 0.915 15.442 26.157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.4726 32.4006 3.286 0.002908 **
## x1 0.1886 0.0426 4.427 0.000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.82 on 26 degrees of freedom
## Multiple R-squared: 0.4298, Adjusted R-squared: 0.4079
## F-statistic: 19.6 on 1 and 26 DF, p-value: 0.0001525
# Analysis of variance
summary(aov(model))
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 6221 6221 19.6 0.000153 ***
## Residuals 26 8253 317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Hypothesis Testing Structure
# F-test
# Define the Hypotheses
# Null Hypothesis (H0):β1=0 (No linear relationship between total heat flux and insolation)
# Alternative Hypothesis (Ha):β1 ≠ 0 (There is a significant linear relationship)
# Statistic: Fobs ~ F1,26 under H0
# The chosen significance level is α=0.05.
# To test the slope:
# t-test for the individual coefficient: H0: β1 = 0 vs. Ha: β1 ≠ 0
# Interpretation of results
#Decision:Since P(F1,26>19.598)=0.0001525<0.05, we reject the null hypothesis.
#Conclusion:At a significance level of α=0.05, there is a significant
#relationship between total heat flux (Y) and Insolation (x₁).
#Therefore, we conclude that Insolation (x₁) significantly impacts total heat flux (Y).
# Equivalently, the two-sided t-test gives 2·P(t26 > 4.427) = 0.000153 < 0.05, leading to the same conclusion.
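# Supplementary hand-check of the test statistics reported above (values taken from
# the summary()/aov() output; note that F = t^2 here, 4.427^2 ≈ 19.6, so the two
# tests are equivalent in simple linear regression):
pf(19.598, df1 = 1, df2 = 26, lower.tail = FALSE)   # overall F-test p-value, ~ 0.00015
2 * pt(4.427, df = 26, lower.tail = FALSE)          # two-sided t-test p-value for the slope, ~ 0.00015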
# To test the intercept:
# H0: β0 = 0 vs. Ha: β0 ≠ 0
# α = 0.05
# Statistic: tobs ~ t26 under H0
# Decision: Since the two-sided p-value 2·P(t26 > 3.286) = 0.002908 < 0.05, we reject
# the null hypothesis at α = 0.05; the intercept is significantly different from zero.
# Interpretation of R-squared and Coefficients
# R-squared = 0.4298: approximately 42.98% of the sample variation in total heat flux (Y) is explained by Insolation (x₁).
# Adjusted R-squared = 0.4079: after adjusting for the number of predictors in the model, approximately 40.79% of the variation in total heat flux (Y) is explained by Insolation (x₁).
# The hypothesis test will examine whether there's a statistically
#significant linear relationship between the variables:
# Hypotheses:
#Null Hypothesis (H₀): β₁ = 0 (no linear relationship between total heat flux and insolation)
#Alternative Hypothesis (H₁): β₁ ≠ 0 (there is a linear relationship between total heat flux and insolation)
#Test Statistic:t = β̂₁ / SE(β̂₁)
#where β̂₁ is the estimated slope coefficient and SE(β̂₁) is its standard error
# The regression model has already been fitted above as:
# model <- lm(Y ~ x1, data = Solar)
# Extract key statistics from the model summary
model_summary <- summary(model)
# Extract the slope coefficient, standard error, t-statistic and p-value
slope_estimate <- model_summary$coefficients[2, 1] # 0.1886
se_slope <- model_summary$coefficients[2, 2] # 0.0426
t_statistic <- model_summary$coefficients[2, 3] # 4.427
p_value <- model_summary$coefficients[2, 4] # 0.000153
# Extract degrees of freedom
df <- model_summary$df[2] # 26
# Calculate critical t-value for two-tailed test at 5% significance
t_critical <- qt(0.975, df) # 0.975 for two-tailed at 5% level
# Display results
cat("Hypothesis Test for Linear Dependence\n")
## Hypothesis Test for Linear Dependence
cat("--------------------------------------\n")
## --------------------------------------
cat("Slope coefficient (β₁):", slope_estimate, "\n")
## Slope coefficient (β₁): 0.1885873
cat("Standard error:", se_slope, "\n")
## Standard error: 0.04259976
cat("t-statistic:", t_statistic, "\n")
## t-statistic: 4.426956
cat("Degrees of freedom:", df, "\n")
## Degrees of freedom: 26
cat("Critical t-value (two-tailed, 5%):", t_critical, "\n")
## Critical t-value (two-tailed, 5%): 2.055529
cat("p-value:", p_value, "\n\n")
## p-value: 0.0001525174
# Decision
if (abs(t_statistic) > t_critical) {
cat("Decision: Reject the null hypothesis (|t| > t_critical)\n")
} else {
cat("Decision: Fail to reject the null hypothesis (|t| ≤ t_critical)\n")
}
## Decision: Reject the null hypothesis (|t| > t_critical)
if (p_value < 0.05) {
cat("Decision: Reject the null hypothesis (p < 0.05)\n")
} else {
cat("Decision: Fail to reject the null hypothesis (p ≥ 0.05)\n")
}
## Decision: Reject the null hypothesis (p < 0.05)
cat("\nConclusion: There is", ifelse(p_value < 0.05, "sufficient", "insufficient"),
"evidence at the 5% significance level to conclude that there is a linear relationship\n",
"between total heat flux and insolation.\n")
##
## Conclusion: There is sufficient evidence at the 5% significance level to conclude that there is a linear relationship
## between total heat flux and insolation.
##Problem 1c
# c. Test the hypothesis that the intercept is greater than 100 at the 5% level of significance.
# State the Hypotheses:
# Null Hypothesis (H0): β0 ≤ 100
# Alternative Hypothesis (H1): β0 > 100
# Test statistic: t = (β̂₀ - 100) / SE(β̂₀), with n - 2 = 26 degrees of freedom under H0
#Calculate the Test Statistic and p-value:
# Get the summary of the model
model_summary <- summary(model)
# Extract the intercept and its standard error
intercept_estimate <- model_summary$coefficients[1, 1] # Intercept coefficient
se_intercept <- model_summary$coefficients[1, 2] # Standard error of the intercept
# Calculate the t-statistic
t_statistic <- (intercept_estimate - 100) / se_intercept
# Extract degrees of freedom
df <- model_summary$df[2]
# Find the critical value for a one-tailed t-test at 5% significance
t_critical <- qt(0.95, df)
# Calculate the p-value for the one-tailed test
p_value <- 1 - pt(t_statistic, df)
# Display results
t_statistic
## [1] 0.1997681
p_value
## [1] 0.4216081
t_critical
## [1] 1.705618
# Decision: Compare the t-statistic to the critical value
if (t_statistic > t_critical) {
cat("Decision: Reject the null hypothesis. The intercept is greater than 100.\n")
} else {
cat("Decision: Fail to reject the null hypothesis. The intercept is not greater than 100.\n")
}
## Decision: Fail to reject the null hypothesis. The intercept is not greater than 100.
# Decision rule: Reject H0, if t_statistic > t_critical
#Test Statistic, Test critical and p-value Results:
#Test Statistic = 0.1998
# T_critical = 1.7056
#p-value: 0.4216
# Since t_statistic=0.1998 is less than t_critical=1.7056, fail to reject H0
#Conclusion:Since the p-value (0.4216) > 0.05, we fail to reject the null hypothesis.
#Interpretation: There is insufficient evidence to conclude that the
#intercept is significantly greater than 100 at the 5% significance level.
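# Supplementary cross-check using the car package loaded above (a sketch, not part of
# the original analysis): linearHypothesis() reports a two-sided F-test of
# H0: intercept = 100, so its p-value is roughly twice the one-sided p-value computed
# above when the estimate lies on the alternative's side.
linearHypothesis(model, "(Intercept) = 100")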
##Problem 1d
# d. Interpret the estimated coefficient of determination from 1(a) (2 points).
# Interpretation of the Coefficient of Determination (R-squared):
#R-squared : 0.4298
#This value represents the proportion of the variance in the dependent
#variable (total heat flux) that is explained by the independent variable
#(insolation). In this case:
#Interpretation: Approximately 42.98% of the variation in total heat flux
#can be explained by insolation (watt/sq meter). The remaining 57.02% of
#the variation is unexplained by the model and could be due to other
#factors not included in the model.
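# Supplementary check (not in the original output): in simple linear regression,
# R-squared equals the squared sample correlation between Y and x1.
cor(Solar$Y, Solar$x1)^2   # ~ 0.4298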
##Problem 1e
# e. Obtain the prediction, 95% Confidence and 95% Prediction intervals at
# x1 = 1000, using the fitted model in 1(a), and interpret the interval
# results (8 points).
# To predict the responses on new values
# Create a new data frame with x1 = 1000
x0 <- data.frame(x1 = 1000)
# Get the 95% Confidence Interval prediction
predict(model, x0, interval = "confidence", level = 0.95, type = "response")
## fit lwr upr
## 1 295.0599 272.6397 317.4801
# Get the 95% Prediction Interval prediction
predict(model, x0, interval = "prediction", level = 0.95, type = "response")
## fit lwr upr
## 1 295.0599 252.1195 338.0002
# 95% Confidence Interval (for the mean response at x1=1000)
# Prediction: 295.06 kW
#Lower Bound: 272.64 kW
#Upper Bound: 317.48 kW
#Interpretation:
#We are 95% confident that the mean total heat flux at an insolation of 1000 W/m²
#lies between 272.64 and 317.48 kW; under repeated sampling, intervals constructed
#this way would cover the true mean response about 95% of the time.
#95% Prediction Interval (for a single new observation at x1=1000)
#Prediction: 295.06 kW
#Lower Bound: 252.12 kW
#Upper Bound: 338.00 kW
#Interpretation:
#For a single new observation with an insolation of 1000 W/m², the total heat flux is
#expected to fall within (252.12, 338.00) kW with 95% confidence. The prediction
#interval is wider than the confidence interval because it also accounts for the
#variability of an individual observation around the mean response.
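# A cautionary note (supplementary): x1 = 1000 lies slightly above the largest observed
# insolation value, so both intervals involve a mild extrapolation beyond the data.
range(Solar$x1)   # observed insolation runs from about 568.55 to 909.45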
##Problem 1f
# f. Fit a third order polynomial regression relating total heat flux to
# Insolation (watt/sq meter) and write the fitted model (5 points).
# Fit a third-order polynomial regression model
model_poly <- lm(Y ~ poly(x1, 3), data = Solar)
# Display the model summary
summary(model_poly)
##
## Call:
## lm(formula = Y ~ poly(x1, 3), data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.950 -7.313 3.051 8.920 19.815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 249.132 2.699 92.314 < 2e-16 ***
## poly(x1, 3)1 78.873 14.280 5.523 1.11e-05 ***
## poly(x1, 3)2 -57.071 14.280 -3.996 0.000532 ***
## poly(x1, 3)3 10.088 14.280 0.706 0.486737
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.28 on 24 degrees of freedom
## Multiple R-squared: 0.6619, Adjusted R-squared: 0.6196
## F-statistic: 15.66 on 3 and 24 DF, p-value: 7.467e-06
# Fitted Third-Order Polynomial Regression Model:
# Note that poly(x1, 3) generates orthogonal polynomial terms, so the reported
# coefficients apply to those orthogonal terms rather than to the raw powers of x1.
# The fitted model is:
# Ŷ = 249.132 + 78.873·P1(x1) − 57.071·P2(x1) + 10.088·P3(x1)
# Where:
# Y is the total heat flux (response variable),
# x1 is the insolation (predictor variable),
# P1, P2, P3 are the first-, second- and third-degree orthogonal polynomial terms in x1,
# 249.132 is the intercept,
# 78.873 is the coefficient of the linear (first-degree) term,
# -57.071 is the coefficient of the quadratic (second-degree) term,
# 10.088 is the coefficient of the cubic (third-degree) term.
# Interpretation of Coefficients:
# Intercept (β0 = 249.132): because the orthogonal polynomial terms are centered, the
# intercept equals the sample mean of total heat flux (about 249.13 kW), not the
# predicted value at zero insolation.
# Linear term (β1 = 78.873, p = 1.11e-05): a strong, statistically significant positive
# linear component in the relationship between insolation and total heat flux.
# Quadratic term (β2 = -57.071, p = 0.000532): a statistically significant negative
# quadratic component, indicating curvature: the increase in heat flux levels off at
# higher insolation values.
# Cubic term (β3 = 10.088, p = 0.487): the cubic component is not statistically
# significant, so there is little evidence that a third-order term is needed beyond
# the quadratic.
# Model Significance:
# The overall model is significant, with a p-value of 7.467e-06, indicating that at
# least one of the coefficients is significantly different from zero.
# The R² value of 0.6619 suggests that about 66.2% of the variability in total heat
# flux is explained by the third-order polynomial model.
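# Supplementary sketch (output not shown): refitting with raw = TRUE gives coefficients
# on the raw powers x1, x1^2, x1^3 instead of on the orthogonal polynomial terms; the
# fitted values and R-squared are identical either way. model_poly_raw is a name
# introduced here for illustration only.
model_poly_raw <- lm(Y ~ poly(x1, 3, raw = TRUE), data = Solar)
coef(model_poly_raw)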
##Problem 2
#2. Again, use the solar energy data to work on the following problems below.
# a. Fit a multiple linear regression model using all independent
# variables. Write the fitted regression model along with the interpretation
# of the partial slope coefficients (5 points).
# Fit the multiple linear regression model using all independent variables
model2 <- lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
# Display the model summary
summary(model2)
##
## Call:
## lm(formula = Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x1 0.07089 0.02910 2.436 0.02340 *
## x2 2.12235 1.30382 1.628 0.11781
## x3 3.50194 1.48229 2.363 0.02741 *
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Fitted Regression Model:
# Ŷ = 345.423 + 0.0709x₁ + 2.122x₂ + 3.502x₃ - 22.917x₄ + 2.591x₅
#Where:
#Y = Total Heat Flux (kilowatts)
#x1= Insolation (watt/sq meter)
#x2= Position-East Direction (inches)
#x3= Position-South Direction (inches)
#x4= Position-North Direction (inches)
#x5= Time of the Day
#Interpretation of the partial slope coefficients (brief summary; a more detailed,
#unit-based interpretation follows below):
#β₁ = 0.071: For each unit increase in x₁, total heat flux (Y) is expected to
#increase by 0.071 units, holding all other variables constant.
#This coefficient is statistically significant (p-value = 0.023).
#β₂ = 2.122: For each unit increase in x₂, Y is expected to increase by 2.122
#units, holding all other variables constant. This coefficient is not
#statistically significant (p-value = 0.118).
#β₃ = 3.502: For each unit increase in x₃, Y is expected to increase by 3.502
#units, holding all other variables constant. This coefficient is statistically
#significant (p-value = 0.027).
#β₄ = -22.917: For each unit increase in x₄, Y is expected to decrease by
#22.917 units, holding all other variables constant. This coefficient is highly
#significant (p-value = 2.12e-08).
#β₅ = 2.591: For each unit increase in x₅, Y is expected to increase by 2.591
#units, holding all other variables constant. This coefficient is not
#statistically significant (p-value = 0.166).
#The model has a high R-squared value of 0.9024, indicating that approximately
#90.24% of the variability in total heat flux is explained by these five
#predictor variables. The overall model is statistically significant, with an
#F-statistic of 40.69 and a p-value of 2.139e-10.
#Interpretation of the Partial Slope Coefficients:
#Intercept (345.42):
#When all predictors (x1,x2,x3,x4,x5) are zero, the estimated total heat flux
#is 345.42 kW. However, this may not have a practical interpretation if zero values of the predictors are unrealistic.
#Insolation (x1, coefficient β1 = 0.07089, p = 0.0234):
#For each 1 unit increase in Insolation (watt/sq meter), the Total Heat Flux
#(kilowatts) increases by 0.07089, holding other variables constant.
#This relationship is statistically significant (p-value = 0.02340 < 0.05).
#Position-East Direction (x2 coefficient = 2.12235):
#For each 1 inch increase in the Eastward position, the Total Heat Flux
#(kilowatts) increases by 2.12235, keeping other variables constant.
#However, this relationship is not statistically significant (p-value = 0.11781).
#Position-South Direction (x3,coefficient = 3.50194):
#For each 1 inch increase in the Southward position, the Total Heat Flux
#(kilowatts) increases by 3.50194, holding other factors constant.
#This relationship is statistically significant (p-value = 0.02741).
#Position-North Direction (x4,coefficient = -22.91744):
#For each 1 inch increase in the Northward position, the Total Heat Flux
#(kilowatts) decreases by 22.91744, keeping other variables constant. This
#relationship is highly statistically significant (p-value < 0.001).
#Time of the Day (x5,coefficient = 2.59057):
#For each 1 unit increase in Time of the Day (the time variable), the Total
#Heat Flux (kilowatts) increases by 2.59057, holding other variables
#constant. This relationship is not statistically significant (p-value = 0.16636).
#R-squared = 0.9024 (90.24%): the five predictors jointly explain 90.24% of the
#variability in total heat flux.
#Adjusted R-squared = 0.8802 (88.02%): after adjusting for the number of
#predictors, the model still explains about 88.02% of the variability in total
#heat flux.
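# Supplementary sketch (not part of the original output): since the car package is
# loaded above, variance inflation factors can be checked to gauge multicollinearity
# among the five predictors before interpreting the partial slopes.
vif(model2)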
#2b. Conduct a full F-test for your fitted model at 5% level of significance (25 points- Each element of the test structure will take 5 points).
# To test the full model in 2a, we have the following test structure:
# H0: β1 = β2 = β3 = β4 = β5 = 0
# Ha: At least one of the coefficients is different from zero.
# α = 0.05
# Statistic: Fobs = MSR/MSE ~ F(5, 22) under H0
# 5 is the numerator degrees of freedom: the number of regressors (k = 5).
# 22 is the denominator (error) degrees of freedom: n - k - 1 = 28 - 5 - 1.
# Full F-test
summary(aov(model2))
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 1 6221 6221 96.901 1.61e-09 ***
## x2 1 511 511 7.952 0.009974 **
## x3 1 999 999 15.555 0.000691 ***
## x4 1 5200 5200 81.000 7.92e-09 ***
## x5 1 132 132 2.049 0.166364
## Residuals 22 1412 64
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# SSR = 6221 + 511 + 999 + 5200 + 132 = 13063 and SST = SSR + SSE = 13063 + 1412 = 14475
# MSR = 13063/5 = 2612.6
# SSE = 1412 and MSE = 1412/22 = 64.18
# Fobs = MSR/MSE = 2612.6/64.18 ≈ 40.7, matching the F-statistic of 40.69 reported by summary(model2)
# Decision: Since the p-value := P(F(5,22) > 40.69) = 2.139e-10 is less
# than α = 0.05, we reject the null hypothesis,
# indicating that at least one of the regression coefficients is significantly different from zero.
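# Supplementary hand computation of the overall F statistic (a sketch; sums of squares
# taken from the aov() output above, results not shown here):
SSR <- 6221 + 511 + 999 + 5200 + 132   # regression sum of squares = 13063
SSE <- 1412                             # residual sum of squares
F_obs <- (SSR / 5) / (SSE / 22)         # ~ 40.7, matching the summary(model2) F-statistic
pf(F_obs, df1 = 5, df2 = 22, lower.tail = FALSE)   # p-value ~ 2e-10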
# 2c. Conduct a partial F-test to test the hypothesis that H0: β2=β5=0 at 5%
#level of significance (25 points- Each element of the test structure will
#take 5 points).
#Partial F-Test
#First Run full model
full_model=lm(Y ~ x1 + x2 + x3 + x4 + x5, data=Solar)
# Fit the reduced model with x2 and x5 excluded
exc_model=lm(Y ~ x1 + x3 + x4, data=Solar)
#Run ANOVA of the two
anova(exc_model, full_model)
# The test structure:
# H0: β2 = β5 = 0 (x2 and x5 add nothing beyond x1, x3 and x4)
# Alternative Hypothesis: Ha: at least one of β2, β5 is different from zero
# α = 0.05
# Statistic: Fobs = [(SSE_reduced - SSE_full)/2] / MSE_full ~ F(2, 22) under H0
# Interpretation:
# Decision: If p-value := P(F(2,22) > Fobs) < α, we reject H0.
# The F-statistic is 3.1473, and the corresponding p-value is 0.06279.
# Since the p-value is greater than 0.05 (our significance level),
# we fail to reject the null hypothesis at the 5% significance level.
# Conclusion: There is no strong evidence that the predictors x2 and
# x5 significantly improve the model. Therefore, we conclude that these
# variables do not have a significant contribution at the 5% significance level.
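# Supplementary hand computation of the partial F statistic (a sketch; the RSS values
# come from the model outputs shown elsewhere in this document):
RSS_full    <- 1412.4   # Y ~ x1 + x2 + x3 + x4 + x5, 22 residual df
RSS_reduced <- 1816.5   # Y ~ x1 + x3 + x4, 24 residual df
F_partial <- ((RSS_reduced - RSS_full) / 2) / (RSS_full / 22)
F_partial                                             # ~ 3.15
pf(F_partial, df1 = 2, df2 = 22, lower.tail = FALSE)  # ~ 0.063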
# 2d. Conduct Forward Selection, Backward Elimination and Stepwise Regression.
# Write the fitted regression models from each of these variable selection
# methods and write your observations (15 Points).
#Forward Selection Method
minModel=lm(Y ~ 1, data = Solar)
maxModel=formula(lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar))
forward=step(minModel, direction='forward', scope=maxModel)
## Start: AIC=176.94
## Y ~ 1
##
## Df Sum of Sq RSS AIC
## + x4 1 10583.7 3890.5 142.16
## + x1 1 6221.0 8253.2 163.21
## + x5 1 1925.5 12548.7 174.94
## <none> 14474.2 176.94
## + x3 1 156.5 14317.7 178.64
## + x2 1 74.8 14399.4 178.80
##
## Step: AIC=142.15
## Y ~ x4
##
## Df Sum of Sq RSS AIC
## + x3 1 1937.79 1952.7 124.85
## + x5 1 1358.72 2531.8 132.12
## + x1 1 283.88 3606.6 142.03
## <none> 3890.5 142.16
## + x2 1 16.21 3874.3 144.04
##
## Step: AIC=124.85
## Y ~ x4 + x3
##
## Df Sum of Sq RSS AIC
## + x2 1 145.982 1806.7 124.68
## + x1 1 136.217 1816.5 124.83
## <none> 1952.7 124.85
## + x5 1 60.396 1892.3 125.97
##
## Step: AIC=124.68
## Y ~ x4 + x3 + x2
##
## Df Sum of Sq RSS AIC
## + x1 1 262.801 1543.9 122.28
## <none> 1806.7 124.68
## + x5 1 13.318 1793.4 126.47
##
## Step: AIC=122.28
## Y ~ x4 + x3 + x2 + x1
##
## Df Sum of Sq RSS AIC
## + x5 1 131.54 1412.4 121.78
## <none> 1543.9 122.28
##
## Step: AIC=121.78
## Y ~ x4 + x3 + x2 + x1 + x5
summary(forward)
##
## Call:
## lm(formula = Y ~ x4 + x3 + x2 + x1 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x3 3.50194 1.48229 2.363 0.02741 *
## x2 2.12235 1.30382 1.628 0.11781
## x1 0.07089 0.02910 2.436 0.02340 *
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Backward Elimination Method
model=lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
backward=step(model, direction='backward')
## Start: AIC=121.78
## Y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 1412.4 121.78
## - x5 1 131.5 1543.9 122.28
## - x2 1 170.1 1582.5 122.97
## - x3 1 358.3 1770.7 126.11
## - x1 1 381.0 1793.4 126.47
## - x4 1 4639.0 6051.4 160.52
summary(backward)
##
## Call:
## lm(formula = Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x1 0.07089 0.02910 2.436 0.02340 *
## x2 2.12235 1.30382 1.628 0.11781
## x3 3.50194 1.48229 2.363 0.02741 *
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Stepwise Regression Method
model=lm(Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
stepwise=step(model, direction='both')
## Start: AIC=121.78
## Y ~ x1 + x2 + x3 + x4 + x5
##
## Df Sum of Sq RSS AIC
## <none> 1412.4 121.78
## - x5 1 131.5 1543.9 122.28
## - x2 1 170.1 1582.5 122.97
## - x3 1 358.3 1770.7 126.11
## - x1 1 381.0 1793.4 126.47
## - x4 1 4639.0 6051.4 160.52
summary(stepwise)
##
## Call:
## lm(formula = Y ~ x1 + x2 + x3 + x4 + x5, data = Solar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6351 -2.7826 0.4129 4.3698 16.2289
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.42264 97.50297 3.543 0.00183 **
## x1 0.07089 0.02910 2.436 0.02340 *
## x2 2.12235 1.30382 1.628 0.11781
## x3 3.50194 1.48229 2.363 0.02741 *
## x4 -22.91744 2.69599 -8.501 2.12e-08 ***
## x5 2.59057 1.80979 1.431 0.16636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.012 on 22 degrees of freedom
## Multiple R-squared: 0.9024, Adjusted R-squared: 0.8802
## F-statistic: 40.69 on 5 and 22 DF, p-value: 2.139e-10
# Forward Selection
# Fitted Model:
#Y=345.42+(0.0709 × x1)+(2.122 × x2)+(3.502 × x3)+(−22.917 × x4)+(2.590 × x5)
# Model Summary:
#Multiple R-Squared: 0.9024
#Adjusted R-squared: 0.8802
#Residual Standard Error: 8.012
#F-statistic: 40.69 on 5 and 22 DF (p-value: 2.139e-10)
#Selection Process:
#Added x4 first (largest improvement in AIC).
#Added x3, then x2, x1, and finally x5 based on AIC reduction
# Backward Elimination
#Fitted Model:
# Y=345.42+(0.0709 × x1)+(2.122 × x2)+(3.502 × x3)+(−22.917 × x4)+(2.591 × x5)
# Model Summary:
#Multiple R-squared: 0.9024
#Adjusted R-squared: 0.8802
#Residual Standard Error: 8.012
#F-statistic: 40.69 on 5 and 22 DF (p-value: 2.139e-10)
#Selection Process:
#No variables were removed: dropping any single predictor would increase the AIC,
#so the full model was retained.
#Stepwise Regression (Both Directions)
#Fitted Model:
# Y=345.42+(0.0709 × x1)+(2.122 × x2)+(3.502 × x3)+(−22.917 × x4)+(2.590 × x5)
#Model Summary:
#Multiple R-squared: 0.9024
#Adjusted R2: 0.8802
#Residual Standard Error: 8.012
#F-statistic: 40.69 on 5 and 22 DF (p-value: 2.139e-10)
#Selection Process:
#Since the full model was optimal, no variables were removed during backward or
#added during forward selection.
#Observations and Conclusion
#All three methods arrive at the same final model containing all five predictors,
#because dropping any one of them would increase the AIC.
#Backward elimination retained all variables from the start, meaning no predictor
#could be removed without worsening the AIC.
#Forward selection added variables incrementally (x4, x3, x2, x1, then x5) but ultimately included all predictors.
#Stepwise regression confirmed that the full model was preferred by AIC, reinforcing the results from forward and backward selection.
#The model has a high R-squared (0.9024), indicating strong explanatory power.
#Final Decision: Based on AIC, no variable should be excluded. Note, however, that x2
#and x5 are not individually significant at the 5% level, and the partial F-test in
#2(c) did not find them jointly significant, so AIC-based and significance-based
#criteria disagree slightly here.
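# Supplementary comparison (not part of the original output): the AIC of the full model
# versus the reduced model from 2(c) without x2 and x5, to illustrate why AIC-based
# selection keeps all five predictors.
AIC(full_model, exc_model)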