library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
library(leaps)
# Problem 1
data <- read.table("C:/Users/Sabuj Ganguly/OneDrive/Documents/PhD 1st sem/Regression/prob1data.txt", header = TRUE)
head(data)
## Y X1 X2 X3
## 1 4264 305657 7.17 0
## 2 4496 328476 6.20 0
## 3 4317 317164 4.61 0
## 4 4292 366745 7.02 0
## 5 4945 265518 8.61 1
## 6 4325 301995 6.88 0
#a)
# model fitting
model <- lm(Y ~ X1 + X2 + X3, data = data)
summary(model)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -264.05 -110.73 -22.52 79.29 295.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.150e+03 1.956e+02 21.220 < 2e-16 ***
## X1 7.871e-04 3.646e-04 2.159 0.0359 *
## X2 -1.317e+01 2.309e+01 -0.570 0.5712
## X3 6.236e+02 6.264e+01 9.954 2.94e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143.3 on 48 degrees of freedom
## Multiple R-squared: 0.6883, Adjusted R-squared: 0.6689
## F-statistic: 35.34 on 3 and 48 DF, p-value: 3.316e-12
# Estimated regression function
cat("Estimated regression function:\n")
## Estimated regression function:
cat("Y =", coef(model)[1], "+", coef(model)[2], "X1 +", coef(model)[3], "X2 +", coef(model)[4], "X3\n")
## Y = 4149.887 + 0.0007870804 X1 + -13.16602 X2 + 623.5545 X3
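# A tidier way to print the fitted equation (a small added sketch, not part of
# the original output): fold each coefficient's sign into the operator so the
# "+ -13.16602" artifact above does not appear.
coefs <- coef(model)
eq <- paste0("Y = ", signif(coefs[1], 6),
             paste0(ifelse(coefs[-1] < 0, " - ", " + "),
                    signif(abs(coefs[-1]), 6), " ", names(coefs)[-1],
                    collapse = ""))
cat(eq, "\n")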
#b)
# Residuals (note: this name masks base::residuals(), but resid(model) remains available)
residuals <- resid(model)
#Box plot of residuals
boxplot(residuals, main = "Box Plot of Residuals", ylab = "Residuals")

#c)
par(mfrow = c(2, 3))
#Residuals against Y, X1, X2, X3, and X1*X2
plot(fitted(model), residuals, main = "Residuals vs Y-hat", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")
plot(data$X1, residuals, main = "Residuals vs X1", xlab = "X1", ylab = "Residuals")
abline(h = 0, col = "red")
plot(data$X2, residuals, main = "Residuals vs X2", xlab = "X2", ylab = "Residuals")
abline(h = 0, col = "red")
plot(data$X3, residuals, main = "Residuals vs X3", xlab = "X3", ylab = "Residuals")
abline(h = 0, col = "red")
# X1*X2 interaction
plot(data$X1 * data$X2, residuals, main = "Residuals vs X1*X2", xlab = "X1*X2", ylab = "Residuals")
abline(h = 0, col = "red")
#Normal probability plot
qqnorm(residuals)
qqline(residuals, col = "red")

par(mfrow = c(1, 1))
#d)
#Time plot
n <- nrow(data)
plot(1:n, residuals, type = "l", main = "Time Plot of Residuals",
     xlab = "Time Order", ylab = "Residuals")
abline(h = 0, col = "red")
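# The time plot can be backed by a formal test for serial correlation. A brief
# added sketch using car::durbinWatsonTest (car is already attached); output omitted:
durbinWatsonTest(model)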

#e)
# Brown-Forsythe test for constancy of error variance
fitted_values <- fitted(model)
#Splitting into two groups based on fitted values
sorted_indices <- order(fitted_values)
group1 <- residuals[sorted_indices[1:26]]
group2 <- residuals[sorted_indices[27:52]]
d1 <- abs(group1 - median(group1))
d2 <- abs(group2 - median(group2))
# Welch two-sample t-test on the absolute deviations from the group medians
bf_test <- t.test(d1, d2, var.equal = FALSE)
print(bf_test)
##
## Welch Two Sample t-test
##
## data: d1 and d2
## t = 2.9859, df = 42.125, p-value = 0.004695
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 21.93407 113.37908
## sample estimates:
## mean of x mean of y
## 145.03103 77.37446
# Critical value for alpha = 0.01, using the conservative df = min(n1, n2) - 1 = 25
critical_value <- qt(0.995, df = 25)
cat("Critical t-value (alpha=0.01):", critical_value, "\n")
## Critical t-value (alpha=0.01): 2.787436
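# Cross-check (an added sketch): the Brown-Forsythe procedure is Levene's test
# with median centering, available as car::leveneTest. grp reproduces the same
# split by fitted values as above; output omitted.
grp <- factor(seq_len(n) %in% sorted_indices[1:26], labels = c("upper", "lower"))
leveneTest(residuals, group = grp, center = median)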
#f)
#Test for overall regression relation
anova_model <- anova(model)
print(anova_model)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 1 136366 136366 6.6417 0.01309 *
## X2 1 5726 5726 0.2789 0.59987
## X3 1 2034514 2034514 99.0905 2.941e-13 ***
## Residuals 48 985530 20532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Note: the rows of anova(model) are sequential tests; the first row is the
# F-statistic for X1 entered alone, not for the overall regression relation.
f_statistic <- anova_model$"F value"[1]
p_value <- anova_model$"Pr(>F)"[1]
cat("Sequential F for X1:", f_statistic, "\n")
## Sequential F for X1: 6.641687
cat("p-value:", p_value, "\n")
## p-value: 0.01309038
# Critical F-value
critical_f <- qf(0.95, df1 = 3, df2 = n-4)
cat("Critical F-value (alpha=0.05):", critical_f, "\n")
## Critical F-value (alpha=0.05): 2.798061
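# The overall relation should be judged by the overall F-statistic reported in
# summary(model) above (35.34 on 3 and 48 df, p-value 3.316e-12), which far
# exceeds the critical value. An added sketch of extracting it directly (output omitted):
fstat <- summary(model)$fstatistic
overall_p <- pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)
cat("Overall F:", fstat[1], "on", fstat[2], "and", fstat[3], "df; p-value:", overall_p, "\n")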
#g)
# Bonferroni joint confidence intervals for beta1, beta2, beta3 (g = 3 statements)
conf_int <- confint(model, level = 1 - 0.05/3)
cat("95% Family Confidence Intervals (Bonferroni):\n")
## 95% Family Confidence Intervals (Bonferroni):
print(conf_int)
## 0.833 % 99.167 %
## (Intercept) 3.664732e+03 4.635043e+03
## X1 -1.172992e-04 1.691460e-03
## X2 -7.045161e+01 4.411957e+01
## X3 4.681559e+02 7.789531e+02
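# Equivalent manual construction (an added sketch): each interval is
# b_k +/- t(1 - 0.05/(2g); 48) * s(b_k) with g = 3, and should reproduce the
# confint() output above; output omitted.
g <- 3
t_mult <- qt(1 - 0.05 / (2 * g), df = df.residual(model))
est <- coef(summary(model))
cbind(lower = est[, "Estimate"] - t_mult * est[, "Std. Error"],
      upper = est[, "Estimate"] + t_mult * est[, "Std. Error"])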
#h)
# Sequential ANOVA
cat("Sequential ANOVA:\n")
## Sequential ANOVA:
anova_model_seq <- anova(model)
print(anova_model_seq)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 1 136366 136366 6.6417 0.01309 *
## X2 1 5726 5726 0.2789 0.59987
## X3 1 2034514 2034514 99.0905 2.941e-13 ***
## Residuals 48 985530 20532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extra sums of squares from the fitting order X1, X3, X2
model_X1 <- lm(Y ~ X1, data = data)
model_X1_X3 <- lm(Y ~ X1 + X3, data = data)
model_full <- lm(Y ~ X1 + X3 + X2, data = data)
cat("\nExtra sums of squares:\n")
##
## Extra sums of squares:
cat("SSR(X1):", anova(model_X1)$"Sum Sq"[1], "\n")
## SSR(X1): 136366.2
cat("SSR(X3|X1):", anova(model_X1_X3)$"Sum Sq"[2], "\n")
## SSR(X3|X1): 2033565
cat("SSR(X2|X1,X3):", anova(model_full)$"Sum Sq"[3], "\n")
## SSR(X2|X1,X3): 6674.588
#i)
# Test whether X2 can be dropped, given that X1 and X3 are retained
model_reduced <- lm(Y ~ X1 + X3, data = data)
anova_test <- anova(model_reduced, model)
print(anova_test)
## Analysis of Variance Table
##
## Model 1: Y ~ X1 + X3
## Model 2: Y ~ X1 + X2 + X3
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 49 992204
## 2 48 985530 1 6674.6 0.3251 0.5712
# F-test for X2 given X1 and X3
f_test_X2 <- anova_test$F[2]
p_value_X2 <- anova_test$"Pr(>F)"[2]
cat("F-statistic for testing X2:", f_test_X2, "\n")
## F-statistic for testing X2: 0.3250843
cat("p-value:", p_value_X2, "\n")
## p-value: 0.5712274
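# Consistency check (an added sketch): the partial F for a single term equals
# the square of its t-statistic in summary(model); here (-0.570)^2 is about
# 0.325, and the p-values agree (0.5712).
t_X2 <- coef(summary(model))["X2", "t value"]
cat("t^2 for X2:", t_X2^2, "(matches the partial F above)\n")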
#j)
#various R-squared values
r2_full <- summary(model)$r.squared
r2_X1 <- summary(lm(Y ~ X1, data = data))$r.squared
r2_X2 <- summary(lm(Y ~ X2, data = data))$r.squared
r2_X1_X2 <- summary(lm(Y ~ X1 + X2, data = data))$r.squared
# Sums of squares needed for the coefficients of partial determination
# (used in the sketch after the printout below)
ss_full <- anova(model)$"Sum Sq"
ss_X1_only <- anova(lm(Y ~ X1, data = data))$"Sum Sq"
ss_X2_given_X1 <- anova(lm(Y ~ X1 + X2, data = data))$"Sum Sq"[2]
cat("R-squared values:\n")
## R-squared values:
cat("R2_Y1:", r2_X1, "\n")
## R2_Y1: 0.04312473
cat("R2_Y2:", r2_X2, "\n")
## R2_Y2: 0.003603553
cat("R2_Y12:", r2_X1_X2, "\n")
## R2_Y12: 0.0449355
cat("R2 (full model):", r2_full, "\n")
## R2 (full model): 0.6883342
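# The coefficient of partial determination R2_Y2|1 = SSR(X2|X1) / SSE(X1) can
# now be assembled from the pieces computed above (an added sketch; output omitted):
sse_X1 <- anova(lm(Y ~ X1, data = data))$"Sum Sq"[2]  # SSE of the X1-only model
cat("R2_Y2|1:", ss_X2_given_X1 / sse_X1, "\n")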
#k)
#Number of observations
n <- nrow(data)
# Correlation transformation:
Y_star <- (1/sqrt(n-1)) * (data$Y - mean(data$Y)) / sd(data$Y)
X1_star <- (1/sqrt(n-1)) * (data$X1 - mean(data$X1)) / sd(data$X1)
X2_star <- (1/sqrt(n-1)) * (data$X2 - mean(data$X2)) / sd(data$X2)
X3_star <- (1/sqrt(n-1)) * (data$X3 - mean(data$X3)) / sd(data$X3)
data_star <- data.frame(Y_star = Y_star, X1_star = X1_star, X2_star = X2_star, X3_star = X3_star)
#standardized regression model (no intercept)
corr_model <- lm(Y_star ~ 0 + X1_star + X2_star + X3_star, data = data_star)
cat("Standardized Regression Model (Correlation Transformation):\n")
## Standardized Regression Model (Correlation Transformation):
summary(corr_model)
##
## Call:
## lm(formula = Y_star ~ 0 + X1_star + X2_star + X3_star, data = data_star)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.14849 -0.06227 -0.01266 0.04459 0.16632
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X1_star 0.17472 0.08009 2.181 0.034 *
## X2_star -0.04639 0.08053 -0.576 0.567
## X3_star 0.80786 0.08032 10.058 1.66e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07975 on 49 degrees of freedom
## Multiple R-squared: 0.6883, Adjusted R-squared: 0.6693
## F-statistic: 36.07 on 3 and 49 DF, p-value: 1.869e-12
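# Equivalent shortcut (an added sketch): because Y* and the X*'s are all scaled
# by the same 1/sqrt(n-1) factor, plain z-scores via scale() give identical
# standardized coefficients; all.equal() should return TRUE.
corr_model2 <- lm(scale(Y) ~ 0 + scale(X1) + scale(X2) + scale(X3), data = data)
all.equal(unname(coef(corr_model2)), unname(coef(corr_model)))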
#l)
#coefficients of determination between all pairs of predictor variables
cor_matrix <- cor(data[, c("X1", "X2", "X3")])
cat("Correlation matrix of predictor variables:\n")
## Correlation matrix of predictor variables:
print(cor_matrix)
## X1 X2 X3
## X1 1.00000000 0.08489639 0.04565698
## X2 0.08489639 1.00000000 0.11337076
## X3 0.04565698 0.11337076 1.00000000
# R-squared values between predictor pairs
r_sq_matrix <- cor_matrix^2
cat("\nR-squared values between predictor pairs:\n")
##
## R-squared values between predictor pairs:
print(r_sq_matrix)
## X1 X2 X3
## X1 1.000000000 0.007207396 0.00208456
## X2 0.007207396 1.000000000 0.01285293
## X3 0.002084560 0.012852929 1.00000000
# Check whether the standardized coefficients can be read as individual
# predictor effects; this requires low multicollinearity, so inspect the VIFs
vif_values <- vif(model)
cat("\nVariance Inflation Factors:\n")
##
## Variance Inflation Factors:
print(vif_values)
## X1 X2 X3
## 1.008596 1.019598 1.014364
if (max(vif_values) > 10) {
  cat(" Serious multicollinearity prevents meaningful interpretation\n")
} else if (max(vif_values) > 5) {
  cat(" Moderate multicollinearity affects interpretation\n")
} else {
  cat(" Low multicollinearity allows meaningful interpretation\n")
}
## Low multicollinearity allows meaningful interpretation
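# Sanity check (an added sketch): VIF_k = 1 / (1 - R2_k), where R2_k comes from
# regressing X_k on the remaining predictors; for X1 this should reproduce
# vif(model)["X1"], about 1.0086.
r2_X1_on_rest <- summary(lm(X1 ~ X2 + X3, data = data))$r.squared
cat("Hand-computed VIF for X1:", 1 / (1 - r2_X1_on_rest), "\n")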
#m)
# standardized coefficients from correlation model
std_coef <- coef(corr_model)
# Transform back to the original scale: b_k = b_k* (s_Y / s_k)
sd_Y <- sd(data$Y)
sd_X1 <- sd(data$X1)
sd_X2 <- sd(data$X2)
sd_X3 <- sd(data$X3)
b1_orig <- std_coef["X1_star"] * (sd_Y / sd_X1)
b2_orig <- std_coef["X2_star"] * (sd_Y / sd_X2)
b3_orig <- std_coef["X3_star"] * (sd_Y / sd_X3)
# Intercept: b0 = mean(Y) - b1*mean(X1) - b2*mean(X2) - b3*mean(X3)
b0_orig <- mean(data$Y) - b1_orig * mean(data$X1) - b2_orig * mean(data$X2) - b3_orig * mean(data$X3)
cat("Coefficients transformed back to original scale:\n")
## Coefficients transformed back to original scale:
cat("b0 =", b0_orig, "\n")
## b0 = 4149.887
cat("b1 =", b1_orig, "\n")
## b1 = 0.0007870804
cat("b2 =", b2_orig, "\n")
## b2 = -13.16602
cat("b3 =", b3_orig, "\n")
## b3 = 623.5545
original_coef <- coef(model)
cat("\nComparison with original model coefficients:\n")
##
## Comparison with original model coefficients:
comparison <- data.frame(
  Original = original_coef,
  Transformed = c(b0_orig, b1_orig, b2_orig, b3_orig)
)
print(comparison)
## Original Transformed
## (Intercept) 4.149887e+03 4.149887e+03
## X1 7.870804e-04 7.870804e-04
## X2 -1.316602e+01 -1.316602e+01
## X3 6.235545e+02 6.235545e+02
#n)
# SSR(X1)
model_X1 <- lm(Y ~ X1, data = data)
SSR_X1 <- sum((fitted(model_X1) - mean(data$Y))^2)
#SSR(X1|X2)
model_X2 <- lm(Y ~ X2, data = data)
model_X1_X2 <- lm(Y ~ X1 + X2, data = data)
anova_comparison <- anova(model_X2, model_X1_X2)
SSR_X1_given_X2 <- anova_comparison$"Sum of Sq"[2]
cat("SSR Comparison:\n")
## SSR Comparison:
cat("SSR(X1) =", SSR_X1, "\n")
## SSR(X1) = 136366.2
cat("SSR(X1|X2) =", SSR_X1_given_X2, "\n")
## SSR(X1|X2) = 130697.2
cat("Difference =", SSR_X1 - SSR_X1_given_X2, "\n")
## Difference = 5669.001
# Gauge whether the difference is substantial relative to SSR(X1)
rel_diff <- abs(SSR_X1 - SSR_X1_given_X2) / SSR_X1 * 100
cat("Relative difference:", round(rel_diff, 2), "%\n")
## Relative difference: 4.16 %
if (rel_diff < 1) {
  cat("The difference is not substantial.\n")
} else if (rel_diff < 10) {
  cat("The difference is moderate.\n")
} else {
  cat("The difference is substantial.\n")
}
## The difference is moderate.
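# The gap arises because X1 and X2 are (mildly) correlated, so fitting X2 first
# absorbs part of X1's sum of squares; with uncorrelated predictors SSR(X1) and
# SSR(X1|X2) would coincide. The relevant correlation is in the part (l) matrix above:
cat("cor(X1, X2):", cor(data$X1, data$X2), "\n")  # 0.0849 per the matrix in part (l)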
# Problem 2
data2 <- read.table("C:/Users/Sabuj Ganguly/OneDrive/Documents/PhD 1st sem/Regression/prob2data.txt", header = TRUE)
head(data2)
## Y X1 X2 X3 X4
## 1 88 86 110 100 87
## 2 80 62 97 99 100
## 3 96 110 107 103 103
## 4 76 101 117 93 95
## 5 80 100 101 95 88
## 6 73 78 85 95 84
#a)
library(car)
#Scatter plot matrix
scatterplotMatrix(data2, main = "Scatter Plot Matrix")

#Correlation matrix of X variables
cor_matrix <- cor(data2[, c("X1", "X2", "X3", "X4")])
print("Correlation Matrix of X variables:")
## [1] "Correlation Matrix of X variables:"
print(cor_matrix)
## X1 X2 X3 X4
## X1 1.0000000 0.1022689 0.1807692 0.3266632
## X2 0.1022689 1.0000000 0.5190448 0.3967101
## X3 0.1807692 0.5190448 1.0000000 0.7820385
## X4 0.3266632 0.3967101 0.7820385 1.0000000
#Full correlation matrix including Y
full_cor_matrix <- cor(data2)
print("Full Correlation Matrix:")
## [1] "Full Correlation Matrix:"
print(full_cor_matrix)
## Y X1 X2 X3 X4
## Y 1.0000000 0.5144107 0.4970057 0.8970645 0.8693865
## X1 0.5144107 1.0000000 0.1022689 0.1807692 0.3266632
## X2 0.4970057 0.1022689 1.0000000 0.5190448 0.3967101
## X3 0.8970645 0.1807692 0.5190448 1.0000000 0.7820385
## X4 0.8693865 0.3266632 0.3967101 0.7820385 1.0000000
#b)
#full model
full_model <- lm(Y ~ X1 + X2 + X3 + X4, data = data2)
summary(full_model)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9779 -3.4506 0.0941 2.4749 5.9959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.38182 9.94106 -12.512 6.48e-11 ***
## X1 0.29573 0.04397 6.725 1.52e-06 ***
## X2 0.04829 0.05662 0.853 0.40383
## X3 1.30601 0.16409 7.959 1.26e-07 ***
## X4 0.51982 0.13194 3.940 0.00081 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.099 on 20 degrees of freedom
## Multiple R-squared: 0.9629, Adjusted R-squared: 0.9555
## F-statistic: 129.7 on 4 and 20 DF, p-value: 5.262e-14
cat("\nTesting significance of each predictor:\n")
##
## Testing significance of each predictor:
summary_table <- summary(full_model)$coefficients
print(summary_table)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.38182058 9.94106316 -12.5119234 6.475642e-11
## X1 0.29572537 0.04397141 6.7254011 1.523710e-06
## X2 0.04828772 0.05661745 0.8528769 4.038262e-01
## X3 1.30601100 0.16409129 7.9590514 1.261726e-07
## X4 0.51981909 0.13194285 3.9397290 8.099714e-04
# Variance inflation factors to check for multicollinearity
vif_values <- vif(full_model)
cat("\nVariance Inflation Factors (VIF):\n")
##
## Variance Inflation Factors (VIF):
print(vif_values)
## X1 X2 X3 X4
## 1.138043 1.369512 3.016549 2.834776
#c)
library(leaps)
#Best subset selection
best_subsets <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2, nbest = 4)
summary_best <- summary(best_subsets)
cat("Best subset models by adjusted R-squared:\n")
## Best subset models by adjusted R-squared:
results_table <- data.frame(
  Predictors = apply(summary_best$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  R_squared = summary_best$rsq,
  Adj_R_squared = summary_best$adjr2,
  Cp = summary_best$cp,
  BIC = summary_best$bic
)
print(results_table[order(-results_table$Adj_R_squared), ])
## Predictors R_squared Adj_R_squared Cp BIC
## 9 (Intercept), X1, X3, X4 0.9615422 0.9560482 3.727399 -68.5793341
## 13 (Intercept), X1, X2, X3, X4 0.9628918 0.9554702 5.000000 -66.2535626
## 5 (Intercept), X1, X3 0.9329956 0.9269043 17.112978 -57.9183134
## 10 (Intercept), X1, X2, X3 0.9340931 0.9246779 18.521465 -55.1123162
## 6 (Intercept), X3, X4 0.8772573 0.8660988 47.153985 -42.7849927
## 11 (Intercept), X2, X3, X4 0.8789698 0.8616797 48.231020 -39.9173671
## 12 (Intercept), X1, X2, X4 0.8453581 0.8232664 66.346500 -33.7905799
## 7 (Intercept), X1, X4 0.8152656 0.7984716 80.565307 -32.5642766
## 1 (Intercept), X3 0.8047247 0.7962344 84.246496 -34.3958674
## 8 (Intercept), X2, X3 0.8060733 0.7884436 85.519650 -31.3502438
## 2 (Intercept), X4 0.7558329 0.7452170 110.597414 -28.8098126
## 3 (Intercept), X1 0.2646184 0.2326452 375.344689 -1.2463904
## 4 (Intercept), X2 0.2470147 0.2142762 384.832454 -0.6549869
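# A useful companion to the table (an added sketch): plot Cp against the number
# of parameters p; models near the line Cp = p show little bias. The X1, X3, X4
# model (Cp = 3.73 with p = 4) sits essentially on the line.
p <- rowSums(summary_best$which)  # parameter count, intercept included
plot(p, summary_best$cp, xlab = "p", ylab = "Cp", main = "Cp vs p")
abline(0, 1, col = "red")  # reference line Cp = p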
#d)
#model diagnostics for top models
cat("Additional criteria for model selection:\n")
## Additional criteria for model selection:
#top 4 models
top_models <- order(summary_best$adjr2, decreasing = TRUE)[1:4]
for (i in top_models) {
  cat("\n--- Model", i, "---\n")
  cat("Predictors:", paste(names(which(summary_best$which[i, ]))[-1], collapse = ", "), "\n")
  cat("Adj R sqr:", round(summary_best$adjr2[i], 4), "\n")
  cat("Cp:", round(summary_best$cp[i], 2), "\n")
  cat("BIC:", round(summary_best$bic[i], 2), "\n")
  # Refit the model for residual diagnostics
  model_vars <- names(which(summary_best$which[i, ]))[-1]
  if (length(model_vars) > 0) {
    fml <- as.formula(paste("Y ~", paste(model_vars, collapse = " + ")))
    current_model <- lm(fml, data = data2)
    # Residual standard error
    cat("RSE:", round(summary(current_model)$sigma, 3), "\n")
    # Normality of residuals
    shapiro_test <- shapiro.test(resid(current_model))
    cat("Shapiro-Wilk p-value:", round(shapiro_test$p.value, 4), "\n")
  }
}
##
## --- Model 9 ---
## Predictors: X1, X3, X4
## Adj R sqr: 0.956
## Cp: 3.73
## BIC: -68.58
## RSE: 4.072
## Shapiro-Wilk p-value: 0.1603
##
## --- Model 13 ---
## Predictors: X1, X2, X3, X4
## Adj R sqr: 0.9555
## Cp: 5
## BIC: -66.25
## RSE: 4.099
## Shapiro-Wilk p-value: 0.1741
##
## --- Model 5 ---
## Predictors: X1, X3
## Adj R sqr: 0.9269
## Cp: 17.11
## BIC: -57.92
## RSE: 5.251
## Shapiro-Wilk p-value: 0.6738
##
## --- Model 10 ---
## Predictors: X1, X2, X3
## Adj R sqr: 0.9247
## Cp: 18.52
## BIC: -55.11
## RSE: 5.331
## Shapiro-Wilk p-value: 0.4855
#e)
# Backward elimination
backward_model <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2,
                             method = "backward", nvmax = 4)
backward_summary <- summary(backward_model)
cat("Backward Elimination Results:\n")
## Backward Elimination Results:
backward_table <- data.frame(
  Predictors = apply(backward_summary$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  Adj_R_squared = backward_summary$adjr2
)
print(backward_table)
## Predictors Adj_R_squared
## 1 (Intercept), X3 0.7962344
## 2 (Intercept), X1, X3 0.9269043
## 3 (Intercept), X1, X3, X4 0.9560482
## 4 (Intercept), X1, X2, X3, X4 0.9554702
#f)
# Forward selection
forward_model <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2,
                            method = "forward", nvmax = 4)
forward_summary <- summary(forward_model)
cat("Forward Selection Results:\n")
## Forward Selection Results:
forward_table <- data.frame(
  Predictors = apply(forward_summary$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  Adj_R_squared = forward_summary$adjr2
)
print(forward_table)
## Predictors Adj_R_squared
## 1 (Intercept), X3 0.7962344
## 2 (Intercept), X1, X3 0.9269043
## 3 (Intercept), X1, X3, X4 0.9560482
## 4 (Intercept), X1, X2, X3, X4 0.9554702
#g)
# Stepwise regression
stepwise_model <- step(full_model, direction = "both", trace = 0)
cat("Stepwise Regression Final Model:\n")
## Stepwise Regression Final Model:
summary(stepwise_model)
##
## Call:
## lm(formula = Y ~ X1 + X3 + X4, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4579 -3.1563 -0.2057 1.8070 6.6083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.20002 9.87406 -12.578 3.04e-11 ***
## X1 0.29633 0.04368 6.784 1.04e-06 ***
## X3 1.35697 0.15183 8.937 1.33e-08 ***
## X4 0.51742 0.13105 3.948 0.000735 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.072 on 21 degrees of freedom
## Multiple R-squared: 0.9615, Adjusted R-squared: 0.956
## F-statistic: 175 on 3 and 21 DF, p-value: 5.16e-15
# leaps' "seqrep" method is sequential replacement, which is related to but not
# identical with the classical stepwise algorithm used by step() above
stepwise_model2 <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2,
                              method = "seqrep", nvmax = 4)
stepwise_summary <- summary(stepwise_model2)
cat("\nStepwise Regression Results:\n")
##
## Stepwise Regression Results:
stepwise_table <- data.frame(
  Predictors = apply(stepwise_summary$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  Adj_R_squared = stepwise_summary$adjr2
)
print(stepwise_table)
## Predictors Adj_R_squared
## 1 (Intercept), X3 0.7962344
## 2 (Intercept), X1, X2 0.4154853
## 3 (Intercept), X1, X3, X4 0.9560482
## 4 (Intercept), X1, X2, X3, X4 0.9554702
# Compare all selection methods
cat("Comparison of All Selection Methods:\n\n")
## Comparison of All Selection Methods:
# Best model from each method
best_backward <- which.max(backward_summary$adjr2)
best_forward <- which.max(forward_summary$adjr2)
best_stepwise <- which.max(stepwise_summary$adjr2)
best_subset <- which.max(summary_best$adjr2)
comparison <- data.frame(
  Method = c("Best Subset", "Backward", "Forward", "Stepwise"),
  Predictors = c(
    paste(names(which(summary_best$which[best_subset, ]))[-1], collapse = ", "),
    paste(names(which(backward_summary$which[best_backward, ]))[-1], collapse = ", "),
    paste(names(which(forward_summary$which[best_forward, ]))[-1], collapse = ", "),
    paste(names(which(stepwise_summary$which[best_stepwise, ]))[-1], collapse = ", ")
  ),
  Adj_R2 = c(
    summary_best$adjr2[best_subset],
    backward_summary$adjr2[best_backward],
    forward_summary$adjr2[best_forward],
    stepwise_summary$adjr2[best_stepwise]
  )
)
print(comparison)
## Method Predictors Adj_R2
## 1 Best Subset X1, X3, X4 0.9560482
## 2 Backward X1, X3, X4 0.9560482
## 3 Forward X1, X3, X4 0.9560482
## 4 Stepwise X1, X3, X4 0.9560482
# Final recommendation
cat("\nFinal Recommendation:\n")
##
## Final Recommendation:
best_overall <- comparison[which.max(comparison$Adj_R2), ]
print(best_overall)
## Method Predictors Adj_R2
## 1 Best Subset X1, X3, X4 0.9560482
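# All four methods agree on X1, X3, X4. As a final check (an added sketch), the
# PRESS statistic gives a leave-one-out view of the recommended model's
# predictive performance; a PRESS close to SSE suggests the fit is not driven
# by a few influential cases. Output omitted.
final_model <- lm(Y ~ X1 + X3 + X4, data = data2)
press <- sum((resid(final_model) / (1 - hatvalues(final_model)))^2)
cat("PRESS:", press, " SSE:", sum(resid(final_model)^2), "\n")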