library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
library(leaps)
# Problem 1
data <- read.table("C:/Users/Sabuj Ganguly/OneDrive/Documents/PhD 1st sem/Regression/prob1data.txt", header = TRUE)
head(data)
## Y X1 X2 X3
## 1 4264 305657 7.17 0
## 2 4496 328476 6.20 0
## 3 4317 317164 4.61 0
## 4 4292 366745 7.02 0
## 5 4945 265518 8.61 1
## 6 4325 301995 6.88 0
#a)
# model fitting
model <- lm(Y ~ X1 + X2 + X3, data = data)
summary(model)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -264.05 -110.73 -22.52 79.29 295.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.150e+03 1.956e+02 21.220 < 2e-16 ***
## X1 7.871e-04 3.646e-04 2.159 0.0359 *
## X2 -1.317e+01 2.309e+01 -0.570 0.5712
## X3 6.236e+02 6.264e+01 9.954 2.94e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 143.3 on 48 degrees of freedom
## Multiple R-squared: 0.6883, Adjusted R-squared: 0.6689
## F-statistic: 35.34 on 3 and 48 DF, p-value: 3.316e-12
# Estimated regression function
cat("Estimated regression function:\n")
## Estimated regression function:
cat("Y =", coef(model)[1], "+", coef(model)[2], "X1 +", coef(model)[3], "X2 +", coef(model)[4], "X3\n")
## Y = 4149.887 + 0.0007870804 X1 + -13.16602 X2 + 623.5545 X3
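# A tidier way to print the fitted equation (a small added sketch, not part of
# the original output): fold each coefficient's sign into the operator so the
# "+ -13.16602" artifact above does not appear.
coefs <- coef(model)
eq <- paste0("Y = ", signif(coefs[1], 6),
             paste0(ifelse(coefs[-1] < 0, " - ", " + "),
                    signif(abs(coefs[-1]), 6), " ", names(coefs)[-1],
                    collapse = ""))
cat(eq, "\n")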
#b)
# Residuals (note: this name masks base::residuals(), but resid(model) remains available)
residuals <- resid(model)
#Box plot of residuals
boxplot(residuals, main = "Box Plot of Residuals", ylab = "Residuals")

#c)
par(mfrow = c(2, 3))
#Residuals against Y, X1, X2, X3, and X1*X2
plot(fitted(model), residuals, main = "Residuals vs Y-hat", xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")
plot(data$X1, residuals, main = "Residuals vs X1", xlab = "X1", ylab = "Residuals")
abline(h = 0, col = "red")
plot(data$X2, residuals, main = "Residuals vs X2", xlab = "X2", ylab = "Residuals")
abline(h = 0, col = "red")
plot(data$X3, residuals, main = "Residuals vs X3", xlab = "X3", ylab = "Residuals")
abline(h = 0, col = "red")
# X1*X2 interaction
plot(data$X1 * data$X2, residuals, main = "Residuals vs X1*X2", xlab = "X1*X2", ylab = "Residuals")
abline(h = 0, col = "red")
#Normal probability plot
qqnorm(residuals)
qqline(residuals, col = "red")

par(mfrow = c(1, 1))
#d)
#Time plot
n <- nrow(data)
plot(1:n, residuals, type = "l", main = "Time Plot of Residuals",
     xlab = "Time Order", ylab = "Residuals")
abline(h = 0, col = "red")
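# The time plot can be backed by a formal test for serial correlation. A brief
# added sketch using car::durbinWatsonTest (car is already attached); output omitted:
durbinWatsonTest(model)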

#e)
# Brown-Forsythe test for constancy of error variance
fitted_values <- fitted(model)
#Splitting into two groups based on fitted values
sorted_indices <- order(fitted_values)
group1 <- residuals[sorted_indices[1:26]]
group2 <- residuals[sorted_indices[27:52]]
d1 <- abs(group1 - median(group1))
d2 <- abs(group2 - median(group2))
# Welch two-sample t-test on the absolute deviations from the group medians
bf_test <- t.test(d1, d2, var.equal = FALSE)
print(bf_test)
##
## Welch Two Sample t-test
##
## data: d1 and d2
## t = 2.9859, df = 42.125, p-value = 0.004695
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 21.93407 113.37908
## sample estimates:
## mean of x mean of y
## 145.03103 77.37446
# Critical value for alpha = 0.01, using the conservative df = min(n1, n2) - 1 = 25
critical_value <- qt(0.995, df = 25)
cat("Critical t-value (alpha=0.01):", critical_value, "\n")
## Critical t-value (alpha=0.01): 2.787436
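# Cross-check (an added sketch): the Brown-Forsythe procedure is Levene's test
# with median centering, available as car::leveneTest. grp reproduces the same
# split by fitted values as above; output omitted.
grp <- factor(seq_len(n) %in% sorted_indices[1:26], labels = c("upper", "lower"))
leveneTest(residuals, group = grp, center = median)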
#f)
#Test for overall regression relation
anova_model <- anova(model)
print(anova_model)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 1 136366 136366 6.6417 0.01309 *
## X2 1 5726 5726 0.2789 0.59987
## X3 1 2034514 2034514 99.0905 2.941e-13 ***
## Residuals 48 985530 20532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Note: the rows of anova(model) are sequential tests; the first row is the
# F-statistic for X1 entered alone, not for the overall regression relation.
f_statistic <- anova_model$"F value"[1]
p_value <- anova_model$"Pr(>F)"[1]
cat("Sequential F for X1:", f_statistic, "\n")
## Sequential F for X1: 6.641687
cat("p-value:", p_value, "\n")
## p-value: 0.01309038
# Critical F-value
critical_f <- qf(0.95, df1 = 3, df2 = n-4)
cat("Critical F-value (alpha=0.05):", critical_f, "\n")
## Critical F-value (alpha=0.05): 2.798061
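# The overall relation should be judged by the overall F-statistic reported in
# summary(model) above (35.34 on 3 and 48 df, p-value 3.316e-12), which far
# exceeds the critical value. An added sketch of extracting it directly (output omitted):
fstat <- summary(model)$fstatistic
overall_p <- pf(fstat[1], fstat[2], fstat[3], lower.tail = FALSE)
cat("Overall F:", fstat[1], "on", fstat[2], "and", fstat[3], "df; p-value:", overall_p, "\n")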
#g)
# Bonferroni joint confidence intervals for beta1, beta2, beta3 (g = 3 statements)
conf_int <- confint(model, level = 1 - 0.05/3)
cat("95% Family Confidence Intervals (Bonferroni):\n")
## 95% Family Confidence Intervals (Bonferroni):
print(conf_int)
## 0.833 % 99.167 %
## (Intercept) 3.664732e+03 4.635043e+03
## X1 -1.172992e-04 1.691460e-03
## X2 -7.045161e+01 4.411957e+01
## X3 4.681559e+02 7.789531e+02
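# Equivalent manual construction (an added sketch): each interval is
# b_k +/- t(1 - 0.05/(2g); 48) * s(b_k) with g = 3, and should reproduce the
# confint() output above; output omitted.
g <- 3
t_mult <- qt(1 - 0.05 / (2 * g), df = df.residual(model))
est <- coef(summary(model))
cbind(lower = est[, "Estimate"] - t_mult * est[, "Std. Error"],
      upper = est[, "Estimate"] + t_mult * est[, "Std. Error"])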
#h)
# Sequential ANOVA
cat("Sequential ANOVA:\n")
## Sequential ANOVA:
anova_model_seq <- anova(model)
print(anova_model_seq)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 1 136366 136366 6.6417 0.01309 *
## X2 1 5726 5726 0.2789 0.59987
## X3 1 2034514 2034514 99.0905 2.941e-13 ***
## Residuals 48 985530 20532
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extra sums of squares from the fitting order X1, X3, X2
model_X1 <- lm(Y ~ X1, data = data)
model_X1_X3 <- lm(Y ~ X1 + X3, data = data)
model_full <- lm(Y ~ X1 + X3 + X2, data = data)
cat("\nExtra sums of squares:\n")
##
## Extra sums of squares:
cat("SSR(X1):", anova(model_X1)$"Sum Sq"[1], "\n")
## SSR(X1): 136366.2
cat("SSR(X3|X1):", anova(model_X1_X3)$"Sum Sq"[2], "\n")
## SSR(X3|X1): 2033565
cat("SSR(X2|X1,X3):", anova(model_full)$"Sum Sq"[3], "\n")
## SSR(X2|X1,X3): 6674.588
#i)
# Test whether X2 can be dropped, given that X1 and X3 are retained
model_reduced <- lm(Y ~ X1 + X3, data = data)
anova_test <- anova(model_reduced, model)
print(anova_test)
## Analysis of Variance Table
##
## Model 1: Y ~ X1 + X3
## Model 2: Y ~ X1 + X2 + X3
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 49 992204
## 2 48 985530 1 6674.6 0.3251 0.5712
# F-test for X2 given X1 and X3
f_test_X2 <- anova_test$F[2]
p_value_X2 <- anova_test$"Pr(>F)"[2]
cat("F-statistic for testing X2:", f_test_X2, "\n")
## F-statistic for testing X2: 0.3250843
cat("p-value:", p_value_X2, "\n")
## p-value: 0.5712274
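# Consistency check (an added sketch): the partial F for a single term equals
# the square of its t-statistic in summary(model); here (-0.570)^2 is about
# 0.325, and the p-values agree (0.5712).
t_X2 <- coef(summary(model))["X2", "t value"]
cat("t^2 for X2:", t_X2^2, "(matches the partial F above)\n")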
#j)
#various R-squared values
r2_full <- summary(model)$r.squared
r2_X1 <- summary(lm(Y ~ X1, data = data))$r.squared
r2_X2 <- summary(lm(Y ~ X2, data = data))$r.squared
r2_X1_X2 <- summary(lm(Y ~ X1 + X2, data = data))$r.squared
# Sums of squares needed for the coefficients of partial determination
# (used in the sketch after the printout below)
ss_full <- anova(model)$"Sum Sq"
ss_X1_only <- anova(lm(Y ~ X1, data = data))$"Sum Sq"
ss_X2_given_X1 <- anova(lm(Y ~ X1 + X2, data = data))$"Sum Sq"[2]
cat("R-squared values:\n")
## R-squared values:
cat("R2_Y1:", r2_X1, "\n")
## R2_Y1: 0.04312473
cat("R2_Y2:", r2_X2, "\n")
## R2_Y2: 0.003603553
cat("R2_Y12:", r2_X1_X2, "\n")
## R2_Y12: 0.0449355
cat("R2 (full model):", r2_full, "\n")
## R2 (full model): 0.6883342
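# The coefficient of partial determination R2_Y2|1 = SSR(X2|X1) / SSE(X1) can
# now be assembled from the pieces computed above (an added sketch; output omitted):
sse_X1 <- anova(lm(Y ~ X1, data = data))$"Sum Sq"[2]  # SSE of the X1-only model
cat("R2_Y2|1:", ss_X2_given_X1 / sse_X1, "\n")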
#k)
#Number of observations
n <- nrow(data)
# Correlation transformation:
Y_star <- (1/sqrt(n-1)) * (data$Y - mean(data$Y)) / sd(data$Y)
X1_star <- (1/sqrt(n-1)) * (data$X1 - mean(data$X1)) / sd(data$X1)
X2_star <- (1/sqrt(n-1)) * (data$X2 - mean(data$X2)) / sd(data$X2)
X3_star <- (1/sqrt(n-1)) * (data$X3 - mean(data$X3)) / sd(data$X3)
data_star <- data.frame(Y_star = Y_star, X1_star = X1_star, X2_star = X2_star, X3_star = X3_star)
#standardized regression model (no intercept)
corr_model <- lm(Y_star ~ 0 + X1_star + X2_star + X3_star, data = data_star)
cat("Standardized Regression Model (Correlation Transformation):\n")
## Standardized Regression Model (Correlation Transformation):
summary(corr_model)
##
## Call:
## lm(formula = Y_star ~ 0 + X1_star + X2_star + X3_star, data = data_star)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.14849 -0.06227 -0.01266 0.04459 0.16632
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X1_star 0.17472 0.08009 2.181 0.034 *
## X2_star -0.04639 0.08053 -0.576 0.567
## X3_star 0.80786 0.08032 10.058 1.66e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07975 on 49 degrees of freedom
## Multiple R-squared: 0.6883, Adjusted R-squared: 0.6693
## F-statistic: 36.07 on 3 and 49 DF, p-value: 1.869e-12
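# Equivalent shortcut (an added sketch): because Y* and the X*'s are all scaled
# by the same 1/sqrt(n-1) factor, plain z-scores via scale() give identical
# standardized coefficients; all.equal() should return TRUE.
corr_model2 <- lm(scale(Y) ~ 0 + scale(X1) + scale(X2) + scale(X3), data = data)
all.equal(unname(coef(corr_model2)), unname(coef(corr_model)))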
#l)
#coefficients of determination between all pairs of predictor variables
cor_matrix <- cor(data[, c("X1", "X2", "X3")])
cat("Correlation matrix of predictor variables:\n")
## Correlation matrix of predictor variables:
print(cor_matrix)
## X1 X2 X3
## X1 1.00000000 0.08489639 0.04565698
## X2 0.08489639 1.00000000 0.11337076
## X3 0.04565698 0.11337076 1.00000000
# R-squared values between predictor pairs
r_sq_matrix <- cor_matrix^2
cat("\nR-squared values between predictor pairs:\n")
##
## R-squared values between predictor pairs:
print(r_sq_matrix)
## X1 X2 X3
## X1 1.000000000 0.007207396 0.00208456
## X2 0.007207396 1.000000000 0.01285293
## X3 0.002084560 0.012852929 1.00000000
# Check whether the standardized coefficients can be read as individual
# predictor effects; this requires low multicollinearity, so inspect the VIFs
vif_values <- vif(model)
cat("\nVariance Inflation Factors:\n")
##
## Variance Inflation Factors:
print(vif_values)
## X1 X2 X3
## 1.008596 1.019598 1.014364
if (max(vif_values) > 10) {
  cat(" Serious multicollinearity prevents meaningful interpretation\n")
} else if (max(vif_values) > 5) {
  cat(" Moderate multicollinearity affects interpretation\n")
} else {
  cat(" Low multicollinearity allows meaningful interpretation\n")
}
## Low multicollinearity allows meaningful interpretation
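# Sanity check (an added sketch): VIF_k = 1 / (1 - R2_k), where R2_k comes from
# regressing X_k on the remaining predictors; for X1 this should reproduce
# vif(model)["X1"], about 1.0086.
r2_X1_on_rest <- summary(lm(X1 ~ X2 + X3, data = data))$r.squared
cat("Hand-computed VIF for X1:", 1 / (1 - r2_X1_on_rest), "\n")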
#m)
# standardized coefficients from correlation model
std_coef <- coef(corr_model)
# Transform back to the original scale: b_k = b_k* (s_Y / s_k)
sd_Y <- sd(data$Y)
sd_X1 <- sd(data$X1)
sd_X2 <- sd(data$X2)
sd_X3 <- sd(data$X3)
b1_orig <- std_coef["X1_star"] * (sd_Y / sd_X1)
b2_orig <- std_coef["X2_star"] * (sd_Y / sd_X2)
b3_orig <- std_coef["X3_star"] * (sd_Y / sd_X3)
# Intercept: b0 = mean(Y) - b1*mean(X1) - b2*mean(X2) - b3*mean(X3)
b0_orig <- mean(data$Y) - b1_orig * mean(data$X1) - b2_orig * mean(data$X2) - b3_orig * mean(data$X3)
cat("Coefficients transformed back to original scale:\n")
## Coefficients transformed back to original scale:
cat("b0 =", b0_orig, "\n")
## b0 = 4149.887
cat("b1 =", b1_orig, "\n")
## b1 = 0.0007870804
cat("b2 =", b2_orig, "\n")
## b2 = -13.16602
cat("b3 =", b3_orig, "\n")
## b3 = 623.5545
original_coef <- coef(model)
cat("\nComparison with original model coefficients:\n")
##
## Comparison with original model coefficients:
comparison <- data.frame(
  Original = original_coef,
  Transformed = c(b0_orig, b1_orig, b2_orig, b3_orig)
)
print(comparison)
## Original Transformed
## (Intercept) 4.149887e+03 4.149887e+03
## X1 7.870804e-04 7.870804e-04
## X2 -1.316602e+01 -1.316602e+01
## X3 6.235545e+02 6.235545e+02
#n)
# SSR(X1)
model_X1 <- lm(Y ~ X1, data = data)
SSR_X1 <- sum((fitted(model_X1) - mean(data$Y))^2)
#SSR(X1|X2)
model_X2 <- lm(Y ~ X2, data = data)
model_X1_X2 <- lm(Y ~ X1 + X2, data = data)
anova_comparison <- anova(model_X2, model_X1_X2)
SSR_X1_given_X2 <- anova_comparison$"Sum of Sq"[2]
cat("SSR Comparison:\n")
## SSR Comparison:
cat("SSR(X1) =", SSR_X1, "\n")
## SSR(X1) = 136366.2
cat("SSR(X1|X2) =", SSR_X1_given_X2, "\n")
## SSR(X1|X2) = 130697.2
cat("Difference =", SSR_X1 - SSR_X1_given_X2, "\n")
## Difference = 5669.001
# Gauge whether the difference is substantial relative to SSR(X1)
rel_diff <- abs(SSR_X1 - SSR_X1_given_X2) / SSR_X1 * 100
cat("Relative difference:", round(rel_diff, 2), "%\n")
## Relative difference: 4.16 %
if (rel_diff < 1) {
  cat("The difference is not substantial.\n")
} else if (rel_diff < 10) {
  cat("The difference is moderate.\n")
} else {
  cat("The difference is substantial.\n")
}
## The difference is moderate.
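# The gap arises because X1 and X2 are (mildly) correlated, so fitting X2 first
# absorbs part of X1's sum of squares; with uncorrelated predictors SSR(X1) and
# SSR(X1|X2) would coincide. The relevant correlation is in the part (l) matrix above:
cat("cor(X1, X2):", cor(data$X1, data$X2), "\n")  # 0.0849 per the matrix in part (l)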
# Problem 2
data2 <- read.table("C:/Users/Sabuj Ganguly/OneDrive/Documents/PhD 1st sem/Regression/prob2data.txt", header = TRUE)
head(data2)
## Y X1 X2 X3 X4
## 1 88 86 110 100 87
## 2 80 62 97 99 100
## 3 96 110 107 103 103
## 4 76 101 117 93 95
## 5 80 100 101 95 88
## 6 73 78 85 95 84
#a)
library(car)
#Scatter plot matrix
scatterplotMatrix(data2, main = "Scatter Plot Matrix")

#Correlation matrix of X variables
cor_matrix <- cor(data2[, c("X1", "X2", "X3", "X4")])
print("Correlation Matrix of X variables:")
## [1] "Correlation Matrix of X variables:"
print(cor_matrix)
## X1 X2 X3 X4
## X1 1.0000000 0.1022689 0.1807692 0.3266632
## X2 0.1022689 1.0000000 0.5190448 0.3967101
## X3 0.1807692 0.5190448 1.0000000 0.7820385
## X4 0.3266632 0.3967101 0.7820385 1.0000000
#Full correlation matrix including Y
full_cor_matrix <- cor(data2)
print("Full Correlation Matrix:")
## [1] "Full Correlation Matrix:"
print(full_cor_matrix)
## Y X1 X2 X3 X4
## Y 1.0000000 0.5144107 0.4970057 0.8970645 0.8693865
## X1 0.5144107 1.0000000 0.1022689 0.1807692 0.3266632
## X2 0.4970057 0.1022689 1.0000000 0.5190448 0.3967101
## X3 0.8970645 0.1807692 0.5190448 1.0000000 0.7820385
## X4 0.8693865 0.3266632 0.3967101 0.7820385 1.0000000
#b)
#full model
full_model <- lm(Y ~ X1 + X2 + X3 + X4, data = data2)
summary(full_model)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9779 -3.4506 0.0941 2.4749 5.9959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.38182 9.94106 -12.512 6.48e-11 ***
## X1 0.29573 0.04397 6.725 1.52e-06 ***
## X2 0.04829 0.05662 0.853 0.40383
## X3 1.30601 0.16409 7.959 1.26e-07 ***
## X4 0.51982 0.13194 3.940 0.00081 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.099 on 20 degrees of freedom
## Multiple R-squared: 0.9629, Adjusted R-squared: 0.9555
## F-statistic: 129.7 on 4 and 20 DF, p-value: 5.262e-14
cat("\nTesting significance of each predictor:\n")
##
## Testing significance of each predictor:
summary_table <- summary(full_model)$coefficients
print(summary_table)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.38182058 9.94106316 -12.5119234 6.475642e-11
## X1 0.29572537 0.04397141 6.7254011 1.523710e-06
## X2 0.04828772 0.05661745 0.8528769 4.038262e-01
## X3 1.30601100 0.16409129 7.9590514 1.261726e-07
## X4 0.51981909 0.13194285 3.9397290 8.099714e-04
# Variance inflation factors to check for multicollinearity
vif_values <- vif(full_model)
cat("\nVariance Inflation Factors (VIF):\n")
##
## Variance Inflation Factors (VIF):
print(vif_values)
## X1 X2 X3 X4
## 1.138043 1.369512 3.016549 2.834776
#c)
library(leaps)
#Best subset selection
best_subsets <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2, nbest = 4)
summary_best <- summary(best_subsets)
cat("Best subset models by adjusted R-squared:\n")
## Best subset models by adjusted R-squared:
results_table <- data.frame(
  Predictors = apply(summary_best$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  R_squared = summary_best$rsq,
  Adj_R_squared = summary_best$adjr2,
  Cp = summary_best$cp,
  BIC = summary_best$bic
)
print(results_table[order(-results_table$Adj_R_squared), ])
## Predictors R_squared Adj_R_squared Cp BIC
## 9 (Intercept), X1, X3, X4 0.9615422 0.9560482 3.727399 -68.5793341
## 13 (Intercept), X1, X2, X3, X4 0.9628918 0.9554702 5.000000 -66.2535626
## 5 (Intercept), X1, X3 0.9329956 0.9269043 17.112978 -57.9183134
## 10 (Intercept), X1, X2, X3 0.9340931 0.9246779 18.521465 -55.1123162
## 6 (Intercept), X3, X4 0.8772573 0.8660988 47.153985 -42.7849927
## 11 (Intercept), X2, X3, X4 0.8789698 0.8616797 48.231020 -39.9173671
## 12 (Intercept), X1, X2, X4 0.8453581 0.8232664 66.346500 -33.7905799
## 7 (Intercept), X1, X4 0.8152656 0.7984716 80.565307 -32.5642766
## 1 (Intercept), X3 0.8047247 0.7962344 84.246496 -34.3958674
## 8 (Intercept), X2, X3 0.8060733 0.7884436 85.519650 -31.3502438
## 2 (Intercept), X4 0.7558329 0.7452170 110.597414 -28.8098126
## 3 (Intercept), X1 0.2646184 0.2326452 375.344689 -1.2463904
## 4 (Intercept), X2 0.2470147 0.2142762 384.832454 -0.6549869
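# A useful companion to the table (an added sketch): plot Cp against the number
# of parameters p; models near the line Cp = p show little bias. The X1, X3, X4
# model (Cp = 3.73 with p = 4) sits essentially on the line.
p <- rowSums(summary_best$which)  # parameter count, intercept included
plot(p, summary_best$cp, xlab = "p", ylab = "Cp", main = "Cp vs p")
abline(0, 1, col = "red")  # reference line Cp = p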
#d)
#model diagnostics for top models
cat("Additional criteria for model selection:\n")
## Additional criteria for model selection:
#top 4 models
top_models <- order(summary_best$adjr2, decreasing = TRUE)[1:4]
for (i in top_models) {
  cat("\n--- Model", i, "---\n")
  cat("Predictors:", paste(names(which(summary_best$which[i, ]))[-1], collapse = ", "), "\n")
  cat("Adj R sqr:", round(summary_best$adjr2[i], 4), "\n")
  cat("Cp:", round(summary_best$cp[i], 2), "\n")
  cat("BIC:", round(summary_best$bic[i], 2), "\n")
  # Refit the model for residual diagnostics
  model_vars <- names(which(summary_best$which[i, ]))[-1]
  if (length(model_vars) > 0) {
    fml <- as.formula(paste("Y ~", paste(model_vars, collapse = " + ")))
    current_model <- lm(fml, data = data2)
    # Residual standard error
    cat("RSE:", round(summary(current_model)$sigma, 3), "\n")
    # Normality of residuals
    shapiro_test <- shapiro.test(resid(current_model))
    cat("Shapiro-Wilk p-value:", round(shapiro_test$p.value, 4), "\n")
  }
}
##
## --- Model 9 ---
## Predictors: X1, X3, X4
## Adj R sqr: 0.956
## Cp: 3.73
## BIC: -68.58
## RSE: 4.072
## Shapiro-Wilk p-value: 0.1603
##
## --- Model 13 ---
## Predictors: X1, X2, X3, X4
## Adj R sqr: 0.9555
## Cp: 5
## BIC: -66.25
## RSE: 4.099
## Shapiro-Wilk p-value: 0.1741
##
## --- Model 5 ---
## Predictors: X1, X3
## Adj R sqr: 0.9269
## Cp: 17.11
## BIC: -57.92
## RSE: 5.251
## Shapiro-Wilk p-value: 0.6738
##
## --- Model 10 ---
## Predictors: X1, X2, X3
## Adj R sqr: 0.9247
## Cp: 18.52
## BIC: -55.11
## RSE: 5.331
## Shapiro-Wilk p-value: 0.4855
#e)
# Backward elimination
backward_model <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2,
                             method = "backward", nvmax = 4)
backward_summary <- summary(backward_model)
cat("Backward Elimination Results:\n")
## Backward Elimination Results:
backward_table <- data.frame(
  Predictors = apply(backward_summary$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  Adj_R_squared = backward_summary$adjr2
)
print(backward_table)
## Predictors Adj_R_squared
## 1 (Intercept), X3 0.7962344
## 2 (Intercept), X1, X3 0.9269043
## 3 (Intercept), X1, X3, X4 0.9560482
## 4 (Intercept), X1, X2, X3, X4 0.9554702
#f)
# Forward selection
forward_model <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2,
                            method = "forward", nvmax = 4)
forward_summary <- summary(forward_model)
cat("Forward Selection Results:\n")
## Forward Selection Results:
forward_table <- data.frame(
  Predictors = apply(forward_summary$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  Adj_R_squared = forward_summary$adjr2
)
print(forward_table)
## Predictors Adj_R_squared
## 1 (Intercept), X3 0.7962344
## 2 (Intercept), X1, X3 0.9269043
## 3 (Intercept), X1, X3, X4 0.9560482
## 4 (Intercept), X1, X2, X3, X4 0.9554702
#g)
# Stepwise regression
stepwise_model <- step(full_model, direction = "both", trace = 0)
cat("Stepwise Regression Final Model:\n")
## Stepwise Regression Final Model:
summary(stepwise_model)
##
## Call:
## lm(formula = Y ~ X1 + X3 + X4, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4579 -3.1563 -0.2057 1.8070 6.6083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.20002 9.87406 -12.578 3.04e-11 ***
## X1 0.29633 0.04368 6.784 1.04e-06 ***
## X3 1.35697 0.15183 8.937 1.33e-08 ***
## X4 0.51742 0.13105 3.948 0.000735 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.072 on 21 degrees of freedom
## Multiple R-squared: 0.9615, Adjusted R-squared: 0.956
## F-statistic: 175 on 3 and 21 DF, p-value: 5.16e-15
# leaps' "seqrep" method is sequential replacement, which is related to but not
# identical with the classical stepwise algorithm used by step() above
stepwise_model2 <- regsubsets(Y ~ X1 + X2 + X3 + X4, data = data2,
                              method = "seqrep", nvmax = 4)
stepwise_summary <- summary(stepwise_model2)
cat("\nStepwise Regression Results:\n")
##
## Stepwise Regression Results:
stepwise_table <- data.frame(
  Predictors = apply(stepwise_summary$which, 1, function(x) paste(names(x)[x], collapse = ", ")),
  Adj_R_squared = stepwise_summary$adjr2
)
print(stepwise_table)
## Predictors Adj_R_squared
## 1 (Intercept), X3 0.7962344
## 2 (Intercept), X1, X2 0.4154853
## 3 (Intercept), X1, X3, X4 0.9560482
## 4 (Intercept), X1, X2, X3, X4 0.9554702
# Compare all selection methods
cat("Comparison of All Selection Methods:\n\n")
## Comparison of All Selection Methods:
# Best model from each method
best_backward <- which.max(backward_summary$adjr2)
best_forward <- which.max(forward_summary$adjr2)
best_stepwise <- which.max(stepwise_summary$adjr2)
best_subset <- which.max(summary_best$adjr2)
comparison <- data.frame(
  Method = c("Best Subset", "Backward", "Forward", "Stepwise"),
  Predictors = c(
    paste(names(which(summary_best$which[best_subset, ]))[-1], collapse = ", "),
    paste(names(which(backward_summary$which[best_backward, ]))[-1], collapse = ", "),
    paste(names(which(forward_summary$which[best_forward, ]))[-1], collapse = ", "),
    paste(names(which(stepwise_summary$which[best_stepwise, ]))[-1], collapse = ", ")
  ),
  Adj_R2 = c(
    summary_best$adjr2[best_subset],
    backward_summary$adjr2[best_backward],
    forward_summary$adjr2[best_forward],
    stepwise_summary$adjr2[best_stepwise]
  )
)
print(comparison)
## Method Predictors Adj_R2
## 1 Best Subset X1, X3, X4 0.9560482
## 2 Backward X1, X3, X4 0.9560482
## 3 Forward X1, X3, X4 0.9560482
## 4 Stepwise X1, X3, X4 0.9560482
# Final recommendation
cat("\nFinal Recommendation:\n")
##
## Final Recommendation:
best_overall <- comparison[which.max(comparison$Adj_R2), ]
print(best_overall)
## Method Predictors Adj_R2
## 1 Best Subset X1, X3, X4 0.9560482
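# All four methods agree on X1, X3, X4. As a final check (an added sketch), the
# PRESS statistic gives a leave-one-out view of the recommended model's
# predictive performance; a PRESS close to SSE suggests the fit is not driven
# by a few influential cases. Output omitted.
final_model <- lm(Y ~ X1 + X3 + X4, data = data2)
press <- sum((resid(final_model) / (1 - hatvalues(final_model)))^2)
cat("PRESS:", press, " SSE:", sum(resid(final_model)^2), "\n")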