# Load the dataset
data("iris")

# View the first 6 rows
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Check the structure (the four measurements are numeric; Species is a factor)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Pick the numeric measurement columns by type instead of by position (1:4),
# so the summaries below do not silently depend on the column order
numeric_vars <- names(iris)[vapply(iris, is.numeric, logical(1))]

# Summary statistics
summary(iris[, numeric_vars])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Exploratory Data Analysis (EDA) ----

# Correlation matrix of the 4 numeric variables
cor(iris[, numeric_vars])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
# Scatterplot matrix
pairs(~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
      data = iris, main = "Iris Scatterplot Matrix", pch = 19, col = "blue")

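The correlation matrix shows that Petal.Length and Petal.Width are very strongly correlated (r ≈ 0.96). This phenomenon is called multicollinearity: independent variables that are too highly correlated with each other.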

# Build the initial multiple linear regression model ----

# Regress sepal length on the other three flower measurements
model_initial <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)

# Inspect the coefficient table and the overall fit statistics
summary(model_initial)
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
##     data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82816 -0.21989  0.01875  0.19709  0.84570 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.85600    0.25078   7.401 9.85e-12 ***
## Sepal.Width   0.65084    0.06665   9.765  < 2e-16 ***
## Petal.Length  0.70913    0.05672  12.502  < 2e-16 ***
## Petal.Width  -0.55648    0.12755  -4.363 2.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3145 on 146 degrees of freedom
## Multiple R-squared:  0.8586, Adjusted R-squared:  0.8557 
## F-statistic: 295.5 on 3 and 146 DF,  p-value: < 2.2e-16

# Check assumptions and fix the model ----

# Load the 'car' package for VIF.
# Install it only when it is missing: an unconditional install.packages()
# re-downloads the package on every run and fails without network access.
# (The machine-specific installation log is therefore no longer recorded here.)
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car", repos = "https://cloud.r-project.org")
}
library(car)
## Loading required package: carData
# Check VIF for the initial model
vif(model_initial)
##  Sepal.Width Petal.Length  Petal.Width 
##     1.270815    15.097572    14.234335
# Refit the regression with Petal.Width left out of the predictors
model_final <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = iris
)

# Review the coefficient table and fit statistics of the reduced model
summary(model_final)
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96159 -0.23489  0.00077  0.21453  0.78557 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.24914    0.24797    9.07 7.04e-16 ***
## Sepal.Width   0.59552    0.06933    8.59 1.16e-14 ***
## Petal.Length  0.47192    0.01712   27.57  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3333 on 147 degrees of freedom
## Multiple R-squared:  0.8402, Adjusted R-squared:  0.838 
## F-statistic: 386.4 on 2 and 147 DF,  p-value: < 2.2e-16
# Draw the regression diagnostic plots for the final model in a 2 x 2 grid.
# par() returns the settings it replaces, so keep them and restore exactly
# that state afterwards instead of hard-coding a reset to c(1, 1).
old_par <- par(mfrow = c(2, 2))
plot(model_final)

par(old_par)
# Visualization ----

library(ggplot2)

# Attach the final model's predictions for the training rows to the data
iris[["Predicted"]] <- predict(model_final)

# Observed vs. predicted sepal length; points on the red identity line
# (intercept 0, slope 1) are predicted exactly
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Predicted)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linewidth = 1, color = "red") +
  labs(title = "Actual vs Predicted Sepal Length",
       x = "Actual Values",
       y = "Predicted Values") +
  theme_minimal()

====== Question 2: Variable Selection Methods in R ======

Variable selection is the process of identifying the most important predictor variables for a regression model. It helps improve model accuracy, reduce overfitting, and simplify interpretation.

1. Forward Selection

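Forward selection starts with no predictors and adds variables one at a time. At each step it adds the variable that improves the model the most (here, the one that lowers the AIC the most), and it stops when no further addition improves the model.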

library(MASS)

# Search bounds: the intercept-only model (starting point) and the model
# with every other mtcars column as a predictor (largest model allowed)
null_model <- lm(formula = mpg ~ 1, data = mtcars)
full_model <- lm(formula = mpg ~ ., data = mtcars)

# Forward selection by AIC, growing from the null model towards the full model
forward_model <- stepAIC(
  object = null_model,
  direction = "forward",
  scope = list(upper = full_model, lower = null_model),
  trace = FALSE
)

summary(forward_model)
## 
## Call:
## lm(formula = mpg ~ wt + cyl + hp, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9290 -1.5598 -0.5311  1.1850  5.8986 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.75179    1.78686  21.687  < 2e-16 ***
## wt          -3.16697    0.74058  -4.276 0.000199 ***
## cyl         -0.94162    0.55092  -1.709 0.098480 .  
## hp          -0.01804    0.01188  -1.519 0.140015    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared:  0.8431, Adjusted R-squared:  0.8263 
## F-statistic: 50.17 on 3 and 28 DF,  p-value: 2.184e-11

2. Backward Elimination

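Backward elimination starts with all predictors and removes one variable at each step. With stepAIC, the variable removed is the one whose removal lowers the AIC the most (rather than the one with the largest p-value), and the process stops when no removal improves the model.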

library(MASS)

# Start from the model that uses every other mtcars column as a predictor
full_model <- lm(formula = mpg ~ ., data = mtcars)

# Backward elimination by AIC: only removals are considered at each step
backward_model <- stepAIC(
  object = full_model,
  trace = FALSE,
  direction = "backward"
)

summary(backward_model)
## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

3. Stepwise Selection

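Stepwise selection combines forward selection and backward elimination: at each step a variable may be either added to or removed from the model, and the process repeats until no single change improves the selection criterion (AIC).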

library(MASS)

# Start from the model that uses every other mtcars column as a predictor
full_model <- lm(formula = mpg ~ ., data = mtcars)

# Stepwise selection by AIC: each step may drop a term or add one back
stepwise_model <- stepAIC(
  object = full_model,
  trace = FALSE,
  direction = "both"
)

summary(stepwise_model)
## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

Comparison of Methods

The models selected by the three methods can be compared using their AIC values (lower is better):

# AIC of the model chosen by each selection method (lower is better)
stats::AIC(forward_model)
## [1] 155.4766
stats::AIC(backward_model)
## [1] 154.1194
stats::AIC(stepwise_model)
## [1] 154.1194

Conclusion

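Forward Selection, Backward Elimination, and Stepwise Selection are common variable selection methods used in multiple linear regression. Among these methods, the model with the lowest AIC and highest Adjusted R-squared is generally preferred because it balances model fit and complexity. In this analysis, backward elimination and stepwise selection both chose mpg ~ wt + qsec + am (AIC = 154.12, Adjusted R-squared = 0.8336), which is slightly better than the forward-selection model mpg ~ wt + cyl + hp (AIC = 155.48, Adjusted R-squared = 0.8263).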