# Make the built-in iris data set available in the workspace
data(iris)
# Preview the first 6 observations
head(x = iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Inspect the structure: four numeric measurements plus the Species factor
utils::str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Min, quartiles, median, mean and max of the four numeric measurements
summary(
  iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Exploratory Data Analysis (EDA)
# Pairwise Pearson correlations among the four numeric measurements
cor(
  iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  method = "pearson"
)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
# Scatterplot matrix of every pair of numeric measurements
pairs(
  iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  main = "Iris Scatterplot Matrix",
  pch = 19,
  col = "blue"
)
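The correlation matrix shows that Petal.Length and Petal.Width are very strongly correlated with each other (r = 0.96). This phenomenon is called multicollinearity: two or more independent variables are so highly correlated with each other that their individual effects on the response are hard to separate.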
# Build the Initial Multiple Linear Regression Model
# Regress sepal length on the three remaining measurements
model_initial <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)
# Coefficient table, residual summary and goodness-of-fit statistics
summary(model_initial)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
## data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82816 -0.21989 0.01875 0.19709 0.84570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.85600 0.25078 7.401 9.85e-12 ***
## Sepal.Width 0.65084 0.06665 9.765 < 2e-16 ***
## Petal.Length 0.70913 0.05672 12.502 < 2e-16 ***
## Petal.Width -0.55648 0.12755 -4.363 2.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3145 on 146 degrees of freedom
## Multiple R-squared: 0.8586, Adjusted R-squared: 0.8557
## F-statistic: 295.5 on 3 and 146 DF, p-value: < 2.2e-16
# Check Assumptions and Fix the Model
# The 'car' package provides vif(). Install it only when it is missing, so
# that re-running the script does not re-download the package (and does not
# need network access) every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car", repos = "https://cloud.r-project.org")
}
## Installing package into 'C:/Users/LENOVO/AppData/Local/R/win-library/4.6'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LENOVO\AppData\Local\Temp\RtmpeEpuRp\downloaded_packages
library(car)
## Loading required package: carData
# Variance inflation factors of the predictors in the initial model
car::vif(model_initial)
## Sepal.Width Petal.Length Petal.Width
## 1.270815 15.097572 14.234335
# Refit with Petal.Width dropped from the predictors
model_final <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = iris
)
# Coefficient table and fit statistics of the reduced model
summary(model_final)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96159 -0.23489 0.00077 0.21453 0.78557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.24914 0.24797 9.07 7.04e-16 ***
## Sepal.Width 0.59552 0.06933 8.59 1.16e-14 ***
## Petal.Length 0.47192 0.01712 27.57 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3333 on 147 degrees of freedom
## Multiple R-squared: 0.8402, Adjusted R-squared: 0.838
## F-statistic: 386.4 on 2 and 147 DF, p-value: < 2.2e-16
# Draw the diagnostic plots of the final model in a 2 x 2 grid.
# par() returns the settings it replaces, so keep them and restore them
# afterwards instead of assuming the layout was 1 x 1 beforehand.
old_par <- par(mfrow = c(2, 2))
plot(model_final)
par(old_par)
# Visualization
library(ggplot2)
# Store the final model's predictions next to the observed data
iris[["Predicted"]] <- predict(model_final)
# Observed vs. predicted sepal length; the red line is y = x
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Predicted)) +
  geom_point(colour = "steelblue", alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, colour = "red", linewidth = 1) +
  labs(
    x = "Actual Values",
    y = "Predicted Values",
    title = "Actual vs Predicted Sepal Length"
  ) +
  theme_minimal()
## Question 2: Variable Selection Methods in R
Variable selection is the process of identifying the most important predictor variables for a regression model. It helps improve model accuracy, reduce overfitting, and simplify interpretation.
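Forward selection starts with no predictors (the intercept-only model) and adds variables one at a time. At each step it adds the variable that most improves the selection criterion (here AIC, as used by stepAIC) and stops when no addition improves it.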
library(MASS)
# Search range: from the intercept-only model up to the model that uses
# every other mtcars column as a predictor
null_model <- lm(formula = mpg ~ 1, data = mtcars)
full_model <- lm(formula = mpg ~ ., data = mtcars)
# Forward selection by AIC, starting from the intercept-only model
forward_model <- stepAIC(
  object = null_model,
  scope = list(upper = full_model, lower = null_model),
  direction = "forward",
  trace = FALSE
)
summary(forward_model)
##
## Call:
## lm(formula = mpg ~ wt + cyl + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9290 -1.5598 -0.5311 1.1850 5.8986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
## wt -3.16697 0.74058 -4.276 0.000199 ***
## cyl -0.94162 0.55092 -1.709 0.098480 .
## hp -0.01804 0.01188 -1.519 0.140015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
## F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
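Backward elimination starts with all predictors and, at each step, removes the variable whose removal most improves the selection criterion (here AIC, as used by stepAIC). It stops when no removal improves the criterion.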
library(MASS)
# Start from the model that uses every other mtcars column as a predictor
full_model <- lm(formula = mpg ~ ., data = mtcars)
# Backward elimination by AIC
backward_model <- stepAIC(
  object = full_model,
  direction = "backward",
  trace = FALSE
)
summary(backward_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
Stepwise selection combines forward selection and backward elimination by adding and removing variables iteratively.
library(MASS)
# Start from the model that uses every other mtcars column as a predictor
full_model <- lm(formula = mpg ~ ., data = mtcars)
# Stepwise search by AIC: variables may be dropped and re-added at each step
stepwise_model <- stepAIC(
  object = full_model,
  direction = "both",
  trace = FALSE
)
summary(stepwise_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
The selected models can be compared using their AIC values (lower is better):
# AIC of the forward-selection model
stats::AIC(forward_model)
## [1] 155.4766
# AIC of the backward-elimination model
stats::AIC(backward_model)
## [1] 154.1194
# AIC of the stepwise-selection model
stats::AIC(stepwise_model)
## [1] 154.1194
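Forward selection, backward elimination, and stepwise selection are common variable selection methods used in multiple linear regression. Among the candidate models they produce, the one with the lowest AIC and the highest adjusted R-squared is generally preferred because it balances model fit and complexity. In this example, backward elimination and stepwise selection choose the same model (mpg ~ wt + qsec + am). It has a lower AIC (154.12 vs. 155.48) and a higher adjusted R-squared (0.8336 vs. 0.8263) than the forward-selection model (mpg ~ wt + cyl + hp), so it is the preferred model.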