library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the built-in iris dataset into the workspace
data("iris")
# Preview the first six rows
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Fit a multiple linear regression of sepal length on both petal measurements.
# The formula is written inline so the "Call:" line of the summary is unchanged.
model <- lm(
  formula = Sepal.Length ~ Petal.Length + Petal.Width,
  data = iris
)
# Coefficient table, residual spread, and overall fit statistics
summary(model)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length + Petal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.18534 -0.29838 -0.02763 0.28925 1.02320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.19058 0.09705 43.181 < 2e-16 ***
## Petal.Length 0.54178 0.06928 7.820 9.41e-13 ***
## Petal.Width -0.31955 0.16045 -1.992 0.0483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4031 on 147 degrees of freedom
## Multiple R-squared: 0.7663, Adjusted R-squared: 0.7631
## F-statistic: 241 on 2 and 147 DF, p-value: < 2.2e-16
# Total number of missing cells across the whole data frame
sum(is.na(iris))
## [1] 0
# Summary statistics for all variables
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Variable names, data types, and number of observations
str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Count the number of missing values in each variable
colSums(is.na(iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
# Remove rows containing missing values; the cleaned copy is stored in `tree`
tree <- na.omit(iris)
# Check the dimensions of the cleaned dataset. This must inspect `tree`, not
# the raw `iris` object, otherwise dropped rows would go unnoticed.
dim(tree)
## [1] 150 5
# Refit the two-predictor regression of sepal length on the petal measurements
model <- lm(
  formula = Sepal.Length ~ Petal.Length + Petal.Width,
  data = iris
)
# In-sample predictions: one predicted Sepal.Length per row of the data
predicted_sepal_length <- predict(object = model)
# Show the first six predicted values
head(predicted_sepal_length, n = 6L)
## 1 2 3 4 5 6
## 4.885160 4.885160 4.830983 4.939338 4.885160 4.983783
# Outlier screening: one boxplot per numeric measurement, all in a single
# figure. Passing the four columns as a data frame draws them in column order;
# `names` supplies the axis labels.
boxplot(
  tree[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  names = c("Sepal Length", "Sepal Width", "Petal Length", "Petal Width"),
  main = "Outlier Detection for Iris Numerical Variables",
  ylab = "Measurement (cm)",
  col = c("lightblue", "lightgreen", "lightyellow", "lightsalmon"),
  outline = TRUE # draw points beyond the whiskers as individual dots
)
### MODEL DIAGNOSTICS
# plot() on the fitted lm object draws four diagnostic plots:
#   1. Residuals vs Fitted   - checks the linearity assumption.
#   2. Normal Q-Q            - checks normality of the residuals.
#   3. Scale-Location        - checks constant variance (homoscedasticity).
#   4. Residuals vs Leverage - identifies influential observations.
# par() returns the settings it replaces, so keep them and restore them once
# the diagnostics are drawn; otherwise every later plot in the session would
# be squeezed into one cell of the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2)) # Arrange four plots in a 2 x 2 layout
plot(model)
par(old_par) # Back to the previous (single-panel) layout
### SCATTER PLOTS FOR VISUALIZATION
# Sepal length against petal length, with the simple-regression line overlaid
with(
  iris,
  plot(
    Petal.Length, Sepal.Length,
    main = "Sepal.Length vs Petal.Length",
    xlab = "Petal Length (cm)",
    ylab = "Sepal Length (cm)"
  )
)
abline(
  reg = lm(Sepal.Length ~ Petal.Length, data = iris),
  col = "tomato",
  lwd = 2
)
# Sepal length against petal width, with the simple-regression line overlaid
with(
  iris,
  plot(
    Petal.Width, Sepal.Length,
    main = "Sepal.Length vs Petal.Width",
    xlab = "Petal Width (cm)",
    ylab = "Sepal Length (cm)"
  )
)
abline(
  reg = lm(Sepal.Length ~ Petal.Width, data = iris),
  col = "darkblue",
  lwd = 2
)
## MULTIPLE LINEAR REGRESSION SUMMARY
# The multiple linear regression shows that Petal.Length and Petal.Width have
# partial effects of opposite sign on Sepal.Length. Petal.Length is the
# strongest predictor: holding petal width constant, a one-cm increase in petal
# length is associated with an increase of approximately 0.54 cm in sepal
# length. Holding petal length constant, a one-cm increase in Petal.Width is
# associated with a decrease of approximately 0.32 cm in sepal length, an
# effect that is only marginally significant (p = 0.048). Because the two
# petal measurements are highly correlated with each other (r = 0.96), the
# individual coefficients should be interpreted with caution. The model
# explains 76.6% of the variability in Sepal.Length (R² = 0.766), demonstrating
# a good fit, and can be used to predict the sepal length of iris flowers
# based on their petal measurements.
# Regression behind the interpretation: Sepal.Length modelled on Petal.Length
# and Petal.Width using the full iris data
model <- lm(
  formula = Sepal.Length ~ Petal.Length + Petal.Width,
  data = iris
)
# Print estimates, standard errors, t-tests, R-squared, and the F-test
summary(model)
##
## Call:
## lm(formula = Sepal.Length ~ Petal.Length + Petal.Width, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.18534 -0.29838 -0.02763 0.28925 1.02320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.19058 0.09705 43.181 < 2e-16 ***
## Petal.Length 0.54178 0.06928 7.820 9.41e-13 ***
## Petal.Width -0.31955 0.16045 -1.992 0.0483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4031 on 147 degrees of freedom
## Multiple R-squared: 0.7663, Adjusted R-squared: 0.7631
## F-statistic: 241 on 2 and 147 DF, p-value: < 2.2e-16
## CORRELATION MATRIX
# Pairwise Pearson correlations show the strength and direction of the linear
# relationships between variables. Values close to 1 indicate a strong
# positive correlation; values close to -1 a strong negative one.
# Correlations among the response and the two predictors used in the model
cor(iris[c("Sepal.Length", "Petal.Length", "Petal.Width")])
## Sepal.Length Petal.Length Petal.Width
## Sepal.Length 1.0000000 0.8717538 0.8179411
## Petal.Length 0.8717538 1.0000000 0.9628654
## Petal.Width 0.8179411 0.9628654 1.0000000
# Correlations among all four numerical variables
cor(iris[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
Definition: Stepwise selection is an automated algorithm that adds or removes variables one at a time based on statistical tests or information criteria such as AIC or BIC.
Role: To quickly reduce a long list of candidate predictors to a manageable baseline model.
When to use: Use this when you have a moderate number of predictors (fewer than 30) and need a fast, traditional statistical model in which every step is easy to explain.
Example:
# Step 1: start from the full model, regressing mpg on every other column
full_model <- lm(formula = mpg ~ ., data = mtcars)
# Step 2: bidirectional stepwise search scored by AIC; trace = 0 silences the
# per-step log
step_model <- step(
  object = full_model,
  direction = "both",
  trace = 0
)
# Step 3: inspect the variables retained in the final model
summary(step_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
Definition: Best subset selection is a brute-force search that evaluates every possible combination of variables to find the best-fitting model of each size.
Role: To guarantee that you do not miss the single best combination of predictors for a given model size.
When to use: Use this only when your dataset has a small number of predictors (fewer than 20), because the number of candidate models doubles with every additional predictor, and you want certainty that you have found the best subset.
Definition: LASSO is a regression technique that adds a penalty on the absolute size of the coefficients, shrinking them and forcing the least important ones to become exactly zero.
Role: To perform automated variable selection and prevent overfitting simultaneously.
When to use: Use this when you have more variables than observations (p > n), or when you suspect that only a few variables out of a very large list are actually important.
Definition: Random forest variable importance trains an ensemble of decision trees and measures each variable’s importance either by the mean decrease in node impurity (Gini) from splits on that variable, or by the mean decrease in accuracy when that variable’s values are permuted.
Role: Model-agnostic screening. It detects non-linear effects and interactions that regression-based methods miss, which makes it a good first step before modelling.