library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(readxl)
library(readr)
#knitr::opts_chunk$set(error = TRUE)
Dataset: state.x77
# Load and prepare dataset
head(state.x77)
## Population Income Illiteracy Life Exp Murder HS Grad Frost Area
## Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
## California 21198 5114 1.1 71.71 10.3 62.6 20 156361
## Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
statedata <- data.frame(state.x77)
colnames(statedata)[4] <- "Life.Exp"
colnames(statedata)[6] <- "HS.Grad"
head(statedata)
## Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
## California 21198 5114 1.1 71.71 10.3 62.6 20 156361
## Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
cat("\n STRUCTURE\n", strrep("===",10),"\n")
##
## STRUCTURE
## ==============================
str(statedata)
## 'data.frame': 50 obs. of 8 variables:
## $ Population: num 3615 365 2212 2110 21198 ...
## $ Income : num 3624 6315 4530 3378 5114 ...
## $ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life.Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS.Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : num 50708 566432 113417 51945 156361 ...
cat("\n SUMMARY\n", strrep("===",10),"\n")
##
## SUMMARY
## ==============================
summary(statedata)
## Population Income Illiteracy Life.Exp
## Min. : 365 Min. :3098 Min. :0.500 Min. :67.96
## 1st Qu.: 1080 1st Qu.:3993 1st Qu.:0.625 1st Qu.:70.12
## Median : 2838 Median :4519 Median :0.950 Median :70.67
## Mean : 4246 Mean :4436 Mean :1.170 Mean :70.88
## 3rd Qu.: 4968 3rd Qu.:4814 3rd Qu.:1.575 3rd Qu.:71.89
## Max. :21198 Max. :6315 Max. :2.800 Max. :73.60
## Murder HS.Grad Frost Area
## Min. : 1.400 Min. :37.80 Min. : 0.00 Min. : 1049
## 1st Qu.: 4.350 1st Qu.:48.05 1st Qu.: 66.25 1st Qu.: 36985
## Median : 6.850 Median :53.25 Median :114.50 Median : 54277
## Mean : 7.378 Mean :53.11 Mean :104.46 Mean : 70736
## 3rd Qu.:10.675 3rd Qu.:59.15 3rd Qu.:139.75 3rd Qu.: 81163
## Max. :15.100 Max. :67.30 Max. :188.00 Max. :566432
My interest is to look at the history of all 50 states and find out ” How a state’s Murder rate, Graduation rate, Illiteracy, and Income levels simultaneously impact or predict the average Life Expectancy of its citizens”
model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income,
data = statedata)
summary(model)
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Illiteracy + Income,
## data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.56498 -0.53611 0.05303 0.58972 1.73972
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.4833066 1.3253230 52.427 < 2e-16 ***
## Murder -0.2619402 0.0444659 -5.891 4.53e-07 ***
## HS.Grad 0.0461443 0.0218485 2.112 0.0403 *
## Illiteracy 0.2760771 0.3105081 0.889 0.3787
## Income 0.0001249 0.0002422 0.516 0.6084
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8049 on 45 degrees of freedom
## Multiple R-squared: 0.6698, Adjusted R-squared: 0.6405
## F-statistic: 22.82 on 4 and 45 DF, p-value: 2.39e-10
Coefficients: Estimate 1. (Intercept) 69.48 : baseline life expectancy
Murder -0.262 : each unit increase in murder rate DECREASES life expectancy by 0.262 years
HS.Grad 0.046 : each 1% increase in graduation rate INCREASES life expectancy by 0.046 years
4.Illiteracy 0.276 :NOT significant (p = 0.3787)
5.Income 0.0001 : NOT significant (p = 0.6084)
F-statistic p-value = 2.39e-10
the overall model is highly statistically significant (p < 0.05)
#install.packages("GGally")
library(GGally)
# Create a correlation matrix plot for model variables
ggpairs(statedata, columns = c("Life.Exp", "Murder", "HS.Grad", "Illiteracy", "Income"),
title = "Correlation Matrix of Model Variables")
1. How variables affect Life Expectancy Murder Rate: This has the
strongest connection. High murder rates strongly match lower life
expectancy.
High School Graduation: States with more high school graduates clearly have a longer life expectancy.
Illiteracy: High illiteracy rates match lower life expectancy.
Income: Higher income connects to longer life, but this connection is the weakest of the four.
Illiteracy and Murder are highly linked (0.703): States with high illiteracy also tend to have high murder rates. Because they share the same information, the model chooses Murder as the main predictor and makes Illiteracy look unnecessary.
Illiteracy and Graduation are highly linked (-0.657): This makes sense because higher illiteracy naturally goes hand-in-hand with lower graduation rates.
Income and Graduation are highly linked (0.620): Higher income strongly connects to higher graduation rates.
# 1. Another model based on our variable selection insights
simple_model <- lm(Life.Exp ~ Murder + HS.Grad, data = statedata)
# 2. Check the summary of the new model
summary(simple_model)
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad, data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.66758 -0.41801 0.05602 0.55913 2.05625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.29708 1.01567 69.213 < 2e-16 ***
## Murder -0.23709 0.03529 -6.719 2.18e-08 ***
## HS.Grad 0.04389 0.01613 2.721 0.00909 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7959 on 47 degrees of freedom
## Multiple R-squared: 0.6628, Adjusted R-squared: 0.6485
## F-statistic: 46.2 on 2 and 47 DF, p-value: 8.016e-12
Adjusted R-squared stays almost exactly the same as your full model (around 0.648).
#model equation
# Life.Exp = 69.48 - 0.262(Murder) + 0.046(HS.Grad)
# + 0.276(Illiteracy) + 0.0001(Income)
par(mfrow = c(2, 2))
plot(model)
# check:
# Residuals vs Fitted → random scatter = linearity OK
# Normal Q-Q → points on diagonal = normality OK
# Scale-Location → flat band = equal variance OK
# Residuals vs Leverage → any named points = potential outliers
Residuals vs Fitted: The red line is roughly flat and points scatter randomly around zero, linearity assumption is reasonably satisfied. Hawaii appears as a mild outlier above.
Normal Q-Q: Most points follow the diagonal line closely, normality assumption is satisfied. Alaska and Maine deviate slightly at the lower tail.
Scale-Location: The band is roughly horizontal, homoscedasticity (equal variance) assumption is satisfied.
Residuals vs Leverage: Hawaii has high residual but low leverage. Alaska falls near the Cook’s distance boundary (0.5 line), it may be an influential observation worth investigating.
#test if my model works:
newstate <- data.frame(Murder = 8,
HS.Grad = 55,
Illiteracy = 1.5,
Income = 4500)
predict(model, newdata = newstate, interval = "confidence")
## fit lwr upr
## 1 70.90211 70.57825 71.22596
# interval="confidence" gives you lower and upper bounds too
library(ggplot2)
ggplot(statedata, aes(x = Murder, y = Life.Exp)) +
geom_point(pch = 21, fill = "black", size = 3) +
geom_smooth(method = "lm", color = "navy", se = TRUE) +
labs(title = "Murder Rate vs Life Expectancy",
x = "Murder Rate (per 100,000)",
y = "Life Expectancy (years)")
## `geom_smooth()` using formula = 'y ~ x'
When a dataset contains a large number of predictors, including every variable can introduce unnecessary noise and cause over fitting. Many of these predictors are often redundant or do not add any value to the analysis.
Variable selection is the statistical process of identifying the most important features to include in a model. In R programming, this can be done using automated functions like step for stepwise regression, or packages like glmnet for regularization. The goal is to eliminate irrelevant data to build the simplest and most accurate model possible.
Method 1:Stepwise Selection
Concept: Automatically adds or removes variables one at a time based on AIC. Three directions:
A. backward: Starts with all variables, removes the weakest one at a time B.forward: Starts with no variables, adds the strongest one at a time C.both: Combines both most thorough
Key rule: at each step, the variable whose removal/addition gives the lowest AIC is selected. Stops when no change improves AIC.
# Full model with all 4 predictors
full_model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income, data = statedata)
# Automatic stepwise selection (checking both directions)
stepwise_model <- step(full_model, direction = "both")
## Start: AIC=-16.97
## Life.Exp ~ Murder + HS.Grad + Illiteracy + Income
##
## Df Sum of Sq RSS AIC
## - Income 1 0.1725 29.328 -18.6732
## - Illiteracy 1 0.5122 29.668 -18.0974
## <none> 29.156 -16.9682
## - HS.Grad 1 2.8901 32.046 -14.2425
## - Murder 1 22.4835 51.640 9.6132
##
## Step: AIC=-18.67
## Life.Exp ~ Murder + HS.Grad + Illiteracy
##
## Df Sum of Sq RSS AIC
## - Illiteracy 1 0.4419 29.770 -19.9255
## <none> 29.328 -18.6732
## + Income 1 0.1725 29.156 -16.9682
## - HS.Grad 1 4.8596 34.188 -13.0073
## - Murder 1 22.4531 51.782 7.7505
##
## Step: AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
##
## Df Sum of Sq RSS AIC
## <none> 29.770 -19.925
## + Illiteracy 1 0.4419 29.328 -18.673
## + Income 1 0.1022 29.668 -18.097
## - HS.Grad 1 4.6910 34.461 -14.609
## - Murder 1 28.5974 58.368 11.737
# View the final chosen model
summary(stepwise_model)
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad, data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.66758 -0.41801 0.05602 0.55913 2.05625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.29708 1.01567 69.213 < 2e-16 ***
## Murder -0.23709 0.03529 -6.719 2.18e-08 ***
## HS.Grad 0.04389 0.01613 2.721 0.00909 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7959 on 47 degrees of freedom
## Multiple R-squared: 0.6628, Adjusted R-squared: 0.6485
## F-statistic: 46.2 on 2 and 47 DF, p-value: 8.016e-12
The above it checks the AIC at each step. It will automatically drop Illiteracy and Income because removing them lowers the AIC score, leaving you with just Murder and HS.Grad.
# Run the stepwise selection
full_model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income, data = statedata)
stepwise_model <- step(full_model, direction = "both")
## Start: AIC=-16.97
## Life.Exp ~ Murder + HS.Grad + Illiteracy + Income
##
## Df Sum of Sq RSS AIC
## - Income 1 0.1725 29.328 -18.6732
## - Illiteracy 1 0.5122 29.668 -18.0974
## <none> 29.156 -16.9682
## - HS.Grad 1 2.8901 32.046 -14.2425
## - Murder 1 22.4835 51.640 9.6132
##
## Step: AIC=-18.67
## Life.Exp ~ Murder + HS.Grad + Illiteracy
##
## Df Sum of Sq RSS AIC
## - Illiteracy 1 0.4419 29.770 -19.9255
## <none> 29.328 -18.6732
## + Income 1 0.1725 29.156 -16.9682
## - HS.Grad 1 4.8596 34.188 -13.0073
## - Murder 1 22.4531 51.782 7.7505
##
## Step: AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
##
## Df Sum of Sq RSS AIC
## <none> 29.770 -19.925
## + Illiteracy 1 0.4419 29.328 -18.673
## + Income 1 0.1022 29.668 -18.097
## - HS.Grad 1 4.6910 34.461 -14.609
## - Murder 1 28.5974 58.368 11.737
# Extract the AIC values from the stepwise history and plot them as a line
aic_history <- stepwise_model$anova$AIC
plot(aic_history, type = "b", pch = 19, col = "blue", xlab = "Step", ylab = "AIC Score",
main = "Stepwise Selection: Downward AIC Linear Path", xaxt = "n")
axis(1, at = 1:length(aic_history), labels = c("Start", "Drop Income", "Drop Illiteracy"))
Method 2: AIC and BIC Criteria
Concept: Compare competing models using a score, lower is better.
AIC (Akaike Information Criterion): Penalizes adding extra variables moderately, so it tends to select slightly larger models.
BIC (Bayesian Information Criterion): Penalizes adding extra variables more heavily than AIC, so it tends to select simpler and smaller models.
Rule: If adding a variable does NOT lower AIC or BIC, that variable is not helping the model.
# Define the two models
full_model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income, data = statedata)
simple_model <- lm(Life.Exp ~ Murder + HS.Grad, data = statedata)
# Compare AIC values (Lower is better)
AIC(full_model, simple_model)
## df AIC
## full_model 6 126.9257
## simple_model 4 123.9684
# Compare BIC values (Lower is better)
BIC(full_model, simple_model)
## df BIC
## full_model 6 138.3978
## simple_model 4 131.6165
The above shows that simple_model has a lower AIC and a much lower BIC score. This confirms that adding Illiteracy and Income does not help the model enough to justify their complexity.
# Define the two models
simple_model <- lm(Life.Exp ~ Murder + HS.Grad, data = statedata)
# Create a clean line plot comparing the scores
plot(c(1, 2), c(AIC(full_model), AIC(simple_model)), type = "b", pch = 19, col = "darkgreen",
ylim = c(120, 140), xlab = "Model Complexity", ylab = "Information Criterion Score",
main = "AIC & BIC Model Comparison Lines", xaxt = "n")
lines(c(1, 2), c(BIC(full_model), BIC(simple_model)), type = "b", pch = 17, col = "darkred", lty = 2)
axis(1, at = c(1, 2), labels = c("Full Model (4 Vars)", "Simple Model (2 Vars)"))
legend("topright", legend = c("AIC Line", "BIC Line"), col = c("darkgreen", "darkred"), lty = c(1, 2), pch = c(19, 17))
Method 3: LASSO (Least Absolute Shrinkage and Selection Operator) Concept: Penalizes large coefficients. Weak predictors are shrunk all the way to exactly zero, meaning they are automatically removed from the model. Unlike stepwise, LASSO does selection and estimation at the same time.
#install.packages("glmnet")
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 5.0
# 1. Separate your dataset into your predictors (X) and your outcome target (Y)
X <- as.matrix(statedata[, c("Murder", "HS.Grad", "Illiteracy", "Income")])
Y <- statedata$Life.Exp
# 2. Run cross-validation to find the best penalty level
cv_lasso <- cv.glmnet(X, Y, alpha = 1)
# 3. Print the final results to see which variables stay and which are dropped
coef(cv_lasso, s = "lambda.min")
## 5 x 1 sparse Matrix of class "dgCMatrix"
## lambda.min
## (Intercept) 7.049005e+01
## Murder -2.244320e-01
## HS.Grad 3.730485e-02
## Illiteracy .
## Income 1.425179e-05
The LASSO regression automatically removed Illiteracy from the model by shrinking its coefficient to exactly zero because it was redundant. It kept Murder and HS.Grad as the most critical predictors of life expectancy, while keeping a very tiny, heavily penalized weight for Income.
library(glmnet)
# Prepare data format for LASSO
X <- as.matrix(statedata[, c("Murder", "HS.Grad", "Illiteracy", "Income")])
Y <- statedata$Life.Exp
# Fit the LASSO pathways
lasso_path <- glmnet(X, Y, alpha = 1)
# Plot the linear lines tracking each variable's weight
plot(lasso_path, xvar = "lambda", label = TRUE)
title("LASSO Linear Coefficient Paths", line = 2.5)
When to use each method: Few candidate models you already have : use AIC/BIC comparison. Many variables and want automatic selection : use Stepwise. Many variables and want penalized selection : use LASSO. Want interpretable coefficients : use Stepwise or AIC/BIC. Want to handle correlated predictors well : use LASSO or Ridge.
It lets you:
-Practice on data you create yourself -Test whether your model recovers known true values -Bootstrap confidence intervals -Model disease outbreaks when real data is unavailable
#Key Functions :
# Always set seed first — makes results reproducible
set.seed(123)
# Normal distribution — continuous outcomes (e.g. blood pressure, height)
rnorm(n = 10, mean = 0, sd = 1)
## [1] -0.56047565 -0.23017749 1.55870831 0.07050839 0.12928774 1.71506499
## [7] 0.46091621 -1.26506123 -0.68685285 -0.44566197
# Uniform distribution — equal probability across a range
runif(n = 10, min = 0, max = 1)
## [1] 0.8895393 0.6928034 0.6405068 0.9942698 0.6557058 0.7085305 0.5440660
## [8] 0.5941420 0.2891597 0.1471136
# Poisson distribution — COUNT data (e.g. disease cases per week)
rpois(n = 10, lambda = 5) # lambda = expected count
## [1] 9 8 6 7 1 5 6 3 4 3
# Binomial distribution — YES/NO outcomes (e.g. diseased or not)
rbinom(n = 10, size = 1, prob = 0.3) # 30% chance of disease
## [1] 0 0 0 0 0 0 0 0 0 1
# Sample from existing values
sample(1:100, size = 10, replace = FALSE)
## [1] 60 53 7 99 27 38 89 34 69 72
#The Most Powerful Use :Simulate Then Recover
# You CREATE the true model, then check if lm() finds it
set.seed(100)
n <- 100
age <- rnorm(n, mean = 45, sd = 10)
bmi <- rnorm(n, mean = 27, sd = 4)
smoker <- rbinom(n, size = 1, prob = 0.3)
error <- rnorm(n, mean = 0, sd = 5)
# TRUE model — you define this yourself
SBP <- 100 + 0.5*age + 1.2*bmi + 8*smoker + error
sim_data <- data.frame(SBP, age, bmi, smoker)
sim_model <- lm(SBP ~ age + bmi + smoker, data = sim_data)
summary(sim_model)
##
## Call:
## lm(formula = SBP ~ age + bmi + smoker, data = sim_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.6212 -3.3072 0.0266 3.2657 10.3007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102.82756 5.02961 20.444 < 2e-16 ***
## age 0.47881 0.04828 9.917 2.24e-16 ***
## bmi 1.11127 0.15546 7.148 1.72e-10 ***
## smoker 9.88731 1.05841 9.342 3.86e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.85 on 96 degrees of freedom
## Multiple R-squared: 0.6963, Adjusted R-squared: 0.6868
## F-statistic: 73.36 on 3 and 96 DF, p-value: < 2.2e-16
# Coefficients should be close to:
# age = 0.5, bmi = 1.2, smoker = 8
# If they are : the model is working correctly