Assignment 1

Find a data set of which you can fit multiple linear regression and interpret your results

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(readxl)
library(readr)
#knitr::opts_chunk$set(error = TRUE)

Dataset: state.x77

# Load and prepare dataset

head(state.x77)
##            Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
## Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708
## Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417
## Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945
## California      21198   5114        1.1    71.71   10.3    62.6    20 156361
## Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766
statedata <- data.frame(state.x77)
colnames(statedata)[4] <- "Life.Exp"
colnames(statedata)[6] <- "HS.Grad"

head(statedata)
##            Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
## Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708
## Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417
## Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945
## California      21198   5114        1.1    71.71   10.3    62.6    20 156361
## Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766
cat("\n STRUCTURE\n", strrep("===",10),"\n")
## 
##  STRUCTURE
##  ==============================
str(statedata)
## 'data.frame':    50 obs. of  8 variables:
##  $ Population: num  3615 365 2212 2110 21198 ...
##  $ Income    : num  3624 6315 4530 3378 5114 ...
##  $ Illiteracy: num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
##  $ Life.Exp  : num  69 69.3 70.5 70.7 71.7 ...
##  $ Murder    : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
##  $ HS.Grad   : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
##  $ Frost     : num  20 152 15 65 20 166 139 103 11 60 ...
##  $ Area      : num  50708 566432 113417 51945 156361 ...
cat("\n SUMMARY\n", strrep("===",10),"\n")
## 
##  SUMMARY
##  ==============================
summary(statedata)
##    Population        Income       Illiteracy       Life.Exp    
##  Min.   :  365   Min.   :3098   Min.   :0.500   Min.   :67.96  
##  1st Qu.: 1080   1st Qu.:3993   1st Qu.:0.625   1st Qu.:70.12  
##  Median : 2838   Median :4519   Median :0.950   Median :70.67  
##  Mean   : 4246   Mean   :4436   Mean   :1.170   Mean   :70.88  
##  3rd Qu.: 4968   3rd Qu.:4814   3rd Qu.:1.575   3rd Qu.:71.89  
##  Max.   :21198   Max.   :6315   Max.   :2.800   Max.   :73.60  
##      Murder          HS.Grad          Frost             Area       
##  Min.   : 1.400   Min.   :37.80   Min.   :  0.00   Min.   :  1049  
##  1st Qu.: 4.350   1st Qu.:48.05   1st Qu.: 66.25   1st Qu.: 36985  
##  Median : 6.850   Median :53.25   Median :114.50   Median : 54277  
##  Mean   : 7.378   Mean   :53.11   Mean   :104.46   Mean   : 70736  
##  3rd Qu.:10.675   3rd Qu.:59.15   3rd Qu.:139.75   3rd Qu.: 81163  
##  Max.   :15.100   Max.   :67.30   Max.   :188.00   Max.   :566432

My interest is to look at the history of all 50 states and find out ” How a state’s Murder rate, Graduation rate, Illiteracy, and Income levels simultaneously impact or predict the average Life Expectancy of its citizens”

model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income,
            data = statedata)

summary(model)
## 
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Illiteracy + Income, 
##     data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.56498 -0.53611  0.05303  0.58972  1.73972 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 69.4833066  1.3253230  52.427  < 2e-16 ***
## Murder      -0.2619402  0.0444659  -5.891 4.53e-07 ***
## HS.Grad      0.0461443  0.0218485   2.112   0.0403 *  
## Illiteracy   0.2760771  0.3105081   0.889   0.3787    
## Income       0.0001249  0.0002422   0.516   0.6084    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8049 on 45 degrees of freedom
## Multiple R-squared:  0.6698, Adjusted R-squared:  0.6405 
## F-statistic: 22.82 on 4 and 45 DF,  p-value: 2.39e-10

Coefficients: Estimate 1. (Intercept) 69.48 : baseline life expectancy

  1. Murder -0.262 : each unit increase in murder rate DECREASES life expectancy by 0.262 years

  2. HS.Grad 0.046 : each 1% increase in graduation rate INCREASES life expectancy by 0.046 years

4.Illiteracy 0.276 :NOT significant (p = 0.3787)

5.Income 0.0001 : NOT significant (p = 0.6084)

  1. R-squared = 0.6698
  1. 66.98% of variation in life expectancy is explained by this model
  1. F-statistic p-value = 2.39e-10

  2. the overall model is highly statistically significant (p < 0.05)

#install.packages("GGally")
library(GGally)

# Create a correlation matrix plot for model variables
ggpairs(statedata, columns = c("Life.Exp", "Murder", "HS.Grad", "Illiteracy", "Income"),
        title = "Correlation Matrix of Model Variables")

1. How variables affect Life Expectancy Murder Rate: This has the strongest connection. High murder rates strongly match lower life expectancy.

High School Graduation: States with more high school graduates clearly have a longer life expectancy.

Illiteracy: High illiteracy rates match lower life expectancy.

Income: Higher income connects to longer life, but this connection is the weakest of the four.

  1. Why some variables become “redundant” (The Big Takeaway) The plot also shows that the independent variables are heavily linked to each other. This explains why Illiteracy and Income lost their significance in your final multiple regression model:

Illiteracy and Murder are highly linked (0.703): States with high illiteracy also tend to have high murder rates. Because they share the same information, the model chooses Murder as the main predictor and makes Illiteracy look unnecessary.

Illiteracy and Graduation are highly linked (-0.657): This makes sense because higher illiteracy naturally goes hand-in-hand with lower graduation rates.

Income and Graduation are highly linked (0.620): Higher income strongly connects to higher graduation rates.

# 1. Another model based on our variable selection insights
simple_model <- lm(Life.Exp ~ Murder + HS.Grad, data = statedata)

# 2. Check the summary of the new model
summary(simple_model)
## 
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad, data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66758 -0.41801  0.05602  0.55913  2.05625 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 70.29708    1.01567  69.213  < 2e-16 ***
## Murder      -0.23709    0.03529  -6.719 2.18e-08 ***
## HS.Grad      0.04389    0.01613   2.721  0.00909 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7959 on 47 degrees of freedom
## Multiple R-squared:  0.6628, Adjusted R-squared:  0.6485 
## F-statistic:  46.2 on 2 and 47 DF,  p-value: 8.016e-12

Adjusted R-squared stays almost exactly the same as your full model (around 0.648).

#model equation 
# Life.Exp = 69.48 - 0.262(Murder) + 0.046(HS.Grad)
#          + 0.276(Illiteracy) + 0.0001(Income)
par(mfrow = c(2, 2))
plot(model)

# check:
# Residuals vs Fitted → random scatter = linearity OK
# Normal Q-Q         → points on diagonal = normality OK
# Scale-Location     → flat band = equal variance OK
# Residuals vs Leverage → any named points = potential outliers

Residuals vs Fitted: The red line is roughly flat and points scatter randomly around zero, linearity assumption is reasonably satisfied. Hawaii appears as a mild outlier above.

Normal Q-Q: Most points follow the diagonal line closely, normality assumption is satisfied. Alaska and Maine deviate slightly at the lower tail.

Scale-Location: The band is roughly horizontal, homoscedasticity (equal variance) assumption is satisfied.

Residuals vs Leverage: Hawaii has high residual but low leverage. Alaska falls near the Cook’s distance boundary (0.5 line), it may be an influential observation worth investigating.

#test if my model works:

newstate <- data.frame(Murder     = 8,
                        HS.Grad    = 55,
                        Illiteracy = 1.5,
                        Income     = 4500)

predict(model, newdata = newstate, interval = "confidence")
##        fit      lwr      upr
## 1 70.90211 70.57825 71.22596
# interval="confidence" gives you lower and upper bounds too
library(ggplot2)

ggplot(statedata, aes(x = Murder, y = Life.Exp)) +
  geom_point(pch = 21, fill = "black", size = 3) +
  geom_smooth(method = "lm", color = "navy", se = TRUE) +
  labs(title = "Murder Rate vs Life Expectancy",
       x = "Murder Rate (per 100,000)",
       y = "Life Expectancy (years)") 
## `geom_smooth()` using formula = 'y ~ x'

Assignment 2: read about variable selection methods

What is Variable Selection?

When a dataset contains a large number of predictors, including every variable can introduce unnecessary noise and cause over fitting. Many of these predictors are often redundant or do not add any value to the analysis.

Variable selection is the statistical process of identifying the most important features to include in a model. In R programming, this can be done using automated functions like step for stepwise regression, or packages like glmnet for regularization. The goal is to eliminate irrelevant data to build the simplest and most accurate model possible.

Method 1:Stepwise Selection

Concept: Automatically adds or removes variables one at a time based on AIC. Three directions:

A. backward: Starts with all variables, removes the weakest one at a time B.forward: Starts with no variables, adds the strongest one at a time C.both: Combines both most thorough

Key rule: at each step, the variable whose removal/addition gives the lowest AIC is selected. Stops when no change improves AIC.

# Full model with all 4 predictors
full_model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income, data = statedata)

# Automatic stepwise selection (checking both directions)
stepwise_model <- step(full_model, direction = "both")
## Start:  AIC=-16.97
## Life.Exp ~ Murder + HS.Grad + Illiteracy + Income
## 
##              Df Sum of Sq    RSS      AIC
## - Income      1    0.1725 29.328 -18.6732
## - Illiteracy  1    0.5122 29.668 -18.0974
## <none>                    29.156 -16.9682
## - HS.Grad     1    2.8901 32.046 -14.2425
## - Murder      1   22.4835 51.640   9.6132
## 
## Step:  AIC=-18.67
## Life.Exp ~ Murder + HS.Grad + Illiteracy
## 
##              Df Sum of Sq    RSS      AIC
## - Illiteracy  1    0.4419 29.770 -19.9255
## <none>                    29.328 -18.6732
## + Income      1    0.1725 29.156 -16.9682
## - HS.Grad     1    4.8596 34.188 -13.0073
## - Murder      1   22.4531 51.782   7.7505
## 
## Step:  AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    29.770 -19.925
## + Illiteracy  1    0.4419 29.328 -18.673
## + Income      1    0.1022 29.668 -18.097
## - HS.Grad     1    4.6910 34.461 -14.609
## - Murder      1   28.5974 58.368  11.737
# View the final chosen model
summary(stepwise_model)
## 
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad, data = statedata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66758 -0.41801  0.05602  0.55913  2.05625 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 70.29708    1.01567  69.213  < 2e-16 ***
## Murder      -0.23709    0.03529  -6.719 2.18e-08 ***
## HS.Grad      0.04389    0.01613   2.721  0.00909 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7959 on 47 degrees of freedom
## Multiple R-squared:  0.6628, Adjusted R-squared:  0.6485 
## F-statistic:  46.2 on 2 and 47 DF,  p-value: 8.016e-12

The above it checks the AIC at each step. It will automatically drop Illiteracy and Income because removing them lowers the AIC score, leaving you with just Murder and HS.Grad.

# Run the stepwise selection
full_model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income, data = statedata)
stepwise_model <- step(full_model, direction = "both")
## Start:  AIC=-16.97
## Life.Exp ~ Murder + HS.Grad + Illiteracy + Income
## 
##              Df Sum of Sq    RSS      AIC
## - Income      1    0.1725 29.328 -18.6732
## - Illiteracy  1    0.5122 29.668 -18.0974
## <none>                    29.156 -16.9682
## - HS.Grad     1    2.8901 32.046 -14.2425
## - Murder      1   22.4835 51.640   9.6132
## 
## Step:  AIC=-18.67
## Life.Exp ~ Murder + HS.Grad + Illiteracy
## 
##              Df Sum of Sq    RSS      AIC
## - Illiteracy  1    0.4419 29.770 -19.9255
## <none>                    29.328 -18.6732
## + Income      1    0.1725 29.156 -16.9682
## - HS.Grad     1    4.8596 34.188 -13.0073
## - Murder      1   22.4531 51.782   7.7505
## 
## Step:  AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    29.770 -19.925
## + Illiteracy  1    0.4419 29.328 -18.673
## + Income      1    0.1022 29.668 -18.097
## - HS.Grad     1    4.6910 34.461 -14.609
## - Murder      1   28.5974 58.368  11.737
# Extract the AIC values from the stepwise history and plot them as a line
aic_history <- stepwise_model$anova$AIC
plot(aic_history, type = "b", pch = 19, col = "blue", xlab = "Step", ylab = "AIC Score",
     main = "Stepwise Selection: Downward AIC Linear Path", xaxt = "n")
axis(1, at = 1:length(aic_history), labels = c("Start", "Drop Income", "Drop Illiteracy"))

Method 2: AIC and BIC Criteria

Concept: Compare competing models using a score, lower is better.

AIC (Akaike Information Criterion): Penalizes adding extra variables moderately, so it tends to select slightly larger models.

BIC (Bayesian Information Criterion): Penalizes adding extra variables more heavily than AIC, so it tends to select simpler and smaller models.

Rule: If adding a variable does NOT lower AIC or BIC, that variable is not helping the model.

# Define the two models
full_model <- lm(Life.Exp ~ Murder + HS.Grad + Illiteracy + Income, data = statedata)
simple_model <- lm(Life.Exp ~ Murder + HS.Grad, data = statedata)

# Compare AIC values (Lower is better)
AIC(full_model, simple_model)
##              df      AIC
## full_model    6 126.9257
## simple_model  4 123.9684
# Compare BIC values (Lower is better)
BIC(full_model, simple_model)
##              df      BIC
## full_model    6 138.3978
## simple_model  4 131.6165

The above shows that simple_model has a lower AIC and a much lower BIC score. This confirms that adding Illiteracy and Income does not help the model enough to justify their complexity.

# Define the two models
simple_model <- lm(Life.Exp ~ Murder + HS.Grad, data = statedata)

# Create a clean line plot comparing the scores
plot(c(1, 2), c(AIC(full_model), AIC(simple_model)), type = "b", pch = 19, col = "darkgreen",
     ylim = c(120, 140), xlab = "Model Complexity", ylab = "Information Criterion Score",
     main = "AIC & BIC Model Comparison Lines", xaxt = "n")
lines(c(1, 2), c(BIC(full_model), BIC(simple_model)), type = "b", pch = 17, col = "darkred", lty = 2)
axis(1, at = c(1, 2), labels = c("Full Model (4 Vars)", "Simple Model (2 Vars)"))
legend("topright", legend = c("AIC Line", "BIC Line"), col = c("darkgreen", "darkred"), lty = c(1, 2), pch = c(19, 17))

Method 3: LASSO (Least Absolute Shrinkage and Selection Operator) Concept: Penalizes large coefficients. Weak predictors are shrunk all the way to exactly zero, meaning they are automatically removed from the model. Unlike stepwise, LASSO does selection and estimation at the same time.

#install.packages("glmnet") 
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 5.0
# 1. Separate your dataset into your predictors (X) and your outcome target (Y)
X <- as.matrix(statedata[, c("Murder", "HS.Grad", "Illiteracy", "Income")])
Y <- statedata$Life.Exp

# 2. Run cross-validation to find the best penalty level
cv_lasso <- cv.glmnet(X, Y, alpha = 1)

# 3. Print the final results to see which variables stay and which are dropped
coef(cv_lasso, s = "lambda.min")
## 5 x 1 sparse Matrix of class "dgCMatrix"
##                lambda.min
## (Intercept)  7.049005e+01
## Murder      -2.244320e-01
## HS.Grad      3.730485e-02
## Illiteracy   .           
## Income       1.425179e-05

The LASSO regression automatically removed Illiteracy from the model by shrinking its coefficient to exactly zero because it was redundant. It kept Murder and HS.Grad as the most critical predictors of life expectancy, while keeping a very tiny, heavily penalized weight for Income.

library(glmnet)

#  Prepare data format for LASSO
X <- as.matrix(statedata[, c("Murder", "HS.Grad", "Illiteracy", "Income")])
Y <- statedata$Life.Exp

#  Fit the LASSO pathways
lasso_path <- glmnet(X, Y, alpha = 1)

# Plot the linear lines tracking each variable's weight
plot(lasso_path, xvar = "lambda", label = TRUE)
title("LASSO Linear Coefficient Paths", line = 2.5)

When to use each method: Few candidate models you already have : use AIC/BIC comparison. Many variables and want automatic selection : use Stepwise. Many variables and want penalized selection : use LASSO. Want interpretable coefficients : use Stepwise or AIC/BIC. Want to handle correlated predictors well : use LASSO or Ridge.

Generation of Random Numbers

It lets you:

-Practice on data you create yourself -Test whether your model recovers known true values -Bootstrap confidence intervals -Model disease outbreaks when real data is unavailable

#Key Functions :

# Always set seed first — makes results reproducible
set.seed(123)

# Normal distribution — continuous outcomes (e.g. blood pressure, height)
rnorm(n = 10, mean = 0, sd = 1)
##  [1] -0.56047565 -0.23017749  1.55870831  0.07050839  0.12928774  1.71506499
##  [7]  0.46091621 -1.26506123 -0.68685285 -0.44566197
# Uniform distribution — equal probability across a range
runif(n = 10, min = 0, max = 1)
##  [1] 0.8895393 0.6928034 0.6405068 0.9942698 0.6557058 0.7085305 0.5440660
##  [8] 0.5941420 0.2891597 0.1471136
# Poisson distribution — COUNT data (e.g. disease cases per week)
rpois(n = 10, lambda = 5)   # lambda = expected count
##  [1] 9 8 6 7 1 5 6 3 4 3
# Binomial distribution — YES/NO outcomes (e.g. diseased or not)
rbinom(n = 10, size = 1, prob = 0.3)   # 30% chance of disease
##  [1] 0 0 0 0 0 0 0 0 0 1
# Sample from existing values
sample(1:100, size = 10, replace = FALSE)
##  [1] 60 53  7 99 27 38 89 34 69 72
#The Most Powerful Use :Simulate Then Recover

# You CREATE the true model, then check if lm() finds it
set.seed(100)
n      <- 100
age    <- rnorm(n, mean = 45, sd = 10)
bmi    <- rnorm(n, mean = 27, sd = 4)
smoker <- rbinom(n, size = 1, prob = 0.3)
error  <- rnorm(n, mean = 0, sd = 5)

# TRUE model — you define this yourself
SBP <- 100 + 0.5*age + 1.2*bmi + 8*smoker + error

sim_data  <- data.frame(SBP, age, bmi, smoker)
sim_model <- lm(SBP ~ age + bmi + smoker, data = sim_data)
summary(sim_model)
## 
## Call:
## lm(formula = SBP ~ age + bmi + smoker, data = sim_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.6212  -3.3072   0.0266   3.2657  10.3007 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 102.82756    5.02961  20.444  < 2e-16 ***
## age           0.47881    0.04828   9.917 2.24e-16 ***
## bmi           1.11127    0.15546   7.148 1.72e-10 ***
## smoker        9.88731    1.05841   9.342 3.86e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.85 on 96 degrees of freedom
## Multiple R-squared:  0.6963, Adjusted R-squared:  0.6868 
## F-statistic: 73.36 on 3 and 96 DF,  p-value: < 2.2e-16
# Coefficients should be close to:
# age = 0.5, bmi = 1.2, smoker = 8
# If they are : the model is working correctly