#1. Load and Explore the Dataset
Research Question
Can we predict a country’s 2022 population using:
- Area (km²)
- Density (per km²)
- Growth Rate
# Load dataset
# The CSV location is kept in one variable so it is easy to change on another
# machine; checking it first fails with a clear message if the file is missing.
data_path <- "C:/Users/HP/Desktop/My_Masters/R Programming/world_population.csv"
stopifnot(file.exists(data_path))
population_data <- read.csv(data_path)
# Note: data() only loads datasets that ship with packages. Calling
# data(population_data) on an object created by read.csv() does nothing except
# raise "data set 'population_data' not found", so that call is omitted.
# Preview the first six records of the dataset
head(x = population_data, n = 6L)
## Rank CCA3 Country.Territory Capital Continent X2022.Population
## 1 36 AFG Afghanistan Kabul Asia 41128771
## 2 138 ALB Albania Tirana Europe 2842321
## 3 34 DZA Algeria Algiers Africa 44903225
## 4 213 ASM American Samoa Pago Pago Oceania 44273
## 5 203 AND Andorra Andorra la Vella Europe 79824
## 6 42 AGO Angola Luanda Africa 35588987
## X2020.Population X2015.Population X2010.Population X2000.Population
## 1 38972230 33753499 28189672 19542982
## 2 2866849 2882481 2913399 3182021
## 3 43451666 39543154 35856344 30774621
## 4 46189 51368 54849 58230
## 5 77700 71746 71519 66097
## 6 33428485 28127721 23364185 16394062
## X1990.Population X1980.Population X1970.Population Area..km..
## 1 10694796 12486631 10752971 652230
## 2 3295066 2941651 2324731 28748
## 3 25518074 18739378 13795915 2381741
## 4 47818 32886 27075 199
## 5 53569 35611 19860 468
## 6 11828638 8330047 6029700 1246700
## Density..per.km.. Growth.Rate World.Population.Percentage
## 1 63.0587 1.0257 0.52
## 2 98.8702 0.9957 0.04
## 3 18.8531 1.0164 0.56
## 4 222.4774 0.9831 0.00
## 5 170.5641 1.0100 0.00
## 6 28.5466 1.0315 0.45
# Structure of dataset: one line per column with its type and first values
str(population_data)
## 'data.frame': 234 obs. of 17 variables:
## $ Rank : int 36 138 34 213 203 42 224 201 33 140 ...
## $ CCA3 : chr "AFG" "ALB" "DZA" "ASM" ...
## $ Country.Territory : chr "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## $ Capital : chr "Kabul" "Tirana" "Algiers" "Pago Pago" ...
## $ Continent : chr "Asia" "Europe" "Africa" "Oceania" ...
## $ X2022.Population : int 41128771 2842321 44903225 44273 79824 35588987 15857 93763 45510318 2780469 ...
## $ X2020.Population : int 38972230 2866849 43451666 46189 77700 33428485 15585 92664 45036032 2805608 ...
## $ X2015.Population : int 33753499 2882481 39543154 51368 71746 28127721 14525 89941 43257065 2878595 ...
## $ X2010.Population : int 28189672 2913399 35856344 54849 71519 23364185 13172 85695 41100123 2946293 ...
## $ X2000.Population : int 19542982 3182021 30774621 58230 66097 16394062 11047 75055 37070774 3168523 ...
## $ X1990.Population : int 10694796 3295066 25518074 47818 53569 11828638 8316 63328 32637657 3556539 ...
## $ X1980.Population : int 12486631 2941651 18739378 32886 35611 8330047 6560 64888 28024803 3135123 ...
## $ X1970.Population : int 10752971 2324731 13795915 27075 19860 6029700 6283 64516 23842803 2534377 ...
## $ Area..km.. : int 652230 28748 2381741 199 468 1246700 91 442 2780400 29743 ...
## $ Density..per.km.. : num 63.1 98.9 18.9 222.5 170.6 ...
## $ Growth.Rate : num 1.026 0.996 1.016 0.983 1.01 ...
## $ World.Population.Percentage: num 0.52 0.04 0.56 0 0 0.45 0 0 0.57 0.03 ...
# Summary statistics, column by column (quartiles and mean for numeric columns)
summary(population_data)
## Rank CCA3 Country.Territory Capital
## Min. : 1.00 Length :234 Length :234 Length :234
## 1st Qu.: 59.25 N.unique :234 N.unique :234 N.unique :234
## Median :117.50 N.blank : 0 N.blank : 0 N.blank : 0
## Mean :117.50 Min.nchar: 3 Min.nchar: 4 Min.nchar: 4
## 3rd Qu.:175.75 Max.nchar: 3 Max.nchar: 32 Max.nchar: 19
## Max. :234.00
## Continent X2022.Population X2020.Population X2015.Population
## Length :234 Min. :5.100e+02 Min. :5.200e+02 Min. :5.640e+02
## N.unique : 6 1st Qu.:4.197e+05 1st Qu.:4.153e+05 1st Qu.:4.047e+05
## N.blank : 0 Median :5.560e+06 Median :5.493e+06 Median :5.307e+06
## Min.nchar: 4 Mean :3.407e+07 Mean :3.350e+07 Mean :3.173e+07
## Max.nchar: 13 3rd Qu.:2.248e+07 3rd Qu.:2.145e+07 3rd Qu.:1.973e+07
## Max. :1.426e+09 Max. :1.425e+09 Max. :1.394e+09
## X2010.Population X2000.Population X1990.Population
## Min. :5.960e+02 Min. :6.510e+02 Min. :7.000e+02
## 1st Qu.:3.931e+05 1st Qu.:3.272e+05 1st Qu.:2.641e+05
## Median :4.943e+06 Median :4.293e+06 Median :3.825e+06
## Mean :2.985e+07 Mean :2.627e+07 Mean :2.271e+07
## 3rd Qu.:1.916e+07 3rd Qu.:1.576e+07 3rd Qu.:1.187e+07
## Max. :1.348e+09 Max. :1.264e+09 Max. :1.154e+09
## X1980.Population X1970.Population Area..km.. Density..per.km..
## Min. : 733 Min. : 752 Min. : 1 Min. :2.610e-02
## 1st Qu.: 229614 1st Qu.: 155997 1st Qu.: 2650 1st Qu.:3.842e+01
## Median : 3141146 Median : 2604830 Median : 81200 Median :9.535e+01
## Mean : 18984617 Mean : 15786909 Mean : 581449 Mean :4.521e+02
## 3rd Qu.: 9826054 3rd Qu.: 8817329 3rd Qu.: 430426 3rd Qu.:2.389e+02
## Max. :982372466 Max. :822534450 Max. :17098242 Max. :2.317e+04
## Growth.Rate World.Population.Percentage
## Min. :0.912 Min. : 0.0000
## 1st Qu.:1.002 1st Qu.: 0.0100
## Median :1.008 Median : 0.0700
## Mean :1.010 Mean : 0.4271
## 3rd Qu.:1.017 3rd Qu.: 0.2800
## Max. :1.069 Max. :17.8800
#Fit Multiple Linear Regression
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Standardise the column names to snake_case (e.g. X2022.Population becomes
# x2022_population) so they can be used in a formula without backticks.
population_data <- janitor::clean_names(population_data)

# Multiple linear regression of 2022 population on area, density and growth rate.
# NOTE(review): density presumably equals population / area, so a model on the
# log scale may fit far better than this additive one -- confirm before use.
model <- lm(
  x2022_population ~ area_km + density_per_km + growth_rate,
  data = population_data
)
summary(model)
##
## Call:
## lm(formula = x2022_population ~ area_km + density_per_km + growth_rate,
## data = population_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -472263778 -15197089 -13091149 -4416333 1287481300
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.635e+08 6.080e+08 0.269 0.788
## area_km 3.518e+01 4.571e+00 7.696 4.12e-13 ***
## density_per_km -1.463e+00 3.907e+03 0.000 1.000
## growth_rate -1.485e+08 6.020e+08 -0.247 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122700000 on 230 degrees of freedom
## Multiple R-squared: 0.2058, Adjusted R-squared: 0.1954
## F-statistic: 19.87 on 3 and 230 DF, p-value: 1.739e-11
## Regression Equation
The model has the form given below, where Y = response (dependent) variable (the 2022 population), β₀ = intercept, β₁ = effect of area, β₂ = effect of density, β₃ = effect of growth rate, and ε = random error term:
\[ Y = \beta_0 + \beta_1 \text{Area} + \beta_2 \text{Density} + \beta_3 \text{Growth Rate} + \varepsilon \]
##Check Model Assumptions
# Draw the lm diagnostic plots in a 2 x 2 grid.
# par() returns the previous settings; restoring them afterwards keeps later
# plots (such as the actual-vs-predicted scatter) from being squeezed into a
# single cell of this grid when the code is run as a plain script.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)
##Predicted Values
# Store the fitted 2022 population for every row, then show the actual and
# predicted values side by side for the first few countries.
population_data[["predicted_pop"]] <- predict(model)
head(
  population_data[, c("country_territory", "x2022_population", "predicted_pop")]
)
## country_territory x2022_population predicted_pop
## 1 Afghanistan 41128771 34170861
## 2 Albania 2842321 16691121
## 3 Algeria 44903225 96398090
## 4 American Samoa 44273 17557694
## 5 Andorra 79824 13572502
## 6 Angola 35588987 54223698
##Actual vs Predicted Plot
# Scatter of observed against fitted values; points lying on the red line
# (intercept 0, slope 1) are predicted exactly.
plot(
  x = population_data[["predicted_pop"]],
  y = population_data[["x2022_population"]],
  xlab = "Predicted Population",
  ylab = "Actual Population",
  main = "Actual vs Predicted Population"
)
abline(a = 0, b = 1, col = "red", lwd = 2)
#HW 2
#Variable Selection Methods in R
Variable selection is the process of choosing the most relevant predictor variables for a regression model. It helps improve model interpretability, reduce overfitting, and enhance prediction accuracy.
This report demonstrates common variable selection methods using the
built-in mtcars dataset in R.
# Load the built-in mtcars dataset into the workspace
data("mtcars")
# Preview the first six cars
head(x = mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Dataset structure: column types and leading values
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
We use miles per gallon (mpg) as the response variable
and all other variables as predictors.
# Baseline model: regress mpg on every other column of mtcars
# (the "." on the right-hand side expands to all remaining variables).
full_model <- lm(formula = mpg ~ ., data = mtcars)
summary(object = full_model)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
Forward selection starts with no predictors and adds variables one at a time based on model improvement.
# Intercept-only starting point for the forward search
empty_model <- lm(formula = mpg ~ 1, data = mtcars)

# Candidate terms are those of the full model; step() adds one term per
# iteration and stops when no addition lowers the AIC.
forward_model <- step(
  object = empty_model,
  scope = formula(full_model),
  direction = "forward"
)
## Start: AIC=115.94
## mpg ~ 1
##
## Df Sum of Sq RSS AIC
## + wt 1 847.73 278.32 73.217
## + cyl 1 817.71 308.33 76.494
## + disp 1 808.89 317.16 77.397
## + hp 1 678.37 447.67 88.427
## + drat 1 522.48 603.57 97.988
## + vs 1 496.53 629.52 99.335
## + am 1 405.15 720.90 103.672
## + carb 1 341.78 784.27 106.369
## + gear 1 259.75 866.30 109.552
## + qsec 1 197.39 928.66 111.776
## <none> 1126.05 115.943
##
## Step: AIC=73.22
## mpg ~ wt
##
## Df Sum of Sq RSS AIC
## + cyl 1 87.150 191.17 63.198
## + hp 1 83.274 195.05 63.840
## + qsec 1 82.858 195.46 63.908
## + vs 1 54.228 224.09 68.283
## + carb 1 44.602 233.72 69.628
## + disp 1 31.639 246.68 71.356
## <none> 278.32 73.217
## + drat 1 9.081 269.24 74.156
## + gear 1 1.137 277.19 75.086
## + am 1 0.002 278.32 75.217
##
## Step: AIC=63.2
## mpg ~ wt + cyl
##
## Df Sum of Sq RSS AIC
## + hp 1 14.5514 176.62 62.665
## + carb 1 13.7724 177.40 62.805
## <none> 191.17 63.198
## + qsec 1 10.5674 180.60 63.378
## + gear 1 3.0281 188.14 64.687
## + disp 1 2.6796 188.49 64.746
## + vs 1 0.7059 190.47 65.080
## + am 1 0.1249 191.05 65.177
## + drat 1 0.0010 191.17 65.198
##
## Step: AIC=62.66
## mpg ~ wt + cyl + hp
##
## Df Sum of Sq RSS AIC
## <none> 176.62 62.665
## + am 1 6.6228 170.00 63.442
## + disp 1 6.1762 170.44 63.526
## + carb 1 2.5187 174.10 64.205
## + drat 1 2.2453 174.38 64.255
## + qsec 1 1.4010 175.22 64.410
## + gear 1 0.8558 175.76 64.509
## + vs 1 0.0599 176.56 64.654
# Coefficient table and fit statistics for the forward-selected model
summary(forward_model)
##
## Call:
## lm(formula = mpg ~ wt + cyl + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9290 -1.5598 -0.5311 1.1850 5.8986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
## wt -3.16697 0.74058 -4.276 0.000199 ***
## cyl -0.94162 0.55092 -1.709 0.098480 .
## hp -0.01804 0.01188 -1.519 0.140015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
## F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
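Forward selection adds, at each step, the predictor that lowers the AIC the most, and stops when no remaining predictor lowers it further. Inclusion by AIC does not imply statistical significance: here hp is retained even though its p-value is 0.14.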
Backward elimination starts with all predictors and removes the least useful variables.
# Start from the full model and drop one term per iteration for as long as
# doing so lowers the AIC.
backward_model <- step(object = full_model, direction = "backward")
## Start: AIC=70.9
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - cyl 1 0.0799 147.57 68.915
## - vs 1 0.1601 147.66 68.932
## - carb 1 0.4067 147.90 68.986
## - gear 1 1.3531 148.85 69.190
## - drat 1 1.6270 149.12 69.249
## - disp 1 3.9167 151.41 69.736
## - hp 1 6.8399 154.33 70.348
## - qsec 1 8.8641 156.36 70.765
## <none> 147.49 70.898
## - am 1 10.5467 158.04 71.108
## - wt 1 27.0144 174.51 74.280
##
## Step: AIC=68.92
## mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - vs 1 0.2685 147.84 66.973
## - carb 1 0.5201 148.09 67.028
## - gear 1 1.8211 149.40 67.308
## - drat 1 1.9826 149.56 67.342
## - disp 1 3.9009 151.47 67.750
## - hp 1 7.3632 154.94 68.473
## <none> 147.57 68.915
## - qsec 1 10.0933 157.67 69.032
## - am 1 11.8359 159.41 69.384
## - wt 1 27.0280 174.60 72.297
##
## Step: AIC=66.97
## mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - carb 1 0.6855 148.53 65.121
## - gear 1 2.1437 149.99 65.434
## - drat 1 2.2139 150.06 65.449
## - disp 1 3.6467 151.49 65.753
## - hp 1 7.1060 154.95 66.475
## <none> 147.84 66.973
## - am 1 11.5694 159.41 67.384
## - qsec 1 15.6830 163.53 68.200
## - wt 1 27.3799 175.22 70.410
##
## Step: AIC=65.12
## mpg ~ disp + hp + drat + wt + qsec + am + gear
##
## Df Sum of Sq RSS AIC
## - gear 1 1.565 150.09 63.457
## - drat 1 1.932 150.46 63.535
## <none> 148.53 65.121
## - disp 1 10.110 158.64 65.229
## - am 1 12.323 160.85 65.672
## - hp 1 14.826 163.35 66.166
## - qsec 1 26.408 174.94 68.358
## - wt 1 69.127 217.66 75.350
##
## Step: AIC=63.46
## mpg ~ disp + hp + drat + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - drat 1 3.345 153.44 62.162
## - disp 1 8.545 158.64 63.229
## <none> 150.09 63.457
## - hp 1 13.285 163.38 64.171
## - am 1 20.036 170.13 65.466
## - qsec 1 25.574 175.67 66.491
## - wt 1 67.572 217.66 73.351
##
## Step: AIC=62.16
## mpg ~ disp + hp + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - disp 1 6.629 160.07 61.515
## <none> 153.44 62.162
## - hp 1 12.572 166.01 62.682
## - qsec 1 26.470 179.91 65.255
## - am 1 32.198 185.63 66.258
## - wt 1 69.043 222.48 72.051
##
## Step: AIC=61.52
## mpg ~ hp + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - hp 1 9.219 169.29 61.307
## <none> 160.07 61.515
## - qsec 1 20.225 180.29 63.323
## - am 1 25.993 186.06 64.331
## - wt 1 78.494 238.56 72.284
##
## Step: AIC=61.31
## mpg ~ wt + qsec + am
##
## Df Sum of Sq RSS AIC
## <none> 169.29 61.307
## - am 1 26.178 195.46 63.908
## - qsec 1 109.034 278.32 75.217
## - wt 1 183.347 352.63 82.790
# Coefficient table and fit statistics for the backward-eliminated model
summary(backward_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
At each step, the variable whose removal lowers the AIC the most is dropped; elimination stops when removing any remaining variable would increase the AIC.
Stepwise selection combines forward selection and backward elimination.
# Start from the full model; at every iteration step() considers both dropping
# a current term and re-adding a previously dropped one.
stepwise_model <- step(object = full_model, direction = "both")
## Start: AIC=70.9
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - cyl 1 0.0799 147.57 68.915
## - vs 1 0.1601 147.66 68.932
## - carb 1 0.4067 147.90 68.986
## - gear 1 1.3531 148.85 69.190
## - drat 1 1.6270 149.12 69.249
## - disp 1 3.9167 151.41 69.736
## - hp 1 6.8399 154.33 70.348
## - qsec 1 8.8641 156.36 70.765
## <none> 147.49 70.898
## - am 1 10.5467 158.04 71.108
## - wt 1 27.0144 174.51 74.280
##
## Step: AIC=68.92
## mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - vs 1 0.2685 147.84 66.973
## - carb 1 0.5201 148.09 67.028
## - gear 1 1.8211 149.40 67.308
## - drat 1 1.9826 149.56 67.342
## - disp 1 3.9009 151.47 67.750
## - hp 1 7.3632 154.94 68.473
## <none> 147.57 68.915
## - qsec 1 10.0933 157.67 69.032
## - am 1 11.8359 159.41 69.384
## + cyl 1 0.0799 147.49 70.898
## - wt 1 27.0280 174.60 72.297
##
## Step: AIC=66.97
## mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - carb 1 0.6855 148.53 65.121
## - gear 1 2.1437 149.99 65.434
## - drat 1 2.2139 150.06 65.449
## - disp 1 3.6467 151.49 65.753
## - hp 1 7.1060 154.95 66.475
## <none> 147.84 66.973
## - am 1 11.5694 159.41 67.384
## - qsec 1 15.6830 163.53 68.200
## + vs 1 0.2685 147.57 68.915
## + cyl 1 0.1883 147.66 68.932
## - wt 1 27.3799 175.22 70.410
##
## Step: AIC=65.12
## mpg ~ disp + hp + drat + wt + qsec + am + gear
##
## Df Sum of Sq RSS AIC
## - gear 1 1.565 150.09 63.457
## - drat 1 1.932 150.46 63.535
## <none> 148.53 65.121
## - disp 1 10.110 158.64 65.229
## - am 1 12.323 160.85 65.672
## - hp 1 14.826 163.35 66.166
## + carb 1 0.685 147.84 66.973
## + vs 1 0.434 148.09 67.028
## + cyl 1 0.414 148.11 67.032
## - qsec 1 26.408 174.94 68.358
## - wt 1 69.127 217.66 75.350
##
## Step: AIC=63.46
## mpg ~ disp + hp + drat + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - drat 1 3.345 153.44 62.162
## - disp 1 8.545 158.64 63.229
## <none> 150.09 63.457
## - hp 1 13.285 163.38 64.171
## + gear 1 1.565 148.53 65.121
## + cyl 1 1.003 149.09 65.242
## + vs 1 0.645 149.45 65.319
## + carb 1 0.107 149.99 65.434
## - am 1 20.036 170.13 65.466
## - qsec 1 25.574 175.67 66.491
## - wt 1 67.572 217.66 73.351
##
## Step: AIC=62.16
## mpg ~ disp + hp + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - disp 1 6.629 160.07 61.515
## <none> 153.44 62.162
## - hp 1 12.572 166.01 62.682
## + drat 1 3.345 150.09 63.457
## + gear 1 2.977 150.46 63.535
## + cyl 1 2.447 150.99 63.648
## + vs 1 1.121 152.32 63.927
## + carb 1 0.011 153.43 64.160
## - qsec 1 26.470 179.91 65.255
## - am 1 32.198 185.63 66.258
## - wt 1 69.043 222.48 72.051
##
## Step: AIC=61.52
## mpg ~ hp + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - hp 1 9.219 169.29 61.307
## <none> 160.07 61.515
## + disp 1 6.629 153.44 62.162
## + carb 1 3.227 156.84 62.864
## + drat 1 1.428 158.64 63.229
## - qsec 1 20.225 180.29 63.323
## + cyl 1 0.249 159.82 63.465
## + vs 1 0.249 159.82 63.466
## + gear 1 0.171 159.90 63.481
## - am 1 25.993 186.06 64.331
## - wt 1 78.494 238.56 72.284
##
## Step: AIC=61.31
## mpg ~ wt + qsec + am
##
## Df Sum of Sq RSS AIC
## <none> 169.29 61.307
## + hp 1 9.219 160.07 61.515
## + carb 1 8.036 161.25 61.751
## + disp 1 3.276 166.01 62.682
## + cyl 1 1.501 167.78 63.022
## + drat 1 1.400 167.89 63.042
## + gear 1 0.123 169.16 63.284
## + vs 1 0.000 169.29 63.307
## - am 1 26.178 195.46 63.908
## - qsec 1 109.034 278.32 75.217
## - wt 1 183.347 352.63 82.790
# Coefficient table and fit statistics for the stepwise-selected model
summary(stepwise_model)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
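The algorithm adds and removes variables until no single addition or removal lowers the AIC further. The result is a locally optimal model; it is not guaranteed to be the best among all possible subsets of predictors.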
# Compare the candidate models on AIC (lower is better).
# These values are on a different scale from the AIC printed in the step()
# traces (e.g. 155.48 here vs 62.66 there for the forward model) because
# step() relies on extractAIC(), which differs from AIC() by a constant for
# models fitted to the same data; the ranking of the models is unaffected.
AIC(full_model,
forward_model,
backward_model,
stepwise_model)
## df AIC
## full_model 12 163.7098
## forward_model 5 155.4766
## backward_model 5 154.1194
## stepwise_model 5 154.1194
The model with the lowest AIC is generally preferred because it balances model fit and complexity.
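The diagnostic plots help assess: linearity of the relationship, normality of the residuals, constant residual variance (homoscedasticity), and the presence of influential observations.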
# Estimated coefficients of the model chosen by stepwise selection
coef(stepwise_model)
## (Intercept) wt qsec am
## 9.617781 -3.916504 1.225886 2.935837
The regression equation can be written using the estimated coefficients from the selected model:
\[ \widehat{\text{mpg}} = 9.618 - 3.917\,\text{wt} + 1.226\,\text{qsec} + 2.936\,\text{am} \]
Variable selection techniques help identify the most important predictors while reducing model complexity.
In this analysis, Forward Selection, Backward Elimination, and
Stepwise Selection were applied to the mtcars dataset. The
final selected model retained only the predictors that contributed
meaningfully to explaining variation in fuel efficiency
(mpg).
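Among the methods, the model with the lowest AIC is generally considered the best balance between accuracy and simplicity. Here, backward elimination and stepwise selection both arrive at mpg ~ wt + qsec + am (AIC = 154.12), which is lower than the forward-selection model mpg ~ wt + cyl + hp (AIC = 155.48).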