Find a data set of which you can fit multiple linear regression and interpret your results
#install.packages("readxl")
library(readxl)
data <- read_excel("D:/Emerance - personal stuffs/Big data/R Programming/Last Assignment/pathogen_dataset.xlsx")
str(data)
## tibble [3,040 Ă— 31] (S3: tbl_df/tbl/data.frame)
## $ Var1 : num [1:3040] 63.5 46.1 60.3 49.3 52.1 ...
## $ Var2 : num [1:3040] 37 34.2 64.9 54.4 47.3 ...
## $ Var3 : num [1:3040] 37.5 58.1 46.7 30.4 60.2 ...
## $ Var4 : num [1:3040] 57.2 36.5 62.3 52.9 51.4 ...
## $ Var5 : num [1:3040] 59.7 59.5 41.5 75 35.1 ...
## $ Var6 : num [1:3040] 64.2 54.9 53.3 64.9 53.9 ...
## $ Var7 : num [1:3040] 46.5 54.2 61.3 37.9 52.3 ...
## $ Var8 : num [1:3040] 65.2 51.3 72.6 51.3 71.7 ...
## $ Var9 : num [1:3040] 54.9 37.9 32.2 55 41.9 ...
## $ Var10 : num [1:3040] 38.9 59.7 44.4 60.2 55.8 ...
## $ Var11 : num [1:3040] 54.5 24.6 49.2 47.5 47.9 ...
## $ Var12 : num [1:3040] 30.1 43.9 44.4 51.3 49.8 ...
## $ Var13 : num [1:3040] 28.7 50.3 49.3 68.6 47.4 ...
## $ Var14 : num [1:3040] 33.3 47.2 31.5 52.7 62.9 ...
## $ Var15 : num [1:3040] 48.8 42.6 48.8 62.8 41.7 ...
## $ Var16 : num [1:3040] 49.4 41.1 56.4 47.1 49.6 ...
## $ Var17 : num [1:3040] 58.3 51 51.5 49.5 62.3 ...
## $ Var18 : num [1:3040] 40.7 39.7 58.4 57.3 29.1 ...
## $ Var19 : num [1:3040] 53.2 57.8 47.9 56 53.1 ...
## $ Var20 : num [1:3040] 50.5 51.9 41.1 53 46.7 ...
## $ ID : num [1:3040] 1782 1860 2878 2297 1492 ...
## $ Year : num [1:3040] 2013 2011 2023 2020 2019 ...
## $ Sex : chr [1:3040] "Female" "Male" "Female" "Male" ...
## $ Education : chr [1:3040] "Tertiary" "None" "Primary" "Secondary" ...
## $ WaterSource : chr [1:3040] "Borehole" "Borehole" "Tap" "Tap" ...
## $ BoiledWater : chr [1:3040] "No" "No" "No" "No" ...
## $ HandWashing : chr [1:3040] "No" "Yes" "Yes" "No" ...
## $ ToiletType : chr [1:3040] "Pit Latrine" "Pit Latrine" "Flush" "Pit Latrine" ...
## $ Age : num [1:3040] 23 59 21 26 28 64 39 76 19 19 ...
## $ Site : chr [1:3040] "Hospital B" "Hospital B" "Hospital A" "Clinic C" ...
## $ bact_isolated: chr [1:3040] "None" "E.coli" "None" "E.coli" ...
head(data)
## # A tibble: 6 Ă— 31
## Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11 Var12 Var13
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63.5 37.0 37.5 57.2 59.7 64.2 46.5 65.2 54.9 38.9 54.5 30.1 28.7
## 2 46.1 34.2 58.1 36.5 59.5 54.9 54.2 51.3 37.9 59.7 24.6 43.9 50.3
## 3 60.3 64.9 46.7 62.3 41.5 53.3 61.3 72.6 32.2 44.4 49.2 44.4 49.3
## 4 49.3 54.4 30.4 52.9 75.0 64.9 37.9 51.3 55.0 60.2 47.5 51.3 68.6
## 5 52.1 47.3 60.2 51.4 35.1 53.9 52.3 71.7 41.9 55.8 47.9 49.8 47.4
## 6 44.9 36.5 55.0 55.2 66.7 57.9 60.0 57.2 30.8 37.9 39.3 44.6 56.0
## # ℹ 18 more variables: Var14 <dbl>, Var15 <dbl>, Var16 <dbl>, Var17 <dbl>,
## # Var18 <dbl>, Var19 <dbl>, Var20 <dbl>, ID <dbl>, Year <dbl>, Sex <chr>,
## # Education <chr>, WaterSource <chr>, BoiledWater <chr>, HandWashing <chr>,
## # ToiletType <chr>, Age <dbl>, Site <chr>, bact_isolated <chr>
#Check for missing values
colSums(is.na(data))
## Var1 Var2 Var3 Var4 Var5
## 0 0 0 0 153
## Var6 Var7 Var8 Var9 Var10
## 0 0 0 0 0
## Var11 Var12 Var13 Var14 Var15
## 0 0 0 0 0
## Var16 Var17 Var18 Var19 Var20
## 0 0 0 0 0
## ID Year Sex Education WaterSource
## 0 0 50 30 0
## BoiledWater HandWashing ToiletType Age Site
## 0 0 0 0 0
## bact_isolated
## 0
#Using Var1 as the dependent variable:
model <- lm(Var1 ~ Var2 + Var3 + Var4 + Var5 + Age + Year,
data = data)
summary(model)
##
## Call:
## lm(formula = Var1 ~ Var2 + Var3 + Var4 + Var5 + Age + Year, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.642 -7.496 -1.017 5.782 115.823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.52032 118.71592 1.259 0.2080
## Var2 0.01899 0.02630 0.722 0.4704
## Var3 0.02203 0.02689 0.819 0.4128
## Var4 -0.04848 0.02666 -1.819 0.0691 .
## Var5 0.04581 0.02662 1.721 0.0853 .
## Age 0.01504 0.01158 1.298 0.1943
## Year -0.05010 0.05882 -0.852 0.3944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.36 on 2880 degrees of freedom
## (153 observations deleted due to missingness)
## Multiple R-squared: 0.00353, Adjusted R-squared: 0.001454
## F-statistic: 1.7 on 6 and 2880 DF, p-value: 0.1169
#Include categorical variables
model2 <- lm(Var1 ~ Var2 + Var3 + Age + Year +
Sex + Education + WaterSource +
BoiledWater + HandWashing +
ToiletType + Site,
data = data)
summary(model2)
##
## Call:
## lm(formula = Var1 ~ Var2 + Var3 + Age + Year + Sex + Education +
## WaterSource + BoiledWater + HandWashing + ToiletType + Site,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.373 -7.454 -0.755 6.002 116.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 148.623267 112.159642 1.325 0.18524
## Var2 0.031090 0.025089 1.239 0.21538
## Var3 0.019473 0.025527 0.763 0.44563
## Age 0.009809 0.011031 0.889 0.37397
## Year -0.049384 0.055584 -0.888 0.37437
## SexMale 0.271239 0.507234 0.535 0.59287
## EducationPrimary 0.796523 0.709823 1.122 0.26189
## EducationSecondary 0.957072 0.724872 1.320 0.18683
## EducationTertiary 0.884035 0.715501 1.236 0.21673
## WaterSourceRiver -2.274560 0.725014 -3.137 0.00172 **
## WaterSourceTap -1.970834 0.710251 -2.775 0.00556 **
## WaterSourceWell -2.173780 0.722993 -3.007 0.00266 **
## BoiledWaterYes -0.483528 0.507197 -0.953 0.34050
## HandWashingYes -0.132203 0.507326 -0.261 0.79443
## ToiletTypeNone -0.036528 0.622172 -0.059 0.95319
## ToiletTypePit Latrine -0.294781 0.618160 -0.477 0.63349
## SiteHospital A 0.832230 0.625417 1.331 0.18340
## SiteHospital B -0.419027 0.620120 -0.676 0.49927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 2942 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.008628, Adjusted R-squared: 0.0029
## F-statistic: 1.506 on 17 and 2942 DF, p-value: 0.08286
#Check regression assumptions:Residual plots
par(mfrow = c(2,2))
plot(model)
#Check multicollinearity
#install.packages("car")
library(car)
## Loading required package: carData
vif(model)
## Var2 Var3 Var4 Var5 Age Year
## 1.002086 1.002513 1.001141 1.001107 1.002167 1.000534
It’s acceptable since the VIF <5
#Obtain predicted values
predicted_values <- predict(model)
head(predicted_values)
## 1 2 3 4 5 6
## 50.50058 52.53551 49.62061 51.27420 50.12604 52.06168
#Obtain residuals
residuals_model <- residuals(model)
head(residuals_model)
## 1 2 3 4 5 6
## 13.027259 -6.402198 10.655525 -1.991431 2.014692 -7.188180
summary(model)
##
## Call:
## lm(formula = Var1 ~ Var2 + Var3 + Var4 + Var5 + Age + Year, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.642 -7.496 -1.017 5.782 115.823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.52032 118.71592 1.259 0.2080
## Var2 0.01899 0.02630 0.722 0.4704
## Var3 0.02203 0.02689 0.819 0.4128
## Var4 -0.04848 0.02666 -1.819 0.0691 .
## Var5 0.04581 0.02662 1.721 0.0853 .
## Age 0.01504 0.01158 1.298 0.1943
## Year -0.05010 0.05882 -0.852 0.3944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.36 on 2880 degrees of freedom
## (153 observations deleted due to missingness)
## Multiple R-squared: 0.00353, Adjusted R-squared: 0.001454
## F-statistic: 1.7 on 6 and 2880 DF, p-value: 0.1169
The interpretation
The multiple linear regression model shows a very weak relationship between Var1 and the independent variables (Var2, Var3, Var4, Var5, Age, and Year). The model explains only 0.35% of the variation in Var1, and the overall model is not statistically significant (p = 0.1169). None of the predictors are statistically significant at the 5% level, although Var4 and Var5 show weak (marginal) effects. Therefore, these variables are not strong predictors of Var1 in this dataset.
Variable selection is the process of choosing the most important independent variables (predictors) in a regression model.
Main Variable Selection Methods:
data(mtcars)
# Full model (all variables)
full_model <- lm(mpg ~ ., data = mtcars)
summary(full_model)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
#Backward Elimination
step_backward <- step(full_model, direction = "backward")
## Start: AIC=70.9
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - cyl 1 0.0799 147.57 68.915
## - vs 1 0.1601 147.66 68.932
## - carb 1 0.4067 147.90 68.986
## - gear 1 1.3531 148.85 69.190
## - drat 1 1.6270 149.12 69.249
## - disp 1 3.9167 151.41 69.736
## - hp 1 6.8399 154.33 70.348
## - qsec 1 8.8641 156.36 70.765
## <none> 147.49 70.898
## - am 1 10.5467 158.04 71.108
## - wt 1 27.0144 174.51 74.280
##
## Step: AIC=68.92
## mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - vs 1 0.2685 147.84 66.973
## - carb 1 0.5201 148.09 67.028
## - gear 1 1.8211 149.40 67.308
## - drat 1 1.9826 149.56 67.342
## - disp 1 3.9009 151.47 67.750
## - hp 1 7.3632 154.94 68.473
## <none> 147.57 68.915
## - qsec 1 10.0933 157.67 69.032
## - am 1 11.8359 159.41 69.384
## - wt 1 27.0280 174.60 72.297
##
## Step: AIC=66.97
## mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - carb 1 0.6855 148.53 65.121
## - gear 1 2.1437 149.99 65.434
## - drat 1 2.2139 150.06 65.449
## - disp 1 3.6467 151.49 65.753
## - hp 1 7.1060 154.95 66.475
## <none> 147.84 66.973
## - am 1 11.5694 159.41 67.384
## - qsec 1 15.6830 163.53 68.200
## - wt 1 27.3799 175.22 70.410
##
## Step: AIC=65.12
## mpg ~ disp + hp + drat + wt + qsec + am + gear
##
## Df Sum of Sq RSS AIC
## - gear 1 1.565 150.09 63.457
## - drat 1 1.932 150.46 63.535
## <none> 148.53 65.121
## - disp 1 10.110 158.64 65.229
## - am 1 12.323 160.85 65.672
## - hp 1 14.826 163.35 66.166
## - qsec 1 26.408 174.94 68.358
## - wt 1 69.127 217.66 75.350
##
## Step: AIC=63.46
## mpg ~ disp + hp + drat + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - drat 1 3.345 153.44 62.162
## - disp 1 8.545 158.64 63.229
## <none> 150.09 63.457
## - hp 1 13.285 163.38 64.171
## - am 1 20.036 170.13 65.466
## - qsec 1 25.574 175.67 66.491
## - wt 1 67.572 217.66 73.351
##
## Step: AIC=62.16
## mpg ~ disp + hp + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - disp 1 6.629 160.07 61.515
## <none> 153.44 62.162
## - hp 1 12.572 166.01 62.682
## - qsec 1 26.470 179.91 65.255
## - am 1 32.198 185.63 66.258
## - wt 1 69.043 222.48 72.051
##
## Step: AIC=61.52
## mpg ~ hp + wt + qsec + am
##
## Df Sum of Sq RSS AIC
## - hp 1 9.219 169.29 61.307
## <none> 160.07 61.515
## - qsec 1 20.225 180.29 63.323
## - am 1 25.993 186.06 64.331
## - wt 1 78.494 238.56 72.284
##
## Step: AIC=61.31
## mpg ~ wt + qsec + am
##
## Df Sum of Sq RSS AIC
## <none> 169.29 61.307
## - am 1 26.178 195.46 63.908
## - qsec 1 109.034 278.32 75.217
## - wt 1 183.347 352.63 82.790
summary(step_backward)
##
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4811 -1.5555 -0.7257 1.4110 4.6610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6178 6.9596 1.382 0.177915
## wt -3.9165 0.7112 -5.507 6.95e-06 ***
## qsec 1.2259 0.2887 4.247 0.000216 ***
## am 2.9358 1.4109 2.081 0.046716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8336
## F-statistic: 52.75 on 3 and 28 DF, p-value: 1.21e-11
#Forward Selection
null_model <- lm(mpg ~ 1, data = mtcars)
full_model <- lm(mpg ~ ., data = mtcars)
step_forward <- step(null_model,
scope = list(lower = null_model, upper = full_model),
direction = "forward")
## Start: AIC=115.94
## mpg ~ 1
##
## Df Sum of Sq RSS AIC
## + wt 1 847.73 278.32 73.217
## + cyl 1 817.71 308.33 76.494
## + disp 1 808.89 317.16 77.397
## + hp 1 678.37 447.67 88.427
## + drat 1 522.48 603.57 97.988
## + vs 1 496.53 629.52 99.335
## + am 1 405.15 720.90 103.672
## + carb 1 341.78 784.27 106.369
## + gear 1 259.75 866.30 109.552
## + qsec 1 197.39 928.66 111.776
## <none> 1126.05 115.943
##
## Step: AIC=73.22
## mpg ~ wt
##
## Df Sum of Sq RSS AIC
## + cyl 1 87.150 191.17 63.198
## + hp 1 83.274 195.05 63.840
## + qsec 1 82.858 195.46 63.908
## + vs 1 54.228 224.09 68.283
## + carb 1 44.602 233.72 69.628
## + disp 1 31.639 246.68 71.356
## <none> 278.32 73.217
## + drat 1 9.081 269.24 74.156
## + gear 1 1.137 277.19 75.086
## + am 1 0.002 278.32 75.217
##
## Step: AIC=63.2
## mpg ~ wt + cyl
##
## Df Sum of Sq RSS AIC
## + hp 1 14.5514 176.62 62.665
## + carb 1 13.7724 177.40 62.805
## <none> 191.17 63.198
## + qsec 1 10.5674 180.60 63.378
## + gear 1 3.0281 188.14 64.687
## + disp 1 2.6796 188.49 64.746
## + vs 1 0.7059 190.47 65.080
## + am 1 0.1249 191.05 65.177
## + drat 1 0.0010 191.17 65.198
##
## Step: AIC=62.66
## mpg ~ wt + cyl + hp
##
## Df Sum of Sq RSS AIC
## <none> 176.62 62.665
## + am 1 6.6228 170.00 63.442
## + disp 1 6.1762 170.44 63.526
## + carb 1 2.5187 174.10 64.205
## + drat 1 2.2453 174.38 64.255
## + qsec 1 1.4010 175.22 64.410
## + gear 1 0.8558 175.76 64.509
## + vs 1 0.0599 176.56 64.654
summary(step_forward)
##
## Call:
## lm(formula = mpg ~ wt + cyl + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9290 -1.5598 -0.5311 1.1850 5.8986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
## wt -3.16697 0.74058 -4.276 0.000199 ***
## cyl -0.94162 0.55092 -1.709 0.098480 .
## hp -0.01804 0.01188 -1.519 0.140015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
## F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
#Stepwise Selection
stepwise_model <- step(null_model,
scope = list(lower = null_model, upper = full_model),
direction = "both")
## Start: AIC=115.94
## mpg ~ 1
##
## Df Sum of Sq RSS AIC
## + wt 1 847.73 278.32 73.217
## + cyl 1 817.71 308.33 76.494
## + disp 1 808.89 317.16 77.397
## + hp 1 678.37 447.67 88.427
## + drat 1 522.48 603.57 97.988
## + vs 1 496.53 629.52 99.335
## + am 1 405.15 720.90 103.672
## + carb 1 341.78 784.27 106.369
## + gear 1 259.75 866.30 109.552
## + qsec 1 197.39 928.66 111.776
## <none> 1126.05 115.943
##
## Step: AIC=73.22
## mpg ~ wt
##
## Df Sum of Sq RSS AIC
## + cyl 1 87.15 191.17 63.198
## + hp 1 83.27 195.05 63.840
## + qsec 1 82.86 195.46 63.908
## + vs 1 54.23 224.09 68.283
## + carb 1 44.60 233.72 69.628
## + disp 1 31.64 246.68 71.356
## <none> 278.32 73.217
## + drat 1 9.08 269.24 74.156
## + gear 1 1.14 277.19 75.086
## + am 1 0.00 278.32 75.217
## - wt 1 847.73 1126.05 115.943
##
## Step: AIC=63.2
## mpg ~ wt + cyl
##
## Df Sum of Sq RSS AIC
## + hp 1 14.551 176.62 62.665
## + carb 1 13.772 177.40 62.805
## <none> 191.17 63.198
## + qsec 1 10.567 180.60 63.378
## + gear 1 3.028 188.14 64.687
## + disp 1 2.680 188.49 64.746
## + vs 1 0.706 190.47 65.080
## + am 1 0.125 191.05 65.177
## + drat 1 0.001 191.17 65.198
## - cyl 1 87.150 278.32 73.217
## - wt 1 117.162 308.33 76.494
##
## Step: AIC=62.66
## mpg ~ wt + cyl + hp
##
## Df Sum of Sq RSS AIC
## <none> 176.62 62.665
## - hp 1 14.551 191.17 63.198
## + am 1 6.623 170.00 63.442
## + disp 1 6.176 170.44 63.526
## - cyl 1 18.427 195.05 63.840
## + carb 1 2.519 174.10 64.205
## + drat 1 2.245 174.38 64.255
## + qsec 1 1.401 175.22 64.410
## + gear 1 0.856 175.76 64.509
## + vs 1 0.060 176.56 64.654
## - wt 1 115.354 291.98 76.750
summary(stepwise_model)
##
## Call:
## lm(formula = mpg ~ wt + cyl + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9290 -1.5598 -0.5311 1.1850 5.8986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
## wt -3.16697 0.74058 -4.276 0.000199 ***
## cyl -0.94162 0.55092 -1.709 0.098480 .
## hp -0.01804 0.01188 -1.519 0.140015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
## F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
##Conclusion: Variable selection helps build a better regression model by identifying the most important predictors while removing irrelevant ones. Stepwise selection is the most commonly used method in R because it balances accuracy and simplicity.