#Using the “swiss” dataset, build the best multiple regression model you can for the variable Fertility.
# Work on a copy of the built-in swiss data and print the pairwise
# correlation matrix of all of its columns.
df <- datasets::swiss
cor(df)
## Fertility Agriculture Examination Education Catholic
## Fertility 1.0000000 0.35307918 -0.6458827 -0.66378886 0.4636847
## Agriculture 0.3530792 1.00000000 -0.6865422 -0.63952252 0.4010951
## Examination -0.6458827 -0.68654221 1.0000000 0.69841530 -0.5727418
## Education -0.6637889 -0.63952252 0.6984153 1.00000000 -0.1538589
## Catholic 0.4636847 0.40109505 -0.5727418 -0.15385892 1.0000000
## Infant.Mortality 0.4165560 -0.06085861 -0.1140216 -0.09932185 0.1754959
## Infant.Mortality
## Fertility 0.41655603
## Agriculture -0.06085861
## Examination -0.11402160
## Education -0.09932185
## Catholic 0.17549591
## Infant.Mortality 1.00000000
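Based on the correlation matrix, Fertility is most strongly correlated with Education (r = -0.66) and Examination (r = -0.65), so the plan is to start with a multiple regression model using both features. Note that these two predictors are also strongly correlated with each other (r = 0.70), so they carry overlapping information.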
# Two-predictor model: Examination, Education, and their interaction.
fit_exam_edu <- lm(Fertility ~ Examination * Education, data = df)
summary(fit_exam_edu)
##
## Call:
## lm(formula = Fertility ~ Examination * Education, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.522 -6.901 -1.169 7.383 19.412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.178104 4.334560 20.112 <2e-16 ***
## Examination -0.625731 0.257117 -2.434 0.0192 *
## Education -0.807552 0.463474 -1.742 0.0886 .
## Examination:Education 0.009201 0.014451 0.637 0.5277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.043 on 43 degrees of freedom
## Multiple R-squared: 0.5101, Adjusted R-squared: 0.4759
## F-statistic: 14.92 on 3 and 43 DF, p-value: 8.437e-07
# Simple regression of Fertility on Examination alone.
fit_examination <- lm(Fertility ~ Examination, data = df)
summary(fit_examination)
##
## Call:
## lm(formula = Fertility ~ Examination, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.9375 -6.0044 -0.3393 7.9239 19.7399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.8185 3.2576 26.651 < 2e-16 ***
## Examination -1.0113 0.1782 -5.675 9.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.642 on 45 degrees of freedom
## Multiple R-squared: 0.4172, Adjusted R-squared: 0.4042
## F-statistic: 32.21 on 1 and 45 DF, p-value: 9.45e-07
# Simple regression of Fertility on Education alone.
fit_education <- lm(Fertility ~ Education, data = df)
summary(fit_education)
##
## Call:
## lm(formula = Fertility ~ Education, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.036 -6.711 -1.011 9.526 19.689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.6101 2.1041 37.836 < 2e-16 ***
## Education -0.8624 0.1448 -5.954 3.66e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.446 on 45 degrees of freedom
## Multiple R-squared: 0.4406, Adjusted R-squared: 0.4282
## F-statistic: 35.45 on 1 and 45 DF, p-value: 3.659e-07
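Using these features, the R-squared value is modest (0.51 for the two-feature model with interaction, and 0.42 and 0.44 for Examination and Education on their own), and the interaction term is not significant (p = 0.53), so the other features will be tested as well.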
# Simple regression of Fertility on Agriculture alone.
fit_agriculture <- lm(Fertility ~ Agriculture, data = df)
summary(fit_agriculture)
##
## Call:
## lm(formula = Fertility ~ Agriculture, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.5374 -7.8685 -0.6362 9.0464 24.4858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.30438 4.25126 14.185 <2e-16 ***
## Agriculture 0.19420 0.07671 2.532 0.0149 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.82 on 45 degrees of freedom
## Multiple R-squared: 0.1247, Adjusted R-squared: 0.1052
## F-statistic: 6.409 on 1 and 45 DF, p-value: 0.01492
# Examination-only fit again (same call as the earlier Examination model).
fit_examination <- lm(Fertility ~ Examination, data = df)
summary(fit_examination)
##
## Call:
## lm(formula = Fertility ~ Examination, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.9375 -6.0044 -0.3393 7.9239 19.7399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.8185 3.2576 26.651 < 2e-16 ***
## Examination -1.0113 0.1782 -5.675 9.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.642 on 45 degrees of freedom
## Multiple R-squared: 0.4172, Adjusted R-squared: 0.4042
## F-statistic: 32.21 on 1 and 45 DF, p-value: 9.45e-07
# Education-only fit again (same call as the earlier Education model).
fit_education <- lm(Fertility ~ Education, data = df)
summary(fit_education)
##
## Call:
## lm(formula = Fertility ~ Education, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.036 -6.711 -1.011 9.526 19.689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.6101 2.1041 37.836 < 2e-16 ***
## Education -0.8624 0.1448 -5.954 3.66e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.446 on 45 degrees of freedom
## Multiple R-squared: 0.4406, Adjusted R-squared: 0.4282
## F-statistic: 35.45 on 1 and 45 DF, p-value: 3.659e-07
# Simple regression of Fertility on Catholic alone.
fit_catholic <- lm(Fertility ~ Catholic, data = df)
summary(fit_catholic)
##
## Call:
## lm(formula = Fertility ~ Catholic, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.309 -4.060 0.511 6.851 16.682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.42826 2.30510 27.950 < 2e-16 ***
## Catholic 0.13889 0.03956 3.511 0.00103 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.19 on 45 degrees of freedom
## Multiple R-squared: 0.215, Adjusted R-squared: 0.1976
## F-statistic: 12.33 on 1 and 45 DF, p-value: 0.001029
# Simple regression of Fertility on Infant.Mortality alone.
fit_infant_mortality <- lm(Fertility ~ Infant.Mortality, data = df)
summary(fit_infant_mortality)
##
## Call:
## lm(formula = Fertility ~ Infant.Mortality, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.672 -5.687 -0.381 7.239 28.565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5155 11.7113 2.947 0.00507 **
## Infant.Mortality 1.7865 0.5812 3.074 0.00359 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.48 on 45 degrees of freedom
## Multiple R-squared: 0.1735, Adjusted R-squared: 0.1552
## F-statistic: 9.448 on 1 and 45 DF, p-value: 0.003585
# Full factorial model: `*` expands to the five main effects plus every
# two- to five-way interaction, i.e. the 32 coefficients listed in the
# summary, leaving 15 residual degrees of freedom.
fit_all_interactions <- lm(
  Fertility ~ Agriculture * Examination * Education * Catholic *
    Infant.Mortality,
  data = df
)
summary(fit_all_interactions)
##
## Call:
## lm(formula = Fertility ~ Agriculture * Examination * Education *
## Catholic * Infant.Mortality, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7861 -2.6263 0.0973 2.3639 9.5548
##
## Coefficients:
## Estimate
## (Intercept) 1.017e+03
## Agriculture -1.312e+01
## Examination -2.403e+01
## Education -9.896e+01
## Catholic -1.006e+01
## Infant.Mortality -4.174e+01
## Agriculture:Examination 2.071e-01
## Agriculture:Education 1.387e+00
## Examination:Education 2.721e+00
## Agriculture:Catholic 1.353e-01
## Examination:Catholic 4.886e-01
## Education:Catholic 9.542e-01
## Agriculture:Infant.Mortality 5.675e-01
## Examination:Infant.Mortality 9.692e-01
## Education:Infant.Mortality 4.511e+00
## Catholic:Infant.Mortality 4.584e-01
## Agriculture:Examination:Education -2.933e-02
## Agriculture:Examination:Catholic -5.551e-03
## Agriculture:Education:Catholic -1.066e-02
## Examination:Education:Catholic -2.550e-02
## Agriculture:Examination:Infant.Mortality -6.727e-03
## Agriculture:Education:Infant.Mortality -6.230e-02
## Examination:Education:Infant.Mortality -1.188e-01
## Agriculture:Catholic:Infant.Mortality -5.830e-03
## Examination:Catholic:Infant.Mortality -2.029e-02
## Education:Catholic:Infant.Mortality -4.291e-02
## Agriculture:Examination:Education:Catholic 9.402e-06
## Agriculture:Examination:Education:Infant.Mortality 1.176e-03
## Agriculture:Examination:Catholic:Infant.Mortality 1.984e-04
## Agriculture:Education:Catholic:Infant.Mortality 4.408e-04
## Examination:Education:Catholic:Infant.Mortality 9.393e-04
## Agriculture:Examination:Education:Catholic:Infant.Mortality 6.292e-06
## Std. Error t value
## (Intercept) 1.106e+03 0.919
## Agriculture 1.721e+01 -0.762
## Examination 4.747e+01 -0.506
## Education 8.530e+01 -1.160
## Catholic 1.740e+01 -0.578
## Infant.Mortality 5.360e+01 -0.779
## Agriculture:Examination 7.504e-01 0.276
## Agriculture:Education 1.341e+00 1.034
## Examination:Education 3.040e+00 0.895
## Agriculture:Catholic 2.399e-01 0.564
## Examination:Catholic 9.428e-01 0.518
## Education:Catholic 1.745e+00 0.547
## Agriculture:Infant.Mortality 8.337e-01 0.681
## Examination:Infant.Mortality 2.312e+00 0.419
## Education:Infant.Mortality 4.113e+00 1.097
## Catholic:Infant.Mortality 8.504e-01 0.539
## Agriculture:Examination:Education 4.995e-02 -0.587
## Agriculture:Examination:Catholic 1.306e-02 -0.425
## Agriculture:Education:Catholic 2.238e-02 -0.476
## Examination:Education:Catholic 6.952e-02 -0.367
## Agriculture:Examination:Infant.Mortality 3.680e-02 -0.183
## Agriculture:Education:Infant.Mortality 6.448e-02 -0.966
## Examination:Education:Infant.Mortality 1.466e-01 -0.811
## Agriculture:Catholic:Infant.Mortality 1.174e-02 -0.497
## Examination:Catholic:Infant.Mortality 4.675e-02 -0.434
## Education:Catholic:Infant.Mortality 8.466e-02 -0.507
## Agriculture:Examination:Education:Catholic 9.692e-04 0.010
## Agriculture:Examination:Education:Infant.Mortality 2.446e-03 0.481
## Agriculture:Examination:Catholic:Infant.Mortality 6.505e-04 0.305
## Agriculture:Education:Catholic:Infant.Mortality 1.104e-03 0.399
## Examination:Education:Catholic:Infant.Mortality 3.469e-03 0.271
## Agriculture:Examination:Education:Catholic:Infant.Mortality 4.968e-05 0.127
## Pr(>|t|)
## (Intercept) 0.372
## Agriculture 0.458
## Examination 0.620
## Education 0.264
## Catholic 0.572
## Infant.Mortality 0.448
## Agriculture:Examination 0.786
## Agriculture:Education 0.317
## Examination:Education 0.385
## Agriculture:Catholic 0.581
## Examination:Catholic 0.612
## Education:Catholic 0.593
## Agriculture:Infant.Mortality 0.506
## Examination:Infant.Mortality 0.681
## Education:Infant.Mortality 0.290
## Catholic:Infant.Mortality 0.598
## Agriculture:Examination:Education 0.566
## Agriculture:Examination:Catholic 0.677
## Agriculture:Education:Catholic 0.641
## Examination:Education:Catholic 0.719
## Agriculture:Examination:Infant.Mortality 0.857
## Agriculture:Education:Infant.Mortality 0.349
## Examination:Education:Infant.Mortality 0.430
## Agriculture:Catholic:Infant.Mortality 0.627
## Examination:Catholic:Infant.Mortality 0.671
## Education:Catholic:Infant.Mortality 0.620
## Agriculture:Examination:Education:Catholic 0.992
## Agriculture:Examination:Education:Infant.Mortality 0.638
## Agriculture:Examination:Catholic:Infant.Mortality 0.765
## Agriculture:Education:Catholic:Infant.Mortality 0.695
## Examination:Education:Catholic:Infant.Mortality 0.790
## Agriculture:Examination:Education:Catholic:Infant.Mortality 0.901
##
## Residual standard error: 7.165 on 15 degrees of freedom
## Multiple R-squared: 0.8927, Adjusted R-squared: 0.671
## F-statistic: 4.027 on 31 and 15 DF, p-value: 0.003078
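Using any one feature, the R-squared value was about 0.5 or less (at most 0.44 for a single feature and 0.51 for Examination and Education together), but using all features and their interactions we get an R-squared value of 0.8927. On R-squared alone, the model that considers all features therefore looks like the best model for Fertility. This should be read with caution, however: the model estimates 32 coefficients from only 47 observations, its adjusted R-squared is just 0.671, and none of its terms is individually significant, so much of the gain is likely overfitting. A simpler model that uses all five features without the higher-order interactions should be compared before settling on this one.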
#Then build a logistic regression model for predicting Fertility > 70.0.
# Binary response: 1 where Fertility exceeds 70, 0 otherwise.
above_70 <- as.numeric(df$Fertility > 70)
# Logistic regression on the five features and all of their interactions.
# NOTE(review): the fit emits the two glm.fit warnings shown directly below
# (no convergence; fitted probabilities of 0 or 1), so the coefficient table
# printed by summary() should not be interpreted.
log_mod <- glm(
  above_70 ~ Agriculture * Examination * Education * Catholic *
    Infant.Mortality,
  family = "binomial",
  data = df
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_mod)
##
## Call:
## glm(formula = above_70 ~ Agriculture * Examination * Education *
## Catholic * Infant.Mortality, family = "binomial", data = df)
##
## Coefficients:
## Estimate
## (Intercept) 1.649e+04
## Agriculture -2.411e+02
## Examination -7.073e+02
## Education -6.000e+02
## Catholic -1.542e+02
## Infant.Mortality -8.121e+02
## Agriculture:Examination 1.010e+01
## Agriculture:Education 2.212e+00
## Examination:Education 2.706e+01
## Agriculture:Catholic 2.795e+00
## Examination:Catholic -2.431e+00
## Education:Catholic 1.880e+01
## Agriculture:Infant.Mortality 1.206e+01
## Examination:Infant.Mortality 3.557e+01
## Education:Infant.Mortality 3.225e+01
## Catholic:Infant.Mortality 8.459e+00
## Agriculture:Examination:Education -7.973e-02
## Agriculture:Examination:Catholic 1.879e-02
## Agriculture:Education:Catholic -3.122e-01
## Examination:Education:Catholic -5.716e-01
## Agriculture:Examination:Infant.Mortality -5.215e-01
## Agriculture:Education:Infant.Mortality -1.838e-01
## Examination:Education:Infant.Mortality -1.468e+00
## Agriculture:Catholic:Infant.Mortality -1.534e-01
## Examination:Catholic:Infant.Mortality 3.109e-02
## Education:Catholic:Infant.Mortality -1.003e+00
## Agriculture:Examination:Education:Catholic 1.083e-02
## Agriculture:Examination:Education:Infant.Mortality 7.705e-03
## Agriculture:Examination:Catholic:Infant.Mortality 6.442e-04
## Agriculture:Education:Catholic:Infant.Mortality 1.698e-02
## Examination:Education:Catholic:Infant.Mortality 3.285e-02
## Agriculture:Examination:Education:Catholic:Infant.Mortality -6.260e-04
## Std. Error z value
## (Intercept) 8.158e+07 0
## Agriculture 1.152e+06 0
## Examination 4.495e+06 0
## Education 6.117e+06 0
## Catholic 1.352e+06 0
## Infant.Mortality 3.900e+06 0
## Agriculture:Examination 5.788e+04 0
## Agriculture:Education 8.469e+04 0
## Examination:Education 2.675e+05 0
## Agriculture:Catholic 1.622e+04 0
## Examination:Catholic 1.535e+05 0
## Education:Catholic 1.534e+05 0
## Agriculture:Infant.Mortality 5.281e+04 0
## Examination:Infant.Mortality 2.215e+05 0
## Education:Infant.Mortality 2.886e+05 0
## Catholic:Infant.Mortality 6.230e+04 0
## Agriculture:Examination:Education 2.888e+03 0
## Agriculture:Examination:Catholic 1.823e+03 0
## Agriculture:Education:Catholic 3.479e+03 0
## Examination:Education:Catholic 4.705e+03 0
## Agriculture:Examination:Infant.Mortality 2.682e+03 0
## Agriculture:Education:Infant.Mortality 3.866e+03 0
## Examination:Education:Infant.Mortality 1.257e+04 0
## Agriculture:Catholic:Infant.Mortality 7.924e+02 0
## Examination:Catholic:Infant.Mortality 6.624e+03 0
## Education:Catholic:Infant.Mortality 7.933e+03 0
## Agriculture:Examination:Education:Catholic 1.368e+02 0
## Agriculture:Examination:Education:Infant.Mortality 1.200e+02 0
## Agriculture:Examination:Catholic:Infant.Mortality 7.014e+01 0
## Agriculture:Education:Catholic:Infant.Mortality 1.868e+02 0
## Examination:Education:Catholic:Infant.Mortality 2.623e+02 0
## Agriculture:Examination:Education:Catholic:Infant.Mortality 8.210e+00 0
## Pr(>|z|)
## (Intercept) 1
## Agriculture 1
## Examination 1
## Education 1
## Catholic 1
## Infant.Mortality 1
## Agriculture:Examination 1
## Agriculture:Education 1
## Examination:Education 1
## Agriculture:Catholic 1
## Examination:Catholic 1
## Education:Catholic 1
## Agriculture:Infant.Mortality 1
## Examination:Infant.Mortality 1
## Education:Infant.Mortality 1
## Catholic:Infant.Mortality 1
## Agriculture:Examination:Education 1
## Agriculture:Examination:Catholic 1
## Agriculture:Education:Catholic 1
## Examination:Education:Catholic 1
## Agriculture:Examination:Infant.Mortality 1
## Agriculture:Education:Infant.Mortality 1
## Examination:Education:Infant.Mortality 1
## Agriculture:Catholic:Infant.Mortality 1
## Examination:Catholic:Infant.Mortality 1
## Education:Catholic:Infant.Mortality 1
## Agriculture:Examination:Education:Catholic 1
## Agriculture:Examination:Education:Infant.Mortality 1
## Agriculture:Examination:Catholic:Infant.Mortality 1
## Agriculture:Education:Catholic:Infant.Mortality 1
## Examination:Education:Catholic:Infant.Mortality 1
## Agriculture:Examination:Education:Catholic:Infant.Mortality 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.5135e+01 on 46 degrees of freedom
## Residual deviance: 2.0699e-09 on 15 degrees of freedom
## AIC: 64
##
## Number of Fisher Scoring iterations: 25
# Plot the binary indicator against Education, the single predictor most
# strongly correlated with Fertility (r = -0.66 in the correlation matrix),
# and overlay a one-predictor logistic fit.
# Fertility itself must not be on the x-axis: above_70 is defined as
# Fertility > 70, so the outcome would be a deterministic step at 70
# (perfect separation), which makes glm.fit fail to converge and report
# fitted probabilities of exactly 0 or 1.
ggplot(df, aes(x = Education, y = above_70)) +
  geom_point() +
  stat_smooth(
    method = "glm", color = "green", se = FALSE,
    method.args = list(family = binomial)
  ) +
  labs(
    title = "Logistic Regression for Fertility > 70",
    x = "Education",
    y = "Probability of Fertility > 70"
  )
## `geom_smooth()` using formula = 'y ~ x'