1. Using the “swiss” dataset, build the best multiple regression model you can for the variable Fertility.

# Standardized fertility measure and socioeconomic indicators for each of 47 French-speaking provinces of Switzerland at about 1888. 

# Switzerland, in 1888, was entering a period known as the demographic transition; i.e., its fertility was beginning to fall from the high level typical of underdeveloped countries. The data collected are for 47 French-speaking “provinces” at about 1888. Here, all variables are scaled to [0, 100] where in the original, all but Catholic were scaled to [0,1]

# Fertility = common standardized fertility measure
# Agriculture = % of males involved in agriculture as occupation
# Examination = % draftees receiving highest mark on army examination
# Education = % education beyond primary school for draftees
# Catholic = % Catholic (as opposed to Protestant)
# Infant.Mortality = live births who live less than 1 year

# step wise function in all directions starting with regular full model 
full_model = lm(Fertility ~ ., data = swiss)
my_model_reg_forward = step(full_model, direction = "forward")
## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
my_model_reg_backward = step(full_model, direction = "backward")
## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43
my_model_reg_both = step(full_model, direction = "both")
## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## + Examination       1     53.03 2105.0 190.69
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43
# step wise all directions starting with full model, including interaction terms
int_model = lm(Fertility ~ .^2, data = swiss)
my_model_int_forward = step(int_model, direction = "forward")
## Start:  AIC=188.01
## Fertility ~ (Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality)^2
my_model_int_backward = step(int_model, direction = "backward")
## Start:  AIC=188.01
## Fertility ~ (Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality)^2
## 
##                                Df Sum of Sq    RSS    AIC
## - Examination:Catholic          1     0.846 1300.0 186.04
## - Education:Infant.Mortality    1     3.065 1302.2 186.12
## - Catholic:Infant.Mortality     1    15.768 1314.9 186.57
## - Education:Catholic            1    20.582 1319.7 186.75
## - Agriculture:Catholic          1    35.592 1334.7 187.28
## <none>                                      1299.2 188.01
## - Agriculture:Education         1    65.644 1364.8 188.32
## - Examination:Infant.Mortality  1    73.578 1372.7 188.60
## - Agriculture:Examination       1   100.889 1400.0 189.52
## - Examination:Education         1   179.288 1478.4 192.08
## - Agriculture:Infant.Mortality  1   191.373 1490.5 192.47
## 
## Step:  AIC=186.04
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Catholic + Agriculture:Infant.Mortality + Examination:Education + 
##     Examination:Infant.Mortality + Education:Catholic + Education:Infant.Mortality + 
##     Catholic:Infant.Mortality
## 
##                                Df Sum of Sq    RSS    AIC
## - Education:Infant.Mortality    1     3.908 1303.9 184.18
## - Catholic:Infant.Mortality     1    17.269 1317.3 184.66
## - Agriculture:Catholic          1    37.061 1337.1 185.36
## <none>                                      1300.0 186.04
## - Education:Catholic            1    56.755 1356.8 186.05
## - Agriculture:Education         1    69.460 1369.5 186.49
## - Examination:Infant.Mortality  1    85.963 1386.0 187.05
## - Agriculture:Examination       1   114.333 1414.3 188.00
## - Examination:Education         1   178.445 1478.4 190.08
## - Agriculture:Infant.Mortality  1   205.256 1505.2 190.93
## 
## Step:  AIC=184.18
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Catholic + Agriculture:Infant.Mortality + Examination:Education + 
##     Examination:Infant.Mortality + Education:Catholic + Catholic:Infant.Mortality
## 
##                                Df Sum of Sq    RSS    AIC
## - Catholic:Infant.Mortality     1    25.786 1329.7 183.10
## - Agriculture:Catholic          1    36.413 1340.3 183.47
## <none>                                      1303.9 184.18
## - Agriculture:Education         1    79.202 1383.1 184.95
## - Education:Catholic            1    79.302 1383.2 184.96
## - Agriculture:Examination       1   116.350 1420.2 186.20
## - Examination:Education         1   185.887 1489.8 188.44
## - Agriculture:Infant.Mortality  1   219.826 1523.7 189.50
## - Examination:Infant.Mortality  1   230.537 1534.4 189.83
## 
## Step:  AIC=183.1
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Catholic + Agriculture:Infant.Mortality + Examination:Education + 
##     Examination:Infant.Mortality + Education:Catholic
## 
##                                Df Sum of Sq    RSS    AIC
## - Agriculture:Catholic          1    26.657 1356.3 182.03
## <none>                                      1329.7 183.10
## - Education:Catholic            1    91.721 1421.4 184.24
## - Agriculture:Education         1    92.242 1421.9 184.25
## - Agriculture:Examination       1   121.234 1450.9 185.20
## - Examination:Education         1   197.231 1526.9 187.60
## - Examination:Infant.Mortality  1   210.665 1540.3 188.01
## - Agriculture:Infant.Mortality  1   220.436 1550.1 188.31
## 
## Step:  AIC=182.03
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Infant.Mortality + Examination:Education + Examination:Infant.Mortality + 
##     Education:Catholic
## 
##                                Df Sum of Sq    RSS    AIC
## <none>                                      1356.3 182.03
## - Agriculture:Education         1    74.979 1431.3 182.56
## - Agriculture:Examination       1    99.701 1456.0 183.37
## - Examination:Education         1   174.619 1531.0 185.72
## - Education:Catholic            1   216.628 1573.0 187.00
## - Agriculture:Infant.Mortality  1   271.060 1627.4 188.60
## - Examination:Infant.Mortality  1   272.913 1629.3 188.65
my_model_int_both = step(int_model, direction = "both")
## Start:  AIC=188.01
## Fertility ~ (Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality)^2
## 
##                                Df Sum of Sq    RSS    AIC
## - Examination:Catholic          1     0.846 1300.0 186.04
## - Education:Infant.Mortality    1     3.065 1302.2 186.12
## - Catholic:Infant.Mortality     1    15.768 1314.9 186.57
## - Education:Catholic            1    20.582 1319.7 186.75
## - Agriculture:Catholic          1    35.592 1334.7 187.28
## <none>                                      1299.2 188.01
## - Agriculture:Education         1    65.644 1364.8 188.32
## - Examination:Infant.Mortality  1    73.578 1372.7 188.60
## - Agriculture:Examination       1   100.889 1400.0 189.52
## - Examination:Education         1   179.288 1478.4 192.08
## - Agriculture:Infant.Mortality  1   191.373 1490.5 192.47
## 
## Step:  AIC=186.04
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Catholic + Agriculture:Infant.Mortality + Examination:Education + 
##     Examination:Infant.Mortality + Education:Catholic + Education:Infant.Mortality + 
##     Catholic:Infant.Mortality
## 
##                                Df Sum of Sq    RSS    AIC
## - Education:Infant.Mortality    1     3.908 1303.9 184.18
## - Catholic:Infant.Mortality     1    17.269 1317.3 184.66
## - Agriculture:Catholic          1    37.061 1337.1 185.36
## <none>                                      1300.0 186.04
## - Education:Catholic            1    56.755 1356.8 186.05
## - Agriculture:Education         1    69.460 1369.5 186.49
## - Examination:Infant.Mortality  1    85.963 1386.0 187.05
## - Agriculture:Examination       1   114.333 1414.3 188.00
## + Examination:Catholic          1     0.846 1299.2 188.01
## - Examination:Education         1   178.445 1478.4 190.08
## - Agriculture:Infant.Mortality  1   205.256 1505.2 190.93
## 
## Step:  AIC=184.18
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Catholic + Agriculture:Infant.Mortality + Examination:Education + 
##     Examination:Infant.Mortality + Education:Catholic + Catholic:Infant.Mortality
## 
##                                Df Sum of Sq    RSS    AIC
## - Catholic:Infant.Mortality     1    25.786 1329.7 183.10
## - Agriculture:Catholic          1    36.413 1340.3 183.47
## <none>                                      1303.9 184.18
## - Agriculture:Education         1    79.202 1383.1 184.95
## - Education:Catholic            1    79.302 1383.2 184.96
## + Education:Infant.Mortality    1     3.908 1300.0 186.04
## + Examination:Catholic          1     1.690 1302.2 186.12
## - Agriculture:Examination       1   116.350 1420.2 186.20
## - Examination:Education         1   185.887 1489.8 188.44
## - Agriculture:Infant.Mortality  1   219.826 1523.7 189.50
## - Examination:Infant.Mortality  1   230.537 1534.4 189.83
## 
## Step:  AIC=183.1
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Catholic + Agriculture:Infant.Mortality + Examination:Education + 
##     Examination:Infant.Mortality + Education:Catholic
## 
##                                Df Sum of Sq    RSS    AIC
## - Agriculture:Catholic          1    26.657 1356.3 182.03
## <none>                                      1329.7 183.10
## + Catholic:Infant.Mortality     1    25.786 1303.9 184.18
## - Education:Catholic            1    91.721 1421.4 184.24
## - Agriculture:Education         1    92.242 1421.9 184.25
## + Education:Infant.Mortality    1    12.425 1317.3 184.66
## + Examination:Catholic          1     2.448 1327.2 185.01
## - Agriculture:Examination       1   121.234 1450.9 185.20
## - Examination:Education         1   197.231 1526.9 187.60
## - Examination:Infant.Mortality  1   210.665 1540.3 188.01
## - Agriculture:Infant.Mortality  1   220.436 1550.1 188.31
## 
## Step:  AIC=182.03
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Infant.Mortality + Examination:Education + Examination:Infant.Mortality + 
##     Education:Catholic
## 
##                                Df Sum of Sq    RSS    AIC
## <none>                                      1356.3 182.03
## - Agriculture:Education         1    74.979 1431.3 182.56
## + Agriculture:Catholic          1    26.657 1329.7 183.10
## - Agriculture:Examination       1    99.701 1456.0 183.37
## + Catholic:Infant.Mortality     1    16.030 1340.3 183.47
## + Education:Infant.Mortality    1     9.277 1347.1 183.71
## + Examination:Catholic          1     7.957 1348.4 183.76
## - Examination:Education         1   174.619 1531.0 185.72
## - Education:Catholic            1   216.628 1573.0 187.00
## - Agriculture:Infant.Mortality  1   271.060 1627.4 188.60
## - Examination:Infant.Mortality  1   272.913 1629.3 188.65
adj_r2_df = data.frame(Regular = c(summary(my_model_reg_forward)$adj.r.squared, summary(my_model_reg_backward)$adj.r.squared, summary(my_model_reg_both)$adj.r.squared), Interaction = c(summary(my_model_int_forward)$adj.r.squared, summary(my_model_int_backward)$adj.r.squared, summary(my_model_int_both)$adj.r.squared))

adj_r2_df
##    Regular Interaction
## 1 0.670971   0.7314321
## 2 0.670714   0.7516527
## 3 0.670714   0.7516527
# We can see that the highest Adjusted R-squared is 0.7516527, and this belongs to the function we found with both backward and both direction step wise regression that includes interaction terms

summary(my_model_int_both)
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Catholic + Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Infant.Mortality + Examination:Education + Examination:Infant.Mortality + 
##     Education:Catholic, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6080 -3.6647 -0.5637  2.9216 13.7363 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  225.910055  52.457566   4.307 0.000128 ***
## Agriculture                   -1.906654   0.562824  -3.388 0.001756 ** 
## Examination                   -5.120204   1.578305  -3.244 0.002593 ** 
## Education                     -2.473498   1.202766  -2.057 0.047249 *  
## Catholic                       0.211157   0.054180   3.897 0.000420 ***
## Infant.Mortality              -5.269347   2.287270  -2.304 0.027292 *  
## Agriculture:Examination        0.014880   0.009277   1.604 0.117706    
## Agriculture:Education          0.019081   0.013718   1.391 0.173013    
## Agriculture:Infant.Mortality   0.063530   0.024021   2.645 0.012158 *  
## Examination:Education          0.063891   0.030098   2.123 0.040925 *  
## Examination:Infant.Mortality   0.172194   0.064887   2.654 0.011891 *  
## Education:Catholic            -0.012381   0.005237  -2.364 0.023741 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.225 on 35 degrees of freedom
## Multiple R-squared:  0.811,  Adjusted R-squared:  0.7517 
## F-statistic: 13.66 on 11 and 35 DF,  p-value: 1.283e-09
# Best model: Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality + Agriculture:Examination + Agriculture:Education + Agriculture:Infant.Mortality + Examination:Education + Examination:Infant.Mortality + Education:Catholic

best_model = (lm(Fertility ~ Agriculture + Examination + Education + Catholic + Infant.Mortality + Agriculture:Examination + Agriculture:Education + Agriculture:Infant.Mortality + Examination:Education + Examination:Infant.Mortality + Education:Catholic, data = swiss))

summary(best_model)
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Examination + Education + 
##     Catholic + Infant.Mortality + Agriculture:Examination + Agriculture:Education + 
##     Agriculture:Infant.Mortality + Examination:Education + Examination:Infant.Mortality + 
##     Education:Catholic, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6080 -3.6647 -0.5637  2.9216 13.7363 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  225.910055  52.457566   4.307 0.000128 ***
## Agriculture                   -1.906654   0.562824  -3.388 0.001756 ** 
## Examination                   -5.120204   1.578305  -3.244 0.002593 ** 
## Education                     -2.473498   1.202766  -2.057 0.047249 *  
## Catholic                       0.211157   0.054180   3.897 0.000420 ***
## Infant.Mortality              -5.269347   2.287270  -2.304 0.027292 *  
## Agriculture:Examination        0.014880   0.009277   1.604 0.117706    
## Agriculture:Education          0.019081   0.013718   1.391 0.173013    
## Agriculture:Infant.Mortality   0.063530   0.024021   2.645 0.012158 *  
## Examination:Education          0.063891   0.030098   2.123 0.040925 *  
## Examination:Infant.Mortality   0.172194   0.064887   2.654 0.011891 *  
## Education:Catholic            -0.012381   0.005237  -2.364 0.023741 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.225 on 35 degrees of freedom
## Multiple R-squared:  0.811,  Adjusted R-squared:  0.7517 
## F-statistic: 13.66 on 11 and 35 DF,  p-value: 1.283e-09
plot(swiss$Agriculture, swiss$Fertility, main = "Plot of Fertility vs Agriculture", xlab = "Agriculture", ylab = "Fertility")
points(sort(swiss$Agriculture), best_model$fitted.values, col = "orange", pch = 20)

2. Then build a logistic regression model for predicting Fertility>70.0. Post your solutions, interpretation, and code for your peers to see.

# split the data into 1 and 0 if over 70 or not
over_seventy = ifelse(swiss$Fertility > 70, 1, 0)
swiss$Fertility = over_seventy

my_model = glm(Fertility ~ ., family = "binomial", data = swiss)
summary(my_model)
## 
## Call:
## glm(formula = Fertility ~ ., family = "binomial", data = swiss)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)       4.82826    5.25607   0.919   0.3583  
## Agriculture      -0.09615    0.04011  -2.397   0.0165 *
## Examination      -0.32116    0.13844  -2.320   0.0203 *
## Education        -0.12078    0.08610  -1.403   0.1607  
## Catholic          0.02078    0.01376   1.509   0.1312  
## Infant.Mortality  0.29078    0.21051   1.381   0.1672  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 65.135  on 46  degrees of freedom
## Residual deviance: 32.887  on 41  degrees of freedom
## AIC: 44.887
## 
## Number of Fisher Scoring iterations: 6
exp(coef(my_model))
##      (Intercept)      Agriculture      Examination        Education 
##      124.9934070        0.9083322        0.7253057        0.8862309 
##         Catholic Infant.Mortality 
##        1.0209935        1.3374691