Exercise 1

# Load the state-level data set. No write.csv() call belongs here: calling
# write.csv() with only a path string treats that string as the data and
# echoes it to the console instead of touching the file.
statedata <- read.csv("../datafiles/statedata.csv")
# Drop the first column (the State label) so that only the numeric variables
# remain; the outer parentheses print the resulting data frame.
(statedata2 <- statedata[, -1])
##    Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
## 1        3615   3624        2.1    69.05   15.1    41.3    20  50708
## 2         365   6315        1.5    69.31   11.3    66.7   152 566432
## 3        2212   4530        1.8    70.55    7.8    58.1    15 113417
## 4        2110   3378        1.9    70.66   10.1    39.9    65  51945
## 5       21198   5114        1.1    71.71   10.3    62.6    20 156361
## 6        2541   4884        0.7    72.06    6.8    63.9   166 103766
## 7        3100   5348        1.1    72.48    3.1    56.0   139   4862
## 8         579   4809        0.9    70.06    6.2    54.6   103   1982
## 9        8277   4815        1.3    70.66   10.7    52.6    11  54090
## 10       4931   4091        2.0    68.54   13.9    40.6    60  58073
## 11        868   4963        1.9    73.60    6.2    61.9     0   6425
## 12        813   4119        0.6    71.87    5.3    59.5   126  82677
## 13      11197   5107        0.9    70.14   10.3    52.6   127  55748
## 14       5313   4458        0.7    70.88    7.1    52.9   122  36097
## 15       2861   4628        0.5    72.56    2.3    59.0   140  55941
## 16       2280   4669        0.6    72.58    4.5    59.9   114  81787
## 17       3387   3712        1.6    70.10   10.6    38.5    95  39650
## 18       3806   3545        2.8    68.76   13.2    42.2    12  44930
## 19       1058   3694        0.7    70.39    2.7    54.7   161  30920
## 20       4122   5299        0.9    70.22    8.5    52.3   101   9891
## 21       5814   4755        1.1    71.83    3.3    58.5   103   7826
## 22       9111   4751        0.9    70.63   11.1    52.8   125  56817
## 23       3921   4675        0.6    72.96    2.3    57.6   160  79289
## 24       2341   3098        2.4    68.09   12.5    41.0    50  47296
## 25       4767   4254        0.8    70.69    9.3    48.8   108  68995
## 26        746   4347        0.6    70.56    5.0    59.2   155 145587
## 27       1544   4508        0.6    72.60    2.9    59.3   139  76483
## 28        590   5149        0.5    69.03   11.5    65.2   188 109889
## 29        812   4281        0.7    71.23    3.3    57.6   174   9027
## 30       7333   5237        1.1    70.93    5.2    52.5   115   7521
## 31       1144   3601        2.2    70.32    9.7    55.2   120 121412
## 32      18076   4903        1.4    70.55   10.9    52.7    82  47831
## 33       5441   3875        1.8    69.21   11.1    38.5    80  48798
## 34        637   5087        0.8    72.78    1.4    50.3   186  69273
## 35      10735   4561        0.8    70.82    7.4    53.2   124  40975
## 36       2715   3983        1.1    71.42    6.4    51.6    82  68782
## 37       2284   4660        0.6    72.13    4.2    60.0    44  96184
## 38      11860   4449        1.0    70.43    6.1    50.2   126  44966
## 39        931   4558        1.3    71.90    2.4    46.4   127   1049
## 40       2816   3635        2.3    67.96   11.6    37.8    65  30225
## 41        681   4167        0.5    72.08    1.7    53.3   172  75955
## 42       4173   3821        1.7    70.11   11.0    41.8    70  41328
## 43      12237   4188        2.2    70.90   12.2    47.4    35 262134
## 44       1203   4022        0.6    72.90    4.5    67.3   137  82096
## 45        472   3907        0.6    71.64    5.5    57.1   168   9267
## 46       4981   4701        1.4    70.08    9.5    47.8    85  39780
## 47       3559   4864        0.6    71.72    4.3    63.5    32  66570
## 48       1799   3617        1.4    69.48    6.7    41.6   100  24070
## 49       4589   4468        0.7    72.48    3.0    54.5   149  54464
## 50        376   4566        0.6    70.29    6.9    62.9   173  97203

Hypothesis:

## Full model: Life.Exp regressed on every other column of statedata2
mod.1 <- lm(Life.Exp ~ ., data = statedata2)
## Null model: intercept only
mod.0 <- lm(Life.Exp ~ 1, data = statedata2)

Stepwise Variable Selection:

# Stepwise selection by AIC, starting from the null model; at each step a term
# may be added or dropped, with the full model's formula as the upper scope.
step(mod.0, scope = formula(mod.1), direction = "both")
## Start:  AIC=30.44
## Life.Exp ~ 1
## 
##              Df Sum of Sq    RSS     AIC
## + Murder      1    53.838 34.461 -14.609
## + Illiteracy  1    30.578 57.721  11.179
## + HS.Grad     1    29.931 58.368  11.737
## + Income      1    10.223 78.076  26.283
## + Frost       1     6.064 82.235  28.878
## <none>                    88.299  30.435
## + Area        1     1.017 87.282  31.856
## + Population  1     0.409 87.890  32.203
## 
## Step:  AIC=-14.61
## Life.Exp ~ Murder
## 
##              Df Sum of Sq    RSS     AIC
## + HS.Grad     1     4.691 29.770 -19.925
## + Population  1     4.016 30.445 -18.805
## + Frost       1     3.135 31.327 -17.378
## + Income      1     2.405 32.057 -16.226
## <none>                    34.461 -14.609
## + Area        1     0.470 33.992 -13.295
## + Illiteracy  1     0.273 34.188 -13.007
## - Murder      1    53.838 88.299  30.435
## 
## Step:  AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
## 
##              Df Sum of Sq    RSS     AIC
## + Frost       1    4.3987 25.372 -25.920
## + Population  1    3.3405 26.430 -23.877
## <none>                    29.770 -19.925
## + Illiteracy  1    0.4419 29.328 -18.673
## + Area        1    0.2775 29.493 -18.394
## + Income      1    0.1022 29.668 -18.097
## - HS.Grad     1    4.6910 34.461 -14.609
## - Murder      1   28.5974 58.368  11.737
## 
## Step:  AIC=-25.92
## Life.Exp ~ Murder + HS.Grad + Frost
## 
##              Df Sum of Sq    RSS     AIC
## + Population  1     2.064 23.308 -28.161
## <none>                    25.372 -25.920
## + Income      1     0.182 25.189 -24.280
## + Illiteracy  1     0.172 25.200 -24.259
## + Area        1     0.026 25.346 -23.970
## - Frost       1     4.399 29.770 -19.925
## - HS.Grad     1     5.955 31.327 -17.378
## - Murder      1    32.756 58.128  13.531
## 
## Step:  AIC=-28.16
## Life.Exp ~ Murder + HS.Grad + Frost + Population
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    23.308 -28.161
## + Income      1     0.006 23.302 -26.174
## + Illiteracy  1     0.004 23.304 -26.170
## + Area        1     0.001 23.307 -26.163
## - Population  1     2.064 25.372 -25.920
## - Frost       1     3.122 26.430 -23.877
## - HS.Grad     1     5.112 28.420 -20.246
## - Murder      1    34.816 58.124  15.528
## 
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Frost + Population, 
##     data = statedata2)
## 
## Coefficients:
## (Intercept)       Murder      HS.Grad        Frost   Population  
##   7.103e+01   -3.001e-01    4.658e-02   -5.943e-03    5.014e-05

Final Model:
Life expectancy ~ Population + Murder + HS Grad + Frost

The final model includes the following variables:
-life expectancy (response)
-population (predictor)
-murder (predictor)
-HS Graduation status (predictor)
-Frost (predictor)

AIC: -28.16

Summary:

# Re-run the stepwise search and summarise the model it settles on.
selected_fit <- step(mod.0, scope = formula(mod.1))
summary(selected_fit)
## Start:  AIC=30.44
## Life.Exp ~ 1
## 
##              Df Sum of Sq    RSS     AIC
## + Murder      1    53.838 34.461 -14.609
## + Illiteracy  1    30.578 57.721  11.179
## + HS.Grad     1    29.931 58.368  11.737
## + Income      1    10.223 78.076  26.283
## + Frost       1     6.064 82.235  28.878
## <none>                    88.299  30.435
## + Area        1     1.017 87.282  31.856
## + Population  1     0.409 87.890  32.203
## 
## Step:  AIC=-14.61
## Life.Exp ~ Murder
## 
##              Df Sum of Sq    RSS     AIC
## + HS.Grad     1     4.691 29.770 -19.925
## + Population  1     4.016 30.445 -18.805
## + Frost       1     3.135 31.327 -17.378
## + Income      1     2.405 32.057 -16.226
## <none>                    34.461 -14.609
## + Area        1     0.470 33.992 -13.295
## + Illiteracy  1     0.273 34.188 -13.007
## - Murder      1    53.838 88.299  30.435
## 
## Step:  AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
## 
##              Df Sum of Sq    RSS     AIC
## + Frost       1    4.3987 25.372 -25.920
## + Population  1    3.3405 26.430 -23.877
## <none>                    29.770 -19.925
## + Illiteracy  1    0.4419 29.328 -18.673
## + Area        1    0.2775 29.493 -18.394
## + Income      1    0.1022 29.668 -18.097
## - HS.Grad     1    4.6910 34.461 -14.609
## - Murder      1   28.5974 58.368  11.737
## 
## Step:  AIC=-25.92
## Life.Exp ~ Murder + HS.Grad + Frost
## 
##              Df Sum of Sq    RSS     AIC
## + Population  1     2.064 23.308 -28.161
## <none>                    25.372 -25.920
## + Income      1     0.182 25.189 -24.280
## + Illiteracy  1     0.172 25.200 -24.259
## + Area        1     0.026 25.346 -23.970
## - Frost       1     4.399 29.770 -19.925
## - HS.Grad     1     5.955 31.327 -17.378
## - Murder      1    32.756 58.128  13.531
## 
## Step:  AIC=-28.16
## Life.Exp ~ Murder + HS.Grad + Frost + Population
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    23.308 -28.161
## + Income      1     0.006 23.302 -26.174
## + Illiteracy  1     0.004 23.304 -26.170
## + Area        1     0.001 23.307 -26.163
## - Population  1     2.064 25.372 -25.920
## - Frost       1     3.122 26.430 -23.877
## - HS.Grad     1     5.112 28.420 -20.246
## - Murder      1    34.816 58.124  15.528
## 
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Frost + Population, 
##     data = statedata2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47095 -0.53464 -0.03701  0.57621  1.50683 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.103e+01  9.529e-01  74.542  < 2e-16 ***
## Murder      -3.001e-01  3.661e-02  -8.199 1.77e-10 ***
## HS.Grad      4.658e-02  1.483e-02   3.142  0.00297 ** 
## Frost       -5.943e-03  2.421e-03  -2.455  0.01802 *  
## Population   5.014e-05  2.512e-05   1.996  0.05201 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared:  0.736,  Adjusted R-squared:  0.7126 
## F-statistic: 31.37 on 4 and 45 DF,  p-value: 1.696e-12

Goodness of Fit
-F Statistic: 31.37
-P-Value: 1.696e-12
-Adjusted R-squared: 0.7126
Utah Prediction

# Keep the stepwise-selected model so it can be passed to predict().
goodmodel <- step(mod.0, scope = formula(mod.1))
## Start:  AIC=30.44
## Life.Exp ~ 1
## 
##              Df Sum of Sq    RSS     AIC
## + Murder      1    53.838 34.461 -14.609
## + Illiteracy  1    30.578 57.721  11.179
## + HS.Grad     1    29.931 58.368  11.737
## + Income      1    10.223 78.076  26.283
## + Frost       1     6.064 82.235  28.878
## <none>                    88.299  30.435
## + Area        1     1.017 87.282  31.856
## + Population  1     0.409 87.890  32.203
## 
## Step:  AIC=-14.61
## Life.Exp ~ Murder
## 
##              Df Sum of Sq    RSS     AIC
## + HS.Grad     1     4.691 29.770 -19.925
## + Population  1     4.016 30.445 -18.805
## + Frost       1     3.135 31.327 -17.378
## + Income      1     2.405 32.057 -16.226
## <none>                    34.461 -14.609
## + Area        1     0.470 33.992 -13.295
## + Illiteracy  1     0.273 34.188 -13.007
## - Murder      1    53.838 88.299  30.435
## 
## Step:  AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
## 
##              Df Sum of Sq    RSS     AIC
## + Frost       1    4.3987 25.372 -25.920
## + Population  1    3.3405 26.430 -23.877
## <none>                    29.770 -19.925
## + Illiteracy  1    0.4419 29.328 -18.673
## + Area        1    0.2775 29.493 -18.394
## + Income      1    0.1022 29.668 -18.097
## - HS.Grad     1    4.6910 34.461 -14.609
## - Murder      1   28.5974 58.368  11.737
## 
## Step:  AIC=-25.92
## Life.Exp ~ Murder + HS.Grad + Frost
## 
##              Df Sum of Sq    RSS     AIC
## + Population  1     2.064 23.308 -28.161
## <none>                    25.372 -25.920
## + Income      1     0.182 25.189 -24.280
## + Illiteracy  1     0.172 25.200 -24.259
## + Area        1     0.026 25.346 -23.970
## - Frost       1     4.399 29.770 -19.925
## - HS.Grad     1     5.955 31.327 -17.378
## - Murder      1    32.756 58.128  13.531
## 
## Step:  AIC=-28.16
## Life.Exp ~ Murder + HS.Grad + Frost + Population
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    23.308 -28.161
## + Income      1     0.006 23.302 -26.174
## + Illiteracy  1     0.004 23.304 -26.170
## + Area        1     0.001 23.307 -26.163
## - Population  1     2.064 25.372 -25.920
## - Frost       1     3.122 26.430 -23.877
## - HS.Grad     1     5.112 28.420 -20.246
## - Murder      1    34.816 58.124  15.528
# Utah prediction: start from Utah's existing row (row 44) and overwrite the
# predictors that have updated values. Columns are addressed by name rather
# than by position so the updates do not depend on the column order.
(newstate <- statedata[44, ])
##    State Population Income Illiteracy Life.Exp Murder HS.Grad Frost  Area
## 44    UT       1203   4022        0.6     72.9    4.5    67.3   137 82096
(newstate["Population"] <- 2785)
## [1] 2785
(newstate["HS.Grad"] <- 75)
## [1] 75
(newstate["Murder"] <- 1.3)
## [1] 1.3
# Prediction interval (predict.lm's default level is 0.95) for a state with
# these predictor values; the argument name and value are spelled out in full
# instead of relying on partial matching.
predict(goodmodel, newdata = newstate, interval = "prediction")
##         fit     lwr      upr
## 44 73.45601 71.8809 75.03112

New Life Expectancy: 73.45601 years
95% Prediction Interval: 71.8809 - 75.03112

California Prediction

# California prediction: start from California's existing row (row 5) and
# overwrite the predictors that have updated values. Columns are addressed by
# name rather than by position so the updates do not depend on column order.
(newstate2 <- statedata[5, ])
##   State Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
## 5    CA      21198   5114        1.1    71.71   10.3    62.6    20 156361
(newstate2["Population"] <- 36962)
## [1] 36962
(newstate2["HS.Grad"] <- 68.3)
## [1] 68.3
(newstate2["Murder"] <- 5.3)
## [1] 5.3
# Prediction interval (predict.lm's default level is 0.95) for a state with
# these predictor values; the argument name and value are spelled out in full
# instead of relying on partial matching.
predict(goodmodel, newdata = newstate2, interval = "prediction")
##        fit      lwr      upr
## 5 74.35232 72.11398 76.59065

New Life Expectancy: 74.35232 years
95% Prediction Interval: 72.11398 - 76.59065

Exercise 2

# Read the body-temperature data and recode sex as a labelled factor.
normtemp <- read.csv("../datafiles/normtemp.csv")
# NOTE(review): factor() applies labels to the sorted codes, so this assumes
# the smaller code denotes male -- confirm against the data description.
sex_labels <- c("male", "female")
(normtemp$sex <- factor(normtemp$sex, labels = sex_labels))
##   [1] male   male   male   male   male   male   male   male   male   male  
##  [11] male   male   male   male   male   male   male   male   male   male  
##  [21] male   male   male   male   male   male   male   male   male   male  
##  [31] male   male   male   male   male   male   male   male   male   male  
##  [41] male   male   male   male   male   male   male   male   male   male  
##  [51] male   male   male   male   male   male   male   male   male   male  
##  [61] male   male   male   male   male   female female female female female
##  [71] female female female female female female female female female female
##  [81] female female female female female female female female female female
##  [91] female female female female female female female female female female
## [101] female female female female female female female female female female
## [111] female female female female female female female female female female
## [121] female female female female female female female female female female
## Levels: male female
# Centre weight on its sample mean, then test the linear association between
# temperature and the centred weight.
weight_mean <- mean(normtemp$weight)
normtemp$weight <- normtemp$weight - weight_mean
cor.test(normtemp$temp, normtemp$weight, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  normtemp$temp and normtemp$weight
## t = 2.9668, df = 128, p-value = 0.003591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.08519113 0.40802170
## sample estimates:
##       cor 
## 0.2536564

Pearson’s Correlation Coefficient: 0.2536564

Model:

# Regress temperature on mean-centred weight and sex (additive model, no
# interaction term).
# NOTE(review): every term is written as normtemp$..., so the variables are
# taken directly from the data frame and `data = normtemp` is effectively
# redundant. Bare column names (temp ~ weight + sex) would be the usual form,
# but that changes the printed Call and coefficient labels, and the reduced
# model would need the same change for anova() to compare the two.
weightandsexlm <- lm(normtemp$temp ~ normtemp$weight + normtemp$sex, data = normtemp)
summary(weightandsexlm)
## 
## Call:
## lm(formula = normtemp$temp ~ normtemp$weight + normtemp$sex, 
##     data = normtemp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.86363 -0.45624  0.01841  0.47366  2.33424 
## 
## Coefficients:
##                     Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)        98.114528   0.087102 1126.428  < 2e-16 ***
## normtemp$weight     0.025267   0.008762    2.884  0.00462 ** 
## normtemp$sexfemale  0.269406   0.123277    2.185  0.03070 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7017 on 127 degrees of freedom
## Multiple R-squared:  0.09825,    Adjusted R-squared:  0.08405 
## F-statistic: 6.919 on 2 and 127 DF,  p-value: 0.001406

Goodness of Fit:
-F statistic: 6.919
-P-Value: 0.001406
-Adjusted R-Squared: 0.08405
Coefficients & Interpretations
-98.1145277 (intercept)
-0.0252667 (slope for weight)
-0.2694061 (offset for female)
The intercept, 98.114, is the estimated average temperature for a male of average weight (weight was centered on its mean before fitting). Each one-unit increase in weight raises the expected temperature by about 0.0253 degrees, holding sex constant; because the model has no interaction term, this slope is the same for males and females.
A female of average weight has an expected temperature about 0.269 degrees higher than a male of average weight.

subset model only weight

# Reduced model: temperature explained by (centred) weight alone.
normtemp2 <- lm(normtemp$temp ~ normtemp$weight, data = normtemp)
summary(normtemp2)
## 
## Call:
## lm(formula = normtemp$temp ~ normtemp$weight, data = normtemp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85017 -0.39999  0.01033  0.43915  2.46549 
## 
## Coefficients:
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)     98.249231   0.062444 1573.402  < 2e-16 ***
## normtemp$weight  0.026335   0.008876    2.967  0.00359 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.712 on 128 degrees of freedom
## Multiple R-squared:  0.06434,    Adjusted R-squared:  0.05703 
## F-statistic: 8.802 on 1 and 128 DF,  p-value: 0.003591

Goodness of Fit:
-F statistic: 8.802
-P-Value: 0.003591
-Adjusted R-Squared: 0.05703

# Partial F-test of the weight + sex model against the weight-only model.
# The larger model is passed first, which is why Df and Sum of Sq are
# reported as negative in the table.
anova(weightandsexlm, normtemp2)
## Analysis of Variance Table
## 
## Model 1: normtemp$temp ~ normtemp$weight + normtemp$sex
## Model 2: normtemp$temp ~ normtemp$weight
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)  
## 1    127 62.532                             
## 2    128 64.883 -1   -2.3515 4.7758 0.0307 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Which model is better?

Hi Simon,
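The two models fit similarly, so this is a close call. Note that the ANOVA table is a partial F-test comparing the two models, so its F-statistic (4.7758) and p-value (0.0307) describe the comparison rather than either model on its own. Adding sex lowers the RSS from 64.883 to 62.532, and since p = 0.0307 < 0.05 that improvement is statistically significant, although modest. I would therefore choose the more complex model (weight + sex), which also has the higher adjusted R-squared (0.08405 versus 0.05703).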