# Load the state data set. The first column holds the state abbreviation.
statedata <- read.csv("../datafiles/statedata.csv")
# Drop that identifier column so only the numeric variables remain for
# modelling; the outer parentheses echo the resulting data frame.
(statedata2 <- statedata[, -1])
## Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## 1 3615 3624 2.1 69.05 15.1 41.3 20 50708
## 2 365 6315 1.5 69.31 11.3 66.7 152 566432
## 3 2212 4530 1.8 70.55 7.8 58.1 15 113417
## 4 2110 3378 1.9 70.66 10.1 39.9 65 51945
## 5 21198 5114 1.1 71.71 10.3 62.6 20 156361
## 6 2541 4884 0.7 72.06 6.8 63.9 166 103766
## 7 3100 5348 1.1 72.48 3.1 56.0 139 4862
## 8 579 4809 0.9 70.06 6.2 54.6 103 1982
## 9 8277 4815 1.3 70.66 10.7 52.6 11 54090
## 10 4931 4091 2.0 68.54 13.9 40.6 60 58073
## 11 868 4963 1.9 73.60 6.2 61.9 0 6425
## 12 813 4119 0.6 71.87 5.3 59.5 126 82677
## 13 11197 5107 0.9 70.14 10.3 52.6 127 55748
## 14 5313 4458 0.7 70.88 7.1 52.9 122 36097
## 15 2861 4628 0.5 72.56 2.3 59.0 140 55941
## 16 2280 4669 0.6 72.58 4.5 59.9 114 81787
## 17 3387 3712 1.6 70.10 10.6 38.5 95 39650
## 18 3806 3545 2.8 68.76 13.2 42.2 12 44930
## 19 1058 3694 0.7 70.39 2.7 54.7 161 30920
## 20 4122 5299 0.9 70.22 8.5 52.3 101 9891
## 21 5814 4755 1.1 71.83 3.3 58.5 103 7826
## 22 9111 4751 0.9 70.63 11.1 52.8 125 56817
## 23 3921 4675 0.6 72.96 2.3 57.6 160 79289
## 24 2341 3098 2.4 68.09 12.5 41.0 50 47296
## 25 4767 4254 0.8 70.69 9.3 48.8 108 68995
## 26 746 4347 0.6 70.56 5.0 59.2 155 145587
## 27 1544 4508 0.6 72.60 2.9 59.3 139 76483
## 28 590 5149 0.5 69.03 11.5 65.2 188 109889
## 29 812 4281 0.7 71.23 3.3 57.6 174 9027
## 30 7333 5237 1.1 70.93 5.2 52.5 115 7521
## 31 1144 3601 2.2 70.32 9.7 55.2 120 121412
## 32 18076 4903 1.4 70.55 10.9 52.7 82 47831
## 33 5441 3875 1.8 69.21 11.1 38.5 80 48798
## 34 637 5087 0.8 72.78 1.4 50.3 186 69273
## 35 10735 4561 0.8 70.82 7.4 53.2 124 40975
## 36 2715 3983 1.1 71.42 6.4 51.6 82 68782
## 37 2284 4660 0.6 72.13 4.2 60.0 44 96184
## 38 11860 4449 1.0 70.43 6.1 50.2 126 44966
## 39 931 4558 1.3 71.90 2.4 46.4 127 1049
## 40 2816 3635 2.3 67.96 11.6 37.8 65 30225
## 41 681 4167 0.5 72.08 1.7 53.3 172 75955
## 42 4173 3821 1.7 70.11 11.0 41.8 70 41328
## 43 12237 4188 2.2 70.90 12.2 47.4 35 262134
## 44 1203 4022 0.6 72.90 4.5 67.3 137 82096
## 45 472 3907 0.6 71.64 5.5 57.1 168 9267
## 46 4981 4701 1.4 70.08 9.5 47.8 85 39780
## 47 3559 4864 0.6 71.72 4.3 63.5 32 66570
## 48 1799 3617 1.4 69.48 6.7 41.6 100 24070
## 49 4589 4468 0.7 72.48 3.0 54.5 149 54464
## 50 376 4566 0.6 70.29 6.9 62.9 173 97203
Hypothesis:
# Full model: life expectancy regressed on every other column of statedata2.
mod.1 <- lm(formula = Life.Exp ~ ., data = statedata2)
# Null model: intercept only.
mod.0 <- lm(formula = Life.Exp ~ 1, data = statedata2)
Stepwise Variable Selection:
# Stepwise AIC search starting from the null model, with the full model's
# formula as the scope. "both" is the default direction when a scope is given.
step(object = mod.0, scope = formula(mod.1), direction = "both")
## Start: AIC=30.44
## Life.Exp ~ 1
##
## Df Sum of Sq RSS AIC
## + Murder 1 53.838 34.461 -14.609
## + Illiteracy 1 30.578 57.721 11.179
## + HS.Grad 1 29.931 58.368 11.737
## + Income 1 10.223 78.076 26.283
## + Frost 1 6.064 82.235 28.878
## <none> 88.299 30.435
## + Area 1 1.017 87.282 31.856
## + Population 1 0.409 87.890 32.203
##
## Step: AIC=-14.61
## Life.Exp ~ Murder
##
## Df Sum of Sq RSS AIC
## + HS.Grad 1 4.691 29.770 -19.925
## + Population 1 4.016 30.445 -18.805
## + Frost 1 3.135 31.327 -17.378
## + Income 1 2.405 32.057 -16.226
## <none> 34.461 -14.609
## + Area 1 0.470 33.992 -13.295
## + Illiteracy 1 0.273 34.188 -13.007
## - Murder 1 53.838 88.299 30.435
##
## Step: AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
##
## Df Sum of Sq RSS AIC
## + Frost 1 4.3987 25.372 -25.920
## + Population 1 3.3405 26.430 -23.877
## <none> 29.770 -19.925
## + Illiteracy 1 0.4419 29.328 -18.673
## + Area 1 0.2775 29.493 -18.394
## + Income 1 0.1022 29.668 -18.097
## - HS.Grad 1 4.6910 34.461 -14.609
## - Murder 1 28.5974 58.368 11.737
##
## Step: AIC=-25.92
## Life.Exp ~ Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## + Population 1 2.064 23.308 -28.161
## <none> 25.372 -25.920
## + Income 1 0.182 25.189 -24.280
## + Illiteracy 1 0.172 25.200 -24.259
## + Area 1 0.026 25.346 -23.970
## - Frost 1 4.399 29.770 -19.925
## - HS.Grad 1 5.955 31.327 -17.378
## - Murder 1 32.756 58.128 13.531
##
## Step: AIC=-28.16
## Life.Exp ~ Murder + HS.Grad + Frost + Population
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -28.161
## + Income 1 0.006 23.302 -26.174
## + Illiteracy 1 0.004 23.304 -26.170
## + Area 1 0.001 23.307 -26.163
## - Population 1 2.064 25.372 -25.920
## - Frost 1 3.122 26.430 -23.877
## - HS.Grad 1 5.112 28.420 -20.246
## - Murder 1 34.816 58.124 15.528
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Frost + Population,
## data = statedata2)
##
## Coefficients:
## (Intercept) Murder HS.Grad Frost Population
## 7.103e+01 -3.001e-01 4.658e-02 -5.943e-03 5.014e-05
Final Model:
Life expectancy ~ Population + Murder + HS Grad + Frost
The final model includes the following variables:
-life expectancy (response variable)
-population
-murder
-HS Graduation status
-Frost
AIC: -28.16
Summary:
# Re-run the stepwise search (its trace is printed again) and summarise the
# model it selects.
summary(
  step(object = mod.0, scope = formula(mod.1), direction = "both")
)
## Start: AIC=30.44
## Life.Exp ~ 1
##
## Df Sum of Sq RSS AIC
## + Murder 1 53.838 34.461 -14.609
## + Illiteracy 1 30.578 57.721 11.179
## + HS.Grad 1 29.931 58.368 11.737
## + Income 1 10.223 78.076 26.283
## + Frost 1 6.064 82.235 28.878
## <none> 88.299 30.435
## + Area 1 1.017 87.282 31.856
## + Population 1 0.409 87.890 32.203
##
## Step: AIC=-14.61
## Life.Exp ~ Murder
##
## Df Sum of Sq RSS AIC
## + HS.Grad 1 4.691 29.770 -19.925
## + Population 1 4.016 30.445 -18.805
## + Frost 1 3.135 31.327 -17.378
## + Income 1 2.405 32.057 -16.226
## <none> 34.461 -14.609
## + Area 1 0.470 33.992 -13.295
## + Illiteracy 1 0.273 34.188 -13.007
## - Murder 1 53.838 88.299 30.435
##
## Step: AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
##
## Df Sum of Sq RSS AIC
## + Frost 1 4.3987 25.372 -25.920
## + Population 1 3.3405 26.430 -23.877
## <none> 29.770 -19.925
## + Illiteracy 1 0.4419 29.328 -18.673
## + Area 1 0.2775 29.493 -18.394
## + Income 1 0.1022 29.668 -18.097
## - HS.Grad 1 4.6910 34.461 -14.609
## - Murder 1 28.5974 58.368 11.737
##
## Step: AIC=-25.92
## Life.Exp ~ Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## + Population 1 2.064 23.308 -28.161
## <none> 25.372 -25.920
## + Income 1 0.182 25.189 -24.280
## + Illiteracy 1 0.172 25.200 -24.259
## + Area 1 0.026 25.346 -23.970
## - Frost 1 4.399 29.770 -19.925
## - HS.Grad 1 5.955 31.327 -17.378
## - Murder 1 32.756 58.128 13.531
##
## Step: AIC=-28.16
## Life.Exp ~ Murder + HS.Grad + Frost + Population
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -28.161
## + Income 1 0.006 23.302 -26.174
## + Illiteracy 1 0.004 23.304 -26.170
## + Area 1 0.001 23.307 -26.163
## - Population 1 2.064 25.372 -25.920
## - Frost 1 3.122 26.430 -23.877
## - HS.Grad 1 5.112 28.420 -20.246
## - Murder 1 34.816 58.124 15.528
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Frost + Population,
## data = statedata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47095 -0.53464 -0.03701 0.57621 1.50683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.103e+01 9.529e-01 74.542 < 2e-16 ***
## Murder -3.001e-01 3.661e-02 -8.199 1.77e-10 ***
## HS.Grad 4.658e-02 1.483e-02 3.142 0.00297 **
## Frost -5.943e-03 2.421e-03 -2.455 0.01802 *
## Population 5.014e-05 2.512e-05 1.996 0.05201 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7126
## F-statistic: 31.37 on 4 and 45 DF, p-value: 1.696e-12
Goodness of Fit
-F Statistic: 31.37
-P-Value: 1.696e-12
-Adjusted R-squared: 0.7126
Utah Prediction
# Keep the model chosen by the stepwise search so it can be used for prediction.
goodmodel <- step(object = mod.0, scope = formula(mod.1), direction = "both")
## Start: AIC=30.44
## Life.Exp ~ 1
##
## Df Sum of Sq RSS AIC
## + Murder 1 53.838 34.461 -14.609
## + Illiteracy 1 30.578 57.721 11.179
## + HS.Grad 1 29.931 58.368 11.737
## + Income 1 10.223 78.076 26.283
## + Frost 1 6.064 82.235 28.878
## <none> 88.299 30.435
## + Area 1 1.017 87.282 31.856
## + Population 1 0.409 87.890 32.203
##
## Step: AIC=-14.61
## Life.Exp ~ Murder
##
## Df Sum of Sq RSS AIC
## + HS.Grad 1 4.691 29.770 -19.925
## + Population 1 4.016 30.445 -18.805
## + Frost 1 3.135 31.327 -17.378
## + Income 1 2.405 32.057 -16.226
## <none> 34.461 -14.609
## + Area 1 0.470 33.992 -13.295
## + Illiteracy 1 0.273 34.188 -13.007
## - Murder 1 53.838 88.299 30.435
##
## Step: AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
##
## Df Sum of Sq RSS AIC
## + Frost 1 4.3987 25.372 -25.920
## + Population 1 3.3405 26.430 -23.877
## <none> 29.770 -19.925
## + Illiteracy 1 0.4419 29.328 -18.673
## + Area 1 0.2775 29.493 -18.394
## + Income 1 0.1022 29.668 -18.097
## - HS.Grad 1 4.6910 34.461 -14.609
## - Murder 1 28.5974 58.368 11.737
##
## Step: AIC=-25.92
## Life.Exp ~ Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## + Population 1 2.064 23.308 -28.161
## <none> 25.372 -25.920
## + Income 1 0.182 25.189 -24.280
## + Illiteracy 1 0.172 25.200 -24.259
## + Area 1 0.026 25.346 -23.970
## - Frost 1 4.399 29.770 -19.925
## - HS.Grad 1 5.955 31.327 -17.378
## - Murder 1 32.756 58.128 13.531
##
## Step: AIC=-28.16
## Life.Exp ~ Murder + HS.Grad + Frost + Population
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -28.161
## + Income 1 0.006 23.302 -26.174
## + Illiteracy 1 0.004 23.304 -26.170
## + Area 1 0.001 23.307 -26.163
## - Population 1 2.064 25.372 -25.920
## - Frost 1 3.122 26.430 -23.877
## - HS.Grad 1 5.112 28.420 -20.246
## - Murder 1 34.816 58.124 15.528
# Start from Utah's row (row 44) and overwrite the predictors that have new
# values. Columns are addressed by name rather than by position so the
# assignments keep working if the column order changes.
(newstate <- statedata[44, ])
## State Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## 44 UT 1203 4022 0.6 72.9 4.5 67.3 137 82096
(newstate$Population <- 2785)
## [1] 2785
(newstate$HS.Grad <- 75)
## [1] 75
(newstate$Murder <- 1.3)
## [1] 1.3
# Point prediction with a prediction interval; argument names are spelled out
# in full instead of relying on partial matching.
predict(goodmodel, newdata = newstate, interval = "prediction")
## fit lwr upr
## 44 73.45601 71.8809 75.03112
New Life Expectancy: 73.45601 years
95% Prediction Interval: 71.8809 - 75.03112
California Prediction
# Start from California's row (row 5) and overwrite the predictors that have
# new values. Columns are addressed by name rather than by position so the
# assignments keep working if the column order changes.
(newstate2 <- statedata[5, ])
## State Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
## 5 CA 21198 5114 1.1 71.71 10.3 62.6 20 156361
(newstate2$Population <- 36962)
## [1] 36962
(newstate2$HS.Grad <- 68.3)
## [1] 68.3
(newstate2$Murder <- 5.3)
## [1] 5.3
# Point prediction with a prediction interval; argument names are spelled out
# in full instead of relying on partial matching.
predict(goodmodel, newdata = newstate2, interval = "prediction")
## fit lwr upr
## 5 74.35232 72.11398 76.59065
New Life Expectancy: 74.35232 years
95% Prediction Interval: 72.11398 - 76.59065
# Load the body-temperature data.
normtemp <- read.csv(file = "../datafiles/normtemp.csv")
# Recode sex as a factor labelled "male"/"female"; the outer parentheses echo
# the recoded column.
(normtemp$sex <- factor(x = normtemp$sex, labels = c("male", "female")))
## [1] male male male male male male male male male male
## [11] male male male male male male male male male male
## [21] male male male male male male male male male male
## [31] male male male male male male male male male male
## [41] male male male male male male male male male male
## [51] male male male male male male male male male male
## [61] male male male male male female female female female female
## [71] female female female female female female female female female female
## [81] female female female female female female female female female female
## [91] female female female female female female female female female female
## [101] female female female female female female female female female female
## [111] female female female female female female female female female female
## [121] female female female female female female female female female female
## Levels: male female
# Mean-centre weight so that a value of 0 corresponds to the average weight.
normtemp$weight <- normtemp$weight - mean(normtemp$weight)
# Pearson correlation between temperature and the centred weight.
cor.test(x = normtemp$temp, y = normtemp$weight, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: normtemp$temp and normtemp$weight
## t = 2.9668, df = 128, p-value = 0.003591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.08519113 0.40802170
## sample estimates:
## cor
## 0.2536564
Pearson’s Correlation Coefficient: 0.2536564
Model:
# Regress temperature on centred weight and sex. The `normtemp$` prefixes in
# the formula are kept as written because the printed Call and coefficient
# labels depend on them.
weightandsexlm <- lm(
  formula = normtemp$temp ~ normtemp$weight + normtemp$sex,
  data = normtemp
)
summary(weightandsexlm)
##
## Call:
## lm(formula = normtemp$temp ~ normtemp$weight + normtemp$sex,
## data = normtemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.86363 -0.45624 0.01841 0.47366 2.33424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.114528 0.087102 1126.428 < 2e-16 ***
## normtemp$weight 0.025267 0.008762 2.884 0.00462 **
## normtemp$sexfemale 0.269406 0.123277 2.185 0.03070 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7017 on 127 degrees of freedom
## Multiple R-squared: 0.09825, Adjusted R-squared: 0.08405
## F-statistic: 6.919 on 2 and 127 DF, p-value: 0.001406
Goodness of Fit -F statistic: 6.919
-P-Value: 0.001406
-Adjusted R-Squared: 0.08405
Coefficients & Interpretations
- 98.1145277 (intercept)
- 0.0252667 (slope for weight)
- 0.2694061 (difference for female relative to male)
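We can interpret the intercept, 98.114, as the estimated average temperature for a male of average weight (weight was mean-centered, so 0 corresponds to the average weight). Each one-unit increase in weight is associated with an increase of about 0.0253 degrees in average temperature, holding sex fixed.
For a female of average weight, the estimated average temperature is 0.269 degrees higher than the average temperature for a male of average weight; because the model has no interaction term, the same difference applies at any given weight.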
Subset model: weight only
# Reduced model: temperature on centred weight only. The `normtemp$` prefixes
# are kept as written because the printed Call and labels depend on them.
normtemp2 <- lm(
  formula = normtemp$temp ~ normtemp$weight,
  data = normtemp
)
summary(normtemp2)
##
## Call:
## lm(formula = normtemp$temp ~ normtemp$weight, data = normtemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85017 -0.39999 0.01033 0.43915 2.46549
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.249231 0.062444 1573.402 < 2e-16 ***
## normtemp$weight 0.026335 0.008876 2.967 0.00359 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.712 on 128 degrees of freedom
## Multiple R-squared: 0.06434, Adjusted R-squared: 0.05703
## F-statistic: 8.802 on 1 and 128 DF, p-value: 0.003591
Goodness of Fit:
-F statistic: 8.802
-P-Value: 0.003591
-Adjusted R-Squared: 0.05703
# Partial F-test for adding sex to the weight-only model. The nested (smaller)
# model is listed first so the table reports the added term with a positive
# Df and Sum of Sq; the F statistic and p-value do not depend on the order.
anova(normtemp2, weightandsexlm)
## Analysis of Variance Table
##
## Model 1: normtemp$temp ~ normtemp$weight
## Model 2: normtemp$temp ~ normtemp$weight + normtemp$sex
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 128 64.883
## 2 127 62.532 1 2.3515 4.7758 0.0307 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Which model is better?
Hi Simon,
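This question is challenging because the two models fit similarly (RSS of 62.532 vs. 64.883). The p-value (0.0307) and F-statistic (4.7758) in the ANOVA table come from the comparison of the two models rather than from either model on its own; because that p-value is below 0.05, adding sex gives a statistically significant improvement in fit. I think you could go with either, but I would choose the complex model because it has a smaller overall p-value (0.001406 vs. 0.003591), a smaller RSS, and a higher adjusted R-squared (0.08405 vs. 0.05703).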