setwd("~/Desktop/University of Utah PhD /Course Work/Fall 2022 Semester/GEOG6000_Data Analysis/lab04") ##Set working directory
irished = read.csv("../datafiles/irished.csv")
irished$sex = factor(irished$sex,
levels = c(1, 2),
labels = c("Male", "Female")) ##changed 1 and 2 to male and female
hist(irished$DVRT,
main = "DVRT",
breaks = 20,
xlab = "DVRT Scores",
col = "coral")
##See the lab for saving this as a script then running it using the source() function
for (i in 1:10) { print(i) } #Simple loop - runs ten times printing "i"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
source("../datafiles/cointoss.r") #Coin toss loop see the script for the syntax and functions
## [1] "Heads: 47"
## [1] "Tails: 53"
#Question on the loop syntax specifically the "runif" and "if" statement
mySD = function (x) { sqrt (var(x)) } #Creating a function to calculate SD
mySD(irished$DVRT) #Calculates the same as the built in R function "sd"
## [1] 15.45635
runoff = read.csv("../datafiles/SoCal_Runoff.csv")
summary(runoff) #Basic summary statistics for the datafile
## Year AP1 AP2 AP3
## Min. :1948 Min. : 2.700 Min. : 1.450 Min. : 1.77
## 1st Qu.:1958 1st Qu.: 4.975 1st Qu.: 3.390 1st Qu.: 3.36
## Median :1969 Median : 7.080 Median : 4.460 Median : 4.62
## Mean :1969 Mean : 7.323 Mean : 4.652 Mean : 4.93
## 3rd Qu.:1980 3rd Qu.: 9.115 3rd Qu.: 5.685 3rd Qu.: 5.83
## Max. :1990 Max. :18.080 Max. :11.960 Max. :13.02
## OP1 OP2 OP3 RUNOFF
## Min. : 4.050 Min. : 4.350 Min. : 4.600 Min. : 41785
## 1st Qu.: 7.975 1st Qu.: 7.875 1st Qu.: 8.705 1st Qu.: 59857
## Median : 9.550 Median :11.110 Median :12.140 Median : 69177
## Mean :12.836 Mean :12.002 Mean :13.522 Mean : 77756
## 3rd Qu.:16.545 3rd Qu.:14.975 3rd Qu.:16.920 3rd Qu.: 92206
## Max. :43.370 Max. :24.850 Max. :33.070 Max. :146345
pairs(runoff[ , -1]) #Useful for looking at the correlation between variables
#[ , -1] syntax subtracts the first column from the plot
runoff2 = runoff[ , -1] #*NOTE RUNNING THIS MULTIPLE TIMES WILL CONTINUE TO TAKE COLUMNS OUT!*
runoff2.mod1 = lm(RUNOFF ~ . , data = runoff2) # . denotes full model with all the variables
summary(runoff2.mod1)
##
## Call:
## lm(formula = RUNOFF ~ ., data = runoff2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12690 -4936 -1424 4173 18542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15944.67 4099.80 3.889 0.000416 ***
## AP1 -12.77 708.89 -0.018 0.985725
## AP2 -664.41 1522.89 -0.436 0.665237
## AP3 2270.68 1341.29 1.693 0.099112 .
## OP1 69.70 461.69 0.151 0.880839
## OP2 1916.45 641.36 2.988 0.005031 **
## OP3 2211.58 752.69 2.938 0.005729 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7557 on 36 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9123
## F-statistic: 73.82 on 6 and 36 DF, p-value: < 2.2e-16
anova(runoff2.mod1)
## Analysis of Variance Table
##
## Response: RUNOFF
## Df Sum Sq Mean Sq F value Pr(>F)
## AP1 1 1.5567e+09 1.5567e+09 27.2595 7.634e-06 ***
## AP2 1 1.7427e+07 1.7427e+07 0.3052 0.584071
## AP3 1 6.6126e+08 6.6126e+08 11.5794 0.001649 **
## OP1 1 1.9991e+10 1.9991e+10 350.0595 < 2.2e-16 ***
## OP2 1 2.5762e+09 2.5762e+09 45.1115 7.752e-08 ***
## OP3 1 4.9301e+08 4.9301e+08 8.6332 0.005729 **
## Residuals 36 2.0558e+09 5.7106e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Correlation Matrix
cor(runoff2)
## AP1 AP2 AP3 OP1 OP2 OP3
## AP1 1.0000000 0.82768637 0.81607595 0.12238567 0.1544155 0.10754212
## AP2 0.8276864 1.00000000 0.90030474 0.03954211 0.1056396 0.02961175
## AP3 0.8160760 0.90030474 1.00000000 0.09344773 0.1063836 0.10058669
## OP1 0.1223857 0.03954211 0.09344773 1.00000000 0.8647073 0.94334741
## OP2 0.1544155 0.10563959 0.10638359 0.86470733 1.0000000 0.91914467
## OP3 0.1075421 0.02961175 0.10058669 0.94334741 0.9191447 1.00000000
## RUNOFF 0.2385695 0.18329499 0.24934094 0.88574778 0.9196270 0.93843604
## RUNOFF
## AP1 0.2385695
## AP2 0.1832950
## AP3 0.2493409
## OP1 0.8857478
## OP2 0.9196270
## OP3 0.9384360
## RUNOFF 1.0000000
library(ggplot2)
library(ggcorrplot) #Load the correlation plot package
ggcorrplot(cor(runoff2),
method = "square",
type = "full",
lab = TRUE,
colors = c("blue", "darksalmon", "firebrick"))
library(car)
## Loading required package: carData
vif(runoff2.mod1) ##Remember values over 5 indicate collinearity and values over 10 mean it is definitely has a collinear relationship!
## AP1 AP2 AP3 OP1 OP2 OP3
## 3.546344 7.183798 6.748004 9.266088 7.648712 16.970457
This aids to reduce the effect of collinear variables on the model
runoff2.mod2 = lm(RUNOFF ~ AP3 + OP3, data = runoff2)
summary(runoff2.mod2)
##
## Call:
## lm(formula = RUNOFF ~ AP3 + OP3, data = runoff2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13335.8 -5893.2 -171.8 4219.5 19500.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19144.9 3812.0 5.022 1.1e-05 ***
## AP3 1768.8 553.7 3.194 0.00273 **
## OP3 3689.5 196.0 18.829 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8063 on 40 degrees of freedom
## Multiple R-squared: 0.9049, Adjusted R-squared: 0.9002
## F-statistic: 190.3 on 2 and 40 DF, p-value: < 2.2e-16
Based on the F-statistic and the degrees of freedom, it appears that model 2 (Runoff2.mod2) is the better model, despite having a slightly lower \(r^2\) value.
anova(runoff2.mod1, runoff2.mod2)
## Analysis of Variance Table
##
## Model 1: RUNOFF ~ AP1 + AP2 + AP3 + OP1 + OP2 + OP3
## Model 2: RUNOFF ~ AP3 + OP3
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 36 2055830733
## 2 40 2600641788 -4 -544811055 2.3851 0.06933 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Full Model is runoff2.mod1
runoff2.mod0 = lm(RUNOFF ~ 1, data = runoff2) #Null/initial model using one variable. This will be used to stepwise through possible combinations to test which combination of variables fits the best.
step(runoff2.mod0, scope = formula(runoff2.mod1))
## Start: AIC=873.65
## RUNOFF ~ 1
##
## Df Sum of Sq RSS AIC
## + OP3 1 2.4087e+10 3.2640e+09 784.24
## + OP2 1 2.3131e+10 4.2199e+09 795.28
## + OP1 1 2.1458e+10 5.8928e+09 809.64
## + AP3 1 1.7004e+09 2.5651e+10 872.89
## + AP1 1 1.5567e+09 2.5794e+10 873.13
## <none> 2.7351e+10 873.65
## + AP2 1 9.1891e+08 2.6432e+10 874.18
##
## Step: AIC=784.24
## RUNOFF ~ OP3
##
## Df Sum of Sq RSS AIC
## + AP3 1 6.6337e+08 2.6006e+09 776.47
## + AP2 1 6.6199e+08 2.6020e+09 776.49
## + OP2 1 5.7405e+08 2.6900e+09 777.92
## + AP1 1 5.2428e+08 2.7397e+09 778.71
## <none> 3.2640e+09 784.24
## + OP1 1 5.6424e+04 3.2640e+09 786.24
## - OP3 1 2.4087e+10 2.7351e+10 873.65
##
## Step: AIC=776.47
## RUNOFF ~ OP3 + AP3
##
## Df Sum of Sq RSS AIC
## + OP2 1 5.3169e+08 2.0689e+09 768.63
## <none> 2.6006e+09 776.47
## + AP2 1 3.3349e+07 2.5673e+09 777.91
## + AP1 1 1.1041e+07 2.5896e+09 778.28
## + OP1 1 1.2245e+05 2.6005e+09 778.46
## - AP3 1 6.6337e+08 3.2640e+09 784.24
## - OP3 1 2.3050e+10 2.5651e+10 872.89
##
## Step: AIC=768.63
## RUNOFF ~ OP3 + AP3 + OP2
##
## Df Sum of Sq RSS AIC
## <none> 2068947585 768.63
## + AP2 1 11814207 2057133378 770.39
## + AP1 1 1410311 2067537274 770.60
## + OP1 1 583748 2068363837 770.62
## - OP2 1 531694203 2600641788 776.47
## - AP3 1 621012173 2689959758 777.92
## - OP3 1 1515918540 3584866125 790.27
##
## Call:
## lm(formula = RUNOFF ~ OP3 + AP3 + OP2, data = runoff2)
##
## Coefficients:
## (Intercept) OP3 AP3 OP2
## 15425 2390 1712 1797
kidiq = read.csv("../datafiles/kidiq.csv")
str(kidiq)
## 'data.frame': 434 obs. of 5 variables:
## $ kid.score: int 65 98 85 83 115 98 69 106 102 95 ...
## $ mom.hs : int 1 1 1 1 1 0 1 1 1 1 ...
## $ mom.iq : num 121.1 89.4 115.4 99.4 92.7 ...
## $ mom.work : int 4 4 4 3 4 1 4 3 1 1 ...
## $ mom.age : int 27 25 27 25 27 18 20 23 24 19 ...
summary(kidiq)
## kid.score mom.hs mom.iq mom.work
## Min. : 20.0 Min. :0.0000 Min. : 71.04 Min. :1.000
## 1st Qu.: 74.0 1st Qu.:1.0000 1st Qu.: 88.66 1st Qu.:2.000
## Median : 90.0 Median :1.0000 Median : 97.92 Median :3.000
## Mean : 86.8 Mean :0.7857 Mean :100.00 Mean :2.896
## 3rd Qu.:102.0 3rd Qu.:1.0000 3rd Qu.:110.27 3rd Qu.:4.000
## Max. :144.0 Max. :1.0000 Max. :138.89 Max. :4.000
## mom.age
## Min. :17.00
## 1st Qu.:21.00
## Median :23.00
## Mean :22.79
## 3rd Qu.:25.00
## Max. :29.00
kidiq.lm1 = lm(kid.score ~ mom.hs, data = kidiq) #simple linear model of the kid's IQ score as a function of her/his mother's attendance to high school. R automatically treats binary values, 0/1, as a dummy variable
summary(kidiq.lm1)
##
## Call:
## lm(formula = kid.score ~ mom.hs, data = kidiq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.55 -13.32 2.68 14.68 58.45
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.548 2.059 37.670 < 2e-16 ***
## mom.hs 11.771 2.322 5.069 5.96e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.85 on 432 degrees of freedom
## Multiple R-squared: 0.05613, Adjusted R-squared: 0.05394
## F-statistic: 25.69 on 1 and 432 DF, p-value: 5.957e-07
kidiq$mom.work2 = factor(kidiq$mom.work,
labels = c("Tinker", "Tailor", "Soldier", "Spy")) #Added a column to identify labels. Could you use as.factor to convert the 1-4 values into labels? Just want to ensure I understand this correctly.
kidiq.lm2 = lm(kid.score ~ mom.work2, data = kidiq)
summary(kidiq.lm2)
##
## Call:
## lm(formula = kid.score ~ mom.work2, data = kidiq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.85 -12.85 2.79 14.15 50.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.000 2.305 35.568 <2e-16 ***
## mom.work2Tailor 3.854 3.095 1.245 0.2137
## mom.work2Soldier 11.500 3.553 3.237 0.0013 **
## mom.work2Spy 5.210 2.704 1.927 0.0547 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.23 on 430 degrees of freedom
## Multiple R-squared: 0.02444, Adjusted R-squared: 0.01763
## F-statistic: 3.59 on 3 and 430 DF, p-value: 0.01377
-The average IQ of a spy’s child is approximately 87.21.
kidiq$mom.iq2 = (kidiq$mom.iq - mean(kidiq$mom.iq)) / 10 #dividing by ten scales the new column
kidiq.lm3 = lm(kid.score ~ mom.iq2, data = kidiq)
summary(kidiq.lm3)
##
## Call:
## lm(formula = kid.score ~ mom.iq2, data = kidiq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.753 -12.074 2.217 11.710 47.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.7972 0.8768 98.99 <2e-16 ***
## mom.iq2 6.0997 0.5852 10.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.27 on 432 degrees of freedom
## Multiple R-squared: 0.201, Adjusted R-squared: 0.1991
## F-statistic: 108.6 on 1 and 432 DF, p-value: < 2.2e-16
kidiq.lm4 = lm(kid.score ~ mom.iq2 + mom.hs, data = kidiq )
summary(kidiq.lm4)
##
## Call:
## lm(formula = kid.score ~ mom.iq2 + mom.hs, data = kidiq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.873 -12.663 2.404 11.356 49.545
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.1221 1.9437 42.250 < 2e-16 ***
## mom.iq2 5.6391 0.6057 9.309 < 2e-16 ***
## mom.hs 5.9501 2.2118 2.690 0.00742 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.14 on 431 degrees of freedom
## Multiple R-squared: 0.2141, Adjusted R-squared: 0.2105
## F-statistic: 58.72 on 2 and 431 DF, p-value: < 2.2e-16
kidiq.lm.momHS = lm(kid.score ~ mom.hs, data = kidiq)
summary(kidiq.lm.momHS)
##
## Call:
## lm(formula = kid.score ~ mom.hs, data = kidiq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.55 -13.32 2.68 14.68 58.45
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.548 2.059 37.670 < 2e-16 ***
## mom.hs 11.771 2.322 5.069 5.96e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.85 on 432 degrees of freedom
## Multiple R-squared: 0.05613, Adjusted R-squared: 0.05394
## F-statistic: 25.69 on 1 and 432 DF, p-value: 5.957e-07
kidiq.lm5 <- lm(kid.score ~ mom.hs * mom.iq2, data = kidiq)
summary(kidiq.lm5)
##
## Call:
## lm(formula = kid.score ~ mom.hs * mom.iq2, data = kidiq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.092 -11.332 2.066 11.663 43.880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.407 2.218 38.502 < 2e-16 ***
## mom.hs 2.841 2.427 1.171 0.24239
## mom.iq2 9.689 1.483 6.531 1.84e-10 ***
## mom.hs:mom.iq2 -4.843 1.622 -2.985 0.00299 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.97 on 430 degrees of freedom
## Multiple R-squared: 0.2301, Adjusted R-squared: 0.2247
## F-statistic: 42.84 on 3 and 430 DF, p-value: < 2.2e-16
plot(kid.score ~ mom.iq2,
data = kidiq,
col = (kidiq$mom.hs+1),
pch = 16)
abline(coef(kidiq.lm5)[1], coef(kidiq.lm5)[3], lwd = 2) #Line for mom's who did NOT finish
abline(coef(kidiq.lm5)[1] + coef(kidiq.lm5)[2],
coef(kidiq.lm5)[3] + coef(kidiq.lm5)[4], col = 2, lwd = 2) #Line for those that did finish HS
statedata = read.csv("../datafiles/statedata.csv")
str(statedata) #Examining the data frame
## 'data.frame': 50 obs. of 9 variables:
## $ State : chr "AL" "AK" "AZ" "AR" ...
## $ Population: int 3615 365 2212 2110 21198 2541 3100 579 8277 4931 ...
## $ Income : int 3624 6315 4530 3378 5114 4884 5348 4809 4815 4091 ...
## $ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life.Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS.Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : int 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : int 50708 566432 113417 51945 156361 103766 4862 1982 54090 58073 ...
statedata.2 = statedata[ , -1] #Dropping the state column from the frame
str(statedata.2)
## 'data.frame': 50 obs. of 8 variables:
## $ Population: int 3615 365 2212 2110 21198 2541 3100 579 8277 4931 ...
## $ Income : int 3624 6315 4530 3378 5114 4884 5348 4809 4815 4091 ...
## $ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life.Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS.Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : int 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : int 50708 566432 113417 51945 156361 103766 4862 1982 54090 58073 ...
##Full Model
state.fullmod = lm(Life.Exp ~ ., data = statedata.2)
summary(state.fullmod)
##
## Call:
## lm(formula = Life.Exp ~ ., data = statedata.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48895 -0.51232 -0.02747 0.57002 1.49447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+01 1.748e+00 40.586 < 2e-16 ***
## Population 5.180e-05 2.919e-05 1.775 0.0832 .
## Income -2.180e-05 2.444e-04 -0.089 0.9293
## Illiteracy 3.382e-02 3.663e-01 0.092 0.9269
## Murder -3.011e-01 4.662e-02 -6.459 8.68e-08 ***
## HS.Grad 4.893e-02 2.332e-02 2.098 0.0420 *
## Frost -5.735e-03 3.143e-03 -1.825 0.0752 .
## Area -7.383e-08 1.668e-06 -0.044 0.9649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.6922
## F-statistic: 16.74 on 7 and 42 DF, p-value: 2.534e-10
##Null Model (Intercept Only)
state.nullmod0 = lm(Life.Exp ~ 1, data = statedata.2)
summary(state.nullmod0)
##
## Call:
## lm(formula = Life.Exp ~ 1, data = statedata.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9186 -0.7611 -0.2036 1.0139 2.7214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.8786 0.1898 373.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.342 on 49 degrees of freedom
step(state.nullmod0, scope = formula(state.fullmod))
## Start: AIC=30.44
## Life.Exp ~ 1
##
## Df Sum of Sq RSS AIC
## + Murder 1 53.838 34.461 -14.609
## + Illiteracy 1 30.578 57.721 11.179
## + HS.Grad 1 29.931 58.368 11.737
## + Income 1 10.223 78.076 26.283
## + Frost 1 6.064 82.235 28.878
## <none> 88.299 30.435
## + Area 1 1.017 87.282 31.856
## + Population 1 0.409 87.890 32.203
##
## Step: AIC=-14.61
## Life.Exp ~ Murder
##
## Df Sum of Sq RSS AIC
## + HS.Grad 1 4.691 29.770 -19.925
## + Population 1 4.016 30.445 -18.805
## + Frost 1 3.135 31.327 -17.378
## + Income 1 2.405 32.057 -16.226
## <none> 34.461 -14.609
## + Area 1 0.470 33.992 -13.295
## + Illiteracy 1 0.273 34.188 -13.007
## - Murder 1 53.838 88.299 30.435
##
## Step: AIC=-19.93
## Life.Exp ~ Murder + HS.Grad
##
## Df Sum of Sq RSS AIC
## + Frost 1 4.3987 25.372 -25.920
## + Population 1 3.3405 26.430 -23.877
## <none> 29.770 -19.925
## + Illiteracy 1 0.4419 29.328 -18.673
## + Area 1 0.2775 29.493 -18.394
## + Income 1 0.1022 29.668 -18.097
## - HS.Grad 1 4.6910 34.461 -14.609
## - Murder 1 28.5974 58.368 11.737
##
## Step: AIC=-25.92
## Life.Exp ~ Murder + HS.Grad + Frost
##
## Df Sum of Sq RSS AIC
## + Population 1 2.064 23.308 -28.161
## <none> 25.372 -25.920
## + Income 1 0.182 25.189 -24.280
## + Illiteracy 1 0.172 25.200 -24.259
## + Area 1 0.026 25.346 -23.970
## - Frost 1 4.399 29.770 -19.925
## - HS.Grad 1 5.955 31.327 -17.378
## - Murder 1 32.756 58.128 13.531
##
## Step: AIC=-28.16
## Life.Exp ~ Murder + HS.Grad + Frost + Population
##
## Df Sum of Sq RSS AIC
## <none> 23.308 -28.161
## + Income 1 0.006 23.302 -26.174
## + Illiteracy 1 0.004 23.304 -26.170
## + Area 1 0.001 23.307 -26.163
## - Population 1 2.064 25.372 -25.920
## - Frost 1 3.122 26.430 -23.877
## - HS.Grad 1 5.112 28.420 -20.246
## - Murder 1 34.816 58.124 15.528
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Frost + Population,
## data = statedata.2)
##
## Coefficients:
## (Intercept) Murder HS.Grad Frost Population
## 7.103e+01 -3.001e-01 4.658e-02 -5.943e-03 5.014e-05
state.finalmod = lm(Life.Exp ~ Murder + HS.Grad + Frost + Population, data = statedata.2)
summary(state.finalmod)
##
## Call:
## lm(formula = Life.Exp ~ Murder + HS.Grad + Frost + Population,
## data = statedata.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47095 -0.53464 -0.03701 0.57621 1.50683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.103e+01 9.529e-01 74.542 < 2e-16 ***
## Murder -3.001e-01 3.661e-02 -8.199 1.77e-10 ***
## HS.Grad 4.658e-02 1.483e-02 3.142 0.00297 **
## Frost -5.943e-03 2.421e-03 -2.455 0.01802 *
## Population 5.014e-05 2.512e-05 1.996 0.05201 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7126
## F-statistic: 31.37 on 4 and 45 DF, p-value: 1.696e-12
#Creating a new data frame for Utah with:
#Population - 2,785,000
#HS Graduation - 75%
#Murder Rate - 1.3 per 100,000
new.utah = statedata.2[44, ] #isolate the Utah row from the statedata.2 frame
new.utah[1] = 2785 #change to population
new.utah[5:6] = c(1.3, 75) #change to murder rate and high school graduation
utah.pred = predict(state.finalmod, int = "prediction", level = 0.95, newdata = new.utah) #Utah prediction with a 95% confidence interval using the new.utah data frame
utah.pred
## fit lwr upr
## 44 73.45601 71.8809 75.03112
#Creating a new data frame for California with:
#Population - 36,962,000
#HS Graduation - 68.3%
#Murder Rate - 5.3 per 100,000
new.cali = statedata.2[5, ] #isolate the California row from the statedata.2 frame
new.cali[1] = 36962 #change to population
new.cali[5:6] = c(5.3, 68.3) #change to murder rate and high school graduation
cali.pred = predict(state.finalmod, int = "prediction", level = 0.95, newdata = new.cali) #California prediction with a 95% prediction interval with the new values
cali.pred
## fit lwr upr
## 5 74.35232 72.11398 76.59065
##Reading in and adjusting the data
normtemp = read.csv("../datafiles/normtemp.csv")
normtemp$sex = factor(normtemp$sex, labels = c("Male", "Female")) #change 1 and 2 to male and female
cor.test(normtemp$weight, normtemp$temp) #Follows x, y format
##
## Pearson's product-moment correlation
##
## data: normtemp$weight and normtemp$temp
## t = 2.9668, df = 128, p-value = 0.003591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.08519113 0.40802170
## sample estimates:
## cor
## 0.2536564
##To visualize the data
plot(normtemp$weight, normtemp$temp,
col = normtemp$sex,
pch = 16,
xlab = "Weight",
ylab = "Temperature",
main = "Temperature/Weight Plot")
temp.lm.full = lm(temp ~ weight + sex, data = normtemp)
summary(temp.lm.full)
##
## Call:
## lm(formula = temp ~ weight + sex, data = normtemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.86363 -0.45624 0.01841 0.47366 2.33424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.250814 0.648717 148.371 < 2e-16 ***
## weight 0.025267 0.008762 2.884 0.00462 **
## sexFemale 0.269406 0.123277 2.185 0.03070 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7017 on 127 degrees of freedom
## Multiple R-squared: 0.09825, Adjusted R-squared: 0.08405
## F-statistic: 6.919 on 2 and 127 DF, p-value: 0.001406
##Help to visualize the temperature data
library(ggeffects)
temp.lm.full.effects = ggpredict(temp.lm.full, terms = c("weight", "sex"))
plot(temp.lm.full.effects)
temp.lm.sub.weight = lm(temp ~ weight, data = normtemp)
summary(temp.lm.sub.weight)
##
## Call:
## lm(formula = temp ~ weight, data = normtemp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85017 -0.39999 0.01033 0.43915 2.46549
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.306754 0.657703 146.429 < 2e-16 ***
## weight 0.026335 0.008876 2.967 0.00359 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.712 on 128 degrees of freedom
## Multiple R-squared: 0.06434, Adjusted R-squared: 0.05703
## F-statistic: 8.802 on 1 and 128 DF, p-value: 0.003591
anova(temp.lm.full, temp.lm.sub.weight)
## Analysis of Variance Table
##
## Model 1: temp ~ weight + sex
## Model 2: temp ~ weight
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 127 62.532
## 2 128 64.883 -1 -2.3515 4.7758 0.0307 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1