library(tidyverse)
library(Stat2Data)
library(skimr)
library(mosaic)
set.seed(10142019) # set your seed!
In Example 17 on page 35 we see that transforming the response variable to \(\sqrt{MDs}\) was a helpful re-expression for predicting the number of physicians in counties with a linear model based on the number of hospitals.
data("CountyHealth")
CountyHealth = CountyHealth %>%
mutate(MDs = sqrt(MDs))
myCtyHlth = CountyHealth %>%
slice(1:35)
m1 = lm(MDs ~ Hospitals, data = myCtyHlth)
summary(m1)
##
## Call:
## lm(formula = MDs ~ Hospitals, data = myCtyHlth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.582 -6.362 -2.918 8.277 23.170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.1695 2.6915 -1.178 0.247
## Hospitals 6.7853 0.5284 12.841 2.19e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.627 on 33 degrees of freedom
## Multiple R-squared: 0.8332, Adjusted R-squared: 0.8282
## F-statistic: 164.9 on 1 and 33 DF, p-value: 2.194e-14
coef(m1)
## (Intercept) Hospitals
## -3.169506 6.785333
\[ \widehat {sqrt(MDs)} = -3.169506+6.785333\ Hospitals \]
data("CountyHealth")
lastCtyHlth = CountyHealth %>%
slice(-c(1:35))
fm2 = lm(sqrt(MDs) ~ Hospitals, data = lastCtyHlth)
summary(fm2)
##
## Call:
## lm(formula = sqrt(MDs) ~ Hospitals, data = lastCtyHlth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5834 -4.2935 0.4812 3.1122 12.9001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.3014 2.6449 -0.87 0.397
## Hospitals 7.1610 0.5682 12.60 1.01e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.556 on 16 degrees of freedom
## Multiple R-squared: 0.9085, Adjusted R-squared: 0.9028
## F-statistic: 158.8 on 1 and 16 DF, p-value: 1.008e-09
coef(fm2)
## (Intercept) Hospitals
## -2.301363 7.161031
-2.301363+7.161031*(10)
## [1] 69.30895
\[ \widehat {MDs} = -2.301363+7.161031*(10)=69.30895 \]
set.seed(10142019)
which_train = sample(1:219, size=110, replace=FALSE)
train = CountyHealth %>%
slice(which_train)
test = CountyHealth %>%
slice(-which_train)
mod1 = lm(sqrt(MDs) ~ Hospitals, data=train)
summary(mod1)
##
## Call:
## lm(formula = sqrt(MDs) ~ Hospitals, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.3907 -6.4842 0.3661 5.1176 23.5239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.518 3.248 -0.775 0.446
## Hospitals 6.618 0.629 10.521 1.81e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.311 on 24 degrees of freedom
## Multiple R-squared: 0.8218, Adjusted R-squared: 0.8144
## F-statistic: 110.7 on 1 and 24 DF, p-value: 1.807e-10
test = test %>%
mutate(yhats = predict(m1, newdata=test))
test = test %>%
mutate(residuals = sqrt(MDs) - yhats)
cor(sqrt(test$MDs),test$yhats)
## [1] 0.9389599
(cor(sqrt(test$MDs), test$yhats))^2
## [1] 0.8816457
mod2 = lm(sqrt(MDs) ~ Hospitals, data = CountyHealth)
summary(mod2)
##
## Call:
## lm(formula = sqrt(MDs) ~ Hospitals, data = CountyHealth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0000 -5.9994 -0.9495 6.8426 22.2076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.7533 1.9850 -1.387 0.171
## Hospitals 6.8764 0.4011 17.144 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.658 on 51 degrees of freedom
## Multiple R-squared: 0.8521, Adjusted R-squared: 0.8492
## F-statistic: 293.9 on 1 and 51 DF, p-value: < 2.2e-16
In exercise 1.19, you were asked to fit a simple linear model to predict the number of calories (per serving) in breakfast cereals using the amount of sugar (grams per serving). The file Cereal also has a variable showing the amount of fiber (grams per serving) for each of the 36 cereals. Fit a multiple regression model to predict Calories based on both predictors: Sugar and Fiber. Examine each of the measures below and identify which (if any) of the cereals you might classify as possibly “unusual” in that measure. Include a specific numerical values and justification for each case.
data("Cereal")
m2 = lm(Calories ~ Sugar + Fiber, data = Cereal)
summary(m2)
##
## Call:
## lm(formula = Calories ~ Sugar + Fiber, data = Cereal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.599 -9.321 -4.435 -0.029 48.860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.3082 6.3913 17.103 < 2e-16 ***
## Sugar 1.0050 0.6546 1.535 0.134
## Fiber -3.7442 0.8346 -4.486 8.31e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.42 on 33 degrees of freedom
## Multiple R-squared: 0.5438, Adjusted R-squared: 0.5162
## F-statistic: 19.67 on 2 and 33 DF, p-value: 2.375e-06
rstandard(m2) # standardized residuals
## 1 2 3 4 5
## -0.270195927 -0.579219657 -0.521923937 1.931434465 -0.481613007
## 6 7 8 9 10
## -0.123788972 -0.158963325 0.009269780 2.594352993 -0.005729793
## 11 12 13 14 15
## -0.863122765 -0.105846007 -0.072424273 0.488585179 -1.303473017
## 16 17 18 19 20
## -0.072424273 0.256506456 -0.323200505 -0.627123507 -0.846370310
## 21 22 23 24 25
## -0.092741734 -0.770725244 -0.697816471 -0.747318559 -0.589828427
## 26 27 28 29 30
## -0.663477258 3.368051326 1.814960107 -0.556396105 -0.313683522
## 31 32 33 34 35
## -0.370047071 -0.699385474 1.060599074 -0.046637810 0.925218435
## 36
## -0.662735159
max(rstandard(m2))
## [1] 3.368051
which.max(rstandard(m2))
## 27
## 27
rstudent(m2) # studendized residuals
## 1 2 3 4 5
## -0.266365359 -0.573297758 -0.516089658 2.019513973 -0.475935286
## 6 7 8 9 10
## -0.121927265 -0.156596229 0.009128260 2.863383624 -0.005642313
## 11 12 13 14 15
## -0.859703923 -0.104247640 -0.071324163 0.482875107 -1.317947828
## 16 17 18 19 20
## -0.071324163 0.252842282 -0.318770777 -0.621261639 -0.842643830
## 21 22 23 24 25
## -0.091337650 -0.765882199 -0.692288846 -0.742215797 -0.583908925
## 26 27 28 29 30
## -0.657748975 4.094135952 1.883738312 -0.550489193 -0.309355732
## 31 32 33 34 35
## -0.365155567 -0.693868814 1.062674484 -0.045927254 0.923144112
## 36
## -0.657003353
max(rstudent(m2))
## [1] 4.094136
which.max(rstudent(m2))
## 27
## 27
hatvalues(m2) # leverage
## 1 2 3 4 5 6
## 0.02856026 0.07699452 0.26674416 0.04258648 0.13849063 0.04172376
## 7 8 9 10 11 12
## 0.10138827 0.03621798 0.04258648 0.05874852 0.03069860 0.04589848
## 13 14 15 16 17 18
## 0.04579274 0.09260126 0.04879608 0.04579274 0.03510101 0.05846268
## 19 20 21 22 23 24
## 0.06313806 0.10083232 0.08466175 0.08466175 0.07209700 0.14732429
## 25 26 27 28 29 30
## 0.09954452 0.17186843 0.11455463 0.07568967 0.15252645 0.02976253
## 31 32 33 34 35 36
## 0.03218681 0.07678301 0.08173772 0.22186495 0.08173772 0.07184378
max(hatvalues(m2))
## [1] 0.2667442
which.max(hatvalues(m2))
## 3
## 3
cooks.distance(m2) # Cook's distance
## 1 2 3 4 5
## 7.154554e-04 9.328695e-03 3.303183e-02 5.531092e-02 1.242898e-02
## 6 7 8 9 10
## 2.224003e-04 9.503602e-04 1.076373e-06 9.979506e-02 6.830428e-07
## 11 12 13 14 15
## 7.864727e-03 1.796517e-04 8.390752e-05 8.120411e-03 2.905321e-02
## 16 17 18 19 20
## 8.390752e-05 7.978350e-04 2.162041e-03 8.834877e-03 2.677680e-02
## 21 22 23 24 25
## 2.651764e-04 1.831401e-02 1.261176e-02 3.216479e-02 1.281992e-02
## 26 27 28 29 30
## 3.045283e-02 4.892006e-01 8.991490e-02 1.857231e-02 1.006130e-03
## 31 32 33 34 35
## 1.518025e-03 1.356043e-02 3.337621e-02 2.067230e-04 2.539938e-02
## 36
## 1.133253e-02
max(cooks.distance(m2))
## [1] 0.4892006
which.max(cooks.distance(m2))
## 27
## 27
The data in BaseballTimes2017 contain information from 15 Major League Baseball games played on August 11, 2017. In Exercise 1.45 on page 57, we considered models to predict the time a game lasts (in minutes). one of the potential predictors is the number of runs scored in the game. Use a randomization procedure to test whether there is significant evidence to conclude that the correlation between Runs and Time is greater than zero.
data("BaseballTimes2017")
cor.test(BaseballTimes2017$Runs, BaseballTimes2017$Time)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 3.8677, df = 12, p-value = 0.002237
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3543966 0.9141750
## sample estimates:
## cor
## 0.7449071
rtest = do(5000) * cor(Runs ~ shuffle(Time), data = BaseballTimes2017)
cor_actual = cor(BaseballTimes2017$Runs, BaseballTimes2017$Time)
ggplot(data = rtest, aes(x=cor)) + geom_density() + geom_vline(xintercept = cor_actual, color = "red")
pdata(~cor, q = cor_actual, data = rtest, lower.tail = FALSE)
## [1] 0.0016
Consider a simple linear regression model to predict the Length (in miles) of an Adirondack hike using the typical Time (in hours) it takes to complete the hike. Fitting the model using the data in **HighPeaks* produces the prediction equation
\[ \widehat {Length}=1.10+1.077\ Time \] One rough interpretation of the slope, 1.077, is the average hiking speed (in miles per hour). In this exercise you will examine some bootstrap estimates for this slope.
data("HighPeaks")
ggplot(data = HighPeaks, aes(x = Time, y = Length)) + geom_point() + geom_smooth(method = lm, se = FALSE)
ggplot(data = resample(HighPeaks), aes(x = Time, y = Length)) + geom_point(alpha = 0.1) + geom_smooth(method = lm, se = FALSE)
fm = lm(Length ~ Time, data = HighPeaks)
summary(fm)
##
## Call:
## lm(formula = Length ~ Time, data = HighPeaks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4112 -1.1636 -0.0413 1.0514 3.7743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.10039 1.06739 1.031 0.308
## Time 1.07711 0.09699 11.105 2.39e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.818 on 44 degrees of freedom
## Multiple R-squared: 0.737, Adjusted R-squared: 0.7311
## F-statistic: 123.3 on 1 and 44 DF, p-value: 2.39e-14
confint(fm, level = 0.90)
## 5 % 95 %
## (Intercept) -0.6930744 2.893854
## Time 0.9141373 1.240075
bootstrap = do(5000) * coef(lm(Length ~ Time, data = resample(HighPeaks)))
ggplot(data = bootstrap, aes(x = Time)) + geom_density() + geom_histogram(color="black") + geom_vline(xintercept = coef(fm)["Time"], color = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
- The distribution almost resembles a normal distribution and is centered around the sample statistic.
sd(~Time, data = bootstrap)
## [1] 0.1209791
mean(~Time, data = bootstrap)
## [1] 1.090785
coef(fm)
## (Intercept) Time
## 1.100390 1.077106
summary(fm)
##
## Call:
## lm(formula = Length ~ Time, data = HighPeaks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4112 -1.1636 -0.0413 1.0514 3.7743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.10039 1.06739 1.031 0.308
## Time 1.07711 0.09699 11.105 2.39e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.818 on 44 degrees of freedom
## Multiple R-squared: 0.737, Adjusted R-squared: 0.7311
## F-statistic: 123.3 on 1 and 44 DF, p-value: 2.39e-14
qdata(~Time, p=c(0.05, 0.95), data=bootstrap)
## quantile p
## 5% 0.9157748 0.05
## 95% 1.3121679 0.95