library(dplyr)
library(tidyr)
library(ggplot2)
library(openintro)
library(broom)
source('create_datasets.R')
cut()
function and use a box plot# Scatterplot of weight vs. weeks
glimpse(ncbirths)
## Observations: 1,000
## Variables: 13
## $ fage <int> NA, NA, 19, 21, NA, NA, 18, 17, NA, 20, 30, NA,...
## $ mage <int> 13, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,...
## $ mature <fctr> younger mom, younger mom, younger mom, younger...
## $ weeks <int> 39, 42, 37, 41, 39, 38, 37, 35, 38, 37, 45, 42,...
## $ premie <fctr> full term, full term, full term, full term, fu...
## $ visits <int> 10, 15, 11, 6, 9, 19, 12, 5, 9, 13, 9, 8, 4, 12...
## $ marital <fctr> married, married, married, married, married, m...
## $ gained <int> 38, 20, 38, 34, 27, 22, 76, 15, NA, 52, 28, 34,...
## $ weight <dbl> 7.63, 7.88, 6.63, 8.00, 6.38, 5.38, 8.44, 4.69,...
## $ lowbirthweight <fctr> not low, not low, not low, not low, not low, l...
## $ gender <fctr> male, male, female, male, female, male, male, ...
## $ habit <fctr> nonsmoker, nonsmoker, nonsmoker, nonsmoker, no...
## $ whitemom <fctr> not white, not white, white, white, not white,...
ggplot(ncbirths, aes(x = weeks, y = weight)) +
geom_point()
# Boxplot of weight vs. weeks
ggplot(data = ncbirths,
aes(x = cut(weeks, breaks = 5), y = weight)) +
geom_boxplot()
mammals
dataset contains information about 39 different species of mammals, including their body weight, brain weight, gestation time, and a few other variables.mlbBat10
dataset contains batting statistics for 1,199 Major League Baseball players during the 2010 season.bdims
dataset contains body girth and skeletal diameter measurements for 507 physically active individuals.smoking
dataset contains information on the smoking habits of 1,691 citizens of the United Kingdom.# Mammals scatterplot
glimpse(mammals)
## Observations: 62
## Variables: 11
## $ Species <fctr> Africanelephant, Africangiantpouchedrat, ArcticFo...
## $ BodyWt <dbl> 6654.000, 1.000, 3.385, 0.920, 2547.000, 10.550, 0...
## $ BrainWt <dbl> 5712.0, 6.6, 44.5, 5.7, 4603.0, 179.5, 0.3, 169.0,...
## $ NonDreaming <dbl> NA, 6.3, NA, NA, 2.1, 9.1, 15.8, 5.2, 10.9, 8.3, 1...
## $ Dreaming <dbl> NA, 2.0, NA, NA, 1.8, 0.7, 3.9, 1.0, 3.6, 1.4, 1.5...
## $ TotalSleep <dbl> 3.3, 8.3, 12.5, 16.5, 3.9, 9.8, 19.7, 6.2, 14.5, 9...
## $ LifeSpan <dbl> 38.6, 4.5, 14.0, NA, 69.0, 27.0, 19.0, 30.4, 28.0,...
## $ Gestation <dbl> 645, 42, 60, 25, 624, 180, 35, 392, 63, 230, 112, ...
## $ Predation <int> 3, 3, 1, 5, 3, 4, 1, 4, 1, 1, 5, 5, 2, 5, 1, 2, 2,...
## $ Exposure <int> 5, 1, 1, 2, 5, 4, 1, 5, 2, 1, 4, 5, 1, 5, 1, 2, 2,...
## $ Danger <int> 3, 3, 1, 3, 4, 4, 1, 4, 1, 1, 4, 5, 2, 5, 1, 2, 2,...
ggplot(mammals, aes(x = BodyWt, y = BrainWt )) +
geom_point()
# Baseball player scatterplot
glimpse(mlbBat10)
## Observations: 1,199
## Variables: 19
## $ name <fctr> I Suzuki, D Jeter, M Young, J Pierre, R Weeks, M Scu...
## $ team <fctr> SEA, NYY, TEX, CWS, MIL, BOS, BAL, MIN, NYY, CIN, MI...
## $ position <fctr> OF, SS, 3B, OF, 2B, SS, OF, OF, 2B, 2B, OF, OF, 2B, ...
## $ G <dbl> 162, 157, 157, 160, 160, 150, 160, 153, 160, 155, 157...
## $ AB <dbl> 680, 663, 656, 651, 651, 632, 629, 629, 626, 626, 619...
## $ R <dbl> 74, 111, 99, 96, 112, 92, 79, 85, 103, 100, 101, 103,...
## $ H <dbl> 214, 179, 186, 179, 175, 174, 187, 166, 200, 172, 188...
## $ 2B <dbl> 30, 30, 36, 18, 32, 38, 45, 24, 41, 33, 45, 34, 41, 2...
## $ 3B <dbl> 3, 3, 3, 3, 4, 0, 3, 10, 3, 5, 1, 10, 4, 3, 3, 1, 5, ...
## $ HR <dbl> 6, 10, 21, 1, 29, 11, 12, 3, 29, 18, 25, 4, 10, 25, 1...
## $ RBI <dbl> 43, 67, 91, 47, 83, 56, 60, 58, 109, 59, 103, 41, 75,...
## $ TB <dbl> 268, 245, 291, 206, 302, 245, 274, 219, 334, 269, 310...
## $ BB <dbl> 45, 63, 50, 45, 76, 53, 73, 60, 57, 46, 56, 47, 28, 4...
## $ SO <dbl> 86, 106, 115, 47, 184, 71, 93, 74, 77, 83, 105, 170, ...
## $ SB <dbl> 42, 18, 4, 68, 11, 5, 7, 26, 3, 16, 14, 27, 14, 18, 1...
## $ CS <dbl> 9, 5, 2, 18, 4, 4, 2, 4, 2, 12, 3, 6, 4, 9, 5, 1, 3, ...
## $ OBP <dbl> 0.359, 0.340, 0.330, 0.341, 0.366, 0.333, 0.370, 0.33...
## $ SLG <dbl> 0.394, 0.370, 0.444, 0.316, 0.464, 0.388, 0.436, 0.34...
## $ AVG <dbl> 0.315, 0.270, 0.284, 0.275, 0.269, 0.275, 0.297, 0.26...
ggplot(mlbBat10, aes(x = OBP, y = SLG)) +
geom_point()
# Body dimensions scatterplot
glimpse(bdims)
## Observations: 507
## Variables: 25
## $ bia.di <dbl> 42.9, 43.7, 40.1, 44.3, 42.5, 43.3, 43.5, 44.4, 43.5, 4...
## $ bii.di <dbl> 26.0, 28.5, 28.2, 29.9, 29.9, 27.0, 30.0, 29.8, 26.5, 2...
## $ bit.di <dbl> 31.5, 33.5, 33.3, 34.0, 34.0, 31.5, 34.0, 33.2, 32.1, 3...
## $ che.de <dbl> 17.7, 16.9, 20.9, 18.4, 21.5, 19.6, 21.9, 21.8, 15.5, 2...
## $ che.di <dbl> 28.0, 30.8, 31.7, 28.2, 29.4, 31.3, 31.7, 28.8, 27.5, 2...
## $ elb.di <dbl> 13.1, 14.0, 13.9, 13.9, 15.2, 14.0, 16.1, 15.1, 14.1, 1...
## $ wri.di <dbl> 10.4, 11.8, 10.9, 11.2, 11.6, 11.5, 12.5, 11.9, 11.2, 1...
## $ kne.di <dbl> 18.8, 20.6, 19.7, 20.9, 20.7, 18.8, 20.8, 21.0, 18.9, 2...
## $ ank.di <dbl> 14.1, 15.1, 14.1, 15.0, 14.9, 13.9, 15.6, 14.6, 13.2, 1...
## $ sho.gi <dbl> 106.2, 110.5, 115.1, 104.5, 107.5, 119.8, 123.5, 120.4,...
## $ che.gi <dbl> 89.5, 97.0, 97.5, 97.0, 97.5, 99.9, 106.9, 102.5, 91.0,...
## $ wai.gi <dbl> 71.5, 79.0, 83.2, 77.8, 80.0, 82.5, 82.0, 76.8, 68.5, 7...
## $ nav.gi <dbl> 74.5, 86.5, 82.9, 78.8, 82.5, 80.1, 84.0, 80.5, 69.0, 8...
## $ hip.gi <dbl> 93.5, 94.8, 95.0, 94.0, 98.5, 95.3, 101.0, 98.0, 89.5, ...
## $ thi.gi <dbl> 51.5, 51.5, 57.3, 53.0, 55.4, 57.5, 60.9, 56.0, 50.0, 5...
## $ bic.gi <dbl> 32.5, 34.4, 33.4, 31.0, 32.0, 33.0, 42.4, 34.1, 33.0, 3...
## $ for.gi <dbl> 26.0, 28.0, 28.8, 26.2, 28.4, 28.0, 32.3, 28.0, 26.0, 2...
## $ kne.gi <dbl> 34.5, 36.5, 37.0, 37.0, 37.7, 36.6, 40.1, 39.2, 35.5, 3...
## $ cal.gi <dbl> 36.5, 37.5, 37.3, 34.8, 38.6, 36.1, 40.3, 36.7, 35.0, 3...
## $ ank.gi <dbl> 23.5, 24.5, 21.9, 23.0, 24.4, 23.5, 23.6, 22.5, 22.0, 2...
## $ wri.gi <dbl> 16.5, 17.0, 16.9, 16.6, 18.0, 16.9, 18.8, 18.0, 16.5, 1...
## $ age <int> 21, 23, 28, 23, 22, 21, 26, 27, 23, 21, 23, 22, 20, 26,...
## $ wgt <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62.0, 8...
## $ hgt <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 184.5,...
## $ sex <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
ggplot(bdims, aes(x = hgt, y = wgt, color = factor(sex))) +
geom_point()
# Smoking scatterplot
glimpse(smoking)
## Observations: 1,691
## Variables: 12
## $ gender <fctr> Male, Female, Male, Female, Female, Fema...
## $ age <int> 38, 42, 40, 40, 39, 37, 53, 44, 40, 41, 7...
## $ maritalStatus <fctr> Divorced, Single, Married, Married, Marr...
## $ highestQualification <fctr> No Qualification, No Qualification, Degr...
## $ nationality <fctr> British, British, English, English, Brit...
## $ ethnicity <fctr> White, White, White, White, White, White...
## $ grossIncome <fctr> 2,600 to 5,200, Under 2,600, 28,600 to 3...
## $ region <fctr> The North, The North, The North, The Nor...
## $ smoke <fctr> No, Yes, No, No, No, No, Yes, No, Yes, Y...
## $ amtWeekends <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 15, NA,...
## $ amtWeekdays <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 12, NA,...
## $ type <fctr> , Packets, , , , , Packets, , Hand-Rolle...
ggplot(smoking, aes(x = age, y = amtWeekdays)) +
geom_point()
# Scatterplot with coord_trans()
str(mammals)
## 'data.frame': 62 obs. of 11 variables:
## $ Species : Factor w/ 62 levels "Africanelephant",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ BodyWt : num 6654 1 3.38 0.92 2547 ...
## $ BrainWt : num 5712 6.6 44.5 5.7 4603 ...
## $ NonDreaming: num NA 6.3 NA NA 2.1 9.1 15.8 5.2 10.9 8.3 ...
## $ Dreaming : num NA 2 NA NA 1.8 0.7 3.9 1 3.6 1.4 ...
## $ TotalSleep : num 3.3 8.3 12.5 16.5 3.9 9.8 19.7 6.2 14.5 9.7 ...
## $ LifeSpan : num 38.6 4.5 14 NA 69 27 19 30.4 28 50 ...
## $ Gestation : num 645 42 60 25 624 180 35 392 63 230 ...
## $ Predation : int 3 3 1 5 3 4 1 4 1 1 ...
## $ Exposure : int 5 1 1 2 5 4 1 5 2 1 ...
## $ Danger : int 3 3 1 3 4 4 1 4 1 1 ...
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
coord_trans(x = "log10", y = "log10")
# Scatterplot with scale_x_log10() and scale_y_log10()
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
# Scatterplot of SLG vs. OBP
str(mlbBat10)
## 'data.frame': 1199 obs. of 19 variables:
## $ name : Factor w/ 1120 levels "A Jackson","B Butler",..: 13 9 21 14 25 19 22 10 24 3 ...
## $ team : Factor w/ 30 levels "ATL","BAL","BOS",..: 15 13 18 5 11 3 2 12 13 4 ...
## $ position: Factor w/ 9 levels "1B","2B","3B",..: 4 5 3 4 2 5 4 4 2 2 ...
## $ G : num 162 157 157 160 160 150 160 153 160 155 ...
## $ AB : num 680 663 656 651 651 632 629 629 626 626 ...
## $ R : num 74 111 99 96 112 92 79 85 103 100 ...
## $ H : num 214 179 186 179 175 174 187 166 200 172 ...
## $ 2B : num 30 30 36 18 32 38 45 24 41 33 ...
## $ 3B : num 3 3 3 3 4 0 3 10 3 5 ...
## $ HR : num 6 10 21 1 29 11 12 3 29 18 ...
## $ RBI : num 43 67 91 47 83 56 60 58 109 59 ...
## $ TB : num 268 245 291 206 302 245 274 219 334 269 ...
## $ BB : num 45 63 50 45 76 53 73 60 57 46 ...
## $ SO : num 86 106 115 47 184 71 93 74 77 83 ...
## $ SB : num 42 18 4 68 11 5 7 26 3 16 ...
## $ CS : num 9 5 2 18 4 4 2 4 2 12 ...
## $ OBP : num 0.359 0.34 0.33 0.341 0.366 0.333 0.37 0.331 0.381 0.332 ...
## $ SLG : num 0.394 0.37 0.444 0.316 0.464 0.388 0.436 0.348 0.534 0.43 ...
## $ AVG : num 0.315 0.27 0.284 0.275 0.269 0.275 0.297 0.264 0.319 0.275 ...
mlbBat10 %>%
filter(AB >= 200) %>%
ggplot(aes(x = OBP, y = SLG)) +
geom_point()
# Identify the outlying player
mlbBat10 %>%
filter(
AB >= 200,
OBP < 0.2
)
## name team position G AB R H 2B 3B HR RBI TB BB SO SB CS OBP
## 1 B Wood LAA 3B 81 226 20 33 2 0 4 14 47 6 71 1 0 0.174
## SLG AVG
## 1 0.208 0.146
\[ r(x,y) = \frac{Cov(x,y)}{\sqrt{SXX \dot{} SYY}} \]
\[ r(x,y) = \frac{\sum_{i=1}^n(x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^n(x_i - \bar{x})^2 \dot{} \sum_{i=1}^n(y_i - \bar{y})^2 } \]
# Compute correlation
ncbirths %>%
summarize(N = n(), r = cor(mage, weight))
## N r
## 1 1000 0.05506589
# Compute correlation for all non-missing pairs
ncbirths %>%
summarize(N = n(), r = cor(weeks, weight, use = "pairwise.complete.obs"))
## N r
## 1 1000 0.6701013
ggplot(Anscombe, aes(x,y)) +
geom_point() +
facet_wrap(~ set)
## Anscombe dataset is loaded
# Compute properties of Anscombe
Anscombe %>%
group_by(set) %>%
summarize(
N = n(),
mean(x),
sd(x),
mean(y),
sd(y),
cor(x, y)
)
## # A tibble: 4 x 7
## set N `mean(x)` `sd(x)` `mean(y)` `sd(y)` `cor(x, y)`
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11 9 3.316625 7.500909 2.031568 0.8164205
## 2 2 11 9 3.316625 7.500909 2.031657 0.8162365
## 3 3 11 9 3.316625 7.500000 2.030424 0.8162867
## 4 4 11 9 3.316625 7.500909 2.030579 0.8165214
My guesses:
Actual:
str(mlbBat10)
## 'data.frame': 1199 obs. of 19 variables:
## $ name : Factor w/ 1120 levels "A Jackson","B Butler",..: 13 9 21 14 25 19 22 10 24 3 ...
## $ team : Factor w/ 30 levels "ATL","BAL","BOS",..: 15 13 18 5 11 3 2 12 13 4 ...
## $ position: Factor w/ 9 levels "1B","2B","3B",..: 4 5 3 4 2 5 4 4 2 2 ...
## $ G : num 162 157 157 160 160 150 160 153 160 155 ...
## $ AB : num 680 663 656 651 651 632 629 629 626 626 ...
## $ R : num 74 111 99 96 112 92 79 85 103 100 ...
## $ H : num 214 179 186 179 175 174 187 166 200 172 ...
## $ 2B : num 30 30 36 18 32 38 45 24 41 33 ...
## $ 3B : num 3 3 3 3 4 0 3 10 3 5 ...
## $ HR : num 6 10 21 1 29 11 12 3 29 18 ...
## $ RBI : num 43 67 91 47 83 56 60 58 109 59 ...
## $ TB : num 268 245 291 206 302 245 274 219 334 269 ...
## $ BB : num 45 63 50 45 76 53 73 60 57 46 ...
## $ SO : num 86 106 115 47 184 71 93 74 77 83 ...
## $ SB : num 42 18 4 68 11 5 7 26 3 16 ...
## $ CS : num 9 5 2 18 4 4 2 4 2 12 ...
## $ OBP : num 0.359 0.34 0.33 0.341 0.366 0.333 0.37 0.331 0.381 0.332 ...
## $ SLG : num 0.394 0.37 0.444 0.316 0.464 0.388 0.436 0.348 0.534 0.43 ...
## $ AVG : num 0.315 0.27 0.284 0.275 0.269 0.275 0.297 0.264 0.319 0.275 ...
# Correlation for all baseball players
mlbBat10 %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 1199 0.8145628
# Correlation for all players with at least 200 ABs
mlbBat10 %>%
filter(AB >= 200) %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 329 0.6855364
# Correlation of body dimensions
str(bdims)
## 'data.frame': 507 obs. of 25 variables:
## $ bia.di: num 42.9 43.7 40.1 44.3 42.5 43.3 43.5 44.4 43.5 42 ...
## $ bii.di: num 26 28.5 28.2 29.9 29.9 27 30 29.8 26.5 28 ...
## $ bit.di: num 31.5 33.5 33.3 34 34 31.5 34 33.2 32.1 34 ...
## $ che.de: num 17.7 16.9 20.9 18.4 21.5 19.6 21.9 21.8 15.5 22.5 ...
## $ che.di: num 28 30.8 31.7 28.2 29.4 31.3 31.7 28.8 27.5 28 ...
## $ elb.di: num 13.1 14 13.9 13.9 15.2 14 16.1 15.1 14.1 15.6 ...
## $ wri.di: num 10.4 11.8 10.9 11.2 11.6 11.5 12.5 11.9 11.2 12 ...
## $ kne.di: num 18.8 20.6 19.7 20.9 20.7 18.8 20.8 21 18.9 21.1 ...
## $ ank.di: num 14.1 15.1 14.1 15 14.9 13.9 15.6 14.6 13.2 15 ...
## $ sho.gi: num 106 110 115 104 108 ...
## $ che.gi: num 89.5 97 97.5 97 97.5 ...
## $ wai.gi: num 71.5 79 83.2 77.8 80 82.5 82 76.8 68.5 77.5 ...
## $ nav.gi: num 74.5 86.5 82.9 78.8 82.5 80.1 84 80.5 69 81.5 ...
## $ hip.gi: num 93.5 94.8 95 94 98.5 95.3 101 98 89.5 99.8 ...
## $ thi.gi: num 51.5 51.5 57.3 53 55.4 57.5 60.9 56 50 59.8 ...
## $ bic.gi: num 32.5 34.4 33.4 31 32 33 42.4 34.1 33 36.5 ...
## $ for.gi: num 26 28 28.8 26.2 28.4 28 32.3 28 26 29.2 ...
## $ kne.gi: num 34.5 36.5 37 37 37.7 36.6 40.1 39.2 35.5 38.3 ...
## $ cal.gi: num 36.5 37.5 37.3 34.8 38.6 36.1 40.3 36.7 35 38.6 ...
## $ ank.gi: num 23.5 24.5 21.9 23 24.4 23.5 23.6 22.5 22 22.2 ...
## $ wri.gi: num 16.5 17 16.9 16.6 18 16.9 18.8 18 16.5 16.9 ...
## $ age : int 21 23 28 23 22 21 26 27 23 21 ...
## $ wgt : num 65.6 71.8 80.7 72.6 78.8 74.8 86.4 78.4 62 81.6 ...
## $ hgt : num 174 175 194 186 187 ...
## $ sex : int 1 1 1 1 1 1 1 1 1 1 ...
bdims %>%
group_by(sex) %>%
summarize(N = n(), r = cor(hgt, wgt))
## # A tibble: 2 x 3
## sex N r
## <int> <int> <dbl>
## 1 0 260 0.4310593
## 2 1 247 0.5347418
# Correlation among mammals, with and without log
str(mammals)
## 'data.frame': 62 obs. of 11 variables:
## $ Species : Factor w/ 62 levels "Africanelephant",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ BodyWt : num 6654 1 3.38 0.92 2547 ...
## $ BrainWt : num 5712 6.6 44.5 5.7 4603 ...
## $ NonDreaming: num NA 6.3 NA NA 2.1 9.1 15.8 5.2 10.9 8.3 ...
## $ Dreaming : num NA 2 NA NA 1.8 0.7 3.9 1 3.6 1.4 ...
## $ TotalSleep : num 3.3 8.3 12.5 16.5 3.9 9.8 19.7 6.2 14.5 9.7 ...
## $ LifeSpan : num 38.6 4.5 14 NA 69 27 19 30.4 28 50 ...
## $ Gestation : num 645 42 60 25 624 180 35 392 63 230 ...
## $ Predation : int 3 3 1 5 3 4 1 4 1 1 ...
## $ Exposure : int 5 1 1 2 5 4 1 5 2 1 ...
## $ Danger : int 3 3 1 3 4 4 1 4 1 1 ...
mammals %>%
summarize(N = n(),
r = cor(BodyWt, BrainWt),
r_log = cor(log(BodyWt), log(BrainWt)))
## N r r_log
## 1 62 0.9341638 0.9595748
In the San Francisco Bay Area from 1960-1967, the correlation between the birthweight of 1,236 babies and the length of their gestational period was 0.408. Which of the following conclusions is not a valid statistical interpretation of these results.
# Create faceted scatterplot
str(noise)
## 'data.frame': 1000 obs. of 3 variables:
## $ y: num 1.702 -0.741 0.225 -1.062 0.314 ...
## $ x: num -0.164 -0.94 0.856 0.454 0.773 ...
## $ z: int 1 2 3 4 5 6 7 8 9 10 ...
ggplot(noise, aes(x, y)) +
geom_point() +
facet_wrap(~ z)
# Compute correlations for each dataset
noise_summary <- noise %>%
group_by(z) %>%
summarize(N = n(), spurious_cor = cor(x, y))
# Isolate sets with correlations above 0.2 in absolute strength
noise_summary %>%
filter(abs(spurious_cor) > 0.2)
## # A tibble: 1 x 3
## z N spurious_cor
## <int> <int> <dbl>
## 1 11 50 0.2462182
# Scatterplot with regression line
str(bdims)
## 'data.frame': 507 obs. of 25 variables:
## $ bia.di: num 42.9 43.7 40.1 44.3 42.5 43.3 43.5 44.4 43.5 42 ...
## $ bii.di: num 26 28.5 28.2 29.9 29.9 27 30 29.8 26.5 28 ...
## $ bit.di: num 31.5 33.5 33.3 34 34 31.5 34 33.2 32.1 34 ...
## $ che.de: num 17.7 16.9 20.9 18.4 21.5 19.6 21.9 21.8 15.5 22.5 ...
## $ che.di: num 28 30.8 31.7 28.2 29.4 31.3 31.7 28.8 27.5 28 ...
## $ elb.di: num 13.1 14 13.9 13.9 15.2 14 16.1 15.1 14.1 15.6 ...
## $ wri.di: num 10.4 11.8 10.9 11.2 11.6 11.5 12.5 11.9 11.2 12 ...
## $ kne.di: num 18.8 20.6 19.7 20.9 20.7 18.8 20.8 21 18.9 21.1 ...
## $ ank.di: num 14.1 15.1 14.1 15 14.9 13.9 15.6 14.6 13.2 15 ...
## $ sho.gi: num 106 110 115 104 108 ...
## $ che.gi: num 89.5 97 97.5 97 97.5 ...
## $ wai.gi: num 71.5 79 83.2 77.8 80 82.5 82 76.8 68.5 77.5 ...
## $ nav.gi: num 74.5 86.5 82.9 78.8 82.5 80.1 84 80.5 69 81.5 ...
## $ hip.gi: num 93.5 94.8 95 94 98.5 95.3 101 98 89.5 99.8 ...
## $ thi.gi: num 51.5 51.5 57.3 53 55.4 57.5 60.9 56 50 59.8 ...
## $ bic.gi: num 32.5 34.4 33.4 31 32 33 42.4 34.1 33 36.5 ...
## $ for.gi: num 26 28 28.8 26.2 28.4 28 32.3 28 26 29.2 ...
## $ kne.gi: num 34.5 36.5 37 37 37.7 36.6 40.1 39.2 35.5 38.3 ...
## $ cal.gi: num 36.5 37.5 37.3 34.8 38.6 36.1 40.3 36.7 35 38.6 ...
## $ ank.gi: num 23.5 24.5 21.9 23 24.4 23.5 23.6 22.5 22 22.2 ...
## $ wri.gi: num 16.5 17 16.9 16.6 18 16.9 18.8 18 16.5 16.9 ...
## $ age : int 21 23 28 23 22 21 26 27 23 21 ...
## $ wgt : num 65.6 71.8 80.7 72.6 78.8 74.8 86.4 78.4 62 81.6 ...
## $ hgt : num 174 175 194 186 187 ...
## $ sex : int 1 1 1 1 1 1 1 1 1 1 ...
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Estimate optimal value of my_slope
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point()
add_line(my_slope = 1)
add_line(my_slope = 2)
add_line(my_slope = .5)
General statistic model
\[ response = f(explanatory) + noise \] General linear model
\[ response = intercept + (slope * explanatory) + noise \]
Regression Model
\[ Y = \beta_0 + \beta_1 \dot{} X + \epsilon \] \[ \epsilon \sim N(0,\sigma_\epsilon) \]
Fitted Values \[ \hat{Y} = \hat{\beta_0} + \hat{\beta_1} \dot{} X\]
Residuals \[ e = Y - \hat{Y} \]
Fitting procedure
Key Concepts
# Print bdims_summary
bdims_summary <- bdims %>%
summarize(
N = n(),
r = cor(hgt, wgt),
mean_hgt = mean(hgt),
mean_wgt = mean(wgt),
sd_hgt = sd(hgt),
sd_wgt = sd(wgt)
)
# Add slope and intercept
bdims_summary %>%
mutate(
slope = r * sd_wgt/sd_hgt,
intercept = mean_wgt - (slope * mean_hgt)
)
## N r mean_hgt mean_wgt sd_hgt sd_wgt slope intercept
## 1 507 0.7173011 171.1438 69.14753 9.407205 13.34576 1.017617 -105.0113
# Height of children vs. height of father
glimpse(Galton_men)
## Observations: 465
## Variables: 6
## $ family <fctr> 1, 2, 2, 3, 4, 4, 5, 5, 5, 7, 7, 7, 7, 11, 11, 14, 14,...
## $ father <dbl> 78.5, 75.5, 75.5, 75.0, 75.0, 75.0, 75.0, 75.0, 75.0, 7...
## $ mother <dbl> 67.0, 66.5, 66.5, 64.0, 64.0, 64.0, 58.5, 58.5, 58.5, 6...
## $ sex <fctr> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, ...
## $ height <dbl> 73.2, 73.5, 72.5, 71.0, 70.5, 68.5, 72.0, 69.0, 68.0, 7...
## $ nkids <int> 4, 4, 4, 2, 5, 5, 6, 6, 6, 6, 6, 6, 6, 8, 8, 2, 2, 3, 3...
ggplot(data = Galton_men, aes(x = father, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
# Height of children vs. height of mother
glimpse(Galton_women)
## Observations: 433
## Variables: 6
## $ family <fctr> 1, 1, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6, 7, 7, 8, 8, 8, ...
## $ father <dbl> 78.5, 78.5, 78.5, 75.5, 75.5, 75.0, 75.0, 75.0, 75.0, 7...
## $ mother <dbl> 67.0, 67.0, 67.0, 66.5, 66.5, 64.0, 64.0, 64.0, 64.0, 5...
## $ sex <fctr> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, ...
## $ height <dbl> 69.2, 69.0, 69.0, 65.5, 65.5, 68.0, 67.0, 64.5, 63.0, 6...
## $ nkids <int> 4, 4, 4, 4, 4, 2, 5, 5, 5, 6, 6, 6, 1, 6, 6, 3, 3, 3, 1...
ggplot(data = Galton_women, aes(x = mother, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
In an opinion piece about nepotism published in The New York Times in 2015, economist Seth Stephens-Davidowitz wrote that:
“Regression to the mean is so powerful that once-in-a-generation talent basically never sires once-in-a-generation talent. It explains why Michael Jordan’s sons were middling college basketball players and Jakob Dylan wrote two good songs. It is why there are no American parent-child pairs among Hall of Fame players in any major professional sports league.”
The author is arguing that …
# Linear model for weight as a function of height
lm(wgt ~ hgt, data = bdims)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Coefficients:
## (Intercept) hgt
## -105.011 1.018
# Linear model for SLG as a function of OBP
lm(SLG ~ OBP, data = mlbBat10)
##
## Call:
## lm(formula = SLG ~ OBP, data = mlbBat10)
##
## Coefficients:
## (Intercept) OBP
## 0.009407 1.110323
# Log-linear model for body weight as a function of brain weight
lm(log(BodyWt) ~ log(BrainWt), data = mammals)
##
## Call:
## lm(formula = log(BodyWt) ~ log(BrainWt), data = mammals)
##
## Coefficients:
## (Intercept) log(BrainWt)
## -2.509 1.225
mod <- lm(wgt ~ hgt, data = bdims)
# Show the coefficients
coef(mod)
## (Intercept) hgt
## -105.011254 1.017617
# Show the full output
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
mod
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Coefficients:
## (Intercept) hgt
## -105.011 1.018
# Mean of weights equal to mean of fitted values?
mean(fitted.values(mod)) == mean(bdims$wgt)
## [1] TRUE
# Mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15
# Load broom
# Create bdims_tidy
bdims_tidy <- augment(mod)
# Glimpse the resulting data frame
glimpse(bdims_tidy)
## Observations: 507
## Variables: 9
## $ wgt <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62....
## $ hgt <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 18...
## $ .fitted <dbl> 72.05406, 73.37697, 91.89759, 84.77427, 85.48661, 7...
## $ .se.fit <dbl> 0.4320546, 0.4520060, 1.0667332, 0.7919264, 0.81834...
## $ .resid <dbl> -6.4540648, -1.5769666, -11.1975919, -12.1742745, -...
## $ .hat <dbl> 0.002154570, 0.002358152, 0.013133942, 0.007238576,...
## $ .sigma <dbl> 9.312824, 9.317005, 9.303732, 9.301360, 9.312471, 9...
## $ .cooksd <dbl> 5.201807e-04, 3.400330e-05, 9.758463e-03, 6.282074e...
## $ .std.resid <dbl> -0.69413418, -0.16961994, -1.21098084, -1.31269063,...
head(bdims_tidy)
## wgt hgt .fitted .se.fit .resid .hat .sigma
## 1 65.6 174.0 72.05406 0.4320546 -6.454065 0.002154570 9.312824
## 2 71.8 175.3 73.37697 0.4520060 -1.576967 0.002358152 9.317005
## 3 80.7 193.5 91.89759 1.0667332 -11.197592 0.013133942 9.303732
## 4 72.6 186.5 84.77427 0.7919264 -12.174274 0.007238576 9.301360
## 5 78.8 187.2 85.48661 0.8183471 -6.686606 0.007729629 9.312471
## 6 74.8 181.5 79.68619 0.6151427 -4.886191 0.004367523 9.314716
## .cooksd .std.resid
## 1 0.0005201807 -0.6941342
## 2 0.0000340033 -0.1696199
## 3 0.0097584634 -1.2109808
## 4 0.0062820739 -1.3126906
## 5 0.0020256461 -0.7211614
## 6 0.0006070596 -0.5260931
Looking at what books are most overpriced based on the model.
mod <- lm(uclaNew ~ amazNew, data = textbooks)
augment(mod) %>%
arrange(desc(.resid)) %>%
head()
## uclaNew amazNew .fitted .se.fit .resid .hat .sigma
## 1 197.00 131.00 157.99815 2.179394 39.00185 0.04330931 9.408665
## 2 129.60 85.20 103.08389 1.386624 26.51611 0.01753183 10.050561
## 3 180.03 134.69 162.42247 2.256857 17.60753 0.04644271 10.324375
## 4 92.88 65.73 79.73933 1.235832 13.14067 0.01392606 10.427641
## 5 123.84 93.13 112.59197 1.490523 11.24803 0.02025756 10.459091
## 6 171.00 132.77 160.12038 2.216402 10.87962 0.04479266 10.462654
## .cooksd .std.resid
## 1 0.32816195 3.807626
## 2 0.05822235 2.554497
## 3 0.07219393 1.721789
## 4 0.01127520 1.263623
## 5 0.01217296 1.085114
## 6 0.02649222 1.062967
textbooks %>%
filter(uclaNew == 197)
## deptAbbr course ibsn uclaNew amazNew more diff
## 1 Mgmt 228 978-0073379661 197 131 Y 66
Make predictions
new_data <- data.frame(amazNew = 8.49)
predict(mod, newdata = new_data)
## 1
## 11.10849
Visualize new observations
isrs <- broom::augment(mod, newdata = new_data)
ggplot(textbooks, aes(amazNew, uclaNew)) +
geom_point() +
geom_smooth(method = "lm") +
geom_point(data = isrs, aes(y = .fitted), size = 3, color = "red")
# Print ben
ben <- data.frame(wgt = 74.8, hgt = 182.8)
ben
## wgt hgt
## 1 74.8 182.8
mod <- lm(wgt ~ hgt, data = bdims)
# Predict the weight of ben
predict(mod, newdata = ben)
## 1
## 81.00909
coefs <- coef(mod)
# Add the line to the scatterplot
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_abline(data = as.data.frame(as.list(coefs)),
aes(intercept = X.Intercept., slope = hgt),
color = "dodgerblue")
SSE
mod_possum <- lm(totalL ~ tailL, data = possum)
mod_possum %>%
augment() %>%
summarize(
SSE = sum(.resid^2),
SSE_also = (n() -1) * var(.resid)
)
RMSE
\[RMSE = \sqrt{\frac{\sum_ie_i^2}{d.f}} = \sqrt{\frac{SSE}{n-2}}\]
# View summary of model
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Compute the mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15
# Compute RMSE
sqrt(sum(residuals(mod)^2) / df.residual(mod))
## [1] 9.30804
NULL(average) model
\[ \hat{y} = \bar{y} \]
## the null model
lm(totalL ~ 1, data = possum) %>%
augment(possum) %>%
summarize(SST = sum(.resid^2))
## SST
## 1 1913.826
## Using tail as an explanatory variable
lm(totalL ~ tailL, data = possum) %>%
augment(possum) %>%
summarize(SST = sum(.resid^2))
## SST
## 1 1301.488
Coeeficient of Determination
\[ R^2 = 1 - \frac{SSE}{SST} = 1 - \frac{Var(e)}{Var(y)} \]
e
is the vector of residuals and y
is the response variable.Connection to correlation
For simple linear regresssion…
\[ r_{x,y}^2 = R^2 \]
summary()
function to see the R^2 value# View model summary
str(bdims_tidy)
## 'data.frame': 507 obs. of 9 variables:
## $ wgt : num 65.6 71.8 80.7 72.6 78.8 74.8 86.4 78.4 62 81.6 ...
## $ hgt : num 174 175 194 186 187 ...
## $ .fitted : num 72.1 73.4 91.9 84.8 85.5 ...
## $ .se.fit : num 0.432 0.452 1.067 0.792 0.818 ...
## $ .resid : num -6.45 -1.58 -11.2 -12.17 -6.69 ...
## $ .hat : num 0.00215 0.00236 0.01313 0.00724 0.00773 ...
## $ .sigma : num 9.31 9.32 9.3 9.3 9.31 ...
## $ .cooksd : num 0.00052 0.000034 0.009758 0.006282 0.002026 ...
## $ .std.resid: num -0.694 -0.17 -1.211 -1.313 -0.721 ...
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
var(bdims_tidy$.resid)
## [1] 86.46839
var(bdims_tidy$wgt)
## [1] 178.1094
# Compute R-squared
bdims_tidy %>%
summarize(
var_y = var(wgt),
var_e = var(.resid)
) %>%
mutate(R_squared = 1 - (var_e/var_y))
## var_y var_e R_squared
## 1 178.1094 86.46839 0.5145208
# Compute SSE for null model
mod_null <- lm(wgt ~ 1, data = bdims) %>%
augment()
head(mod_null)
## wgt .fitted .se.fit .resid .hat .sigma .cooksd
## 1 65.6 69.14753 0.5927061 -3.547535 0.001972387 13.35803 1.399179e-04
## 2 71.8 69.14753 0.5927061 2.652465 0.001972387 13.35845 7.822033e-05
## 3 80.7 69.14753 0.5927061 11.552465 0.001972387 13.34906 1.483780e-03
## 4 72.6 69.14753 0.5927061 3.452465 0.001972387 13.35808 1.325192e-04
## 5 78.8 69.14753 0.5927061 9.652465 0.001972387 13.35205 1.035849e-03
## 6 74.8 69.14753 0.5927061 5.652465 0.001972387 13.35660 3.552188e-04
## .std.resid
## 1 -0.2660798
## 2 0.1989459
## 3 0.8664829
## 4 0.2589493
## 5 0.7239750
## 6 0.4239584
mod_null %>%
summarize(SSE = var(.resid))
## SSE
## 1 178.1094
# Compute SSE for regression model
mod_hgt <- mod %>%
augment()
head(mod_hgt)
## wgt hgt .fitted .se.fit .resid .hat .sigma
## 1 65.6 174.0 72.05406 0.4320546 -6.454065 0.002154570 9.312824
## 2 71.8 175.3 73.37697 0.4520060 -1.576967 0.002358152 9.317005
## 3 80.7 193.5 91.89759 1.0667332 -11.197592 0.013133942 9.303732
## 4 72.6 186.5 84.77427 0.7919264 -12.174274 0.007238576 9.301360
## 5 78.8 187.2 85.48661 0.8183471 -6.686606 0.007729629 9.312471
## 6 74.8 181.5 79.68619 0.6151427 -4.886191 0.004367523 9.314716
## .cooksd .std.resid
## 1 0.0005201807 -0.6941342
## 2 0.0000340033 -0.1696199
## 3 0.0097584634 -1.2109808
## 4 0.0062820739 -1.3126906
## 5 0.0020256461 -0.7211614
## 6 0.0006070596 -0.5260931
mod_hgt %>%
summarize(SSE = var(.resid))
## SSE
## 1 86.46839
.hat
variable from augment()
function\[ h_i = \frac{1}{n} + \frac{(x_i - \bar{x})^2}{\sum_{i=1}^n(x_i - \bar{x})^2}\]
.cooksd
in augment# Rank points of high leverage
mod <- lm(formula = SLG ~ OBP, data = filter(mlbBat10, AB >= 10))
mod %>%
augment %>%
arrange(desc(.hat)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561
# Rank influential points
mod %>%
augment() %>%
arrange(desc(.cooksd)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.308 0.550 0.69049108 0.009158810 -0.3824911 0.016410487 0.07011360
## 2 0.833 0.385 0.47211002 0.004190644 0.3608900 0.003435619 0.07028875
## 3 0.800 0.455 0.56475653 0.006186785 0.2352435 0.007488132 0.07101125
## 4 0.379 0.133 0.13858258 0.005792344 0.2404174 0.006563752 0.07098798
## 5 0.786 0.438 0.54225666 0.005678026 0.2437433 0.006307223 0.07097257
## 6 0.231 0.077 0.06446537 0.007506974 0.1665346 0.011024863 0.07127661
## .cooksd .std.resid
## 1 0.24274468 -5.394312
## 2 0.04407145 5.056428
## 3 0.04114818 3.302718
## 4 0.03760256 3.373787
## 5 0.03712042 3.420018
## 6 0.03057912 2.342252
# Create nontrivial_players
nontrivial_players <- mlbBat10 %>%
filter(AB >= 10, OBP < 0.5)
# Fit model to new data
mod_cleaner <- lm(SLG ~ OBP, data = nontrivial_players)
# View model summary
summary(mod_cleaner)
##
## Call:
## lm(formula = SLG ~ OBP, data = nontrivial_players)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31383 -0.04165 -0.00261 0.03992 0.35819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.043326 0.009823 -4.411 1.18e-05 ***
## OBP 1.345816 0.033012 40.768 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07011 on 734 degrees of freedom
## Multiple R-squared: 0.6937, Adjusted R-squared: 0.6932
## F-statistic: 1662 on 1 and 734 DF, p-value: < 2.2e-16
# Visualize new model
ggplot(data = nontrivial_players, aes(x = OBP, y = SLG)) +
geom_point() +
geom_smooth(method = "lm")
# Rank high leverage points
mod %>%
augment() %>%
arrange(desc(.hat), .cooksd) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561