Disclaimer: The content of this RMarkdown note came from a course called Correlation and Regression in datacamp.
ncbirths <- read.csv("/resources/rstudio/ncbirths.csv")
# Load packages
library(ggplot2)
# Scatterplot of weight vs. weeks
ggplot(data = ncbirths, aes(x = weeks, y = weight)) + geom_point()
# Boxplot of weight vs. weeks
ggplot(data = ncbirths,
aes(x = cut(weeks, breaks = 5), y = weight)) +
geom_boxplot()
# Load the package
library(openintro)
# Mammals scatterplot
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point()
# Baseball player scatterplot
ggplot(data = mlbBat10, aes(x = OBP, y = SLG)) +
geom_point()
# Body dimensions scatterplot
ggplot(data = bdims, aes(x = hgt, y = wgt, color = factor(sex))) +
geom_point()
# Smoking scatterplot
ggplot(data = smoking, aes(x = age, y = amtWeekdays)) +
geom_point()
# Scatterplot with coord_trans()
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
coord_trans(x = "log10", y = "log10")
# Scatterplot with scale_x_log10() and scale_y_log10()
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
scale_x_log10() + scale_y_log10()
# Load package
library(dplyr)
# Scatterplot of SLG vs. OBP
mlbBat10 %>%
filter(AB >= 200) %>%
ggplot(aes(x = OBP, y = SLG)) +
geom_point()
# Identify the outlying player
mlbBat10 %>%
filter(AB >= 200, OBP < 0.2)
## name team position G AB R H 2B 3B HR RBI TB BB SO SB CS OBP
## 1 B Wood LAA 3B 81 226 20 33 2 0 4 14 47 6 71 1 0 0.174
## SLG AVG
## 1 0.208 0.146
# Compute correlation
ncbirths %>%
summarize(N = n(), r = cor(weight, mage))
## N r
## 1 1000 0.05506589
# Compute correlation for all non-missing pairs
ncbirths %>%
summarize(N = n(), r = cor(weight, weeks, use = "pairwise.complete.obs"))
## N r
## 1 1000 0.6701013
Anscombe <- read.csv("/resources/rstudio/Anscombe.csv")
# Compute properties of Anscombe
Anscombe %>%
group_by(set) %>%
summarize(N = n(), mean(x), sd(x), mean(y), sd(y), cor(x,y))
## # A tibble: 4 x 7
## set N `mean(x)` `sd(x)` `mean(y)` `sd(y)` `cor(x, y)`
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11 9 3.316625 7.500909 2.031568 0.8164205
## 2 2 11 9 3.316625 7.500909 2.031657 0.8162365
## 3 3 11 9 3.316625 7.500000 2.030424 0.8162867
## 4 4 11 9 3.316625 7.500909 2.030579 0.8165214
# Correlation for all baseball players
mlbBat10 %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 1199 0.8145628
# Correlation for all players with at least 200 ABs
mlbBat10 %>%
filter(AB >= 200) %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 329 0.6855364
# Correlation of body dimensions
bdims %>%
group_by(sex) %>%
summarize(N = n(), r = cor(hgt, wgt))
## # A tibble: 2 x 3
## sex N r
## <int> <int> <dbl>
## 1 0 260 0.4310593
## 2 1 247 0.5347418
# Correlation among mammals, with and without log
mammals %>%
summarize(N = n(),
r = cor(BodyWt, BrainWt),
r_log = cor(log(BodyWt), log(BrainWt)))
## N r r_log
## 1 62 0.9341638 0.9595748
noise <- read.csv("/resources/rstudio/noise.csv")
# Create faceted scatterplot
ggplot(noise, aes(x = x, y = y)) +
geom_point() +
facet_wrap(~ z)
# Compute correlations for each dataset
noise_summary <- noise %>%
group_by(z) %>%
summarize(N = n(), spurious_cor = cor(x, y))
# Isolate sets with correlations above 0.2 in absolute strength
noise_summary %>%
filter(abs(spurious_cor) > 0.2)
## # A tibble: 4 x 3
## z N spurious_cor
## <int> <int> <dbl>
## 1 3 50 -0.2063690
## 2 5 50 0.3060001
## 3 14 50 -0.2883678
## 4 20 50 -0.2323851
# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Define add_line function
add_line <- function(my_slope) {
ggplot(data = bdims, aes(hgt, wgt)) +
geom_point() +
geom_abline(intercept = -102, slope = my_slope, col = "blue")
}
# Estimate optimal value of my_slope
add_line(my_slope = 1)
# Summarize bdims
bdims_summary <-
bdims %>%
summarize(N = n(), r = cor(wgt, hgt), mean_hgt = mean(hgt), sd_hgt = sd(hgt),
mean_wgt = mean(wgt), sd_wgt = sd(wgt))
# Print bdims_summary
bdims_summary
## N r mean_hgt sd_hgt mean_wgt sd_wgt
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576
# Add slope and intercept
bdims_summary %>%
mutate(slope = r * sd_wgt / sd_hgt,
intercept = mean_wgt - slope * mean_hgt)
## N r mean_hgt sd_hgt mean_wgt sd_wgt slope intercept
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576 1.017617 -105.0113
Galton_men <- read.csv("/resources/rstudio/Galton_men.csv")
Galton_women <- read.csv("/resources/rstudio/Galton_women.csv")
# Height of children vs. height of father
ggplot(data = Galton_men, aes(x = father, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
# Height of children vs. height of mother
ggplot(data = Galton_women, aes(x = mother, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
# Linear model for weight as a function of height
lm(wgt ~ hgt, data = bdims)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Coefficients:
## (Intercept) hgt
## -105.011 1.018
# Linear model for SLG as a function of OBP
lm(SLG ~ OBP, data = mlbBat10)
##
## Call:
## lm(formula = SLG ~ OBP, data = mlbBat10)
##
## Coefficients:
## (Intercept) OBP
## 0.009407 1.110323
# Log-linear model for body weight as a function of brain weight
lm(log(BodyWt) ~ log(BrainWt), data = mammals)
##
## Call:
## lm(formula = log(BodyWt) ~ log(BrainWt), data = mammals)
##
## Coefficients:
## (Intercept) log(BrainWt)
## -2.509 1.225
# Create a linear model
mod <- lm(wgt ~ hgt, data = bdims)
# Show the coefficients
coef(mod)
## (Intercept) hgt
## -105.011254 1.017617
# Show the full output
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Mean of weights equal to mean of fitted values?
mean(bdims$wgt) == mean(fitted.values(mod))
## [1] TRUE
# Mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15
# Load broom
library(broom)
# Create bdims_tidy
bdims_tidy <- augment(mod)
# Glimpse the resulting data frame
glimpse(bdims_tidy)
## Observations: 507
## Variables: 9
## $ wgt <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62....
## $ hgt <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 18...
## $ .fitted <dbl> 72.05406, 73.37697, 91.89759, 84.77427, 85.48661, 7...
## $ .se.fit <dbl> 0.4320546, 0.4520060, 1.0667332, 0.7919264, 0.81834...
## $ .resid <dbl> -6.4540648, -1.5769666, -11.1975919, -12.1742745, -...
## $ .hat <dbl> 0.002154570, 0.002358152, 0.013133942, 0.007238576,...
## $ .sigma <dbl> 9.312824, 9.317005, 9.303732, 9.301360, 9.312471, 9...
## $ .cooksd <dbl> 5.201807e-04, 3.400330e-05, 9.758463e-03, 6.282074e...
## $ .std.resid <dbl> -0.69413418, -0.16961994, -1.21098084, -1.31269063,...
# Define ben
ben <- data.frame(74.8, 182.8)
names(ben) <- c("wgt", "hgt")
# Print ben
ben
## wgt hgt
## 1 74.8 182.8
# Predict the weight of ben
predict(mod, newdata = ben)
## 1
## 81.00909
coefs <- data.frame(-105, 1.02)
names(coefs) <- c("(Intercept)", "hgt")
str(coefs)
## 'data.frame': 1 obs. of 2 variables:
## $ (Intercept): num -105
## $ hgt : num 1.02
# Add the line to the scatterplot
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_abline(data = coefs,
aes(intercept = `(Intercept)`, slope = hgt),
color = "dodgerblue")
# View summary of model
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Compute the mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15
# Compute RMSE
sqrt(sum(residuals(mod)^2) / df.residual(mod))
## [1] 9.30804
# View model summary
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Compute R-squared
bdims_tidy %>%
summarize(var_y = var(wgt), var_e = var(.resid)) %>%
mutate(R_squared = 1 - var_e / var_y)
## var_y var_e R_squared
## 1 178.1094 86.46839 0.5145208
mod <- lm(wgt ~ 1, data = bdims)
mod_null <-
mod %>%
augment(bdims)
mod <- lm(wgt ~ hgt, data = bdims)
mod_hgt <-
mod %>%
augment(bdims)
# Compute SSE for null model
mod_null %>%
summarize(SSE = var(.resid))
## SSE
## 1 178.1094
# Compute SSE for regression model
mod_hgt %>%
summarize(SSE = var(.resid))
## SSE
## 1 86.46839
# Rank points of high leverage
mod %>%
augment() %>%
arrange(desc(.hat)) %>%
head()
## wgt hgt .fitted .se.fit .resid .hat .sigma .cooksd
## 1 85.5 198.1 96.57863 1.255712 -11.078629 0.01819968 9.303950 0.0133734319
## 2 90.9 197.1 95.56101 1.214264 -4.661012 0.01701803 9.314916 0.0022081690
## 3 49.8 147.2 44.78194 1.131432 5.018065 0.01477545 9.314548 0.0022120570
## 4 80.7 193.5 91.89759 1.066733 -11.197592 0.01313394 9.303732 0.0097584634
## 5 95.9 193.0 91.38878 1.046493 4.511216 0.01264027 9.315075 0.0015228117
## 6 44.8 149.5 47.12245 1.037916 -2.322454 0.01243391 9.316688 0.0003968468
## .std.resid
## 1 -1.2012024
## 2 -0.5050673
## 3 0.5431383
## 4 -1.2109808
## 5 0.4877505
## 6 -0.2510763
# Rank influential points
mod %>%
augment() %>%
arrange(desc(.cooksd)) %>%
head()
## wgt hgt .fitted .se.fit .resid .hat .sigma .cooksd
## 1 73.2 151.1 48.75064 0.9737632 24.44936 0.010944356 9.252694 0.03859555
## 2 116.4 177.8 75.92101 0.5065670 40.47899 0.002961811 9.140611 0.02817388
## 3 104.1 165.1 62.99728 0.4914889 41.10272 0.002788117 9.135102 0.02733574
## 4 108.6 190.5 88.84474 0.9464667 19.75526 0.010339372 9.275186 0.02377609
## 5 67.3 152.4 50.07354 0.9223084 17.22646 0.009818289 9.285305 0.01714950
## 6 76.8 157.5 55.26339 0.7287405 21.53661 0.006129560 9.267446 0.01661032
## .std.resid
## 1 2.641185
## 2 4.355274
## 3 4.421999
## 4 2.133444
## 5 1.859860
## 6 2.320888
# Create nontrivial_players
nontrivial_players <- mlbBat10 %>%
filter(AB >= 10, OBP < 0.5)
# Fit model to new data
mod_cleaner <- lm(SLG ~ OBP, data = nontrivial_players)
# View model summary
summary(mod_cleaner)
##
## Call:
## lm(formula = SLG ~ OBP, data = nontrivial_players)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31383 -0.04165 -0.00261 0.03992 0.35819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.043326 0.009823 -4.411 1.18e-05 ***
## OBP 1.345816 0.033012 40.768 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07011 on 734 degrees of freedom
## Multiple R-squared: 0.6937, Adjusted R-squared: 0.6932
## F-statistic: 1662 on 1 and 734 DF, p-value: < 2.2e-16
# Visualize new model
ggplot(data = nontrivial_players, aes(x = OBP, y = SLG)) +
geom_point() +
geom_smooth(method = "lm")
# Rank high leverage points
mod %>%
augment() %>%
arrange(desc(.hat), .cooksd) %>%
head()
## wgt hgt .fitted .se.fit .resid .hat .sigma .cooksd
## 1 85.5 198.1 96.57863 1.255712 -11.078629 0.01819968 9.303950 0.0133734319
## 2 90.9 197.1 95.56101 1.214264 -4.661012 0.01701803 9.314916 0.0022081690
## 3 49.8 147.2 44.78194 1.131432 5.018065 0.01477545 9.314548 0.0022120570
## 4 80.7 193.5 91.89759 1.066733 -11.197592 0.01313394 9.303732 0.0097584634
## 5 95.9 193.0 91.38878 1.046493 4.511216 0.01264027 9.315075 0.0015228117
## 6 44.8 149.5 47.12245 1.037916 -2.322454 0.01243391 9.316688 0.0003968468
## .std.resid
## 1 -1.2012024
## 2 -0.5050673
## 3 0.5431383
## 4 -1.2109808
## 5 0.4877505
## 6 -0.2510763