Disclaimer: The content of this RMarkdown note came from a course called Correlation and Regression in datacamp.

Correlation and Regression

Chapter 1: Visualizing two variables

ncbirths <- read.csv("/resources/rstudio/ncbirths.csv")

1.1 Scatterplots

# Load packages
library(ggplot2)

# Scatterplot of weight vs. weeks
ggplot(data = ncbirths, aes(x = weeks, y = weight)) + geom_point()

1.2 Boxplots as discretized/conditioned scatterplots

# Boxplot of weight vs. weeks
ggplot(data = ncbirths, 
       aes(x = cut(weeks, breaks = 5), y = weight)) + 
  geom_boxplot()

1.3 Creating scatterplots

# Load the package
library(openintro)

# Mammals scatterplot
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
  geom_point()


# Baseball player scatterplot
ggplot(data = mlbBat10, aes(x = OBP, y = SLG)) +
  geom_point()


# Body dimensions scatterplot
ggplot(data = bdims, aes(x = hgt, y = wgt, color = factor(sex))) +
  geom_point()


# Smoking scatterplot
ggplot(data = smoking, aes(x = age, y = amtWeekdays)) +
  geom_point()

1.4 Transformations

# Scatterplot with coord_trans()
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
  geom_point() + 
  coord_trans(x = "log10", y = "log10")


# Scatterplot with scale_x_log10() and scale_y_log10()
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
  geom_point() +
  scale_x_log10() + scale_y_log10()

1.5 Identifying outliers

# Load package
library(dplyr)

# Scatterplot of SLG vs. OBP
mlbBat10 %>%
  filter(AB >= 200) %>%
  ggplot(aes(x = OBP, y = SLG)) +
  geom_point()


# Identify the outlying player
mlbBat10 %>%
  filter(AB >= 200, OBP < 0.2)
##     name team position  G  AB  R  H 2B 3B HR RBI TB BB SO SB CS   OBP
## 1 B Wood  LAA       3B 81 226 20 33  2  0  4  14 47  6 71  1  0 0.174
##     SLG   AVG
## 1 0.208 0.146

Chapter 2: Correlation

2.1 Computing correlation

# Compute correlation
ncbirths %>%
  summarize(N = n(), r = cor(weight, mage))
##      N          r
## 1 1000 0.05506589

# Compute correlation for all non-missing pairs
ncbirths %>%
  summarize(N = n(), r = cor(weight, weeks, use = "pairwise.complete.obs"))
##      N         r
## 1 1000 0.6701013

2.2 Exploring Anscombe

Anscombe <- read.csv("/resources/rstudio/Anscombe.csv")

# Compute properties of Anscombe
Anscombe %>%
  group_by(set) %>%
  summarize(N = n(), mean(x), sd(x), mean(y), sd(y), cor(x,y))
## # A tibble: 4 x 7
##     set     N `mean(x)`  `sd(x)` `mean(y)`  `sd(y)` `cor(x, y)`
##   <int> <int>     <dbl>    <dbl>     <dbl>    <dbl>       <dbl>
## 1     1    11         9 3.316625  7.500909 2.031568   0.8164205
## 2     2    11         9 3.316625  7.500909 2.031657   0.8162365
## 3     3    11         9 3.316625  7.500000 2.030424   0.8162867
## 4     4    11         9 3.316625  7.500909 2.030579   0.8165214

2.3 Perception of correlation (2)

# Correlation for all baseball players
mlbBat10 %>%
  summarize(N = n(), r = cor(OBP, SLG))
##      N         r
## 1 1199 0.8145628

# Correlation for all players with at least 200 ABs
mlbBat10 %>%
  filter(AB >= 200) %>%
  summarize(N = n(), r = cor(OBP, SLG))
##     N         r
## 1 329 0.6855364

# Correlation of body dimensions
bdims %>%
  group_by(sex) %>%
  summarize(N = n(), r = cor(hgt, wgt))
## # A tibble: 2 x 3
##     sex     N         r
##   <int> <int>     <dbl>
## 1     0   260 0.4310593
## 2     1   247 0.5347418

# Correlation among mammals, with and without log
mammals %>%
  summarize(N = n(), 
            r = cor(BodyWt, BrainWt), 
            r_log = cor(log(BodyWt), log(BrainWt)))
##    N         r     r_log
## 1 62 0.9341638 0.9595748

2.4 Spurious correlation in random data

noise <- read.csv("/resources/rstudio/noise.csv")

# Create faceted scatterplot
ggplot(noise, aes(x = x, y = y)) +
geom_point() + 
facet_wrap(~ z)


# Compute correlations for each dataset
noise_summary <- noise %>%
  group_by(z) %>%
  summarize(N = n(), spurious_cor = cor(x, y))

# Isolate sets with correlations above 0.2 in absolute strength
noise_summary %>%
  filter(abs(spurious_cor) > 0.2)
## # A tibble: 4 x 3
##       z     N spurious_cor
##   <int> <int>        <dbl>
## 1     3    50   -0.2063690
## 2     5    50    0.3060001
## 3    14    50   -0.2883678
## 4    20    50   -0.2323851

Chapter 3: Simple linear regression

3.1 The “best fit” line

# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE)

3.2 Uniqueness of least squares regression line

# Define add_line function
add_line <- function(my_slope) {
  ggplot(data = bdims, aes(hgt, wgt)) +
    geom_point() +
    geom_abline(intercept = -102, slope = my_slope, col = "blue")
}
# Estimate optimal value of my_slope
add_line(my_slope = 1)

3.3 Fitting a linear model“by hand”

# Summarize bdims
bdims_summary <-
  bdims %>%
  summarize(N = n(), r = cor(wgt, hgt), mean_hgt = mean(hgt), sd_hgt = sd(hgt),
            mean_wgt = mean(wgt), sd_wgt = sd(wgt))

# Print bdims_summary
bdims_summary
##     N         r mean_hgt   sd_hgt mean_wgt   sd_wgt
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576

# Add slope and intercept
bdims_summary %>%
  mutate(slope = r * sd_wgt / sd_hgt, 
         intercept = mean_wgt - slope * mean_hgt)
##     N         r mean_hgt   sd_hgt mean_wgt   sd_wgt    slope intercept
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576 1.017617 -105.0113

3.4 Regression to the mean

Galton_men <- read.csv("/resources/rstudio/Galton_men.csv")
Galton_women <- read.csv("/resources/rstudio/Galton_women.csv")

# Height of children vs. height of father
ggplot(data = Galton_men, aes(x = father, y = height)) +
  geom_point() + 
  geom_abline(slope = 1, intercept = 0) + 
  geom_smooth(method = "lm", se = FALSE)


# Height of children vs. height of mother
ggplot(data = Galton_women, aes(x = mother, y = height)) +
  geom_point() + 
  geom_abline(slope = 1, intercept = 0) + 
  geom_smooth(method = "lm", se = FALSE)

Chapter 4: Interpreting regression models

4.1 Fitting simple linear models

# Linear model for weight as a function of height
lm(wgt ~ hgt, data = bdims)
## 
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
## 
## Coefficients:
## (Intercept)          hgt  
##    -105.011        1.018

# Linear model for SLG as a function of OBP
lm(SLG ~ OBP, data = mlbBat10)
## 
## Call:
## lm(formula = SLG ~ OBP, data = mlbBat10)
## 
## Coefficients:
## (Intercept)          OBP  
##    0.009407     1.110323

# Log-linear model for body weight as a function of brain weight
lm(log(BodyWt) ~ log(BrainWt), data = mammals)
## 
## Call:
## lm(formula = log(BodyWt) ~ log(BrainWt), data = mammals)
## 
## Coefficients:
##  (Intercept)  log(BrainWt)  
##       -2.509         1.225

4.2 The lm summary output

# Create a linear model
mod <- lm(wgt ~ hgt, data = bdims)

# Show the coefficients
coef(mod)
## (Intercept)         hgt 
## -105.011254    1.017617

# Show the full output
summary(mod)
## 
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.743  -6.402  -1.231   5.059  41.103 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -105.01125    7.53941  -13.93   <2e-16 ***
## hgt            1.01762    0.04399   23.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared:  0.5145, Adjusted R-squared:  0.5136 
## F-statistic: 535.2 on 1 and 505 DF,  p-value: < 2.2e-16

4.3 Fitted values and residuals

# Mean of weights equal to mean of fitted values?
mean(bdims$wgt) == mean(fitted.values(mod))
## [1] TRUE

# Mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15

4.4 Tidying your linear model

# Load broom
library(broom)

# Create bdims_tidy
bdims_tidy <- augment(mod)

# Glimpse the resulting data frame
glimpse(bdims_tidy)
## Observations: 507
## Variables: 9
## $ wgt        <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62....
## $ hgt        <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 18...
## $ .fitted    <dbl> 72.05406, 73.37697, 91.89759, 84.77427, 85.48661, 7...
## $ .se.fit    <dbl> 0.4320546, 0.4520060, 1.0667332, 0.7919264, 0.81834...
## $ .resid     <dbl> -6.4540648, -1.5769666, -11.1975919, -12.1742745, -...
## $ .hat       <dbl> 0.002154570, 0.002358152, 0.013133942, 0.007238576,...
## $ .sigma     <dbl> 9.312824, 9.317005, 9.303732, 9.301360, 9.312471, 9...
## $ .cooksd    <dbl> 5.201807e-04, 3.400330e-05, 9.758463e-03, 6.282074e...
## $ .std.resid <dbl> -0.69413418, -0.16961994, -1.21098084, -1.31269063,...

4.5 Making predictions

# Define ben
ben <- data.frame(74.8, 182.8)
names(ben) <- c("wgt", "hgt")

# Print ben
ben
##    wgt   hgt
## 1 74.8 182.8

# Predict the weight of ben
predict(mod, newdata = ben)
##        1 
## 81.00909

4.6 Adding a regression line to a plot manually

coefs <- data.frame(-105, 1.02)
names(coefs) <- c("(Intercept)", "hgt")
str(coefs)
## 'data.frame':    1 obs. of  2 variables:
##  $ (Intercept): num -105
##  $ hgt        : num 1.02

# Add the line to the scatterplot
ggplot(data = bdims, aes(x = hgt, y = wgt)) + 
  geom_point() + 
  geom_abline(data = coefs, 
              aes(intercept = `(Intercept)`, slope = hgt),  
              color = "dodgerblue")

Chapter 5: Model Fit

5.1 Standard error of residuals

# View summary of model
summary(mod)
## 
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.743  -6.402  -1.231   5.059  41.103 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -105.01125    7.53941  -13.93   <2e-16 ***
## hgt            1.01762    0.04399   23.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared:  0.5145, Adjusted R-squared:  0.5136 
## F-statistic: 535.2 on 1 and 505 DF,  p-value: < 2.2e-16

# Compute the mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15

# Compute RMSE
sqrt(sum(residuals(mod)^2) / df.residual(mod))
## [1] 9.30804

5.2 Assessing simple linear model fit

# View model summary
summary(mod)
## 
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.743  -6.402  -1.231   5.059  41.103 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -105.01125    7.53941  -13.93   <2e-16 ***
## hgt            1.01762    0.04399   23.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared:  0.5145, Adjusted R-squared:  0.5136 
## F-statistic: 535.2 on 1 and 505 DF,  p-value: < 2.2e-16

# Compute R-squared
bdims_tidy %>%
  summarize(var_y = var(wgt), var_e = var(.resid)) %>%
  mutate(R_squared = 1 - var_e / var_y)
##      var_y    var_e R_squared
## 1 178.1094 86.46839 0.5145208

5.3 Linear vs. average

mod <- lm(wgt ~ 1, data = bdims) 
mod_null <-
  mod %>%
  augment(bdims)

mod <- lm(wgt ~ hgt, data = bdims)
mod_hgt <-
  mod %>%
  augment(bdims)


# Compute SSE for null model
mod_null %>%
  summarize(SSE = var(.resid))
##        SSE
## 1 178.1094

# Compute SSE for regression model
mod_hgt %>%
  summarize(SSE = var(.resid))
##        SSE
## 1 86.46839

5.4 Leverage

# Rank points of high leverage
mod %>%
  augment() %>%
  arrange(desc(.hat)) %>%
  head()
##    wgt   hgt  .fitted  .se.fit     .resid       .hat   .sigma      .cooksd
## 1 85.5 198.1 96.57863 1.255712 -11.078629 0.01819968 9.303950 0.0133734319
## 2 90.9 197.1 95.56101 1.214264  -4.661012 0.01701803 9.314916 0.0022081690
## 3 49.8 147.2 44.78194 1.131432   5.018065 0.01477545 9.314548 0.0022120570
## 4 80.7 193.5 91.89759 1.066733 -11.197592 0.01313394 9.303732 0.0097584634
## 5 95.9 193.0 91.38878 1.046493   4.511216 0.01264027 9.315075 0.0015228117
## 6 44.8 149.5 47.12245 1.037916  -2.322454 0.01243391 9.316688 0.0003968468
##   .std.resid
## 1 -1.2012024
## 2 -0.5050673
## 3  0.5431383
## 4 -1.2109808
## 5  0.4877505
## 6 -0.2510763

5.5 Influence

# Rank influential points
mod %>%
  augment() %>%
  arrange(desc(.cooksd)) %>%
  head()
##     wgt   hgt  .fitted   .se.fit   .resid        .hat   .sigma    .cooksd
## 1  73.2 151.1 48.75064 0.9737632 24.44936 0.010944356 9.252694 0.03859555
## 2 116.4 177.8 75.92101 0.5065670 40.47899 0.002961811 9.140611 0.02817388
## 3 104.1 165.1 62.99728 0.4914889 41.10272 0.002788117 9.135102 0.02733574
## 4 108.6 190.5 88.84474 0.9464667 19.75526 0.010339372 9.275186 0.02377609
## 5  67.3 152.4 50.07354 0.9223084 17.22646 0.009818289 9.285305 0.01714950
## 6  76.8 157.5 55.26339 0.7287405 21.53661 0.006129560 9.267446 0.01661032
##   .std.resid
## 1   2.641185
## 2   4.355274
## 3   4.421999
## 4   2.133444
## 5   1.859860
## 6   2.320888

5.6 Removing outliers

# Create nontrivial_players
nontrivial_players <- mlbBat10 %>%
  filter(AB >= 10, OBP < 0.5)

# Fit model to new data
mod_cleaner <- lm(SLG ~ OBP, data = nontrivial_players)

# View model summary
summary(mod_cleaner)
## 
## Call:
## lm(formula = SLG ~ OBP, data = nontrivial_players)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31383 -0.04165 -0.00261  0.03992  0.35819 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.043326   0.009823  -4.411 1.18e-05 ***
## OBP          1.345816   0.033012  40.768  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07011 on 734 degrees of freedom
## Multiple R-squared:  0.6937, Adjusted R-squared:  0.6932 
## F-statistic:  1662 on 1 and 734 DF,  p-value: < 2.2e-16

# Visualize new model
ggplot(data = nontrivial_players, aes(x = OBP, y = SLG)) +
  geom_point() + 
  geom_smooth(method = "lm")

5.7 High leverage points

# Rank high leverage points
mod %>%
  augment() %>%
  arrange(desc(.hat), .cooksd) %>%
  head()
##    wgt   hgt  .fitted  .se.fit     .resid       .hat   .sigma      .cooksd
## 1 85.5 198.1 96.57863 1.255712 -11.078629 0.01819968 9.303950 0.0133734319
## 2 90.9 197.1 95.56101 1.214264  -4.661012 0.01701803 9.314916 0.0022081690
## 3 49.8 147.2 44.78194 1.131432   5.018065 0.01477545 9.314548 0.0022120570
## 4 80.7 193.5 91.89759 1.066733 -11.197592 0.01313394 9.303732 0.0097584634
## 5 95.9 193.0 91.38878 1.046493   4.511216 0.01264027 9.315075 0.0015228117
## 6 44.8 149.5 47.12245 1.037916  -2.322454 0.01243391 9.316688 0.0003968468
##   .std.resid
## 1 -1.2012024
## 2 -0.5050673
## 3  0.5431383
## 4 -1.2109808
## 5  0.4877505
## 6 -0.2510763

Correlation and Regression

Eric Mazzola

7/10/2017

Correlation and Regression

Chapter 1: Visualizing two variables

1.1 Scatterplots

1.2 Boxplots as discretized/conditioned scatterplots

1.3 Creating scatterplots

1.4 Transformations

1.5 Identifying outliers

Chapter 2: Correlation

2.1 Computing correlation

2.2 Exploring Anscombe

2.3 Perception of correlation (2)

2.4 Spurious correlation in random data

Chapter 3: Simple linear regression

3.1 The “best fit” line

3.2 Uniqueness of least squares regression line

3.3 Fitting a linear model“by hand”

3.4 Regression to the mean

Chapter 4: Interpreting regression models

4.1 Fitting simple linear models

4.2 The lm summary output

4.3 Fitted values and residuals

4.4 Tidying your linear model

4.5 Making predictions

4.6 Adding a regression line to a plot manually

Chapter 5: Model Fit

5.1 Standard error of residuals

5.2 Assessing simple linear model fit

5.3 Linear vs. average

5.4 Leverage

5.5 Influence

5.6 Removing outliers

5.7 High leverage points

Quiz 5