Correlation and Regression in R

DataCamp: Statistics with R

Bonnie Cooper

library( dplyr )
library( ggplot2 )
library( gridExtra )
library( openintro )
library( tidyverse )
library( UsingR )
library( broom )

Visualizing two variables

Visualizing bivariate relationships

  • both variables are numeric
  • response variable: y = dependent variable
  • explanatory variable: x = independent/predictor variable; basically, something you think might be related to the dependent/response variable

Graphical Representations with Scatter plots.
Convention: plot the explanatory/independent variable on the x-axis and the dependent/response variable on the y-axis.

#glimpse( possum )
simplescatter <- ggplot( data = possum, aes( y = total_l, x = tail_l ) ) +
  geom_point() +
  labs( x = 'tail length (cm)', y = 'body length (cm)',
        title = 'Start with a Simple Scatter Plot' )

scattercut <- ggplot( data = possum, aes( y = total_l, x = cut( tail_l , breaks = 5 ) ) ) +
  geom_point() +
  labs( x = 'tail length (cm)', y = 'body length (cm)',
        title = 'Cut the distribution into discrete ranges' )

boxcut <- ggplot( data = possum, aes( y = total_l, x = cut( tail_l , breaks = 5 ) ) ) +
  geom_boxplot() +
  labs( x = 'tail length (cm)', y = 'body length (cm)',
        title = 'Plot as Boxplot' )

grid.arrange( simplescatter, scattercut, boxcut, ncol = 3 )

glimpse( ncbirths )
## Rows: 1,000
## Columns: 13
## $ fage           <int> NA, NA, 19, 21, NA, NA, 18, 17, NA, 20, 30, NA, NA, NA…
## $ mage           <int> 13, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16…
## $ mature         <fct> younger mom, younger mom, younger mom, younger mom, yo…
## $ weeks          <int> 39, 42, 37, 41, 39, 38, 37, 35, 38, 37, 45, 42, 40, 38…
## $ premie         <fct> full term, full term, full term, full term, full term,…
## $ visits         <int> 10, 15, 11, 6, 9, 19, 12, 5, 9, 13, 9, 8, 4, 12, 15, 7…
## $ marital        <fct> not married, not married, not married, not married, no…
## $ gained         <int> 38, 20, 38, 34, 27, 22, 76, 15, NA, 52, 28, 34, 12, 30…
## $ weight         <dbl> 7.63, 7.88, 6.63, 8.00, 6.38, 5.38, 8.44, 4.69, 8.81, …
## $ lowbirthweight <fct> not low, not low, not low, not low, not low, low, not …
## $ gender         <fct> male, male, female, male, female, male, male, male, ma…
## $ habit          <fct> nonsmoker, nonsmoker, nonsmoker, nonsmoker, nonsmoker,…
## $ whitemom       <fct> not white, not white, white, white, not white, not whi…
ncbirths <- ncbirths %>% na.omit()

simplescatter <- ggplot( ncbirths, aes( y = weight, x = weeks) ) +
  geom_point() +
  labs( x = 'weeks of gestation', y = 'birth weight' )

boxcut <- ggplot( ncbirths, aes( y = weight, x = cut( weeks, breaks = 5 ) ) ) +
  geom_boxplot() +
  labs( x = 'weeks of gestation', y = 'birth weight' )

grid.arrange( simplescatter, boxcut, ncol = 2 )

Characterizing Bivariate Relationships

  • Form ( linear, quadratic, non-linear )
  • Direction ( positive or negative )
  • Strength ( how much scatter/noise )
  • Outliers
glimpse( mammals )
## Rows: 62
## Columns: 2
## $ body  <dbl> 3.385, 0.480, 1.350, 465.000, 36.330, 27.660, 14.830, 1.040, 4.…
## $ brain <dbl> 44.50, 15.50, 8.10, 423.00, 119.50, 115.00, 98.20, 5.50, 58.00,…
wilddat <- ggplot( mammals, aes( x = body, y = brain ) ) +
geom_point()

log10coord <- ggplot(data = mammals, aes(x = body, y = brain)) +
  geom_point() + 
  coord_trans(x = "log10", y = "log10")

log10scale <- ggplot(data = mammals, aes(x = body, y = brain)) +
  geom_point() +
  scale_x_log10() + 
  scale_y_log10()

grid.arrange( wilddat, log10coord, log10scale, ncol = 3 )

glimpse( mlbbat10 )
## Rows: 1,199
## Columns: 19
## $ name            <fct> I Suzuki, D Jeter, M Young, J Pierre, R Weeks, M Scut…
## $ team            <fct> SEA, NYY, TEX, CWS, MIL, BOS, BAL, MIN, NYY, CIN, MIL…
## $ position        <fct> OF, SS, 3B, OF, 2B, SS, OF, OF, 2B, 2B, OF, OF, 2B, O…
## $ game            <dbl> 162, 157, 157, 160, 160, 150, 160, 153, 160, 155, 157…
## $ at_bat          <dbl> 680, 663, 656, 651, 651, 632, 629, 629, 626, 626, 619…
## $ run             <dbl> 74, 111, 99, 96, 112, 92, 79, 85, 103, 100, 101, 103,…
## $ hit             <dbl> 214, 179, 186, 179, 175, 174, 187, 166, 200, 172, 188…
## $ double          <dbl> 30, 30, 36, 18, 32, 38, 45, 24, 41, 33, 45, 34, 41, 2…
## $ triple          <dbl> 3, 3, 3, 3, 4, 0, 3, 10, 3, 5, 1, 10, 4, 3, 3, 1, 5, …
## $ home_run        <dbl> 6, 10, 21, 1, 29, 11, 12, 3, 29, 18, 25, 4, 10, 25, 1…
## $ rbi             <dbl> 43, 67, 91, 47, 83, 56, 60, 58, 109, 59, 103, 41, 75,…
## $ total_base      <dbl> 268, 245, 291, 206, 302, 245, 274, 219, 334, 269, 310…
## $ walk            <dbl> 45, 63, 50, 45, 76, 53, 73, 60, 57, 46, 56, 47, 28, 4…
## $ strike_out      <dbl> 86, 106, 115, 47, 184, 71, 93, 74, 77, 83, 105, 170, …
## $ stolen_base     <dbl> 42, 18, 4, 68, 11, 5, 7, 26, 3, 16, 14, 27, 14, 18, 1…
## $ caught_stealing <dbl> 9, 5, 2, 18, 4, 4, 2, 4, 2, 12, 3, 6, 4, 9, 5, 1, 3, …
## $ obp             <dbl> 0.359, 0.340, 0.330, 0.341, 0.366, 0.333, 0.370, 0.33…
## $ slg             <dbl> 0.394, 0.370, 0.444, 0.316, 0.464, 0.388, 0.436, 0.34…
## $ bat_avg         <dbl> 0.315, 0.270, 0.284, 0.275, 0.269, 0.275, 0.297, 0.26…
# Baseball player scatterplot
ggplot( mlbbat10, aes( x = obp, y = slg ) ) +
geom_point()

glimpse( bdims )
## Rows: 507
## Columns: 25
## $ bia_di <dbl> 42.9, 43.7, 40.1, 44.3, 42.5, 43.3, 43.5, 44.4, 43.5, 42.0, 40…
## $ bii_di <dbl> 26.0, 28.5, 28.2, 29.9, 29.9, 27.0, 30.0, 29.8, 26.5, 28.0, 29…
## $ bit_di <dbl> 31.5, 33.5, 33.3, 34.0, 34.0, 31.5, 34.0, 33.2, 32.1, 34.0, 33…
## $ che_de <dbl> 17.7, 16.9, 20.9, 18.4, 21.5, 19.6, 21.9, 21.8, 15.5, 22.5, 20…
## $ che_di <dbl> 28.0, 30.8, 31.7, 28.2, 29.4, 31.3, 31.7, 28.8, 27.5, 28.0, 30…
## $ elb_di <dbl> 13.1, 14.0, 13.9, 13.9, 15.2, 14.0, 16.1, 15.1, 14.1, 15.6, 13…
## $ wri_di <dbl> 10.4, 11.8, 10.9, 11.2, 11.6, 11.5, 12.5, 11.9, 11.2, 12.0, 10…
## $ kne_di <dbl> 18.8, 20.6, 19.7, 20.9, 20.7, 18.8, 20.8, 21.0, 18.9, 21.1, 19…
## $ ank_di <dbl> 14.1, 15.1, 14.1, 15.0, 14.9, 13.9, 15.6, 14.6, 13.2, 15.0, 14…
## $ sho_gi <dbl> 106.2, 110.5, 115.1, 104.5, 107.5, 119.8, 123.5, 120.4, 111.0,…
## $ che_gi <dbl> 89.5, 97.0, 97.5, 97.0, 97.5, 99.9, 106.9, 102.5, 91.0, 93.5, …
## $ wai_gi <dbl> 71.5, 79.0, 83.2, 77.8, 80.0, 82.5, 82.0, 76.8, 68.5, 77.5, 81…
## $ nav_gi <dbl> 74.5, 86.5, 82.9, 78.8, 82.5, 80.1, 84.0, 80.5, 69.0, 81.5, 81…
## $ hip_gi <dbl> 93.5, 94.8, 95.0, 94.0, 98.5, 95.3, 101.0, 98.0, 89.5, 99.8, 9…
## $ thi_gi <dbl> 51.5, 51.5, 57.3, 53.0, 55.4, 57.5, 60.9, 56.0, 50.0, 59.8, 60…
## $ bic_gi <dbl> 32.5, 34.4, 33.4, 31.0, 32.0, 33.0, 42.4, 34.1, 33.0, 36.5, 34…
## $ for_gi <dbl> 26.0, 28.0, 28.8, 26.2, 28.4, 28.0, 32.3, 28.0, 26.0, 29.2, 27…
## $ kne_gi <dbl> 34.5, 36.5, 37.0, 37.0, 37.7, 36.6, 40.1, 39.2, 35.5, 38.3, 38…
## $ cal_gi <dbl> 36.5, 37.5, 37.3, 34.8, 38.6, 36.1, 40.3, 36.7, 35.0, 38.6, 40…
## $ ank_gi <dbl> 23.5, 24.5, 21.9, 23.0, 24.4, 23.5, 23.6, 22.5, 22.0, 22.2, 23…
## $ wri_gi <dbl> 16.5, 17.0, 16.9, 16.6, 18.0, 16.9, 18.8, 18.0, 16.5, 16.9, 16…
## $ age    <int> 21, 23, 28, 23, 22, 21, 26, 27, 23, 21, 23, 22, 20, 26, 23, 22…
## $ wgt    <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62.0, 81.6, 76…
## $ hgt    <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 184.5, 175.0,…
## $ sex    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Body dimensions scatterplot
ggplot( bdims, aes( x = hgt, y = wgt, color = factor( sex ) ) ) +
geom_point()

glimpse( smoking )
## Rows: 1,691
## Columns: 12
## $ gender                <fct> Male, Female, Male, Female, Female, Female, Mal…
## $ age                   <int> 38, 42, 40, 40, 39, 37, 53, 44, 40, 41, 72, 49,…
## $ marital_status        <fct> Divorced, Single, Married, Married, Married, Ma…
## $ highest_qualification <fct> No Qualification, No Qualification, Degree, Deg…
## $ nationality           <fct> British, British, English, English, British, Br…
## $ ethnicity             <fct> White, White, White, White, White, White, White…
## $ gross_income          <fct> "2,600 to 5,200", "Under 2,600", "28,600 to 36,…
## $ region                <fct> The North, The North, The North, The North, The…
## $ smoke                 <fct> No, Yes, No, No, No, No, Yes, No, Yes, Yes, No,…
## $ amt_weekends          <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 15, NA, NA, N…
## $ amt_weekdays          <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 12, NA, NA, N…
## $ type                  <fct> , Packets, , , , , Packets, , Hand-Rolled, Pack…
# Smoking scatterplot
ggplot( smoking, aes( x = age, y = amt_weekdays, color = factor( gender ) ) ) +
geom_point()
## Warning: Removed 1270 rows containing missing values (geom_point).

Outliers

There is no strict definition of an outlier; it is a judgement call you have to make.

basicmlb <- ggplot( mlbbat10, aes( x = stolen_base, y = home_run )) +
  geom_point()

alphamlb <- ggplot( mlbbat10, aes( x = stolen_base, y = home_run )) +
  geom_point( alpha = 0.5 )

jittermlb <- ggplot( mlbbat10, aes( x = stolen_base, y = home_run )) +
  geom_jitter( alpha = 0.5 )

grid.arrange( basicmlb, alphamlb, jittermlb, ncol = 3 )

Use dplyr methods to identify outliers based on a (subjective) criterion

mlbbat10 %>%
  filter( stolen_base > 60 | home_run > 50 ) %>%
  dplyr::select( name, team, position, stolen_base, home_run )
## # A tibble: 2 x 5
##   name       team  position stolen_base home_run
##   <fct>      <fct> <fct>          <dbl>    <dbl>
## 1 J Pierre   CWS   OF                68        1
## 2 J Bautista TOR   OF                 9       54
# Filter for AB greater than or equal to 200
ab_gt_200 <- mlbbat10 %>%
  filter(at_bat >= 200) 

# Scatterplot of SLG vs. OBP
ggplot(ab_gt_200, aes(x = obp, y = slg)) +
  geom_point()

# Identify the outlying player
ab_gt_200 %>%
  filter( obp < 0.200 )
## # A tibble: 1 x 19
##   name  team  position  game at_bat   run   hit double triple home_run   rbi
##   <fct> <fct> <fct>    <dbl>  <dbl> <dbl> <dbl>  <dbl>  <dbl>    <dbl> <dbl>
## 1 B Wo… LAA   3B          81    226    20    33      2      0        4    14
## # … with 8 more variables: total_base <dbl>, walk <dbl>, strike_out <dbl>,
## #   stolen_base <dbl>, caught_stealing <dbl>, obp <dbl>, slg <dbl>,
## #   bat_avg <dbl>

Correlation

Quantifying the Strength of Bivariate Relationships

Correlation

  • Correlation coefficients range from -1 to 1
  • The sign (+/-) indicates the direction of the relationship
  • The magnitude corresponds to the strength of the relationship: strong when the absolute magnitude is close to 1, moderate when close to 0.5, weak when close to 0.2, and no relationship when close to 0 (see the simulated sketch below)

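As a quick illustration (a minimal sketch with simulated data, not part of the course exercises), adding more scatter around the same linear trend shrinks the magnitude of r:

set.seed( 42 )
x <- rnorm( 100 )
# the same x paired with progressively noisier versions of itself
tibble( strong   = cor( x, x + rnorm( 100, sd = 0.2 ) ),
        moderate = cor( x, x + rnorm( 100, sd = 1.5 ) ),
        weak     = cor( x, x + rnorm( 100, sd = 5 ) ),
        none     = cor( x, rnorm( 100 ) ) )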
Non-Linear correlation

glimpse( run09 )
## Rows: 14,974
## Columns: 14
## $ place     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ time      <dbl> 53.533, 53.917, 53.967, 54.433, 54.450, 54.533, 54.633, 54.…
## $ net_time  <dbl> 53.533, 53.917, 53.967, 54.433, 54.450, 54.533, 54.633, 54.…
## $ pace      <dbl> 5.367, 5.400, 5.400, 5.450, 5.450, 5.467, 5.467, 5.467, 5.4…
## $ age       <int> 21, 21, 22, 19, 36, 28, 25, 31, 23, 26, 23, 35, 28, 28, 26,…
## $ gender    <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
## $ first     <fct> Lineth, Belianesh Zemed, Teyba, Abebu, Catherine, Olga, Sal…
## $ last      <fct> Chepkurui, Gebre, Naser, Gelan, Ndereba, Romanova, Meyerhof…
## $ city      <fct> Kenya, Ethiopia, Ethiopia, Ethiopia, Kenya, Russia, United …
## $ state     <fct> NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, CO, NR,…
## $ country   <fct> KEN, ETH, ETH, ETH, KEN, RUS, USA, KEN, ETH, RUS, ETH, ROM,…
## $ div       <int> 2, 2, 2, 1, 5, 3, 3, 4, 2, 3, 2, 5, 3, 3, 3, 3, 3, 5, 4, 2,…
## $ div_place <int> 1, 2, 3, 1, 1, 1, 2, 1, 4, 3, 4, 2, 4, 5, 6, 7, 8, 3, 2, 5,…
## $ div_tot   <int> 953, 953, 953, 71, 1130, 2706, 2706, 1678, 953, 2706, 953, …
run09 %>%
  filter( div_place <= 10) %>%
  ggplot( aes( x = age, y = pace, color = gender ) ) +
  geom_point()

Pearson Product-Moment Correlation \[r(x,y) = \frac{Cov(x,y)}{\sqrt{SXX \cdot SYY}} = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2 \cdot \sum_{i=1}^{n}(y_i-\bar{y})^2}}\] Numerator: the sum of the products of the deviations of \(x\) and \(y\) from their means. Denominator: the square root of the product of the sums of squared deviations of \(x\) and \(y\).
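To tie the formula to R, a small sketch (my own check, using the hgt and wgt columns of the bdims data loaded above) computes r from the deviations and compares it with cor():

hgt <- bdims$hgt
wgt <- bdims$wgt
# numerator: sum of products of deviations; denominator: sqrt( SXX * SYY )
r_manual <- sum( ( hgt - mean( hgt ) ) * ( wgt - mean( wgt ) ) ) /
  sqrt( sum( ( hgt - mean( hgt ) )^2 ) * sum( ( wgt - mean( wgt ) )^2 ) )
c( manual = r_manual, built_in = cor( hgt, wgt ) )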

#glimpse( ncbirths )
# Compute correlation
ncbirths %>%
  dplyr::summarize(N = n(), r = cor(weight, mage))
## # A tibble: 1 x 2
##       N      r
##   <int>  <dbl>
## 1   800 0.0517
# Compute correlation for all non-missing pairs
ncbirths %>%
  dplyr::summarize(N = n(), r = cor(weight, weeks, use = "pairwise.complete.obs"))
## # A tibble: 1 x 2
##       N     r
##   <int> <dbl>
## 1   800 0.636

The Anscombe Dataset

glimpse( anscombe )
## Rows: 11
## Columns: 8
## $ x1 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x2 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x3 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x4 <dbl> 8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8
## $ y1 <dbl> 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68
## $ y2 <dbl> 9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74
## $ y3 <dbl> 7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73
## $ y4 <dbl> 6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89
p1 <- ggplot( data = anscombe, aes( x = x1, y = y1 ) ) +
  geom_point()

p2 <- ggplot( data = anscombe, aes( x = x2, y = y2 ) ) +
  geom_point()

p3 <- ggplot( data = anscombe, aes( x = x3, y = y3 ) ) +
  geom_point()

p4 <- ggplot( data = anscombe, aes( x = x4, y = y4 ) ) +
  geom_point()

grid.arrange( p1, p2, p3, p4, ncol = 2 )

All four of these data series have the same number of points, the same mean and the same standard deviation in both \(x\) and \(y\). They yield the same regression line fit and correlation values. However, the distributions are very different from one another.

This demonstration stresses the need to visually inspect your data!

#glimpse( anscombe)

Anscombe <- anscombe %>%
  pivot_longer( cols = everything(), names_to = 'labs', values_to = 'vals' ) %>%
  mutate( Xaxis = str_detect( labs, 'x' ) == TRUE,
          set = case_when( str_detect( labs, '1' ) == TRUE ~ '1',
                           str_detect( labs, '2' ) == TRUE ~ '2',
                           str_detect( labs, '3' ) == TRUE ~ '3',
                           str_detect( labs, '4' ) == TRUE ~ '4'),
          id = sort ( rep( c( 1:11 ), 8) ))
AnsX <- Anscombe %>% 
  filter( Xaxis == TRUE ) %>%
  rename( X = vals )
AnsY <- Anscombe %>% 
  filter( Xaxis == FALSE ) %>%
  rename( Y = vals)
Anscombe <- AnsX %>%
  mutate( Y = AnsY$Y) %>%
  dplyr::select( X, Y, set, id )
Anscombe
## # A tibble: 44 x 4
##        X     Y set      id
##    <dbl> <dbl> <chr> <int>
##  1    10  8.04 1         1
##  2    10  9.14 2         1
##  3    10  7.46 3         1
##  4     8  6.58 4         1
##  5     8  6.95 1         2
##  6     8  8.14 2         2
##  7     8  6.77 3         2
##  8     8  5.76 4         2
##  9    13  7.58 1         3
## 10    13  8.74 2         3
## # … with 34 more rows
ggplot(data = Anscombe, aes(x = X, y = Y)) +
  geom_point() +
  facet_wrap(~ set)

# Compute properties of Anscombe
Anscombe %>%
  group_by(set) %>%
  dplyr::summarize(
    N = n(), 
    mean_of_x = mean( X ), 
    std_dev_of_x = sd( X ), 
    mean_of_y = mean( Y ), 
    std_dev_of_y = sd( Y ), 
    correlation_between_x_and_y = cor( X,Y )
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 7
##   set       N mean_of_x std_dev_of_x mean_of_y std_dev_of_y correlation_between…
##   <chr> <int>     <dbl>        <dbl>     <dbl>        <dbl>                <dbl>
## 1 1        11         9         3.32      7.50         2.03                0.816
## 2 2        11         9         3.32      7.50         2.03                0.816
## 3 3        11         9         3.32      7.5          2.03                0.816
## 4 4        11         9         3.32      7.50         2.03                0.817
# Run this and look at the plot
mlbbat10 %>% 
    ggplot(aes(x = obp, y = slg)) + 
    geom_point()

# Correlation for all players with at least 200 ABs
mlbbat10 %>%
  dplyr::summarize(N = n(), r = cor(obp, slg))
## # A tibble: 1 x 2
##       N     r
##   <int> <dbl>
## 1  1199 0.815
# Run this and look at the plot
mlbbat10 %>% 
    filter(at_bat > 200) %>%
    ggplot(aes(x = obp, y = slg)) + 
    geom_point()

# Correlation for all players with at least 200 ABs
mlbbat10 %>%
  filter(at_bat > 200) %>%
  dplyr::summarize(N = n(), r = cor(obp, slg))
## # A tibble: 1 x 2
##       N     r
##   <int> <dbl>
## 1   327 0.686
# Run this and look at the plot
ggplot(data = bdims, aes(x = hgt, y = wgt, color = factor(sex))) +
  geom_point() 

# Correlation of body dimensions
bdims %>%
  group_by(sex) %>%
  dplyr::summarize(N = n(), r = cor(hgt, wgt))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##     sex     N     r
##   <int> <int> <dbl>
## 1     0   260 0.431
## 2     1   247 0.535
#glimpse( mammals )
# Run this and look at the plot
ggplot(data = mammals, aes(x = body, y = brain)) +
  geom_point() + scale_x_log10() + scale_y_log10()

# Correlation among mammals, with and without log
mammals %>%
  dplyr::summarize(N = n(), 
            r = cor(body, brain), 
            r_log = cor(log(body), log(brain)))
##    N         r     r_log
## 1 62 0.9341638 0.9595748

Interpretation of Correlation

Correlation \(\neq\) Causation

Spurious Correlations

Remarkable but nonsensical correlations are called spurious.
Two variables may appear to move together, but remember that a strong correlation between two variables does not account for confounding (lurking) variables.
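A minimal sketch (simulated data, my own illustration rather than a course example) of how a lurking variable can manufacture a strong correlation between two otherwise unrelated variables:

set.seed( 1 )
z <- rnorm( 200 )             # lurking / confounding variable
a <- 2 * z + rnorm( 200 )     # driven by z
b <- -3 * z + rnorm( 200 )    # also driven by z, otherwise unrelated to a
cor( a, b )                                                # strong, entirely via z
cor( residuals( lm( a ~ z ) ), residuals( lm( b ~ z ) ) )  # roughly zero once z is removed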

Simple Linear Regression

Visualization of Linear Models

#glimpse( possum )
ggplot( data = possum, aes( y = total_l, x = tail_l ) ) +
  geom_point() +
  #geom_abline( intercept = 0, slope = 2.5 ) # not a great fit
  #geom_abline( intercept = 0, slope = 2.3 ) #a gentler slope fits better
  geom_abline( intercept = 40, slope = 1.3 ) # why force the intercept through the origin?

How do we determine which line fits the data best?
The best-fit line minimizes the sum of squared residuals (see the sketch below).
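Here is a sketch (my own check, reusing the hand-picked lines from the plot above) comparing the sum of squared residuals for each candidate line against the least-squares fit, which should come out smallest:

# sum of squared residuals for a candidate line through the possum data
sse_line <- function( intercept, slope ) {
  with( possum, sum( ( total_l - ( intercept + slope * tail_l ) )^2 ) )
}
lm_coefs <- coef( lm( total_l ~ tail_l, data = possum ) )
c( guess_1 = sse_line( 0, 2.5 ),
   guess_2 = sse_line( 0, 2.3 ),
   guess_3 = sse_line( 40, 1.3 ),
   least_squares = sse_line( lm_coefs[1], lm_coefs[2] ) )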

ggplot( possum, aes( y = total_l, x = tail_l ) ) +
  geom_point() +
  geom_smooth( method = 'lm' )
## `geom_smooth()` using formula 'y ~ x'

# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) + 
  geom_point() + 
  geom_smooth(method = 'lm', se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

Understanding Linear Models

Generic statistical modeling framework: \[\mbox{response} = f(\mbox{explanatory}) + \mbox{noise}\] For linear regression, we assume that \(f()\) takes a linear form: \[\mbox{response} = \mbox{intercept} + \mbox{slope}\cdot\mbox{explanatory} + \mbox{noise}\] If we assume that the data behave linearly, they can be modeled as \[Y = \beta_0 + \beta_1 \cdot X + \epsilon, \mbox{ where } \epsilon \sim N(0,\sigma_{\epsilon}) \] Fitted estimates from the linear model are described by \[\hat{Y} = \hat{\beta}_0 + \hat{\beta}_1 \cdot X\] The difference between the two gives the \[\mbox{Residuals } = e = Y - \hat{Y}\] The residuals are the realization of the noise term: \(\epsilon\) is an unknown, true quantity that describes the noise in the system, whereas \(e\) is a known estimate of \(\epsilon\).

The goal when determining the best-fit coefficients is to minimize the sum of the squared residuals, \(\sum_{i=1}^{n}e_i^2\) (see the sketch below).
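For simple regression this minimization has a closed-form solution; as a sketch (my own check against the bdims fit below), the estimates can be computed directly and compared with lm():

x <- bdims$hgt
y <- bdims$wgt
beta1_hat <- sum( ( x - mean( x ) ) * ( y - mean( y ) ) ) / sum( ( x - mean( x ) )^2 )
beta0_hat <- mean( y ) - beta1_hat * mean( x )
c( intercept = beta0_hat, slope = beta1_hat )
coef( lm( wgt ~ hgt, data = bdims ) )   # should match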

Key Concepts

  • \(\hat{Y}\) is the expected value for a given \(X\), basically, our best guess
  • \(\hat{\beta}\)s are estimates of true, unknown \(\beta\)s
  • Residuals (\(e\)s) are estimates of true unknown \(\epsilon\)s
  • Error is a misleading term; it is better to think of it as noise
bdimslm <- lm( wgt ~ hgt, bdims)
summary( bdimslm )
## 
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.743  -6.402  -1.231   5.059  41.103 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -105.01125    7.53941  -13.93   <2e-16 ***
## hgt            1.01762    0.04399   23.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared:  0.5145, Adjusted R-squared:  0.5136 
## F-statistic: 535.2 on 1 and 505 DF,  p-value: < 2.2e-16

Regression vs. regression to the mean

Galton’s “Regression to the mean”

data( Galton )
glimpse( Galton )
## Rows: 928
## Columns: 2
## $ parent <dbl> 70.5, 68.5, 65.5, 64.5, 64.0, 67.5, 67.5, 67.5, 66.5, 66.5, 66…
## $ child  <dbl> 61.7, 61.7, 61.7, 61.7, 61.7, 62.2, 62.2, 62.2, 62.2, 62.2, 62…
# Height of children vs. height of father
ggplot(data = Galton, aes(x = parent, y = child)) +
  geom_jitter() + 
  geom_abline(slope = 1, intercept = 0) + 
  geom_smooth(method = 'lm', se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

Interpreting regression models

Interpretation of Regression

glimpse( textbooks )
## Rows: 73
## Columns: 7
## $ dept_abbr <fct> Am Ind, Anthro, Anthro, Anthro, Art His, Art His, Asia Am, …
## $ course    <fct>  C170, 9, 135T, 191HB, M102K, 118E, 187B, 191E, C125, M145B…
## $ isbn      <fct> 978-0803272620, 978-0030119194, 978-0300080643, 978-0226206…
## $ ucla_new  <dbl> 27.67, 40.59, 31.68, 16.00, 18.95, 14.95, 24.70, 19.50, 123…
## $ amaz_new  <dbl> 27.95, 31.14, 32.00, 11.52, 14.21, 10.17, 20.06, 16.66, 106…
## $ more      <fct> Y, Y, Y, Y, Y, Y, Y, N, N, Y, Y, N, Y, Y, N, N, N, N, N, N,…
## $ diff      <dbl> -0.28, 9.45, -0.32, 4.48, 4.74, 4.78, 4.64, 2.84, 17.59, 3.…
textbooks %>% mutate( course_number = readr::parse_number( as.character( course ) ) ) %>%
  ggplot( aes( x = course_number, y = ucla_new ) ) +
  geom_point()

textbooks %>%
  ggplot( aes( x = amaz_new, y = ucla_new ) ) +
  geom_point() +
  geom_smooth( method = 'lm' )
## `geom_smooth()` using formula 'y ~ x'

textlm <- lm( ucla_new ~ amaz_new, data = textbooks )
summary( textlm )
## 
## Call:
## lm(formula = ucla_new ~ amaz_new, data = textbooks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.785  -4.574   0.577   4.012  39.002 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.92897    1.93538    0.48    0.633    
## amaz_new     1.19900    0.02519   47.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.47 on 71 degrees of freedom
## Multiple R-squared:  0.9696, Adjusted R-squared:  0.9692 
## F-statistic:  2266 on 1 and 71 DF,  p-value: < 2.2e-16

For each additional dollar that Amazon charges for a book, the UCLA bookstore price is expected to be about $1.20 higher.

Your Linear Model Object

The output from lm() is an object. It holds all the relevant information about the fitted model (including the data itself).

class( textlm )
## [1] "lm"
textlm
## 
## Call:
## lm(formula = ucla_new ~ amaz_new, data = textbooks)
## 
## Coefficients:
## (Intercept)     amaz_new  
##       0.929        1.199
coef( textlm )
## (Intercept)    amaz_new 
##   0.9289651   1.1990014

Return the \(\hat{Y}\) values for each data point:

fitted.values( textlm )
##         1         2         3         4         5         6         7         8 
##  34.44105  38.26587  39.29701  14.74146  17.96678  13.12281  24.98093  20.90433 
##         9        10        11        12        13        14        15        16 
## 128.32287  16.82772  36.83906 106.54900  23.05054  20.67652 117.68772  57.89352 
##        17        18        19        20        21        22        23        24 
##  90.77014 160.12038 146.60764 130.42112  14.92131  23.63805  15.60474  27.24705 
##        25        26        27        28        29        30        31        32 
##  38.26587  35.64006  20.29284  46.19127  39.03323  40.46004  37.94214 102.84409 
##        33        34        35        36        37        38        39        40 
##  42.83406 118.37115  98.26390  12.31948  13.15878 162.42247 173.28542 211.95321 
##        41        42        43        44        45        46        47        48 
## 181.53455 175.26377 209.02765 157.99815 189.98751 165.39599  30.84405 191.90591 
##        49        50        51        52        53        54        55        56 
##  28.58993  26.15595  52.10235  48.13365 103.08389 112.59197  81.74166 160.14436 
##        57        58        59        60        61        62        63        64 
##  30.07669  30.84405 103.38364  13.01490  79.73933 101.95682  11.24038  70.97463 
##        65        66        67        68        69        70        71        72 
##  97.29271  77.77297  45.33998  25.16078  48.09768  32.54663  29.93281  23.37427 
##        73 
##  22.77477

Return the residuals \(Y-\hat{Y}\) or \(e\):

residuals( textlm )
##            1            2            3            4            5            6 
##  -6.77105468   2.32413081  -7.61701041   1.25853860   0.98322479   1.82719051 
##            7            8            9           10           11           12 
##  -0.28093350  -1.40432868  -4.48286560   0.17227613  -5.20905751   9.45100013 
##           13           14           15           16           17           18 
##   4.61945878   4.02348159   8.98227697  -3.99352239  -1.04014123  10.87961683 
##           19           20           21           22           23           24 
##   5.39236280  -5.62111808   1.07868839   2.31194809   2.39525758  -5.51704618 
##           25           26           27           28           29           30 
##   2.32413081  -6.69005609  -0.34283796   3.25873144   2.05676990  10.48995821 
##           31           32           33           34           35           36 
##   6.55786119 -20.39408550  -8.23406459 -29.95115384 -14.26390008  -1.06947854 
##           37           38           39           40           41           42 
##   1.84122047  17.60753411   0.71458128 -23.37321441 -34.78454847   8.48622894 
##           43           44           45           46           47           48 
##   5.47234904  39.00184934   4.01249154  10.85401060  -6.14405043  -3.90591073 
##           49           50           51           52           53           54 
##   1.11007224   0.08404511   3.02765446  -4.57365085  26.51611422  11.24803299 
##           55           56           57           58           59           60 
##   3.37833944  -7.66436320  -0.37668952  -6.14405043 -13.67363613  -2.51489936 
##           61           62           63           64           65           66 
##  13.14067180  -1.07682445   0.70962274   1.49537216  -5.19270894   0.57703413 
##           67           68           69           70           71           72 
##  -3.33997755  -6.41078371   0.36231919   7.00336756  -0.28280935   0.38572840 
##           73 
##   4.92522911

Use the broom library to augment the original model output to include useful features of the fit:

augtextlm <- augment( textlm )
glimpse( augtextlm )
## Rows: 73
## Columns: 8
## $ ucla_new   <dbl> 27.67, 40.59, 31.68, 16.00, 18.95, 14.95, 24.70, 19.50, 12…
## $ amaz_new   <dbl> 27.95, 31.14, 32.00, 11.52, 14.21, 10.17, 20.06, 16.66, 10…
## $ .fitted    <dbl> 34.44105, 38.26587, 39.29701, 14.74146, 17.96678, 13.12281…
## $ .resid     <dbl> -6.7710547, 2.3241308, -7.6170104, 1.2585386, 0.9832248, 1…
## $ .std.resid <dbl> -0.65294223, 0.22399308, -0.73400193, 0.12183273, 0.095110…
## $ .hat       <dbl> 0.01944321, 0.01833896, 0.01806141, 0.02699567, 0.02554530…
## $ .sigma     <dbl> 10.51520, 10.54319, 10.50682, 10.54581, 10.54624, 10.54459…
## $ .cooksd    <dbl> 4.226829e-03, 4.686540e-04, 4.954865e-03, 2.059099e-04, 1.…
# Mean of weights equal to mean of fitted values?
( mean( textbooks$ucla_new ) == mean( fitted.values( textlm ) ) )
## [1] TRUE
# Mean of the residuals
mean( residuals( textlm ) )
## [1] -5.944398e-17
# create textlm_tidy
textlm_tidy <- data.frame( augment( textlm ) )
glimpse( textlm_tidy )
## Rows: 73
## Columns: 8
## $ ucla_new   <dbl> 27.67, 40.59, 31.68, 16.00, 18.95, 14.95, 24.70, 19.50, 12…
## $ amaz_new   <dbl> 27.95, 31.14, 32.00, 11.52, 14.21, 10.17, 20.06, 16.66, 10…
## $ .fitted    <dbl> 34.44105, 38.26587, 39.29701, 14.74146, 17.96678, 13.12281…
## $ .resid     <dbl> -6.7710547, 2.3241308, -7.6170104, 1.2585386, 0.9832248, 1…
## $ .std.resid <dbl> -0.65294223, 0.22399308, -0.73400193, 0.12183273, 0.095110…
## $ .hat       <dbl> 0.01944321, 0.01833896, 0.01806141, 0.02699567, 0.02554530…
## $ .sigma     <dbl> 10.51520, 10.54319, 10.50682, 10.54581, 10.54624, 10.54459…
## $ .cooksd    <dbl> 4.226829e-03, 4.686540e-04, 4.954865e-03, 2.059099e-04, 1.…

Using your Linear Model

Examining the Residuals:

textlm_tidy %>%
  arrange( desc( .resid ) ) %>%
  head()
##   ucla_new amaz_new   .fitted   .resid .std.resid       .hat    .sigma
## 1   197.00   131.00 157.99815 39.00185   3.807626 0.04330931  9.408665
## 2   129.60    85.20 103.08389 26.51611   2.554497 0.01753183 10.050561
## 3   180.03   134.69 162.42247 17.60753   1.721789 0.04644271 10.324375
## 4    92.88    65.73  79.73933 13.14067   1.263623 0.01392606 10.427641
## 5   123.84    93.13 112.59197 11.24803   1.085114 0.02025756 10.459091
## 6   171.00   132.77 160.12038 10.87962   1.062967 0.04479266 10.462654
##      .cooksd
## 1 0.32816195
## 2 0.05822235
## 3 0.07219393
## 4 0.01127520
## 5 0.01217296
## 6 0.02649222

The residuals reveal how far each textbook's price is from the model prediction; in other words, how overpriced the textbook is. The output above shows that the most overpriced text sells for $197, which is $66 more than the Amazon price and about $39 more than the price the model predicts for the UCLA bookstore.

textbooks %>%
  filter( ucla_new == 197 )
## # A tibble: 1 x 7
##   dept_abbr course isbn           ucla_new amaz_new more   diff
##   <fct>     <fct>  <fct>             <dbl>    <dbl> <fct> <dbl>
## 1 Mgmt      228    978-0073379661      197      131 Y        66

A quick ISBN search shows this text is titled ‘Financial Statement Analysis and Security Valuation’. The DataCamp instructor is quick to point out the irony in this.

Were there any underpriced books?

textlm_tidy %>%
  arrange( .resid ) %>%
  head()
##   ucla_new amaz_new  .fitted    .resid .std.resid       .hat    .sigma
## 1   146.75   150.63 181.5345 -34.78455  -3.429184 0.06178867  9.633991
## 2    88.42    97.95 118.3712 -29.95115  -2.892403 0.02226987  9.906067
## 3   188.58   176.00 211.9532 -23.37321  -2.342591 0.09227679 10.131120
## 4    82.45    85.00 102.8441 -20.39409  -1.964657 0.01747249 10.256218
## 5    84.00    81.18  98.2639 -14.26390  -1.373378 0.01642800 10.405876
## 6    89.71    85.45 103.3836 -13.67364  -1.317335 0.01760665 10.417222
##      .cooksd
## 1 0.38722189
## 2 0.09527663
## 3 0.27893442
## 4 0.03432050
## 5 0.01575174
## 6 0.01555083

Yes, there were quite a few underpriced books. It would be interesting to pursue this dataset further to understand what features correspond to under/over pricing.

textbooks %>%
  filter( ucla_new == 146.75 )
## # A tibble: 1 x 7
##   dept_abbr course isbn           ucla_new amaz_new more   diff
##   <fct>     <fct>  <fct>             <dbl>    <dbl> <fct> <dbl>
## 1 Mgmt      1A     978-0077251031     147.     151. Y     -3.88

The most underpriced is a study guide to the Principles of Accounting.

Making Predictions

predict( model, newdata ) yields fitted values for any new data

new_df <- data.frame( amaz_new = 8.49 )
predict( textlm, newdata = new_df )
##        1 
## 11.10849

Visualizing New Observations. Use broom::augment to get predictions in a data.frame with other relevant model features

isrs <- broom::augment( textlm, newdata = new_df )
ggplot( data = textbooks, aes( x = amaz_new, y = ucla_new ) ) +
  geom_point() +
  geom_smooth( method = 'lm' ) +
  geom_point( data = isrs, aes( y = .fitted ), size = 3, color = 'magenta' )
## `geom_smooth()` using formula 'y ~ x'

mod <- lm( wgt ~ hgt,  data =  bdims)
coefs <- data.frame( Intercept = coef( mod )[1], slope = coef( mod )[2] )
glimpse( coefs )
## Rows: 1
## Columns: 2
## $ Intercept <dbl> -105.0113
## $ slope     <dbl> 1.017617
# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) + 
  geom_point() + 
  geom_abline(data = coefs,
              aes( intercept = Intercept,
                   slope = slope ),
              color = 'dodgerblue' )

Model fit

Assessing Model Fit

How well do models work?
How do we quantify the quality of a model's fit?

  • Sum of squared deviations (SSE). However, this has a way of penalizing large deviations disproportionately.
#Two ways to compute SSE
mod_possum <- lm( total_l ~ tail_l, data = possum )
mod_possum %>%
  augment() %>%
  dplyr::summarize( SSE = sum( .resid^2 ),
             SSE_also = ( n() - 1 ) * var( .resid ) )
## # A tibble: 1 x 2
##     SSE SSE_also
##   <dbl>    <dbl>
## 1 1301.    1301.

The SSE is a single number that captures how much our model missed the real data by. However, it is difficult to interpret because the units are squared. Therefore it is common to work with the RMSE.

RMSE: Root Mean Square Error or The Residual Standard Error
\[RMSE = \sqrt{\frac{\sum_i e_i^2}{d.f}} = \sqrt{\frac{SSE}{n-2}}\]
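As a sketch, the RMSE for the possum model can be computed directly from this formula and checked against the residual standard error that summary() reports below:

c( manual_rmse  = sqrt( sum( residuals( mod_possum )^2 ) / df.residual( mod_possum ) ),
   from_summary = summary( mod_possum )$sigma )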

summary( mod_possum )
## 
## Call:
## lm(formula = total_l ~ tail_l, data = possum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2100 -2.3265  0.1792  2.7765  6.7900 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  41.0371     6.6568   6.165 1.43e-08 ***
## tail_l        1.2443     0.1796   6.927 3.94e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.572 on 102 degrees of freedom
## Multiple R-squared:   0.32,  Adjusted R-squared:  0.3133 
## F-statistic: 47.99 on 1 and 102 DF,  p-value: 3.935e-10

RMSE is conveniently in the same units as the response variable. Here, RMSE = 3.572 on 102 degrees of freedom, which means that the model's prediction of body length is typically within about 3.572 cm of the truth.

summary( textlm )
## 
## Call:
## lm(formula = ucla_new ~ amaz_new, data = textbooks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.785  -4.574   0.577   4.012  39.002 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.92897    1.93538    0.48    0.633    
## amaz_new     1.19900    0.02519   47.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.47 on 71 degrees of freedom
## Multiple R-squared:  0.9696, Adjusted R-squared:  0.9692 
## F-statistic:  2266 on 1 and 71 DF,  p-value: < 2.2e-16

For the textbook model, the RMSE = 10.47, so the predicted text price is typically within about $10.47 of the truth.

# Compute the mean of the residuals
mean( residuals( textlm ) )
## [1] -5.944398e-17
# Compute RMSE
sqrt(sum(residuals( textlm )^2) / df.residual( textlm ))
## [1] 10.47237

Comparing Model Fits

We want a unitless measure of model fit that can be used to compare different modeling approaches.

The Null (Average) Model

  • for all observations, \(\hat{y} = \bar{y}\)
  • find the SSE for the null model:
null_possum <- lm( total_l ~ 1, data = possum )
null_possum %>%
  augment( possum ) %>%
  dplyr::summarize( SSE = sum( .resid^2 ) )
## # A tibble: 1 x 1
##     SSE
##   <dbl>
## 1 1914.
  • find the SSE for our linear model
mod_possum %>%
  augment( possum ) %>%
  dplyr::summarize( SSE = sum( .resid^2 ) )
## # A tibble: 1 x 1
##     SSE
##   <dbl>
## 1 1301.
  • comparing the two SSEs quantifies the variance accounted for by the linear model

Coefficient of Determination \[R^2 = 1 - \frac{SSE}{SST} = 1 - \frac{Var(e)}{Var(y)}\] SST is the SSE for the Null Model \(\therefore\) we interpret \(R^2\) as the proportion of variance of the response variable that is explained by the model.

Connection to Correlation: for simple linear regression with a single explanatory variable, the value of \(R^2\) corresponds to the square of the correlation coefficient: \[r_{x,y}^2 = R^2\] However, this is not the case for more complex models.
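A quick sketch (my own check) of that identity for the possum model fit earlier as mod_possum:

c( r_squared_from_cor = cor( possum$tail_l, possum$total_l )^2,
   R_squared_from_lm  = summary( mod_possum )$r.squared )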

\(R^2\) is very commonly used to evaluate model fit. However, it should not be the be-all and end-all measure. A very high \(R^2\) can occur when the model is overfitting, and a low \(R^2\) does not mean that the model cannot yield statistically powerful insights about a dataset.

# View model summary
summary( mod )
## 
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.743  -6.402  -1.231   5.059  41.103 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -105.01125    7.53941  -13.93   <2e-16 ***
## hgt            1.01762    0.04399   23.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared:  0.5145, Adjusted R-squared:  0.5136 
## F-statistic: 535.2 on 1 and 505 DF,  p-value: < 2.2e-16
# Compute R-squared
bdims_tidy <- augment( mod )
bdims_tidy %>%
  dplyr::summarize(var_y = var( wgt ), var_e = var( .resid ) ) %>%
  mutate(R_squared = 1 - var_e / var_y )
## # A tibble: 1 x 3
##   var_y var_e R_squared
##   <dbl> <dbl>     <dbl>
## 1  178.  86.5     0.515

Unusual Points

Leverage & Influence

regulars <- mlbbat10 %>%
  filter( at_bat > 400 )
ggplot( regulars, aes( x = stolen_base, y = home_run )) +
  geom_jitter( alpha = 0.5 ) +
  geom_smooth( method = 'lm' )
## `geom_smooth()` using formula 'y ~ x'

Individual points, especially extreme ones, can change the fit of a linear model.

Leverage is a function of the distance between the value of the explanatory variable and the mean of the explanatory variable. Points that are far from the horizontal center of the data distribution have high leverage, while nearby points have low leverage. \[h_i = \frac{1}{n} + \frac{(x_i - \bar{x})^2}{\sum_{i=1}^n (x_i - \bar{x})^2}\] To see the points with the highest leverage, sort by the .hat column.
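Before sorting by .hat below, a sketch (my own check, using the regulars data defined above and base R's hatvalues()) verifying the leverage formula:

x <- regulars$stolen_base
h_manual <- 1 / length( x ) + ( x - mean( x ) )^2 / sum( ( x - mean( x ) )^2 )
# the largest manual leverages should match the largest .hat values below
head( sort( h_manual, decreasing = TRUE ) )
head( sort( hatvalues( lm( home_run ~ stolen_base, data = regulars ) ), decreasing = TRUE ) )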

modmlb <- lm( home_run ~ stolen_base, data = regulars )
modmlb %>%
  augment() %>%
  arrange( desc( .hat ) ) %>%
  dplyr::select( home_run, stolen_base, .fitted, .resid, .hat )
## # A tibble: 170 x 5
##    home_run stolen_base .fitted .resid   .hat
##       <dbl>       <dbl>   <dbl>  <dbl>  <dbl>
##  1        1          68    2.38 -1.38  0.131 
##  2        2          52    6.46 -4.46  0.0703
##  3        5          50    6.97 -1.97  0.0642
##  4       19          47    7.74 11.3   0.0555
##  5        5          47    7.74 -2.74  0.0555
##  6        1          42    9.01 -8.01  0.0426
##  7       18          42    9.01  8.99  0.0426
##  8        6          42    9.01 -3.01  0.0426
##  9       11          37   10.3   0.715 0.0316
## 10       18          34   11.0   6.95  0.0260
## # … with 160 more rows

Influence is determined by both .hat (leverage) and .resid; see .cooksd, the Cook's distance.

modmlb %>%
  augment() %>%
  arrange( desc( .cooksd ) ) %>%
  dplyr::select( home_run, stolen_base, .fitted, .resid, .hat, .cooksd ) %>%
  head()
## # A tibble: 6 x 6
##   home_run stolen_base .fitted .resid    .hat .cooksd
##      <dbl>       <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
## 1       54           9   17.4   36.6  0.00607  0.0434
## 2       19          47    7.74  11.3  0.0555   0.0416
## 3       34          26   13.1   20.9  0.0144   0.0341
## 4       42          14   16.1   25.9  0.00618  0.0221
## 5       39           0   19.7   19.3  0.0108   0.0216
## 6       18          42    9.01   8.99 0.0426   0.0198

Dealing with Outliers

Removing outliers: what is the justification? If the answer is "because it improves my result", that is not a justification. How does removal change the scope of inference? By removing an outlier, are you selectively dismissing a group or demographic? A sketch of one way to assess the impact follows below.
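The sketch (my own illustration, using the ab_gt_200 outlier flagged earlier; an illustrative choice, not a recommendation to drop it) refits the model with and without the flagged point and compares the coefficients:

mod_all     <- lm( slg ~ obp, data = ab_gt_200 )
mod_trimmed <- lm( slg ~ obp, data = filter( ab_gt_200, obp >= 0.200 ) )
coef( mod_all )       # fit including the low-OBP point
coef( mod_trimmed )   # fit with that single point removed

If the coefficients barely change, removing the point buys little; if they change a lot, the justification and its effect on the scope of inference matter even more.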