library( dplyr )
library( ggplot2 )
library( gridExtra )
library( openintro )
library( tidyverse )
library( UsingR )
library( broom )
Visualizing two variables
Visualizing bivariate relationships
- both variables are numeric
- response variable: y = dependent variable
- explanatory variable: x = independent/predictor variable; something you think might be related to the dependent/response variable
Graphical Representations with Scatterplots
convention: the explanatory/independent variable goes on the x-axis and the dependent/response variable on the y-axis.
#glimpse( possum )
simplescatter <- ggplot( data = possum, aes( y = total_l, x = tail_l ) ) +
geom_point() +
labs( x = 'tail length (cm)', y = 'body length (cm)',
title = 'Start with a Simple Scatter Plot' )
scattercut <- ggplot( data = possum, aes( y = total_l, x = cut( tail_l , breaks = 5 ) ) ) +
geom_point() +
labs( x = 'tail length (cm)', y = 'body length (cm)',
title = 'Cut the distribution into discrete ranges' )
boxcut <- ggplot( data = possum, aes( y = total_l, x = cut( tail_l , breaks = 5 ) ) ) +
geom_boxplot() +
labs( x = 'tail length (cm)', y = 'body length (cm)',
title = 'Plot as Boxplot' )
grid.arrange( simplescatter, scattercut, boxcut, ncol = 3 )
glimpse( ncbirths )
## Rows: 1,000
## Columns: 13
## $ fage <int> NA, NA, 19, 21, NA, NA, 18, 17, NA, 20, 30, NA, NA, NA…
## $ mage <int> 13, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16…
## $ mature <fct> younger mom, younger mom, younger mom, younger mom, yo…
## $ weeks <int> 39, 42, 37, 41, 39, 38, 37, 35, 38, 37, 45, 42, 40, 38…
## $ premie <fct> full term, full term, full term, full term, full term,…
## $ visits <int> 10, 15, 11, 6, 9, 19, 12, 5, 9, 13, 9, 8, 4, 12, 15, 7…
## $ marital <fct> not married, not married, not married, not married, no…
## $ gained <int> 38, 20, 38, 34, 27, 22, 76, 15, NA, 52, 28, 34, 12, 30…
## $ weight <dbl> 7.63, 7.88, 6.63, 8.00, 6.38, 5.38, 8.44, 4.69, 8.81, …
## $ lowbirthweight <fct> not low, not low, not low, not low, not low, low, not …
## $ gender <fct> male, male, female, male, female, male, male, male, ma…
## $ habit <fct> nonsmoker, nonsmoker, nonsmoker, nonsmoker, nonsmoker,…
## $ whitemom <fct> not white, not white, white, white, not white, not whi…
ncbirths <- ncbirths %>% na.omit()
simplescatter <- ggplot( ncbirths, aes( y = weight, x = weeks) ) +
geom_point() +
labs( x = 'weeks of gestation', y = 'birth weight' )
boxcut <- ggplot( ncbirths, aes( y = weight, x = cut( weeks, breaks = 5 ) ) ) +
geom_boxplot() +
labs( x = 'weeks of gestation', y = 'birth weight' )
grid.arrange( simplescatter, boxcut, ncol = 2 )
Characterizing Bivariate Relationships
- Form ( linear, quadratic, non-linear )
- Direction ( positive or negative )
- Strength ( how much scatter/noise )
- Outliers
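A small simulated example (not part of the original data; the variable names and noise levels are made up for illustration) can make these four aspects concrete:
set.seed( 42 )
x <- runif( 100, 0, 10 )
sim <- tibble( x = x,
               strong_positive = 2 * x + rnorm( 100, sd = 1 ),       # linear form, positive direction, strong
               weak_negative   = -0.5 * x + rnorm( 100, sd = 5 ),    # linear form, negative direction, weak
               quadratic       = ( x - 5 )^2 + rnorm( 100, sd = 2 ) )# non-linear form
p_strong <- ggplot( sim, aes( x = x, y = strong_positive ) ) + geom_point()
p_weak   <- ggplot( sim, aes( x = x, y = weak_negative ) ) + geom_point()
p_quad   <- ggplot( sim, aes( x = x, y = quadratic ) ) + geom_point()
grid.arrange( p_strong, p_weak, p_quad, ncol = 3 )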
glimpse( mammals )
## Rows: 62
## Columns: 2
## $ body <dbl> 3.385, 0.480, 1.350, 465.000, 36.330, 27.660, 14.830, 1.040, 4.…
## $ brain <dbl> 44.50, 15.50, 8.10, 423.00, 119.50, 115.00, 98.20, 5.50, 58.00,…
wilddat <- ggplot( mammals, aes( x = body, y = brain ) ) +
geom_point()
log10coord <- ggplot(data = mammals, aes(x = body, y = brain)) +
geom_point() +
coord_trans(x = "log10", y = "log10")
log10scale <- ggplot(data = mammals, aes(x = body, y = brain)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
grid.arrange( wilddat, log10coord, log10scale, ncol = 3 )
glimpse( mlbbat10 )
## Rows: 1,199
## Columns: 19
## $ name <fct> I Suzuki, D Jeter, M Young, J Pierre, R Weeks, M Scut…
## $ team <fct> SEA, NYY, TEX, CWS, MIL, BOS, BAL, MIN, NYY, CIN, MIL…
## $ position <fct> OF, SS, 3B, OF, 2B, SS, OF, OF, 2B, 2B, OF, OF, 2B, O…
## $ game <dbl> 162, 157, 157, 160, 160, 150, 160, 153, 160, 155, 157…
## $ at_bat <dbl> 680, 663, 656, 651, 651, 632, 629, 629, 626, 626, 619…
## $ run <dbl> 74, 111, 99, 96, 112, 92, 79, 85, 103, 100, 101, 103,…
## $ hit <dbl> 214, 179, 186, 179, 175, 174, 187, 166, 200, 172, 188…
## $ double <dbl> 30, 30, 36, 18, 32, 38, 45, 24, 41, 33, 45, 34, 41, 2…
## $ triple <dbl> 3, 3, 3, 3, 4, 0, 3, 10, 3, 5, 1, 10, 4, 3, 3, 1, 5, …
## $ home_run <dbl> 6, 10, 21, 1, 29, 11, 12, 3, 29, 18, 25, 4, 10, 25, 1…
## $ rbi <dbl> 43, 67, 91, 47, 83, 56, 60, 58, 109, 59, 103, 41, 75,…
## $ total_base <dbl> 268, 245, 291, 206, 302, 245, 274, 219, 334, 269, 310…
## $ walk <dbl> 45, 63, 50, 45, 76, 53, 73, 60, 57, 46, 56, 47, 28, 4…
## $ strike_out <dbl> 86, 106, 115, 47, 184, 71, 93, 74, 77, 83, 105, 170, …
## $ stolen_base <dbl> 42, 18, 4, 68, 11, 5, 7, 26, 3, 16, 14, 27, 14, 18, 1…
## $ caught_stealing <dbl> 9, 5, 2, 18, 4, 4, 2, 4, 2, 12, 3, 6, 4, 9, 5, 1, 3, …
## $ obp <dbl> 0.359, 0.340, 0.330, 0.341, 0.366, 0.333, 0.370, 0.33…
## $ slg <dbl> 0.394, 0.370, 0.444, 0.316, 0.464, 0.388, 0.436, 0.34…
## $ bat_avg <dbl> 0.315, 0.270, 0.284, 0.275, 0.269, 0.275, 0.297, 0.26…
# Baseball player scatterplot
ggplot( mlbbat10, aes( x = obp, y = slg ) ) +
geom_point()
glimpse( bdims )
## Rows: 507
## Columns: 25
## $ bia_di <dbl> 42.9, 43.7, 40.1, 44.3, 42.5, 43.3, 43.5, 44.4, 43.5, 42.0, 40…
## $ bii_di <dbl> 26.0, 28.5, 28.2, 29.9, 29.9, 27.0, 30.0, 29.8, 26.5, 28.0, 29…
## $ bit_di <dbl> 31.5, 33.5, 33.3, 34.0, 34.0, 31.5, 34.0, 33.2, 32.1, 34.0, 33…
## $ che_de <dbl> 17.7, 16.9, 20.9, 18.4, 21.5, 19.6, 21.9, 21.8, 15.5, 22.5, 20…
## $ che_di <dbl> 28.0, 30.8, 31.7, 28.2, 29.4, 31.3, 31.7, 28.8, 27.5, 28.0, 30…
## $ elb_di <dbl> 13.1, 14.0, 13.9, 13.9, 15.2, 14.0, 16.1, 15.1, 14.1, 15.6, 13…
## $ wri_di <dbl> 10.4, 11.8, 10.9, 11.2, 11.6, 11.5, 12.5, 11.9, 11.2, 12.0, 10…
## $ kne_di <dbl> 18.8, 20.6, 19.7, 20.9, 20.7, 18.8, 20.8, 21.0, 18.9, 21.1, 19…
## $ ank_di <dbl> 14.1, 15.1, 14.1, 15.0, 14.9, 13.9, 15.6, 14.6, 13.2, 15.0, 14…
## $ sho_gi <dbl> 106.2, 110.5, 115.1, 104.5, 107.5, 119.8, 123.5, 120.4, 111.0,…
## $ che_gi <dbl> 89.5, 97.0, 97.5, 97.0, 97.5, 99.9, 106.9, 102.5, 91.0, 93.5, …
## $ wai_gi <dbl> 71.5, 79.0, 83.2, 77.8, 80.0, 82.5, 82.0, 76.8, 68.5, 77.5, 81…
## $ nav_gi <dbl> 74.5, 86.5, 82.9, 78.8, 82.5, 80.1, 84.0, 80.5, 69.0, 81.5, 81…
## $ hip_gi <dbl> 93.5, 94.8, 95.0, 94.0, 98.5, 95.3, 101.0, 98.0, 89.5, 99.8, 9…
## $ thi_gi <dbl> 51.5, 51.5, 57.3, 53.0, 55.4, 57.5, 60.9, 56.0, 50.0, 59.8, 60…
## $ bic_gi <dbl> 32.5, 34.4, 33.4, 31.0, 32.0, 33.0, 42.4, 34.1, 33.0, 36.5, 34…
## $ for_gi <dbl> 26.0, 28.0, 28.8, 26.2, 28.4, 28.0, 32.3, 28.0, 26.0, 29.2, 27…
## $ kne_gi <dbl> 34.5, 36.5, 37.0, 37.0, 37.7, 36.6, 40.1, 39.2, 35.5, 38.3, 38…
## $ cal_gi <dbl> 36.5, 37.5, 37.3, 34.8, 38.6, 36.1, 40.3, 36.7, 35.0, 38.6, 40…
## $ ank_gi <dbl> 23.5, 24.5, 21.9, 23.0, 24.4, 23.5, 23.6, 22.5, 22.0, 22.2, 23…
## $ wri_gi <dbl> 16.5, 17.0, 16.9, 16.6, 18.0, 16.9, 18.8, 18.0, 16.5, 16.9, 16…
## $ age <int> 21, 23, 28, 23, 22, 21, 26, 27, 23, 21, 23, 22, 20, 26, 23, 22…
## $ wgt <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62.0, 81.6, 76…
## $ hgt <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 184.5, 175.0,…
## $ sex <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Body dimensions scatterplot
ggplot( bdims, aes( x = hgt, y = wgt, color = factor( sex ) ) ) +
geom_point()
glimpse( smoking )
## Rows: 1,691
## Columns: 12
## $ gender <fct> Male, Female, Male, Female, Female, Female, Mal…
## $ age <int> 38, 42, 40, 40, 39, 37, 53, 44, 40, 41, 72, 49,…
## $ marital_status <fct> Divorced, Single, Married, Married, Married, Ma…
## $ highest_qualification <fct> No Qualification, No Qualification, Degree, Deg…
## $ nationality <fct> British, British, English, English, British, Br…
## $ ethnicity <fct> White, White, White, White, White, White, White…
## $ gross_income <fct> "2,600 to 5,200", "Under 2,600", "28,600 to 36,…
## $ region <fct> The North, The North, The North, The North, The…
## $ smoke <fct> No, Yes, No, No, No, No, Yes, No, Yes, Yes, No,…
## $ amt_weekends <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 15, NA, NA, N…
## $ amt_weekdays <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 12, NA, NA, N…
## $ type <fct> , Packets, , , , , Packets, , Hand-Rolled, Pack…
# Smoking scatterplot
ggplot( smoking, aes( x = age, y = amt_weekdays, color = factor( gender ) ) ) +
geom_point()
## Warning: Removed 1270 rows containing missing values (geom_point).
Outliers
There is no strict definition of an outlier; deciding whether a point is one is a judgement call.
basicmlb <- ggplot( mlbbat10, aes( x = stolen_base, y = home_run )) +
geom_point()
alphamlb <- ggplot( mlbbat10, aes( x = stolen_base, y = home_run )) +
geom_point( alpha = 0.5 )
jittermlb <- ggplot( mlbbat10, aes( x = stolen_base, y = home_run )) +
geom_jitter( alpha = 0.5 )
grid.arrange( basicmlb, alphamlb, jittermlb, ncol = 3 )
Use dplyr methods to identify outliers based on a (subjective) criterion:
mlbbat10 %>%
filter( stolen_base > 60 | home_run > 50 ) %>%
dplyr::select( name, team, position, stolen_base, home_run )
## # A tibble: 2 x 5
## name team position stolen_base home_run
## <fct> <fct> <fct> <dbl> <dbl>
## 1 J Pierre CWS OF 68 1
## 2 J Bautista TOR OF 9 54
# Filter for AB greater than or equal to 200
ab_gt_200 <- mlbbat10 %>%
filter(at_bat >= 200)
# Scatterplot of SLG vs. OBP
ggplot(ab_gt_200, aes(x = obp, y = slg)) +
geom_point()
# Identify the outlying player
ab_gt_200 %>%
filter( obp < 0.200 )
## # A tibble: 1 x 19
## name team position game at_bat run hit double triple home_run rbi
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 B Wo… LAA 3B 81 226 20 33 2 0 4 14
## # … with 8 more variables: total_base <dbl>, walk <dbl>, strike_out <dbl>,
## # stolen_base <dbl>, caught_stealing <dbl>, obp <dbl>, slg <dbl>,
## # bat_avg <dbl>
Correlation
Quantifying the Strength of Bivariate Relationships
Correlation
* Correlation coefficients range from -1 to 1
* The sign (+/-) indicates the direction of the relationship
* The magnitude corresponds to the strength:
  + Strong: absolute magnitude close to 1
  + Moderate: absolute magnitude close to 0.5
  + Weak: absolute magnitude close to 0.2
  + No relationship: absolute magnitude close to 0
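A quick sketch with simulated data (slopes and noise levels chosen only for illustration) showing how cor() maps onto these qualitative labels:
set.seed( 1 )
x <- rnorm( 500 )
c( strong   = cor( x, 2 * x + rnorm( 500, sd = 0.5 ) ),  # |r| close to 1
   moderate = cor( x, x + rnorm( 500, sd = 1.7 ) ),      # |r| around 0.5
   weak     = cor( x, 0.2 * x + rnorm( 500, sd = 1 ) ),  # |r| around 0.2
   none     = cor( x, rnorm( 500 ) ) )                   # |r| near 0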
Non-Linear correlation
glimpse( run09 )
## Rows: 14,974
## Columns: 14
## $ place <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ time <dbl> 53.533, 53.917, 53.967, 54.433, 54.450, 54.533, 54.633, 54.…
## $ net_time <dbl> 53.533, 53.917, 53.967, 54.433, 54.450, 54.533, 54.633, 54.…
## $ pace <dbl> 5.367, 5.400, 5.400, 5.450, 5.450, 5.467, 5.467, 5.467, 5.4…
## $ age <int> 21, 21, 22, 19, 36, 28, 25, 31, 23, 26, 23, 35, 28, 28, 26,…
## $ gender <fct> F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,…
## $ first <fct> Lineth, Belianesh Zemed, Teyba, Abebu, Catherine, Olga, Sal…
## $ last <fct> Chepkurui, Gebre, Naser, Gelan, Ndereba, Romanova, Meyerhof…
## $ city <fct> Kenya, Ethiopia, Ethiopia, Ethiopia, Kenya, Russia, United …
## $ state <fct> NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, NR, CO, NR,…
## $ country <fct> KEN, ETH, ETH, ETH, KEN, RUS, USA, KEN, ETH, RUS, ETH, ROM,…
## $ div <int> 2, 2, 2, 1, 5, 3, 3, 4, 2, 3, 2, 5, 3, 3, 3, 3, 3, 5, 4, 2,…
## $ div_place <int> 1, 2, 3, 1, 1, 1, 2, 1, 4, 3, 4, 2, 4, 5, 6, 7, 8, 3, 2, 5,…
## $ div_tot <int> 953, 953, 953, 71, 1130, 2706, 2706, 1678, 953, 2706, 953, …
run09 %>%
filter( div_place <= 10) %>%
ggplot( aes( x = age, y = pace, color = gender ) ) +
geom_point()
Pearson Product-Moment Correlation \[r(x,y) = \frac{Cov(x,y)}{\sqrt{SXX \cdot SYY}} = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2 \cdot \sum_{i=1}^{n}(y_i-\bar{y})^2}}\] Numerator: the sum of the products of the deviations of \(x\) and \(y\) from their means. Denominator: the square root of the product of the sums of squared deviations of \(x\) and \(y\).
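As a sanity check of the formula, a minimal sketch (not in the original notes) computing \(r\) by hand on the cleaned ncbirths data and comparing it with the built-in cor():
ncbirths %>%
  dplyr::summarize(
    r_by_hand = sum( ( weeks - mean( weeks ) ) * ( weight - mean( weight ) ) ) /
      sqrt( sum( ( weeks - mean( weeks ) )^2 ) * sum( ( weight - mean( weight ) )^2 ) ),
    r_cor = cor( weeks, weight ) )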
#glimpse( ncbirths )
# Compute correlation
ncbirths %>%
dplyr::summarize(N = n(), r = cor(weight, mage))
## # A tibble: 1 x 2
## N r
## <int> <dbl>
## 1 800 0.0517
# Compute correlation for all non-missing pairs
ncbirths %>%
dplyr::summarize(N = n(), r = cor(weight, weeks, use = "pairwise.complete.obs"))
## # A tibble: 1 x 2
## N r
## <int> <dbl>
## 1 800 0.636
The Anscombe Dataset
glimpse( anscombe )
## Rows: 11
## Columns: 8
## $ x1 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x2 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x3 <dbl> 10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5
## $ x4 <dbl> 8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8
## $ y1 <dbl> 8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68
## $ y2 <dbl> 9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74
## $ y3 <dbl> 7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73
## $ y4 <dbl> 6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89
p1 <- ggplot( data = anscombe, aes( x = x1, y = y1 ) ) +
geom_point()
p2 <- ggplot( data = anscombe, aes( x = x2, y = y2 ) ) +
geom_point()
p3 <- ggplot( data = anscombe, aes( x = x3, y = y3 ) ) +
geom_point()
p4 <- ggplot( data = anscombe, aes( x = x4, y = y4 ) ) +
geom_point()
grid.arrange( p1, p2, p3, p4, ncol = 2 )
All four of these data series have the same number of points, the same mean and the same standard deviation in both \(x\) and \(y\). They yield the same regression line fit and correlation values. However, the distributions are very different from one another.
This demonstration stresses the need to visually inspect your data!
#glimpse( anscombe)
Anscombe <- anscombe %>%
pivot_longer( cols = everything(), names_to = 'labs', values_to = 'vals' ) %>%
mutate( Xaxis = str_detect( labs, 'x' ) == TRUE,
set = case_when( str_detect( labs, '1' ) == TRUE ~ '1',
str_detect( labs, '2' ) == TRUE ~ '2',
str_detect( labs, '3' ) == TRUE ~ '3',
str_detect( labs, '4' ) == TRUE ~ '4'),
id = sort ( rep( c( 1:11 ), 8) ))
AnsX <- Anscombe %>%
filter( Xaxis == TRUE ) %>%
rename( X = vals )
AnsY <- Anscombe %>%
filter( Xaxis == FALSE ) %>%
rename( Y = vals)
Anscombe <- AnsX %>%
mutate( Y = AnsY$Y) %>%
dplyr::select( X, Y, set, id )
Anscombe
## # A tibble: 44 x 4
## X Y set id
## <dbl> <dbl> <chr> <int>
## 1 10 8.04 1 1
## 2 10 9.14 2 1
## 3 10 7.46 3 1
## 4 8 6.58 4 1
## 5 8 6.95 1 2
## 6 8 8.14 2 2
## 7 8 6.77 3 2
## 8 8 5.76 4 2
## 9 13 7.58 1 3
## 10 13 8.74 2 3
## # … with 34 more rows
ggplot(data = Anscombe, aes(x = X, y = Y)) +
geom_point() +
facet_wrap(~ set)
# Compute properties of Anscombe
Anscombe %>%
group_by(set) %>%
dplyr::summarize(
N = n(),
mean_of_x = mean( X ),
std_dev_of_x = sd( X ),
mean_of_y = mean( Y ),
std_dev_of_y = sd( Y ),
correlation_between_x_and_y = cor( X,Y )
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 7
## set N mean_of_x std_dev_of_x mean_of_y std_dev_of_y correlation_between…
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11 9 3.32 7.50 2.03 0.816
## 2 2 11 9 3.32 7.50 2.03 0.816
## 3 3 11 9 3.32 7.5 2.03 0.816
## 4 4 11 9 3.32 7.50 2.03 0.817
# Run this and look at the plot
mlbbat10 %>%
ggplot(aes(x = obp, y = slg)) +
geom_point()
# Correlation for all players (no at-bat filter)
mlbbat10 %>%
dplyr::summarize(N = n(), r = cor(obp, slg))
## # A tibble: 1 x 2
## N r
## <int> <dbl>
## 1 1199 0.815
# Run this and look at the plot
mlbbat10 %>%
filter(at_bat > 200) %>%
ggplot(aes(x = obp, y = slg)) +
geom_point()
# Correlation for players with more than 200 ABs
mlbbat10 %>%
filter(at_bat > 200) %>%
dplyr::summarize(N = n(), r = cor(obp, slg))
## # A tibble: 1 x 2
## N r
## <int> <dbl>
## 1 327 0.686
# Run this and look at the plot
ggplot(data = bdims, aes(x = hgt, y = wgt, color = factor(sex))) +
geom_point()
# Correlation of body dimensions
bdims %>%
group_by(sex) %>%
dplyr::summarize(N = n(), r = cor(hgt, wgt))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## sex N r
## <int> <int> <dbl>
## 1 0 260 0.431
## 2 1 247 0.535
#glimpse( mammals )
# Run this and look at the plot
ggplot(data = mammals, aes(x = body, y = brain)) +
geom_point() + scale_x_log10() + scale_y_log10()
# Correlation among mammals, with and without log
mammals %>%
dplyr::summarize(N = n(),
r = cor(body, brain),
r_log = cor(log(body), log(brain)))
## N r r_log
## 1 62 0.9341638 0.9595748
Interpretation of Correlation
Correlation \(\neq\) Causation
Spurious Correlations
Remarkable but nonsensical correlations are called spurious.
Two variables may seem to move together, but remember that a strong correlation between two variables does not take into account confounding relationships: a lurking variable may be driving both.
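A minimal simulation sketch (made-up variables) of a confounded relationship: here x and y are both driven by a lurking variable z, so they are strongly correlated even though neither causes the other.
set.seed( 7 )
z <- rnorm( 1000 )                # lurking (confounding) variable
x <- z + rnorm( 1000, sd = 0.3 )  # x is driven by z
y <- z + rnorm( 1000, sd = 0.3 )  # y is driven by z, not by x
cor( x, y )                       # high correlation with no direct causal link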
Simple Linear Regression
Visualization of Linear Models
#glimpse( possum )
ggplot( data = possum, aes( y = total_l, x = tail_l ) ) +
geom_point() +
#geom_abline( intercept = 0, slope = 2.5 ) # not a great fit
#geom_abline( intercept = 0, slope = 2.3 ) #a gentler slope fits better
geom_abline( intercept = 40, slope = 1.3 ) # why force the intercept through the origin?
How do we determine which line fits the data best?
The best-fit line minimizes the sum of squared residuals (see the sketch below).
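A minimal sketch comparing the sums of squared residuals of the hand-picked lines from the plot above against the least-squares fit:
sse_for_line <- function( intercept, slope ) {
  sum( ( possum$total_l - ( intercept + slope * possum$tail_l ) )^2 )
}
c( origin_steep   = sse_for_line( 0, 2.5 ),
   origin_gentler = sse_for_line( 0, 2.3 ),
   eyeballed      = sse_for_line( 40, 1.3 ),
   least_squares  = sum( residuals( lm( total_l ~ tail_l, data = possum ) )^2 ) )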
ggplot( possum, aes( y = total_l, x = tail_l ) ) +
geom_point() +
geom_smooth( method = 'lm' )
## `geom_smooth()` using formula 'y ~ x'
# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
Understanding Linear Models
Generic statistical modeling framework: \[\mbox{response} = f(\mbox{explanatory}) + \mbox{noise}\]
For linear regression, we assume that \(f()\) takes a linear form: \[\mbox{response} = \mbox{intercept} + \mbox{slope}\cdot\mbox{explanatory} + \mbox{noise}\]
If we assume that the data behave linearly, they can be modeled as follows: \[Y = \beta_0 + \beta_1 \cdot X + \epsilon, \mbox{ where } \epsilon \sim N(0,\sigma_{\epsilon}) \]
Fitted estimates from the linear model can be described by \[\hat{Y} = \hat{\beta}_0 + \hat{\beta}_1 \cdot X\]
The difference between the two gives the residuals: \[\mbox{Residuals } = e = Y - \hat{Y}\]
The residuals are the realization of the noise term: \(\epsilon\) is an unknown, true quantity that describes the noise in the system, whereas \(e\) is a known estimate of \(\epsilon\).
The goal for determining the coefficients of the best fit is to minimize the sum of squares of the residuals, \(\sum_{i=1}^{n}e_i^2\)
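For simple linear regression this minimization has a closed-form solution; a quick sketch using the bdims data modeled below, which should agree with coef() from lm():
beta1_hat <- cov( bdims$hgt, bdims$wgt ) / var( bdims$hgt )
beta0_hat <- mean( bdims$wgt ) - beta1_hat * mean( bdims$hgt )
c( intercept = beta0_hat, slope = beta1_hat )  # should match coef( lm( wgt ~ hgt, bdims ) )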
Key Concepts
- \(\hat{Y}\) is the expected value for a given \(X\), basically, our best guess
- \(\hat{\beta}\)s are estimates of true, unknown \(\beta\)s
- Residuals (\(e\)s) are estimates of true unknown \(\epsilon\)s
- Error is a misleading term; it is better to think of it as noise
bdimslm <- lm( wgt ~ hgt, bdims)
summary( bdimslm )
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
Regression vs. regression to the mean
Galton’s “Regression to the mean”
data( Galton )
glimpse( Galton )
## Rows: 928
## Columns: 2
## $ parent <dbl> 70.5, 68.5, 65.5, 64.5, 64.0, 67.5, 67.5, 67.5, 66.5, 66.5, 66…
## $ child <dbl> 61.7, 61.7, 61.7, 61.7, 61.7, 62.2, 62.2, 62.2, 62.2, 62.2, 62…
# Height of children vs. height of parents
ggplot(data = Galton, aes(x = parent, y = child)) +
geom_jitter() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = 'lm', se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
Interpreting regression models
Interpretation of Regression
glimpse( textbooks )
## Rows: 73
## Columns: 7
## $ dept_abbr <fct> Am Ind, Anthro, Anthro, Anthro, Art His, Art His, Asia Am, …
## $ course <fct> C170, 9, 135T, 191HB, M102K, 118E, 187B, 191E, C125, M145B…
## $ isbn <fct> 978-0803272620, 978-0030119194, 978-0300080643, 978-0226206…
## $ ucla_new <dbl> 27.67, 40.59, 31.68, 16.00, 18.95, 14.95, 24.70, 19.50, 123…
## $ amaz_new <dbl> 27.95, 31.14, 32.00, 11.52, 14.21, 10.17, 20.06, 16.66, 106…
## $ more <fct> Y, Y, Y, Y, Y, Y, Y, N, N, Y, Y, N, Y, Y, N, N, N, N, N, N,…
## $ diff <dbl> -0.28, 9.45, -0.32, 4.48, 4.74, 4.78, 4.64, 2.84, 17.59, 3.…
textbooks %>% mutate( course_number = readr::parse_number( as.character( course ) ) ) %>%
ggplot( aes( x = course_number, y = ucla_new ) ) +
geom_point()
textbooks %>%
ggplot( aes( x = amaz_new, y = ucla_new ) ) +
geom_point() +
geom_smooth( method = 'lm' )
## `geom_smooth()` using formula 'y ~ x'
textlm <- lm( ucla_new ~ amaz_new, data = textbooks )
summary( textlm )
##
## Call:
## lm(formula = ucla_new ~ amaz_new, data = textbooks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.785 -4.574 0.577 4.012 39.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.92897 1.93538 0.48 0.633
## amaz_new 1.19900 0.02519 47.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.47 on 71 degrees of freedom
## Multiple R-squared: 0.9696, Adjusted R-squared: 0.9692
## F-statistic: 2266 on 1 and 71 DF, p-value: < 2.2e-16
For each additional dollar that Amazon charges for a book, the UCLA bookstore price increases by about $1.20.
Your Linear Model Object
The output from lm() is an object. This object holds all the relevant information used to build the model (including the data itself).
class( textlm )
## [1] "lm"
textlm
##
## Call:
## lm(formula = ucla_new ~ amaz_new, data = textbooks)
##
## Coefficients:
## (Intercept) amaz_new
## 0.929 1.199
coef( textlm )
## (Intercept) amaz_new
## 0.9289651 1.1990014
Return the \(\hat{Y}\) values for each data point:
fitted.values( textlm )
## 1 2 3 4 5 6 7 8
## 34.44105 38.26587 39.29701 14.74146 17.96678 13.12281 24.98093 20.90433
## 9 10 11 12 13 14 15 16
## 128.32287 16.82772 36.83906 106.54900 23.05054 20.67652 117.68772 57.89352
## 17 18 19 20 21 22 23 24
## 90.77014 160.12038 146.60764 130.42112 14.92131 23.63805 15.60474 27.24705
## 25 26 27 28 29 30 31 32
## 38.26587 35.64006 20.29284 46.19127 39.03323 40.46004 37.94214 102.84409
## 33 34 35 36 37 38 39 40
## 42.83406 118.37115 98.26390 12.31948 13.15878 162.42247 173.28542 211.95321
## 41 42 43 44 45 46 47 48
## 181.53455 175.26377 209.02765 157.99815 189.98751 165.39599 30.84405 191.90591
## 49 50 51 52 53 54 55 56
## 28.58993 26.15595 52.10235 48.13365 103.08389 112.59197 81.74166 160.14436
## 57 58 59 60 61 62 63 64
## 30.07669 30.84405 103.38364 13.01490 79.73933 101.95682 11.24038 70.97463
## 65 66 67 68 69 70 71 72
## 97.29271 77.77297 45.33998 25.16078 48.09768 32.54663 29.93281 23.37427
## 73
## 22.77477
Return the residuals \(Y-\hat{Y}\) or \(e\):
residuals( textlm )
## 1 2 3 4 5 6
## -6.77105468 2.32413081 -7.61701041 1.25853860 0.98322479 1.82719051
## 7 8 9 10 11 12
## -0.28093350 -1.40432868 -4.48286560 0.17227613 -5.20905751 9.45100013
## 13 14 15 16 17 18
## 4.61945878 4.02348159 8.98227697 -3.99352239 -1.04014123 10.87961683
## 19 20 21 22 23 24
## 5.39236280 -5.62111808 1.07868839 2.31194809 2.39525758 -5.51704618
## 25 26 27 28 29 30
## 2.32413081 -6.69005609 -0.34283796 3.25873144 2.05676990 10.48995821
## 31 32 33 34 35 36
## 6.55786119 -20.39408550 -8.23406459 -29.95115384 -14.26390008 -1.06947854
## 37 38 39 40 41 42
## 1.84122047 17.60753411 0.71458128 -23.37321441 -34.78454847 8.48622894
## 43 44 45 46 47 48
## 5.47234904 39.00184934 4.01249154 10.85401060 -6.14405043 -3.90591073
## 49 50 51 52 53 54
## 1.11007224 0.08404511 3.02765446 -4.57365085 26.51611422 11.24803299
## 55 56 57 58 59 60
## 3.37833944 -7.66436320 -0.37668952 -6.14405043 -13.67363613 -2.51489936
## 61 62 63 64 65 66
## 13.14067180 -1.07682445 0.70962274 1.49537216 -5.19270894 0.57703413
## 67 68 69 70 71 72
## -3.33997755 -6.41078371 0.36231919 7.00336756 -0.28280935 0.38572840
## 73
## 4.92522911
Use the broom library to augment the original model output to include useful features of the fit:
augtextlm <- augment( textlm )
glimpse( augtextlm )
## Rows: 73
## Columns: 8
## $ ucla_new <dbl> 27.67, 40.59, 31.68, 16.00, 18.95, 14.95, 24.70, 19.50, 12…
## $ amaz_new <dbl> 27.95, 31.14, 32.00, 11.52, 14.21, 10.17, 20.06, 16.66, 10…
## $ .fitted <dbl> 34.44105, 38.26587, 39.29701, 14.74146, 17.96678, 13.12281…
## $ .resid <dbl> -6.7710547, 2.3241308, -7.6170104, 1.2585386, 0.9832248, 1…
## $ .std.resid <dbl> -0.65294223, 0.22399308, -0.73400193, 0.12183273, 0.095110…
## $ .hat <dbl> 0.01944321, 0.01833896, 0.01806141, 0.02699567, 0.02554530…
## $ .sigma <dbl> 10.51520, 10.54319, 10.50682, 10.54581, 10.54624, 10.54459…
## $ .cooksd <dbl> 4.226829e-03, 4.686540e-04, 4.954865e-03, 2.059099e-04, 1.…
# Is the mean of the observed prices equal to the mean of the fitted values?
( mean( textbooks$ucla_new ) == mean( fitted.values( textlm ) ) )
## [1] TRUE
# Mean of the residuals
mean( residuals( textlm ) )
## [1] -5.944398e-17
# create textlm_tidy
textlm_tidy <- data.frame( augment( textlm ) )
glimpse( textlm_tidy )
## Rows: 73
## Columns: 8
## $ ucla_new <dbl> 27.67, 40.59, 31.68, 16.00, 18.95, 14.95, 24.70, 19.50, 12…
## $ amaz_new <dbl> 27.95, 31.14, 32.00, 11.52, 14.21, 10.17, 20.06, 16.66, 10…
## $ .fitted <dbl> 34.44105, 38.26587, 39.29701, 14.74146, 17.96678, 13.12281…
## $ .resid <dbl> -6.7710547, 2.3241308, -7.6170104, 1.2585386, 0.9832248, 1…
## $ .std.resid <dbl> -0.65294223, 0.22399308, -0.73400193, 0.12183273, 0.095110…
## $ .hat <dbl> 0.01944321, 0.01833896, 0.01806141, 0.02699567, 0.02554530…
## $ .sigma <dbl> 10.51520, 10.54319, 10.50682, 10.54581, 10.54624, 10.54459…
## $ .cooksd <dbl> 4.226829e-03, 4.686540e-04, 4.954865e-03, 2.059099e-04, 1.…
Using your Linear Model
Examining the Residuals:
textlm_tidy %>%
arrange( desc( .resid ) ) %>%
head()
## ucla_new amaz_new .fitted .resid .std.resid .hat .sigma
## 1 197.00 131.00 157.99815 39.00185 3.807626 0.04330931 9.408665
## 2 129.60 85.20 103.08389 26.51611 2.554497 0.01753183 10.050561
## 3 180.03 134.69 162.42247 17.60753 1.721789 0.04644271 10.324375
## 4 92.88 65.73 79.73933 13.14067 1.263623 0.01392606 10.427641
## 5 123.84 93.13 112.59197 11.24803 1.085114 0.02025756 10.459091
## 6 171.00 132.77 160.12038 10.87962 1.062967 0.04479266 10.462654
## .cooksd
## 1 0.32816195
## 2 0.05822235
## 3 0.07219393
## 4 0.01127520
## 5 0.01217296
## 6 0.02649222
The residuals reveal how far a textbook’s price is from the model prediction; in other words, how overpriced the textbook is. The output above shows that the most overpriced text sells for $197, which is $66 more than the Amazon price and about $39 more than the model predicts given the typical markup.
textbooks %>%
filter( ucla_new == 197 )
## # A tibble: 1 x 7
## dept_abbr course isbn ucla_new amaz_new more diff
## <fct> <fct> <fct> <dbl> <dbl> <fct> <dbl>
## 1 Mgmt 228 978-0073379661 197 131 Y 66
A quick ISBN search shows this text is titled ‘Financial Statement Analysis and Security Valuation’. The DataCamp instructor is quick to point out the irony in this.
Were there any underpriced books?
textlm_tidy %>%
arrange( .resid ) %>%
head()
## ucla_new amaz_new .fitted .resid .std.resid .hat .sigma
## 1 146.75 150.63 181.5345 -34.78455 -3.429184 0.06178867 9.633991
## 2 88.42 97.95 118.3712 -29.95115 -2.892403 0.02226987 9.906067
## 3 188.58 176.00 211.9532 -23.37321 -2.342591 0.09227679 10.131120
## 4 82.45 85.00 102.8441 -20.39409 -1.964657 0.01747249 10.256218
## 5 84.00 81.18 98.2639 -14.26390 -1.373378 0.01642800 10.405876
## 6 89.71 85.45 103.3836 -13.67364 -1.317335 0.01760665 10.417222
## .cooksd
## 1 0.38722189
## 2 0.09527663
## 3 0.27893442
## 4 0.03432050
## 5 0.01575174
## 6 0.01555083
Yes, there were quite a few underpriced books. It would be interesting to pursue this dataset further to understand what features correspond to under/over pricing.
textbooks %>%
filter( ucla_new == 146.75 )
## # A tibble: 1 x 7
## dept_abbr course isbn ucla_new amaz_new more diff
## <fct> <fct> <fct> <dbl> <dbl> <fct> <dbl>
## 1 Mgmt 1A 978-0077251031 147. 151. Y -3.88
The most underpriced book is a study guide to the Principles of Accounting.
Making Predictions
predict( model, newdata ) yields fitted values for any new data
new_df <- data.frame( amaz_new = 8.49 )
predict( textlm, newdata = new_df )
## 1
## 11.10849
Visualizing New Observations: use broom::augment() to get predictions in a data.frame along with other relevant model features
isrs <- broom::augment( textlm, newdata = new_df )
ggplot( data = textbooks, aes( x = amaz_new, y = ucla_new ) ) +
geom_point() +
geom_smooth( method = 'lm' ) +
geom_point( data = isrs, aes( y = .fitted ), size = 3, color = 'magenta' )
## `geom_smooth()` using formula 'y ~ x'
mod <- lm( wgt ~ hgt, data = bdims)
coefs <- data.frame( Intercept = coef( mod )[1], slope = coef( mod )[2] )
glimpse( coefs )
## Rows: 1
## Columns: 2
## $ Intercept <dbl> -105.0113
## $ slope <dbl> 1.017617
# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_abline(data = coefs,
aes( intercept = Intercept,
slope = slope ),
color = 'dodgerblue' )
Model fit
Assessing Model Fit
How well do models work?
How to quantify the quality of a model fit?
- Sum of squared deviations (SSE). However, squaring penalizes large deviations disproportionately.
#Two ways to compute SSE
mod_possum <- lm( total_l ~ tail_l, data = possum )
mod_possum %>%
augment() %>%
dplyr::summarize( SSE = sum( .resid^2 ),
SSE_also = ( n() - 1 ) * var( .resid ) )
## # A tibble: 1 x 2
## SSE SSE_also
## <dbl> <dbl>
## 1 1301. 1301.
The SSE is a single number that captures how much the model’s predictions miss the observed data. However, it is difficult to interpret because its units are squared. Therefore it is common to work with the RMSE.
RMSE: Root Mean Square Error or The Residual Standard Error
\[RMSE = \sqrt{\frac{\sum_i e_i^2}{d.f}} = \sqrt{\frac{SSE}{n-2}}\]
summary( mod_possum )
##
## Call:
## lm(formula = total_l ~ tail_l, data = possum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2100 -2.3265 0.1792 2.7765 6.7900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.0371 6.6568 6.165 1.43e-08 ***
## tail_l 1.2443 0.1796 6.927 3.94e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.572 on 102 degrees of freedom
## Multiple R-squared: 0.32, Adjusted R-squared: 0.3133
## F-statistic: 47.99 on 1 and 102 DF, p-value: 3.935e-10
RMSE is conveniently in the same units as the response variable. Here, RMSE = 3.572 on 102 degrees of freedom, which means that our model’s predictions of body length are typically within about 3.572 cm of the truth.
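To make “typically within” concrete, a quick sketch (not from the original notes) computing the share of possum body lengths that fall within one RMSE of their fitted values; for roughly normal residuals this is about two-thirds:
rmse_possum <- sqrt( sum( residuals( mod_possum )^2 ) / df.residual( mod_possum ) )
mean( abs( residuals( mod_possum ) ) <= rmse_possum )  # share of residuals within one RMSE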
summary( textlm )
##
## Call:
## lm(formula = ucla_new ~ amaz_new, data = textbooks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.785 -4.574 0.577 4.012 39.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.92897 1.93538 0.48 0.633
## amaz_new 1.19900 0.02519 47.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.47 on 71 degrees of freedom
## Multiple R-squared: 0.9696, Adjusted R-squared: 0.9692
## F-statistic: 2266 on 1 and 71 DF, p-value: < 2.2e-16
For the textbook model, RMSE = 10.47, so our predicted textbook price is typically within about $10.47 of the truth.
# Compute the mean of the residuals
mean( residuals( textlm ) )
## [1] -5.944398e-17
# Compute RMSE
sqrt(sum(residuals( textlm )^2) / df.residual( textlm ))
## [1] 10.47237
Comparing Model Fits
We want a unitless measure of model fit that can be used to compare different modeling approaches.
The Null (average) Model
- for all observations, \(\hat{y} = \bar{y}\)
- find the SSE for the null model:
null_possum <- lm( total_l ~ 1, data = possum )
null_possum %>%
augment( possum ) %>%
dplyr::summarize( SSE = sum( .resid^2 ) )
## # A tibble: 1 x 1
## SSE
## <dbl>
## 1 1914.
- find the SSE for our linear model
mod_possum %>%
augment( possum ) %>%
dplyr::summarize( SSE = sum( .resid^2 ) )
## # A tibble: 1 x 1
## SSE
## <dbl>
## 1 1301.
- the ratio of the SSEs quantifies the variance accounted for by the linear model
Coefficient of Determination \[R^2 = 1 - \frac{SSE}{SST} = 1 - \frac{Var(e)}{Var(y)}\] SST is the SSE for the Null Model \(\therefore\) we interpret \(R^2\) as the proportion of variance of the response variable that is explained by the model.
Connection to Correlation: For simple linear regression with a single explanatory variable, the value of \(R^2\) corresponds to the square of the correlation coefficient: \[r_{x,y}^2 = R^2\] However, this is not the case with more complex models.
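A quick sketch verifying this equality for the bdims model fit above:
c( r_squared_from_cor = cor( bdims$hgt, bdims$wgt )^2,
   R_squared_from_lm  = summary( mod )$r.squared )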
\(R^2\) is very commonly used to evaluate model fit. However, it should not be the be-all and end-all measure. A very high \(R^2\) can occur when the model is overfitting the data, and a low \(R^2\) does not mean that the model cannot yield statistically powerful insights about a dataset.
# View model summary
summary( mod )
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Compute R-squared
bdims_tidy <- augment( mod )
bdims_tidy %>%
dplyr::summarize(var_y = var( wgt ), var_e = var( .resid ) ) %>%
mutate(R_squared = 1 - var_e / var_y )
## # A tibble: 1 x 3
## var_y var_e R_squared
## <dbl> <dbl> <dbl>
## 1 178. 86.5 0.515
Unusual Points
Leverage & Influence
regulars <- mlbbat10 %>%
filter( at_bat > 400 )
ggplot( regulars, aes( x = stolen_base, y = home_run )) +
geom_jitter( alpha = 0.5 ) +
geom_smooth( method = 'lm' )
## `geom_smooth()` using formula 'y ~ x'
How individual points, especially if they are extreme, can change the fit of a linear model.
Leverage is a function of the distance between the value of the explanatory variable and the mean of the explanatory variable. Points that are far from the horizontal center of the data distribution have high leverage, while nearby points have low leverage. \[h_i = \frac{1}{n} + \frac{(x_i- \bar{x})^2}{\sum_{i=1}^n (x_i - \bar{x})^2}\] To see the points with the highest leverage, sort by the .hat column.
modmlb <- lm( home_run ~ stolen_base, data = regulars )
modmlb %>%
augment() %>%
arrange( desc( .hat ) ) %>%
dplyr::select( home_run, stolen_base, .fitted, .resid, .hat )
## # A tibble: 170 x 5
## home_run stolen_base .fitted .resid .hat
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 68 2.38 -1.38 0.131
## 2 2 52 6.46 -4.46 0.0703
## 3 5 50 6.97 -1.97 0.0642
## 4 19 47 7.74 11.3 0.0555
## 5 5 47 7.74 -2.74 0.0555
## 6 1 42 9.01 -8.01 0.0426
## 7 18 42 9.01 8.99 0.0426
## 8 6 42 9.01 -3.01 0.0426
## 9 11 37 10.3 0.715 0.0316
## 10 18 34 11.0 6.95 0.0260
## # … with 160 more rows
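As a sanity check on the leverage formula above (a sketch, not in the original notes), the values can be computed by hand and compared with hatvalues():
sb <- regulars$stolen_base
h_manual <- 1 / length( sb ) + ( sb - mean( sb ) )^2 / sum( ( sb - mean( sb ) )^2 )
all.equal( unname( hatvalues( modmlb ) ), h_manual )  # assumes lm() dropped no rows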
Influence: determined by both .hat and .resid; see .cooksd (Cook’s distance).
modmlb %>%
augment() %>%
arrange( desc( .cooksd ) ) %>%
dplyr::select( home_run, stolen_base, .fitted, .resid, .hat, .cooksd ) %>%
head()
## # A tibble: 6 x 6
## home_run stolen_base .fitted .resid .hat .cooksd
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 54 9 17.4 36.6 0.00607 0.0434
## 2 19 47 7.74 11.3 0.0555 0.0416
## 3 34 26 13.1 20.9 0.0144 0.0341
## 4 42 14 16.1 25.9 0.00618 0.0221
## 5 39 0 19.7 19.3 0.0108 0.0216
## 6 18 42 9.01 8.99 0.0426 0.0198
Dealing with Outliers
Removing outliers: what is the justification? If the answer is “because it improves my results”, that is not a justification. How does removal change the scope of inference? By removing an outlier, are you selectively dismissing a group or demographic?
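If an outlier is removed, it is worth quantifying how much the fit actually changes. A hedged sketch using the regulars model from above, refitting without the single most influential point (by Cook’s distance):
# assumes home_run and stolen_base have no missing values among these players
drop_row <- which.max( cooks.distance( modmlb ) )
refit <- lm( home_run ~ stolen_base, data = regulars[ -drop_row, ] )
rbind( with_point = coef( modmlb ), without_point = coef( refit ) )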