# Fetch the OpenIntro mlb11 workspace and load its objects into the session.
# The file is only downloaded when it is not already present, so re-running
# the script does not hit the network again.
data_file <- "mlb11.RData"
if (!file.exists(data_file)) {
  download.file("http://www.openintro.org/stat/data/mlb11.RData",
                destfile = data_file,
                mode = "wb") # binary transfer: .RData must not be newline-translated
}
load(data_file)
# Scatterplot of runs (y) against at_bats (x), one point per row of mlb11.
plot(mlb11$at_bats, mlb11$runs,
     pch = 19,        # solid circle
     cex = 0.9,       # draw points at 90% of the default size
     col = "#cc0000", # red
     main = "Relationship between at_bats & runs",
     xlab = "at_bats",
     ylab = "runs")   # the y variable is mlb11$runs, not home runs
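# The correlation coefficient ranges from -1 to +1. A result of +1 indicates a
# perfect positive linear relationship and -1 a perfect negative one.
# A result of zero indicates no linear relationship at all.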
# Correlation between runs and at_bats, evaluated inside the data frame.
with(mlb11, cor(runs, at_bats))
## [1] 0.610627
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Box plots of runs within bands of at_bats. at_bats is continuous, so without
# an explicit group ggplot2 draws a single box and warns
# ("Continuous x aesthetic -- did you forget aes(group=...)?"). Binning x with
# cut_width() gives one box per band.
# NOTE(review): width = 50 is a starting point; tune it to the spread of at_bats.
mlb11 %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = at_bats, y = runs,
                             group = cut_width(at_bats, width = 50)))
# Histogram of at_bats. The bin count is set explicitly instead of relying on
# stat_bin()'s default of 30, which triggered the "Pick better value with
# `binwidth`" message.
# NOTE(review): bins = 10 is a starting point; adjust to taste.
mlb11 %>%
  ggplot() +
  geom_histogram(mapping = aes(x = at_bats), bins = 10)
# Sum-of-squares demo for runs vs. at_bats. plot_ss() is not defined in this
# script (presumably it is loaded from mlb11.RData -- TODO confirm). The
# recorded output below shows it asking for two clicked points, then printing
# the coefficients of a line and its sum of squares.
plot_ss(x = mlb11$at_bats, y = mlb11$runs)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# Same plot_ss() demo for runs vs. at_bats, now with showSquares = TRUE
# (presumably draws the squared residuals on the plot -- confirm against the
# plot_ss() definition, which is not part of this script).
plot_ss(x = mlb11$at_bats, y = mlb11$runs, showSquares = TRUE)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# Repeat of the plot_ss() call for runs vs. at_bats with showSquares = TRUE.
# NOTE(review): the recorded output is identical to the previous run, so this
# call looks redundant unless a different line is picked interactively.
plot_ss(x = mlb11$at_bats, y = mlb11$runs, showSquares = TRUE)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# Simple linear regression of runs on at_bats; the summary reports the
# residual quantiles, coefficient table and R-squared.
m1 <- lm(formula = runs ~ at_bats, data = mlb11)
summary(m1)
##
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.58 -47.05 -16.59 54.40 176.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2789.2429 853.6957 -3.267 0.002871 **
## at_bats 0.6305 0.1545 4.080 0.000339 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared: 0.3729, Adjusted R-squared: 0.3505
## F-statistic: 16.65 on 1 and 28 DF, p-value: 0.0003388
# Sum-of-squares demo for runs vs. homeruns. plot_ss() is not defined in this
# script (presumably it is loaded from mlb11.RData -- TODO confirm). The
# recorded output below shows the line's coefficients and its sum of squares.
plot_ss(x = mlb11$homeruns, y = mlb11$runs)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## 415.239 1.835
##
## Sum of Squares: 73671.99
# Same plot_ss() demo for runs vs. homeruns with showSquares = TRUE
# (presumably draws the squared residuals on the plot -- confirm against the
# plot_ss() definition, which is not part of this script).
plot_ss(x = mlb11$homeruns, y = mlb11$runs, showSquares = TRUE)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## 415.239 1.835
##
## Sum of Squares: 73671.99
# Simple linear regression of runs on homeruns; the summary reports the
# residual quantiles, coefficient table and R-squared.
m2 <- lm(formula = runs ~ homeruns, data = mlb11)
summary(m2)
##
## Call:
## lm(formula = runs ~ homeruns, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.615 -33.410 3.231 24.292 104.631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 415.2389 41.6779 9.963 1.04e-10 ***
## homeruns 1.8345 0.2677 6.854 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.29 on 28 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.6132
## F-statistic: 46.98 on 1 and 28 DF, p-value: 1.9e-07
# Diagnostic plots for model m1 (runs ~ at_bats).
# The expressions are written out in full because base graphics uses them
# verbatim as axis labels and plot titles.

# Scatterplot of the data with the fitted m1 line drawn on top.
plot(mlb11$runs ~ mlb11$at_bats)
abline(m1)
# Residuals against the predictor, to look for patterns or changing spread.
plot(m1$residuals ~ mlb11$at_bats)
abline(h = 0, lty = 3) # horizontal reference line at y = 0 (lty = 3 is dotted)
# Distribution of the residuals.
hist(m1$residuals)
# Normal Q-Q plot of the residuals.
qqnorm(m1$residuals)
qqline(m1$residuals) # adds the reference line to the normal Q-Q plot
# Base-graphics scatterplot of runs against stolen_bases.
with(mlb11, plot(
  x = stolen_bases, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between stolen_bases & runs",
  xlab = "stolen_bases",
  ylab = "runs"
))
# Correlation between runs and stolen_bases.
with(mlb11, cor(runs, stolen_bases))
## [1] 0.05398141
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = stolen_bases, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on stolen_bases.
m3 <- lm(formula = runs ~ stolen_bases, data = mlb11)
summary(m3)
##
## Call:
## lm(formula = runs ~ stolen_bases, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.94 -62.87 10.01 38.54 182.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 677.3074 58.9751 11.485 4.17e-12 ***
## stolen_bases 0.1491 0.5211 0.286 0.777
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83.82 on 28 degrees of freedom
## Multiple R-squared: 0.002914, Adjusted R-squared: -0.0327
## F-statistic: 0.08183 on 1 and 28 DF, p-value: 0.7769
# Base-graphics scatterplot of runs against hits.
with(mlb11, plot(
  x = hits, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between hits & runs",
  xlab = "hits",
  ylab = "runs"
))
# Correlation between runs and hits.
with(mlb11, cor(runs, hits))
## [1] 0.8012108
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = hits, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on hits.
m4 <- lm(formula = runs ~ hits, data = mlb11)
summary(m4)
##
## Call:
## lm(formula = runs ~ hits, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103.718 -27.179 -5.233 19.322 140.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -375.5600 151.1806 -2.484 0.0192 *
## hits 0.7589 0.1071 7.085 1.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.23 on 28 degrees of freedom
## Multiple R-squared: 0.6419, Adjusted R-squared: 0.6292
## F-statistic: 50.2 on 1 and 28 DF, p-value: 1.043e-07
# Base-graphics scatterplot of runs against bat_avg.
with(mlb11, plot(
  x = bat_avg, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between bat_avg & runs",
  xlab = "bat_avg",
  ylab = "runs"
))
# Correlation between runs and bat_avg.
with(mlb11, cor(runs, bat_avg))
## [1] 0.8099859
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = bat_avg, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on bat_avg.
m5 <- lm(formula = runs ~ bat_avg, data = mlb11)
summary(m5)
##
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.676 -26.303 -5.496 28.482 131.113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -642.8 183.1 -3.511 0.00153 **
## bat_avg 5242.2 717.3 7.308 5.88e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.6438
## F-statistic: 53.41 on 1 and 28 DF, p-value: 5.877e-08
# Base-graphics scatterplot of runs against strikeouts.
with(mlb11, plot(
  x = strikeouts, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between strikeouts & runs",
  xlab = "strikeouts",
  ylab = "runs"
))
# Correlation between runs and strikeouts.
with(mlb11, cor(runs, strikeouts))
## [1] -0.4115312
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = strikeouts, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on strikeouts.
m6 <- lm(formula = runs ~ strikeouts, data = mlb11)
summary(m6)
##
## Call:
## lm(formula = runs ~ strikeouts, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132.27 -46.95 -11.92 55.14 169.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1054.7342 151.7890 6.949 1.49e-07 ***
## strikeouts -0.3141 0.1315 -2.389 0.0239 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76.5 on 28 degrees of freedom
## Multiple R-squared: 0.1694, Adjusted R-squared: 0.1397
## F-statistic: 5.709 on 1 and 28 DF, p-value: 0.02386
# Base-graphics scatterplot of runs against wins.
with(mlb11, plot(
  x = wins, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between wins & runs",
  xlab = "wins",
  ylab = "runs"
))
# Correlation between runs and wins.
with(mlb11, cor(runs, wins))
## [1] 0.6008088
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = wins, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on wins.
m7 <- lm(formula = runs ~ wins, data = mlb11)
summary(m7)
##
## Call:
## lm(formula = runs ~ wins, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.450 -47.506 -7.482 47.346 142.186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342.121 89.223 3.834 0.000654 ***
## wins 4.341 1.092 3.977 0.000447 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.1 on 28 degrees of freedom
## Multiple R-squared: 0.361, Adjusted R-squared: 0.3381
## F-statistic: 15.82 on 1 and 28 DF, p-value: 0.0004469
# Base-graphics scatterplot of runs against homeruns.
with(mlb11, plot(
  x = homeruns, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between homeruns & runs",
  xlab = "homeruns",
  ylab = "runs"
))
# Correlation between runs and homeruns.
with(mlb11, cor(runs, homeruns))
## [1] 0.7915577
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = homeruns, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on homeruns (same formula and data as the
# m2 fit earlier in the script).
m8 <- lm(formula = runs ~ homeruns, data = mlb11)
summary(m8)
##
## Call:
## lm(formula = runs ~ homeruns, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.615 -33.410 3.231 24.292 104.631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 415.2389 41.6779 9.963 1.04e-10 ***
## homeruns 1.8345 0.2677 6.854 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.29 on 28 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.6132
## F-statistic: 46.98 on 1 and 28 DF, p-value: 1.9e-07
# Base-graphics scatterplot of runs against new_onbase.
with(mlb11, plot(
  x = new_onbase, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between new_onbase & runs",
  xlab = "new_onbase",
  ylab = "runs"
))
# Correlation between runs and new_onbase.
with(mlb11, cor(runs, new_onbase))
## [1] 0.9214691
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = new_onbase, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on new_onbase.
m9 <- lm(formula = runs ~ new_onbase, data = mlb11)
summary(m9)
##
## Call:
## lm(formula = runs ~ new_onbase, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.270 -18.335 3.249 19.520 69.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1118.4 144.5 -7.741 1.97e-08 ***
## new_onbase 5654.3 450.5 12.552 5.12e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.61 on 28 degrees of freedom
## Multiple R-squared: 0.8491, Adjusted R-squared: 0.8437
## F-statistic: 157.6 on 1 and 28 DF, p-value: 5.116e-13
# Base-graphics scatterplot of runs against new_slug.
with(mlb11, plot(
  x = new_slug, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between new_slug & runs",
  xlab = "new_slug",
  ylab = "runs"
))
# Correlation between runs and new_slug.
with(mlb11, cor(runs, new_slug))
## [1] 0.9470324
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = new_slug, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on new_slug.
m10 <- lm(formula = runs ~ new_slug, data = mlb11)
summary(m10)
##
## Call:
## lm(formula = runs ~ new_slug, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.41 -18.66 -0.91 16.29 52.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -375.80 68.71 -5.47 7.70e-06 ***
## new_slug 2681.33 171.83 15.61 2.42e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.96 on 28 degrees of freedom
## Multiple R-squared: 0.8969, Adjusted R-squared: 0.8932
## F-statistic: 243.5 on 1 and 28 DF, p-value: 2.42e-15
# Base-graphics scatterplot of runs against new_obs.
with(mlb11, plot(
  x = new_obs, y = runs,
  pch = 19,        # filled circle
  cex = 0.9,       # points at 90% of the default size
  col = "#cc0000", # red
  main = "Relationship between new_obs & runs",
  xlab = "new_obs",
  ylab = "runs"
))
# Correlation between runs and new_obs.
with(mlb11, cor(runs, new_obs))
## [1] 0.9669163
# The same scatterplot in ggplot2 with a least-squares line (no confidence
# band) overlaid; both layers share the aesthetics set in ggplot().
mlb11 %>%
  ggplot(mapping = aes(x = new_obs, y = runs)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression of runs on new_obs.
m11 <- lm(formula = runs ~ new_obs, data = mlb11)
summary(m11)
##
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.456 -13.690 1.165 13.935 41.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -686.61 68.93 -9.962 1.05e-10 ***
## new_obs 1919.36 95.70 20.057 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared: 0.9349, Adjusted R-squared: 0.9326
## F-statistic: 402.3 on 1 and 28 DF, p-value: < 2.2e-16
# Diagnostic plots for model m11 (runs ~ new_obs).
# The expressions are written out in full because base graphics uses them
# verbatim as axis labels and plot titles.

# Scatterplot of the data with the fitted m11 line drawn on top.
plot(mlb11$runs ~ mlb11$new_obs)
abline(m11)
# Residuals against the predictor, to look for patterns or changing spread.
plot(m11$residuals ~ mlb11$new_obs)
abline(h = 0, lty = 3) # horizontal reference line at y = 0 (lty = 3 is dotted)
# Distribution of the residuals.
hist(m11$residuals)
# Normal Q-Q plot of the m11 residuals. This previously used m1$residuals, so
# the normality check was being run on the wrong model.
qqnorm(m11$residuals)
qqline(m11$residuals) # adds the reference line to the normal Q-Q plot