code to access each variable in dataset and test model fit
library(statsr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(moments)
#we'll be looking at data from all 30 Major League Baseball teams and examining the linear relationship between runs scored in a season and a number of other player statistics. Our aim will be to summarize these relationships both graphically and numerically in order to find which variable, if any, helps us best predict a team's runs scored in a season.
data(mlb11)
names(mlb11)
## [1] "team" "runs" "at_bats" "hits"
## [5] "homeruns" "bat_avg" "strikeouts" "stolen_bases"
## [9] "wins" "new_onbase" "new_slug" "new_obs"
# relationsship between runs and at_bats
ggplot(data = mlb11, aes(x = at_bats, y = runs)) + geom_point()
# quantify linear relationship
mlb11 %>%
summarise(cor(runs, at_bats))
## # A tibble: 1 x 1
## `cor(runs, at_bats)`
## <dbl>
## 1 0.611
plot_ss(x = at_bats, y = runs, data = mlb11)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
plot_ss(x = at_bats, y = runs, data = mlb11, showSquares = TRUE)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# use linear model
m1 <- lm(runs ~ at_bats, data = mlb11)
# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.
summary(m1)
##
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.58 -47.05 -16.59 54.40 176.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2789.2429 853.6957 -3.267 0.002871 **
## at_bats 0.6305 0.1545 4.080 0.000339 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared: 0.3729, Adjusted R-squared: 0.3505
## F-statistic: 16.65 on 1 and 28 DF, p-value: 0.0003388
#Fit a new model that uses homeruns to predict runs
m2 <- lm(runs ~ homeruns, data = mlb11)
summary(m2)
##
## Call:
## lm(formula = runs ~ homeruns, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.615 -33.410 3.231 24.292 104.631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 415.2389 41.6779 9.963 1.04e-10 ***
## homeruns 1.8345 0.2677 6.854 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.29 on 28 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.6132
## F-statistic: 46.98 on 1 and 28 DF, p-value: 1.9e-07
anova(m2)
## Analysis of Variance Table
##
## Response: runs
## Df Sum Sq Mean Sq F value Pr(>F)
## homeruns 1 123609 123609 46.979 1.9e-07 ***
## Residuals 28 73672 2631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data = mlb11, aes(x = at_bats, y = runs)) +
geom_point() +
stat_smooth(method = "lm", se = FALSE)
# with std error
ggplot(data = mlb11, aes(x = at_bats, y = runs)) +
geom_point() +
stat_smooth(method = "lm", se = TRUE)
# observed number of runs for the team with 5,579 at bats you can use the following
mlb11 %>%
filter(at_bats == 5579) %>%
dplyr::select(runs)
## # A tibble: 1 x 1
## runs
## <int>
## 1 713
#o assess whether the linear model is reliable, we need to check for (1) linearity, (2) nearly normal residuals, and (3) constant variability.
# linearity already checked by using scatterplo now check residual vs fitted values
ggplot(data = m1, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
#The residuals appear to be randomly distributed around 0
#Nearly normal residuals: To check this condition, we can look at a histogram
ggplot(data = m1, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here
ggplot(data = m1, aes(sample = .resid)) +
stat_qq()
skewness(resid(m1))
## [1] 0.5439197
############### check with wins now
ggplot(data = mlb11, aes(x = wins, y = runs)) + geom_point()
# quantify linear relationship
mlb11 %>%
summarise(cor(runs, wins))
## # A tibble: 1 x 1
## `cor(runs, wins)`
## <dbl>
## 1 0.601
plot_ss(x = wins, y = runs, data = mlb11)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## 342.121 4.341
##
## Sum of Squares: 126068.4
# use linear model
m_wins <- lm(runs ~ wins, data = mlb11)
# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.
summary(m_wins)
##
## Call:
## lm(formula = runs ~ wins, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.450 -47.506 -7.482 47.346 142.186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342.121 89.223 3.834 0.000654 ***
## wins 4.341 1.092 3.977 0.000447 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.1 on 28 degrees of freedom
## Multiple R-squared: 0.361, Adjusted R-squared: 0.3381
## F-statistic: 15.82 on 1 and 28 DF, p-value: 0.0004469
# check residucal plot
ggplot(data = m_wins, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
#Nearly normal residuals: To check this condition, we can look at a histogram
ggplot(data = m_wins, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here
ggplot(data = m_wins, aes(sample = .resid)) +
stat_qq()
skewness(resid(m_wins))
## [1] 0.1184672
# check with #############Hits########################
ggplot(data = mlb11, aes(x = hits, y = runs)) + geom_point()
# quantify linear relationship
mlb11 %>%
summarise(cor(hits, runs))
## # A tibble: 1 x 1
## `cor(hits, runs)`
## <dbl>
## 1 0.801
plot_ss(x = wins, y = runs, data = mlb11)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## 342.121 4.341
##
## Sum of Squares: 126068.4
# use linear model
m_hits <- lm(runs ~ hits, data = mlb11)
# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.
summary(m_hits)
##
## Call:
## lm(formula = runs ~ hits, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103.718 -27.179 -5.233 19.322 140.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -375.5600 151.1806 -2.484 0.0192 *
## hits 0.7589 0.1071 7.085 1.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.23 on 28 degrees of freedom
## Multiple R-squared: 0.6419, Adjusted R-squared: 0.6292
## F-statistic: 50.2 on 1 and 28 DF, p-value: 1.043e-07
# check residucal plot
ggplot(data = m_hits, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
#Nearly normal residuals: To check this condition, we can look at a histogram
ggplot(data = m_hits, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here
ggplot(data = m_hits, aes(sample = .resid)) +
stat_qq()
skewness(resid(m_hits))
## [1] 0.584631
################# bat_avg###################
ggplot(data = mlb11, aes(x = bat_avg, y = runs)) + geom_point()
# quantify linear relationship
mlb11 %>%
summarise(cor(bat_avg, runs))
## # A tibble: 1 x 1
## `cor(bat_avg, runs)`
## <dbl>
## 1 0.810
plot_ss(x = bat_avg, y = runs, data = mlb11)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -642.8 5242.2
##
## Sum of Squares: 67849.52
# use linear model
m_bat_avg <- lm(runs ~ bat_avg, data = mlb11)
# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.
summary(m_bat_avg)
##
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.676 -26.303 -5.496 28.482 131.113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -642.8 183.1 -3.511 0.00153 **
## bat_avg 5242.2 717.3 7.308 5.88e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.6438
## F-statistic: 53.41 on 1 and 28 DF, p-value: 5.877e-08
# check residucal plot
ggplot(data = m_bat_avg, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
#Nearly normal residuals: To check this condition, we can look at a histogram
ggplot(data = m_bat_avg, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here
ggplot(data = m_bat_avg, aes(sample = .resid)) +
stat_qq()
skewness(resid(m_bat_avg))
## [1] 0.5703952
######## summary
summary(m1)
##
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.58 -47.05 -16.59 54.40 176.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2789.2429 853.6957 -3.267 0.002871 **
## at_bats 0.6305 0.1545 4.080 0.000339 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared: 0.3729, Adjusted R-squared: 0.3505
## F-statistic: 16.65 on 1 and 28 DF, p-value: 0.0003388
summary(m_wins)
##
## Call:
## lm(formula = runs ~ wins, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.450 -47.506 -7.482 47.346 142.186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342.121 89.223 3.834 0.000654 ***
## wins 4.341 1.092 3.977 0.000447 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.1 on 28 degrees of freedom
## Multiple R-squared: 0.361, Adjusted R-squared: 0.3381
## F-statistic: 15.82 on 1 and 28 DF, p-value: 0.0004469
summary(m_bat_avg)
##
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.676 -26.303 -5.496 28.482 131.113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -642.8 183.1 -3.511 0.00153 **
## bat_avg 5242.2 717.3 7.308 5.88e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.6438
## F-statistic: 53.41 on 1 and 28 DF, p-value: 5.877e-08
summary(m_hits)
##
## Call:
## lm(formula = runs ~ hits, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103.718 -27.179 -5.233 19.322 140.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -375.5600 151.1806 -2.484 0.0192 *
## hits 0.7589 0.1071 7.085 1.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.23 on 28 degrees of freedom
## Multiple R-squared: 0.6419, Adjusted R-squared: 0.6292
## F-statistic: 50.2 on 1 and 28 DF, p-value: 1.043e-07
# look to me wins is better predictor of runs
############# check new obs
ggplot(data = mlb11, aes(x = new_obs, y = runs)) + geom_point()
mlb11 %>% summarise(cor(new_obs, runs))
## # A tibble: 1 x 1
## `cor(new_obs, runs)`
## <dbl>
## 1 0.967
plot_ss(x = new_obs, y = runs, data = mlb11)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -686.6 1919.4
##
## Sum of Squares: 12837.66
m_new_obs <- lm(runs ~ new_obs, data = mlb11)
summary(m_new_obs)
##
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.456 -13.690 1.165 13.935 41.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -686.61 68.93 -9.962 1.05e-10 ***
## new_obs 1919.36 95.70 20.057 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared: 0.9349, Adjusted R-squared: 0.9326
## F-statistic: 402.3 on 1 and 28 DF, p-value: < 2.2e-16
ggplot(data = m_new_obs, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
ggplot(data = m_new_obs, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
ggplot(data = m_new_obs, aes(sample = .resid)) +
stat_qq()
skewness(resid(m_new_obs))
## [1] -0.09244757
############### new slug
ggplot(data = mlb11, aes(x = new_slug, y = runs)) + geom_point()
mlb11 %>% summarise(cor(new_slug, runs))
## # A tibble: 1 x 1
## `cor(new_slug, runs)`
## <dbl>
## 1 0.947
plot_ss(x = new_slug, y = runs, data = mlb11)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -375.8 2681.3
##
## Sum of Squares: 20345.54
m_new_slug <- lm(runs ~ new_slug, data = mlb11)
summary(m_new_slug)
##
## Call:
## lm(formula = runs ~ new_slug, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.41 -18.66 -0.91 16.29 52.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -375.80 68.71 -5.47 7.70e-06 ***
## new_slug 2681.33 171.83 15.61 2.42e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.96 on 28 degrees of freedom
## Multiple R-squared: 0.8969, Adjusted R-squared: 0.8932
## F-statistic: 243.5 on 1 and 28 DF, p-value: 2.42e-15
ggplot(data = m_new_slug, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
ggplot(data = m_new_slug, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
ggplot(data = m_new_slug, aes(sample = .resid)) +
stat_qq()
skewness(resid(m_new_slug))
## [1] -0.02966238
############### new onbase
ggplot(data = mlb11, aes(x = new_onbase, y = runs)) + geom_point()
mlb11 %>% summarise(cor(new_onbase, runs))
## # A tibble: 1 x 1
## `cor(new_onbase, runs)`
## <dbl>
## 1 0.921
plot_ss(x = new_onbase, y = runs, data = mlb11)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -1118 5654
##
## Sum of Squares: 29768.7
m_new_onbase <- lm(runs ~ new_onbase, data = mlb11)
summary(m_new_onbase)
##
## Call:
## lm(formula = runs ~ new_onbase, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.270 -18.335 3.249 19.520 69.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1118.4 144.5 -7.741 1.97e-08 ***
## new_onbase 5654.3 450.5 12.552 5.12e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.61 on 28 degrees of freedom
## Multiple R-squared: 0.8491, Adjusted R-squared: 0.8437
## F-statistic: 157.6 on 1 and 28 DF, p-value: 5.116e-13
ggplot(data = m_new_onbase, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
ggplot(data = m_new_onbase, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
ggplot(data = m_new_onbase, aes(sample = .resid)) +
stat_qq()
skewness(resid(m_new_onbase))
## [1] -0.03938303