R Markdown

code to access each variable in dataset and test model fit

library(statsr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(moments)

#we'll be looking at data from all 30 Major League Baseball teams and examining the linear relationship between runs scored in a season and a number of other player statistics. Our aim will be to summarize these relationships both graphically and numerically in order to find which variable, if any, helps us best predict a team's runs scored in a season.

data(mlb11)
names(mlb11)
##  [1] "team"         "runs"         "at_bats"      "hits"        
##  [5] "homeruns"     "bat_avg"      "strikeouts"   "stolen_bases"
##  [9] "wins"         "new_onbase"   "new_slug"     "new_obs"
# relationsship between runs and at_bats
ggplot(data = mlb11, aes(x = at_bats, y = runs)) + geom_point()

# quantify linear relationship
mlb11 %>%
  summarise(cor(runs, at_bats))
## # A tibble: 1 x 1
##   `cor(runs, at_bats)`
##                  <dbl>
## 1                0.611
plot_ss(x = at_bats, y = runs, data = mlb11)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##  -2789.2429       0.6305  
## 
## Sum of Squares:  123721.9
plot_ss(x = at_bats, y = runs, data = mlb11, showSquares = TRUE)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##  -2789.2429       0.6305  
## 
## Sum of Squares:  123721.9
# use linear model
m1 <- lm(runs ~ at_bats, data = mlb11)

# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.

summary(m1)
## 
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -125.58  -47.05  -16.59   54.40  176.87 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2789.2429   853.6957  -3.267 0.002871 ** 
## at_bats         0.6305     0.1545   4.080 0.000339 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared:  0.3729, Adjusted R-squared:  0.3505 
## F-statistic: 16.65 on 1 and 28 DF,  p-value: 0.0003388
#Fit a new model that uses homeruns to predict runs
m2 <- lm(runs ~ homeruns, data = mlb11)
summary(m2)
## 
## Call:
## lm(formula = runs ~ homeruns, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -91.615 -33.410   3.231  24.292 104.631 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 415.2389    41.6779   9.963 1.04e-10 ***
## homeruns      1.8345     0.2677   6.854 1.90e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.29 on 28 degrees of freedom
## Multiple R-squared:  0.6266, Adjusted R-squared:  0.6132 
## F-statistic: 46.98 on 1 and 28 DF,  p-value: 1.9e-07
anova(m2)
## Analysis of Variance Table
## 
## Response: runs
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## homeruns   1 123609  123609  46.979 1.9e-07 ***
## Residuals 28  73672    2631                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data = mlb11, aes(x = at_bats, y = runs)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)

# with std error

ggplot(data = mlb11, aes(x = at_bats, y = runs)) +
  geom_point() +
  stat_smooth(method = "lm", se = TRUE)

# observed number of runs for the team with 5,579 at bats you can use the following
mlb11 %>%
  filter(at_bats == 5579) %>%
  dplyr::select(runs)
## # A tibble: 1 x 1
##    runs
##   <int>
## 1   713
#o assess whether the linear model is reliable, we need to check for (1) linearity, (2) nearly normal residuals, and (3) constant variability.
# linearity already checked by using scatterplo now check residual vs fitted values
ggplot(data = m1, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

#The residuals appear to be randomly distributed around 0

#Nearly normal residuals: To check this condition, we can look at a histogram

ggplot(data = m1, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here 
ggplot(data = m1, aes(sample = .resid)) +
  stat_qq()

skewness(resid(m1))
## [1] 0.5439197
############### check with wins now 
ggplot(data = mlb11, aes(x = wins, y = runs)) + geom_point()

# quantify linear relationship
mlb11 %>%
  summarise(cor(runs, wins))
## # A tibble: 1 x 1
##   `cor(runs, wins)`
##               <dbl>
## 1             0.601
plot_ss(x = wins, y = runs, data = mlb11)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##     342.121        4.341  
## 
## Sum of Squares:  126068.4
# use linear model
m_wins <- lm(runs ~ wins, data = mlb11)

# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.
summary(m_wins)
## 
## Call:
## lm(formula = runs ~ wins, data = mlb11)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.450  -47.506   -7.482   47.346  142.186 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  342.121     89.223   3.834 0.000654 ***
## wins           4.341      1.092   3.977 0.000447 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67.1 on 28 degrees of freedom
## Multiple R-squared:  0.361,  Adjusted R-squared:  0.3381 
## F-statistic: 15.82 on 1 and 28 DF,  p-value: 0.0004469
# check residucal plot
ggplot(data = m_wins, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

#Nearly normal residuals: To check this condition, we can look at a histogram

ggplot(data = m_wins, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here 
ggplot(data = m_wins, aes(sample = .resid)) +
  stat_qq()

skewness(resid(m_wins))
## [1] 0.1184672
# check with #############Hits########################
ggplot(data = mlb11, aes(x = hits, y = runs)) + geom_point()

# quantify linear relationship
mlb11 %>%
  summarise(cor(hits, runs))
## # A tibble: 1 x 1
##   `cor(hits, runs)`
##               <dbl>
## 1             0.801
plot_ss(x = wins, y = runs, data = mlb11)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##     342.121        4.341  
## 
## Sum of Squares:  126068.4
# use linear model
m_hits <- lm(runs ~ hits, data = mlb11)

# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.
summary(m_hits)
## 
## Call:
## lm(formula = runs ~ hits, data = mlb11)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -103.718  -27.179   -5.233   19.322  140.693 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -375.5600   151.1806  -2.484   0.0192 *  
## hits           0.7589     0.1071   7.085 1.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.23 on 28 degrees of freedom
## Multiple R-squared:  0.6419, Adjusted R-squared:  0.6292 
## F-statistic:  50.2 on 1 and 28 DF,  p-value: 1.043e-07
# check residucal plot
ggplot(data = m_hits, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

#Nearly normal residuals: To check this condition, we can look at a histogram

ggplot(data = m_hits, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here 
ggplot(data = m_hits, aes(sample = .resid)) +
  stat_qq()

skewness(resid(m_hits))
## [1] 0.584631
################# bat_avg###################
ggplot(data = mlb11, aes(x = bat_avg, y = runs)) + geom_point()

# quantify linear relationship
mlb11 %>%
  summarise(cor(bat_avg, runs))
## # A tibble: 1 x 1
##   `cor(bat_avg, runs)`
##                  <dbl>
## 1                0.810
plot_ss(x = bat_avg, y = runs, data = mlb11)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      -642.8       5242.2  
## 
## Sum of Squares:  67849.52
# use linear model
m_bat_avg <- lm(runs ~ bat_avg, data = mlb11)

# The output of lm is an object that contains all of the information we need about the linear model that was just fit. We can access this information using the summary function.
summary(m_bat_avg)
## 
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -94.676 -26.303  -5.496  28.482 131.113 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -642.8      183.1  -3.511  0.00153 ** 
## bat_avg       5242.2      717.3   7.308 5.88e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared:  0.6561, Adjusted R-squared:  0.6438 
## F-statistic: 53.41 on 1 and 28 DF,  p-value: 5.877e-08
# check residucal plot
ggplot(data = m_bat_avg, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

#Nearly normal residuals: To check this condition, we can look at a histogram

ggplot(data = m_bat_avg, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

# or a normal probability plot of the residuals. QQ - quantile-quantile plot, using sample onstead of x here 
ggplot(data = m_bat_avg, aes(sample = .resid)) +
  stat_qq()

skewness(resid(m_bat_avg))
## [1] 0.5703952
######## summary
summary(m1)
## 
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -125.58  -47.05  -16.59   54.40  176.87 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2789.2429   853.6957  -3.267 0.002871 ** 
## at_bats         0.6305     0.1545   4.080 0.000339 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared:  0.3729, Adjusted R-squared:  0.3505 
## F-statistic: 16.65 on 1 and 28 DF,  p-value: 0.0003388
summary(m_wins)
## 
## Call:
## lm(formula = runs ~ wins, data = mlb11)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.450  -47.506   -7.482   47.346  142.186 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  342.121     89.223   3.834 0.000654 ***
## wins           4.341      1.092   3.977 0.000447 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67.1 on 28 degrees of freedom
## Multiple R-squared:  0.361,  Adjusted R-squared:  0.3381 
## F-statistic: 15.82 on 1 and 28 DF,  p-value: 0.0004469
summary(m_bat_avg)
## 
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -94.676 -26.303  -5.496  28.482 131.113 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -642.8      183.1  -3.511  0.00153 ** 
## bat_avg       5242.2      717.3   7.308 5.88e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared:  0.6561, Adjusted R-squared:  0.6438 
## F-statistic: 53.41 on 1 and 28 DF,  p-value: 5.877e-08
summary(m_hits)
## 
## Call:
## lm(formula = runs ~ hits, data = mlb11)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -103.718  -27.179   -5.233   19.322  140.693 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -375.5600   151.1806  -2.484   0.0192 *  
## hits           0.7589     0.1071   7.085 1.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.23 on 28 degrees of freedom
## Multiple R-squared:  0.6419, Adjusted R-squared:  0.6292 
## F-statistic:  50.2 on 1 and 28 DF,  p-value: 1.043e-07
# look to me wins is better predictor of runs

############# check new obs
ggplot(data = mlb11, aes(x = new_obs, y = runs)) + geom_point()

mlb11 %>%  summarise(cor(new_obs, runs))
## # A tibble: 1 x 1
##   `cor(new_obs, runs)`
##                  <dbl>
## 1                0.967
plot_ss(x = new_obs, y = runs, data = mlb11)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      -686.6       1919.4  
## 
## Sum of Squares:  12837.66
m_new_obs <- lm(runs ~ new_obs, data = mlb11)
summary(m_new_obs)
## 
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.456 -13.690   1.165  13.935  41.156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -686.61      68.93  -9.962 1.05e-10 ***
## new_obs      1919.36      95.70  20.057  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9326 
## F-statistic: 402.3 on 1 and 28 DF,  p-value: < 2.2e-16
ggplot(data = m_new_obs, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m_new_obs, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

ggplot(data = m_new_obs, aes(sample = .resid)) +
  stat_qq()

skewness(resid(m_new_obs))
## [1] -0.09244757
############### new slug
ggplot(data = mlb11, aes(x = new_slug, y = runs)) + geom_point()

mlb11 %>%  summarise(cor(new_slug, runs))
## # A tibble: 1 x 1
##   `cor(new_slug, runs)`
##                   <dbl>
## 1                 0.947
plot_ss(x = new_slug, y = runs, data = mlb11)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      -375.8       2681.3  
## 
## Sum of Squares:  20345.54
m_new_slug <- lm(runs ~ new_slug, data = mlb11)
summary(m_new_slug)
## 
## Call:
## lm(formula = runs ~ new_slug, data = mlb11)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45.41 -18.66  -0.91  16.29  52.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -375.80      68.71   -5.47 7.70e-06 ***
## new_slug     2681.33     171.83   15.61 2.42e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.96 on 28 degrees of freedom
## Multiple R-squared:  0.8969, Adjusted R-squared:  0.8932 
## F-statistic: 243.5 on 1 and 28 DF,  p-value: 2.42e-15
ggplot(data = m_new_slug, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m_new_slug, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

ggplot(data = m_new_slug, aes(sample = .resid)) +
  stat_qq()

skewness(resid(m_new_slug))
## [1] -0.02966238
############### new onbase
ggplot(data = mlb11, aes(x = new_onbase, y = runs)) + geom_point()

mlb11 %>%  summarise(cor(new_onbase, runs))
## # A tibble: 1 x 1
##   `cor(new_onbase, runs)`
##                     <dbl>
## 1                   0.921
plot_ss(x = new_onbase, y = runs, data = mlb11)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##       -1118         5654  
## 
## Sum of Squares:  29768.7
m_new_onbase <- lm(runs ~ new_onbase, data = mlb11)
summary(m_new_onbase)
## 
## Call:
## lm(formula = runs ~ new_onbase, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.270 -18.335   3.249  19.520  69.002 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1118.4      144.5  -7.741 1.97e-08 ***
## new_onbase    5654.3      450.5  12.552 5.12e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.61 on 28 degrees of freedom
## Multiple R-squared:  0.8491, Adjusted R-squared:  0.8437 
## F-statistic: 157.6 on 1 and 28 DF,  p-value: 5.116e-13
ggplot(data = m_new_onbase, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m_new_onbase, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

ggplot(data = m_new_onbase, aes(sample = .resid)) +
  stat_qq()

skewness(resid(m_new_onbase))
## [1] -0.03938303