# Code for demo
setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments/Data")
mlb = read.table("mlb.txt", sep = "\t", header = T)
Review: we want to learn a (linear) relationship between some response Y and a predictor X.
Simple Lin. Reg.: \(Y = b_0 + b_1 X\) <- linear model learned from the data by OLS. \(Y = \beta_0 + \beta_1 X + \epsilon\) <- assumed probabilistic linear model; if this is what is generating the data, then we can do inference on the learned model.
What about a Y and multiple X’s?
Mult. Lin. Reg.: \(Y = b_0 + b_1 X_1 + b_2 X_2 + \dots + b_k X_k\) <- linear model learned from the data by OLS. \(Y = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \dots + \beta_k X_k + \epsilon\) <- assumed probabilistic linear model; if this is what is generating the data, then we can do inference on the learned model.
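As a quick illustration (a minimal sketch with simulated data, not part of the mlb demo), fitting lm() to data generated from the assumed probabilistic model recovers estimates close to the true \(\beta\)'s:
# Simulate Y = 2 + 3*X1 - 1.5*X2 + epsilon and fit by OLS
set.seed(1)                                   # hypothetical seed for reproducibility
n  = 100
x1 = rnorm(n); x2 = rnorm(n)
y  = 2 + 3*x1 - 1.5*x2 + rnorm(n, sd = 0.5)   # true coefficients: 2, 3, -1.5
coef(lm(y ~ x1 + x2))                         # estimated b's should be near 2, 3, -1.5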
Let’s consider the following question: how well can we predict AvgRuns scored by a team from information about the team’s offense?
Steps we’ll take:
1. Look for linear correlations in the data using cor() and gpairs().
2. Build a multiple linear regression model and examine its coefficients.
3. Check the assumptions of the probabilistic linear model.
4. Do prediction using the model.
# Motivation: let's look inside the data for features that could be useful in predicting/explaining average runs.
# Let's test using these: "OBP", "SLG", "SO", "H", "HR"
# To look for correlations in the data let's use the cor() function
cor(mlb[,c("OBP", "SLG", "SO", "H", "HR", "AvgRuns")])
## OBP SLG SO H HR AvgRuns
## OBP 1.0000000 0.74851213 -0.22677620 0.66350687 0.34017597 0.9069616
## SLG 0.7485121 1.00000000 0.05254551 0.60846506 0.76822395 0.8655062
## SO -0.2267762 0.05254551 1.00000000 -0.45381034 0.41799431 -0.1260806
## H 0.6635069 0.60846506 -0.45381034 1.00000000 0.06468699 0.6947970
## HR 0.3401760 0.76822395 0.41799431 0.06468699 1.00000000 0.5445436
## AvgRuns 0.9069616 0.86550616 -0.12608056 0.69479697 0.54454360 1.0000000
# Everything correlates positively with AvgRuns except SO, which correlates negatively. We can examine individual correlations more closely using cor.test():
cor.test(mlb$SO, mlb$AvgRuns) # Not significant
##
## Pearson's product-moment correlation
##
## data: mlb$SO and mlb$AvgRuns
## t = -0.67252, df = 28, p-value = 0.5068
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4652182 0.2453324
## sample estimates:
## cor
## -0.1260806
cor.test(mlb$HR, mlb$AvgRuns) # Significant
##
## Pearson's product-moment correlation
##
## data: mlb$HR and mlb$AvgRuns
## t = 3.4355, df = 28, p-value = 0.001864
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2292490 0.7564172
## sample estimates:
## cor
## 0.5445436
cor.test(mlb$OBP, mlb$AvgRuns) # Very significant
##
## Pearson's product-moment correlation
##
## data: mlb$OBP and mlb$AvgRuns
## t = 11.394, df = 28, p-value = 5.01e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8120203 0.9551389
## sample estimates:
## cor
## 0.9069616
# Can also examine the relationships more closely with gpairs
# install.packages("gpairs") <- make sure the package is installed
library(gpairs)
gpairs(mlb[,c("OBP", "SLG", "SO", "H", "HR", "AvgRuns")])
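# If gpairs is not available, base R's pairs() gives a similar scatterplot matrix
# (an alternative sketch, not in the original demo):
pairs(mlb[,c("OBP", "SLG", "SO", "H", "HR", "AvgRuns")])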
# Let's train a model using all of these predictors
model_full = lm(data = mlb, AvgRuns ~ OBP + SLG + SO + H + HR)
# We can examine the model's coefficients
model_full$coefficients
## (Intercept) OBP SLG SO H
## -5.859346e+00 2.354406e+01 -1.002836e+00 1.672289e-05 1.601717e-03
## HR
## 5.016273e-03
# Model is:
# AvgRuns = -5.859 + 23.544*OBP - 1.003*SLG + .0000167*SO + .0016017*H + .005016*HR
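# As a sanity check (illustrative only), the fitted value for the first team can be
# reproduced by hand from the coefficient vector:
x_row = c(1, mlb$OBP[1], mlb$SLG[1], mlb$SO[1], mlb$H[1], mlb$HR[1])
sum(coef(model_full) * x_row)   # should match fitted(model_full)[1]
fitted(model_full)[1]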
# Examine the model more closely
summary(model_full)
##
## Call:
## lm(formula = AvgRuns ~ OBP + SLG + SO + H + HR, data = mlb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21666 -0.06209 -0.01367 0.03097 0.34876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.859e+00 8.968e-01 -6.534 9.31e-07 ***
## OBP 2.354e+01 3.856e+00 6.106 2.63e-06 ***
## SLG -1.003e+00 4.528e+00 -0.221 0.8266
## SO 1.672e-05 2.475e-04 0.068 0.9467
## H 1.602e-03 7.215e-04 2.220 0.0361 *
## HR 5.016e-03 2.398e-03 2.092 0.0472 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1248 on 24 degrees of freedom
## Multiple R-squared: 0.9206, Adjusted R-squared: 0.9041
## F-statistic: 55.66 on 5 and 24 DF, p-value: 2.028e-12
# It seems SLG and SO are not statistically significant.
# 99% confidence intervals for the full model's coefficients
confint.lm(model_full, level = .99)
## 0.5 % 99.5 %
## (Intercept) -8.367546e+00 -3.3511455989
## OBP 1.275891e+01 34.3292119086
## SLG -1.366805e+01 11.6623728336
## SO -6.754848e-04 0.0007089306
## H -4.162833e-04 0.0036197177
## HR -1.690921e-03 0.0117234669
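# The OBP interval can be reproduced by hand (illustrative sketch) from the estimate and
# standard error reported by summary(model_full): estimate +/- t_{0.995, 24} * SE
23.54 + c(-1, 1) * qt(0.995, df = 24) * 3.856   # roughly (12.76, 34.33), matching confint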
# Compare the full model with reduced models that drop the non-significant predictors
m1 = lm(AvgRuns ~ OBP + SLG + SO + H + HR, data = mlb) # full model (same as model_full)
m2 = lm(AvgRuns ~ OBP + H + HR, data = mlb)            # drops SLG and SO
m3 = lm(AvgRuns ~ OBP + SO + H + HR, data = mlb)       # drops SLG only
summary(m1)
##
## Call:
## lm(formula = AvgRuns ~ OBP + SLG + SO + H + HR, data = mlb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21666 -0.06209 -0.01367 0.03097 0.34876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.859e+00 8.968e-01 -6.534 9.31e-07 ***
## OBP 2.354e+01 3.856e+00 6.106 2.63e-06 ***
## SLG -1.003e+00 4.528e+00 -0.221 0.8266
## SO 1.672e-05 2.475e-04 0.068 0.9467
## H 1.602e-03 7.215e-04 2.220 0.0361 *
## HR 5.016e-03 2.398e-03 2.092 0.0472 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1248 on 24 degrees of freedom
## Multiple R-squared: 0.9206, Adjusted R-squared: 0.9041
## F-statistic: 55.66 on 5 and 24 DF, p-value: 2.028e-12
summary(m2)
##
## Call:
## lm(formula = AvgRuns ~ OBP + H + HR, data = mlb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21069 -0.06334 -0.01946 0.02914 0.34983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.8225894 0.6585764 -8.841 2.58e-09 ***
## OBP 23.0137281 2.9434921 7.819 2.71e-08 ***
## H 0.0014754 0.0004371 3.376 0.00232 **
## HR 0.0045651 0.0008797 5.189 2.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.12 on 26 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9113
## F-statistic: 100.3 on 3 and 26 DF, p-value: 2.06e-14
summary(m3)
##
## Call:
## lm(formula = AvgRuns ~ OBP + SO + H + HR, data = mlb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21176 -0.06340 -0.01887 0.02788 0.34959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.850e+00 8.786e-01 -6.658 5.62e-07 ***
## OBP 2.303e+01 3.021e+00 7.624 5.58e-08 ***
## SO 1.176e-05 2.417e-04 0.049 0.961579
## H 1.484e-03 4.772e-04 3.109 0.004637 **
## HR 4.540e-03 1.038e-03 4.373 0.000189 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1224 on 25 degrees of freedom
## Multiple R-squared: 0.9204, Adjusted R-squared: 0.9077
## F-statistic: 72.32 on 4 and 25 DF, p-value: 2.265e-13
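# A partial F-test (a sketch, not in the original demo) formally checks whether dropping
# SLG and SO from the full model loses explanatory power; a large p-value supports m2.
anova(m2, m1)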
library(gvlma)
gvlma(m2) # Not all assumptions satisfied
##
## Call:
## lm(formula = AvgRuns ~ OBP + H + HR, data = mlb)
##
## Coefficients:
## (Intercept) OBP H HR
## -5.822589 23.013728 0.001475 0.004565
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = m2)
##
## Value p-value Decision
## Global Stat 9.769018 0.04450 Assumptions NOT satisfied!
## Skewness 4.981904 0.02561 Assumptions NOT satisfied!
## Kurtosis 2.905006 0.08830 Assumptions acceptable.
## Link Function 0.004468 0.94671 Assumptions acceptable.
## Heteroscedasticity 1.877639 0.17060 Assumptions acceptable.
# Diagnostic plots for m2: residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage
par(mfrow = c(2,2))
plot(m2)
par(mfrow = c(1,1))
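# Since gvlma flagged skewness, a Shapiro-Wilk test on the residuals (an extra check,
# not in the original demo) gives another look at the normality assumption:
shapiro.test(residuals(m2))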
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dwtest(m2) # p-value = .8586, so we have no evidence against independence of the errors
##
## Durbin-Watson test
##
## data: m2
## DW = 2.3876, p-value = 0.8586
## alternative hypothesis: true autocorrelation is greater than 0
# Finally, let's do prediction using the model m2
mlb_train = mlb[mlb$Tm != "PIT",] # All teams not PIT
test_df = mlb[mlb$Tm == "PIT",] # We will test on PIT
m2_no_PIT = lm(AvgRuns ~ OBP + H + HR, data = mlb_train) # first retrain the model without PIT
predict(m2_no_PIT, test_df, interval = "confidence")
## fit lwr upr
## 22 4.157294 4.021757 4.292831
predict(m2_no_PIT, test_df, interval = "prediction")
## fit lwr upr
## 22 4.157294 3.871375 4.443214
test_df$AvgRuns # actual value is 4.12, so the prediction of 4.16 is pretty good!
## [1] 4.12
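# A rough sketch (not in the original demo) of out-of-sample error: refit m2 leaving out
# each team in turn, predict its AvgRuns, and compute the leave-one-out RMSE.
loo_pred = sapply(1:nrow(mlb), function(i) {
  fit = lm(AvgRuns ~ OBP + H + HR, data = mlb[-i, ])
  predict(fit, mlb[i, ])
})
sqrt(mean((mlb$AvgRuns - loo_pred)^2))   # leave-one-out RMSE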