Pull the data
# Import the training CSV; its first row supplies the column names.
moneyballdata <- read.csv(file = "moneyball-training-data.csv", header = TRUE)
# Show the first rows of the imported data.
head(x = moneyballdata)
Summarize
# Report the number of rows and columns in the imported data.
dim(moneyballdata)
[1] 2276 17
Whittle down the data set to wins, hits, doubles, triples
# Pull the four columns of interest out as free-standing vectors; the later
# sections refer to them by these short names.
wins <- moneyballdata$TARGET_WINS
hits <- moneyballdata$TEAM_BATTING_H
doubles <- moneyballdata$TEAM_BATTING_2B
triples <- moneyballdata$TEAM_BATTING_3B

# Bind the vectors column-wise into one matrix and preview its first rows.
mike.data.set <- cbind(wins, hits, doubles, triples)
head(mike.data.set)
wins hits doubles triples
[1,] 39 1445 194 39
[2,] 70 1339 219 22
[3,] 86 1377 232 35
[4,] 70 1387 209 38
[5,] 82 1297 186 27
[6,] 75 1279 200 36
Get details on wins variable
# Report the internal storage type of the wins vector.
typeof(wins)
[1] "integer"
# Print the minimum, quartiles, median, mean and maximum of wins.
summary(wins)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 71 82 81 92 146
# Box plot of wins.
boxplot(x = wins, col = "orange")

# Histogram of wins, drawn in the same colour.
hist(x = wins, col = "orange")

Get details on hits variable
# Report the internal storage type of the hits vector.
typeof(hits)
[1] "integer"
# Print the minimum, quartiles, median, mean and maximum of hits.
summary(hits)
Min. 1st Qu. Median Mean 3rd Qu. Max.
891 1383 1454 1469 1537 2554
# Box plot of hits.
boxplot(x = hits, col = "blue")

# Histogram of hits, drawn in the same colour.
hist(x = hits, col = "blue")

Get details on doubles variable
# Report the internal storage type of the doubles vector.
typeof(doubles)
[1] "integer"
# Print the minimum, quartiles, median, mean and maximum of doubles.
summary(doubles)
Min. 1st Qu. Median Mean 3rd Qu. Max.
69 208 238 241 273 458
# Box plot of doubles.
boxplot(x = doubles, col = "green")

# Histogram of doubles, drawn in the same colour.
hist(x = doubles, col = "green")

Get details on triples variable
# Report the internal storage type of the triples vector.
typeof(triples)
[1] "integer"
# Print the minimum, quartiles, median, mean and maximum of triples.
summary(triples)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 34 47 55 72 223
# Box plot of triples.
boxplot(x = triples, col = "turquoise")

# Histogram of triples, drawn in the same colour.
hist(x = triples, col = "turquoise")

Obtain a more detailed summary of variables
# Descriptive statistics for every column of the combined data set.
pastecs::stat.desc(x = mike.data.set)
Find correlations between pairs
# Scatterplot matrix with pairwise correlations for the four variables.
psych::pairs.panels(x = mike.data.set)


Check for missing values
# Tabulate the missing-data patterns across the four variables.
mice::md.pattern(x = mike.data.set)
wins hits doubles triples
[1,] 1 1 1 1 0
[2,] 0 0 0 0 0
# Plot and tabulate the amount of missing data per variable.
# mike.data.set is printed as a matrix, so its variable names are held in
# colnames(); names() returns NULL for a matrix and would supply no labels.
# The y-axis labels are passed under their full argument name, `ylabs`,
# instead of relying on partial matching of `ylab`.
aggr_plot <- aggr(mike.data.set,
                  col = c("navyblue", "red"),
                  numbers = TRUE,
                  sortVars = TRUE,
                  labels = colnames(mike.data.set),
                  cex.axis = 0.7,
                  gap = 3,
                  ylabs = c("Histogram of missing data", "Pattern"))
Variables sorted by number of missings:
Variable Count
wins 0
hits 0
doubles 0
triples 0

Produce a scatterplot
# Scatter wins (vertical axis) against hits (horizontal axis).
plot(x = hits, y = wins, main = "Scatterplot wins vs. hits")

Check the correlation between wins and hits
# Pearson correlation coefficient between wins and hits.
cor(x = wins, y = hits, method = "pearson")
[1] 0.39
Create an example simple linear regression model. I regressed wins on hits because hits had the highest correlation with wins.
# Fit a simple linear regression with wins as response and hits as the
# single predictor, then print the coefficient table and fit statistics.
mod <- lm(formula = wins ~ hits)
summary(object = mod)
Call:
lm(formula = wins ~ hits)
Residuals:
Min 1Q Median 3Q Max
-71.77 -8.76 0.86 9.76 46.02
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.5623 3.1075 5.97 0.0000000027 ***
hits 0.0423 0.0021 20.12 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 14 on 2274 degrees of freedom
Multiple R-squared: 0.151, Adjusted R-squared: 0.151
F-statistic: 405 on 1 and 2274 DF, p-value: <0.0000000000000002
# List the attributes of the fitted model object (its component names and class).
attributes(mod)
$names
[1] "coefficients" "residuals" "effects" "rank" "fitted.values" "assign"
[7] "qr" "df.residual" "xlevels" "call" "terms" "model"
$class
[1] "lm"
Check the model’s residuals or errors
# Draw the lm diagnostic plots on a 2 x 2 grid. par() returns the previous
# settings when it changes them; restore those afterwards so that later
# plots are not squeezed into a single panel of the grid.
old_par <- par(mfrow = c(2, 2))
plot(mod)
par(old_par)

Fit the line
# Scatter wins against hits and overlay the fitted regression line.
# wins and hits are free-standing vectors, so no `data` argument is needed;
# the built-in `faithful` data set has no columns with those names.
plot(wins ~ hits)
# Draw the fitted line from `mod` in colour 2 with a thicker stroke.
abline(mod, col = 2, lwd = 3)

Multiple regression of wins on doubles + triples
# Fit a multiple linear regression with wins as response and doubles and
# triples as predictors, then print the coefficient table and fit statistics.
mod2 <- lm(formula = wins ~ doubles + triples)
summary(object = mod2)
Call:
lm(formula = wins ~ doubles + triples)
Residuals:
Min 1Q Median 3Q Max
-69.90 -9.38 0.60 10.07 51.32
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 50.31417 1.81465 27.73 <0.0000000000000002 ***
doubles 0.10365 0.00668 15.51 <0.0000000000000002 ***
triples 0.09904 0.01120 8.85 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 15 on 2273 degrees of freedom
Multiple R-squared: 0.114, Adjusted R-squared: 0.113
F-statistic: 146 on 2 and 2273 DF, p-value: <0.0000000000000002
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCgpgYGB7cn0KbGlicmFyeShyZXNoYXBlMikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHBhc3RlY3MpCmxpYnJhcnkocHN5Y2gpCmxpYnJhcnkobWljZSkKbGlicmFyeShWSU0pCmxpYnJhcnkoY29ycnBsb3QpCmBgYAoKCgojUHVsbCB0aGUgZGF0YQpgYGB7cn0KbW9uZXliYWxsZGF0YSA8LSByZWFkLmNzdigibW9uZXliYWxsLXRyYWluaW5nLWRhdGEuY3N2IiwgaGVhZGVyID0gVFJVRSkKaGVhZChtb25leWJhbGxkYXRhKQpgYGAKCiMjU3VtbWFyaXplCmBgYHtyfQpkaW0obW9uZXliYWxsZGF0YSkKYGBgCgojI1doaXR0bGUgZG93biB0aGUgZGF0YSBzZXQgdG8gd2lucywgaGl0cywgZG91YmxlcywgdHJpcGxlcyAKYGBge3J9CndpbnMgICAgPC0gbW9uZXliYWxsZGF0YSRUQVJHRVRfV0lOUwpoaXRzICAgIDwtIG1vbmV5YmFsbGRhdGEkVEVBTV9CQVRUSU5HX0gKZG91YmxlcyA8LSBtb25leWJhbGxkYXRhJFRFQU1fQkFUVElOR18yQgp0cmlwbGVzIDwtIG1vbmV5YmFsbGRhdGEkVEVBTV9CQVRUSU5HXzNCCgptaWtlLmRhdGEuc2V0IDwtIGNiaW5kKHdpbnMsIGhpdHMsIGRvdWJsZXMsIHRyaXBsZXMpCmhlYWQobWlrZS5kYXRhLnNldCkKYGBgCgoKIyNHZXQgZGV0YWlscyBvbiB3aW5zIHZhcmlhYmxlCmBgYHtyfQp0eXBlb2Yod2lucykKc3VtbWFyeSh3aW5zKQpib3hwbG90KHdpbnMsIGNvbCA9ICJvcmFuZ2UiKQpoaXN0KHdpbnMsIGNvbCA9ICJvcmFuZ2UiKQpgYGAKCgojI0dldCBkZXRhaWxzIG9uIGhpdHMgdmFyaWFibGUKYGBge3J9CnR5cGVvZihoaXRzKQpzdW1tYXJ5KGhpdHMpCmJveHBsb3QoaGl0cywgY29sID0gImJsdWUiKQpoaXN0KGhpdHMsIGNvbCA9ICJibHVlIikKYGBgCgoKIyNHZXQgZGV0YWlscyBvbiBkb3VibGVzIHZhcmlhYmxlCmBgYHtyfQp0eXBlb2YoZG91YmxlcykKc3VtbWFyeShkb3VibGVzKQpib3hwbG90KGRvdWJsZXMsIGNvbCA9ICJncmVlbiIpCmhpc3QoZG91YmxlcywgY29sID0gImdyZWVuIikKYGBgCgoKIyNHZXQgZGV0YWlscyBvbiB0cmlwbGVzIHZhcmlhYmxlCmBgYHtyfQp0eXBlb2YodHJpcGxlcykKc3VtbWFyeSh0cmlwbGVzKQpib3hwbG90KHRyaXBsZXMsIGNvbCA9ICJ0dXJxdW9pc2UiKQpoaXN0KHRyaXBsZXMsIGNvbCA9ICJ0dXJxdW9pc2UiKQpgYGAKIyNPYnRhaW4gYSBtb3JlIGRldGFpbGVkIHN1bW1hcnkgb2YgdmFyaWFibGVzCmBgYHtyfQpzdGF0LmRlc2MobWlrZS5kYXRhLnNldCkKYGBgCiMjRmluZCBjb3JyZWxhdGlvbnMgYmV0d2VlbiBwYWlycwpgYGB7cn0KcGFpcnMucGFuZWxzKG1pa2UuZGF0YS5zZXQpCmBgYApgYGB7cn0KY29ycnBsb3QoY29yKG1pa2UuZGF0YS5zZXQpLCBtZXRob2QgPSAibnVtYmVyIikKYGBgCiMjQ2hlY2sgZm9yIG1pc3NpbmcgdmFsdWVzCmBgYHtyfQptZC5wYXR0ZXJuKG1pa2UuZGF0YS5zZXQpCgphZ2dyX3Bsb3QgPC0gYWdncihtaWtlLmRhdGEuc2V0LCAKICAg
ICAgICAgICAgICAgIGNvbD1jKCduYXZ5Ymx1ZScsJ3JlZCcpLCAKICAgICAgICAgICAgICAgIG51bWJlcnM9VFJVRSwgCiAgICAgICAgICAgICAgICBzb3J0VmFycz1UUlVFLCAKICAgICAgICAgICAgICAgIGxhYmVscz1uYW1lcyhtaWtlLmRhdGEuc2V0KSwgCiAgICAgICAgICAgICAgICBjZXguYXhpcz0uNywgCiAgICAgICAgICAgICAgICBnYXA9MywgCiAgICAgICAgICAgICAgICB5bGFiPWMoIkhpc3RvZ3JhbSBvZiBtaXNzaW5nIGRhdGEiLCJQYXR0ZXJuIikpCgpgYGAKIyNQcm9kdWNlIGEgc2NhdHRlcnBsb3QKYGBge3J9CnBsb3QoaGl0cywgd2lucywgbWFpbj0iU2NhdHRlcnBsb3Qgd2lucyB2cy4gaGl0cyIpCmBgYAojIyNDaGVjayB0aGUgY29ycmVsYXRpb24gYmV0d2VlbiB3aW5zIGFuZCBoaXRzCmBgYHtyfQpjb3Iod2lucywgaGl0cykKYGBgCgojI0NyZWF0ZSBhbiBleGFtcGxlIHNpbXBsZSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbC4gSSB1c2VkIHdpbnMgYmFzZWQgb24gaGl0cyBiZWNhdXNlIGl0IGhhZCB0aGUgaGlnaGVzdCBjb3JyZWxhdGlvbi4KYGBge3J9Cm1vZCA8LSBsbSh3aW5zIH4gaGl0cykKc3VtbWFyeShtb2QpCmF0dHJpYnV0ZXMobW9kKQpgYGAKIyNDaGVjayB0aGUgbW9kZWwncyByZXNpZHVhbHMgb3IgZXJyb3JzCmBgYHtyfQpwYXIobWZyb3c9YygyLDIpKQpwbG90KG1vZCkKYGBgCgojI0ZpdCB0aGUgbGluZSAKYGBge3J9CnBsb3Qod2luc35oaXRzLCBkYXRhPWZhaXRoZnVsKQphYmxpbmUobW9kLCBjb2w9MiwgbHdkPTMpCmBgYAojI011bHRwbGUgcmVncmVzc2lvbiBvbiB3aW5zIHZzIGRvdWJsZXMgKyB0cmlwbGVzCmBgYHtyfQptb2QyIDwtIGxtKHdpbnMgfiBkb3VibGVzICsgdHJpcGxlcykKc3VtbWFyeShtb2QyKQpgYGAKCg==