library(reshape2)
library(ggplot2)
library(pastecs)
library(psych)
library(mice)
library(VIM)
library(corrplot)

Pull the data

# Read the training data from the working directory; header = TRUE takes the
# first row of the file as the column names. Then preview the first rows.
moneyballdata <- read.csv("moneyball-training-data.csv", header = TRUE)
head(moneyballdata)

Summarize

dim(moneyballdata)
[1] 2276   17

Whittle down the data set to wins, hits, doubles, triples

# Pull the four columns of interest out of the full data set as standalone
# vectors; every later step refers to them by these short names.
wins    <- moneyballdata$TARGET_WINS
hits    <- moneyballdata$TEAM_BATTING_H
doubles <- moneyballdata$TEAM_BATTING_2B
triples <- moneyballdata$TEAM_BATTING_3B

# Bind the vectors column-wise; cbind() on plain vectors yields a matrix whose
# column names are the variable names.
mike.data.set <- cbind(wins, hits, doubles, triples)
head(mike.data.set)
     wins hits doubles triples
[1,]   39 1445     194      39
[2,]   70 1339     219      22
[3,]   86 1377     232      35
[4,]   70 1387     209      38
[5,]   82 1297     186      27
[6,]   75 1279     200      36

Get details on wins variable

typeof(wins)
[1] "integer"
summary(wins)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      71      82      81      92     146 
boxplot(wins, col = "orange")

hist(wins, col = "orange")

Get details on hits variable

typeof(hits)
[1] "integer"
summary(hits)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    891    1383    1454    1469    1537    2554 
boxplot(hits, col = "blue")

hist(hits, col = "blue")

Get details on doubles variable

typeof(doubles)
[1] "integer"
summary(doubles)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     69     208     238     241     273     458 
boxplot(doubles, col = "green")

hist(doubles, col = "green")

Get details on triples variable

typeof(triples)
[1] "integer"
summary(triples)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      34      47      55      72     223 
boxplot(triples, col = "turquoise")

hist(triples, col = "turquoise")

Obtain a more detailed summary of variables

stat.desc(mike.data.set)

Find correlations between pairs

pairs.panels(mike.data.set)

Check for missing values

md.pattern(mike.data.set)
     wins hits doubles triples  
[1,]    1    1       1       1 0
[2,]    0    0       0       0 0
# Plot the amount of missing data per variable next to the missingness
# pattern. mike.data.set is a matrix (its rows print as [1,], [2,], ...), so
# the variable names are in colnames(); names() on a matrix is NULL and would
# hand aggr() no labels at all. colnames() also works for a data frame.
aggr_plot <- aggr(mike.data.set,
                  col = c("navyblue", "red"),
                  numbers = TRUE,
                  sortVars = TRUE,
                  labels = colnames(mike.data.set),
                  cex.axis = .7,
                  gap = 3,
                  ylab = c("Histogram of missing data", "Pattern"))

 Variables sorted by number of missings: 
 Variable Count
     wins     0
     hits     0
  doubles     0
  triples     0

Produce a scatterplot

plot(hits, wins, main="Scatterplot wins vs. hits")

Check the correlation between wins and hits

cor(wins, hits)
[1] 0.39

Create an example simple linear regression model. Wins is regressed on hits because hits had the highest correlation with wins.

mod <- lm(wins ~ hits)
summary(mod)

Call:
lm(formula = wins ~ hits)

Residuals:
   Min     1Q Median     3Q    Max 
-71.77  -8.76   0.86   9.76  46.02 

Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept)  18.5623     3.1075    5.97         0.0000000027 ***
hits          0.0423     0.0021   20.12 < 0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 14 on 2274 degrees of freedom
Multiple R-squared:  0.151, Adjusted R-squared:  0.151 
F-statistic:  405 on 1 and 2274 DF,  p-value: <0.0000000000000002
attributes(mod)
$names
 [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values" "assign"       
 [7] "qr"            "df.residual"   "xlevels"       "call"          "terms"         "model"        

$class
[1] "lm"

Check the model’s residuals or errors

# Show the lm diagnostic plots for `mod` together in a 2 x 2 grid.
# par() returns the previous settings when it changes them; restore them
# afterwards so later plots are not drawn into a single cell of this grid.
old_par <- par(mfrow = c(2, 2))
plot(mod)
par(old_par)

Fit the line

# Scatter wins against hits and overlay the fitted line from `mod`.
# No `data` argument is given: wins and hits are standalone vectors in the
# workspace. The previous data=faithful pointed at an unrelated built-in data
# set with no such columns, and only worked because R fell back to the
# workspace variables.
plot(wins ~ hits)
abline(mod, col = 2, lwd = 3)  # col = 2 is red in the default palette

Multiple regression on wins vs doubles + triples

mod2 <- lm(wins ~ doubles + triples)
summary(mod2)

Call:
lm(formula = wins ~ doubles + triples)

Residuals:
   Min     1Q Median     3Q    Max 
-69.90  -9.38   0.60  10.07  51.32 

Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept) 50.31417    1.81465   27.73 <0.0000000000000002 ***
doubles      0.10365    0.00668   15.51 <0.0000000000000002 ***
triples      0.09904    0.01120    8.85 <0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 15 on 2273 degrees of freedom
Multiple R-squared:  0.114, Adjusted R-squared:  0.113 
F-statistic:  146 on 2 and 2273 DF,  p-value: <0.0000000000000002
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCgpgYGB7cn0KbGlicmFyeShyZXNoYXBlMikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHBhc3RlY3MpCmxpYnJhcnkocHN5Y2gpCmxpYnJhcnkobWljZSkKbGlicmFyeShWSU0pCmxpYnJhcnkoY29ycnBsb3QpCmBgYAoKCgojUHVsbCB0aGUgZGF0YQpgYGB7cn0KbW9uZXliYWxsZGF0YSA8LSByZWFkLmNzdigibW9uZXliYWxsLXRyYWluaW5nLWRhdGEuY3N2IiwgaGVhZGVyID0gVFJVRSkKaGVhZChtb25leWJhbGxkYXRhKQpgYGAKCiMjU3VtbWFyaXplCmBgYHtyfQpkaW0obW9uZXliYWxsZGF0YSkKYGBgCgojI1doaXR0bGUgZG93biB0aGUgZGF0YSBzZXQgdG8gd2lucywgaGl0cywgZG91YmxlcywgdHJpcGxlcyAKYGBge3J9CndpbnMgICAgPC0gbW9uZXliYWxsZGF0YSRUQVJHRVRfV0lOUwpoaXRzICAgIDwtIG1vbmV5YmFsbGRhdGEkVEVBTV9CQVRUSU5HX0gKZG91YmxlcyA8LSBtb25leWJhbGxkYXRhJFRFQU1fQkFUVElOR18yQgp0cmlwbGVzIDwtIG1vbmV5YmFsbGRhdGEkVEVBTV9CQVRUSU5HXzNCCgptaWtlLmRhdGEuc2V0IDwtIGNiaW5kKHdpbnMsIGhpdHMsIGRvdWJsZXMsIHRyaXBsZXMpCmhlYWQobWlrZS5kYXRhLnNldCkKYGBgCgoKIyNHZXQgZGV0YWlscyBvbiB3aW5zIHZhcmlhYmxlCmBgYHtyfQp0eXBlb2Yod2lucykKc3VtbWFyeSh3aW5zKQpib3hwbG90KHdpbnMsIGNvbCA9ICJvcmFuZ2UiKQpoaXN0KHdpbnMsIGNvbCA9ICJvcmFuZ2UiKQpgYGAKCgojI0dldCBkZXRhaWxzIG9uIGhpdHMgdmFyaWFibGUKYGBge3J9CnR5cGVvZihoaXRzKQpzdW1tYXJ5KGhpdHMpCmJveHBsb3QoaGl0cywgY29sID0gImJsdWUiKQpoaXN0KGhpdHMsIGNvbCA9ICJibHVlIikKYGBgCgoKIyNHZXQgZGV0YWlscyBvbiBkb3VibGVzIHZhcmlhYmxlCmBgYHtyfQp0eXBlb2YoZG91YmxlcykKc3VtbWFyeShkb3VibGVzKQpib3hwbG90KGRvdWJsZXMsIGNvbCA9ICJncmVlbiIpCmhpc3QoZG91YmxlcywgY29sID0gImdyZWVuIikKYGBgCgoKIyNHZXQgZGV0YWlscyBvbiB0cmlwbGVzIHZhcmlhYmxlCmBgYHtyfQp0eXBlb2YodHJpcGxlcykKc3VtbWFyeSh0cmlwbGVzKQpib3hwbG90KHRyaXBsZXMsIGNvbCA9ICJ0dXJxdW9pc2UiKQpoaXN0KHRyaXBsZXMsIGNvbCA9ICJ0dXJxdW9pc2UiKQpgYGAKIyNPYnRhaW4gYSBtb3JlIGRldGFpbGVkIHN1bW1hcnkgb2YgdmFyaWFibGVzCmBgYHtyfQpzdGF0LmRlc2MobWlrZS5kYXRhLnNldCkKYGBgCiMjRmluZCBjb3JyZWxhdGlvbnMgYmV0d2VlbiBwYWlycwpgYGB7cn0KcGFpcnMucGFuZWxzKG1pa2UuZGF0YS5zZXQpCmBgYApgYGB7cn0KY29ycnBsb3QoY29yKG1pa2UuZGF0YS5zZXQpLCBtZXRob2QgPSAibnVtYmVyIikKYGBgCiMjQ2hlY2sgZm9yIG1pc3NpbmcgdmFsdWVzCmBgYHtyfQptZC5wYXR0ZXJuKG1pa2UuZGF0YS5zZXQpCgphZ2dyX3Bsb3QgPC0gYWdncihtaWtlLmRhdGEuc2V0LCAKICAg
ICAgICAgICAgICAgIGNvbD1jKCduYXZ5Ymx1ZScsJ3JlZCcpLCAKICAgICAgICAgICAgICAgIG51bWJlcnM9VFJVRSwgCiAgICAgICAgICAgICAgICBzb3J0VmFycz1UUlVFLCAKICAgICAgICAgICAgICAgIGxhYmVscz1uYW1lcyhtaWtlLmRhdGEuc2V0KSwgCiAgICAgICAgICAgICAgICBjZXguYXhpcz0uNywgCiAgICAgICAgICAgICAgICBnYXA9MywgCiAgICAgICAgICAgICAgICB5bGFiPWMoIkhpc3RvZ3JhbSBvZiBtaXNzaW5nIGRhdGEiLCJQYXR0ZXJuIikpCgpgYGAKIyNQcm9kdWNlIGEgc2NhdHRlcnBsb3QKYGBge3J9CnBsb3QoaGl0cywgd2lucywgbWFpbj0iU2NhdHRlcnBsb3Qgd2lucyB2cy4gaGl0cyIpCmBgYAojIyNDaGVjayB0aGUgY29ycmVsYXRpb24gYmV0d2VlbiB3aW5zIGFuZCBoaXRzCmBgYHtyfQpjb3Iod2lucywgaGl0cykKYGBgCgojI0NyZWF0ZSBhbiBleGFtcGxlIHNpbXBsZSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbC4gSSB1c2VkIHdpbnMgYmFzZWQgb24gaGl0cyBiZWNhdXNlIGl0IGhhZCB0aGUgaGlnaGVzdCBjb3JyZWxhdGlvbi4KYGBge3J9Cm1vZCA8LSBsbSh3aW5zIH4gaGl0cykKc3VtbWFyeShtb2QpCmF0dHJpYnV0ZXMobW9kKQpgYGAKIyNDaGVjayB0aGUgbW9kZWwncyByZXNpZHVhbHMgb3IgZXJyb3JzCmBgYHtyfQpwYXIobWZyb3c9YygyLDIpKQpwbG90KG1vZCkKYGBgCgojI0ZpdCB0aGUgbGluZSAKYGBge3J9CnBsb3Qod2luc35oaXRzLCBkYXRhPWZhaXRoZnVsKQphYmxpbmUobW9kLCBjb2w9MiwgbHdkPTMpCmBgYAojI011bHRwbGUgcmVncmVzc2lvbiBvbiB3aW5zIHZzIGRvdWJsZXMgKyB0cmlwbGVzCmBgYHtyfQptb2QyIDwtIGxtKHdpbnMgfiBkb3VibGVzICsgdHJpcGxlcykKc3VtbWFyeShtb2QyKQpgYGAKCg==