library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.84 loaded
library(psych)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(forecast)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
###1. Load Data
train.df = read.table("moneyball-training-data.csv", sep=',', header = TRUE, stringsAsFactors = FALSE)
###2. Data Exploration
head(train.df)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
nrow(train.df)
## [1] 2276
ncol(train.df)
## [1] 17
train.df = train.df%>%dplyr::select(-INDEX)
cols.miss = sapply(train.df, function(x) any(is.na(x)))
cols.miss = names(cols.miss[cols.miss>0])
cols.nomiss = names(train.df)[!names(train.df)%in%(cols.miss)]
train.df = train.df%>%dplyr::select(cols.nomiss)
summary(train.df$TARGET_WINS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 71.00 82.00 80.79 92.00 146.00
hist(train.df$TARGET_WINS)
boxplot(train.df$TARGET_WINS)
res = cor(train.df)
corrplot(res, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
###3. Data Preparation
train.trans = train.df
train.trans$TEAM_BATTING_H = train.trans$TEAM_BATTING_H + train.trans$TEAM_BATTING_2B + train.trans$TEAM_BATTING_3B + train.trans$TEAM_BATTING_HR
train.trans$TEAM_FIELDING_E = train.trans$TEAM_FIELDING_E + train.trans$TEAM_PITCHING_H
train.trans = train.trans%>%dplyr::select(-TEAM_BATTING_2B, -TEAM_BATTING_3B, -TEAM_BATTING_HR, -TEAM_PITCHING_H)
summary(train.trans)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_BB TEAM_PITCHING_HR
## Min. : 0.00 Min. :1026 Min. : 0.0 Min. : 0.0
## 1st Qu.: 71.00 1st Qu.:1739 1st Qu.:451.0 1st Qu.: 50.0
## Median : 82.00 Median :1862 Median :512.0 Median :107.0
## Mean : 80.79 Mean :1865 Mean :501.6 Mean :105.7
## 3rd Qu.: 92.00 3rd Qu.:1978 3rd Qu.:580.0 3rd Qu.:150.0
## Max. :146.00 Max. :3092 Max. :878.0 Max. :343.0
## TEAM_PITCHING_BB TEAM_FIELDING_E
## Min. : 0.0 Min. : 1276
## 1st Qu.: 476.0 1st Qu.: 1566
## Median : 536.5 Median : 1679
## Mean : 553.0 Mean : 2026
## 3rd Qu.: 611.0 3rd Qu.: 1922
## Max. :3645.0 Max. :31860
hist(train.trans$TEAM_BATTING_H)
restrans = cor(train.trans)
pairs.panels(train.trans,
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = TRUE, # show density plots
ellipses = TRUE # show correlation ellipses
)
train.trans$TEAM_PITCHING_HR = apply(train.trans['TEAM_PITCHING_HR'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
train.trans$TEAM_PITCHING_BB = apply(train.trans['TEAM_PITCHING_BB'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
train.trans$TEAM_FIELDING_E = apply(train.trans['TEAM_FIELDING_E'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
pairs.panels(train.trans,
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = TRUE, # show density plots
ellipses = TRUE # show correlation ellipses
)
model1 = lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E, data=train.trans)
summary(model1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E, data = train.trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.452 -9.134 0.337 9.240 48.351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.101138 10.855178 5.629 2.04e-08 ***
## TEAM_BATTING_H 0.041235 0.001990 20.720 < 2e-16 ***
## TEAM_BATTING_BB 0.011761 0.004849 2.426 0.0154 *
## TEAM_PITCHING_HR -2.451877 0.488779 -5.016 5.67e-07 ***
## TEAM_PITCHING_BB 2.600056 1.641425 1.584 0.1133
## TEAM_FIELDING_E -9.122132 1.562982 -5.836 6.10e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.82 on 2270 degrees of freedom
## Multiple R-squared: 0.2316, Adjusted R-squared: 0.2299
## F-statistic: 136.9 on 5 and 2270 DF, p-value: < 2.2e-16
model2 = lm(TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB,2) + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB,2) + poly(TEAM_FIELDING_E,2), data=train.trans)
summary(model2)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB,
## 2) + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB, 2) + poly(TEAM_FIELDING_E,
## 2), data = train.trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.787 -8.938 0.192 9.165 49.336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.848e+00 3.764e+00 0.757 0.449
## TEAM_BATTING_H 4.945e-02 2.586e-03 19.122 < 2e-16 ***
## poly(TEAM_BATTING_BB, 2)1 -3.781e+02 6.711e+01 -5.634 1.98e-08 ***
## poly(TEAM_BATTING_BB, 2)2 2.057e+02 2.629e+01 7.826 7.65e-15 ***
## TEAM_PITCHING_HR -3.246e+00 5.333e-01 -6.088 1.34e-09 ***
## poly(TEAM_PITCHING_BB, 2)1 2.847e+02 4.062e+01 7.009 3.16e-12 ***
## poly(TEAM_PITCHING_BB, 2)2 1.759e+02 2.884e+01 6.100 1.24e-09 ***
## poly(TEAM_FIELDING_E, 2)1 -5.730e+02 6.423e+01 -8.920 < 2e-16 ***
## poly(TEAM_FIELDING_E, 2)2 -1.043e+02 1.606e+01 -6.496 1.01e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.58 on 2267 degrees of freedom
## Multiple R-squared: 0.259, Adjusted R-squared: 0.2563
## F-statistic: 99.02 on 8 and 2267 DF, p-value: < 2.2e-16
model3 = lm(TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB,2) + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB,2) + poly(TEAM_FIELDING_E,3), data=train.trans)
summary(model3)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB,
## 2) + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB, 2) + poly(TEAM_FIELDING_E,
## 3), data = train.trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.800 -8.957 0.291 9.101 49.593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.836e+00 3.756e+00 0.755 0.45028
## TEAM_BATTING_H 5.083e-02 2.614e-03 19.443 < 2e-16 ***
## poly(TEAM_BATTING_BB, 2)1 -3.519e+02 6.744e+01 -5.218 1.97e-07 ***
## poly(TEAM_BATTING_BB, 2)2 1.826e+02 2.716e+01 6.722 2.26e-11 ***
## TEAM_PITCHING_HR -3.827e+00 5.607e-01 -6.826 1.12e-11 ***
## poly(TEAM_PITCHING_BB, 2)1 2.781e+02 4.058e+01 6.855 9.19e-12 ***
## poly(TEAM_PITCHING_BB, 2)2 1.952e+02 2.937e+01 6.646 3.77e-11 ***
## poly(TEAM_FIELDING_E, 3)1 -5.658e+02 6.413e+01 -8.823 < 2e-16 ***
## poly(TEAM_FIELDING_E, 3)2 -1.051e+02 1.602e+01 -6.561 6.61e-11 ***
## poly(TEAM_FIELDING_E, 3)3 -5.528e+01 1.682e+01 -3.286 0.00103 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.55 on 2266 degrees of freedom
## Multiple R-squared: 0.2625, Adjusted R-squared: 0.2595
## F-statistic: 89.6 on 9 and 2266 DF, p-value: < 2.2e-16
plot(model1)
plot(model2)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
plot(model3)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#1] Load test data
df.test = read.table("moneyball-evaluation-data.csv", sep=',', header = TRUE, stringsAsFactors = FALSE)
#2] SELECT Non Null columns
df.test = df.test%>%dplyr::select(-INDEX)
cols.nomiss = cols.nomiss[!cols.nomiss %in%('TARGET_WINS')]
df.test = df.test%>%dplyr::select(cols.nomiss)
#3] Transform Data
test.trans = df.test
test.trans$TEAM_BATTING_H = test.trans$TEAM_BATTING_H + test.trans$TEAM_BATTING_2B + test.trans$TEAM_BATTING_3B + test.trans$TEAM_BATTING_HR
test.trans$TEAM_FIELDING_E = test.trans$TEAM_FIELDING_E + test.trans$TEAM_PITCHING_H
test.trans = test.trans%>%dplyr::select(-TEAM_BATTING_2B, -TEAM_BATTING_3B, -TEAM_BATTING_HR, -TEAM_PITCHING_H)
test.trans$TEAM_PITCHING_HR = apply(test.trans['TEAM_PITCHING_HR'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
test.trans$TEAM_PITCHING_BB = apply(test.trans['TEAM_PITCHING_BB'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
test.trans$TEAM_FIELDING_E = apply(test.trans['TEAM_FIELDING_E'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
Target_Wins = predict(model3, newdata = test.trans)
test.trans$TARGET_WINS = Target_Wins
df.test1 = read.table("moneyball-evaluation-data.csv", sep=',', header = TRUE, stringsAsFactors = FALSE)
df.test1$TARGET_WINS = Target_Wins
write.csv(df.test1, "Target_Prediction.csv", row.names = FALSE)