Data-621 Homework-1

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrplot)

## corrplot 0.84 loaded

library(psych)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(forecast)

## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff

###1. Load Data

train.df = read.table("moneyball-training-data.csv", sep=',', header = TRUE, stringsAsFactors = FALSE)

###2. Data Exploration

head(train.df)

##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1     1          39           1445             194              39
## 2     2          70           1339             219              22
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842              NA
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              NA               NA            9364               84
## 2              28               NA            1347              191
## 3              27               NA            1377              137
## 4              30               NA            1396               97
## 5              39               NA            1297              102
## 6              59               NA            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011               NA
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149

nrow(train.df)

## [1] 2276

ncol(train.df)

## [1] 17

train.df = train.df%>%dplyr::select(-INDEX)
cols.miss = sapply(train.df, function(x) any(is.na(x)))
cols.miss = names(cols.miss[cols.miss>0])
cols.nomiss = names(train.df)[!names(train.df)%in%(cols.miss)]
train.df = train.df%>%dplyr::select(cols.nomiss)
summary(train.df$TARGET_WINS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   71.00   82.00   80.79   92.00  146.00

hist(train.df$TARGET_WINS)

boxplot(train.df$TARGET_WINS)

res = cor(train.df)
corrplot(res, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

###3. Data Preparation

train.trans = train.df
train.trans$TEAM_BATTING_H = train.trans$TEAM_BATTING_H  + train.trans$TEAM_BATTING_2B + train.trans$TEAM_BATTING_3B + train.trans$TEAM_BATTING_HR
train.trans$TEAM_FIELDING_E = train.trans$TEAM_FIELDING_E + train.trans$TEAM_PITCHING_H
train.trans = train.trans%>%dplyr::select(-TEAM_BATTING_2B, -TEAM_BATTING_3B, -TEAM_BATTING_HR, -TEAM_PITCHING_H)
summary(train.trans)

##   TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_BB TEAM_PITCHING_HR
##  Min.   :  0.00   Min.   :1026   Min.   :  0.0   Min.   :  0.0   
##  1st Qu.: 71.00   1st Qu.:1739   1st Qu.:451.0   1st Qu.: 50.0   
##  Median : 82.00   Median :1862   Median :512.0   Median :107.0   
##  Mean   : 80.79   Mean   :1865   Mean   :501.6   Mean   :105.7   
##  3rd Qu.: 92.00   3rd Qu.:1978   3rd Qu.:580.0   3rd Qu.:150.0   
##  Max.   :146.00   Max.   :3092   Max.   :878.0   Max.   :343.0   
##  TEAM_PITCHING_BB TEAM_FIELDING_E
##  Min.   :   0.0   Min.   : 1276  
##  1st Qu.: 476.0   1st Qu.: 1566  
##  Median : 536.5   Median : 1679  
##  Mean   : 553.0   Mean   : 2026  
##  3rd Qu.: 611.0   3rd Qu.: 1922  
##  Max.   :3645.0   Max.   :31860

hist(train.trans$TEAM_BATTING_H)

restrans = cor(train.trans)
pairs.panels(train.trans, 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = TRUE,  # show density plots
             ellipses = TRUE # show correlation ellipses
             )

train.trans$TEAM_PITCHING_HR = apply(train.trans['TEAM_PITCHING_HR'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
train.trans$TEAM_PITCHING_BB = apply(train.trans['TEAM_PITCHING_BB'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
train.trans$TEAM_FIELDING_E = apply(train.trans['TEAM_FIELDING_E'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
pairs.panels(train.trans, 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = TRUE,  # show density plots
             ellipses = TRUE # show correlation ellipses
             )

4. Build Model

model1 = lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E, data=train.trans)
summary(model1)

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E, data = train.trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.452  -9.134   0.337   9.240  48.351 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      61.101138  10.855178   5.629 2.04e-08 ***
## TEAM_BATTING_H    0.041235   0.001990  20.720  < 2e-16 ***
## TEAM_BATTING_BB   0.011761   0.004849   2.426   0.0154 *  
## TEAM_PITCHING_HR -2.451877   0.488779  -5.016 5.67e-07 ***
## TEAM_PITCHING_BB  2.600056   1.641425   1.584   0.1133    
## TEAM_FIELDING_E  -9.122132   1.562982  -5.836 6.10e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.82 on 2270 degrees of freedom
## Multiple R-squared:  0.2316, Adjusted R-squared:  0.2299 
## F-statistic: 136.9 on 5 and 2270 DF,  p-value: < 2.2e-16

model2 = lm(TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB,2)  + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB,2) + poly(TEAM_FIELDING_E,2), data=train.trans)
summary(model2)

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB, 
##     2) + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB, 2) + poly(TEAM_FIELDING_E, 
##     2), data = train.trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.787  -8.938   0.192   9.165  49.336 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.848e+00  3.764e+00   0.757    0.449    
## TEAM_BATTING_H              4.945e-02  2.586e-03  19.122  < 2e-16 ***
## poly(TEAM_BATTING_BB, 2)1  -3.781e+02  6.711e+01  -5.634 1.98e-08 ***
## poly(TEAM_BATTING_BB, 2)2   2.057e+02  2.629e+01   7.826 7.65e-15 ***
## TEAM_PITCHING_HR           -3.246e+00  5.333e-01  -6.088 1.34e-09 ***
## poly(TEAM_PITCHING_BB, 2)1  2.847e+02  4.062e+01   7.009 3.16e-12 ***
## poly(TEAM_PITCHING_BB, 2)2  1.759e+02  2.884e+01   6.100 1.24e-09 ***
## poly(TEAM_FIELDING_E, 2)1  -5.730e+02  6.423e+01  -8.920  < 2e-16 ***
## poly(TEAM_FIELDING_E, 2)2  -1.043e+02  1.606e+01  -6.496 1.01e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.58 on 2267 degrees of freedom
## Multiple R-squared:  0.259,  Adjusted R-squared:  0.2563 
## F-statistic: 99.02 on 8 and 2267 DF,  p-value: < 2.2e-16

model3 = lm(TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB,2)  + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB,2) + poly(TEAM_FIELDING_E,3), data=train.trans)
summary(model3)

## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + poly(TEAM_BATTING_BB, 
##     2) + TEAM_PITCHING_HR + poly(TEAM_PITCHING_BB, 2) + poly(TEAM_FIELDING_E, 
##     3), data = train.trans)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.800  -8.957   0.291   9.101  49.593 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.836e+00  3.756e+00   0.755  0.45028    
## TEAM_BATTING_H              5.083e-02  2.614e-03  19.443  < 2e-16 ***
## poly(TEAM_BATTING_BB, 2)1  -3.519e+02  6.744e+01  -5.218 1.97e-07 ***
## poly(TEAM_BATTING_BB, 2)2   1.826e+02  2.716e+01   6.722 2.26e-11 ***
## TEAM_PITCHING_HR           -3.827e+00  5.607e-01  -6.826 1.12e-11 ***
## poly(TEAM_PITCHING_BB, 2)1  2.781e+02  4.058e+01   6.855 9.19e-12 ***
## poly(TEAM_PITCHING_BB, 2)2  1.952e+02  2.937e+01   6.646 3.77e-11 ***
## poly(TEAM_FIELDING_E, 3)1  -5.658e+02  6.413e+01  -8.823  < 2e-16 ***
## poly(TEAM_FIELDING_E, 3)2  -1.051e+02  1.602e+01  -6.561 6.61e-11 ***
## poly(TEAM_FIELDING_E, 3)3  -5.528e+01  1.682e+01  -3.286  0.00103 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.55 on 2266 degrees of freedom
## Multiple R-squared:  0.2625, Adjusted R-squared:  0.2595 
## F-statistic:  89.6 on 9 and 2266 DF,  p-value: < 2.2e-16

plot(model1)

plot(model2)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

plot(model3)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

5. Make prediction using winning model

#1] Load test data
df.test = read.table("moneyball-evaluation-data.csv", sep=',', header = TRUE, stringsAsFactors = FALSE)
#2] SELECT Non Null columns
df.test = df.test%>%dplyr::select(-INDEX)
cols.nomiss = cols.nomiss[!cols.nomiss %in%('TARGET_WINS')]
df.test = df.test%>%dplyr::select(cols.nomiss)
#3] Transform Data
test.trans = df.test
test.trans$TEAM_BATTING_H = test.trans$TEAM_BATTING_H  + test.trans$TEAM_BATTING_2B + test.trans$TEAM_BATTING_3B + test.trans$TEAM_BATTING_HR
test.trans$TEAM_FIELDING_E = test.trans$TEAM_FIELDING_E + test.trans$TEAM_PITCHING_H
test.trans = test.trans%>%dplyr::select(-TEAM_BATTING_2B, -TEAM_BATTING_3B, -TEAM_BATTING_HR, -TEAM_PITCHING_H)
test.trans$TEAM_PITCHING_HR = apply(test.trans['TEAM_PITCHING_HR'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
test.trans$TEAM_PITCHING_BB = apply(test.trans['TEAM_PITCHING_BB'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
test.trans$TEAM_FIELDING_E = apply(test.trans['TEAM_FIELDING_E'], 1, function(x) if(x==0){return(0)} else{return(log(x))})
Target_Wins = predict(model3, newdata = test.trans)
test.trans$TARGET_WINS = Target_Wins
df.test1 = read.table("moneyball-evaluation-data.csv", sep=',', header = TRUE, stringsAsFactors = FALSE)
df.test1$TARGET_WINS = Target_Wins
write.csv(df.test1, "Target_Prediction.csv", row.names = FALSE)