library(ggplot2)
library(tidyr)
library(MASS)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.84 loaded
library(naniar)

DATA EXPLORATION

Loading the train and test data sets csv files

train <- read.csv("https://raw.githubusercontent.com/Harpreet1984/DATA621/master/HW1/moneyball-training-data.csv")
test <- read.csv("https://raw.githubusercontent.com/Harpreet1984/DATA621/master/HW1/moneyball-evaluation-data.csv")
train$INDEX <- NULL
test$INDEX <- NULL

cleanNames <- function(train) {
    name_list <- names(train)
    name_list <- gsub("TEAM_", "", name_list)
    names(train) <- name_list
    train
}

train <- cleanNames(train)
test <- cleanNames(test)

There are 2,276 rows and 16 columns (features). Of all 16 columns, 0 are discrete, 16 are continuous, and 0 are all missing. There are 3,478 missing values out of 36,416 data points with highest missings from Batters hit by pitch column

summary(train)
##   TARGET_WINS       BATTING_H      BATTING_2B      BATTING_3B    
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##                                                                  
##    BATTING_HR       BATTING_BB      BATTING_SO       BASERUN_SB   
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0   1st Qu.: 66.0  
##  Median :102.00   Median :512.0   Median : 750.0   Median :101.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 735.6   Mean   :124.8  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0   3rd Qu.:156.0  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##                                   NA's   :102      NA's   :131    
##    BASERUN_CS     BATTING_HBP      PITCHING_H     PITCHING_HR   
##  Min.   :  0.0   Min.   :29.00   Min.   : 1137   Min.   :  0.0  
##  1st Qu.: 38.0   1st Qu.:50.50   1st Qu.: 1419   1st Qu.: 50.0  
##  Median : 49.0   Median :58.00   Median : 1518   Median :107.0  
##  Mean   : 52.8   Mean   :59.36   Mean   : 1779   Mean   :105.7  
##  3rd Qu.: 62.0   3rd Qu.:67.00   3rd Qu.: 1682   3rd Qu.:150.0  
##  Max.   :201.0   Max.   :95.00   Max.   :30132   Max.   :343.0  
##  NA's   :772     NA's   :2085                                   
##   PITCHING_BB      PITCHING_SO        FIELDING_E      FIELDING_DP   
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0  
##  1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0   1st Qu.:131.0  
##  Median : 536.5   Median :  813.5   Median : 159.0   Median :149.0  
##  Mean   : 553.0   Mean   :  817.7   Mean   : 246.5   Mean   :146.4  
##  3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2   3rd Qu.:164.0  
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0  
##                   NA's   :102                        NA's   :286
Histogram of Variables
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units

This shows very few variables are normally distributed.

Now the datpreparation

DATA PREPARATION

Here from the plots we can see outliers in PITCHING_H,PITCHING_BB and PITCHING_SO

Also, since BATTING_H is a combination of BATTING_2B, BATTING_3B, BATTING_HR (and also includes batted singles), we will create a new variable BATTING_1B equaling BATTING_H - BATTING_2B - BATTING_3B - BATTING_HR and after creating this we will remove BATTING_H

vis_miss(train)

As we can see from our chart, we have a number of missing values. Since HBP has 92% missing values, we will remove that entirely.

train1 <- transform(train)
train1["BATTING_1B"] <- NA
train1$BATTING_1B = train1$BATTING_H - train1$BATTING_HR - train1$BATTING_3B - train1$BATTING_2B
train1 <- select(train1, -BATTING_HBP)
train1 <- select(train1, -BATTING_H)
#for(i in 1:ncol(train1)){
#  train1[is.na(train1[,i]), i] <- mean(train1[,i], na.rm = TRUE)
#}
head(train1)
##   TARGET_WINS BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO
## 1          39        194         39         13        143        842
## 2          70        219         22        190        685       1075
## 3          86        232         35        137        602        917
## 4          70        209         38         96        451        922
## 5          82        186         27        102        472        920
## 6          75        200         36         92        443        973
##   BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO
## 1         NA         NA       9364          84         927        5456
## 2         37         28       1347         191         689        1082
## 3         46         27       1377         137         602         917
## 4         43         30       1396          97         454         928
## 5         49         39       1297         102         472         920
## 6        107         59       1279          92         443         973
##   FIELDING_E FIELDING_DP BATTING_1B
## 1       1011          NA       1199
## 2        193         155        908
## 3        175         153        973
## 4        164         156       1044
## 5        138         168        982
## 6        123         149        951
correlation
M<-cor(train1)
corrplot(M, method="number")

Buliding Models

MODEL1:- Create model With all columns as features
full.model <- lm (TARGET_WINS ~   . , data=train1)

reduced.full.model<- step (full.model, direction = "backward")
## Start:  AIC=6723.18
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + 
##     BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_HR + 
##     PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B
## 
##               Df Sum of Sq    RSS    AIC
## - PITCHING_BB  1       0.8 134324 6721.2
## - PITCHING_HR  1       7.2 134330 6721.3
## - BATTING_SO   1      55.2 134378 6721.8
## - BATTING_1B   1      56.5 134380 6721.8
## - BATTING_BB   1      81.0 134404 6722.1
## - BATTING_HR   1      89.3 134412 6722.2
## - PITCHING_H   1      98.0 134421 6722.3
## <none>                     134323 6723.2
## - PITCHING_SO  1     264.1 134587 6724.1
## - BATTING_2B   1     654.6 134978 6728.4
## - BASERUN_CS   1     746.8 135070 6729.4
## - BASERUN_SB   1    1557.8 135881 6738.3
## - BATTING_3B   1    3598.5 137922 6760.5
## - FIELDING_DP  1    6742.5 141066 6794.0
## - FIELDING_E   1   22427.4 156751 6950.6
## 
## Step:  AIC=6721.19
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + 
##     BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_HR + 
##     PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B
## 
##               Df Sum of Sq    RSS    AIC
## - PITCHING_HR  1       6.4 134330 6719.3
## - BATTING_SO   1      56.2 134380 6719.8
## - BATTING_HR   1     115.7 134440 6720.5
## - BATTING_1B   1     147.2 134471 6720.8
## <none>                     134324 6721.2
## - PITCHING_H   1     197.5 134521 6721.4
## - PITCHING_SO  1     266.3 134590 6722.1
## - BASERUN_CS   1     746.5 135070 6727.4
## - BATTING_2B   1    1192.3 135516 6732.3
## - BASERUN_SB   1    1564.2 135888 6736.4
## - BATTING_3B   1    4777.8 139102 6771.1
## - FIELDING_DP  1    6744.6 141069 6792.0
## - BATTING_BB   1   12568.9 146893 6852.1
## - FIELDING_E   1   22491.7 156816 6949.2
## 
## Step:  AIC=6719.26
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + 
##     BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_SO + 
##     FIELDING_E + FIELDING_DP + BATTING_1B
## 
##               Df Sum of Sq    RSS    AIC
## - BATTING_SO   1      51.2 134382 6717.8
## - BATTING_1B   1     144.7 134475 6718.9
## <none>                     134330 6719.3
## - PITCHING_H   1     202.0 134532 6719.5
## - PITCHING_SO  1     298.0 134628 6720.6
## - BASERUN_CS   1     742.6 135073 6725.5
## - BATTING_2B   1    1209.1 135539 6730.6
## - BASERUN_SB   1    1570.4 135901 6734.5
## - BATTING_3B   1    4772.6 139103 6769.1
## - BATTING_HR   1    5777.7 140108 6779.8
## - FIELDING_DP  1    6744.4 141075 6790.1
## - BATTING_BB   1   12606.9 146937 6850.6
## - FIELDING_E   1   22525.1 156855 6947.6
## 
## Step:  AIC=6717.83
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + 
##     BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_SO + FIELDING_E + 
##     FIELDING_DP + BATTING_1B
## 
##               Df Sum of Sq    RSS    AIC
## <none>                     134382 6717.8
## - BASERUN_CS   1     737.6 135119 6724.0
## - PITCHING_H   1    1355.1 135737 6730.7
## - BASERUN_SB   1    1575.6 135957 6733.2
## - BATTING_1B   1    1740.1 136122 6734.9
## - BATTING_2B   1    3312.6 137694 6752.0
## - FIELDING_DP  1    6779.2 141161 6789.0
## - BATTING_3B   1    6904.5 141286 6790.3
## - PITCHING_SO  1    7395.1 141777 6795.4
## - BATTING_BB   1   12619.7 147001 6849.2
## - BATTING_HR   1   21121.1 155503 6932.8
## - FIELDING_E   1   22552.0 156934 6946.4
summary(reduced.full.model)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + 
##     BATTING_BB + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_SO + 
##     FIELDING_E + FIELDING_DP + BATTING_1B, data = train1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.5830  -6.7313  -0.1643   6.5323  27.8502 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 58.446058   6.588864   8.870  < 2e-16 ***
## BATTING_2B  -0.044326   0.007354  -6.028 2.10e-09 ***
## BATTING_3B   0.187121   0.021502   8.702  < 2e-16 ***
## BATTING_HR   0.123248   0.008097  15.221  < 2e-16 ***
## BATTING_BB   0.039484   0.003356  11.765  < 2e-16 ***
## BASERUN_SB   0.036034   0.008668   4.157 3.41e-05 ***
## BASERUN_CS   0.051768   0.018200   2.844 0.004511 ** 
## PITCHING_H   0.009070   0.002353   3.855 0.000121 ***
## PITCHING_SO -0.020827   0.002312  -9.006  < 2e-16 ***
## FIELDING_E  -0.155970   0.009917 -15.728  < 2e-16 ***
## FIELDING_DP -0.113149   0.013121  -8.623  < 2e-16 ***
## BATTING_1B   0.025502   0.005837   4.369 1.34e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.548 on 1474 degrees of freedom
##   (790 observations deleted due to missingness)
## Multiple R-squared:  0.4384, Adjusted R-squared:  0.4342 
## F-statistic: 104.6 on 11 and 1474 DF,  p-value: < 2.2e-16
MODEL2:- Create model With only significant columns as features from the model 1.
significant.model <- lm (TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_SO + BASERUN_SB + PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B , data=train1)

reduced.significant.model<- step (significant.model, direction = "backward")
## Start:  AIC=8652.74
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_SO + 
##     BASERUN_SB + PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B
## 
##               Df Sum of Sq    RSS    AIC
## - PITCHING_SO  1       125 202787 8651.9
## <none>                     202661 8652.7
## - BATTING_2B   1       517 203178 8655.4
## - BATTING_SO   1      3380 206042 8681.1
## - BATTING_1B   1      3440 206101 8681.6
## - FIELDING_DP  1      6946 209607 8712.6
## - BATTING_3B   1     13546 216207 8769.5
## - BASERUN_SB   1     20358 223019 8826.4
## - FIELDING_E   1     30721 233382 8909.7
## - BATTING_HR   1     39457 242119 8977.2
## 
## Step:  AIC=8651.88
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_SO + 
##     BASERUN_SB + FIELDING_E + FIELDING_DP + BATTING_1B
## 
##               Df Sum of Sq    RSS    AIC
## <none>                     202787 8651.9
## - BATTING_2B   1       532 203318 8654.7
## - BATTING_1B   1      3451 206237 8680.8
## - FIELDING_DP  1      6967 209754 8711.9
## - BATTING_3B   1     13574 216361 8768.8
## - BATTING_SO   1     14521 217308 8776.8
## - BASERUN_SB   1     20238 223024 8824.4
## - FIELDING_E   1     31043 233829 8911.2
## - BATTING_HR   1     39475 242261 8976.3
summary(reduced.significant.model)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + 
##     BATTING_SO + BASERUN_SB + FIELDING_E + FIELDING_DP + BATTING_1B, 
##     data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.590  -7.615   0.266   7.397  30.783 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 78.547640   5.911037  13.288  < 2e-16 ***
## BATTING_2B  -0.014921   0.006819  -2.188   0.0288 *  
## BATTING_3B   0.205199   0.018560  11.056  < 2e-16 ***
## BATTING_HR   0.150831   0.008000  18.853  < 2e-16 ***
## BATTING_SO  -0.026701   0.002335 -11.435  < 2e-16 ***
## BASERUN_SB   0.076556   0.005671  13.499  < 2e-16 ***
## FIELDING_E  -0.118339   0.007078 -16.719  < 2e-16 ***
## FIELDING_DP -0.099724   0.012591  -7.921 4.07e-15 ***
## BATTING_1B   0.024854   0.004459   5.574 2.86e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.54 on 1826 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.3614, Adjusted R-squared:  0.3586 
## F-statistic: 129.2 on 8 and 1826 DF,  p-value: < 2.2e-16
MODEL3:- Create model With top 10 high correlation columns as features

Select top 10 high correlation predictor.

cors <- sapply(train1, cor, y=train1$TARGET_WINS)
mask <- (rank(-abs(cors)) <= 10 )
best10.pred <- train1[, mask]

best10.pred <- subset(best10.pred, select = c(-TARGET_WINS) )
summary(best10.pred)
##    BATTING_2B      BATTING_3B       BATTING_HR       BATTING_BB   
##  Min.   : 69.0   Min.   :  0.00   Min.   :  0.00   Min.   :  0.0  
##  1st Qu.:208.0   1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0  
##  Median :238.0   Median : 47.00   Median :102.00   Median :512.0  
##  Mean   :241.2   Mean   : 55.25   Mean   : 99.61   Mean   :501.6  
##  3rd Qu.:273.0   3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0  
##  Max.   :458.0   Max.   :223.00   Max.   :264.00   Max.   :878.0  
##    PITCHING_H     PITCHING_HR     PITCHING_BB       FIELDING_E    
##  Min.   : 1137   Min.   :  0.0   Min.   :   0.0   Min.   :  65.0  
##  1st Qu.: 1419   1st Qu.: 50.0   1st Qu.: 476.0   1st Qu.: 127.0  
##  Median : 1518   Median :107.0   Median : 536.5   Median : 159.0  
##  Mean   : 1779   Mean   :105.7   Mean   : 553.0   Mean   : 246.5  
##  3rd Qu.: 1682   3rd Qu.:150.0   3rd Qu.: 611.0   3rd Qu.: 249.2  
##  Max.   :30132   Max.   :343.0   Max.   :3645.0   Max.   :1898.0  
##    BATTING_1B    
##  Min.   : 709.0  
##  1st Qu.: 990.8  
##  Median :1050.0  
##  Mean   :1073.2  
##  3rd Qu.:1129.0  
##  Max.   :2112.0
Stepwise backward regression
full.model.best10 <- lm (TARGET_WINS ~    BATTING_2B+ BATTING_3B + BATTING_HR + BATTING_BB + PITCHING_H +  PITCHING_HR + PITCHING_BB + FIELDING_E + BATTING_1B , data=train1)

reduced.model.best10<- step (full.model.best10, direction = "backward")
## Start:  AIC=11851.69
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + 
##     PITCHING_H + PITCHING_HR + PITCHING_BB + FIELDING_E + BATTING_1B
## 
##               Df Sum of Sq    RSS   AIC
## - BATTING_BB   1        14 411954 11850
## - PITCHING_HR  1       116 412057 11850
## <none>                     411940 11852
## - BATTING_2B   1      1662 413602 11859
## - PITCHING_H   1      2358 414299 11863
## - PITCHING_BB  1      2477 414417 11863
## - BATTING_HR   1      2783 414724 11865
## - FIELDING_E   1      8814 420755 11898
## - BATTING_3B   1     17481 429421 11944
## - BATTING_1B   1     41151 453091 12066
## 
## Step:  AIC=11849.77
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + PITCHING_H + 
##     PITCHING_HR + PITCHING_BB + FIELDING_E + BATTING_1B
## 
##               Df Sum of Sq    RSS   AIC
## - PITCHING_HR  1       103 412057 11848
## <none>                     411954 11850
## - BATTING_2B   1      1665 413619 11857
## - PITCHING_H   1      2795 414749 11863
## - BATTING_HR   1      3080 415034 11865
## - PITCHING_BB  1      5071 417025 11876
## - FIELDING_E   1      9287 421241 11898
## - BATTING_3B   1     17535 429489 11943
## - BATTING_1B   1     41986 453940 12069
## 
## Step:  AIC=11848.34
## TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + PITCHING_H + 
##     PITCHING_BB + FIELDING_E + BATTING_1B
## 
##               Df Sum of Sq    RSS   AIC
## <none>                     412057 11848
## - BATTING_2B   1      1661 413718 11856
## - PITCHING_H   1      2944 415001 11862
## - PITCHING_BB  1      5078 417135 11874
## - FIELDING_E   1      9853 421910 11900
## - BATTING_3B   1     17469 429526 11941
## - BATTING_HR   1     23141 435198 11971
## - BATTING_1B   1     42092 454149 12068
summary(reduced.model.best10)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + 
##     PITCHING_H + PITCHING_BB + FIELDING_E + BATTING_1B, data = train1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.763  -8.861   0.095   8.860  55.469 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.2713462  3.2775220   2.219  0.02662 *  
## BATTING_2B   0.0226647  0.0074954   3.024  0.00252 ** 
## BATTING_3B   0.1495551  0.0152520   9.806  < 2e-16 ***
## BATTING_HR   0.0851691  0.0075466  11.286  < 2e-16 ***
## PITCHING_H  -0.0013088  0.0003251  -4.026 5.87e-05 ***
## PITCHING_BB  0.0103207  0.0019522   5.287 1.36e-07 ***
## FIELDING_E  -0.0166263  0.0022577  -7.364 2.48e-13 ***
## BATTING_1B   0.0484775  0.0031849  15.221  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.48 on 2268 degrees of freedom
## Multiple R-squared:   0.27,  Adjusted R-squared:  0.2678 
## F-statistic: 119.9 on 7 and 2268 DF,  p-value: < 2.2e-16

SELECTING MODEL

Based on the above stats, Model1 with all the columns as feature is a best fit. As from all the three models it has the best RSquare, Adjusted RSquare (0.4342), F Stats and P values(less than 0.5). It also has the best RMSE score of 13.03416.

Model Application

using the Model1 we can predict the dependent variable for the test data as follows

test1 <- transform(test)
test1["BATTING_1B"] <- NA
test1$BATTING_1B = test1$BATTING_H - test1$BATTING_HR - test1$BATTING_3B - test1$BATTING_2B
test1 <- select(test1, -BATTING_HBP)
test1 <- select(test1, -BATTING_H)
predictedValues <- predict (reduced.full.model, newdata=test1 )
summary (predictedValues)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   58.65   75.74   81.39   81.14   86.90  113.36      89