library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library (MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# install.packages("plotmo", dependencies = TRUE)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(plotmo)
## Warning: package 'plotmo' was built under R version 3.5.2
## Loading required package: plotrix
## Loading required package: TeachingDemos
# Absolute path to the training CSV on the author's machine.
train_data <- "/Users/Olga/Desktop/DataMining/assignment1/moneyball-training-data.csv"

# Read the file with a header row, keeping text columns as character vectors
# and decoding the bytes as latin1.
moneyball <- read.csv(
  train_data,
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)

# List the column names that were read.
colnames(moneyball)
##  [1] "INDEX"            "TARGET_WINS"      "TEAM_BATTING_H"  
##  [4] "TEAM_BATTING_2B"  "TEAM_BATTING_3B"  "TEAM_BATTING_HR" 
##  [7] "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB" 
## [10] "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP" "TEAM_PITCHING_H" 
## [13] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [16] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
# Per-column quartiles, means and NA counts for the raw data.
summary(moneyball)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286
# Number of rows and columns in the raw data.
dim(moneyball)
## [1] 2276   17

Removing the “INDEX” and “TEAM_BATTING_HBP” columns, since “TEAM_BATTING_HBP” has about 92% missing values (2085 of 2276 rows).

# Drop the INDEX column and TEAM_BATTING_HBP; every other column is kept.
moneyball <- subset(
  moneyball,
  select = -c(INDEX, TEAM_BATTING_HBP)
)

Replacing the remaining missing values with the column mean.

# Convert a column to numeric (going through character, so factor labels are
# read as numbers rather than level codes) and fill every missing entry with
# the mean of the non-missing entries.
#
# x: a vector (numeric, character or factor) holding numeric values.
# Returns a numeric vector of the same length as `x` with NAs replaced.
replace_mean <- function(x) {
  values <- as.numeric(as.character(x))
  replace(values, is.na(values), mean(values, na.rm = TRUE))
}

# Impute every column with its own mean. `lapply()` visits the data frame
# column by column, whereas `apply()` would first coerce the whole frame to a
# matrix (one common type for all columns) before calling `replace_mean`.
moneyball_filled <- as.data.frame(lapply(moneyball, replace_mean))

# Design matrix for a model of TARGET_WINS on all remaining columns; its
# first column is the constant "(Intercept)" term added by model.matrix().
x <- model.matrix(TARGET_WINS ~ ., data = moneyball_filled)
str(x)
##  num [1:2276, 1:15] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2276] "1" "2" "3" "4" ...
##   ..$ : chr [1:15] "(Intercept)" "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B" ...
##  - attr(*, "assign")= int [1:15] 0 1 2 3 4 5 6 7 8 9 ...
# Response vector: the TARGET_WINS column of the imputed data.
y <- moneyball_filled[["TARGET_WINS"]]
str(y)
##  num [1:2276] 39 70 86 70 82 75 80 85 86 76 ...
# Least-squares fit of TARGET_WINS on all predictors. Fitting from the data
# frame (rather than `y ~ x`, where `x` already carries an "(Intercept)"
# column) avoids a duplicated intercept whose coefficient is reported as NA.
mod1 <- lm(TARGET_WINS ~ ., data = moneyball_filled)

# glmnet fits its own intercept by default, so the constant first column of
# the design matrix is dropped before the fit. alpha = 1 selects the lasso
# penalty; lambda = 1 fixes the penalty strength at a single value.
xmm <- model.matrix(mod1)[, -1]
mod2 <- glmnet(xmm, y, alpha = 1, lambda = 1)

# Coefficient estimates from the least-squares fit `mod1`.
coef(mod1)
##       (Intercept)      x(Intercept)   xTEAM_BATTING_H  xTEAM_BATTING_2B 
##      2.501955e+01                NA      4.824355e-02     -2.006022e-02 
##  xTEAM_BATTING_3B  xTEAM_BATTING_HR  xTEAM_BATTING_BB  xTEAM_BATTING_SO 
##      6.047402e-02      5.299245e-02      1.041571e-02     -9.348587e-03 
##  xTEAM_BASERUN_SB  xTEAM_BASERUN_CS  xTEAM_PITCHING_H xTEAM_PITCHING_HR 
##      2.949092e-02     -1.187609e-02     -7.341675e-04      1.480251e-02 
## xTEAM_PITCHING_BB xTEAM_PITCHING_SO  xTEAM_FIELDING_E xTEAM_FIELDING_DP 
##      8.890651e-05      2.843293e-03     -2.112073e-02     -1.210350e-01
# Coefficients of the glmnet fit `mod2`; a "." in the printed sparse matrix
# marks a coefficient that is exactly zero.
coef(mod2)
## 17 x 1 sparse Matrix of class "dgCMatrix"
##                             s0
## (Intercept)       21.657224908
## (Intercept)        .          
## x(Intercept)       .          
## xTEAM_BATTING_H    0.041379255
## xTEAM_BATTING_2B   .          
## xTEAM_BATTING_3B   .          
## xTEAM_BATTING_HR   .          
## xTEAM_BATTING_BB   0.012001270
## xTEAM_BATTING_SO   .          
## xTEAM_BASERUN_SB   0.014881881
## xTEAM_BASERUN_CS   .          
## xTEAM_PITCHING_H   .          
## xTEAM_PITCHING_HR  0.005972642
## xTEAM_PITCHING_BB  .          
## xTEAM_PITCHING_SO  .          
## xTEAM_FIELDING_E  -0.013084234
## xTEAM_FIELDING_DP -0.047449088
# Cross-validate glmnet on (x, y) with its default settings and plot the
# resulting cross-validation curve.
# NOTE(review): the fold assignment is random and no seed is set here, so the
# curve can vary between runs — consider calling set.seed() beforehand.
cv.lasso <- cv.glmnet(x, y)
plot(cv.lasso)

# Refit by least squares using only the six predictors whose coefficients in
# `mod2` were non-zero, then report the usual regression summary.
lasso_model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
    TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
summary(lasso_model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = moneyball_filled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.817  -8.745   0.119   8.473  56.861 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      14.591083   3.356859   4.347 1.44e-05 ***
## TEAM_BATTING_H    0.050816   0.002072  24.526  < 2e-16 ***
## TEAM_BATTING_BB   0.013366   0.003300   4.050 5.30e-05 ***
## TEAM_BASERUN_SB   0.031800   0.003912   8.128 7.08e-16 ***
## TEAM_PITCHING_HR  0.027307   0.005859   4.661 3.33e-06 ***
## TEAM_FIELDING_E  -0.019972   0.001880 -10.625  < 2e-16 ***
## TEAM_FIELDING_DP -0.116796   0.012991  -8.991  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.17 on 2269 degrees of freedom
## Multiple R-squared:  0.303,  Adjusted R-squared:  0.3012 
## F-statistic: 164.4 on 6 and 2269 DF,  p-value: < 2.2e-16