library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library (MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# install.packages("plotmo", dependencies = TRUE)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(plotmo)
## Warning: package 'plotmo' was built under R version 3.5.2
## Loading required package: plotrix
## Loading required package: TeachingDemos
# Location of the training CSV.
# NOTE(review): this is an absolute path on one machine; the script will not
# find the file anywhere else.
train_data <- "/Users/Olga/Desktop/DataMining/assignment1/moneyball-training-data.csv"
# Read the file with a header row, keeping any text columns as character
# vectors rather than factors.
moneyball <- read.csv(
  file = train_data,
  header = TRUE,
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)
# List the column names of the loaded table.
colnames(moneyball)
## [1] "INDEX" "TARGET_WINS" "TEAM_BATTING_H"
## [4] "TEAM_BATTING_2B" "TEAM_BATTING_3B" "TEAM_BATTING_HR"
## [7] "TEAM_BATTING_BB" "TEAM_BATTING_SO" "TEAM_BASERUN_SB"
## [10] "TEAM_BASERUN_CS" "TEAM_BATTING_HBP" "TEAM_PITCHING_H"
## [13] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [16] "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
# Per-column quartiles, mean and NA counts of the raw training data.
summary(moneyball)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
# Number of rows and columns in the raw training data.
dim(moneyball)
## [1] 2276 17
Removing the “INDEX” and “TEAM_BATTING_HBP” columns, as “TEAM_BATTING_HBP” has about 92% missing values.
# Drop the INDEX and TEAM_BATTING_HBP columns by name. Plain `[` indexing with
# character names is used instead of subset(): subset() evaluates `select`
# non-standardly, so a bare column name could silently resolve to a variable
# of the same name in the calling environment.
moneyball <- moneyball[
  , !(names(moneyball) %in% c("INDEX", "TEAM_BATTING_HBP")), drop = FALSE
]
Replacing the remaining missing values with the column mean.
# Impute the missing entries of one column with that column's mean.
#
# x: a vector. It is converted with as.character() and then as.numeric(), so
#    factor or character input holding numeric text is accepted.
# Returns a numeric vector of the same length as x in which every NA
# (including entries that could not be converted to a number) is replaced by
# the mean of the non-missing entries.
replace_mean <- function(x) {
  values <- as.numeric(as.character(x))
  replace(values, is.na(values), mean(values, na.rm = TRUE))
}
# Mean-impute every column of the training data. lapply() hands each column
# to replace_mean() as its own vector; apply() would first coerce the whole
# data frame to a single matrix, forcing one common type on all columns.
moneyball_filled <- as.data.frame(lapply(moneyball, replace_mean))
# Design matrix with every column except TARGET_WINS as a predictor. The
# formula keeps its intercept, so the first column of x is the constant
# "(Intercept)" term.
x <- model.matrix(TARGET_WINS ~ ., data = moneyball_filled)
# Show the dimensions and column names of the design matrix.
str(x)
## num [1:2276, 1:15] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:2276] "1" "2" "3" "4" ...
## ..$ : chr [1:15] "(Intercept)" "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B" ...
## - attr(*, "assign")= int [1:15] 0 1 2 3 4 5 6 7 8 9 ...
# Response vector: the TARGET_WINS column of the imputed data.
y <- moneyball_filled[["TARGET_WINS"]]
# Show the type and length of the response.
str(y)
## num [1:2276] 39 70 86 70 82 75 80 85 86 76 ...
# Ordinary least-squares baseline on every predictor. The model is fitted from
# the formula, which supplies its own intercept; passing the design matrix x
# (which already holds an "(Intercept)" column) to lm() would add a second,
# aliased constant column whose coefficient comes back as NA.
mod1 <- lm(TARGET_WINS ~ ., data = moneyball_filled)
# Predictor matrix for glmnet. glmnet fits its own intercept, so the constant
# column is removed before the matrix is handed over.
xmm <- model.matrix(mod1)
xmm <- xmm[, colnames(xmm) != "(Intercept)", drop = FALSE]
# Lasso (alpha = 1) at a single fixed penalty, lambda = 1.
mod2 <- glmnet(xmm, y, alpha = 1, lambda = 1)
# NOTE(review): the printed coefficients below are carried over from an
# earlier run that fitted the same models with a redundant constant column;
# re-run to confirm the values.
coef(mod1)
## (Intercept) TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 2.501955e+01 4.824355e-02 -2.006022e-02 6.047402e-02
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 5.299245e-02 1.041571e-02 -9.348587e-03 2.949092e-02
## TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## -1.187609e-02 -7.341675e-04 1.480251e-02 8.890651e-05
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 2.843293e-03 -2.112073e-02 -1.210350e-01
coef(mod2)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 21.657224908
## TEAM_BATTING_H 0.041379255
## TEAM_BATTING_2B .
## TEAM_BATTING_3B .
## TEAM_BATTING_HR .
## TEAM_BATTING_BB 0.012001270
## TEAM_BATTING_SO .
## TEAM_BASERUN_SB 0.014881881
## TEAM_BASERUN_CS .
## TEAM_PITCHING_H .
## TEAM_PITCHING_HR 0.005972642
## TEAM_PITCHING_BB .
## TEAM_PITCHING_SO .
## TEAM_FIELDING_E -0.013084234
## TEAM_FIELDING_DP -0.047449088
# Cross-validate the glmnet fit over its default lambda path and plot the
# cross-validation curve.
# NOTE(review): no seed is set before this call, so the fold assignment (and
# therefore the curve) may differ between runs.
cv.lasso <- cv.glmnet(x, y)
plot(cv.lasso)
# Refit by ordinary least squares using only the six predictors that kept a
# non-zero coefficient in the lambda = 1 lasso fit (mod2).
lasso_model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BASERUN_SB +
    TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = moneyball_filled
)
# Coefficient table, residual summary and fit statistics for the reduced model.
summary(lasso_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB +
## TEAM_BASERUN_SB + TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = moneyball_filled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.817 -8.745 0.119 8.473 56.861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.591083 3.356859 4.347 1.44e-05 ***
## TEAM_BATTING_H 0.050816 0.002072 24.526 < 2e-16 ***
## TEAM_BATTING_BB 0.013366 0.003300 4.050 5.30e-05 ***
## TEAM_BASERUN_SB 0.031800 0.003912 8.128 7.08e-16 ***
## TEAM_PITCHING_HR 0.027307 0.005859 4.661 3.33e-06 ***
## TEAM_FIELDING_E -0.019972 0.001880 -10.625 < 2e-16 ***
## TEAM_FIELDING_DP -0.116796 0.012991 -8.991 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.17 on 2269 degrees of freedom
## Multiple R-squared: 0.303, Adjusted R-squared: 0.3012
## F-statistic: 164.4 on 6 and 2269 DF, p-value: < 2.2e-16