library(csv)
nba <- as.csv('C:/Users/Seol/Desktop/nba.csv')
attach(nba)

결측치 확인

sum(is.na(nba))
## [1] 0

없당~

str(nba)
## 'data.frame':    100 obs. of  55 variables:
##  $ AGE                            : int  28 28 22 24 27 28 32 32 26 22 ...
##  $ GP                             : int  81 35 5 75 81 62 74 61 72 80 ...
##  $ W                              : int  46 16 1 31 54 51 51 43 30 42 ...
##  $ L                              : int  35 19 4 44 27 11 23 18 42 38 ...
##  $ W_PCT                          : num  0.568 0.457 0.2 0.413 0.667 0.823 0.689 0.705 0.417 0.525 ...
##  $ MIN                            : num  34.6 8.4 3.4 36.1 36.4 33.4 37.8 31.5 34.2 35.6 ...
##  $ OFF_RATING                     : num  108 104 124 104 114 ...
##  $ DEF_RATING                     : num  105 102 118 102 107 ...
##  $ NET_RATING                     : num  3.3 1.9 6.3 1.7 6.3 16 7.7 14.9 -2.2 1.5 ...
##  $ AST_PCT                        : num  0.543 0.054 0.3 0.11 0.505 0.218 0.388 0.444 0.262 0.269 ...
##  $ AST_TO                         : num  1.92 0.9 0 0.87 1.95 2.17 2.13 3.83 1.23 1.85 ...
##  $ AST_RATIO                      : num  23.4 5.1 31.1 7.3 27.6 18.4 25.6 35 14.3 19.8 ...
##  $ OREB_PCT                       : num  0.053 0.166 0.091 0.067 0.035 0.023 0.04 0.025 0.072 0.058 ...
##  $ DREB_PCT                       : num  0.279 0.313 0.118 0.269 0.212 0.232 0.209 0.149 0.305 0.226 ...
##  $ REB_PCT                        : num  0.167 0.239 0.103 0.17 0.123 0.137 0.127 0.089 0.188 0.143 ...
##  $ TM_TOV_PCT                     : num  12.2 5.7 0 8.4 14.1 8.5 12 9.1 11.6 10.7 ...
##  $ EFG_PCT                        : num  0.476 0.545 0.875 0.518 0.525 0.594 0.594 0.555 0.498 0.541 ...
##  $ TS_PCT                         : num  0.554 0.606 0.753 0.58 0.613 0.651 0.619 0.614 0.562 0.599 ...
##  $ USG_PCT                        : num  0.408 0.248 0.172 0.326 0.341 0.276 0.297 0.243 0.364 0.283 ...
##  $ PACE                           : num  102.3 97.2 87.5 100.2 103 ...
##  $ PIE                            : num  0.23 0.196 0.194 0.192 0.19 0.186 0.183 0.182 0.178 0.176 ...
##  $ FGM                            : int  824 72 3 770 674 551 736 374 647 656 ...
##  $ FGA                            : int  1941 132 4 1526 1533 1026 1344 785 1432 1259 ...
##  $ FGM_PG                         : num  10.2 2.1 0.6 10.3 8.3 8.9 9.9 6.1 9 8.2 ...
##  $ FGA_PG                         : num  24 3.8 0.8 20.3 18.9 16.5 18.2 12.9 19.9 15.7 ...
##  $ FG_PCT                         : num  0.425 0.545 0.75 0.505 0.44 0.537 0.548 0.476 0.452 0.521 ...
##  $ GP_RANK                        : int  18 365 458 111 18 244 126 253 158 38 ...
##  $ W_RANK                         : int  62 345 464 196 15 21 21 76 209 82 ...
##  $ L_RANK                         : int  330 149 34 419 221 79 190 136 402 367 ...
##  $ W_PCT_RANK                     : int  143 270 468 316 56 14 45 38 315 182 ...
##  $ MIN_RANK                       : int  22 432 475 9 7 40 1 64 25 12 ...
##  $ OFF_RATING_RANK                : int  115 260 2 271 27 9 18 10 152 131 ...
##  $ DEF_RATING_RANK                : int  168 82 475 88 296 57 287 58 374 224 ...
##  $ NET_RATING_RANK                : int  95 139 53 144 55 9 45 14 273 148 ...
##  $ AST_PCT_RANK                   : int  1 412 32 230 2 79 7 4 50 47 ...
##  $ AST_TO_RANK                    : int  140 381 466 387 132 89 98 7 300 150 ...
##  $ AST_RATIO_RANK                 : int  96 467 28 442 48 168 68 15 241 133 ...
##  $ OREB_PCT_RANK                  : int  171 4 81 139 242 320 220 301 127 157 ...
##  $ DREB_PCT_RANK                  : int  22 8 317 27 73 55 81 219 11 57 ...
##  $ REB_PCT_RANK                   : int  53 5 201 48 144 106 125 242 29 95 ...
##  $ TM_TOV_PCT_RANK                : int  358 30 1 129 426 131 347 173 321 265 ...
##  $ EFG_PCT_RANK                   : int  339 100 2 189 167 31 32 84 261 113 ...
##  $ TS_PCT_RANK                    : int  181 52 4 112 40 12 35 39 156 67 ...
##  $ USG_PCT_RANK                   : int  2 63 279 9 7 31 15 72 4 28 ...
##  $ PACE_RANK                      : int  47 328 483 121 34 17 230 243 331 378 ...
##  $ PIE_RANK                       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ FGM_RANK                       : int  1 344 461 3 10 26 4 79 14 13 ...
##  $ FGA_RANK                       : int  1 360 476 5 4 35 19 78 13 23 ...
##  $ FGM_PG_RANK                    : int  2 306 458 1 15 9 3 51 7 20 ...
##  $ FGA_PG_RANK                    : int  1 356 480 3 9 22 15 58 4 29 ...
##  $ FG_PCT_RANK                    : int  293 47 3 95 253 53 45 142 208 77 ...
##  $ SALARY_MILLIONS                : num  26.54 7 1.45 22.12 26.5 ...
##  $ PTS                            : num  31.6 5.5 2 28 29.1 25.1 26.4 18.1 24.4 22.9 ...
##  $ ACTIVE_TWITTER_LAST_YEAR       : int  1 0 1 1 1 1 1 1 1 1 ...
##  $ TWITTER_FOLLOWER_COUNT_MILLIONS: num  4.5 0 0.049 1.22 4.47 16.2 37 6.4 0.826 0.246 ...
##  - attr(*, "source")= chr "C:/Users/Seol/Desktop/nba.csv"

EDA

비슷한 친구들끼리 묶어서 상관관계 확인

library(corrplot)
## corrplot 0.84 loaded
cor(data.frame(GP, W, L, W_PCT))
##              GP         W          L      W_PCT
## GP    1.0000000 0.8050370  0.6951741  0.4050292
## W     0.8050370 1.0000000  0.1332065  0.8050411
## L     0.6951741 0.1332065  1.0000000 -0.2988366
## W_PCT 0.4050292 0.8050411 -0.2988366  1.0000000
Games <- cor(data.frame(GP, GP_RANK, W, W_RANK, L, L_RANK, W_PCT, W_PCT_RANK))
corrplot.mixed(Games)

GP, W_PCT 사용

ODN <- cor(data.frame(OFF_RATING, OFF_RATING_RANK, DEF_RATING, DEF_RATING_RANK, NET_RATING, NET_RATING_RANK))
corrplot.mixed(ODN)

NET_RATING 사용

ASS <- cor(data.frame(AST_PCT, AST_PCT_RANK, AST_RATIO, AST_RATIO_RANK, AST_TO, AST_TO_RANK))
corrplot.mixed(ASS)

AST_PCT 사용

REB <- cor(data.frame(OREB_PCT, OREB_PCT_RANK, DREB_PCT, DREB_PCT_RANK, REB_PCT, REB_PCT_RANK))
corrplot.mixed(REB)

REB_PCT 사용

a <- cor(data.frame(TM_TOV_PCT, TM_TOV_PCT_RANK, EFG_PCT, EFG_PCT_RANK, TS_PCT, TS_PCT_RANK, USG_PCT, USG_PCT_RANK))
corrplot.mixed(a)

TM_TOV_PCT, TS_PCT, USG_PCT 사용

b <- cor(data.frame(PACE, PACE_RANK, PIE, PIE_RANK))
corrplot.mixed(b)

PACE, PIE 사용

FG <- cor(data.frame(FGM, FGM_RANK, FGA, FGA_RANK, FGM_PG, FGM_PG_RANK, FGA_PG, FGA_PG_RANK, PTS))
corrplot.mixed(FG)

PTS 사용

TWI <- cor(data.frame(ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(TWI)

둘 다 사용

total <- cor(data.frame(GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(total)

최종 EDA AGE, GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS

Cross Validation

set.seed(1234)
x=model.matrix(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba)[,-1]
y=SALARY_MILLIONS
train=sample(1:nrow(x), nrow(x)/2) # 5:5
test=(-train)
y.test=y[test]
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
cv.out.ridge=cv.glmnet(x[train,], y[train], alpha=0)
cv.out.lasso=cv.glmnet(x[train,], y[train], alpha=1)
plot(cv.out.ridge)

plot(cv.out.lasso)

bestlam.ridge=cv.out.ridge$lambda.min
bestlam.ridge
## [1] 0.6472069
bestlam.lasso=cv.out.lasso$lambda.min
bestlam.lasso
## [1] 1.006844

ridge

grid=10^seq(10, -2, length=100)
ridge.mod=glmnet(x[train, ], y[train], alpha =0, lambda = grid, thresh=1e-12)
ridge.pred=predict(ridge.mod, s=bestlam.ridge, newx=x[test, ])
ridge.mse=mean((ridge.pred-y.test)^2)
ridge.mse
## [1] 51.45201
plot(ridge.mod)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
ridge <- data.frame(y.act = y.test)
ridge$y.act <- ridge$y.act %>%
  as.character() %>%
  as.numeric()
ridge$pred <- predict(ridge.mod, s=bestlam.ridge, newx=x[test,])  %>%
  as.character() %>%
  as.numeric()

# Plot of actual vs. predicted values for the test data set.
ggplot(ridge, aes(jitter(y.act), pred)) + 
  geom_point() + 
  geom_smooth(method = "lm", col = "#c41200") + 
  geom_abline(intercept = 0, col = "#c41200", size = 2) +
  xlab("Actual SALARY") +
  ylab("Predicted SALARY") + 
  theme_classic()

ridge.out=glmnet(x, y, alpha=0)
predict(ridge.out, type="coefficients", s=bestlam.ridge)[1:14, ]
##              (Intercept)                      AGE                       GP 
##              -7.41587587               0.55385661               0.03363193 
##                    W_PCT               NET_RATING                  AST_PCT 
##               0.34412733              -0.02338244               2.69792504 
##                  REB_PCT               TM_TOV_PCT                   TS_PCT 
##              14.61482226              -0.30279127             -26.36682101 
##                  USG_PCT                     PACE                      PIE 
##             -29.26856836               0.07906411              46.45292192 
##                      PTS ACTIVE_TWITTER_LAST_YEAR 
##               0.72143735              -1.68535458
barplot(ridge.out$beta[,1])

lasso

lasso.mod=glmnet(x[train,], y[train], alpha=1, lambda=grid)
lasso.pred=predict(lasso.mod, s=bestlam.lasso, newx=x[test,])
lasso.mse=mean((lasso.pred-y.test)^2)
lasso.mse
## [1] 36.62912
plot(lasso.mod)

lasso <- data.frame(y.act = y.test)
lasso$y.act <- lasso$y.act %>%
  as.character() %>%
  as.numeric()
lasso$pred <- predict(lasso.mod, s=bestlam.lasso, newx=x[test,])  %>%
  as.character() %>%
  as.numeric()

ggplot(lasso, aes(jitter(y.act), pred)) + 
  geom_point() + 
  geom_smooth(method = "lm", col = "#c41200") + 
  geom_abline(intercept = 0, col = "#c41200", size = 2) +
  xlab("Actual SALARY") +
  ylab("Predicted SALARY") + 
  theme_classic()

lasso.out=glmnet(x, y, alpha=1)
predict(lasso.out, type="coefficients", s=bestlam.lasso)[1:14,]
##              (Intercept)                      AGE                       GP 
##            -8.6942491510             0.3829121929             0.0003705212 
##                    W_PCT               NET_RATING                  AST_PCT 
##             0.0000000000             0.0000000000             0.0000000000 
##                  REB_PCT               TM_TOV_PCT                   TS_PCT 
##             0.0000000000             0.0000000000             0.0000000000 
##                  USG_PCT                     PACE                      PIE 
##             0.0000000000             0.0000000000             0.0000000000 
##                      PTS ACTIVE_TWITTER_LAST_YEAR 
##             0.5961589364             0.0000000000

alpha 차이에 따른 mse 변화

for (i in 0:10) {
    assign(paste("fit", i, sep=""), cv.glmnet(x[train,], y[train], type.measure="mse", 
                                              alpha=i/10))
}

yhat0 <- predict(fit0, s=fit0$lambda.min, newx=x[test,])
yhat1 <- predict(fit1, s=fit1$lambda.min, newx=x[test,])
yhat2 <- predict(fit2, s=fit2$lambda.min, newx=x[test,])
yhat3 <- predict(fit3, s=fit3$lambda.min, newx=x[test,])
yhat4 <- predict(fit4, s=fit4$lambda.min, newx=x[test,])
yhat5 <- predict(fit5, s=fit5$lambda.min, newx=x[test,])
yhat6 <- predict(fit6, s=fit6$lambda.min, newx=x[test,])
yhat7 <- predict(fit7, s=fit7$lambda.min, newx=x[test,])
yhat8 <- predict(fit8, s=fit8$lambda.min, newx=x[test,])
yhat9 <- predict(fit9, s=fit9$lambda.min, newx=x[test,])
yhat10 <- predict(fit10, s=fit10$lambda.min, newx=x[test,])

mse0 <- mean((y.test - yhat0)^2)
mse1 <- mean((y.test - yhat1)^2)
mse2 <- mean((y.test - yhat2)^2)
mse3 <- mean((y.test - yhat3)^2)
mse4 <- mean((y.test - yhat4)^2)
mse5 <- mean((y.test - yhat5)^2)
mse6 <- mean((y.test - yhat6)^2)
mse7 <- mean((y.test - yhat7)^2)
mse8 <- mean((y.test - yhat8)^2)
mse9 <- mean((y.test - yhat9)^2)
mse10 <- mean((y.test - yhat10)^2)
mse0;mse1;mse2;mse3;mse4;mse5;mse6;mse7;mse8;mse9;mse10
## [1] 41.85282
## [1] 85.16581
## [1] 37.00045
## [1] 36.51723
## [1] 36.00646
## [1] 36.09369
## [1] 36.341
## [1] 36.33308
## [1] 36.61155
## [1] 36.57271
## [1] 36.63294

alpha가 0.4일때 mse가 제일 낮음

elastic net regression

cv.out.elnet=cv.glmnet(x[train,], y[train], alpha=0.4)
plot(cv.out.elnet)

bestlam.elnet=cv.out.elnet$lambda.min
bestlam.elnet
## [1] 1.195832
elnet.mod=glmnet(x[train,], y[train], alpha=0.4, lambda=grid)
elnet.pred=predict(elnet.mod, s=bestlam.elnet, newx=x[test,])
elnet.mse=mean((elnet.pred-y.test)^2)
elnet.mse
## [1] 36.54213
elnet.out=glmnet(x, y, alpha=0.4)
predict(elnet.out, type="coefficients", s=bestlam.elnet)[1:14,]
##              (Intercept)                      AGE                       GP 
##              -9.28848464               0.44217458               0.02558698 
##                    W_PCT               NET_RATING                  AST_PCT 
##               0.00000000               0.00000000               0.00000000 
##                  REB_PCT               TM_TOV_PCT                   TS_PCT 
##               0.00000000              -0.02611677              -7.77994558 
##                  USG_PCT                     PACE                      PIE 
##               0.00000000               0.00000000              22.17358768 
##                      PTS ACTIVE_TWITTER_LAST_YEAR 
##               0.52979309               0.00000000
plot(elnet.mod)

elnet <- data.frame(y.act = y.test)
elnet$y.act <- elnet$y.act %>%
  as.character() %>%
  as.numeric()
elnet$pred <- predict(elnet.mod, s=bestlam.elnet, newx=x[test,])  %>%
  as.character() %>%
  as.numeric()

ggplot(elnet, aes(jitter(y.act), pred)) + 
  geom_point() + 
  geom_smooth(method = "lm", colour="#000099") + 
  geom_abline(intercept = 0, colour="#000099", size = 2) +
  xlab("Actual SALARY") +
  ylab("Elastic Net Predicted SALARY") + 
  theme_classic()

predict(elnet.out, type="coefficients", s=bestlam.elnet)[1:14, ]
##              (Intercept)                      AGE                       GP 
##              -9.28848464               0.44217458               0.02558698 
##                    W_PCT               NET_RATING                  AST_PCT 
##               0.00000000               0.00000000               0.00000000 
##                  REB_PCT               TM_TOV_PCT                   TS_PCT 
##               0.00000000              -0.02611677              -7.77994558 
##                  USG_PCT                     PACE                      PIE 
##               0.00000000               0.00000000              22.17358768 
##                      PTS ACTIVE_TWITTER_LAST_YEAR 
##               0.52979309               0.00000000

ridge vs lasso vs elnet

ridge.mse;lasso.mse;elnet.mse
## [1] 51.45201
## [1] 36.62912
## [1] 36.54213
par(mfrow=c(2,3))
plot(cv.out.ridge);plot(cv.out.lasso);plot(cv.out.elnet)
plot(ridge.mod);plot(lasso.mod);plot(elnet.mod)

vs validaion set apporoach, loocv, k-fold

validation set approach

lm.fit=lm(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba, subset=train)
mean((SALARY_MILLIONS - predict(lm.fit, data=nba))[-train]^2)
## [1] 94.04492

loocv

library(boot)
glm.fit=glm(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba)
cv.err=cv.glm(nba, glm.fit)
cv.err$delta
## [1] 41.92583 41.84471

k-fold

cv.err=cv.glm(nba, glm.fit, K=10)
cv.err$delta
## [1] 45.11905 44.08304