library(csv)
nba <- as.csv('C:/Users/Seol/Desktop/nba.csv')
attach(nba)
결측치 확인
sum(is.na(nba))
## [1] 0
없당~
str(nba)
## 'data.frame': 100 obs. of 55 variables:
## $ AGE : int 28 28 22 24 27 28 32 32 26 22 ...
## $ GP : int 81 35 5 75 81 62 74 61 72 80 ...
## $ W : int 46 16 1 31 54 51 51 43 30 42 ...
## $ L : int 35 19 4 44 27 11 23 18 42 38 ...
## $ W_PCT : num 0.568 0.457 0.2 0.413 0.667 0.823 0.689 0.705 0.417 0.525 ...
## $ MIN : num 34.6 8.4 3.4 36.1 36.4 33.4 37.8 31.5 34.2 35.6 ...
## $ OFF_RATING : num 108 104 124 104 114 ...
## $ DEF_RATING : num 105 102 118 102 107 ...
## $ NET_RATING : num 3.3 1.9 6.3 1.7 6.3 16 7.7 14.9 -2.2 1.5 ...
## $ AST_PCT : num 0.543 0.054 0.3 0.11 0.505 0.218 0.388 0.444 0.262 0.269 ...
## $ AST_TO : num 1.92 0.9 0 0.87 1.95 2.17 2.13 3.83 1.23 1.85 ...
## $ AST_RATIO : num 23.4 5.1 31.1 7.3 27.6 18.4 25.6 35 14.3 19.8 ...
## $ OREB_PCT : num 0.053 0.166 0.091 0.067 0.035 0.023 0.04 0.025 0.072 0.058 ...
## $ DREB_PCT : num 0.279 0.313 0.118 0.269 0.212 0.232 0.209 0.149 0.305 0.226 ...
## $ REB_PCT : num 0.167 0.239 0.103 0.17 0.123 0.137 0.127 0.089 0.188 0.143 ...
## $ TM_TOV_PCT : num 12.2 5.7 0 8.4 14.1 8.5 12 9.1 11.6 10.7 ...
## $ EFG_PCT : num 0.476 0.545 0.875 0.518 0.525 0.594 0.594 0.555 0.498 0.541 ...
## $ TS_PCT : num 0.554 0.606 0.753 0.58 0.613 0.651 0.619 0.614 0.562 0.599 ...
## $ USG_PCT : num 0.408 0.248 0.172 0.326 0.341 0.276 0.297 0.243 0.364 0.283 ...
## $ PACE : num 102.3 97.2 87.5 100.2 103 ...
## $ PIE : num 0.23 0.196 0.194 0.192 0.19 0.186 0.183 0.182 0.178 0.176 ...
## $ FGM : int 824 72 3 770 674 551 736 374 647 656 ...
## $ FGA : int 1941 132 4 1526 1533 1026 1344 785 1432 1259 ...
## $ FGM_PG : num 10.2 2.1 0.6 10.3 8.3 8.9 9.9 6.1 9 8.2 ...
## $ FGA_PG : num 24 3.8 0.8 20.3 18.9 16.5 18.2 12.9 19.9 15.7 ...
## $ FG_PCT : num 0.425 0.545 0.75 0.505 0.44 0.537 0.548 0.476 0.452 0.521 ...
## $ GP_RANK : int 18 365 458 111 18 244 126 253 158 38 ...
## $ W_RANK : int 62 345 464 196 15 21 21 76 209 82 ...
## $ L_RANK : int 330 149 34 419 221 79 190 136 402 367 ...
## $ W_PCT_RANK : int 143 270 468 316 56 14 45 38 315 182 ...
## $ MIN_RANK : int 22 432 475 9 7 40 1 64 25 12 ...
## $ OFF_RATING_RANK : int 115 260 2 271 27 9 18 10 152 131 ...
## $ DEF_RATING_RANK : int 168 82 475 88 296 57 287 58 374 224 ...
## $ NET_RATING_RANK : int 95 139 53 144 55 9 45 14 273 148 ...
## $ AST_PCT_RANK : int 1 412 32 230 2 79 7 4 50 47 ...
## $ AST_TO_RANK : int 140 381 466 387 132 89 98 7 300 150 ...
## $ AST_RATIO_RANK : int 96 467 28 442 48 168 68 15 241 133 ...
## $ OREB_PCT_RANK : int 171 4 81 139 242 320 220 301 127 157 ...
## $ DREB_PCT_RANK : int 22 8 317 27 73 55 81 219 11 57 ...
## $ REB_PCT_RANK : int 53 5 201 48 144 106 125 242 29 95 ...
## $ TM_TOV_PCT_RANK : int 358 30 1 129 426 131 347 173 321 265 ...
## $ EFG_PCT_RANK : int 339 100 2 189 167 31 32 84 261 113 ...
## $ TS_PCT_RANK : int 181 52 4 112 40 12 35 39 156 67 ...
## $ USG_PCT_RANK : int 2 63 279 9 7 31 15 72 4 28 ...
## $ PACE_RANK : int 47 328 483 121 34 17 230 243 331 378 ...
## $ PIE_RANK : int 1 2 3 4 5 6 7 8 9 10 ...
## $ FGM_RANK : int 1 344 461 3 10 26 4 79 14 13 ...
## $ FGA_RANK : int 1 360 476 5 4 35 19 78 13 23 ...
## $ FGM_PG_RANK : int 2 306 458 1 15 9 3 51 7 20 ...
## $ FGA_PG_RANK : int 1 356 480 3 9 22 15 58 4 29 ...
## $ FG_PCT_RANK : int 293 47 3 95 253 53 45 142 208 77 ...
## $ SALARY_MILLIONS : num 26.54 7 1.45 22.12 26.5 ...
## $ PTS : num 31.6 5.5 2 28 29.1 25.1 26.4 18.1 24.4 22.9 ...
## $ ACTIVE_TWITTER_LAST_YEAR : int 1 0 1 1 1 1 1 1 1 1 ...
## $ TWITTER_FOLLOWER_COUNT_MILLIONS: num 4.5 0 0.049 1.22 4.47 16.2 37 6.4 0.826 0.246 ...
## - attr(*, "source")= chr "C:/Users/Seol/Desktop/nba.csv"
비슷한 친구들끼리 묶어서 상관관계 확인
library(corrplot)
## corrplot 0.84 loaded
cor(data.frame(GP, W, L, W_PCT))
## GP W L W_PCT
## GP 1.0000000 0.8050370 0.6951741 0.4050292
## W 0.8050370 1.0000000 0.1332065 0.8050411
## L 0.6951741 0.1332065 1.0000000 -0.2988366
## W_PCT 0.4050292 0.8050411 -0.2988366 1.0000000
Games <- cor(data.frame(GP, GP_RANK, W, W_RANK, L, L_RANK, W_PCT, W_PCT_RANK))
corrplot.mixed(Games)
GP, W_PCT 사용
ODN <- cor(data.frame(OFF_RATING, OFF_RATING_RANK, DEF_RATING, DEF_RATING_RANK, NET_RATING, NET_RATING_RANK))
corrplot.mixed(ODN)
NET_RATING 사용
ASS <- cor(data.frame(AST_PCT, AST_PCT_RANK, AST_RATIO, AST_RATIO_RANK, AST_TO, AST_TO_RANK))
corrplot.mixed(ASS)
AST_PCT 사용
REB <- cor(data.frame(OREB_PCT, OREB_PCT_RANK, DREB_PCT, DREB_PCT_RANK, REB_PCT, REB_PCT_RANK))
corrplot.mixed(REB)
REB_PCT 사용
a <- cor(data.frame(TM_TOV_PCT, TM_TOV_PCT_RANK, EFG_PCT, EFG_PCT_RANK, TS_PCT, TS_PCT_RANK, USG_PCT, USG_PCT_RANK))
corrplot.mixed(a)
TM_TOV_PCT, TS_PCT, USG_PCT 사용
b <- cor(data.frame(PACE, PACE_RANK, PIE, PIE_RANK))
corrplot.mixed(b)
PACE, PIE 사용
FG <- cor(data.frame(FGM, FGM_RANK, FGA, FGA_RANK, FGM_PG, FGM_PG_RANK, FGA_PG, FGA_PG_RANK, PTS))
corrplot.mixed(FG)
PTS 사용
TWI <- cor(data.frame(ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(TWI)
둘 다 사용
total <- cor(data.frame(GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(total)
최종 EDA AGE, GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS
set.seed(1234)
x=model.matrix(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba)[,-1]
y=SALARY_MILLIONS
train=sample(1:nrow(x), nrow(x)/2) # 5:5
test=(-train)
y.test=y[test]
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
cv.out.ridge=cv.glmnet(x[train,], y[train], alpha=0)
cv.out.lasso=cv.glmnet(x[train,], y[train], alpha=1)
plot(cv.out.ridge)
plot(cv.out.lasso)
bestlam.ridge=cv.out.ridge$lambda.min
bestlam.ridge
## [1] 0.6472069
bestlam.lasso=cv.out.lasso$lambda.min
bestlam.lasso
## [1] 1.006844
grid=10^seq(10, -2, length=100)
ridge.mod=glmnet(x[train, ], y[train], alpha =0, lambda = grid, thresh=1e-12)
ridge.pred=predict(ridge.mod, s=bestlam.ridge, newx=x[test, ])
ridge.mse=mean((ridge.pred-y.test)^2)
ridge.mse
## [1] 51.45201
plot(ridge.mod)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
ridge <- data.frame(y.act = y.test)
ridge$y.act <- ridge$y.act %>%
as.character() %>%
as.numeric()
ridge$pred <- predict(ridge.mod, s=bestlam.ridge, newx=x[test,]) %>%
as.character() %>%
as.numeric()
# Plot of actual vs. predicted values for the test data set.
ggplot(ridge, aes(jitter(y.act), pred)) +
geom_point() +
geom_smooth(method = "lm", col = "#c41200") +
geom_abline(intercept = 0, col = "#c41200", size = 2) +
xlab("Actual SALARY") +
ylab("Predicted SALARY") +
theme_classic()
ridge.out=glmnet(x, y, alpha=0)
predict(ridge.out, type="coefficients", s=bestlam.ridge)[1:14, ]
## (Intercept) AGE GP
## -7.41587587 0.55385661 0.03363193
## W_PCT NET_RATING AST_PCT
## 0.34412733 -0.02338244 2.69792504
## REB_PCT TM_TOV_PCT TS_PCT
## 14.61482226 -0.30279127 -26.36682101
## USG_PCT PACE PIE
## -29.26856836 0.07906411 46.45292192
## PTS ACTIVE_TWITTER_LAST_YEAR
## 0.72143735 -1.68535458
barplot(ridge.out$beta[,1])
lasso.mod=glmnet(x[train,], y[train], alpha=1, lambda=grid)
lasso.pred=predict(lasso.mod, s=bestlam.lasso, newx=x[test,])
lasso.mse=mean((lasso.pred-y.test)^2)
lasso.mse
## [1] 36.62912
plot(lasso.mod)
lasso <- data.frame(y.act = y.test)
lasso$y.act <- lasso$y.act %>%
as.character() %>%
as.numeric()
lasso$pred <- predict(lasso.mod, s=bestlam.lasso, newx=x[test,]) %>%
as.character() %>%
as.numeric()
ggplot(lasso, aes(jitter(y.act), pred)) +
geom_point() +
geom_smooth(method = "lm", col = "#c41200") +
geom_abline(intercept = 0, col = "#c41200", size = 2) +
xlab("Actual SALARY") +
ylab("Predicted SALARY") +
theme_classic()
lasso.out=glmnet(x, y, alpha=1)
predict(lasso.out, type="coefficients", s=bestlam.lasso)[1:14,]
## (Intercept) AGE GP
## -8.6942491510 0.3829121929 0.0003705212
## W_PCT NET_RATING AST_PCT
## 0.0000000000 0.0000000000 0.0000000000
## REB_PCT TM_TOV_PCT TS_PCT
## 0.0000000000 0.0000000000 0.0000000000
## USG_PCT PACE PIE
## 0.0000000000 0.0000000000 0.0000000000
## PTS ACTIVE_TWITTER_LAST_YEAR
## 0.5961589364 0.0000000000
for (i in 0:10) {
assign(paste("fit", i, sep=""), cv.glmnet(x[train,], y[train], type.measure="mse",
alpha=i/10))
}
yhat0 <- predict(fit0, s=fit0$lambda.min, newx=x[test,])
yhat1 <- predict(fit1, s=fit1$lambda.min, newx=x[test,])
yhat2 <- predict(fit2, s=fit2$lambda.min, newx=x[test,])
yhat3 <- predict(fit3, s=fit3$lambda.min, newx=x[test,])
yhat4 <- predict(fit4, s=fit4$lambda.min, newx=x[test,])
yhat5 <- predict(fit5, s=fit5$lambda.min, newx=x[test,])
yhat6 <- predict(fit6, s=fit6$lambda.min, newx=x[test,])
yhat7 <- predict(fit7, s=fit7$lambda.min, newx=x[test,])
yhat8 <- predict(fit8, s=fit8$lambda.min, newx=x[test,])
yhat9 <- predict(fit9, s=fit9$lambda.min, newx=x[test,])
yhat10 <- predict(fit10, s=fit10$lambda.min, newx=x[test,])
mse0 <- mean((y.test - yhat0)^2)
mse1 <- mean((y.test - yhat1)^2)
mse2 <- mean((y.test - yhat2)^2)
mse3 <- mean((y.test - yhat3)^2)
mse4 <- mean((y.test - yhat4)^2)
mse5 <- mean((y.test - yhat5)^2)
mse6 <- mean((y.test - yhat6)^2)
mse7 <- mean((y.test - yhat7)^2)
mse8 <- mean((y.test - yhat8)^2)
mse9 <- mean((y.test - yhat9)^2)
mse10 <- mean((y.test - yhat10)^2)
mse0;mse1;mse2;mse3;mse4;mse5;mse6;mse7;mse8;mse9;mse10
## [1] 41.85282
## [1] 85.16581
## [1] 37.00045
## [1] 36.51723
## [1] 36.00646
## [1] 36.09369
## [1] 36.341
## [1] 36.33308
## [1] 36.61155
## [1] 36.57271
## [1] 36.63294
alpha가 0.4일때 mse가 제일 낮음
cv.out.elnet=cv.glmnet(x[train,], y[train], alpha=0.4)
plot(cv.out.elnet)
bestlam.elnet=cv.out.elnet$lambda.min
bestlam.elnet
## [1] 1.195832
elnet.mod=glmnet(x[train,], y[train], alpha=0.4, lambda=grid)
elnet.pred=predict(elnet.mod, s=bestlam.elnet, newx=x[test,])
elnet.mse=mean((elnet.pred-y.test)^2)
elnet.mse
## [1] 36.54213
elnet.out=glmnet(x, y, alpha=0.4)
predict(elnet.out, type="coefficients", s=bestlam.elnet)[1:14,]
## (Intercept) AGE GP
## -9.28848464 0.44217458 0.02558698
## W_PCT NET_RATING AST_PCT
## 0.00000000 0.00000000 0.00000000
## REB_PCT TM_TOV_PCT TS_PCT
## 0.00000000 -0.02611677 -7.77994558
## USG_PCT PACE PIE
## 0.00000000 0.00000000 22.17358768
## PTS ACTIVE_TWITTER_LAST_YEAR
## 0.52979309 0.00000000
plot(elnet.mod)
elnet <- data.frame(y.act = y.test)
elnet$y.act <- elnet$y.act %>%
as.character() %>%
as.numeric()
elnet$pred <- predict(elnet.mod, s=bestlam.elnet, newx=x[test,]) %>%
as.character() %>%
as.numeric()
ggplot(elnet, aes(jitter(y.act), pred)) +
geom_point() +
geom_smooth(method = "lm", colour="#000099") +
geom_abline(intercept = 0, colour="#000099", size = 2) +
xlab("Actual SALARY") +
ylab("Elastic Net Predicted SALARY") +
theme_classic()
predict(elnet.out, type="coefficients", s=bestlam.elnet)[1:14, ]
## (Intercept) AGE GP
## -9.28848464 0.44217458 0.02558698
## W_PCT NET_RATING AST_PCT
## 0.00000000 0.00000000 0.00000000
## REB_PCT TM_TOV_PCT TS_PCT
## 0.00000000 -0.02611677 -7.77994558
## USG_PCT PACE PIE
## 0.00000000 0.00000000 22.17358768
## PTS ACTIVE_TWITTER_LAST_YEAR
## 0.52979309 0.00000000
ridge.mse;lasso.mse;elnet.mse
## [1] 51.45201
## [1] 36.62912
## [1] 36.54213
par(mfrow=c(2,3))
plot(cv.out.ridge);plot(cv.out.lasso);plot(cv.out.elnet)
plot(ridge.mod);plot(lasso.mod);plot(elnet.mod)
validation set approach
lm.fit=lm(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba, subset=train)
mean((SALARY_MILLIONS - predict(lm.fit, data=nba))[-train]^2)
## [1] 94.04492
loocv
library(boot)
glm.fit=glm(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba)
cv.err=cv.glm(nba, glm.fit)
cv.err$delta
## [1] 41.92583 41.84471
k-fold
cv.err=cv.glm(nba, glm.fit, K=10)
cv.err$delta
## [1] 45.11905 44.08304