library(csv)
nba <- as.csv('C:/Users/Seol/Desktop/nba.csv')
attach(nba)

결측치 확인

sum(is.na(nba))
## [1] 0

없당~

str(nba)
## 'data.frame':    100 obs. of  55 variables:
##  $ AGE                            : int  28 28 22 24 27 28 32 32 26 22 ...
##  $ GP                             : int  81 35 5 75 81 62 74 61 72 80 ...
##  $ W                              : int  46 16 1 31 54 51 51 43 30 42 ...
##  $ L                              : int  35 19 4 44 27 11 23 18 42 38 ...
##  $ W_PCT                          : num  0.568 0.457 0.2 0.413 0.667 0.823 0.689 0.705 0.417 0.525 ...
##  $ MIN                            : num  34.6 8.4 3.4 36.1 36.4 33.4 37.8 31.5 34.2 35.6 ...
##  $ OFF_RATING                     : num  108 104 124 104 114 ...
##  $ DEF_RATING                     : num  105 102 118 102 107 ...
##  $ NET_RATING                     : num  3.3 1.9 6.3 1.7 6.3 16 7.7 14.9 -2.2 1.5 ...
##  $ AST_PCT                        : num  0.543 0.054 0.3 0.11 0.505 0.218 0.388 0.444 0.262 0.269 ...
##  $ AST_TO                         : num  1.92 0.9 0 0.87 1.95 2.17 2.13 3.83 1.23 1.85 ...
##  $ AST_RATIO                      : num  23.4 5.1 31.1 7.3 27.6 18.4 25.6 35 14.3 19.8 ...
##  $ OREB_PCT                       : num  0.053 0.166 0.091 0.067 0.035 0.023 0.04 0.025 0.072 0.058 ...
##  $ DREB_PCT                       : num  0.279 0.313 0.118 0.269 0.212 0.232 0.209 0.149 0.305 0.226 ...
##  $ REB_PCT                        : num  0.167 0.239 0.103 0.17 0.123 0.137 0.127 0.089 0.188 0.143 ...
##  $ TM_TOV_PCT                     : num  12.2 5.7 0 8.4 14.1 8.5 12 9.1 11.6 10.7 ...
##  $ EFG_PCT                        : num  0.476 0.545 0.875 0.518 0.525 0.594 0.594 0.555 0.498 0.541 ...
##  $ TS_PCT                         : num  0.554 0.606 0.753 0.58 0.613 0.651 0.619 0.614 0.562 0.599 ...
##  $ USG_PCT                        : num  0.408 0.248 0.172 0.326 0.341 0.276 0.297 0.243 0.364 0.283 ...
##  $ PACE                           : num  102.3 97.2 87.5 100.2 103 ...
##  $ PIE                            : num  0.23 0.196 0.194 0.192 0.19 0.186 0.183 0.182 0.178 0.176 ...
##  $ FGM                            : int  824 72 3 770 674 551 736 374 647 656 ...
##  $ FGA                            : int  1941 132 4 1526 1533 1026 1344 785 1432 1259 ...
##  $ FGM_PG                         : num  10.2 2.1 0.6 10.3 8.3 8.9 9.9 6.1 9 8.2 ...
##  $ FGA_PG                         : num  24 3.8 0.8 20.3 18.9 16.5 18.2 12.9 19.9 15.7 ...
##  $ FG_PCT                         : num  0.425 0.545 0.75 0.505 0.44 0.537 0.548 0.476 0.452 0.521 ...
##  $ GP_RANK                        : int  18 365 458 111 18 244 126 253 158 38 ...
##  $ W_RANK                         : int  62 345 464 196 15 21 21 76 209 82 ...
##  $ L_RANK                         : int  330 149 34 419 221 79 190 136 402 367 ...
##  $ W_PCT_RANK                     : int  143 270 468 316 56 14 45 38 315 182 ...
##  $ MIN_RANK                       : int  22 432 475 9 7 40 1 64 25 12 ...
##  $ OFF_RATING_RANK                : int  115 260 2 271 27 9 18 10 152 131 ...
##  $ DEF_RATING_RANK                : int  168 82 475 88 296 57 287 58 374 224 ...
##  $ NET_RATING_RANK                : int  95 139 53 144 55 9 45 14 273 148 ...
##  $ AST_PCT_RANK                   : int  1 412 32 230 2 79 7 4 50 47 ...
##  $ AST_TO_RANK                    : int  140 381 466 387 132 89 98 7 300 150 ...
##  $ AST_RATIO_RANK                 : int  96 467 28 442 48 168 68 15 241 133 ...
##  $ OREB_PCT_RANK                  : int  171 4 81 139 242 320 220 301 127 157 ...
##  $ DREB_PCT_RANK                  : int  22 8 317 27 73 55 81 219 11 57 ...
##  $ REB_PCT_RANK                   : int  53 5 201 48 144 106 125 242 29 95 ...
##  $ TM_TOV_PCT_RANK                : int  358 30 1 129 426 131 347 173 321 265 ...
##  $ EFG_PCT_RANK                   : int  339 100 2 189 167 31 32 84 261 113 ...
##  $ TS_PCT_RANK                    : int  181 52 4 112 40 12 35 39 156 67 ...
##  $ USG_PCT_RANK                   : int  2 63 279 9 7 31 15 72 4 28 ...
##  $ PACE_RANK                      : int  47 328 483 121 34 17 230 243 331 378 ...
##  $ PIE_RANK                       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ FGM_RANK                       : int  1 344 461 3 10 26 4 79 14 13 ...
##  $ FGA_RANK                       : int  1 360 476 5 4 35 19 78 13 23 ...
##  $ FGM_PG_RANK                    : int  2 306 458 1 15 9 3 51 7 20 ...
##  $ FGA_PG_RANK                    : int  1 356 480 3 9 22 15 58 4 29 ...
##  $ FG_PCT_RANK                    : int  293 47 3 95 253 53 45 142 208 77 ...
##  $ SALARY_MILLIONS                : num  26.54 7 1.45 22.12 26.5 ...
##  $ PTS                            : num  31.6 5.5 2 28 29.1 25.1 26.4 18.1 24.4 22.9 ...
##  $ ACTIVE_TWITTER_LAST_YEAR       : int  1 0 1 1 1 1 1 1 1 1 ...
##  $ TWITTER_FOLLOWER_COUNT_MILLIONS: num  4.5 0 0.049 1.22 4.47 16.2 37 6.4 0.826 0.246 ...
##  - attr(*, "source")= chr "C:/Users/Seol/Desktop/nba.csv"

regression

full model

fit1 <- lm(SALARY_MILLIONS ~., data=nba)
summary(fit1)
## 
## Call:
## lm(formula = SALARY_MILLIONS ~ ., data = nba)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.241  -1.882   0.014   2.538  11.345 
## 
## Coefficients: (1 not defined because of singularities)
##                                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                     -1.182e+02  1.970e+02  -0.600  0.55149   
## AGE                              5.371e-01  2.989e-01   1.797  0.07886 . 
## GP                               6.272e-01  7.552e-01   0.830  0.41059   
## W                               -1.001e+00  9.114e-01  -1.099  0.27758   
## L                                       NA         NA      NA       NA   
## W_PCT                            4.667e+01  5.204e+01   0.897  0.37453   
## MIN                             -6.018e-01  1.776e+00  -0.339  0.73627   
## OFF_RATING                       5.475e-01  1.599e+01   0.034  0.97282   
## DEF_RATING                      -7.065e-01  1.602e+01  -0.044  0.96501   
## NET_RATING                      -1.149e+00  1.601e+01  -0.072  0.94308   
## AST_PCT                         -3.787e+01  5.142e+01  -0.737  0.46513   
## AST_TO                           1.045e+01  6.558e+00   1.594  0.11780   
## AST_RATIO                       -1.523e-01  9.142e-01  -0.167  0.86838   
## OREB_PCT                         4.783e+01  2.648e+02   0.181  0.85748   
## DREB_PCT                        -9.906e+00  2.681e+02  -0.037  0.97068   
## REB_PCT                         -8.892e+01  5.452e+02  -0.163  0.87116   
## TM_TOV_PCT                      -2.805e+00  1.912e+00  -1.467  0.14920   
## EFG_PCT                          1.037e+02  1.271e+02   0.816  0.41859   
## TS_PCT                          -2.644e+02  9.760e+01  -2.709  0.00946 **
## USG_PCT                          1.368e+02  1.238e+02   1.105  0.27494   
## PACE                             1.695e+00  1.172e+00   1.446  0.15488   
## PIE                              2.191e+02  1.784e+02   1.228  0.22575   
## FGM                              7.403e-02  1.403e-01   0.528  0.60035   
## FGA                             -4.514e-02  6.057e-02  -0.745  0.45984   
## FGM_PG                          -1.597e+01  1.154e+01  -1.385  0.17286   
## FGA_PG                           5.427e+00  5.052e+00   1.074  0.28833   
## FG_PCT                           9.666e+01  1.056e+02   0.915  0.36500   
## GP_RANK                          1.564e-02  2.531e-02   0.618  0.53968   
## W_RANK                          -6.329e-02  7.112e-02  -0.890  0.37815   
## L_RANK                          -4.147e-02  6.785e-02  -0.611  0.54410   
## W_PCT_RANK                       2.645e-02  4.861e-02   0.544  0.58893   
## MIN_RANK                        -1.721e-01  1.238e-01  -1.390  0.17136   
## OFF_RATING_RANK                  2.058e-02  2.481e-02   0.829  0.41115   
## DEF_RATING_RANK                  3.674e-02  2.948e-02   1.246  0.21898   
## NET_RATING_RANK                 -3.915e-02  2.852e-02  -1.373  0.17652   
## AST_PCT_RANK                    -6.451e-04  5.309e-02  -0.012  0.99036   
## AST_TO_RANK                      3.506e-02  3.919e-02   0.895  0.37566   
## AST_RATIO_RANK                   8.159e-03  4.786e-02   0.170  0.86537   
## OREB_PCT_RANK                    3.412e-02  3.146e-02   1.084  0.28382   
## DREB_PCT_RANK                    2.751e-02  6.605e-02   0.416  0.67901   
## REB_PCT_RANK                    -6.615e-02  8.467e-02  -0.781  0.43859   
## TM_TOV_PCT_RANK                  4.887e-02  4.447e-02   1.099  0.27757   
## EFG_PCT_RANK                    -1.880e-02  4.318e-02  -0.435  0.66535   
## TS_PCT_RANK                     -7.183e-03  3.820e-02  -0.188  0.85167   
## USG_PCT_RANK                     1.151e-02  3.244e-02   0.355  0.72432   
## PACE_RANK                        1.190e-02  2.341e-02   0.508  0.61373   
## PIE_RANK                         1.813e-03  8.783e-02   0.021  0.98362   
## FGM_RANK                         3.025e-01  1.525e-01   1.984  0.05324 . 
## FGA_RANK                        -3.080e-01  1.320e-01  -2.333  0.02407 * 
## FGM_PG_RANK                     -1.694e-01  1.272e-01  -1.332  0.18936   
## FGA_PG_RANK                      2.238e-01  1.418e-01   1.578  0.12135   
## FG_PCT_RANK                     -1.167e-02  4.537e-02  -0.257  0.79807   
## PTS                              7.289e-01  8.817e-01   0.827  0.41266   
## ACTIVE_TWITTER_LAST_YEAR        -1.554e+00  2.982e+00  -0.521  0.60477   
## TWITTER_FOLLOWER_COUNT_MILLIONS  4.660e-01  2.176e-01   2.142  0.03753 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.623 on 46 degrees of freedom
## Multiple R-squared:  0.8098, Adjusted R-squared:  0.5906 
## F-statistic: 3.695 on 53 and 46 DF,  p-value: 6.858e-06

망함 아무래도 그냥 쓰면 안될 것 같다.

EDA

비슷한 친구들끼리 묶어서 상관관계 확인

library(corrplot)
## corrplot 0.84 loaded
cor(data.frame(GP, W, L, W_PCT))
##              GP         W          L      W_PCT
## GP    1.0000000 0.8050370  0.6951741  0.4050292
## W     0.8050370 1.0000000  0.1332065  0.8050411
## L     0.6951741 0.1332065  1.0000000 -0.2988366
## W_PCT 0.4050292 0.8050411 -0.2988366  1.0000000
Games <- cor(data.frame(GP, GP_RANK, W, W_RANK, L, L_RANK, W_PCT, W_PCT_RANK))
corrplot.mixed(Games)

GP, W_PCT 사용

ODN <- cor(data.frame(OFF_RATING, OFF_RATING_RANK, DEF_RATING, DEF_RATING_RANK, NET_RATING, NET_RATING_RANK))
corrplot.mixed(ODN)

NET_RATING 사용

ASS <- cor(data.frame(AST_PCT, AST_PCT_RANK, AST_RATIO, AST_RATIO_RANK, AST_TO, AST_TO_RANK))
corrplot.mixed(ASS)

AST_PCT 사용

REB <- cor(data.frame(OREB_PCT, OREB_PCT_RANK, DREB_PCT, DREB_PCT_RANK, REB_PCT, REB_PCT_RANK))
corrplot.mixed(REB)

REB_PCT 사용

a <- cor(data.frame(TM_TOV_PCT, TM_TOV_PCT_RANK, EFG_PCT, EFG_PCT_RANK, TS_PCT, TS_PCT_RANK, USG_PCT, USG_PCT_RANK))
corrplot.mixed(a)

TM_TOV_PCT, TS_PCT, USG_PCT 사용

b <- cor(data.frame(PACE, PACE_RANK, PIE, PIE_RANK))
corrplot.mixed(b)

PACE, PIE 사용

FG <- cor(data.frame(FGM, FGM_RANK, FGA, FGA_RANK, FGM_PG, FGM_PG_RANK, FGA_PG, FGA_PG_RANK, PTS))
corrplot.mixed(FG)

PTS 사용

TWI <- cor(data.frame(ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(TWI)

둘 다 사용

total <- cor(data.frame(GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(total)

최종 EDA AGE, GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS

선택된 변수로 다시 회귀

fit2 <- lm(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba)
summary(fit2)
## 
## Call:
## lm(formula = SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + 
##     AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + 
##     PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, 
##     data = nba)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.2796  -3.7633   0.3473   3.7408  11.6711 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       2.666980  23.673551   0.113 0.910569    
## AGE                               0.623547   0.170165   3.664 0.000431 ***
## GP                               -0.001231   0.040370  -0.030 0.975748    
## W_PCT                             1.939250   5.780638   0.335 0.738095    
## NET_RATING                       -0.075385   0.131682  -0.572 0.568509    
## AST_PCT                           5.632040   9.186827   0.613 0.541476    
## REB_PCT                          27.917467  23.711703   1.177 0.242334    
## TM_TOV_PCT                       -0.406335   0.220385  -1.844 0.068704 .  
## TS_PCT                          -38.026170  17.052671  -2.230 0.028388 *  
## USG_PCT                         -67.486239  24.123996  -2.797 0.006369 ** 
## PACE                              0.074181   0.202010   0.367 0.714373    
## PIE                              45.013559  57.589344   0.782 0.436604    
## PTS                               1.096793   0.195880   5.599 2.59e-07 ***
## ACTIVE_TWITTER_LAST_YEAR         -2.587400   2.412586  -1.072 0.286549    
## TWITTER_FOLLOWER_COUNT_MILLIONS   0.327831   0.160930   2.037 0.044751 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.809 on 85 degrees of freedom
## Multiple R-squared:  0.625,  Adjusted R-squared:  0.5632 
## F-statistic: 10.12 on 14 and 85 DF,  p-value: 6.696e-13

망했다.

p-value만 보고 변수를 선택하면 안된다고 배웠지만 이미 망했으니까 계속 해보자.

fit3 <- lm(SALARY_MILLIONS ~ AGE + TS_PCT + USG_PCT + PTS + TWITTER_FOLLOWER_COUNT_MILLIONS,data=nba)
summary(fit3)
## 
## Call:
## lm(formula = SALARY_MILLIONS ~ AGE + TS_PCT + USG_PCT + PTS + 
##     TWITTER_FOLLOWER_COUNT_MILLIONS, data = nba)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.622  -4.374   0.079   4.361  12.819 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       4.4839    11.1064   0.404 0.687332    
## AGE                               0.5723     0.1590   3.600 0.000511 ***
## TS_PCT                          -24.1342    13.1209  -1.839 0.069018 .  
## USG_PCT                         -41.3052    19.7657  -2.090 0.039341 *  
## PTS                               0.9497     0.1501   6.326 8.44e-09 ***
## TWITTER_FOLLOWER_COUNT_MILLIONS   0.3960     0.1522   2.602 0.010757 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.88 on 94 degrees of freedom
## Multiple R-squared:  0.5751, Adjusted R-squared:  0.5525 
## F-statistic: 25.44 on 5 and 94 DF,  p-value: 3.815e-16

adjr2 55%, full model 59%랑 비교했을 때 크게 나쁘지 않음.

fit <- fit3
par(mfrow=c(2,2))
plot(fit)

망한것같다..

정규성검정

library(car)
## Loading required package: carData
qqPlot(fit)

## [1] 18 41

모든 값이 95% 신뢰구간 안에 있다. 정규성 만족.

독립성검정

durbinWatsonTest(fit)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.08088411      2.126077   0.634
##  Alternative hypothesis: rho != 0

durbinWatsonTest는 D-W Statistic이 0

등분산성

ncvTest(fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1.405846, Df = 1, p = 0.23575
spreadLevelPlot(fit)
## Warning in spreadLevelPlot.lm(fit): 
## 3 negative fitted values removed

## 
## Suggested power transformation:  0.6766045

Suggested power transformation 값은 일정하지 않은 오차의 분산을 안정화시키기 위해 필요한 power transformation 값을 제시해준다. power transformation이 0.67라는 말은 0.67승 하면 만족된다고.. 1에 가까울 수록 잘 만족함.

선형모델 가정 검증

library(gvlma)
summary(gvlma(fit))
## 
## Call:
## lm(formula = SALARY_MILLIONS ~ AGE + TS_PCT + USG_PCT + PTS + 
##     TWITTER_FOLLOWER_COUNT_MILLIONS, data = nba)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.622  -4.374   0.079   4.361  12.819 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       4.4839    11.1064   0.404 0.687332    
## AGE                               0.5723     0.1590   3.600 0.000511 ***
## TS_PCT                          -24.1342    13.1209  -1.839 0.069018 .  
## USG_PCT                         -41.3052    19.7657  -2.090 0.039341 *  
## PTS                               0.9497     0.1501   6.326 8.44e-09 ***
## TWITTER_FOLLOWER_COUNT_MILLIONS   0.3960     0.1522   2.602 0.010757 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.88 on 94 degrees of freedom
## Multiple R-squared:  0.5751, Adjusted R-squared:  0.5525 
## F-statistic: 25.44 on 5 and 94 DF,  p-value: 3.815e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = fit) 
## 
##                     Value p-value                Decision
## Global Stat        4.8228  0.3060 Assumptions acceptable.
## Skewness           0.1556  0.6932 Assumptions acceptable.
## Kurtosis           1.3310  0.2486 Assumptions acceptable.
## Link Function      1.3977  0.2371 Assumptions acceptable.
## Heteroscedasticity 1.9384  0.1638 Assumptions acceptable.

만족

vif

vif(fit)
##                             AGE                          TS_PCT 
##                        1.120628                        1.283693 
##                         USG_PCT                             PTS 
##                        3.668291                        3.456934 
## TWITTER_FOLLOWER_COUNT_MILLIONS 
##                        1.252105
sqrt(vif(fit))>2
##                             AGE                          TS_PCT 
##                           FALSE                           FALSE 
##                         USG_PCT                             PTS 
##                           FALSE                           FALSE 
## TWITTER_FOLLOWER_COUNT_MILLIONS 
##                           FALSE

당연히 만족.. 왜냐하면 변수를 죄다 지웠기 때문이다. ㅠ

outliertest

outlierTest(fit)
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##    rstudent unadjusted p-value Bonferonni p
## 41 2.295775           0.023936           NA
influencePlot(fit, main="Influence Plot",
                 sub="Circle size is proportional to Cook’s distance")

##      StudRes       Hat      CookD
## 3   1.467479 0.1864834 0.08127728
## 7  -1.383895 0.7108834 0.77727126
## 18 -2.288337 0.0803073 0.07292155
## 41  2.295775 0.0571591 0.05093997
## 79  1.098466 0.2096242 0.05322013

studRes>2 : outlier hat > 2 : high leverage points cookD : influential obs

그냥 이런것도 있길래.. 상대적 중요도

relweights <- function(fit,...){
    R <- cor(fit$model)
    nvar <- ncol(R)
    rxx <- R[2:nvar, 2:nvar]
    rxy <- R[2:nvar, 1]
    svd <- eigen(rxx)
    evec <- svd$vectors
    ev <- svd$values
    delta <- diag(sqrt(ev))
    lambda <- evec %*% delta %*% t(evec)
    lambdasq <- lambda ^ 2
    beta <- solve(lambda) %*% rxy
    rsquare <- colSums(beta ^ 2)
    rawwgt <- lambdasq %*% beta ^ 2
    import <- (rawwgt / rsquare) * 100
    import <- as.data.frame(import)
    row.names(import) <- names(fit$model[2:nvar])
    names(import) <- "Weights"
    import <- import[order(import),1, drop=FALSE]
    dotchart(import$Weights, labels=row.names(import),
                 xlab="% of R-Square", pch=19,
                 main="Relative Importance of Predictor Variables",
                 sub=paste("Total R-Square=", round(rsquare, digits=3)),
                 ...)
    return(import)
}    
relweights(fit)

##                                   Weights
## TS_PCT                           3.185345
## AGE                             12.931895
## USG_PCT                         17.156108
## TWITTER_FOLLOWER_COUNT_MILLIONS 18.116434
## PTS                             48.610218

5개의 변수로 모델 57%를 설명한다. 이중 PTS가 49%, TWITTER가 18%, USG_PCT가 17% 설명하는 상대적 중요도를 보여줌 회귀계수보다 직관적이라고 한다..

이제 회귀 그만하고

variable selection

nba <- subset(nba, select=c(AGE, GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))

best subset

library(leaps)
regfit.full <- regsubsets(SALARY_MILLIONS ~ ., data = nba, nvmax=14)
full.summary <- summary(regfit.full)
names(full.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

coef 개수 확인

which.max(full.summary$rss);which.max(full.summary$adjr2);which.min(full.summary$cp);which.min(full.summary$bic)
## [1] 1
## [1] 9
## [1] 7
## [1] 3
library(ggvis)
rsq <- as.data.frame(full.summary$rsq)
names(rsq) <- "R2"
rsq %>% 
        ggvis(x=~ c(1:nrow(rsq)), y=~R2 ) %>%
        layer_points(fill = ~ R2 ) %>%
        add_axis("y", title = "R2") %>% 
        add_axis("x", title = "Number of variables")

변수 9개부터는 크게 의미 없어보임.

par(mfrow=c(2,2))
plot(full.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(full.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
# which.max(full.summary$adjr2)
points(9,full.summary$adjr2[9], col="red",cex=2,pch=20)
plot(full.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(full.summary$cp )
points(7,full.summary$cp[7],col="red",cex=2,pch=20)
plot(full.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(full.summary$bic )
points(3,full.summary$bic[3],col="red",cex=2,pch=20)

plot(regfit.full, scale="adjr2")

plot(regfit.full, scale="Cp")

plot(regfit.full, scale='bic')

forward

library(leaps)
regfit.fwd <- regsubsets(SALARY_MILLIONS ~ ., data = nba, nvmax=14, method = 'forward')
fwd.summary <- summary(regfit.fwd)
names(fwd.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

coef 개수 확인

which.max(fwd.summary$rss);which.max(fwd.summary$adjr2);which.min(fwd.summary$cp);which.min(fwd.summary$bic)
## [1] 1
## [1] 8
## [1] 7
## [1] 3
library(ggvis)
rsq <- as.data.frame(fwd.summary$rsq)
names(rsq) <- "R2"
rsq %>% 
        ggvis(x=~ c(1:nrow(rsq)), y=~R2 ) %>%
        layer_points(fill = ~ R2 ) %>%
        add_axis("y", title = "R2") %>% 
        add_axis("x", title = "Number of variables")

나머지도 그려봄

par(mfrow=c(2,2))
plot(fwd.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(fwd.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
# which.max(fwd.summary$adjr2)
points(8,fwd.summary$adjr2[8], col="red",cex=2,pch=20)
plot(fwd.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(fwd.summary$cp )
points(7,fwd.summary$cp[7],col="red",cex=2,pch=20)
plot(fwd.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(fwd.summary$bic )
points(3,fwd.summary$bic[3],col="red",cex=2,pch=20)

plot(regfit.fwd, scale="adjr2")

plot(regfit.fwd, scale="Cp")

plot(regfit.fwd, scale='bic')

backward

regfit.bwd <- regsubsets(SALARY_MILLIONS ~ ., data = nba, nvmax=14, method = 'backward')
bwd.summary <- summary(regfit.bwd)
names(bwd.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
which.max(bwd.summary$rsq);which.max(bwd.summary$adjr2);which.min(bwd.summary$cp);which.min(bwd.summary$bic)
## [1] 14
## [1] 9
## [1] 7
## [1] 3
library(ggvis)
rsq <- as.data.frame(bwd.summary$rsq)
names(rsq) <- "R2"
rsq %>% 
        ggvis(x=~ c(1:nrow(rsq)), y=~R2 ) %>%
        layer_points(fill = ~ R2 ) %>%
        add_axis("y", title = "R2") %>% 
        add_axis("x", title = "Number of variables")
par(mfrow=c(2,2))
plot(bwd.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(bwd.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
# which.max(bwd.summary$adjr2)
points(9,bwd.summary$adjr2[9], col="red",cex=2,pch=20)
plot(bwd.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(bwd.summary$cp )
points(7,bwd.summary$cp[7],col="red",cex=2,pch=20)
plot(bwd.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(reg.summary$bic )
points(3,bwd.summary$bic[3],col="red",cex=2,pch=20)

plot(regfit.bwd, scale="adjr2")

plot(regfit.bwd, scale="Cp")

plot(regfit.bwd, scale='bic')

cp, bic로 선택된 7개, 3개에 대해 모델을 만들자.

7개

coef(regfit.full, 7);coef(regfit.fwd, 7);coef(regfit.bwd, 7)
##                     (Intercept)                             AGE 
##                       4.7601333                       0.6517193 
##                         REB_PCT                      TM_TOV_PCT 
##                      29.9423717                      -0.3902498 
##                          TS_PCT                         USG_PCT 
##                     -29.3459549                     -47.2186728 
##                             PTS TWITTER_FOLLOWER_COUNT_MILLIONS 
##                       1.0810269                       0.3616331
##                     (Intercept)                             AGE 
##                       4.7601333                       0.6517193 
##                         REB_PCT                      TM_TOV_PCT 
##                      29.9423717                      -0.3902498 
##                          TS_PCT                         USG_PCT 
##                     -29.3459549                     -47.2186728 
##                             PTS TWITTER_FOLLOWER_COUNT_MILLIONS 
##                       1.0810269                       0.3616331
##                     (Intercept)                             AGE 
##                       4.7601333                       0.6517193 
##                         REB_PCT                      TM_TOV_PCT 
##                      29.9423717                      -0.3902498 
##                          TS_PCT                         USG_PCT 
##                     -29.3459549                     -47.2186728 
##                             PTS TWITTER_FOLLOWER_COUNT_MILLIONS 
##                       1.0810269                       0.3616331

3개

coef(regfit.full, 3);coef(regfit.fwd, 3);coef(regfit.bwd, 3)
##                     (Intercept)                             AGE 
##                     -17.0889133                       0.6161403 
##                             PTS TWITTER_FOLLOWER_COUNT_MILLIONS 
##                       0.7165846                       0.3662290
##                     (Intercept)                             AGE 
##                     -17.0889133                       0.6161403 
##                             PTS TWITTER_FOLLOWER_COUNT_MILLIONS 
##                       0.7165846                       0.3662290
##                     (Intercept)                             AGE 
##                     -17.0889133                       0.6161403 
##                             PTS TWITTER_FOLLOWER_COUNT_MILLIONS 
##                       0.7165846                       0.3662290

Validation은 다음주에.

결론 : eda를 잘하자