library(csv)
nba <- as.csv('C:/Users/Seol/Desktop/nba.csv')
attach(nba)
결측치 확인
sum(is.na(nba))
## [1] 0
없당~
str(nba)
## 'data.frame': 100 obs. of 55 variables:
## $ AGE : int 28 28 22 24 27 28 32 32 26 22 ...
## $ GP : int 81 35 5 75 81 62 74 61 72 80 ...
## $ W : int 46 16 1 31 54 51 51 43 30 42 ...
## $ L : int 35 19 4 44 27 11 23 18 42 38 ...
## $ W_PCT : num 0.568 0.457 0.2 0.413 0.667 0.823 0.689 0.705 0.417 0.525 ...
## $ MIN : num 34.6 8.4 3.4 36.1 36.4 33.4 37.8 31.5 34.2 35.6 ...
## $ OFF_RATING : num 108 104 124 104 114 ...
## $ DEF_RATING : num 105 102 118 102 107 ...
## $ NET_RATING : num 3.3 1.9 6.3 1.7 6.3 16 7.7 14.9 -2.2 1.5 ...
## $ AST_PCT : num 0.543 0.054 0.3 0.11 0.505 0.218 0.388 0.444 0.262 0.269 ...
## $ AST_TO : num 1.92 0.9 0 0.87 1.95 2.17 2.13 3.83 1.23 1.85 ...
## $ AST_RATIO : num 23.4 5.1 31.1 7.3 27.6 18.4 25.6 35 14.3 19.8 ...
## $ OREB_PCT : num 0.053 0.166 0.091 0.067 0.035 0.023 0.04 0.025 0.072 0.058 ...
## $ DREB_PCT : num 0.279 0.313 0.118 0.269 0.212 0.232 0.209 0.149 0.305 0.226 ...
## $ REB_PCT : num 0.167 0.239 0.103 0.17 0.123 0.137 0.127 0.089 0.188 0.143 ...
## $ TM_TOV_PCT : num 12.2 5.7 0 8.4 14.1 8.5 12 9.1 11.6 10.7 ...
## $ EFG_PCT : num 0.476 0.545 0.875 0.518 0.525 0.594 0.594 0.555 0.498 0.541 ...
## $ TS_PCT : num 0.554 0.606 0.753 0.58 0.613 0.651 0.619 0.614 0.562 0.599 ...
## $ USG_PCT : num 0.408 0.248 0.172 0.326 0.341 0.276 0.297 0.243 0.364 0.283 ...
## $ PACE : num 102.3 97.2 87.5 100.2 103 ...
## $ PIE : num 0.23 0.196 0.194 0.192 0.19 0.186 0.183 0.182 0.178 0.176 ...
## $ FGM : int 824 72 3 770 674 551 736 374 647 656 ...
## $ FGA : int 1941 132 4 1526 1533 1026 1344 785 1432 1259 ...
## $ FGM_PG : num 10.2 2.1 0.6 10.3 8.3 8.9 9.9 6.1 9 8.2 ...
## $ FGA_PG : num 24 3.8 0.8 20.3 18.9 16.5 18.2 12.9 19.9 15.7 ...
## $ FG_PCT : num 0.425 0.545 0.75 0.505 0.44 0.537 0.548 0.476 0.452 0.521 ...
## $ GP_RANK : int 18 365 458 111 18 244 126 253 158 38 ...
## $ W_RANK : int 62 345 464 196 15 21 21 76 209 82 ...
## $ L_RANK : int 330 149 34 419 221 79 190 136 402 367 ...
## $ W_PCT_RANK : int 143 270 468 316 56 14 45 38 315 182 ...
## $ MIN_RANK : int 22 432 475 9 7 40 1 64 25 12 ...
## $ OFF_RATING_RANK : int 115 260 2 271 27 9 18 10 152 131 ...
## $ DEF_RATING_RANK : int 168 82 475 88 296 57 287 58 374 224 ...
## $ NET_RATING_RANK : int 95 139 53 144 55 9 45 14 273 148 ...
## $ AST_PCT_RANK : int 1 412 32 230 2 79 7 4 50 47 ...
## $ AST_TO_RANK : int 140 381 466 387 132 89 98 7 300 150 ...
## $ AST_RATIO_RANK : int 96 467 28 442 48 168 68 15 241 133 ...
## $ OREB_PCT_RANK : int 171 4 81 139 242 320 220 301 127 157 ...
## $ DREB_PCT_RANK : int 22 8 317 27 73 55 81 219 11 57 ...
## $ REB_PCT_RANK : int 53 5 201 48 144 106 125 242 29 95 ...
## $ TM_TOV_PCT_RANK : int 358 30 1 129 426 131 347 173 321 265 ...
## $ EFG_PCT_RANK : int 339 100 2 189 167 31 32 84 261 113 ...
## $ TS_PCT_RANK : int 181 52 4 112 40 12 35 39 156 67 ...
## $ USG_PCT_RANK : int 2 63 279 9 7 31 15 72 4 28 ...
## $ PACE_RANK : int 47 328 483 121 34 17 230 243 331 378 ...
## $ PIE_RANK : int 1 2 3 4 5 6 7 8 9 10 ...
## $ FGM_RANK : int 1 344 461 3 10 26 4 79 14 13 ...
## $ FGA_RANK : int 1 360 476 5 4 35 19 78 13 23 ...
## $ FGM_PG_RANK : int 2 306 458 1 15 9 3 51 7 20 ...
## $ FGA_PG_RANK : int 1 356 480 3 9 22 15 58 4 29 ...
## $ FG_PCT_RANK : int 293 47 3 95 253 53 45 142 208 77 ...
## $ SALARY_MILLIONS : num 26.54 7 1.45 22.12 26.5 ...
## $ PTS : num 31.6 5.5 2 28 29.1 25.1 26.4 18.1 24.4 22.9 ...
## $ ACTIVE_TWITTER_LAST_YEAR : int 1 0 1 1 1 1 1 1 1 1 ...
## $ TWITTER_FOLLOWER_COUNT_MILLIONS: num 4.5 0 0.049 1.22 4.47 16.2 37 6.4 0.826 0.246 ...
## - attr(*, "source")= chr "C:/Users/Seol/Desktop/nba.csv"
fit1 <- lm(SALARY_MILLIONS ~., data=nba)
summary(fit1)
##
## Call:
## lm(formula = SALARY_MILLIONS ~ ., data = nba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.241 -1.882 0.014 2.538 11.345
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.182e+02 1.970e+02 -0.600 0.55149
## AGE 5.371e-01 2.989e-01 1.797 0.07886 .
## GP 6.272e-01 7.552e-01 0.830 0.41059
## W -1.001e+00 9.114e-01 -1.099 0.27758
## L NA NA NA NA
## W_PCT 4.667e+01 5.204e+01 0.897 0.37453
## MIN -6.018e-01 1.776e+00 -0.339 0.73627
## OFF_RATING 5.475e-01 1.599e+01 0.034 0.97282
## DEF_RATING -7.065e-01 1.602e+01 -0.044 0.96501
## NET_RATING -1.149e+00 1.601e+01 -0.072 0.94308
## AST_PCT -3.787e+01 5.142e+01 -0.737 0.46513
## AST_TO 1.045e+01 6.558e+00 1.594 0.11780
## AST_RATIO -1.523e-01 9.142e-01 -0.167 0.86838
## OREB_PCT 4.783e+01 2.648e+02 0.181 0.85748
## DREB_PCT -9.906e+00 2.681e+02 -0.037 0.97068
## REB_PCT -8.892e+01 5.452e+02 -0.163 0.87116
## TM_TOV_PCT -2.805e+00 1.912e+00 -1.467 0.14920
## EFG_PCT 1.037e+02 1.271e+02 0.816 0.41859
## TS_PCT -2.644e+02 9.760e+01 -2.709 0.00946 **
## USG_PCT 1.368e+02 1.238e+02 1.105 0.27494
## PACE 1.695e+00 1.172e+00 1.446 0.15488
## PIE 2.191e+02 1.784e+02 1.228 0.22575
## FGM 7.403e-02 1.403e-01 0.528 0.60035
## FGA -4.514e-02 6.057e-02 -0.745 0.45984
## FGM_PG -1.597e+01 1.154e+01 -1.385 0.17286
## FGA_PG 5.427e+00 5.052e+00 1.074 0.28833
## FG_PCT 9.666e+01 1.056e+02 0.915 0.36500
## GP_RANK 1.564e-02 2.531e-02 0.618 0.53968
## W_RANK -6.329e-02 7.112e-02 -0.890 0.37815
## L_RANK -4.147e-02 6.785e-02 -0.611 0.54410
## W_PCT_RANK 2.645e-02 4.861e-02 0.544 0.58893
## MIN_RANK -1.721e-01 1.238e-01 -1.390 0.17136
## OFF_RATING_RANK 2.058e-02 2.481e-02 0.829 0.41115
## DEF_RATING_RANK 3.674e-02 2.948e-02 1.246 0.21898
## NET_RATING_RANK -3.915e-02 2.852e-02 -1.373 0.17652
## AST_PCT_RANK -6.451e-04 5.309e-02 -0.012 0.99036
## AST_TO_RANK 3.506e-02 3.919e-02 0.895 0.37566
## AST_RATIO_RANK 8.159e-03 4.786e-02 0.170 0.86537
## OREB_PCT_RANK 3.412e-02 3.146e-02 1.084 0.28382
## DREB_PCT_RANK 2.751e-02 6.605e-02 0.416 0.67901
## REB_PCT_RANK -6.615e-02 8.467e-02 -0.781 0.43859
## TM_TOV_PCT_RANK 4.887e-02 4.447e-02 1.099 0.27757
## EFG_PCT_RANK -1.880e-02 4.318e-02 -0.435 0.66535
## TS_PCT_RANK -7.183e-03 3.820e-02 -0.188 0.85167
## USG_PCT_RANK 1.151e-02 3.244e-02 0.355 0.72432
## PACE_RANK 1.190e-02 2.341e-02 0.508 0.61373
## PIE_RANK 1.813e-03 8.783e-02 0.021 0.98362
## FGM_RANK 3.025e-01 1.525e-01 1.984 0.05324 .
## FGA_RANK -3.080e-01 1.320e-01 -2.333 0.02407 *
## FGM_PG_RANK -1.694e-01 1.272e-01 -1.332 0.18936
## FGA_PG_RANK 2.238e-01 1.418e-01 1.578 0.12135
## FG_PCT_RANK -1.167e-02 4.537e-02 -0.257 0.79807
## PTS 7.289e-01 8.817e-01 0.827 0.41266
## ACTIVE_TWITTER_LAST_YEAR -1.554e+00 2.982e+00 -0.521 0.60477
## TWITTER_FOLLOWER_COUNT_MILLIONS 4.660e-01 2.176e-01 2.142 0.03753 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.623 on 46 degrees of freedom
## Multiple R-squared: 0.8098, Adjusted R-squared: 0.5906
## F-statistic: 3.695 on 53 and 46 DF, p-value: 6.858e-06
망함 아무래도 그냥 쓰면 안될 것 같다.
비슷한 친구들끼리 묶어서 상관관계 확인
library(corrplot)
## corrplot 0.84 loaded
cor(data.frame(GP, W, L, W_PCT))
## GP W L W_PCT
## GP 1.0000000 0.8050370 0.6951741 0.4050292
## W 0.8050370 1.0000000 0.1332065 0.8050411
## L 0.6951741 0.1332065 1.0000000 -0.2988366
## W_PCT 0.4050292 0.8050411 -0.2988366 1.0000000
Games <- cor(data.frame(GP, GP_RANK, W, W_RANK, L, L_RANK, W_PCT, W_PCT_RANK))
corrplot.mixed(Games)
GP, W_PCT 사용
ODN <- cor(data.frame(OFF_RATING, OFF_RATING_RANK, DEF_RATING, DEF_RATING_RANK, NET_RATING, NET_RATING_RANK))
corrplot.mixed(ODN)
NET_RATING 사용
ASS <- cor(data.frame(AST_PCT, AST_PCT_RANK, AST_RATIO, AST_RATIO_RANK, AST_TO, AST_TO_RANK))
corrplot.mixed(ASS)
AST_PCT 사용
REB <- cor(data.frame(OREB_PCT, OREB_PCT_RANK, DREB_PCT, DREB_PCT_RANK, REB_PCT, REB_PCT_RANK))
corrplot.mixed(REB)
REB_PCT 사용
a <- cor(data.frame(TM_TOV_PCT, TM_TOV_PCT_RANK, EFG_PCT, EFG_PCT_RANK, TS_PCT, TS_PCT_RANK, USG_PCT, USG_PCT_RANK))
corrplot.mixed(a)
TM_TOV_PCT, TS_PCT, USG_PCT 사용
b <- cor(data.frame(PACE, PACE_RANK, PIE, PIE_RANK))
corrplot.mixed(b)
PACE, PIE 사용
FG <- cor(data.frame(FGM, FGM_RANK, FGA, FGA_RANK, FGM_PG, FGM_PG_RANK, FGA_PG, FGA_PG_RANK, PTS))
corrplot.mixed(FG)
PTS 사용
TWI <- cor(data.frame(ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(TWI)
둘 다 사용
total <- cor(data.frame(GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
corrplot.mixed(total)
최종 EDA AGE, GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS
선택된 변수로 다시 회귀
fit2 <- lm(SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING + AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE + PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS, data=nba)
summary(fit2)
##
## Call:
## lm(formula = SALARY_MILLIONS ~ AGE + GP + W_PCT + NET_RATING +
## AST_PCT + REB_PCT + TM_TOV_PCT + TS_PCT + USG_PCT + PACE +
## PIE + PTS + ACTIVE_TWITTER_LAST_YEAR + TWITTER_FOLLOWER_COUNT_MILLIONS,
## data = nba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2796 -3.7633 0.3473 3.7408 11.6711
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.666980 23.673551 0.113 0.910569
## AGE 0.623547 0.170165 3.664 0.000431 ***
## GP -0.001231 0.040370 -0.030 0.975748
## W_PCT 1.939250 5.780638 0.335 0.738095
## NET_RATING -0.075385 0.131682 -0.572 0.568509
## AST_PCT 5.632040 9.186827 0.613 0.541476
## REB_PCT 27.917467 23.711703 1.177 0.242334
## TM_TOV_PCT -0.406335 0.220385 -1.844 0.068704 .
## TS_PCT -38.026170 17.052671 -2.230 0.028388 *
## USG_PCT -67.486239 24.123996 -2.797 0.006369 **
## PACE 0.074181 0.202010 0.367 0.714373
## PIE 45.013559 57.589344 0.782 0.436604
## PTS 1.096793 0.195880 5.599 2.59e-07 ***
## ACTIVE_TWITTER_LAST_YEAR -2.587400 2.412586 -1.072 0.286549
## TWITTER_FOLLOWER_COUNT_MILLIONS 0.327831 0.160930 2.037 0.044751 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.809 on 85 degrees of freedom
## Multiple R-squared: 0.625, Adjusted R-squared: 0.5632
## F-statistic: 10.12 on 14 and 85 DF, p-value: 6.696e-13
망했다.
fit3 <- lm(SALARY_MILLIONS ~ AGE + TS_PCT + USG_PCT + PTS + TWITTER_FOLLOWER_COUNT_MILLIONS,data=nba)
summary(fit3)
##
## Call:
## lm(formula = SALARY_MILLIONS ~ AGE + TS_PCT + USG_PCT + PTS +
## TWITTER_FOLLOWER_COUNT_MILLIONS, data = nba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.622 -4.374 0.079 4.361 12.819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.4839 11.1064 0.404 0.687332
## AGE 0.5723 0.1590 3.600 0.000511 ***
## TS_PCT -24.1342 13.1209 -1.839 0.069018 .
## USG_PCT -41.3052 19.7657 -2.090 0.039341 *
## PTS 0.9497 0.1501 6.326 8.44e-09 ***
## TWITTER_FOLLOWER_COUNT_MILLIONS 0.3960 0.1522 2.602 0.010757 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.88 on 94 degrees of freedom
## Multiple R-squared: 0.5751, Adjusted R-squared: 0.5525
## F-statistic: 25.44 on 5 and 94 DF, p-value: 3.815e-16
adjr2 55%, full model 59%랑 비교했을 때 크게 나쁘지 않음.
fit <- fit3
par(mfrow=c(2,2))
plot(fit)
망한것같다..
정규성검정
library(car)
## Loading required package: carData
qqPlot(fit)
## [1] 18 41
모든 값이 95% 신뢰구간 안에 있다. 정규성 만족.
독립성검정
durbinWatsonTest(fit)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.08088411 2.126077 0.634
## Alternative hypothesis: rho != 0
durbinWatsonTest는 D-W Statistic이 0
등분산성
ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1.405846, Df = 1, p = 0.23575
spreadLevelPlot(fit)
## Warning in spreadLevelPlot.lm(fit):
## 3 negative fitted values removed
##
## Suggested power transformation: 0.6766045
Suggested power transformation 값은 일정하지 않은 오차의 분산을 안정화시키기 위해 필요한 power transformation 값을 제시해준다. power transformation이 0.67라는 말은 0.67승 하면 만족된다고.. 1에 가까울 수록 잘 만족함.
선형모델 가정 검증
library(gvlma)
summary(gvlma(fit))
##
## Call:
## lm(formula = SALARY_MILLIONS ~ AGE + TS_PCT + USG_PCT + PTS +
## TWITTER_FOLLOWER_COUNT_MILLIONS, data = nba)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.622 -4.374 0.079 4.361 12.819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.4839 11.1064 0.404 0.687332
## AGE 0.5723 0.1590 3.600 0.000511 ***
## TS_PCT -24.1342 13.1209 -1.839 0.069018 .
## USG_PCT -41.3052 19.7657 -2.090 0.039341 *
## PTS 0.9497 0.1501 6.326 8.44e-09 ***
## TWITTER_FOLLOWER_COUNT_MILLIONS 0.3960 0.1522 2.602 0.010757 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.88 on 94 degrees of freedom
## Multiple R-squared: 0.5751, Adjusted R-squared: 0.5525
## F-statistic: 25.44 on 5 and 94 DF, p-value: 3.815e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = fit)
##
## Value p-value Decision
## Global Stat 4.8228 0.3060 Assumptions acceptable.
## Skewness 0.1556 0.6932 Assumptions acceptable.
## Kurtosis 1.3310 0.2486 Assumptions acceptable.
## Link Function 1.3977 0.2371 Assumptions acceptable.
## Heteroscedasticity 1.9384 0.1638 Assumptions acceptable.
만족
vif
vif(fit)
## AGE TS_PCT
## 1.120628 1.283693
## USG_PCT PTS
## 3.668291 3.456934
## TWITTER_FOLLOWER_COUNT_MILLIONS
## 1.252105
sqrt(vif(fit))>2
## AGE TS_PCT
## FALSE FALSE
## USG_PCT PTS
## FALSE FALSE
## TWITTER_FOLLOWER_COUNT_MILLIONS
## FALSE
당연히 만족.. 왜냐하면 변수를 죄다 지웠기 때문이다. ㅠ
outliertest
outlierTest(fit)
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 41 2.295775 0.023936 NA
influencePlot(fit, main="Influence Plot",
sub="Circle size is proportional to Cook’s distance")
## StudRes Hat CookD
## 3 1.467479 0.1864834 0.08127728
## 7 -1.383895 0.7108834 0.77727126
## 18 -2.288337 0.0803073 0.07292155
## 41 2.295775 0.0571591 0.05093997
## 79 1.098466 0.2096242 0.05322013
studRes>2 : outlier hat > 2 : high leverage points cookD : influential obs
그냥 이런것도 있길래.. 상대적 중요도
relweights <- function(fit,...){
R <- cor(fit$model)
nvar <- ncol(R)
rxx <- R[2:nvar, 2:nvar]
rxy <- R[2:nvar, 1]
svd <- eigen(rxx)
evec <- svd$vectors
ev <- svd$values
delta <- diag(sqrt(ev))
lambda <- evec %*% delta %*% t(evec)
lambdasq <- lambda ^ 2
beta <- solve(lambda) %*% rxy
rsquare <- colSums(beta ^ 2)
rawwgt <- lambdasq %*% beta ^ 2
import <- (rawwgt / rsquare) * 100
import <- as.data.frame(import)
row.names(import) <- names(fit$model[2:nvar])
names(import) <- "Weights"
import <- import[order(import),1, drop=FALSE]
dotchart(import$Weights, labels=row.names(import),
xlab="% of R-Square", pch=19,
main="Relative Importance of Predictor Variables",
sub=paste("Total R-Square=", round(rsquare, digits=3)),
...)
return(import)
}
relweights(fit)
## Weights
## TS_PCT 3.185345
## AGE 12.931895
## USG_PCT 17.156108
## TWITTER_FOLLOWER_COUNT_MILLIONS 18.116434
## PTS 48.610218
5개의 변수로 모델 57%를 설명한다. 이중 PTS가 49%, TWITTER가 18%, USG_PCT가 17% 설명하는 상대적 중요도를 보여줌 회귀계수보다 직관적이라고 한다..
이제 회귀 그만하고
nba <- subset(nba, select=c(AGE, GP, W_PCT, NET_RATING, AST_PCT, REB_PCT, TM_TOV_PCT, TS_PCT, USG_PCT, PACE, PIE, PTS, ACTIVE_TWITTER_LAST_YEAR, TWITTER_FOLLOWER_COUNT_MILLIONS))
library(leaps)
regfit.full <- regsubsets(SALARY_MILLIONS ~ ., data = nba, nvmax=14)
full.summary <- summary(regfit.full)
names(full.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
coef 개수 확인
which.max(full.summary$rss);which.max(full.summary$adjr2);which.min(full.summary$cp);which.min(full.summary$bic)
## [1] 1
## [1] 9
## [1] 7
## [1] 3
library(ggvis)
rsq <- as.data.frame(full.summary$rsq)
names(rsq) <- "R2"
rsq %>%
ggvis(x=~ c(1:nrow(rsq)), y=~R2 ) %>%
layer_points(fill = ~ R2 ) %>%
add_axis("y", title = "R2") %>%
add_axis("x", title = "Number of variables")
par(mfrow=c(2,2))
plot(full.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(full.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
# which.max(full.summary$adjr2)
points(9,full.summary$adjr2[9], col="red",cex=2,pch=20)
plot(full.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(full.summary$cp )
points(7,full.summary$cp[7],col="red",cex=2,pch=20)
plot(full.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(full.summary$bic )
points(3,full.summary$bic[3],col="red",cex=2,pch=20)
plot(regfit.full, scale="adjr2")
plot(regfit.full, scale="Cp")
plot(regfit.full, scale='bic')
library(leaps)
regfit.fwd <- regsubsets(SALARY_MILLIONS ~ ., data = nba, nvmax=14, method = 'forward')
fwd.summary <- summary(regfit.fwd)
names(fwd.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
coef 개수 확인
which.max(fwd.summary$rss);which.max(fwd.summary$adjr2);which.min(fwd.summary$cp);which.min(fwd.summary$bic)
## [1] 1
## [1] 8
## [1] 7
## [1] 3
library(ggvis)
rsq <- as.data.frame(fwd.summary$rsq)
names(rsq) <- "R2"
rsq %>%
ggvis(x=~ c(1:nrow(rsq)), y=~R2 ) %>%
layer_points(fill = ~ R2 ) %>%
add_axis("y", title = "R2") %>%
add_axis("x", title = "Number of variables")
나머지도 그려봄
par(mfrow=c(2,2))
plot(fwd.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(fwd.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
# which.max(fwd.summary$adjr2)
points(8,fwd.summary$adjr2[8], col="red",cex=2,pch=20)
plot(fwd.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(fwd.summary$cp )
points(7,fwd.summary$cp[7],col="red",cex=2,pch=20)
plot(fwd.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(fwd.summary$bic )
points(3,fwd.summary$bic[3],col="red",cex=2,pch=20)
plot(regfit.fwd, scale="adjr2")
plot(regfit.fwd, scale="Cp")
plot(regfit.fwd, scale='bic')
regfit.bwd <- regsubsets(SALARY_MILLIONS ~ ., data = nba, nvmax=14, method = 'backward')
bwd.summary <- summary(regfit.bwd)
names(bwd.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
which.max(bwd.summary$rsq);which.max(bwd.summary$adjr2);which.min(bwd.summary$cp);which.min(bwd.summary$bic)
## [1] 14
## [1] 9
## [1] 7
## [1] 3
library(ggvis)
rsq <- as.data.frame(bwd.summary$rsq)
names(rsq) <- "R2"
rsq %>%
ggvis(x=~ c(1:nrow(rsq)), y=~R2 ) %>%
layer_points(fill = ~ R2 ) %>%
add_axis("y", title = "R2") %>%
add_axis("x", title = "Number of variables")
par(mfrow=c(2,2))
plot(bwd.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(bwd.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
# which.max(bwd.summary$adjr2)
points(9,bwd.summary$adjr2[9], col="red",cex=2,pch=20)
plot(bwd.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(bwd.summary$cp )
points(7,bwd.summary$cp[7],col="red",cex=2,pch=20)
plot(bwd.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(reg.summary$bic )
points(3,bwd.summary$bic[3],col="red",cex=2,pch=20)
plot(regfit.bwd, scale="adjr2")
plot(regfit.bwd, scale="Cp")
plot(regfit.bwd, scale='bic')
cp, bic로 선택된 7개, 3개에 대해 모델을 만들자.
7개
coef(regfit.full, 7);coef(regfit.fwd, 7);coef(regfit.bwd, 7)
## (Intercept) AGE
## 4.7601333 0.6517193
## REB_PCT TM_TOV_PCT
## 29.9423717 -0.3902498
## TS_PCT USG_PCT
## -29.3459549 -47.2186728
## PTS TWITTER_FOLLOWER_COUNT_MILLIONS
## 1.0810269 0.3616331
## (Intercept) AGE
## 4.7601333 0.6517193
## REB_PCT TM_TOV_PCT
## 29.9423717 -0.3902498
## TS_PCT USG_PCT
## -29.3459549 -47.2186728
## PTS TWITTER_FOLLOWER_COUNT_MILLIONS
## 1.0810269 0.3616331
## (Intercept) AGE
## 4.7601333 0.6517193
## REB_PCT TM_TOV_PCT
## 29.9423717 -0.3902498
## TS_PCT USG_PCT
## -29.3459549 -47.2186728
## PTS TWITTER_FOLLOWER_COUNT_MILLIONS
## 1.0810269 0.3616331
3개
coef(regfit.full, 3);coef(regfit.fwd, 3);coef(regfit.bwd, 3)
## (Intercept) AGE
## -17.0889133 0.6161403
## PTS TWITTER_FOLLOWER_COUNT_MILLIONS
## 0.7165846 0.3662290
## (Intercept) AGE
## -17.0889133 0.6161403
## PTS TWITTER_FOLLOWER_COUNT_MILLIONS
## 0.7165846 0.3662290
## (Intercept) AGE
## -17.0889133 0.6161403
## PTS TWITTER_FOLLOWER_COUNT_MILLIONS
## 0.7165846 0.3662290
Validation은 다음주에.