데이터가 어떤 변수로 이루어져있는지 보자.
모두 연속형 변수이다.
'data.frame': 252 obs. of 16 variables:
$ brozek : num 12.6 6.9 24.6 10.9 27.8 20.6 19 12.8 5.1 12 ...
$ age : int 23 22 22 26 24 24 26 25 25 23 ...
$ weight : num 154 173 154 185 184 ...
$ height : num 67.8 72.2 66.2 72.2 71.2 ...
$ adipos : num 23.7 23.4 24.7 24.9 25.6 26.5 26.2 23.6 24.6 25.8 ...
$ free : num 135 161 116 165 133 ...
$ neck : num 36.2 38.5 34 37.4 34.4 39 36.4 37.8 38.1 42.1 ...
$ chest : num 93.1 93.6 95.8 101.8 97.3 ...
$ abdom : num 85.2 83 87.9 86.4 100 94.4 90.7 88.5 82.5 88.6 ...
$ hip : num 94.5 98.7 99.2 101.2 101.9 ...
$ thigh : num 59 58.7 59.6 60.1 63.2 66 58.4 60 62.9 63.1 ...
$ knee : num 37.3 37.3 38.9 37.3 42.2 42 38.3 39.4 38.3 41.7 ...
$ ankle : num 21.9 23.4 24 22.8 24 25.6 22.9 23.2 23.8 25 ...
$ biceps : num 32 30.5 28.8 32.4 32.2 35.7 31.9 30.5 35.9 35.6 ...
$ forearm: num 27.4 28.9 25.2 29.4 27.7 30.6 27.8 29 31.1 30 ...
$ wrist : num 17.1 18.2 16.6 18.2 17.7 18.8 17.7 18.8 18.2 19.2 ...
> head(fat, n=3)
brozek age weight height adipos free neck chest abdom hip thigh knee
1 12.6 23 154.25 67.75 23.7 134.9 36.2 93.1 85.2 94.5 59.0 37.3
2 6.9 22 173.25 72.25 23.4 161.3 38.5 93.6 83.0 98.7 58.7 37.3
3 24.6 22 154.00 66.25 24.7 116.0 34.0 95.8 87.9 99.2 59.6 38.9
ankle biceps forearm wrist
1 21.9 32.0 27.4 17.1
2 23.4 30.5 28.9 18.2
3 24.0 28.8 25.2 16.6
> sum(is.na(fat)) # 결측값이 없다.
[1] 0
age weight height adipos free neck chest abdom hip thigh knee
1 ( 1 ) " " " " " " " " " " " " " " "*" " " " " " "
2 ( 1 ) " " "*" " " " " "*" " " " " " " " " " " " "
3 ( 1 ) " " "*" " " " " "*" " " " " " " " " " " " "
4 ( 1 ) " " "*" " " " " "*" " " " " "*" " " " " " "
5 ( 1 ) " " "*" " " "*" "*" " " " " "*" " " " " " "
6 ( 1 ) " " "*" " " "*" "*" " " " " "*" " " "*" " "
7 ( 1 ) " " "*" " " "*" "*" " " "*" "*" " " "*" " "
8 ( 1 ) " " "*" " " "*" "*" " " "*" "*" " " "*" " "
9 ( 1 ) " " "*" " " "*" "*" " " "*" "*" " " "*" " "
10 ( 1 ) " " "*" " " "*" "*" " " "*" "*" " " "*" " "
11 ( 1 ) " " "*" "*" "*" "*" " " "*" "*" " " "*" " "
12 ( 1 ) " " "*" "*" "*" "*" " " "*" "*" " " "*" "*"
13 ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
ankle biceps forearm wrist
1 ( 1 ) " " " " " " " "
2 ( 1 ) " " " " " " " "
3 ( 1 ) " " " " "*" " "
4 ( 1 ) " " " " "*" " "
5 ( 1 ) " " " " "*" " "
6 ( 1 ) " " " " "*" " "
7 ( 1 ) " " " " "*" " "
8 ( 1 ) "*" " " "*" " "
9 ( 1 ) "*" "*" "*" " "
10 ( 1 ) "*" "*" "*" "*"
11 ( 1 ) "*" "*" "*" "*"
12 ( 1 ) "*" "*" "*" "*"
13 ( 1 ) "*" "*" "*" "*"
14 ( 1 ) "*" "*" "*" "*"
15 ( 1 ) "*" "*" "*" "*"
> coef(regfit.full,2)
(Intercept) weight free
19.6535898 0.4229089 -0.5314993
QQplot에 찍힌 점들도 45도 각도 직선으로부터 많이 벗어나지 않는 것으로 보아 정규성도 만족된다고 판단한다.
따라서 분석을 진행하게 된다면 polynomial하게 바꾸는 것이 좋겠다.
> coef(regfit.best,2)
(Intercept) weight free
18.0518456 0.4321000 -0.5308647
변수를 2개 포함시킬 때 MSE가 가장 작다.
따라서 최적의 모형은 변수 2개를 넣었을 때이고, 그때의 회귀계수는 아래와 같다. \(Brozek= 18.5 + 0.43Weight - 0.53free\)
> coef(regfit.best,2)
(Intercept) weight free
18.0518456 0.4321000 -0.5308647
변수가 2개일 때 MSE값이 가장 작다.
따라서 최적의 모형은 변수 2개를 넣었을 때이고, 그때의 회귀계수는 아래와 같다. \(Brozek= 18.5 + 0.43Weight - 0.53free\)
> coef(regfit.best,2)
(Intercept) weight free
18.0518456 0.4321000 -0.5308647
-k-fold로 수행한 결과도 회귀진단을 거쳐야하지만 생략하도록 한다.
> # 데이터 살펴보기
> Sys.setlocale("LC_ALL", "eng")
> setwd("E:/Dropbox/00.2018/01.2018_1_semester/01.DataMining/04.HW/HW2")
> fat <- read.csv("./fat.csv", header = T)
> str(fat)
> head(fat, n=3)
> sum(is.na(fat)) # 결측값이 없다.
>
> # Best Subset Approach
> library(leaps)
>
> regfit.full=regsubsets(brozek~.,fat)
> regfit.full=regsubsets(brozek~.,data=fat,nvmax=15)
> reg.summary=summary(regfit.full)
> reg.summary$outmat
>
> # 결과를 보자. 몇 개의 변수를 추가하는 것이 좋은가.
> par(mfrow=c(2,2))
>
> plot(reg.summary$rss, xlab="Number of Variables",
+ ylab = "RSS",
+ type="l")
> points(x = which(reg.summary$rss==min(reg.summary$rss)),y = min(reg.summary$rss), col="red", pch=20, cex=2)
>
> plot(reg.summary$cp, xlab="Number of Variables",
+ ylab = "Mellow's Cp",
+ type="l")
> points(x = which(reg.summary$cp==min(reg.summary$cp)),y = min(reg.summary$cp), col="red", pch=20, cex=2)
>
> plot(reg.summary$adjr2, xlab="Number of Variables",
+ ylab = "Adjusted R-square",
+ type="l")
> points(x = which(reg.summary$adjr2==max(reg.summary$adjr2)),y = max(reg.summary$adjr2), col="red", pch=20, cex=2)
>
> plot(reg.summary$bic, xlab="Number of Variables",
+ ylab = "BIC",
+ type="l")
> points(x = which(reg.summary$bic==min(reg.summary$bic)),y = min(reg.summary$bic), col="red", pch=20, cex=2)
>
> # 최종모형식
> coef(regfit.full,2)
>
> # 회귀진단
> par(mfrow=c(2,2))
> model <- lm(brozek~ weight+ free, data=fat)
> plot(model)
>
> # VS approach
> set.seed(20141630)
> train=sample(c(TRUE,FALSE), nrow(fat), replace = TRUE)
> test=(!train)
>
> library(leaps)
> regfit.best <- regsubsets(brozek ~ ., data = fat[train, ], nvmax = 15)
> test.mat <- model.matrix(brozek~., data = fat[test, ])
> val.error <- rep(NA, 15)
>
>
> for (j in 1:20) {
+ set.seed(20141630 * j)
+ train <- sample(c(TRUE, FALSE), nrow(fat), replace = TRUE)
+ test <- (!train)
+ regfit.best <- regsubsets(brozek ~ ., data = fat[train, ], nvmax = 15)
+ test.mat <- model.matrix(brozek~., data = fat[test, ])
+ val.error <- rep(NA, 15)
+ for (i in 1:15) {
+ coefi <- coef(regfit.best, id = i)
+ pred <- test.mat[, names(coefi)] %*% coefi
+ val.error[i] <- mean((fat$brozek[test] - pred) ^ 2)
+ }
+ if (j < 2) {
+ plot(
+ val.error, type = "l", ylim = c(0, max(val.error) * 1.5),
+ ylab = "MSE", xlab = "number of variables(p)"
+ )
+ } else {
+ lines(val.error, type = "l", col = j)
+ }
+ }
>
>
> predict.regsubsets <- function(object, newdata, id, ...) {
+ form <- as.formula(object$call [[2]])
+ mat <- model.matrix(form, newdata)
+ coefi <- coef(object, id = id)
+ xvars <- names(coefi)
+ mat[, xvars] %*% coefi
+ }
>
>
> # LOOCV
> k=nrow(fat)
> cv.errors <- matrix(NA, k, 15, dimnames=list(NULL, paste(1:15)))
>
> library(leaps)
> for(j in 1:k){
+ best.fit <- regsubsets(brozek~. ,data=fat[-j,], nvmax=15)
+ for(i in 1:15){
+ pred=predict(best.fit, fat[j,],id=i)
+ cv.errors[j,i]=mean((fat$brozek[j]- pred)^2)
+ }
+ }
>
> mean.cv.errors <- apply(cv.errors, 2, mean)
> tmp <- apply(cv.errors, 1, mean)
>
> par(mfrow=c(1,1))
> plot(mean.cv.errors, type="l")
> points(x = which(mean.cv.errors==min(mean.cv.errors)),y = min(mean.cv.errors), col="red", pch=20, cex=2)
>
>
> ## K-fold
> k <- 10
> # k=5
> library(RColorBrewer)
> library(leaps)
> set.seed(20141630)
> for (j in 1:k) {
+ folds <- sample(1:k, nrow(fat), replace = TRUE, prob = rep(1 / k, k))
+ cv.errors <- rep(NA, 15)
+ for (i in 1:15) {
+ pred <- predict(best.fit, fat[folds == k, ], id = i)
+ cv.errors[i] <- mean((fat$brozek[folds == k] - pred) ^ 2)
+ }
+ if (j < 2) {
+ plot(
+ cv.errors, type = "l", ylim = c(0, max(cv.errors) * 2),
+ ylab = "MSE", xlab = "number of variables(p)"
+ )
+ } else {
+ lines(cv.errors, type = "l", col = brewer.pal(9, "Set1")[j])
+ 1}
+ }