Initial Library
library(ElemStatLearn)
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(caTools)
library(ISLR)
library(knitr)
library(printr)
## Registered S3 method overwritten by 'printr':
## method from
## knit_print.data.frame rmarkdown
library(leaps)
Menampilkan 5 data pertama
head(prostate,5)
| lcavol | lweight | age | lbph | svi | lcp | gleason | pgg45 | lpsa | train |
|---|---|---|---|---|---|---|---|---|---|
| -0.5798185 | 2.769459 | 50 | -1.386294 | 0 | -1.386294 | 6 | 0 | -0.4307829 | TRUE |
| -0.9942523 | 3.319626 | 58 | -1.386294 | 0 | -1.386294 | 6 | 0 | -0.1625189 | TRUE |
| -0.5108256 | 2.691243 | 74 | -1.386294 | 0 | -1.386294 | 7 | 20 | -0.1625189 | TRUE |
| -1.2039728 | 3.282789 | 58 | -1.386294 | 0 | -1.386294 | 6 | 0 | -0.1625189 | TRUE |
| 0.7514161 | 3.432373 | 62 | -1.386294 | 0 | -1.386294 | 6 | 0 | 0.3715636 | TRUE |
Menghapus variabel train
prostate <- subset(prostate, select = -c(train))
str(prostate)
## 'data.frame': 97 obs. of 9 variables:
## $ lcavol : num -0.58 -0.994 -0.511 -1.204 0.751 ...
## $ lweight: num 2.77 3.32 2.69 3.28 3.43 ...
## $ age : int 50 58 74 58 62 50 64 58 47 63 ...
## $ lbph : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ svi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lcp : num -1.39 -1.39 -1.39 -1.39 -1.39 ...
## $ gleason: int 6 6 7 6 6 6 6 6 6 6 ...
## $ pgg45 : int 0 0 20 0 0 0 0 0 0 0 ...
## $ lpsa : num -0.431 -0.163 -0.163 -0.163 0.372 ...
print(summary(prostate))
## lcavol lweight age lbph
## Min. :-1.3471 Min. :2.375 Min. :41.00 Min. :-1.3863
## 1st Qu.: 0.5128 1st Qu.:3.376 1st Qu.:60.00 1st Qu.:-1.3863
## Median : 1.4469 Median :3.623 Median :65.00 Median : 0.3001
## Mean : 1.3500 Mean :3.629 Mean :63.87 Mean : 0.1004
## 3rd Qu.: 2.1270 3rd Qu.:3.876 3rd Qu.:68.00 3rd Qu.: 1.5581
## Max. : 3.8210 Max. :4.780 Max. :79.00 Max. : 2.3263
## svi lcp gleason pgg45
## Min. :0.0000 Min. :-1.3863 Min. :6.000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:-1.3863 1st Qu.:6.000 1st Qu.: 0.00
## Median :0.0000 Median :-0.7985 Median :7.000 Median : 15.00
## Mean :0.2165 Mean :-0.1794 Mean :6.753 Mean : 24.38
## 3rd Qu.:0.0000 3rd Qu.: 1.1787 3rd Qu.:7.000 3rd Qu.: 40.00
## Max. :1.0000 Max. : 2.9042 Max. :9.000 Max. :100.00
## lpsa
## Min. :-0.4308
## 1st Qu.: 1.7317
## Median : 2.5915
## Mean : 2.4784
## 3rd Qu.: 3.0564
## Max. : 5.5829
Mengecek apakah ada data missing
sum(is.na(prostate))
## [1] 0
Plot Korelasi antar peubah prediktor
prostate.corr <- cor(prostate)
corrplot.mixed(prostate.corr)
Split data menjadi data train dan data test
split <- sample.split(prostate, SplitRatio = 0.8)
train <- subset(prostate, split==TRUE)
test <- subset(prostate, split==FALSE)
Fungsi regsubsets() bagian dari library (leaps) mencari best subset peubah prediktor mana saja yang dipakai untuk menghasilkan nilai Jumlah Kuadrat Sisaan (JKT/RSS) terkecil.
reg.bestsubset <- regsubsets(lpsa ~ ., data = train)
summary(reg.bestsubset)
## Subset selection object
## Call: regsubsets.formula(lpsa ~ ., data = train)
## 8 Variables (and intercept)
## Forced in Forced out
## lcavol FALSE FALSE
## lweight FALSE FALSE
## age FALSE FALSE
## lbph FALSE FALSE
## svi FALSE FALSE
## lcp FALSE FALSE
## gleason FALSE FALSE
## pgg45 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## lcavol lweight age lbph svi lcp gleason pgg45
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" " " " " "*" " " " " " "
## 4 ( 1 ) "*" "*" "*" " " "*" " " " " " "
## 5 ( 1 ) "*" "*" "*" " " "*" " " " " "*"
## 6 ( 1 ) "*" "*" "*" " " "*" "*" " " "*"
## 7 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
Tanda bintang “*” menandakan bahwa variabel prediktor dimasukkan kedalam model,
reg.fit.full <- regsubsets(lpsa ~ ., data = train, nvmax = 8)
reg.summary <- summary(reg.fit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$rsq
## [1] 0.5623183 0.6277022 0.6605719 0.6742170 0.6780745 0.6805366 0.6820799
## [8] 0.6827996
Plot R-Square setiap variabel yang digunakan pada model
library(ggvis)
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
rsq <- as.data.frame(reg.summary$rsq)
names(rsq) <- "R2"
rsq %>%
ggvis(x=~ c(1:nrow(rsq)), y=~R2) %>%
layer_points(fill = ~ R2) %>%
add_axis("y", title = "R2") %>%
add_axis("x", title = "Number of Variables")
Plot RSS, Adjusted R2, Cp, BIC pada setiap jumlah variabel yang digunakan
par(mfrow=c(2,2))
plot(reg.summary$rss ,xlab="Number of Variables ",ylab="RSS",type="l")
plot(reg.summary$adjr2 ,xlab="Number of Variables ", ylab="Adjusted RSq",type="l")
# which.max(reg.summary$adjr2)
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)
plot(reg.summary$cp ,xlab="Number of Variables ",ylab="Cp", type='l')
# which.min(reg.summary$cp )
points(10,reg.summary$cp [10],col="red",cex=2,pch=20)
plot(reg.summary$bic ,xlab="Number of Variables ",ylab="BIC",type='l')
# which.min(reg.summary$bic )
points(6,reg.summary$bic [6],col="red",cex=2,pch=20)
coef(reg.fit.full, 3)
## (Intercept) lcavol lweight svi
## -0.9666374 0.5372666 0.7115252 0.6006456
Memilih model terbaik dengan pendekatan validasi silang
regfit.best = regsubsets(lpsa~., data=train, nvmax =8, method = "forward")
test.mat = model.matrix(lpsa ~., data=test)
val.errors = rep(NA,8)
for (i in 1:8){
coefi = coef(regfit.best, id=i)
pred = test.mat[,names(coefi)]%*%coefi
val.errors[i] = mean((test$lpsa-pred)^2)
}
which.min(val.errors)
## [1] 3
verr <- as.data.frame(val.errors); names(verr) <- "err"
index <- c(1:nrow(verr))
verr <- cbind.data.frame(verr,index)
verr %>%
ggvis(x=~ index, y=~err ) %>%
layer_points(fill = ~ err , size =~ err ) %>%
layer_lines(stroke := "skyblue")%>%
add_axis("y", title = "MSE") %>%
add_axis("x", title = "Number of variables")
rss <- as.data.frame(sqrt(regfit.best$rss[-1]/100)); names(rss) <- "rss"
verr <- cbind.data.frame(verr,rss)
verr %>%
ggvis(x=~ index) %>%
layer_points(y=~rss ,fill = ~ rss , size =~ rss ) %>%
layer_lines(y=~rss ,stroke :="purple")%>%
add_axis("y", title = "Root MSE") %>%
add_axis("x", title = "Number of variables") %>%
layer_points(y=~sqrt(err), fill = ~ sqrt(err) , size =~ sqrt(err) ) %>%
layer_lines(y=~sqrt(err), stroke := "skyblue")
coef(regfit.best ,3)
## (Intercept) lcavol lweight svi
## -0.9666374 0.5372666 0.7115252 0.6006456
model <- lm(lpsa ~ lcavol+lweight+svi, data=train)
summary(model)
##
## Call:
## lm(formula = lpsa ~ lcavol + lweight + svi, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.75658 -0.44976 0.03463 0.41474 1.60956
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.96664 0.66953 -1.444 0.153147
## lcavol 0.53727 0.08555 6.280 2.28e-08 ***
## lweight 0.71153 0.18914 3.762 0.000341 ***
## svi 0.60065 0.22747 2.641 0.010144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6995 on 72 degrees of freedom
## Multiple R-squared: 0.6606, Adjusted R-squared: 0.6464
## F-statistic: 46.71 on 3 and 72 DF, p-value: < 2.2e-16