PRACTICUM 08 MATERIALS
Library
library(MultiKink)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(PerformanceAnalytics)
SIMULATED DATA
Data
set.seed(123)
data.x <- rnorm(1000,1,1)
err <- rnorm(1000)
y <- 5+2*data.x+3*data.x^2+err # second-order polynomial
Data Exploration
Correlation
cor(data.x,y)
## [1] 0.892582
The correlation between data.x and y is 0.892582.
Plot
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
Linear Regression
lin.mod <-lm( y~data.x)
plot(data.x,y,xlim=c( -2,5), ylim=c( -10,70))
lines(data.x,lin.mod$fitted.values,col="red") # plot x against the fitted values (yhat)
summary(lin.mod)
##
## Call:
## lm(formula = y ~ data.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.686 -2.574 -1.428 1.195 27.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6056 0.1902 24.22 <2e-16 ***
## data.x 8.3790 0.1340 62.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.2 on 998 degrees of freedom
## Multiple R-squared: 0.7967, Adjusted R-squared: 0.7965
## F-statistic: 3911 on 1 and 998 DF, p-value: < 2.2e-16
Polynomial Regression
poly.mod<- lm(y~data.x+I(data.x^2))
ix <- sort( data.x,index.return=T)$ix # sort by x and keep the ordering indices (which observation comes where)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x[ix], poly.mod$fitted.values[ix],col="blue", cex=2)
summary(poly.mod)
##
## Call:
## lm(formula = y ~ data.x + I(data.x^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0319 -0.6942 0.0049 0.7116 3.2855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.95193 0.04568 108.41 <2e-16 ***
## data.x 2.10732 0.05861 35.95 <2e-16 ***
## I(data.x^2) 2.99081 0.02338 127.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.007 on 997 degrees of freedom
## Multiple R-squared: 0.9883, Adjusted R-squared: 0.9883
## F-statistic: 4.221e+04 on 2 and 997 DF, p-value: < 2.2e-16
Step Function Regression
# step function regression
range(data.x) # minimum and maximum values
## [1] -1.809775 4.241040
The minimum value of data.x is -1.809775 and the maximum is 4.241040.
# build the step function dummy variables manually
c1 <- as.factor(ifelse(data.x<=0,1,0))
c2 <- as.factor(ifelse(data.x<=2 & data.x>0,1,0))
c3 <- as.factor(ifelse(data.x>2,1,0))
head(data.frame(y,c1,c2,c3))
## y c1 c2 c3
## 1 5.462795 0 1 0
## 2 7.277570 0 1 0
## 3 29.740401 0 0 1
## 4 10.446806 0 1 0
## 5 8.535105 0 1 0
## 6 33.585437 0 0 1
step.mod <- lm(y~c1+c2+c3)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x,lin.mod$fitted.values,col="red")
lines(data.x[ix], step.mod$fitted.values[ix],col="green")
summary(step.mod)
##
## Call:
## lm(formula = y ~ c1 + c2 + c3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.395 -3.534 -0.530 2.527 36.876
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.4499 0.4087 74.50 <2e-16 ***
## c11 -25.2395 0.5710 -44.21 <2e-16 ***
## c21 -19.4184 0.4536 -42.81 <2e-16 ***
## c31 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.121 on 997 degrees of freedom
## Multiple R-squared: 0.698, Adjusted R-squared: 0.6974
## F-statistic: 1152 on 2 and 997 DF, p-value: < 2.2e-16
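A note on the NA row above: because c1, c2, and c3 always sum to one, the third dummy is perfectly collinear with the intercept, so lm() drops it. The same step function can be fitted more directly with cut(); the sketch below (same breakpoints as above, new object name step.mod.cut) is equivalent to step.mod.
# equivalent step-function fit via cut(); the first interval acts as the baseline level
step.mod.cut <- lm(y ~ cut(data.x, breaks = c(-Inf, 0, 2, Inf)))
summary(step.mod.cut)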
Model Comparison
AIC Values
perbandinganAIC <- data.frame("Model" = c("Linear", "Polynomial (Ordo=2)", "Tangga (breaks=3)"), "AIC" = c(AIC(lin.mod),AIC(poly.mod),AIC(step.mod)))
dplyr::arrange(.data=perbandinganAIC, AIC)
## Model AIC
## 1 Polynomial (Ordo=2) 2856.470
## 2 Linear 5711.836
## 3 Tangga (breaks=3) 6109.609
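For reference, the AIC values reported by AIC() for a Gaussian lm fit can be reproduced from the residual sum of squares. The sketch below (new helper names n_obs, rss_lin, p_coef) assumes the usual maximum-likelihood form, with the residual variance counted as one extra parameter:
n_obs <- length(y)
rss_lin <- sum(residuals(lin.mod)^2)
p_coef <- length(coef(lin.mod)) # number of regression coefficients
loglik_lin <- -n_obs/2 * (log(2*pi) + log(rss_lin/n_obs) + 1) # Gaussian log-likelihood at the MLE
c(manual = -2*loglik_lin + 2*(p_coef + 1), builtin = AIC(lin.mod))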
MSE Values
MSE = function(pred,actual){
mean((pred-actual)^2)
}
perbandinganMSE <- data.frame("Model" = c("Linear", "Polynomial (Ordo=2)", "Tangga (breaks=3)"), "MSE" = c(MSE(predict(lin.mod),y),MSE(predict(poly.mod),y),MSE(predict(step.mod),y)))
dplyr::arrange(.data=perbandinganMSE, MSE)
## Model MSE
## 1 Polynomial (Ordo=2) 1.010649
## 2 Linear 17.601059
## 3 Tangga (breaks=3) 26.146934
Based on the results above, the degree-2 polynomial regression model has the smallest AIC and MSE. It can therefore be concluded to be the best of the three models by both criteria.
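As a closing visual check for the simulated data, the sketch below overlays the three fitted curves (objects as defined above) on one scatterplot:
ix <- sort(data.x, index.return = T)$ix
plot(data.x, y, xlim = c(-2, 5), ylim = c(-10, 70))
lines(data.x[ix], lin.mod$fitted.values[ix], col = "red")    # linear fit
lines(data.x[ix], poly.mod$fitted.values[ix], col = "blue")  # quadratic fit
lines(data.x[ix], step.mod$fitted.values[ix], col = "green") # step-function fit
legend("topleft", legend = c("Linear", "Polynomial (order 2)", "Step (3 bins)"),
col = c("red", "blue", "green"), lty = 1)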
TRICEPS DATA
The data used for this illustration come from an anthropometric study of 892 women under 50 years of age in three Gambian villages in West Africa. The data consist of 3 columns, age, lntriceps, and triceps, described below:
- age: respondent's age
- lntriceps: logarithm of the triceps skinfold thickness
- triceps: triceps skinfold thickness
The triceps skinfold is needed to calculate the upper-arm muscle circumference. Its thickness gives information about body fat reserves, while the calculated muscle mass gives information about protein reserves.
Data
data("triceps", package="MultiKink")
dt.triceps <- data.frame("age"=triceps$age,
"lntriceps"=triceps$lntriceps,
"triceps"=triceps$triceps)
head(dt.triceps)
## age lntriceps triceps
## 1 12.05 1.223776 3.4
## 2 9.91 1.386294 4.0
## 3 10.04 1.435084 4.2
## 4 11.49 1.435084 4.2
## 5 10.12 1.481605 4.4
## 6 11.87 1.481605 4.4
Data Exploration
Correlation
cor(dt.triceps)
## age lntriceps triceps
## age 1.0000000 0.5776443 0.5806972
## lntriceps 0.5776443 1.0000000 0.9612043
## triceps 0.5806972 0.9612043 1.0000000
Correlation Plot
chart.Correlation(dt.triceps)
Based on the correlation plot above, the correlation between age and lntriceps is positive at 0.58, the correlation between age and triceps is likewise positive at 0.58, and the correlation between lntriceps and triceps is positive at 0.96.
Scatterplot
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
Based on the pattern seen in the scatterplot, we will try to find a model that represents this relationship well.
Linear Regression
mod_linear = lm(triceps~age,data=dt.triceps)
summary(mod_linear)
##
## Call:
## lm(formula = triceps ~ age, data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9512 -2.3965 -0.5154 1.5822 25.1233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.19717 0.21244 29.17 <2e-16 ***
## age 0.21584 0.01014 21.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.007 on 890 degrees of freedom
## Multiple R-squared: 0.3372, Adjusted R-squared: 0.3365
## F-statistic: 452.8 on 1 and 890 DF, p-value: < 2.2e-16
ringkasan_linear <- summary(mod_linear)
ringkasan_linear$r.squared
## [1] 0.3372092
AIC(mod_linear)
## [1] 5011.515
tabel_data <- data.frame("y_aktual"=dt.triceps$triceps,
"y_pred"=predict(mod_linear),
"residual"=residuals(mod_linear))
head(tabel_data)
## y_aktual y_pred residual
## 1 3.4 8.798074 -5.398074
## 2 4.0 8.336171 -4.336171
## 3 4.2 8.364231 -4.164231
## 4 4.2 8.677202 -4.477202
## 5 4.4 8.381498 -3.981498
## 6 4.4 8.759222 -4.359222
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "blue",se = F)+
theme_bw()
Degree-2 Polynomial Regression (Order 2)
mod_polinomial2 = lm(triceps ~ poly(age,2,raw = T),
data=dt.triceps)
summary(mod_polinomial2)
##
## Call:
## lm(formula = triceps ~ poly(age, 2, raw = T), data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5677 -2.4401 -0.4587 1.6368 24.9961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0229191 0.3063806 19.658 < 2e-16 ***
## poly(age, 2, raw = T)1 0.2434733 0.0364403 6.681 4.17e-11 ***
## poly(age, 2, raw = T)2 -0.0006257 0.0007926 -0.789 0.43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.008 on 889 degrees of freedom
## Multiple R-squared: 0.3377, Adjusted R-squared: 0.3362
## F-statistic: 226.6 on 2 and 889 DF, p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = T) + theme_bw() # with the standard error band
Degree-3 Polynomial Regression (Order 3)
mod_polinomial3 = lm(triceps ~ poly(age,3,raw = T),data=dt.triceps)
summary(mod_polinomial3)
##
## Call:
## lm(formula = triceps ~ poly(age, 3, raw = T), data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5832 -1.9284 -0.5415 1.3283 24.4440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.004e+00 3.831e-01 20.893 < 2e-16 ***
## poly(age, 3, raw = T)1 -3.157e-01 7.721e-02 -4.089 4.73e-05 ***
## poly(age, 3, raw = T)2 3.101e-02 3.964e-03 7.824 1.45e-14 ***
## poly(age, 3, raw = T)3 -4.566e-04 5.612e-05 -8.135 1.38e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.868 on 888 degrees of freedom
## Multiple R-squared: 0.3836, Adjusted R-squared: 0.3815
## F-statistic: 184.2 on 3 and 888 DF, p-value: < 2.2e-16
Based on the output above, the X^1, X^2, and X^3 terms are all significant, and the R-squared rises to 0.3836 (adjusted R-squared 0.3815) compared with the degree-2 model.
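To test more formally whether the cubic term improves on the quadratic model, a nested-model F-test can be used; a short sketch with the two models fitted above:
# nested-model F-test: degree-2 vs degree-3 polynomial
anova(mod_polinomial2, mod_polinomial3)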
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = T) + theme_bw()
Model Comparison
AIC Values
perbandinganAIC2 <- data.frame("Model" = c("Model Linear", "Model Polynomial (Ordo=2)", "Model Polynomial (Ordo=3)"), "AIC" = c(AIC(mod_linear),AIC(mod_polinomial2), AIC(mod_polinomial3)))
dplyr::arrange(.data=perbandinganAIC2, AIC)
## Model AIC
## 1 Model Polynomial (Ordo=3) 4950.774
## 2 Model Linear 5011.515
## 3 Model Polynomial (Ordo=2) 5012.890
MSE Values
perbandinganMSE2 <- data.frame("Model" = c("Model Linear", "Model Polynomial (Ordo=2)", "Model Polynomial (Ordo=3)"), "MSE" = c(MSE(predict(mod_linear),dt.triceps$triceps), MSE(predict(mod_polinomial2),dt.triceps$triceps),
MSE(predict(mod_polinomial3),dt.triceps$triceps)))
dplyr::arrange(.data=perbandinganMSE2, MSE)
## Model MSE
## 1 Model Polynomial (Ordo=3) 14.89621
## 2 Model Polynomial (Ordo=2) 16.00636
## 3 Model Linear 16.01758
Based on the results above, the degree-3 polynomial regression model has the smallest AIC and MSE, so it can be concluded to be the best of the models compared by both criteria.
Step Function Regression (breaks=5)
mod_tangga5 = lm(triceps ~ cut(age,5),data=dt.triceps)
summary(mod_tangga5)
##
## Call:
## lm(formula = triceps ~ cut(age, 5), data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5474 -2.0318 -0.4465 1.3682 23.3759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2318 0.1994 36.260 < 2e-16 ***
## cut(age, 5)(10.6,20.9] 1.6294 0.3244 5.023 6.16e-07 ***
## cut(age, 5)(20.9,31.2] 5.9923 0.4222 14.192 < 2e-16 ***
## cut(age, 5)(31.2,41.5] 7.5156 0.4506 16.678 < 2e-16 ***
## cut(age, 5)(41.5,51.8] 7.4561 0.5543 13.452 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.939 on 887 degrees of freedom
## Multiple R-squared: 0.3617, Adjusted R-squared: 0.3588
## F-statistic: 125.7 on 4 and 887 DF, p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
Step Function Regression (breaks=7)
mod_tangga7 = lm(triceps ~ cut(age,7),data=dt.triceps)
summary(mod_tangga7)
##
## Call:
## lm(formula = triceps ~ cut(age, 7), data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8063 -1.7592 -0.4366 1.2894 23.1461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.5592 0.2219 34.060 < 2e-16 ***
## cut(age, 7)(7.62,15] -0.6486 0.3326 -1.950 0.0515 .
## cut(age, 7)(15,22.3] 3.4534 0.4239 8.146 1.27e-15 ***
## cut(age, 7)(22.3,29.7] 5.8947 0.4604 12.804 < 2e-16 ***
## cut(age, 7)(29.7,37] 7.8471 0.5249 14.949 < 2e-16 ***
## cut(age, 7)(37,44.4] 6.9191 0.5391 12.835 < 2e-16 ***
## cut(age, 7)(44.4,51.8] 6.3013 0.6560 9.606 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.805 on 885 degrees of freedom
## Multiple R-squared: 0.4055, Adjusted R-squared: 0.4014
## F-statistic: 100.6 on 6 and 885 DF, p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "blue",se = F)+
theme_bw()
Model Comparison
MSE Values
MSE = function(pred,actual){
mean((pred-actual)^2)
}
nilai_MSE <- rbind(MSE(predict(mod_linear),triceps$triceps), # store in a new object so the MSE() function is not overwritten
MSE(predict(mod_polinomial2),triceps$triceps),
MSE(predict(mod_polinomial3),triceps$triceps),
MSE(predict(mod_tangga5),triceps$triceps),
MSE(predict(mod_tangga7),triceps$triceps))
Model <- c("Linear","Polynomial (ordo=2)",
"Polynomial (ordo=3)","Tangga (breaks=5)",
"Tangga (breaks=7)")
data.frame(Model, MSE = nilai_MSE)
## Model MSE
## 1 Linear 16.01758
## 2 Polynomial (ordo=2) 16.00636
## 3 Polynomial (ordo=3) 14.89621
## 4 Tangga (breaks=5) 15.42602
## 5 Tangga (breaks=7) 14.36779
Based on the results above, the step-function model (breaks=7) has the smallest in-sample MSE and is therefore the best model by this criterion. Note that more flexible models tend to have lower in-sample MSE, which motivates the out-of-sample (cross-validation) evaluation that follows.
Cross Validation Evaluation
Linear Regression
set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps") # v is usually 5 or 10
metric_linear <- map_dfr(cross_val$splits,
function(x){
mod <- lm(triceps ~ age,data=dt.triceps[x$in_id,])
pred <- predict(mod,newdata=dt.triceps[-x$in_id,])
truth <- dt.triceps[-x$in_id,]$triceps
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear
## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.65 2.82
## 2 4.62 3.22
## 3 4.38 3.00
## 4 3.85 2.80
## 5 3.08 2.36
## 6 3.83 2.81
## 7 3.59 2.78
## 8 4.66 3.06
## 9 3.50 2.56
## 10 4.59 2.93
# compute the average over the 10 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
## rmse mae
## 3.973249 2.833886
Degree-2 Polynomial Regression (Order 2)
set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(triceps ~ poly(age,2,raw = T),data=dt.triceps[x$in_id,])
pred <- predict(mod,newdata=dt.triceps[-x$in_id,])
truth <- dt.triceps[-x$in_id,]$triceps
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2
## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.64 2.82
## 2 4.62 3.26
## 3 4.39 3.02
## 4 3.85 2.81
## 5 3.10 2.38
## 6 3.82 2.81
## 7 3.62 2.83
## 8 4.65 3.07
## 9 3.50 2.57
## 10 4.59 2.94
# compute the average over the 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
## rmse mae
## 3.977777 2.851787
Degree-3 Polynomial Regression (Order 3)
set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(triceps ~ poly(age,3,raw = T),data=dt.triceps[x$in_id,])
pred <- predict(mod,newdata=dt.triceps[-x$in_id,])
truth <- dt.triceps[-x$in_id,]$triceps
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly3
## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.49 2.60
## 2 4.48 2.99
## 3 4.21 2.85
## 4 4.02 2.75
## 5 3.03 2.09
## 6 3.63 2.59
## 7 3.53 2.52
## 8 4.54 2.91
## 9 3.27 2.33
## 10 4.27 2.68
# compute the average over the 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3
## rmse mae
## 3.845976 2.632125
Step Function Regression
set.seed(123)
cross_val <- vfold_cv(dt.triceps,v=10,strata = "triceps")
breaks <- 3:10 # candidate numbers of bins (3-10); search for the best break count
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- dt.triceps[x$in_id,]
training$age <- cut(training$age,i)
mod <- lm(triceps ~ age,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- triceps[-x$in_id,]
age_new <-cut(testing$age,c(labs_x_breaks[1,1],
labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(age=age_new))
truth <- testing$triceps
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# compute the average over the 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)
# display the results for all breaks
best_tangga
## breaks rmse mae
## 1 3 3.835357 2.618775
## 2 4 3.882932 2.651911
## 3 5 3.917840 2.724368
## 4 6 3.836068 2.622939
## 5 7 3.789715 2.555062
## 6 8 3.812789 2.555563
## 7 9 3.781720 2.518706
## 8 10 3.795877 2.529479
# best model by RMSE
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 9 3.78172 2.518706
# best model by MAE
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 9 3.78172 2.518706
Based on the results above, the best step-function model is the one with breaks=9, since it yields the smallest RMSE and MAE among the candidates.
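For the comparison below, the selected step-function model can also be refitted on the full triceps data; a brief sketch (mod_tangga9 is a new object name introduced here):
mod_tangga9 <- lm(triceps ~ cut(age, 9), data = dt.triceps)
AIC(mod_tangga9)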
Model Comparison
RMSE and MAE Values
nilai_metric <- rbind(mean_metric_linear,
mean_metric_poly2,
mean_metric_poly3,
best_tangga %>% select(-1) %>% slice_min(mae))
Model <- c("Linear","Poly2","Poly3","Tangga (breaks=9)")
data.frame(Model,nilai_metric)
## Model rmse mae
## 1 Linear 3.973249 2.833886
## 2 Poly2 3.977777 2.851787
## 3 Poly3 3.845976 2.632125
## 4 Tangga (breaks=9) 3.781720 2.518706
Based on the results above, the step-function regression model (breaks=9) yields the smallest RMSE and MAE, so it can be concluded to be the best of the models compared.
PRACTICUM 09 MATERIALS
Library
#install.packages("MultiKink")
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(splines)
SIMULATED DATA
set.seed(123)
data.x <- rnorm(1000,1,1)
err <- rnorm(1000,0,5)
y <- 5+2*data.x+3*data.x^2+err
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70), pch=16, col="orange")
abline(v=1, col="red", lty=2)
Piecewise Cubic Polynomial
dt.all <- cbind(y,data.x)
## knot = 1
dt1 <- dt.all[data.x <=1,]
dim(dt1)
## [1] 495 2
dt2 <- dt.all[data.x >1,]
dim(dt2)
## [1] 505 2
Based on the output above, dt1 (data.x <= 1) contains 495 observations and dt2 (data.x > 1) contains 505 observations.
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70), pch=16, col="orange")
cub.mod1 <- lm(dt1[,1]~dt1[,2]+I(dt1[,2]^2)+I(dt1[,2]^3))
ix <- sort(dt1[,2], index.return=T)$ix
lines(dt1[ix,2],cub.mod1$fitted.values[ix],col="blue", lwd=2)
cub.mod2 <- lm(dt2[,1]~dt2[,2]+I(dt2[,2]^2)+I(dt2[,2]^3))
ix <- sort(dt2[,2], index.return=T)$ix
lines(dt2[ix,2],cub.mod2$fitted.values[ix],col="blue", lwd=2)
Based on the plot above, the blue curve is still discontinuous (there is a cut-off) at the knot, so a truncated power basis can be used to join the two pieces.
Truncated Power Basis
# using the truncated power basis
plot(data.x,y,xlim=c( -2,5), ylim=c( -10,70), pch=16,col="orange")
abline(v=1,col="red", lty=2)
hx <- ifelse( data.x>1,(data.x-1)^3,0)
cubspline.mod <- lm( y~data.x+I(data.x^2)+I(data.x^3)+hx)
ix <- sort( data.x,index.return=T)$ix
lines( data.x[ix], cubspline.mod$fitted.values[ix],col="green", lwd=2)
Comparison with k-fold CV
plot( data.x,y,xlim=c(-2,5), ylim=c( -10,70), pch=16,col="orange")
abline(v=0,col="red", lty=2)
abline(v=2,col="red", lty=2)
hx1 <- ifelse( data.x>0,(data.x-0)^3,0)
hx2 <- ifelse( data.x>2,(data.x-2)^3,0)
cubspline.mod2 <- lm( y~data.x+I(data.x^2)+I(data.x^3)+hx1+hx2)
ix <- sort( data.x,index.return=T)$ix
lines( data.x[ix],cubspline.mod2$fitted.values[ix],col="green", lwd=2)
Comparing 1 Knot and 2 Knots with 5-fold CV
##1 knot
data.all <- cbind( y,data.x,hx)
set.seed(456)
ind <- sample(1:5,length( data.x),replace=T)
res <- c()
for( i in 1:5){
dt.train <- data.all[ ind!=i,]
x.test <- data.all[ ind==i, -1]
y.test <- data.all[ ind==i,1]
mod1 <- lm( dt.train[,1]~dt.train[,2]+I( dt.train[,2]^2)+I( dt.train[,2]^3)+dt.train[,3])
x.test.olah <- cbind(1,x.test[,1], x.test[,1]^2,x.test[,1]^3,x.test[,2])
beta <- coefficients(mod1)
prediksi <- x.test.olah%*%beta
res <- c( res,mean(( y.test-prediksi)^2))
}
res
## [1] 22.08767 21.39598 29.47677 29.68683 25.18070
## 2 knots (at x = 0 and x = 2)
data.all2 <- cbind(y,data.x,hx1,hx2)
set.seed(456)
ind2 <- sample(1:5,length( data.x),replace=T)
res2 <- c()
for( i in 1:5){
dt.train2 <- data.all2[ind2!=i,]
x.test2 <- data.all2[ind2==i,-1]
y.test2 <- data.all2[ind2==i,1]
mod2 <- lm(dt.train2[,1]~dt.train2[,2]+I(dt.train2[,2]^2)+I(dt.train2[,2]^3)+dt.train2[,3]+dt.train2[,4])
x.test.olah2 <- cbind(1,x.test2[,1],x.test2[,1]^2,x.test2[,1]^3,x.test2[,2],x.test2[,3])
beta2 <- coefficients(mod2)
prediksi2 <- x.test.olah2%*%beta2
res2 <- c(res2,mean((y.test2-prediksi2)^2))
}
res2
## [1] 22.32898 21.33868 29.40459 29.79913 25.22047
Comparison of Mean CV MSE: 1 Knot vs. 2 Knots
PerbandinganRES <- matrix(c(mean(res),mean(res2)),nrow=1,ncol=2)
colnames(PerbandinganRES) <- c("1 Knot","2 Knots")
row.names(PerbandinganRES) <- c("Mean CV MSE")
knitr::kable(PerbandinganRES, align = "c")
|             |  1 Knot  | 2 Knots  |
|:------------|:--------:|:--------:|
| Mean CV MSE | 25.56559 | 25.61837 |
Based on the mean cross-validated MSE, the 1-knot model is slightly better, since its mean CV MSE is smaller than that of the 2-knot model.
Smoothing Spline
ss1 <- smooth.spline(data.x,y,all.knots = T)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70), pch=16,col="orange")
lines(ss1,col="purple", lwd=2)
TRICEPS DATA
The data used for this illustration come from an anthropometric study of 892 women under 50 years of age in three Gambian villages in West Africa. The data consist of 3 columns, age, lntriceps, and triceps, described below:
- age: respondent's age
- lntriceps: logarithm of the triceps skinfold thickness
- triceps: triceps skinfold thickness
The triceps skinfold is needed to calculate the upper-arm muscle circumference. Its thickness gives information about body fat reserves, while the calculated muscle mass gives information about protein reserves.
Data
data("triceps", package="MultiKink")
dt.triceps <- data.frame("age"=triceps$age,
"lntriceps"=triceps$lntriceps,
"triceps"=triceps$triceps)
head(dt.triceps)
## age lntriceps triceps
## 1 12.05 1.223776 3.4
## 2 9.91 1.386294 4.0
## 3 10.04 1.435084 4.2
## 4 11.49 1.435084 4.2
## 5 10.12 1.481605 4.4
## 6 11.87 1.481605 4.4
Data Exploration
Scatterplot
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
Based on the pattern seen in the scatterplot, we will try to find a model that represents this relationship well.
Spline Regression
# determine the number of basis functions and knots
dim(bs(dt.triceps$age, knots = c(10, 20,40)))
## [1] 892 6
# or
dim(bs(dt.triceps$age, df=6))
## [1] 892 6
# knot locations chosen automatically (quantiles of age)
attr(bs(dt.triceps$age, df=6),"knots")
## 25% 50% 75%
## 5.5475 12.2100 24.7275
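As a sanity check on the basis dimension: for bs() without an intercept column, the number of basis functions equals the number of interior knots plus the polynomial degree (cubic by default), so 3 knots give 6 columns. A small sketch:
length(c(10, 20, 40)) + 3                        # knots + degree = 6
ncol(bs(dt.triceps$age, knots = c(10, 20, 40)))  # 6 basis functions, matching df = 6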
B-spline
Using Manually Specified Knots
knot_value_manual_3 = c(10, 20,40)
mod_spline_1 = lm(triceps ~ bs(age, knots =knot_value_manual_3 ),data=dt.triceps)
summary(mod_spline_1)
##
## Call:
## lm(formula = triceps ~ bs(age, knots = knot_value_manual_3),
## data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.385 -1.682 -0.393 1.165 22.855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.22533 0.61873 13.294 < 2e-16 ***
## bs(age, knots = knot_value_manual_3)1 -0.06551 1.14972 -0.057 0.955
## bs(age, knots = knot_value_manual_3)2 -4.30051 0.72301 -5.948 3.90e-09 ***
## bs(age, knots = knot_value_manual_3)3 7.80435 1.17793 6.625 6.00e-11 ***
## bs(age, knots = knot_value_manual_3)4 6.14890 1.27439 4.825 1.65e-06 ***
## bs(age, knots = knot_value_manual_3)5 5.56640 1.42225 3.914 9.78e-05 ***
## bs(age, knots = knot_value_manual_3)6 7.90178 1.54514 5.114 3.87e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.76 on 885 degrees of freedom
## Multiple R-squared: 0.4195, Adjusted R-squared: 0.4155
## F-statistic: 106.6 on 6 and 885 DF, p-value: < 2.2e-16
Using Knots Chosen Automatically by the Function
knot_value_pc_df_6 = attr(bs(dt.triceps$age, df=6),"knots")
mod_spline_1 = lm(triceps ~ bs(age, knots =knot_value_pc_df_6 ),data=dt.triceps)
summary(mod_spline_1)
##
## Call:
## lm(formula = triceps ~ bs(age, knots = knot_value_pc_df_6), data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0056 -1.7556 -0.2944 1.2008 23.0695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5925 0.8770 7.517 1.37e-13 ***
## bs(age, knots = knot_value_pc_df_6)1 3.7961 1.5015 2.528 0.01164 *
## bs(age, knots = knot_value_pc_df_6)2 -2.0749 0.8884 -2.335 0.01974 *
## bs(age, knots = knot_value_pc_df_6)3 1.5139 1.1645 1.300 0.19391
## bs(age, knots = knot_value_pc_df_6)4 11.6394 1.3144 8.855 < 2e-16 ***
## bs(age, knots = knot_value_pc_df_6)5 5.9680 1.5602 3.825 0.00014 ***
## bs(age, knots = knot_value_pc_df_6)6 8.9127 1.4053 6.342 3.60e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.757 on 885 degrees of freedom
## Multiple R-squared: 0.4206, Adjusted R-squared: 0.4167
## F-statistic: 107.1 on 6 and 885 DF, p-value: < 2.2e-16
data_hasil <- cbind("y_aktual"=dt.triceps$triceps,
"y_pred"=mod_spline_1$fitted.values,
"residual"=resid(mod_spline_1))
head(data_hasil)
## y_aktual y_pred residual
## 1 3.4 7.333196 -3.933196
## 2 4.0 6.380136 -2.380136
## 3 4.2 6.426062 -2.226062
## 4 4.2 7.052817 -2.852817
## 5 4.4 6.455335 -2.055335
## 6 4.4 7.241677 -2.841677
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = knot_value_pc_df_6),
lty = 1,se = F)
Natural Spline
knot_value_manual_3 = c(10, 20,40)
mod_spline3ns = lm(triceps ~ ns(age, knots = knot_value_manual_3),data=dt.triceps)
summary(mod_spline3ns)
##
## Call:
## lm(formula = triceps ~ ns(age, knots = knot_value_manual_3),
## data = dt.triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7220 -1.7640 -0.3985 1.1908 22.9684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.8748 0.3742 23.714 < 2e-16 ***
## ns(age, knots = knot_value_manual_3)1 7.0119 0.6728 10.422 < 2e-16 ***
## ns(age, knots = knot_value_manual_3)2 6.0762 0.8625 7.045 3.72e-12 ***
## ns(age, knots = knot_value_manual_3)3 2.0780 1.0632 1.954 0.051 .
## ns(age, knots = knot_value_manual_3)4 8.8616 0.9930 8.924 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.762 on 887 degrees of freedom
## Multiple R-squared: 0.4176, Adjusted R-squared: 0.415
## F-statistic: 159 on 4 and 887 DF, p-value: < 2.2e-16
ggplot(dt.triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black")+
stat_smooth(method = "lm",
formula = y~ns(x, knots = knot_value_manual_3),
lty = 1,se=F) # ns: the fit is constrained to be linear beyond the boundary knots
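The linearity constraint beyond the boundary knots also reduces the size of the basis: with the same 3 interior knots, ns() returns 4 basis columns versus 6 for bs(), which is why the summary above has only 4 spline coefficients. A quick check:
ncol(bs(dt.triceps$age, knots = knot_value_manual_3))  # 6 columns (cubic B-spline)
ncol(ns(dt.triceps$age, knots = knot_value_manual_3))  # 4 columns (natural spline)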
Smoothing Spline
The smooth.spline() function can be used to fit a smoothing spline in R.
model_sms <- with(data = triceps,smooth.spline(age,triceps))
model_sms
## Call:
## smooth.spline(x = age, y = triceps)
##
## Smoothing Parameter spar= 0.5162704 lambda= 1.232259e-06 (13 iterations)
## Equivalent Degrees of Freedom (Df): 50.00639
## Penalized Criterion (RSS): 8591.581
## GCV: 13.77835
The smooth.spline function automatically selects the best smoothing parameter lambda using cross-validation (CV) or generalized cross-validation (GCV). The main difference between the two lies in the goodness-of-fit measure used: CV uses the leave-one-out MSE, while GCV uses a weighted MSE. The number of folds for CV and GCV here equals the number of observations (leave-one-out).
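The choice between the two criteria is controlled by the cv argument of smooth.spline(); a short sketch (cv = TRUE requests ordinary leave-one-out CV, while the default cv = FALSE uses GCV):
fit_gcv   <- with(triceps, smooth.spline(age, triceps))            # GCV (default)
fit_loocv <- with(triceps, smooth.spline(age, triceps, cv = TRUE)) # leave-one-out CV; tied age values may trigger a warning
c(lambda_gcv = fit_gcv$lambda, lambda_loocv = fit_loocv$lambda)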
pred_data <- broom::augment(model_sms)
ggplot(pred_data,aes(x=x,y=y))+
geom_point(alpha=0.55, color="black")+
geom_line(aes(y=.fitted),col="blue",
lty=1)+
xlab("age")+
ylab("triceps")+
theme_bw()
In the figure above, the blue curve shows the smoothed fit over the range of the data.
The effect of the lambda parameter on the smoothness of the curve can be seen in the following illustration:
model_sms_lambda <- data.frame(lambda=seq(0,5,by=0.5)) %>%
group_by(lambda) %>%
do(broom::augment(with(data = triceps,smooth.spline(age,triceps,lambda = .$lambda))))
p <- ggplot(model_sms_lambda,
aes(x=x,y=y))+
geom_line(aes(y=.fitted),
col="blue",
lty=1
)+
facet_wrap(~lambda)
p
The smaller lambda is, the more closely the model follows the data pattern (a more complex, wigglier fit with less smoothing); the larger lambda is, the smoother the curve becomes, approaching a straight line as the roughness penalty dominates.
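To make this concrete, the sketch below compares the equivalent degrees of freedom for a very small and a fairly large lambda (values chosen only for illustration):
fit_wiggly <- with(triceps, smooth.spline(age, triceps, lambda = 1e-6))
fit_smooth <- with(triceps, smooth.spline(age, triceps, lambda = 5))
c(df_small_lambda = fit_wiggly$df, df_large_lambda = fit_smooth$df) # smaller lambda -> larger effective df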
If we instead fix df=7, smooth.spline fits a considerably smoother curve:
model_sms <- with(data = triceps,smooth.spline(age,triceps,df=7))
model_sms
## Call:
## smooth.spline(x = age, y = triceps, df = 7)
##
## Smoothing Parameter spar= 1.049874 lambda= 0.008827582 (11 iterations)
## Equivalent Degrees of Freedom (Df): 7.000747
## Penalized Criterion (RSS): 10112.16
## GCV: 14.20355
Based on the output above, the GCV differs between the two fits: 13.77835 for the previous (automatically selected) model versus 14.20355 for df=7. The previous model therefore has the smaller GCV and is the preferred smoothing spline.
pred_data <- broom::augment(model_sms)
ggplot(pred_data,aes(x=x,y=y))+
geom_point(alpha=0.55, color="black")+
geom_line(aes(y=.fitted),col="blue",
lty=1)+
xlab("age")+
ylab("triceps")+
theme_bw()
ADDITIONAL MATERIAL
LOESS
Still using the triceps data from the previous illustration, this time we try a local regression approach with the loess() function.
model_loess <- loess(triceps~age,
data = triceps)
summary(model_loess)
## Call:
## loess(formula = triceps ~ age, data = triceps)
##
## Number of Observations: 892
## Equivalent Number of Parameters: 4.6
## Residual Standard Error: 3.777
## Trace of smoother matrix: 5.01 (exact)
##
## Control settings:
## span : 0.75
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
The effect of the span parameter on the smoothness of the curve can be seen in the following illustration:
model_loess_span <- data.frame(span=seq(0.1,5,by=0.5)) %>%
group_by(span) %>%
do(broom::augment(loess(triceps~age,
data = triceps,span=.$span)))
p2 <- ggplot(model_loess_span,
aes(x=age,y=triceps))+
geom_line(aes(y=.fitted),
col="blue",
lty=1
)+
facet_wrap(~span)
p2
This approach can also be applied with the stat_smooth() function from the ggplot2 package.
library(ggplot2)
ggplot(triceps, aes(age,triceps)) +
geom_point(alpha=0.5,color="black") +
stat_smooth(method='loess',
formula=y~x,
span = 0.75,
col="blue",
lty=1,
se=F)
The span can be tuned using cross-validation:
set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
span <- seq(0.1,1,length.out=50)
best_loess <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
function(x){
mod <- loess(triceps ~ age,span = i,
data=triceps[x$in_id,])
pred <- predict(mod,
newdata=triceps[-x$in_id,])
truth <- triceps[-x$in_id,]$triceps
data_eval <- na.omit(data.frame(pred=pred,
truth=truth))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
head(metric_loess,20)
# compute the average over the 10 folds
mean_metric_loess <- colMeans(metric_loess)
mean_metric_loess
}
)
best_loess <- cbind(span=span,best_loess)
# display the results for all span values
best_loess
## span rmse mae
## 1 0.1000000 3.773822 2.495752
## 2 0.1183673 3.771342 2.493506
## 3 0.1367347 3.766450 2.487557
## 4 0.1551020 3.757385 2.480571
## 5 0.1734694 3.749539 2.473494
## 6 0.1918367 3.746288 2.470915
## 7 0.2102041 3.742741 2.467990
## 8 0.2285714 3.740397 2.465388
## 9 0.2469388 3.737724 2.463311
## 10 0.2653061 3.737105 2.462252
## 11 0.2836735 3.735645 2.460514
## 12 0.3020408 3.734522 2.459225
## 13 0.3204082 3.733298 2.457694
## 14 0.3387755 3.732538 2.456280
## 15 0.3571429 3.732082 2.455452
## 16 0.3755102 3.731261 2.454642
## 17 0.3938776 3.730882 2.454293
## 18 0.4122449 3.730585 2.454291
## 19 0.4306122 3.730351 2.454869
## 20 0.4489796 3.730717 2.456222
## 21 0.4673469 3.730831 2.456768
## 22 0.4857143 3.731387 2.458307
## 23 0.5040816 3.732091 2.460189
## 24 0.5224490 3.732392 2.461811
## 25 0.5408163 3.733609 2.464543
## 26 0.5591837 3.734393 2.467180
## 27 0.5775510 3.735554 2.470517
## 28 0.5959184 3.736679 2.473012
## 29 0.6142857 3.738227 2.476519
## 30 0.6326531 3.741401 2.476887
## 31 0.6510204 3.742660 2.479842
## 32 0.6693878 3.744336 2.483581
## 33 0.6877551 3.746651 2.488792
## 34 0.7061224 3.748708 2.492180
## 35 0.7244898 3.750734 2.495218
## 36 0.7428571 3.752815 2.498359
## 37 0.7612245 3.753998 2.504175
## 38 0.7795918 3.755981 2.511421
## 39 0.7979592 3.756794 2.514375
## 40 0.8163265 3.759456 2.522994
## 41 0.8346939 3.767892 2.543250
## 42 0.8530612 3.777168 2.557863
## 43 0.8714286 3.782407 2.566972
## 44 0.8897959 3.788428 2.578042
## 45 0.9081633 3.797002 2.595698
## 46 0.9265306 3.803259 2.609852
## 47 0.9448980 3.809478 2.623386
## 48 0.9632653 3.824192 2.655691
## 49 0.9816327 3.835596 2.679155
## 50 1.0000000 3.860514 2.718911
# best model by RMSE
best_loess %>% slice_min(rmse)
## span rmse mae
## 1 0.4306122 3.730351 2.454869
# best model by MAE
best_loess %>% slice_min(mae)
## span rmse mae
## 1 0.4122449 3.730585 2.454291
library(ggplot2)
ggplot(triceps, aes(age,triceps)) +
geom_point(alpha=0.5,color="black") +
stat_smooth(method='loess',
formula=y~x,
span = 0.4306122,
col="blue",
lty=1,
se=F)
library(ggplot2)
ggplot(triceps, aes(age,triceps)) +
geom_point(alpha=0.5,color="black") +
stat_smooth(method='loess',
formula=y~x,
span = 0.41,
col="blue",
lty=1,
se=F)
EXERCISE
AUTO DATA
The Auto data come from the ISLR package, which contains the datasets used in the book “An Introduction to Statistical Learning with Applications in R”. The subset used in this practicum consists of 392 observations and 3 variables:
- mpg : miles per gallon
- horsepower : engine horsepower
- origin : origin of car (1. American, 2. European, 3. Japanese)
Data
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
AutoData = Auto %>% select(mpg, horsepower, origin)
tibble(AutoData)
## # A tibble: 392 x 3
## mpg horsepower origin
## <dbl> <dbl> <dbl>
## 1 18 130 1
## 2 15 165 1
## 3 18 150 1
## 4 16 150 1
## 5 17 140 1
## 6 15 198 1
## 7 14 220 1
## 8 14 215 1
## 9 14 225 1
## 10 15 190 1
## # ... with 382 more rows
Data Exploration
Correlation
cor(AutoData)
## mpg horsepower origin
## mpg 1.0000000 -0.7784268 0.5652088
## horsepower -0.7784268 1.0000000 -0.4551715
## origin 0.5652088 -0.4551715 1.0000000
Correlation Plot
chart.Correlation(AutoData)
Based on the correlation plot above, the correlation between mpg and horsepower is negative at -0.78, the correlation between mpg and origin is positive at 0.57, and the correlation between horsepower and origin is negative at -0.46.
Linear Regression
# linear regression
mod.lin.auto <- lm(mpg~horsepower, data=AutoData)
summary(mod.lin.auto)
##
## Call:
## lm(formula = mpg ~ horsepower, data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# Plot
plot(AutoData$horsepower,AutoData$mpg, main = "Plot MPG dan Horsepower",ylab="MPG",xlab="Horsepower")
lines(AutoData$horsepower,mod.lin.auto$fitted.values,col="blue")
Polynomial Regression
# polynomial regression
mod.poly.auto <- lm(AutoData$mpg~AutoData$horsepower+I(AutoData$horsepower^2))
summary(mod.poly.auto)
##
## Call:
## lm(formula = AutoData$mpg ~ AutoData$horsepower + I(AutoData$horsepower^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997 1.8004268 31.60 <2e-16 ***
## AutoData$horsepower -0.4661896 0.0311246 -14.98 <2e-16 ***
## I(AutoData$horsepower^2) 0.0012305 0.0001221 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
# Sorting
ix<-sort(AutoData$horsepower,index.return=T)$ix
# Plot
plot(AutoData$horsepower,AutoData$mpg, main = "Plot MPG dan Horsepower",ylab="MPG",xlab="Horsepower")
lines(AutoData$horsepower[ix],mod.poly.auto$fitted.values[ix],col="blue")
Step Function Regression
# Formatting
range(AutoData$horsepower)
## [1] 46 230
c1 <- as.factor(ifelse(AutoData$horsepower<=100,1,0))
c2 <- as.factor(ifelse(AutoData$horsepower<=200,1,0))
c3 <- as.factor(ifelse(AutoData$horsepower>200,1,0))
head(data.frame(AutoData$mpg,c1,c2,c3))
## AutoData.mpg c1 c2 c3
## 1 18 0 1 0
## 2 15 0 1 0
## 3 18 0 1 0
## 4 16 0 1 0
## 5 17 0 1 0
## 6 15 0 1 0
# step function regression
mod.step.auto <- lm(mpg~c1+c2+c3, data=AutoData)
summary(mod.step.auto)
##
## Call:
## lm(formula = mpg ~ c1 + c2 + c3, data = AutoData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5917 -3.7167 -0.5917 3.4083 19.0083
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.9000 1.8132 7.114 5.44e-12 ***
## c11 10.5589 0.6089 17.342 < 2e-16 ***
## c21 4.1329 1.8769 2.202 0.0283 *
## c31 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.734 on 389 degrees of freedom
## Multiple R-squared: 0.4631, Adjusted R-squared: 0.4603
## F-statistic: 167.7 on 2 and 389 DF, p-value: < 2.2e-16
# Plot
plot(AutoData$horsepower,AutoData$mpg, main = "Plot Antara MPG dan Horsepower",ylab="MPG",xlab="Horsepower")
lines(AutoData$horsepower[ix],mod.step.auto$fitted.values[ix],col="blue")
Model Comparison
MSE Values
# MSE function
MSE = function(pred,actual){
mean((pred-actual)^2)
}
# comparison of the three models
MSEAuto <- rbind(MSE(predict(mod.lin.auto),AutoData$mpg),
MSE(predict(mod.poly.auto),AutoData$mpg),
MSE(predict(mod.step.auto),AutoData$mpg))
ModelAuto <- c("Regresi Linear","Regresi Polynomial","Regresi Fungsi Tangga")
data.frame("Model"=ModelAuto,"MSE"=MSEAuto)
## Model MSE
## 1 Regresi Linear 23.94366
## 2 Regresi Polynomial 18.98477
## 3 Regresi Fungsi Tangga 32.62641
AIC Values
AICAuto <- data.frame("Model"=c("Linear","Polynomial","Fungsi Tangga"),
"AIC"=c(AIC(mod.lin.auto),AIC(mod.poly.auto),AIC(mod.step.auto)))
AICAuto
## Model AIC
## 1 Linear 2363.324
## 2 Polynomial 2274.354
## 3 Fungsi Tangga 2486.616
Based on the results above, the polynomial regression model has the smallest AIC and MSE, so it can be concluded to be the best of the three models by both criteria.
Cross Validation Evaluation
Linear Regression
set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ horsepower,data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear
## # A tibble: 5 x 2
## rmse mae
## <dbl> <dbl>
## 1 4.45 3.49
## 2 5.42 4.22
## 3 4.80 3.64
## 4 4.71 3.89
## 5 5.11 3.94
# compute the average over the 5 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
## rmse mae
## 4.898592 3.835773
Degree-2 Polynomial Regression (Order 2)
set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2
## # A tibble: 5 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.80 2.78
## 2 4.78 3.65
## 3 4.19 3.08
## 4 4.01 3.13
## 5 5.07 3.78
# compute the average over the 5 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
## rmse mae
## 4.370631 3.284748
Degree-3 Polynomial Regression (Order 3)
set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),data=AutoData[x$in_id,])
pred <- predict(mod,newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly3
## # A tibble: 5 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.79 2.76
## 2 4.78 3.64
## 3 4.20 3.09
## 4 4.00 3.12
## 5 5.09 3.79
# compute the average over the 5 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3
## rmse mae
## 4.372623 3.280150
Step Function Regression
set.seed(123)
cross_val <- vfold_cv(AutoData,v=5,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- AutoData[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- AutoData[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# compute the average over the 5 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)
# display the results for all breaks
best_tangga
## breaks rmse mae
## 1 3 5.788482 4.522081
## 2 4 5.026946 3.778564
## 3 5 4.737723 3.588164
## 4 6 4.728788 3.573371
## 5 7 4.590146 3.411292
## 6 8 4.492694 3.419407
## 7 9 4.582439 3.494722
## 8 10 4.609092 3.476580
Model Comparison
RMSE and MAE Values
nilai_metricAuto <- rbind(mean_metric_linear,
mean_metric_poly2,
mean_metric_poly3,
best_tangga %>% select(-1) %>% slice_min(mae))
nama_modelAuto <- c("Linear","Polynomial (Ordo=2)","Polynomial (Ordo=3)","Fungsi Tangga (Breaks=7)")
data.frame("Model"=nama_modelAuto,nilai_metricAuto)
## Model rmse mae
## 1 Linear 4.898592 3.835773
## 2 Polynomial (Ordo=2) 4.370631 3.284748
## 3 Polynomial (Ordo=3) 4.372623 3.280150
## 4 Fungsi Tangga (Breaks=7) 4.590146 3.411292
Based on RMSE, the degree-2 polynomial regression model gives the smallest value, while based on MAE the degree-3 polynomial regression model gives the smallest value; the two criteria therefore favor slightly different models.
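As one possible follow-up (a sketch, not part of the comparison above), the degree-2 polynomial favored by RMSE can be refitted on the full Auto data and visualized:
final_mod_auto <- lm(mpg ~ poly(horsepower, 2, raw = T), data = AutoData)
ggplot(AutoData, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.5, color = "black") +
stat_smooth(method = "lm", formula = y ~ poly(x, 2, raw = T),
lty = 1, col = "blue", se = F) +
theme_bw()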