Tugas Praktikum STA 581-8 | Model Non Linear
Soal
Data untuk tugas dapat diperoleh di package ISLR. Silahkan install dulu packagenya.
Gunakan model-model non linear yang sudah dipelajari pada data Auto dengan peubah respon mpg dan pilih tiga kolom untuk dijadikan peubah prediktor.
Apakah ada bukti untuk hubungan non linear untuk peubah-peubah yang anda pilih? Buat visualisasi untuk mendukung jawaban Anda.
Tentukan model non-linear terbaik untuk masing pasangan peubah (secara visual dan/atau secara empiris).
Data
Dataset Auto terdiri dari 392 observasi dengan 9 variabel yaitu :
mpg(miles per gallon) : penggunaan bahan bakar yang dihitung dalam mpgcylinders: jumlah silinder yang digunakan pada kendaraandisplacement: besarnya perpindahan mesin (cu.inches)horsepower: tenaga kuda dari kendaraanweight: berat kendaraan (lbs.)acceleration: percepatan dari 0 ke 60 mph (detik)year: tahun model (modulo 100)origin: asal negara produsen (1 : Amerika, 2 : Eropa, 3 : Jepang)name: nama kendaraan
Peubah respon : mpg
Peubah prediktor : horsepower, displacement, dan weight
library(ISLR)
library(tidyverse)
library(splines)
library(ggplot2)
library(ggpubr)
library(rsample)
library(caret)
library(DT)
head(Auto)## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
Plot Awal
A <-ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.3, color="orange") +
theme_bw()+
ggtitle("Plot horsepower vs mpg") +
xlab("horsepower") +
ylab("mpg")
B <-ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.3, color="purple") +
theme_bw()+
ggtitle("Plot displacement vs mpg") +
xlab("displacement") +
ylab("mpg")
C <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=1, color="pink") +
theme_bw()+
ggtitle("Plot weight vs mpg") +
xlab("weight") +
ylab("mpg")
ggarrange(A, B, C, ncol = 2, nrow = 2)Berdasarkan scatter plot di atas, terlihat bahwa sebaran data tidak membentuk pola linear. Hubungan ini akan diuji dengan model regresi non linear, yaitu polinomial, fungsi tangga dan natural cubic spline dan akan ditentukan model terbaik dari masing-masing pasangan peubahnya.
Peubah mpg vs horsepower
Regresi Linier
set.seed(123)
cross_val_h_lin <- vfold_cv(Auto,v=5,strata = "mpg")
metric_linear_h <- map_dfr(cross_val_h_lin$splits,
function(x){
mod <- lm(mpg ~ horsepower,data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear_h## # A tibble: 5 x 2
## rmse mae
## <dbl> <dbl>
## 1 4.45 3.49
## 2 5.42 4.22
## 3 4.80 3.64
## 4 4.71 3.89
## 5 5.11 3.94
# menghitung rata-rata untuk 5 folds
mean_metric_linear_h <- colMeans(metric_linear_h)
mean_metric_linear_h## rmse mae
## 4.898592 3.835773
Lh <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.3, color="orange") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "black",se = F)+
theme_bw()+
ggtitle("Linear") +
xlab("horsepower") +
ylab("mpg")
LhRegresi Polinomial
# Cross Validation dengan 5 lipatan
set.seed(123)
cross_val_hrspwr <- vfold_cv(Auto,v=5, strata = "mpg") #rsample
ordo <- 2:5
best_poly_hrspwr <- map_dfr(ordo, function(i) { #tidiverse iterasi pada suatu fungsi (output data.frame)
Polinomial_hrspwr <- map_dfr(cross_val_hrspwr$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,ordo=i),data=Auto[x$in_id,]) #in_id adalah data training
pred <- predict(mod,newdata=Auto[-x$in_id,]) #nilai prediksi untuk data testing
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
Polinomial_hrspwr
# menghitung rata-rata untuk 5 folds
mean_Polinomial_hrspwr <- colMeans(Polinomial_hrspwr)
mean_Polinomial_hrspwr
}
)
# menampilkan hasil all ordo
best_poly_hrspwr <- cbind(ordo=ordo,best_poly_hrspwr)
best_poly_hrspwr## ordo rmse mae
## 1 2 4.370631 3.284748
## 2 3 4.372623 3.280150
## 3 4 4.373708 3.288706
## 4 5 4.338150 3.271598
# berdasarkan rmse
best_poly_hrspwr %>% slice_min(rmse)## ordo rmse mae
## 1 5 4.33815 3.271598
# berdasarkan mae
best_poly_hrspwr %>% slice_min(mae)## ordo rmse mae
## 1 5 4.33815 3.271598
Plot CV
best_poly_hrspwr$ordo <- as.factor(best_poly_hrspwr$ordo)
ggplot(data=best_poly_hrspwr, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Ordo") +
ylab("RMSE")+
theme_bw() Secara empiris regresi polinomial dengan ordo 5 memiliki RMSE dan MAE terkecil. Namun, dipilih regresi polinomial dengan ordo 2 karena untuk ordo 5 pada bagian ujung-ujung plot cenderung tidak stabil.
Ph2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.3, color="orange") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "green3",se = F)+
theme_bw()+
ggtitle("Polinomial Ordo 2") +
xlab("horsepower") +
ylab("mpg")
Ph5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.3, color="orange") +
stat_smooth(method = "lm",
formula = y~poly(x,5,raw=T),
lty = 1, col = "green3",se = F)+
theme_bw()+
ggtitle("Polinomial Ordo 5") +
xlab("horsepower") +
ylab("mpg")
ggarrange(Ph2, Ph5, ncol = 2)set.seed(123)
cross_val_hrspwr <- vfold_cv(Auto,v=5, strata = "mpg")
Polinomial_hrspwr2 <- map_dfr(cross_val_hrspwr$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw=T),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
Polinomial_hrspwr2## # A tibble: 5 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.80 2.78
## 2 4.78 3.65
## 3 4.19 3.08
## 4 4.01 3.13
## 5 5.07 3.78
# menghitung rata-rata untuk 5 folds
mean_Polinomial_hrspwr2 <- colMeans(Polinomial_hrspwr2)
mean_Polinomial_hrspwr2## rmse mae
## 4.370631 3.284748
Fungsi Tangga (Piecewise Constant)
set.seed(123)
cross_val_hrspwr <- vfold_cv(Auto,v=5,strata = "mpg")
breaks <- 2:8 # pembagian interval 2 sampai 8
best_ftangga_hrspwr <- map_dfr(breaks,
function(i){
ftangga_hrspwr <- map_dfr(cross_val_hrspwr$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i) #Transformasi jadi factor
mod <- lm(mpg ~ horsepower,data=training)
#Convert ke numeric lagi
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*",
"\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]",
"\\1", labs_x) ))
#Testing
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
ftangga_hrspwr
# menghitung rata-rata untuk 5 folds
ftangga_hrspwr <- colMeans(ftangga_hrspwr)
ftangga_hrspwr
}
)
best_ftangga_hrspwr <- cbind(breaks=breaks, best_ftangga_hrspwr)
# menampilkan hasil all breaks
best_ftangga_hrspwr## breaks rmse mae
## 1 2 6.119241 4.756365
## 2 3 5.788482 4.522081
## 3 4 5.026946 3.778564
## 4 5 4.737723 3.588164
## 5 6 4.728788 3.573371
## 6 7 4.590146 3.411292
## 7 8 4.492694 3.419407
#berdasarkan rmse
best_ftangga_hrspwr %>% slice_min(rmse)## breaks rmse mae
## 1 8 4.492694 3.419407
#berdasarkan mae
best_ftangga_hrspwr %>% slice_min(mae)## breaks rmse mae
## 1 7 4.590146 3.411292
Berdasarkan RMSE terkecil banyaknya interval yang terbaik adalah 8 sedangkan berdasarkan MAE terkecil banyaknya interval terbaik adalah 7.
Plot CV
T1 <- ggplot(data=best_ftangga_hrspwr, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Breaks") +
ylab("RMSE")+
theme_bw()
T2 <- ggplot(data=best_ftangga_hrspwr, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Breaks") +
ylab("MAE")+
theme_bw()
ggarrange(T1, T2, ncol = 2, nrow = 1)Th <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.3, color="orange") +
stat_smooth(method = "lm",
formula = y~cut(x,8),lty = 1,
col = "red",se = F)+
theme_bw()+
ggtitle("Fungsi Tangga 8 knots") +
xlab("horsepower") +
ylab("mpg")
ThNatural Cubic Spline
set.seed(123)
cross_val_h_ns <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:15
best_nspline_h <- map_dfr(df, function(i){
nspline_h <- map_dfr(cross_val_h_ns$splits, function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
nspline_h
# menghitung rata-rata untuk 5 folds
mean_nspline_h <- colMeans(nspline_h)
mean_nspline_h
}
)
best_nspline_h <- cbind(df=df,best_nspline_h)
# menampilkan hasil all breaks
best_nspline_h## df rmse mae
## 1 2 4.350969 3.264158
## 2 3 4.358924 3.270071
## 3 4 4.349080 3.276399
## 4 5 4.334825 3.266389
## 5 6 4.335422 3.264572
## 6 7 4.315948 3.260216
## 7 8 4.326330 3.272321
## 8 9 4.305634 3.245872
## 9 10 4.291769 3.231180
## 10 11 4.320656 3.253788
## 11 12 4.320773 3.236359
## 12 13 4.346547 3.266437
## 13 14 4.378758 3.292366
## 14 15 4.373600 3.303746
#berdasarkan rmse
best_nspline_h %>% slice_min(rmse)## df rmse mae
## 1 10 4.291769 3.23118
#berdasarkan mae
best_nspline_h %>% slice_min(mae)## df rmse mae
## 1 10 4.291769 3.23118
Plot CV
best_nspline_h$df <- as.factor(best_nspline_h$df)
ggplot(data=best_nspline_h, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("DF") +
ylab("RMSE")+
theme_bw() Berdasarkan hasil di atas, model natural cubic spline terbaik memiliki 10 knots.
Secara Visual
Knots
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=10),"knots")## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
NSh <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.3, color="orange") +
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,col = "blue", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines (df=10)") +
xlab("horsepower") +
ylab("mpg")+
geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140,157.7), col="grey50", lty=2)
NShPerbandingan Model
Secara Empiris berdasarkan nilai RMSE dan MAE
nilai_metric_h <- rbind(mean_metric_linear_h,
mean_Polinomial_hrspwr2,
best_ftangga_hrspwr %>% select(-1) %>% slice_min(rmse),
best_nspline_h %>% select(-1)%>% slice_min(rmse)
)
nama_model_regresi_h <- c("Linear",
"Polinomial ordo 2",
"Fungsi Tangga (breaks=8)",
"Natural Cubic Splines (df=10)"
)
komparasi <- data.frame(nama_model_regresi_h,nilai_metric_h)
komparasi %>% datatable(komparasi, class = 'cell-border stripe')Berdasarkan perbandingan secara empiris di atas, model terbaik yang dapat digunakan untuk memodelkan mpg vs horsepower adalah Natural Cubic Spline dengan 10 Knot.
Secara Visual berdasarkan Plot
ggarrange(Lh, Ph2, Th, NSh, ncol = 2, nrow = 2)plot_all_horsepower <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.3, color="orange") +
stat_smooth(method = "lm",
formula = y~x,
lty = 1, col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "green",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,col = "blue", se = F)+
labs(x = "horsepower" , y = "mpg") +
ggtitle(expression(atop("Gabungan Model Terbaik",
atop(italic("mpg vs horsepower"))))) +
theme(plot.title = element_text(hjust = 0.5, vjust = -2))
plot_all_horsepowerPeubah mpg vs displacement
Regresi Linier
set.seed(123)
cross_val_lin_d <- vfold_cv(Auto,v=5,strata = "mpg")
metric_linear_d <- map_dfr(cross_val_lin_d$splits,
function(x){
mod <- lm(mpg ~ displacement,data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear_d## # A tibble: 5 x 2
## rmse mae
## <dbl> <dbl>
## 1 4.40 3.29
## 2 5.83 4.45
## 3 4.46 3.48
## 4 4.51 3.55
## 5 3.72 2.80
# menghitung rata-rata untuk 5 folds
mean_metric_linear_d <- colMeans(metric_linear_d)
mean_metric_linear_d## rmse mae
## 4.583160 3.516231
Ld <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.3, color="purple") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "black",se = F)+
theme_bw()+
ggtitle("Linear") +
xlab("displacement") +
ylab("mpg")
LdRegresi Polinomial
# Cross Validation dengan 5 lipatan
set.seed(123)
cross_val_dis <- vfold_cv(Auto,v=5, strata = "mpg") #rsample
ordo <- 2:5
best_poly_dis <- map_dfr(ordo, function(i) { #tidiverse iterasi pada suatu fungsi (output data.frame)
Polinomial_dis <- map_dfr(cross_val_dis$splits,
function(x){
mod <- lm(mpg ~ poly(displacement,ordo=i),data=Auto[x$in_id,]) #in_id adalah data training
pred <- predict(mod,newdata=Auto[-x$in_id,]) #nilai prediksi untuk data testing
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
Polinomial_dis
# menghitung rata-rata untuk 5 folds
mean_Polinomial_dis <- colMeans(Polinomial_dis)
mean_Polinomial_dis
}
)
# menampilkan hasil all ordo
best_poly_dis <- cbind(ordo=ordo,best_poly_dis)
best_poly_dis## ordo rmse mae
## 1 2 4.304907 3.159841
## 2 3 4.312431 3.170241
## 3 4 4.319214 3.164415
## 4 5 4.339744 3.183127
# berdasarkan rmse
best_poly_dis %>% slice_min(rmse)## ordo rmse mae
## 1 2 4.304907 3.159841
# berdasarkan mae
best_poly_dis %>% slice_min(mae)## ordo rmse mae
## 1 2 4.304907 3.159841
Plot CV
best_poly_dis$ordo <- as.factor(best_poly_dis$ordo)
ggplot(data=best_poly_dis, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Ordo") +
ylab("RMSE")+
theme_bw() Secara empiris, regresi polinomial dengan ordo 2 memiliki RMSE dan MAE terkecil.
Pd2 <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.3, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "green3",se = F)+
theme_bw()+
ggtitle("Polinomial Ordo 2") +
xlab("displacement") +
ylab("mpg")
Pd2Fungsi Tangga (Piecewise Constant)
set.seed(123)
cross_val_ft_dis <- vfold_cv(Auto,v=5,strata = "mpg")
breaks <-2:8 # pembagian interval 2 sampai 8
best_tangga_ft_dis <- map_dfr(breaks, function(i){
metric_tangga_ft_dis <- map_dfr(cross_val_ft_dis$splits, function(x){
training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i) #Transformasi jadi factor
mod <- lm(mpg ~ displacement, data=training)
#Convert ke numeric lagi
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
#Testing
testing <- Auto[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_ft_dis
# menghitung rata-rata untuk 5 folds
mean_metric_tangga_ft_dis <- colMeans(metric_tangga_ft_dis)
mean_metric_tangga_ft_dis
})
best_tangga_ft_dis <- cbind(breaks=breaks,best_tangga_ft_dis)
# menampilkan hasil all breaks
best_tangga_ft_dis## breaks rmse mae
## 1 2 5.925600 4.629407
## 2 3 4.893745 3.675196
## 3 4 4.824438 3.623216
## 4 5 4.666040 3.487272
## 5 6 4.603742 3.404050
## 6 7 4.588973 3.382947
## 7 8 4.405202 3.250663
#berdasarkan rmse
best_tangga_ft_dis %>% slice_min(rmse)## breaks rmse mae
## 1 8 4.405202 3.250663
#berdasarkan mae
best_tangga_ft_dis %>% slice_min(mae)## breaks rmse mae
## 1 8 4.405202 3.250663
Berdasarkan RMSE dan MAE terkecil, banyaknya interval yang terbaik adalah 8.
Plot CV
T1 <- ggplot(data=best_tangga_ft_dis, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Breaks") +
ylab("RMSE")+
theme_bw()
T1Td <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.3, color="purple") +
stat_smooth(method = "lm",
formula = y~cut(x,8),lty = 1,
col = "red",se = F)+
theme_bw()+
ggtitle("Fungsi Tangga 8 knots") +
xlab("displacement") +
ylab("mpg")
TdNatural Cubic Spline
set.seed(123)
cross_val_d_ns <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:15
best_nspline_d <- map_dfr(df, function(i){
nspline_d <- map_dfr(cross_val_d_ns$splits, function(x){
mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
nspline_d
# menghitung rata-rata untuk 5 folds
mean_nspline_d <- colMeans(nspline_d)
mean_nspline_d
}
)
best_nspline_d <- cbind(df=df,best_nspline_d)
# menampilkan hasil all breaks
best_nspline_d## df rmse mae
## 1 2 4.301604 3.163136
## 2 3 4.313449 3.172620
## 3 4 4.351866 3.186249
## 4 5 4.323274 3.204052
## 5 6 4.248683 3.147266
## 6 7 4.233862 3.143808
## 7 8 4.214122 3.153459
## 8 9 4.202560 3.144546
## 9 10 4.182256 3.137854
## 10 11 4.153097 3.099830
## 11 12 4.096341 3.044499
## 12 13 4.154786 3.102888
## 13 14 4.192029 3.128335
## 14 15 4.235035 3.154566
#berdasarkan rmse
best_nspline_d %>% slice_min(rmse)## df rmse mae
## 1 12 4.096341 3.044499
#berdasarkan mae
best_nspline_d %>% slice_min(mae)## df rmse mae
## 1 12 4.096341 3.044499
Plot CV
best_nspline_d$df <- as.factor(best_nspline_d$df)
ggplot(data=best_nspline_d, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("DF") +
ylab("RMSE")+
theme_bw() Berdasarkan hasil di atas, model natural cubic spline terbaik memiliki 12 knots.
Secara Visual
Knots
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$displacement, df=12),"knots")## 8.333333% 16.66667% 25% 33.33333% 41.66667% 50% 58.33333% 66.66667%
## 89.0000 97.0000 105.0000 119.0000 130.9167 151.0000 200.0000 232.0000
## 75% 83.33333% 91.66667%
## 275.7500 318.0000 351.0000
NSd <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.3, color="purple") +
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,col = "blue", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines (df=12)") +
xlab("displacement") +
ylab("mpg")+
geom_vline(xintercept = c(89, 97, 105, 119, 130.9, 151, 200, 232, 275.75, 318, 351), col="grey50", lty=2)
NSdPerbandingan Model
Secara Empiris berdasarkan nilai RMSE dan MAE
nilai_metric_d <- rbind(mean_metric_linear_d,
best_poly_dis %>% select(-1) %>% slice_min(rmse),
best_tangga_ft_dis %>% select(-1) %>% slice_min(rmse),
best_nspline_d %>% select(-1) %>% slice_min(rmse)
)
nama_model_regresi_d <- c("Linear",
"Polinomial ordo 2",
"Fungsi Tangga (breaks=8)",
"Natural Cubic Splines (df=12)"
)
komparasi <- data.frame(nama_model_regresi_d,nilai_metric_d)
komparasi %>% datatable(komparasi, class = 'cell-border stripe')Berdasarkan perbandingan secara empiris di atas, model terbaik yang dapat digunakan untuk memodelkan mpg vs displacement adalah Natural Cubic Spline dengan 12 Knot.
Secara Visual berdasarkan Plot
ggarrange(Ld, Pd2, Td, NSd, ncol = 2, nrow = 2)plot_all_displacement<- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.3, color="purple") +
stat_smooth(method = "lm",
formula = y~x,
lty = 1, col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "green",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,col = "blue", se = F)+
labs(x = "displacement" , y = "mpg") +
ggtitle(expression(atop("Gabungan Model Terbaik",
atop(italic("mpg vs displacement"))))) +
theme(plot.title = element_text(hjust = 0.5, vjust = -2))
plot_all_displacementPeubah mpg vs weight
Regresi Linear
set.seed(123)
cross_val_lin_w <- vfold_cv(Auto,v=5,strata = "mpg")
metric_linear_w <- map_dfr(cross_val_lin_w$splits,
function(x){
mod <- lm(mpg ~ weight,data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear_w## # A tibble: 5 x 2
## rmse mae
## <dbl> <dbl>
## 1 4.15 3.08
## 2 4.90 3.76
## 3 4.39 3.31
## 4 4.17 3.08
## 5 3.93 3.17
# menghitung rata-rata untuk 5 folds
mean_metric_linear_w <- colMeans(metric_linear_w)
mean_metric_linear_w## rmse mae
## 4.310464 3.280982
Lw <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "black",se = F)+
theme_bw()+
ggtitle("Linear") +
xlab("weight") +
ylab("mpg")
LwRegresi Polinomial
# Cross Validation dengan 5 lipatan
set.seed(123)
cross_val_w <- vfold_cv(Auto,v=5, strata = "mpg") #rsample
ordo <- 2:5
best_poly_w <- map_dfr(ordo, function(i) { #tidiverse iterasi pada suatu fungsi (output data.frame)
Polinomial_w <- map_dfr(cross_val_w$splits,
function(x){
mod <- lm(mpg ~ poly(weight,ordo=i),data=Auto[x$in_id,]) #in_id adalah data training
pred <- predict(mod,newdata=Auto[-x$in_id,]) #nilai prediksi untuk data testing
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
Polinomial_w
# menghitung rata-rata untuk 5 folds
mean_Polinomial_w <- colMeans(Polinomial_w)
mean_Polinomial_w
}
)
# menampilkan hasil all ordo
best_poly_w <- cbind(ordo=ordo,best_poly_w)
best_poly_w## ordo rmse mae
## 1 2 4.150345 3.054677
## 2 3 4.161762 3.060332
## 3 4 4.172705 3.065323
## 4 5 4.179309 3.077552
# berdasarkan rmse
best_poly_w %>% slice_min(rmse)## ordo rmse mae
## 1 2 4.150345 3.054677
# berdasarkan mae
best_poly_w %>% slice_min(mae)## ordo rmse mae
## 1 2 4.150345 3.054677
Plot CV
best_poly_w$ordo <- as.factor(best_poly_w$ordo)
ggplot(data=best_poly_w, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Ordo") +
ylab("RMSE")+
theme_bw() Secara empiris, regresi polinomial dengan ordo 2 memiliki RMSE dan MAE terkecil.
Pw2 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "green3",se = F)+
theme_bw()+
ggtitle("Polinomial Ordo 2") +
xlab("weight") +
ylab("mpg")
Pw2Fungsi Tangga (Piecewise Constant)
set.seed(123)
cross_val_ft_w <- vfold_cv(Auto,v=5,strata = "mpg")
breaks <-2:8 # pembagian interval 2 sampai 8
best_tangga_ft_w <- map_dfr(breaks, function(i){
metric_tangga_ft_w <- map_dfr(cross_val_ft_w$splits, function(x){
training <- Auto[x$in_id,]
training$weight <- cut(training$weight,i) #Transformasi jadi factor
mod <- lm(mpg ~ weight, data=training)
#Convert ke numeric lagi
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
#Testing
testing <- Auto[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_ft_w
# menghitung rata-rata untuk 5 folds
mean_metric_tangga_ft_w <- colMeans(metric_tangga_ft_w)
mean_metric_tangga_ft_w
})
best_tangga_ft_w <- cbind(breaks=breaks,best_tangga_ft_w)
# menampilkan hasil all breaks
best_tangga_ft_w## breaks rmse mae
## 1 2 5.606147 4.344818
## 2 3 4.881005 3.658741
## 3 4 4.717345 3.647426
## 4 5 4.478443 3.263110
## 5 6 4.248970 3.132424
## 6 7 4.293833 3.176184
## 7 8 4.326547 3.228837
#berdasarkan rmse
best_tangga_ft_w %>% slice_min(rmse)## breaks rmse mae
## 1 6 4.24897 3.132424
#berdasarkan mae
best_tangga_ft_w %>% slice_min(mae)## breaks rmse mae
## 1 6 4.24897 3.132424
Berdasarkan RMSE dan MAE terkecil, banyaknya interval yang terbaik adalah 6.
Plot CV
T1 <- ggplot(data=best_tangga_ft_w, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Breaks") +
ylab("RMSE")+
theme_bw()
T1Tw <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm",
formula = y~cut(x,6),lty = 1,
col = "red",se = F)+
theme_bw()+
ggtitle("Fungsi Tangga 6 knots") +
xlab("weight") +
ylab("mpg")
TwNatural Cubic Spline
set.seed(123)
cross_val_w_ns <- vfold_cv(Auto,v=5,strata = "mpg")
df <- 2:15
best_nspline_w <- map_dfr(df, function(i){
nspline_w <- map_dfr(cross_val_w_ns$splits, function(x){
mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
nspline_w
# menghitung rata-rata untuk 5 folds
mean_nspline_w <- colMeans(nspline_w)
mean_nspline_w
}
)
best_nspline_w <- cbind(df=df,best_nspline_w)
# menampilkan hasil all breaks
best_nspline_w## df rmse mae
## 1 2 4.149920 3.054651
## 2 3 4.172693 3.062742
## 3 4 4.179401 3.069074
## 4 5 4.192673 3.087995
## 5 6 4.199590 3.101134
## 6 7 4.158297 3.048696
## 7 8 4.189938 3.088689
## 8 9 4.206900 3.095412
## 9 10 4.220522 3.115617
## 10 11 4.210792 3.109537
## 11 12 4.204752 3.108552
## 12 13 4.229605 3.130726
## 13 14 4.244783 3.152284
## 14 15 4.237106 3.145465
#berdasarkan rmse
best_nspline_w %>% slice_min(rmse)## df rmse mae
## 1 2 4.14992 3.054651
#berdasarkan mae
best_nspline_w %>% slice_min(mae)## df rmse mae
## 1 7 4.158297 3.048696
Berdasarkan RMSE terkecil banyaknya knot yang terbaik adalah 2 sedangkan berdasarkan MAE terkecil banyaknya knot terbaik adalah 7.
Plot CV
w1 <- ggplot(data=best_nspline_w, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("DF") +
ylab("RMSE")+
theme_bw()
w2 <- ggplot(data=best_nspline_w, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("DF") +
ylab("MAE")+
theme_bw()
ggarrange(w1, w2, ncol =2)Secara Visual
Knots
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$weight, df=2),"knots")## 50%
## 2803.5
attr(ns(Auto$weight, df=7),"knots")## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
NSw2 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm",
formula = y~ns(x, df=2),
lty = 1,col = "blue", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines (df=2)") +
xlab("weight") +
ylab("mpg")+
geom_vline(xintercept = c(2803.5), col="grey50", lty=2)
NSw2NSw7 <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1,col = "blue", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines (df=7)") +
xlab("weight") +
ylab("mpg")+
geom_vline(xintercept = c(2074.857, 2285.429, 2635.000, 2986.571, 3446.143, 4096.286 ), col="grey50", lty=2)
NSw7 Berdasarkan hasil di atas, model natural cubic spline terbaik memiliki 2 knots.
Perbandingan Model
Secara Empiris berdasarkan nilai RMSE dan MAE
nilai_metric_w <- rbind(mean_metric_linear_w,
best_poly_w %>% select(-1) %>% slice_min(rmse),
best_tangga_ft_w %>% select(-1) %>% slice_min(rmse),
best_nspline_w %>% select(-1) %>% slice_min(rmse)
)
nama_model_regresi_w <- c("Linear",
"Polinomial ordo 2",
"Fungsi Tangga (breaks=6)",
"Natural Cubic Splines (df=2)"
)
komparasi <- data.frame(nama_model_regresi_w,nilai_metric_w)
komparasi %>% datatable(komparasi, class = 'cell-border stripe')Berdasarkan perbandingan secara empiris di atas, model terbaik yang dapat digunakan untuk memodelkan mpg vs weight adalah Natural Cubic Spline dengan 2 Knot.
Secara Visual berdasarkan Plot
ggarrange(Lw, Pw2, Tw, NSw2, ncol = 2, nrow = 2)plot_all_weight<- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=1, color="pink") +
stat_smooth(method = "lm",
formula = y~x,
lty = 1, col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "green",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,col = "blue", se = F)+
labs(x = "weight" , y = "mpg") +
ggtitle(expression(atop("Gabungan Model Terbaik",
atop(italic("mpg vs weight"))))) +
theme(plot.title = element_text(hjust = 0.5, vjust = -2))
plot_all_weightKesimpulan
Plot model terbaik untuk masing-masing variabel prediktor:
ggarrange(NSh, NSd, NSw2, ncol = 2, nrow = 2)Model regresi non linear terbaik untuk masing-masing pasangan peubah:
- Variabel respon
mpgdan variabel prediktorhorsepower: Regresi Natural Cubic Spline (df = 10) - Variabel respon
mpgdan variabel prediktordisplacement: Regresi Natural Cubic Spline (df = 12) - Variabel respon
mpgdan variabel prediktorweight: Regresi Natural Cubic Spline (df = 2)
Terima kasih.