Problem
Apply the non-linear models that have been covered to the Auto data, with mpg as the response variable, and choose three columns to serve as predictor variables.
Is there evidence of a non-linear relationship for the variables you chose? Create visualizations to support your answer.
Determine the best non-linear model for each pair of variables (visually and/or empirically).
Data
The data used are the Auto data, which contain information on 392 vehicles. The variables in the data are described below:
- mpg: miles per gallon
- cylinders: Number of cylinders between 4 and 8
- displacement: Engine displacement (cu. inches)
- horsepower: Engine horsepower
- weight: Vehicle weight (lbs.)
- acceleration: Time to accelerate from 0 to 60 mph (sec.)
- year: Model year (modulo 100)
- origin: Origin of car (1. American, 2. European, 3. Japanese)
- name: Vehicle name
The response variable is mpg, and the predictor variables are horsepower, displacement, and weight.
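A quick numerical overview of the response and the three chosen predictors can be obtained with a short sketch like the one below (assuming the ISLR and dplyr packages are installed; its output is omitted here).
library(ISLR)   # provides the Auto data
library(dplyr)  # for select()
Auto %>%
  select(mpg, horsepower, displacement, weight) %>%
  summary()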
Library
library(ISLR)
library(tidyverse)
library(splines)
library(rsample)
library(ggplot2)
library(gam) # generalized additive models
library(ggpubr)
Import Data
data("Auto",package="ISLR")
Problem 1
mpg vs horsepower
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
theme_bw()
The plot above shows that the relationship between mpg and horsepower is not linear.
mpg vs displacement
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
theme_bw()
The plot above shows that the relationship between mpg and displacement is not linear.
mpg vs weight
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
theme_bw()
The plot above shows that the relationship between mpg and weight is not linear.
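Beyond the visual impression, a simple empirical check of non-linearity is to test a linear fit against a quadratic fit for each predictor; a small p-value in the F test suggests the linear model is inadequate. A minimal sketch (the quadratic term is used here only as the simplest non-linear alternative):
for (v in c("horsepower", "displacement", "weight")) {
  lin  <- lm(reformulate(v, response = "mpg"), data = Auto)
  quad <- lm(reformulate(sprintf("poly(%s, 2)", v), response = "mpg"), data = Auto)
  # p-value of the F test comparing the nested linear and quadratic fits
  cat(v, ": p-value =", format.pval(anova(lin, quad)[2, "Pr(>F)"]), "\n")
}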
Problem 2
mpg vs horsepower
Polynomial Regression
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:10
best_polinomial <- map_dfr(df,function(i){
metric_polinomial <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(horsepower, degree = i),data=Auto[x$in_id,])
# Predict on the test data
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polinomial
# Average over the 10 folds
mean_Polinomial<-colMeans(metric_polinomial)
mean_Polinomial
})
best_polinomial<-cbind(df=df,best_polinomial)
best_polinomial
## df rmse mae
## 1 1 4.889086 3.832328
## 2 2 4.351129 3.266460
## 3 3 4.355430 3.266945
## 4 4 4.367009 3.280452
## 5 5 4.318767 3.249003
## 6 6 4.317431 3.257272
## 7 7 4.293242 3.233376
## 8 8 4.309511 3.251758
## 9 9 4.323321 3.244919
## 10 10 4.371903 3.274887
best_polinomial %>% slice_min(rmse)
## df rmse mae
## 1 7 4.293242 3.233376
best_polinomial %>% slice_min(mae)
## df rmse mae
## 1 7 4.293242 3.233376
Based on the output above, using degrees 1 through 10, the best polynomial regression model is the one with degree 7, which gives the smallest RMSE and MAE values.
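As a complementary empirical check on the chosen degree (separate from the cross-validation above), sequential F tests on nested polynomial fits can be examined; a minimal sketch:
fits <- lapply(1:7, function(d) lm(mpg ~ poly(horsepower, d), data = Auto))
# Row d of the ANOVA table tests the degree-d fit against the degree-(d-1) fit
do.call(anova, fits)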
ggplot(Auto,aes(x=horsepower,y=mpg)) +
geom_point(alpha=0.55,color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1,col="black",se=F) +
ggtitle("Polynomial Regression, degree = 7") +
ylab("mpg") +
xlab("horsepower") +
theme(plot.title = element_text(hjust = 0.5))
The figure above shows the polynomial regression curve with degree 7. It indicates that mpg decreases as horsepower increases.
Step Function
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 3:20
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i) # cut converts the numeric variable into interval factors
mod <- lm(mpg ~ horsepower,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# average over the 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
# show the results for all numbers of breaks
best_tangga
## breaks rmse mae
## 1 3 5.775792 4.521829
## 2 4 4.985270 3.774549
## 3 5 4.712623 3.571614
## 4 6 4.644383 3.548375
## 5 7 4.554116 3.379980
## 6 8 4.437597 3.405433
## 7 9 4.549668 3.471999
## 8 10 4.589172 3.455531
## 9 11 4.531039 3.380931
## 10 12 4.442697 3.321151
## 11 13 4.527126 3.375170
## 12 14 4.428824 3.270424
## 13 15 4.313162 3.286289
## 14 16 4.336661 3.302031
## 15 17 4.414343 3.347532
## 16 18 4.340715 3.221099
## 17 19 4.331798 3.224404
## 18 20 4.480710 3.320605
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 15 4.313162 3.286289
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 18 4.340715 3.221099
Step Function Plots for breaks = 15 and breaks = 18
pc18<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~cut(x,18),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Breaks=18") +
xlab("Horse Power") + ylab("mile per gallon")
pc15<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~cut(x,15),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Piecewise Constant with Breaks=15") +
xlab("Horse Power") + ylab("mile per gallon")
ggarrange(pc18, pc15, ncol = 2, nrow = 1)
Viewed visually, the step-function fit with breaks = 15 (14 knots) has a smoother curve than the fit with breaks = 18 (17 knots), so breaks = 15 (14 knots) is chosen for the step-function method.
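The fitted step heights for the chosen model can be inspected directly; a minimal sketch (refitting on the full data, only the first few coefficients shown):
step_fit <- lm(mpg ~ cut(horsepower, 15), data = Auto)
# The intercept is the mean mpg in the first interval; the remaining
# coefficients are mean differences of each interval relative to the first.
head(coef(step_fit))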
Natural Cubic Splines
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:6
best_nspline <- map_dfr(df, function(i){
nspline <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
nspline
# average over the 10 folds
mean_nspline <- colMeans(nspline)
mean_nspline
}
)
best_nspline <- cbind(df=df,best_nspline)
best_nspline
## df rmse mae
## 1 2 4.330108 3.252908
## 2 3 4.348006 3.261538
## 3 4 4.329461 3.257926
## 4 5 4.313312 3.246412
## 5 6 4.310011 3.237990
best_nspline %>% slice_min(rmse)
## df rmse mae
## 1 6 4.310011 3.23799
best_nspline %>% slice_min(mae)
## df rmse mae
## 1 6 4.310011 3.23799
Determining the Knots from the Selected Number of Degrees of Freedom
df_6<-attr(ns(Auto$horsepower, df=6),"knots")
df_6
## 16.66667% 33.33333% 50% 66.66667% 83.33333%
## 70.0 84.0 93.5 110.0 150.0
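When df is supplied to ns() (with the default intercept = FALSE), the function places df - 1 interior knots at quantiles of the predictor, plus boundary knots at its extremes; a minimal sketch to verify:
length(attr(ns(Auto$horsepower, df = 6), "knots"))   # 5 interior knots
attr(ns(Auto$horsepower, df = 6), "Boundary.knots")  # range of horsepower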
ncs <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, df=6),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines With DF =6") +
xlab("Horse Power") +
ylab("mile per gallon") +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = c(70, 84, 93.5, 110, 150), col="grey50", lty=2)
ncs
Model Comparison
nilai_rmse <- rbind (4.293242,4.313162,4.310011)
nilai_mae <- rbind (3.233376,3.286289,3.237990)
nama_model <- c("Polynomial (degree=7)",
"Step Function (breaks=15)",
"Natural Cubic Spline (df=6)")
gabungan_model <- data.frame(nama_model,nilai_rmse,nilai_mae)
gabungan_model
## nama_model nilai_rmse nilai_mae
## 1 Polynomial (degree=7) 4.293242 3.233376
## 2 Step Function (breaks=15) 4.313162 3.286289
## 3 Natural Cubic Spline (df=6) 4.310011 3.237990
ggplot(Auto,aes(x=horsepower, y=mpg))+
geom_point(alpha=0.05, color="red")+
stat_smooth(method="lm",
formula=y~poly(x,7,raw=T),
lty=1, aes(col="Polynomial Regression"), se=F)+
stat_smooth(method="lm",
formula=y~cut(x,15),
lty=1, aes(col="Step Function"), se=F)+
stat_smooth(method="lm",
formula=y~ns(x,df=6),
lty=1, aes(col="Natural Cubic Spline"), se=F)+
scale_color_manual(values = c("Polynomial Regression"="red","Step Function"="blue","Natural Cubic Spline"="green"))+
ggtitle("Model Comparison")+theme(plot.title = element_text(hjust = 0.5))
Based on the RMSE and MAE values, the degree-7 polynomial gives the smallest errors for mpg with horsepower as the predictor, with the natural cubic spline (df=6) performing almost identically.
mpg vs displacement
Polynomial Regression
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:10
best_polinomial <- map_dfr(df,function(i){
metric_polinomial <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(displacement, degree = i),data=Auto[x$in_id,])
# Predict on the test data
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polinomial
# Average over the 10 folds
mean_Polinomial<-colMeans(metric_polinomial)
mean_Polinomial
})
best_polinomial2<-cbind(df=df,best_polinomial)
best_polinomial2
## df rmse mae
## 1 1 4.562513 3.511871
## 2 2 4.283164 3.159670
## 3 3 4.284465 3.165588
## 4 4 4.288491 3.164067
## 5 5 4.301619 3.177421
## 6 6 4.301749 3.204309
## 7 7 4.254069 3.197191
## 8 8 4.210302 3.198735
## 9 9 4.162825 3.149143
## 10 10 4.152981 3.130045
best_polinomial2 %>% slice_min(rmse)
## df rmse mae
## 1 10 4.152981 3.130045
best_polinomial2 %>% slice_min(mae)
## df rmse mae
## 1 10 4.152981 3.130045
Based on the output above, using degrees 1 through 10, the best polynomial regression model is the one with degree 10, which gives the smallest RMSE and MAE values.
ggplot(Auto,aes(x=displacement,y=mpg)) +
geom_point(alpha=0.55,color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,10,raw=T),
lty = 1,col="black",se=F) +
ggtitle("Polynomial Regression, degree = 10") +
ylab("mpg") +
xlab("displacement") +
theme(plot.title = element_text(hjust = 0.5))
Step Function
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 3:14
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto[x$in_id,]
training$displacement <- cut(training$displacement,i) # cut converts the numeric variable into interval factors
mod <- lm(mpg ~ displacement,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# average over the 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga2 <- cbind(breaks=breaks,best_tangga)
# show the results for all numbers of breaks
best_tangga2
## breaks rmse mae
## 1 3 4.865081 3.676866
## 2 4 4.813915 3.619919
## 3 5 4.661119 3.496662
## 4 6 4.561127 3.389387
## 5 7 4.552850 3.372940
## 6 8 4.363903 3.230513
## 7 9 4.218350 3.109206
## 8 10 4.194571 3.097485
## 9 11 4.322365 3.224822
## 10 12 4.414807 3.274607
## 11 13 4.488436 3.340034
## 12 14 4.326971 3.207262
best_tangga2 %>% slice_min(rmse)
## breaks rmse mae
## 1 10 4.194571 3.097485
best_tangga2 %>% slice_min(mae)
## breaks rmse mae
## 1 10 4.194571 3.097485
Based on the output above, using 3 to 14 breaks, the best step-function model is obtained with 10 breaks.
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",formula = y~cut(x,10),
lty = 1, col = "black",se = F)+
ggtitle("Step Function, breaks = 10") +
ylab("mpg") +
xlab("displacement") +
theme(plot.title = element_text(hjust = 0.5))
The data are divided into 10 intervals with 9 knots.
Natural Cubic Spline
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:15
best_nspline <- map_dfr(df, function(i){
nspline <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ ns(displacement,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
nspline
# average over the 10 folds
mean_nspline <- colMeans(nspline)
mean_nspline
}
)
best_nspline2 <- cbind(df=df,best_nspline)
best_nspline2
## df rmse mae
## 1 1 4.562513 3.511871
## 2 2 4.278651 3.162207
## 3 3 4.283586 3.168701
## 4 4 4.312777 3.178787
## 5 5 4.280108 3.186282
## 6 6 4.197436 3.127948
## 7 7 4.184108 3.123310
## 8 8 4.165765 3.134441
## 9 9 4.163914 3.133642
## 10 10 4.133528 3.116680
## 11 11 4.114617 3.082797
## 12 12 4.074090 3.040305
## 13 13 4.112606 3.086771
## 14 14 4.144166 3.108204
## 15 15 4.185091 3.134720
best_nspline2 %>% slice_min(rmse)
## df rmse mae
## 1 12 4.07409 3.040305
best_nspline2 %>% slice_min(mae)
## df rmse mae
## 1 12 4.07409 3.040305
Based on the output above, using df from 1 to 15, the best natural cubic spline model is obtained with df = 12.
df_12<-attr(ns(Auto$displacement, df=12),"knots")
df_12
## 8.333333% 16.66667% 25% 33.33333% 41.66667% 50% 58.33333% 66.66667%
## 89.0000 97.0000 105.0000 119.0000 130.9167 151.0000 200.0000 232.0000
## 75% 83.33333% 91.66667%
## 275.7500 318.0000 351.0000
ncs <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines With DF =12") +
xlab("displacement") +
ylab("mile per gallon") +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = c(89,97,105,119,130.9167,151,200,232,275.75,318,351), col="grey50", lty=2)
ncs
The figure above shows the natural cubic spline curve with df = 12, with the knot locations marked by dashed vertical lines.
Model Comparison
nilai_metric <- rbind(best_polinomial2 %>% select(-1) %>% slice_min(rmse),
best_tangga2 %>% select(-1) %>% slice_min(rmse),
best_nspline2 %>% select(-1)%>% slice_min(rmse))
nama_model <- c("Polynomial (degree=10)",
"Step Function (breaks=10)",
"Natural Cubic Spline (df=12)")
gabungan_model <- data.frame(nama_model,nilai_metric)
gabungan_model
## nama_model rmse mae
## 1 Polynomial (degree=10) 4.152981 3.130045
## 2 Step Function (breaks=10) 4.194571 3.097485
## 3 Natural Cubic Spline (df=12) 4.074090 3.040305
ggplot(Auto,aes(x=displacement, y=mpg))+
geom_point(alpha=0.05, color="red")+
stat_smooth(method="lm",formula=y~poly(x,10,raw=T),
lty=1, aes(col="Polynomial Regression"), se=F)+
stat_smooth(method="lm", formula=y~cut(x,10),
lty=1, aes(col="Step Function"), se=F)+
stat_smooth(method="lm", formula=y~ns(x,df=12),
lty=1, aes(col="Natural Cubic Spline"), se=F)+
scale_color_manual(values = c("Polynomial Regression"="red","Step Function"="blue","Natural Cubic Spline"="green"))+
ggtitle("Model Comparison")+
theme(plot.title = element_text(hjust = 0.5))
The best model is the natural cubic spline with df = 12, since it has the smallest RMSE and MAE values among the models compared.
mpg vs weight
Polynomial Regression
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:10
best_polinomial <- map_dfr(df,function(i){
metric_polinomial <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(weight, degree = i),data=Auto[x$in_id,])
# Predict on the test data
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polinomial
# Average over the 10 folds
mean_Polinomial<-colMeans(metric_polinomial)
mean_Polinomial
})
best_polinomial3<-cbind(df=df,best_polinomial)
best_polinomial3
## df rmse mae
## 1 1 4.301837 3.283637
## 2 2 4.149977 3.060704
## 3 3 4.158482 3.068020
## 4 4 4.165585 3.067210
## 5 5 4.169760 3.079497
## 6 6 4.178027 3.098482
## 7 7 4.170375 3.100742
## 8 8 4.181430 3.108979
## 9 9 4.254235 3.164729
## 10 10 4.299992 3.167556
best_polinomial3 %>% slice_min(rmse)
## df rmse mae
## 1 2 4.149977 3.060704
best_polinomial3 %>% slice_min(mae)
## df rmse mae
## 1 2 4.149977 3.060704
Based on the output above, using degrees 1 through 10, the best polynomial regression model is the one with degree 2, which gives the smallest RMSE and MAE values.
ggplot(Auto,aes(x=weight,y=mpg)) +
geom_point(alpha=0.55,color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1,col="red",se=F) +
ggtitle("Polynomial Regression, degree = 2") +
ylab("mpg") +
xlab("weight") +
theme(plot.title = element_text(hjust = 0.5))
The figure shows that mpg decreases as weight increases.
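Because the selected model is only a quadratic, its fitted equation can be written out explicitly; a minimal sketch refitting on the full data with raw (non-orthogonal) polynomial terms:
quad_fit <- lm(mpg ~ poly(weight, 2, raw = TRUE), data = Auto)
# Coefficients can be read as: mpg ~ b0 + b1*weight + b2*weight^2
coef(quad_fit)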
Step Function
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 3:15
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto[x$in_id,]
training$weight <- cut(training$weight,i) # cut converts the numeric variable into interval factors
mod <- lm(mpg ~ weight,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# average over the 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga3 <- cbind(breaks=breaks,best_tangga)
# show the results for all numbers of breaks
best_tangga3
## breaks rmse mae
## 1 3 4.868847 3.657505
## 2 4 4.706593 3.633584
## 3 5 4.389515 3.221556
## 4 6 4.201264 3.112221
## 5 7 4.279395 3.191726
## 6 8 4.370964 3.249765
## 7 9 4.341743 3.182396
## 8 10 4.329114 3.176390
## 9 11 4.300160 3.209753
## 10 12 4.195146 3.076847
## 11 13 4.242706 3.149450
## 12 14 4.227745 3.127902
## 13 15 4.263350 3.142354
best_tangga3 %>% slice_min(rmse)
## breaks rmse mae
## 1 12 4.195146 3.076847
best_tangga3 %>% slice_min(mae)
## breaks rmse mae
## 1 12 4.195146 3.076847
From these results, the best step-function model is obtained with 12 breaks, which gives the smallest RMSE and MAE values among the break counts considered.
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",formula = y~cut(x,12),
lty = 1, col = "red",se = F)+
ggtitle("Step Function, breaks = 12") +
ylab("mpg") +
xlab("weight") +
theme(plot.title = element_text(hjust = 0.5))
The figure above shows the step-function fit with 12 breaks; the data are divided into 12 intervals with 11 knots.
Natural Cubic Spline
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 1:10
best_nspline <- map_dfr(df, function(i){
nspline <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ ns(weight,df=i),data=Auto[x$in_id,])
pred <- predict(mod,newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
nspline
# average over the 10 folds
mean_nspline <- colMeans(nspline)
mean_nspline
}
)
best_nspline3 <- cbind(df=df,best_nspline)
best_nspline3
## df rmse mae
## 1 1 4.301837 3.283637
## 2 2 4.150320 3.062108
## 3 3 4.162964 3.064666
## 4 4 4.171112 3.073367
## 5 5 4.180162 3.092261
## 6 6 4.171277 3.102353
## 7 7 4.141103 3.061136
## 8 8 4.170552 3.098226
## 9 9 4.175108 3.091378
## 10 10 4.168457 3.089614
best_nspline3 %>% slice_min(rmse)
## df rmse mae
## 1 7 4.141103 3.061136
best_nspline3 %>% slice_min(mae)
## df rmse mae
## 1 7 4.141103 3.061136
The best natural cubic spline model is obtained with df = 7, which gives the smallest RMSE and MAE values among the df values considered.
df_7<-attr(ns(Auto$weight, df=7),"knots")
df_7
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
ncs <- ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines With DF =7") +
xlab("weight") +
ylab("mile per gallon") +
theme(plot.title = element_text(hjust = 0.5))+
geom_vline(xintercept = c(2074.857,2285.429,2635,2986.571,3446.143,4096.286), col="grey50", lty=2)
ncs
Model Comparison
nilai_metric <- rbind(best_polinomial3 %>% select(-1) %>% slice_min(rmse),
best_tangga3 %>% select(-1) %>% slice_min(rmse),
best_nspline3 %>% select(-1)%>% slice_min(rmse))
nama_model <- c("Polynomial (degree=2)",
"Step Function (breaks=12)",
"Natural Cubic Spline (df=7)")
gabungan_model <- data.frame(nama_model,nilai_metric)
gabungan_model
## nama_model rmse mae
## 1 Polynomial (degree=2) 4.149977 3.060704
## 2 Step Function (breaks=12) 4.195146 3.076847
## 3 Natural Cubic Spline (df=7) 4.141103 3.061136
ggplot(Auto,aes(x=weight, y=mpg))+
geom_point(alpha=0.05, color="red")+
stat_smooth(method="lm",formula=y~poly(x,2,raw=T),
lty=1, aes(col="Polynomial Regression"), se=F)+
stat_smooth(method="lm", formula=y~cut(x,12),
lty=1, aes(col="Step Function"), se=F)+
stat_smooth(method="lm", formula=y~ns(x,df=7),
lty=1, aes(col="Natural Cubic Spline"), se=F)+
scale_color_manual(values = c("Polynomial Regression"="red","Step Function"="blue","Natural Cubic Spline"="green"))+
ggtitle("Model Comparison")+
theme(plot.title = element_text(hjust = 0.5))
The best model is the degree-2 polynomial regression: it has the smallest MAE among the three models, and its curve is visibly smoother than those of the other models.
Conclusion
Of the three classes of non-linear models considered, the best model for each response-predictor pair is as follows:
- For mpg vs horsepower, the degree-7 polynomial regression gives the lowest cross-validated RMSE and MAE, with the natural cubic spline (df=6) performing almost identically.
- For mpg vs displacement, the best model is the natural cubic spline with df=12.
- For mpg vs weight, the best model is the polynomial regression with degree 2.
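As a complementary check, the three selected predictors can also be combined in a single additive model using the gam package loaded above; a minimal sketch (assuming the univariate forms chosen above are carried over as parametric terms):
library(gam)
library(splines)
fit_all <- gam(mpg ~ ns(horsepower, df = 6) + ns(displacement, df = 12) + poly(weight, 2),
               data = Auto)
summary(fit_all)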
Reference
Dito, Gerry A. (October 2021). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/