library(ISLR)
library(rsample)
library(splines)
library(tidyverse)
library(ggpubr)
library(caret)
library(DT)
library(corrplot)
library(gridExtra)
library(kableExtra)
library(magick)Data yang digunakan merupakan data Auto dari package ISLR. Data Auto merupakan data yang fokusnya adalah untuk memprediksi konsumsi bahan bakar kendaraan dalam satuan mile per galon (mpg). Terdapat 392 observasi (sebelumnya 398 observasi, namun terdapat 6 data missing value yang telah didrop), dan 9 variabel. Berikut merupakan peubah dan tipe datanya :
Berikut merupakan tampilan dari data Auto
df <- ISLR::Auto
datatable(df)Berikut ini merupakan visualisasi pola hubungan peubah yang terdapat pada data Auto
plot(df)Berdasarkan visualisasi diatas khususnya pada peubah mpg, kita bisa melihat antara mpg dan peubah lainnya memiliki hubungan yang tidak linear.
g1 <- ggplot(df, aes(x = factor(cylinders), y = mpg, fill=factor(cylinders))) +
geom_boxplot()
g2 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.5) +
geom_smooth()
g3 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.5) +
geom_smooth()
g4 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.5) +
geom_smooth()
g5 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.5) +
geom_smooth()
g6 <- ggplot(df, aes(x = year, y = mpg)) +
geom_point(alpha = 0.5) +
geom_smooth()
g7 <- ggplot(Auto,aes(y = mpg, x = factor(origin), fill=factor(origin))) +
geom_boxplot() +
ylab("mpg") + xlab("Origin") +
scale_fill_discrete(name="Origin", labels=c("Amerika","Eropa","Jepang")) +
theme_bw()
gridExtra::grid.arrange (g1, g2, g3, g4, g5, g6, g7, ncol = 2)## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Jika kita tampilkan hubungan mpg dengan peubah lainnya dengan menambahkan unsur geom_smooth(), kita bisa lebih jelas melihat bahwa pola hubungan yang terjadi antara mpg dan peubah prediktornya adalah hubungan yang tidak linear.
Korelasi Antar Peubah
correlations = cor(df[-9])
corrplot(correlations)Apabila kita lihat nilai korelasinya, peubah yang memiliki korelasi yang kuat dengan mpg adalah peubah cylinders, displacements, horsepower dan weight. Keempat peubah prediktor tersebut memiliki korelasi yang negatif terhadap mpg.
Pada data yang memiliki pola hubungan yang tidak linear, jika pemodelan dengan regresi linear akan menjadi tidak efisien, karena tujuan dari meminimumkan error akan tidak tercapai. Sehingga dalam prakteknya diperlukan suatu metode pemodelan tak linear. Pendugaan model bertujuan sebagai explanatory analysis atau untuk melihat pola hubungan yang terjadi pada peubah respon dan prediktor.
Pada analisis ini, peubah yang digunakan sebagai respon adalah mpg dan peubah prediktor nya adalah weight, displacement, acceleration dan horsepower.
Agar lebih tergambar jelas, berikut disajikan plot pasangan antara mpg dengan dengan masing - masing peubah prediktornya.
#Plot Mpg dan Weight
a <- ggplot(df, aes(x = weight, y =mpg)) +
geom_point(alpha=0.55, color= "#00CC99") +
geom_smooth()+
ggtitle("Plot Mpg dan Weight")+
theme_bw()+
theme(plot.title = element_text(size=10))
#Plot Mpg dan displacement
b <- ggplot(df, aes(x = displacement, y =mpg)) +
geom_point(alpha=0.55, color= "#00CC99") +
geom_smooth()+
ggtitle("Plot Mpg dan displacement")+
theme_bw()+
theme(plot.title = element_text(size=10))
# Plot Mpg dan horsepower
c <- ggplot(df, aes(x = horsepower, y =mpg)) +
geom_point(alpha=0.55, color= "#00CC99") +
geom_smooth()+
ggtitle("Plot Mpg dan horsepower")+
theme_bw()+
theme(plot.title = element_text(size=10))
#Plot Mpg dan acceleration
d <- ggplot(df, aes(x = acceleration, y =mpg)) +
geom_point(alpha=0.55, color= "#00CC99") +
geom_smooth()+
ggtitle("Plot Mpg dan acceleration")+
theme_bw()+
theme(plot.title = element_text(size=10))
ggarrange(a, b, c, d, ncol = 2, nrow=2)Visualisasi pola hubungan antara mpg dengan weight, displacement, acceleration dan horsepower Berdasarkan scatterplot diatas menunjukkan bahwa pola hubungan yang terbentuk tidak linear sehingga akan dilakukan pemodelan tak linear yang dapat merepresentasikan pola hubungan tersebut.
Pemodelan tak linear yang akan dicobakan yaitu :
A. Polynomial Regression
Derajat polinomial yang dianalisis pada derajat (d) 1,2, 3 dan 4 saja, karena pada regresi polynomial, derajat yang digunakan biasanya maksimal sebanyak 4. Jika d > 4 maka kurva yang terbentuk akan kurang beraturan terutama pada bagian ujung-ujungnya.
B. Picewise Constant
C. Basic Spline
D. Natural Spline
Agar mendapatkan nilai turning parameter optimum pada setiap model, maka dilakukan Cross Validation (CV). Tipe CV yang digunakana yakni k -folds dengan k = 10 dan ulangan sebanyak 3 kali. Selanjutnya, pemilihan model terbaik pada setiap pasangan mpg dan prediktornya didasari oleh kriteria nilai RMSE dan/atau MAE terkecil.
Secara Visual
g1 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") +
labs(title = "Polynomial Degree = 1")+
theme_bw()+ theme(plot.title = element_text(size=9))
g2 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") +
labs(title = "Polynomial Degree = 2")+
theme_bw()+ theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") +
labs(title = "Polynomial Degree = 3")+
theme_bw()+ theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") +
labs(title = "Polynomial Degree = 4")+
theme_bw() +
theme(plot.title = element_text(size=9))
ggarrange(g1, g2, g3, g4, ncol = 2, nrow=2)Secara visual, model polynomial berderajat 2 dan 3 terlihat lebih dapat menggambarkan pola hubungan data antara mpg dan weight dibandingkan model polynomial lainnya.
Secara Empiris
#Fungsi Pemilihan Model dengan K-Folds CV
train_control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
cv_polynom <- function(df,i){
f <- bquote(mpg ~ poly(weight, degree = .(i)))
set.seed(581)
model_poly <- train(as.formula(f),
data = df,
trControl = train_control,
method = "lm",
na.action = na.pass)
predict <- predict(model_poly, df)
metric_model_poly <- model_poly$resample[,-4]
poly <- apply(metric_model_poly,2,mean)
return( metric_model_poly)
}Plot CV Berdasarkan hasil Cross Validation, maka diperoleh derajat yang optimum adalah
mae <- c()
rmse <- c()
for ( i in 1:4){
temp_er <- cv_polynom(df,i)
rmse[i] <- mean(temp_er$RMSE)
mae [i] <- mean(temp_er$MAE)
}
#RMSE
data.frame(degree = 1:4, rmse) %>%
mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
ggplot(aes(x = degree, y = rmse)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_RMSE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 5, 0.05)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none") +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'weight' polynomial degree dengan RMSE",
x = "Degree",
y = "CV RMSE")#MAE
data.frame(degree = 1:4, mae) %>%
mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
ggplot(aes(x = degree, y = mae)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_MAE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 5, 0.05)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none") +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'weight' polynomial degree dengan MAE",
x = "Degree",
y = "CV MAE")Berdasarkan bukti empiris dengan repeated cross validation, dapat ditunjukkan bahwa model polynomial dengan derajat 2 memiliki nilai RMSE dan MAE terkecil.
Model Terbaik
ggplot(df,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 2") +
xlab("Weight") +
ylab("mile per gallon")Secara Visual
g2 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") +
labs(title = "Breaks= 2")+
theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") +
labs(title = "Breaks= 3")+
theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") +
labs(title = "Breaks= 4")+
theme(plot.title = element_text(size=9))
g5 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") +
labs(title = "Breaks= 5")+
theme(plot.title = element_text(size=9))
g6 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") +
labs(title = "Breaks= 6")+
theme(plot.title = element_text(size=9))
g7 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") +
labs(title = "Breaks= 7")+
theme(plot.title = element_text(size=9))
g8 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") +
labs(title = "Breaks= 8")+
theme(plot.title = element_text(size=9))
g9 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") +
labs(title = "Breaks= 9")+
theme(plot.title = element_text(size=9))
g10 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") +
labs(title = "Breaks= 10")+
theme(plot.title = element_text(size=9))
theme_bw()
grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 6 dan 7 yang lebih menggambarkan pola hubungan data antara mpg dan weight.
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10,repeats = 3, strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- df[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- df[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
datatable(best_tangga)Plot CV Piecewise
#Plot PC_Weight_Berdasarkan kriteria RMSE
pcw <- best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga[which.min(best_tangga$rmse), ], color="#00CC99",
size=3) +
labs(title = "MPG vs Weight - Piecewise Regression",
subtitle = "Memilih 'weight' piecewise dengan RMSE",
x = "Breaks",
y = "CV RMSE")+
theme_bw ()#Plot PC_Weight_Berdasarkan kriteria MAE
pcw2 <- best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga[which.min(best_tangga$mae), ], color="#00CC99",
size=3) +
labs(title = "MPG vs Weight - Piecewise Regression",
subtitle = "Memilih 'weight' piecewise dengan MAE",
x = "Breaks",
y = "CV MAE")+
theme_bw ()#ggarrange(pcw, pc2, ncol=2, nrow=1)Berdasarkan k fold cv, didapatkan Model Piecewise dengan knot = 6 yang memiliki RMSE dan MAE terkecil.
Model Terbaik Piecewise Constant
ggplot(df,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~cut(x,6,raw=T),
lty = 1, col = "darkblue",se = F)+
theme_bw()+
labs(title = "MPG vs Weight - Piecewise Regression K = 6",
x = "Weight",
y = "mile per gallon")Secara Visual
g1 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") +
labs(title = "df = 4 (1 knots)")+
theme_bw()
g2 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") +
labs(title = "df = 5 (2 knot)")+ theme_bw()
g3 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") +
labs(title = "df = 6 (3 knots)")+ theme_bw()
g4 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") +
labs(title = "df = 7 (4 knots)") + theme_bw()
g5 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") +
labs(title = "df = 8 (5 knots)") + theme_bw()
g6 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") +
labs(title = "df = 9 (6 knots)") + theme_bw()
g7 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") +
labs(title = "df = 10 (7 knots)") + theme_bw()
g8 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") +
labs(title = "df = 11 (8 knots)") + theme_bw()
g9 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") +
labs(title = "df = 12 (9 knots)") + theme_bw()
g10 <- ggplot(df, aes(x = weight, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") +
labs(title = "df = 13 (10 knots)") + theme_bw()
grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, nrow = 3)Secara visual, model basic spline dengan df = 5 dan 6 lebih nampak mulus, dan diujung ujung kurva tampak lebih teratur dan tidak liar.
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10, repeats=5, strata = "mpg")
dg <- 2:12
best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(weight,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_bspline
# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)
mean_metric_bspline
}
)
best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)Plot CV BS
Berdasarkan RMSE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan bukti empiris dengan repeated cross validation, dapat ditunjukkan bahwa model basic spline dengan knot = 9 memiliki nilai RMSE dan MAE terkecil.
Plot Basic Spline*
Menentukan titik knots
df_9<-attr(bs(df$weight, df=9),"knots")
df_9## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
Membuat Plot
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~bs(x, df=9),
lty = 1, se = F) +
ggtitle("B-Spline (df=9)") +
xlab("Weight") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = c(2074.9, 2285.5, 2635, 2896.6, 3446.1, 4096.3), col="grey50", lty=2)Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")
dg <- 2:20
best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(weight,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)Plot CV NS
Berdasarkan RMSE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Plot Natural Spline*
Menentukan titik knots
df_7<-attr(ns(df$weight, df=7),"knots")
df_7## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
Membuat Plot
ggplot(Auto,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1, se = F) +
ggtitle("Natural Splines (df=7)") +
xlab("Weight") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = c(2074.9, 2285.5, 2635, 2896.6, 3446.1, 4096.3), col="grey50", lty=2)Berdasarkan bukti empiris dengan repeated cross validation, dapat ditunjukkan bahwa model natural spline dengan knot = 7 memiliki nilai RMSE dan MAE terkecil.
Komparasi model terbaik dari pola hubungan antara mpg dan weight dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :
Model Basic Spline memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model Basic Spline dengan df 9 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan weight dibandingkan dengan model lainnya.
Secara Visual
g1 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") +
labs(title = "Polynomial Derajat = 1")+
theme_bw() +
theme(plot.title = element_text(size=9))
g2 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") +
labs(title = "Polynomial Derajat = 2")+
theme_bw() +
theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") +
labs(title = "Polynomial Derajat = 3")+
theme_bw() +
theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") +
labs(title = "Polynomial Derajat = 4")+
theme_bw() +
theme(plot.title = element_text(size=9))
ggarrange(g1, g2, g3, g4, ncol = 2, nrow = 2, widths = c(10,10) ,heights = c(2,2))Secara visual, model polynomial berderajat 2 dan 3 terlihat lebih dapat menggambarkan pola hubungan data antara mpg dan displacement dibandingkan model polynomial lainnya.
Secara Empiris
Pemilihan Model dengan K-Folds CV
train_control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
cv_polynomd <- function(df,i){
f <- bquote(mpg ~ poly(displacement, degree = .(i)))
set.seed(581)
model_poly <- train(as.formula(f),
data = df,
trControl = train_control,
method = "lm",
na.action = na.pass)
predict <- predict(model_poly, df)
metric_model_poly <- model_poly$resample[,-4]
#poly <- apply(metric_model_poly,2,mean)
return( metric_model_poly)
}Plot CV
mae <- c()
rmse <- c()
for ( i in 1:4){
temp_er <- cv_polynomd(df,i)
rmse[i] <- mean(temp_er$RMSE)
mae [i] <- mean(temp_er$MAE)
}
#RMSE
data.frame(degree = 1:4, rmse) %>%
mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
ggplot(aes(x = degree, y = rmse)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_RMSE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 5, 0.05)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none") +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'displacement' polynomial degree dengan RMSE",
x = "Degree",
y = "CV RMSE")#MAE
data.frame(degree = 1:4, mae) %>%
mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
ggplot(aes(x = degree, y = mae)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_MAE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 5, 0.05)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none") +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'displacement' polynomial degree dengan MAE",
x = "Degree",
y = "CV MAE")Model Terbaik Polynomial
Secara empiris, model polynomial 2 memiliki nilai RMSE dan MAE terkecil
ggplot(df,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 2") +
xlab("displacement") +
ylab("mile per gallon")Secara empiris, berdasarkan k fold cv, didapatkan Model Polynomial derajat 2 yang memiliki RMSE dan MAE terkecil.
Secara Visual
g2 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") +
labs(title = "Polynomial Degree = 2")+
theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") +
labs(title = "Polynomial Degree = 3")+
theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") +
labs(title = "Polynomial Degree = 4")+
theme(plot.title = element_text(size=9))
g5 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") +
labs(title = "Polynomial Degree = 5")+
theme(plot.title = element_text(size=9))
g6 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") +
labs(title = "Polynomial Degree = 6")+
theme(plot.title = element_text(size=9))
g7 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") +
labs(title = "Polynomial Degree = 7")+
theme(plot.title = element_text(size=9))
g8 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") +
labs(title = "Polynomial Degree = 8")+
theme(plot.title = element_text(size=9))
g9 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") +
labs(title = "Polynomial Degree = 9")+
theme(plot.title = element_text(size=9))
g10 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") +
labs(title = "Polynomial Degree = 10")+
theme(plot.title = element_text(size=9))
grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun di tengah dan diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 6 dan 7 yang lebih menggambarkan pola hubungan data antara mpg dan displacement.
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10,strata = "mpg")
breaks <- 2:9
best_tangga_dis <- map_dfr(breaks, function(i){
metric_tangga_dis <- map_dfr(cross_val$splits,
function(x){
training <- df[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- df[-x$in_id,]
displacement_new <- cut(testing$displacement,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_dis
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_dis <- colMeans(metric_tangga_dis)
mean_metric_tangga_dis
}
)
best_tangga_dis <- cbind(breaks=breaks,best_tangga_dis)
# menampilkan hasil all breaks
datatable(best_tangga_dis)Plot CV
#Plot PC_dis_Berdasarkan kriteria RMSE
best_tangga_dis## breaks rmse mae
## 1 2 5.927553 4.645127
## 2 3 4.912809 3.680623
## 3 4 4.825994 3.635968
## 4 5 4.681469 3.500329
## 5 6 4.603800 3.415141
## 6 7 4.571606 3.376004
## 7 8 4.402890 3.251622
## 8 9 4.233686 3.112410
best_tangga_dis$breaks <- as.factor(best_tangga_dis$breaks)
ggplot(data=best_tangga_dis, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga_dis[which.min(best_tangga_dis$rmse), ], color="#00CC99",
size=3) +
ggtitle("K-Fold Cross Validation- RMSE Criteria") +
labs(caption= "Data dis") +
xlab("Breaks") +
ylab("Root Mean Square Error") +
theme_bw ()#Plot PC_dis_Berdasarkan kriteria MAE
best_tangga_dis$breaks <- as.factor(best_tangga_dis$breaks)
ggplot(data=best_tangga_dis, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga_dis[which.min(best_tangga_dis$mae), ], color="#00CC99",
size=3) +
ggtitle("K-Fold Cross Validation- MAE Criteria") +
labs(caption= "Data dis")+
xlab("Breaks") +
ylab(" Mean Absolute Error") +
theme_bw ()Model Terbaik Piecewise
ggplot(df,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~cut(x,9,raw=T),
lty = 1, col = "darkblue",se = F)+
theme_bw()+
ggtitle("Regresi Piecewise Constant K= 9") +
xlab("Displacement") +
ylab("mile per gallon")Berdasarkan k fold cv, didapatkan Model Piecewise dengan knot = 9 yang memiliki RMSE dan MAE terkecil.
Secara Visual
g1 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") +
labs(title = "df = 4 (0 knots)")+
theme_bw()
g2 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") +
labs(title = "df = 5 (1 knot)")+ theme_bw()
g3 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") +
labs(title = "df = 6 (2 knots)")+ theme_bw()
g4 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") +
labs(title = "df = 7 (3 knots)") + theme_bw()
g5 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") +
labs(title = "df = 8 (4 knots)") + theme_bw()
g6 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") +
labs(title = "df = 9 (5 knots)") + theme_bw()
g7 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") +
labs(title = "df = 10 (6 knots)") + theme_bw()
g8 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") +
labs(title = "df = 11 (7 knots)") + theme_bw()
g9 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") +
labs(title = "df = 12 (8 knots)") + theme_bw()
g10 <- ggplot(df, aes(x = displacement, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") +
labs(title = "df = 13 (9 knots)") + theme_bw()
grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 4)Secara visual, model basic spline dengan df = 5 lebih nampak mulus, dan diujung ujung kurva tampak lebih teratur dan tidak liar. Pada model dengan df > 6, tampak kurva pada titik boundary knots sebelah kiri yang cukup liar.
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10, repeats=5, strata = "mpg")
dg <- 2:10
best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(displacement,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_bspline
# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)
mean_metric_bspline
}
)
best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)Plot CV BS
Berdasarkan RMSE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Plot Basic Spline*
Menentukan titik knots
df_6<-attr(bs(df$displacement, df=6),"knots")
df_6## 25% 50% 75%
## 105.00 151.00 275.75
Membuat Plot
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~bs(x, df=6),
lty = 1, se = F) +
ggtitle("B-Spline (df=6)") +
xlab("Displacement") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = c(105, 151, 275.75), col="grey50", lty=2)Berdasarkan k fold cv, model bspline yang memiliki RMSE dan MAE terkecil terdapat pada knot = 6
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")
dg <- 2:20
best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)Plot CV NS
Berdasarkan RMSE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Plot Natural Spline
Menentukan titik knots
df_12d <-attr(ns(df$displacement, df=12),"knots")
df_12d## 8.333333% 16.66667% 25% 33.33333% 41.66667% 50% 58.33333% 66.66667%
## 89.0000 97.0000 105.0000 119.0000 130.9167 151.0000 200.0000 232.0000
## 75% 83.33333% 91.66667%
## 275.7500 318.0000 351.0000
Membuat Plot
ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~ns(x, df=12),
lty = 1, se = F) +
ggtitle("Natural Splines (df=12)") +
xlab("Displacement") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = df_12d, col="grey50", lty=2)Komparasi model terbaik dari pola hubungan antara mpg dan displalacement dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :
Model Natural Spline memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model natural spline dengan df 12 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan displacement dibandingkan dengan model lainnya.
Secara Visual
g1 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") +
labs(title = "Polynomial Derajat = 1")+
theme_bw() +
theme(plot.title = element_text(size=9))
g2 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") +
labs(title = "Polynomial Derajat = 2")+
theme_bw() +
theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") +
labs(title = "Polynomial Derajat = 3")+
theme_bw() +
theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") +
labs(title = "Polynomial Derajat = 4")+
theme_bw() +
theme(plot.title = element_text(size=9))
ggarrange(g1, g2, g3, g4, ncol = 2, nrow = 2, widths = c(10,10) ,heights = c(2,2))Secara visual, model polinomial derajat 3 dan 4 tampak dapat menggambarkan pola hubungan data mpg dan acceleration lebih baik dibandingkan dengan polinomial derajat lainnya.
Secara Empiris
train_control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
cv_polynomda <- function(df,i){
f <- bquote(mpg ~ poly(acceleration, degree = .(i)))
set.seed(581)
model_poly <- train(as.formula(f),
data = df,
trControl = train_control,
method = "lm",
na.action = na.pass)
predict <- predict(model_poly, df)
metric_model_poly <- model_poly$resample[,-4]
#poly <- apply(metric_model_poly,2,mean)
return( metric_model_poly)
}Plot CV
mae <- c()
rmse <- c()
for ( i in 1:4){
temp_er <- cv_polynomda(df,i)
rmse[i] <- mean(temp_er$RMSE)
mae [i] <- mean(temp_er$MAE)
}
#RMSE
pa <- data.frame(degree = 1:4, rmse) %>%
mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
ggplot(aes(x = degree, y = rmse)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_RMSE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 12, 0.02)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none",plot.title = element_text(size=10), plot.subtitle = element_text(size=8)) +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'acceleration' polynomial degree dengan RMSE",
x = "Degree",
y = "CV RMSE")
#MAE
pb <- data.frame(degree = 1:4, mae) %>%
mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
ggplot(aes(x = degree, y = mae)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_MAE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 10, 0.02)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none",plot.title = element_text(size=9),plot.subtitle = element_text(size=8) ) +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'acceleration' polynomial degree dengan MAE",
x = "Degree",
y = "CV MAE")
ggarrange(pa, pb, ncol = 2, nrow =1)Berdasarkan k fold cv, didapatkan Model Polynomial dengan derajat = 4 yang memiliki RMSE dan MAE terkecil.
Model Terbaik Polynomal
ggplot(df,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 2") +
xlab("displacement") +
ylab("mile per gallon")Secara Visual
g2 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") +
labs(title = "Breaks= 2")+
theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") +
labs(title = "Breaks= 3")+
theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") +
labs(title = "Breaks= 4")+
theme(plot.title = element_text(size=9))
g5 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") +
labs(title = "Breaks= 5")+
theme(plot.title = element_text(size=9))
g6 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") +
labs(title = "Breaks= 6")+
theme(plot.title = element_text(size=9))
g7 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") +
labs(title = "Breaks= 7")+
theme(plot.title = element_text(size=9))
g8 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") +
labs(title = "Breaks= 8")+
theme(plot.title = element_text(size=9))
g9 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") +
labs(title = "Breaks= 9")+
theme(plot.title = element_text(size=9))
g10 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") +
labs(title = "Breaks= 10")+
theme(plot.title = element_text(size=9))
grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 4 dan 5 yang lebih menggambarkan pola hubungan data antara mpg dan acceleration.
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10,strata = "mpg")
breaks <- 2:10
best_tangga_ac <- map_dfr(breaks, function(i){
metric_tangga_ac <- map_dfr(cross_val$splits,
function(x){
training <- df[x$in_id,]
training$acceleration <- cut(training$acceleration,i)
mod <- lm(mpg ~ acceleration,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- df[-x$in_id,]
acceleration_new <- cut(testing$acceleration,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(acceleration=acceleration_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_ac
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_ac <- colMeans(metric_tangga_ac)
mean_metric_tangga_ac
}
)
best_tangga_ac <- cbind(breaks=breaks,best_tangga_ac)
# menampilkan hasil all breaks
datatable(best_tangga_ac)Plot CV_Acceleration
#Plot PC_acceleration_Berdasarkan kriteria RMSE
best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga_ac[which.min(best_tangga$rmse), ], color="#00CC99",
size=3) +
ggtitle("K-Fold Cross Validation- RMSE Criteria") +
xlab("Breaks") +
ylab("Root Mean Square Error") +
theme_bw ()#Plot PC_Weight_Berdasarkan kriteria MAE
best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga_ac[which.min(best_tangga$mae), ], color="#00CC99",
size=3) +
ggtitle("K-Fold Cross Validation- MAE Criteria") +
xlab("Breaks") +
ylab(" Mean Absolute Error") +
theme_bw ()Berdasarkan k fold cv, didapatkan Model Piecewise Constant dengan knot = 5 yang memiliki RMSE dan MAE terkecil.
Model Terbaik Piecewise Constant
ggplot(df,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~cut(x,5,raw=T),
lty = 1, col = "darkblue",se = F)+
theme_bw()+
ggtitle("Regresi Piecewise Constant K= 5") +
xlab("Acceleration") +
ylab("mile per gallon")Secar Visual
g1 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") +
labs(title = "df = 4 (1 knots)")+
theme_bw()
g2 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") +
labs(title = "df = 5 (2 knot)")+ theme_bw()
g3 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") +
labs(title = "df = 6 (3 knots)")+ theme_bw()
g4 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") +
labs(title = "df = 7 (4 knots)") + theme_bw()
g5 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") +
labs(title = "df = 8 (5 knots)") + theme_bw()
g6 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") +
labs(title = "df = 9 (6 knots)") + theme_bw()
g7 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") +
labs(title = "df = 10 (7 knots)") + theme_bw()
g8 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") +
labs(title = "df = 11 (8 knots)") + theme_bw()
g9 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") +
labs(title = "df = 12 (9 knots)") + theme_bw()
g10 <- ggplot(df, aes(x = acceleration, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") +
labs(title = "df = 13 (10 knots)") + theme_bw()
grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 4)Secara visual, model basic spline dengan df = 7 lebih nampak mulus, dan ditengah maupun diujung ujung kurva tampak lebih teratur dan tidak liar. Model ini secara visual cukup baik menggambarkan pola hubungan data antara mpg dan acceleration.
Secara Empiris
set.seed(123)
cross_val <- vfold_cv(df,v=10, repeats=5, strata = "mpg")
dg <- 2:10
best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(acceleration,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_bspline
# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)
mean_metric_bspline
}
)
best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)Plot CV BS
Berdasarkan RMSE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan bukti empiris dengan repeated cross validation, model basic spline dengan RMSE terkecil dengan knot dengan df = 4 dan pada nilai MAE terkecil terletak pada model dengan df= 7.
Plot Basic Spline
Menentukan titik knots
df_4<-attr(bs(df$acceleration, df=4),"knots")
df_4## 50%
## 15.5
df_7<-attr(bs(df$acceleration, df=7),"knots")
df_7## 20% 40% 60% 80%
## 13.42 14.80 16.00 17.70
Membuat Plot
#Membuat Plot B spline df=4
pa_bs4 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~bs(x, df=4),
lty = 1, se = F) +
ggtitle("B-Spline (df=4)") +
xlab("acceleration") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = c(15.1), col="grey50", lty=2)
#Membuat Plot B spline df=7
pa_bs7 <- ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~bs(x, df=7),
lty = 1, se = F) +
ggtitle("B-Spline (df=7)") +
xlab("acceleration") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = c(13.42, 14.80, 16.00, 17.70 ), col="grey50", lty=2)
ggarrange( pa_bs4, pa_bs7, ncol=2, nrow = 1)Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")
dg <- 2:15
best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(acceleration,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)Plot CV NS
Berdasarkan RMSE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Plot Natural Spline
Menentukan titik knots
df_8a<-attr(ns(df$acceleration, df=8),"knots")
df_8a## 12.5% 25% 37.5% 50% 62.5% 75% 87.5%
## 12.5000 13.7750 14.5000 15.5000 16.2000 17.0250 18.7125
Membuat Plot
ggplot(Auto,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~ns(x, df=8),
lty = 1, se = F) +
ggtitle("Natural Splines (df=8)") +
xlab("Acceleration") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = df_8a, col="grey50", lty=2)Komparasi model terbaik dari pola hubungan antara mpg dan acceleration dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :
Model Polynomial memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model polynomial dengan derajat 4 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan acceleration dibandingkan dengan model lainnya.
Secara Visual
g1 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 1, raw = T)") +
labs(title = "Polynomial Derajat = 1")+
theme_bw() +
theme(plot.title = element_text(size=9))
g2 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") +
labs(title = "Polynomial Derajat = 2")+
theme_bw() +
theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") +
labs(title = "Polynomial Derajat = 3")+
theme_bw() +
theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") +
labs(title = "Polynomial Derajat = 4")+
theme_bw() +
theme(plot.title = element_text(size=9))
ggarrange(g1, g2, g3, g4, ncol = 2, nrow = 2, widths = c(10,10) ,heights = c(2,2))Secara visual, model polinomial derajat 3 dan 4 tampak dapat menggambarkan pola hubungan data mpg dan horsepower lebih baik dibandingkan dengan polinomial derajat lainnya.
Secara Empiris
train_control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
cv_polynomdh <- function(df,i){
f <- bquote(mpg ~ poly(horsepower, degree = .(i)))
set.seed(581)
model_poly <- train(as.formula(f),
data = df,
trControl = train_control,
method = "lm",
na.action = na.pass)
predict <- predict(model_poly, df)
metric_model_poly <- model_poly$resample[,-4]
#poly <- apply(metric_model_poly,2,mean)
return( metric_model_poly)
}Plot CV
mae <- c()
rmse <- c()
for ( i in 1:4){
temp_er <- cv_polynomdh(df,i)
rmse[i] <- mean(temp_er$RMSE)
mae [i] <- mean(temp_er$MAE)
}
#RMSE
ph <- data.frame(degree = 1:4, rmse) %>%
mutate(min_RMSE = as.numeric(min(rmse) == rmse)) %>%
ggplot(aes(x = degree, y = rmse)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_RMSE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 12, 0.2)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none",plot.title = element_text(size=10), plot.subtitle = element_text(size=8)) +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'horsepower' polynomial degree dengan RMSE",
x = "Degree",
y = "CV RMSE")
#MAE
pi <- data.frame(degree = 1:4, mae) %>%
mutate(min_MAE = as.numeric(min(mae) == mae)) %>%
ggplot(aes(x = degree, y = mae)) +
geom_line(col = "grey55") +
geom_point(size = 2, aes(col = factor(min_MAE))) +
scale_x_continuous(breaks = 1:4, minor_breaks = NULL) +
scale_y_continuous(breaks = seq(0, 10, 0.2)) +
scale_color_manual(values = c("deepskyblue3", "green")) +
theme_bw()+
theme(legend.position = "none",plot.title = element_text(size=9),plot.subtitle = element_text(size=8) ) +
labs(title = "Auto Dataset - Polynomial Regression",
subtitle = "Memilih 'horsepower' polynomial degree dengan MAE",
x = "Degree",
y = "CV MAE")
ggarrange(ph, pi, ncol = 2, nrow =1)Berdasarkan k fold cv, didapatkan Model Polynomial dengan derajat = 2 yang memiliki RMSE dan MAE terkecil.
Model Terbaik Polynomal
ggplot(df,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 2") +
xlab("horsepower") +
ylab("mile per gallon")Secar Visual
g2 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 2, raw = T)") +
labs(title = "Breaks= 2")+
theme(plot.title = element_text(size=9))
g3 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 3, raw = T)") +
labs(title = "Breaks= 3")+
theme(plot.title = element_text(size=9))
g4 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 4, raw = T)") +
labs(title = "Breaks= 4")+
theme(plot.title = element_text(size=9))
g5 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 5, raw = T)") +
labs(title = "Breaks= 5")+
theme(plot.title = element_text(size=9))
g6 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 6, raw = T)") +
labs(title = "Breaks= 6")+
theme(plot.title = element_text(size=9))
g7 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 7, raw = T)") +
labs(title = "Breaks= 7")+
theme(plot.title = element_text(size=9))
g8 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 8, raw = T)") +
labs(title = "Breaks= 8")+
theme(plot.title = element_text(size=9))
g9 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 9, raw = T)") +
labs(title = "Breaks= 9")+
theme(plot.title = element_text(size=9))
g10 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ cut(x, 10, raw = T)") +
labs(title = "Breaks= 10")+
theme(plot.title = element_text(size=9))
grid.arrange(g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 3)Visualisasi piecewise contstant tersebut menunjukkan bahwa, model dengan k yang semakin besar memberikan kurva yang semakin mulus, namun diujung kurva tampak kurang beraturan. Dari 9 plot tersebut, terlihat plot 7 dan 8 yang lebih menggambarkan pola hubungan data antara mpg dan acceleration.
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
datatable(best_tangga)Plot CV_Horsepower
#Plot PC_horsepower_Berdasarkan kriteria RMSE
pch2 <-best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga_ac[which.min(best_tangga$rmse), ], color="#00CC99",
size=3) +
ggtitle("K-Fold Cross Validation- RMSE Criteria") +
xlab("Breaks") +
ylab("Root Mean Square Error") +
theme_bw ()#Plot PC_Weight_Berdasarkan kriteria MAE
pch3 <- best_tangga_ac$breaks <- as.factor(best_tangga_ac$breaks)
ggplot(data=best_tangga_ac, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_tangga_ac[which.min(best_tangga$mae), ], color="#00CC99",
size=3) +
ggtitle("K-Fold Cross Validation- MAE Criteria") +
xlab("Breaks") +
ylab(" Mean Absolute Error") +
theme_bw ()Berdasarkan k fold cv, didapatkan Model Piecewise Constant dengan knot = 8 yang memiliki RMSE dan MAE terkecil.
Model Terbaik Piecewise Constant
ggplot(df,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~cut(x,8,raw=T),
lty = 1, col = "darkblue",se = F)+
theme_bw()+
ggtitle("Regresi Piecewise Constant K= 8") +
xlab("Acceleration") +
ylab("mile per gallon")Secara Visual
g1 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 3)") +
labs(title = "df = 4 (1 knots)")+
theme_bw()
g2 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 4)") +
labs(title = "df = 5 (2 knot)")+ theme_bw()
g3 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 5)") +
labs(title = "df = 6 (3 knots)")+ theme_bw()
g4 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 6)") +
labs(title = "df = 7 (4 knots)") + theme_bw()
g5 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 7)") +
labs(title = "df = 8 (5 knots)") + theme_bw()
g6 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 8)") +
labs(title = "df = 9 (6 knots)") + theme_bw()
g7 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 9)") +
labs(title = "df = 10 (7 knots)") + theme_bw()
g8 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 10)") +
labs(title = "df = 11 (8 knots)") + theme_bw()
g9 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 11)") +
labs(title = "df = 12 (9 knots)") + theme_bw()
g10 <- ggplot(df, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color= "#00CC99") +
geom_smooth(method = "lm", formula = "y ~ bs(x, df = 12)") +
labs(title = "df = 13 (10 knots)") + theme_bw()
grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, ncol = 4)Secara visual, model basic spline dengan df = 5 lebih nampak mulus, dan ditengah maupun diujung ujung kurva tampak lebih teratur dan tidak liar. Model ini secara visual cukup baik menggambarkan pola hubungan data antara mpg dan acceleration.
Secara Empiris
set.seed(581)
cross_val <- vfold_cv(df,v=10, repeats=5, strata = "mpg")
dg <- 2:10
best_bspline <- map_dfr(dg, function(i){
metric_bspline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(horsepower,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_bspline
# menghitung rata-rata untuk 10 folds
mean_metric_bspline <- colMeans(metric_bspline)
mean_metric_bspline
}
)
best_bspline <- cbind(df=dg,best_bspline)
# menampilkan hasil all breaks
datatable(best_bspline)Plot CV BS
Berdasarkan RMSE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_bspline$df <- as.factor(best_bspline$df)
ggplot(data=best_bspline, aes(x=df, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_bspline[which.min(best_bspline$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Plot Basic Spline
Menentukan titik knots
df_5h<-attr(bs(df$horsepower, df=5),"knots")
df_5h## 33.33333% 66.66667%
## 84 110
Membuat Plot
#Membuat Plot B spline df=5
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~bs(x, df=5),
lty = 1, se = F) +
ggtitle("B-Spline (df=5)") +
xlab("horsepower") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = df_5h, col="grey50", lty=2)Secara Empiris
set.seed(123)
cross_val <- vfold_cv(df,v=10, repeats=3, strata = "mpg")
dg <- 2:20
best_spline3 <- map_dfr(dg, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=df[x$in_id,])
pred <- predict(mod,
newdata=df[-x$in_id,])
truth <- df[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=dg,best_spline3)
# menampilkan hasil all breaks
datatable(best_spline3)Plot CV NS
Berdasarkan RMSE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$rmse), ], color="#00CC99",
size=3) +
theme_bw()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Berdasarkan MAE
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
geom_point(data = best_spline3[which.min(best_spline3$mae), ], color="#00CC99",
size=3) +
theme_bw() +
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Mean Absolute Error") +
theme(plot.title = element_text(hjust=0.5, size=11))Plot Natural Spline
Menentukan titik knots
df_10<-attr(ns(Auto$horsepower, df=10),"knots")
df_10## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
Membuat Plot
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="#00CC99") +
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1, se = F) +
ggtitle("Natural Spline (df=10)") +
xlab("Horse Power") + ylab("mile per gallon")+
theme_bw() +
theme(panel.background = element_rect(fill = "white", colour = "white"), strip.background = element_blank(), plot.title = element_text(hjust=0.5, size=11)) +
geom_vline(xintercept = df_10, col="grey50", lty=2)Komparasi model terbaik dari pola hubungan antara mpg dan horsepower dengan nilai RMSE dan MAE terkecil adalah sebagai berikut :
Model Polynomial memiliki nilai RMSE dan MAE terkecil dibandingkan model tak linear lainya, sehingga dalam kasus ini model polynomial dengan derajat 2 merupakan model yang lebih menggambarkan pola hubungan data antara mpg dan horsepower dibandingkan dengan model lainnya.