Lakukan cross validation untuk menghasilkan pemodelan mpg vs horsepower yang optimal menggunakan 3 metode berikut, kemudian bandingkan hasilnya.
Regesi Polinomial
Piecewise constant
Natural cubic splines
Lakukan hal ditas untuk masing-masing subset data berdasarkan asal negara produsennya (Amerika, Eropa, Jepang). Plot model terbaik dalam satu frame.
Berikan ulasan terhadap hasil anda.
Susun bahan presentasi (dengan durasi 5-7 menit), untuk dipresentasikan minggu depan. Presenter akan diacak, sehingga semua orang harus siap presentasi.
library(ISLR)
library(rsample)
library(splines)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## Warning: package 'lubridate' was built under R version 4.1.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.1.2
library(DT)
library(ggplot2)
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:ggpubr':
##
## get_legend
##
## The following object is masked from 'package:lubridate':
##
## stamp
attach(Auto)
## The following object is masked from package:lubridate:
##
## origin
##
## The following object is masked from package:ggplot2:
##
## mpg
library(patchwork)
##
## Attaching package: 'patchwork'
##
## The following object is masked from 'package:cowplot':
##
## align_plots
Data yang digunakan adalah Auto dari package ISLR yang terdiri dari 392 observasi dan 9 variabel. Namun, variabel yang digunakan kali ini hanya 3, yaitu:
mpg : miles per gallon
horsepower: Engine horsepower
origin: origin of car (1. American, 2. European, 3. Japanese)
Auto = Auto %>% select(mpg, horsepower, origin)
datatable(Auto)
Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))
ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
theme_bw() +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
labs(
title = "Scatter Plot of `mpg` and `horsepower`",
x = "Horse Power",
y = "Mile per Gallon"
)
Plot Pemodelan mpg vs horsepower
ggplot(Auto, aes(x = horsepower, y = mpg, color = factor(origin))) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm", formula = y ~ x, lty = 1, col = "black", se = FALSE) +
theme_bw() +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green"))
Amerika
AutoAmerika = Auto[Auto$origin==1,] %>% select(mpg, horsepower, origin)
datatable(AutoAmerika)
ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
theme_bw() +
labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
subtitle = "Amerika")
Eropa
AutoEropa = Auto[Auto$origin==2,] %>% select(mpg, horsepower, origin)
datatable(AutoEropa)
ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
theme_bw() +
labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
subtitle = "Eropa")
Jepang
AutoJepang = Auto[Auto$origin==3,] %>% select(mpg, horsepower, origin)
datatable(AutoJepang)
ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green") +
theme_bw() +
labs(y="Mile per Gallon", x="Horse Power", title = "Scatter Plot `mpg` dan `horsepower`",
subtitle = "Jepang")
plot1 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color = "red") +
theme_bw() +
labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
subtitle = "Amerika")
plot2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color = "blue") +
theme_bw() +
labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
subtitle = "Eropa")
plot3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color = "green") +
theme_bw() +
labs(y = "Mile per Gallon", x = "Horse Power", title = "Scatter Plot",
subtitle = "Jepang")
combined_plot <- plot1 + plot2 + plot3
combined_plot
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
ordo <- 2:10
best_poly <- map_dfr(ordo, function(i) {
metric_poly <- map_dfr(cross_val$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,ordo=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(ordo=ordo,best_poly)
best_poly
## ordo rmse mae
## 1 2 4.351129 3.266460
## 2 3 4.355430 3.266945
## 3 4 4.367009 3.280452
## 4 5 4.318767 3.249003
## 5 6 4.317431 3.257272
## 6 7 4.293242 3.233376
## 7 8 4.309511 3.251758
## 8 9 4.323321 3.244919
## 9 10 4.371903 3.274887
Berdasarkan rmse dan mae
best_poly %>% slice_min(rmse)
## ordo rmse mae
## 1 7 4.293242 3.233376
best_poly %>% slice_min(rmse)
## ordo rmse mae
## 1 7 4.293242 3.233376
Plot CV
best_poly$ordo <- as.factor(best_poly$ordo)
ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
Plot
Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))
A <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ poly(x, 2, raw = TRUE),
lty = 1, col = "black", se = FALSE) +
theme_bw() +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
ggtitle("Regresi Polinomial Ordo 2") +
xlab("Horse Power") +
ylab("Mile per Gallon")
B <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ poly(x, 7, raw = TRUE),
lty = 1, col = "black", se = FALSE) +
theme_bw() +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
ggtitle("Regresi Polinomial Ordo 7") +
xlab("Horse Power") +
ylab("Mile per Gallon")
ggarrange(A, B, ncol = 2, nrow = 1)
Summary Model
polinom = lm(mpg ~ poly(horsepower,2))
summary(polinom)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga
## breaks rmse mae
## 1 2 6.147626 4.790234
## 2 3 5.775792 4.521829
## 3 4 4.985270 3.774549
## 4 5 4.712623 3.571614
## 5 6 4.644383 3.548375
## 6 7 4.554116 3.379980
## 7 8 4.437597 3.405433
## 8 9 4.549668 3.471999
## 9 10 4.589172 3.455531
Berdasarkan rmse dan mae
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 8 4.437597 3.405433
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 7 4.554116 3.37998
Plot CV
# Assuming best_tangga$breaks is already converted to a factor
best_tangga$breaks <- as.factor(best_tangga$breaks)
# First plot
plot_rmse <- ggplot(data = best_tangga, aes(x = breaks, y = rmse, group = 1)) +
geom_line(linetype = "dashed") +
geom_point() +
ggtitle("K-Fold Cross Validation - RMSE") +
xlab("Breaks") +
ylab("Root Mean Square Error")
# Second plot
plot_mae <- ggplot(data = best_tangga, aes(x = breaks, y = mae, group = 1)) +
geom_line(linetype = "dashed") +
geom_point() +
ggtitle("K-Fold Cross Validation - MAE") +
xlab("Breaks") +
ylab("MAE")
# Combine the plots into one row
combined_plot <- plot_rmse + plot_mae
# Display the combined plot
combined_plot
Plot
pc7 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ cut(x, 7),
lty = 1, col = "black", se = FALSE) +
theme_bw() +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
ggtitle("K=7") +
xlab("Horse Power") +
ylab("Mile per Gallon")
pc8 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ cut(x, 8),
lty = 1, col = "black", se = FALSE) +
theme_bw() +
scale_color_manual(values = c("1" = "red", "2" = "blue", "3" = "green")) +
ggtitle("K=8") +
xlab("Horse Power") +
ylab("Mile per Gallon")
ggarrange(pc7, pc8, ncol = 2, nrow = 1)
Summary Model
tangga = lm(mpg ~ cut(horsepower,7))
summary(tangga)
##
## Call:
## lm(formula = mpg ~ cut(horsepower, 7))
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5280 -2.5530 -0.3758 2.2881 16.7482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5280 0.5069 64.17 <2e-16 ***
## cut(horsepower, 7)(72.3,98.6] -6.8162 0.6358 -10.72 <2e-16 ***
## cut(horsepower, 7)(98.6,125] -12.1523 0.7591 -16.01 <2e-16 ***
## cut(horsepower, 7)(125,151] -16.5763 0.7957 -20.83 <2e-16 ***
## cut(horsepower, 7)(151,177] -18.5020 1.0831 -17.08 <2e-16 ***
## cut(horsepower, 7)(177,204] -19.4447 1.4187 -13.71 <2e-16 ***
## cut(horsepower, 7)(204,230] -19.6280 1.5375 -12.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared: 0.6594, Adjusted R-squared: 0.6541
## F-statistic: 124.2 on 6 and 385 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:15
best_spline3 <- map_dfr(df, function(i) {
metric_spline3 <- map_dfr(cross_val$splits,
function(x) {
mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=df,best_spline3)
best_spline3
## df rmse mae
## 1 2 4.330108 3.252908
## 2 3 4.348006 3.261538
## 3 4 4.329461 3.257926
## 4 5 4.313312 3.246412
## 5 6 4.310011 3.237990
## 6 7 4.294671 3.229174
## 7 8 4.293369 3.232221
## 8 9 4.281877 3.211860
## 9 10 4.260316 3.192010
## 10 11 4.288667 3.220958
## 11 12 4.285063 3.192363
## 12 13 4.302071 3.220998
## 13 14 4.322821 3.244848
## 14 15 4.305928 3.250204
Berdasarkan rmse dan mae
best_spline3 %>% slice_min(rmse)
## df rmse mae
## 1 10 4.260316 3.19201
best_spline3 %>% slice_min(mae)
## df rmse mae
## 1 10 4.260316 3.19201
Plot CV
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
Knots
attr(ns(Auto$horsepower, df=10),"knots")
## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
Plot
Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))
ncs <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ ns(x, df = 10),
lty = 1, col = "black", se = FALSE) +
theme_bw() +
ggtitle("DF = 10") +
xlab("Horse Power") +
ylab("Mile per Gallon") +
geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7), col = "black", lty = 2)
ncs
Summary Model
regspline = lm(mpg ~ ns(horsepower, df=10))
summary(regspline)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 10))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5003 -2.4956 -0.2086 2.2577 14.6400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.462 1.680 19.324 < 2e-16 ***
## ns(horsepower, df = 10)1 -4.600 1.789 -2.572 0.010495 *
## ns(horsepower, df = 10)2 -3.007 2.555 -1.177 0.239996
## ns(horsepower, df = 10)3 -8.663 2.043 -4.239 2.82e-05 ***
## ns(horsepower, df = 10)4 -7.170 2.188 -3.278 0.001142 **
## ns(horsepower, df = 10)5 -14.722 2.125 -6.927 1.84e-11 ***
## ns(horsepower, df = 10)6 -8.446 2.467 -3.424 0.000684 ***
## ns(horsepower, df = 10)7 -16.947 2.080 -8.148 5.38e-15 ***
## ns(horsepower, df = 10)8 -21.715 1.891 -11.483 < 2e-16 ***
## ns(horsepower, df = 10)9 -14.384 4.116 -3.494 0.000531 ***
## ns(horsepower, df = 10)10 -22.326 1.946 -11.476 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared: 0.7124, Adjusted R-squared: 0.7049
## F-statistic: 94.39 on 10 and 381 DF, p-value: < 2.2e-16
Berdasarkan Nilai RMSE dan MAE
nilai_metric <- rbind(best_poly %>% select(-1) %>% slice_min(rmse),
best_tangga %>% select(-1) %>% slice_min(rmse),
best_spline3 %>% select(-1)%>% slice_min(rmse)
)
nama_model <- c("Polinomial",
"Piecewise Constant",
"Natural Cubic Splines")
komparasi <- data.frame(nama_model,nilai_metric)
komparasi
## nama_model rmse mae
## 1 Polinomial 4.293242 3.233376
## 2 Piecewise Constant 4.437597 3.405433
## 3 Natural Cubic Splines 4.260316 3.192010
Berdasarkan Plot
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Assuming 'origin' is a factor variable
Auto$origin <- factor(Auto$origin, levels = c('1', '2', '3'))
# Create the poly2 plot without origin legend
poly2 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ poly(x, 2, raw = TRUE),
lty = 1, col = "black", se = FALSE) +
theme_bw() +
ggtitle("poly2") +
guides(color = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Create the pc7 plot without origin legend
pc7 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ cut(x, 7),
lty = 1, col = "black", se = FALSE) +
theme_bw() +
ggtitle("pc7") +
guides(color = FALSE)
# Create the nc10 plot without origin legend
nc10 <- ggplot(Auto, aes(x = horsepower, y = mpg, color = origin)) +
geom_point(alpha = 0.55) +
stat_smooth(method = "lm",
formula = y ~ ns(x, knots = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7)),
col = "black", se = FALSE) +
theme_bw() +
ggtitle("nc10") +
guides(color = FALSE)
# Arrange the plots in a grid
grid.arrange(poly2, pc7, nc10, ncol = 3)
Berdasarkan rmse dan mae
Cross Validation
set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")
ordo <- 2:10
best_poly_amerika <- map_dfr(ordo, function(i) {
metric_poly_amerika <- map_dfr(cross_val_amerika$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,ordo=i),
data=AutoAmerika[x$in_id,])
pred <- predict(mod, newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly_amerika
# menghitung rata-rata untuk 10 folds
mean_metric_poly_amerika <- colMeans(metric_poly_amerika)
mean_metric_poly_amerika
}
)
best_poly_amerika <- cbind(ordo=ordo,best_poly_amerika)
best_poly_amerika
## ordo rmse mae
## 1 2 3.799433 2.887688
## 2 3 3.804542 2.901911
## 3 4 3.823584 2.900848
## 4 5 3.808271 2.892205
## 5 6 3.832598 2.912551
## 6 7 3.781019 2.853644
## 7 8 3.791754 2.866697
## 8 9 3.890987 2.898198
## 9 10 4.021537 2.953468
Berdasarkan rmse dan mae
best_poly_amerika %>% slice_min(rmse)
## ordo rmse mae
## 1 7 3.781019 2.853644
best_poly_amerika %>% slice_min(rmse)
## ordo rmse mae
## 1 7 3.781019 2.853644
Plot CV
best_poly_amerika$ordo <- as.factor(best_poly_amerika$ordo)
ggplot(data=best_poly_amerika, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
Plot
H <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 2") +
xlab("Horse Power") +
ylab("mile per gallon")
I <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 7") +
xlab("Horse Power") +
ylab("mile per gallon")
ggarrange(H, I, ncol = 2, nrow = 1)
Summary Model
polinom_amerika = lm(mpg ~ poly(horsepower,2))
summary(polinom_amerika)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")
breaks <- 2:10
best_tangga_amerika <- map_dfr(breaks, function(i){
metric_tangga_amerika <- map_dfr(cross_val_amerika$splits,
function(x){
training <- AutoAmerika[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- AutoAmerika[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_amerika
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_amerika <- colMeans(metric_tangga_amerika)
mean_metric_tangga_amerika
})
best_tangga_amerika <- cbind(breaks=breaks,best_tangga_amerika)
best_tangga_amerika
## breaks rmse mae
## 1 2 4.997476 3.818722
## 2 3 4.759908 3.710652
## 3 4 4.139362 3.076968
## 4 5 3.966889 3.067326
## 5 6 4.222039 3.206859
## 6 7 4.150320 3.162726
## 7 8 3.945534 2.949718
## 8 9 3.737522 2.856986
## 9 10 3.816464 2.963023
Berdasarkan rmse dan mae
best_tangga_amerika %>% slice_min(rmse)
## breaks rmse mae
## 1 9 3.737522 2.856986
best_tangga_amerika %>% slice_min(mae)
## breaks rmse mae
## 1 9 3.737522 2.856986
Plot CV
best_tangga_amerika$breaks <- as.factor(best_tangga_amerika$breaks)
ggplot(data = best_tangga_amerika, aes(x = breaks, group = 1)) +
geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
geom_point(aes(y = rmse), color = "blue") +
geom_line(aes(y = mae), linetype = "dashed", color = "red") +
geom_point(aes(y = mae), color = "red") +
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Metric Value")
Plot
pc9<- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("K=9") +
xlab("Horse Power") + ylab("mile per gallon")
ggarrange(pc9, ncol = 1, nrow = 1)
Summary Model
tangga_amerika = lm(mpg ~ cut(horsepower,9))
summary(tangga_amerika)
##
## Call:
## lm(formula = mpg ~ cut(horsepower, 9))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0593 -2.8829 -0.0593 2.4977 16.2133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.3711 0.7379 46.577 < 2e-16 ***
## cut(horsepower, 9)(66.4,86.9] -5.3118 0.8580 -6.191 1.54e-09 ***
## cut(horsepower, 9)(86.9,107] -11.4882 0.8550 -13.437 < 2e-16 ***
## cut(horsepower, 9)(107,128] -13.7332 1.0506 -13.071 < 2e-16 ***
## cut(horsepower, 9)(128,148] -17.8844 1.1110 -16.097 < 2e-16 ***
## cut(horsepower, 9)(148,169] -19.8544 1.0580 -18.766 < 2e-16 ***
## cut(horsepower, 9)(169,189] -20.5711 1.3871 -14.830 < 2e-16 ***
## cut(horsepower, 9)(189,210] -21.8086 1.7695 -12.324 < 2e-16 ***
## cut(horsepower, 9)(210,230] -21.2599 1.6864 -12.607 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.549 on 383 degrees of freedom
## Multiple R-squared: 0.6673, Adjusted R-squared: 0.6603
## F-statistic: 96 on 8 and 383 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val_amerika <- vfold_cv(AutoAmerika,v=10,strata = "mpg")
df <- 2:15
best_spline3_amerika <- map_dfr(df, function(i) {
metric_spline3_amerika <- map_dfr(cross_val_amerika$splits,
function(x) {
mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoAmerika[x$in_id,])
pred <- predict(mod, newdata=AutoAmerika[-x$in_id,])
truth <- AutoAmerika[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3_amerika
# menghitung rata-rata untuk 10 folds
mean_metric_spline3_amerika <- colMeans(metric_spline3_amerika)
mean_metric_spline3_amerika
}
)
best_spline3_amerika <- cbind(df=df,best_spline3_amerika)
best_spline3_amerika
## df rmse mae
## 1 2 3.786961 2.889484
## 2 3 3.797055 2.892392
## 3 4 3.815970 2.903793
## 4 5 3.796834 2.852705
## 5 6 3.800473 2.869119
## 6 7 3.769022 2.814088
## 7 8 3.774172 2.816419
## 8 9 3.804854 2.843316
## 9 10 3.814011 2.838197
## 10 11 3.906193 2.898857
## 11 12 3.919305 2.885175
## 12 13 3.930376 2.888267
## 13 14 3.946494 2.921426
## 14 15 3.858799 2.880199
Berdasarkan rmse dan mae
best_spline3_amerika %>% slice_min(rmse)
## df rmse mae
## 1 7 3.769022 2.814088
best_spline3_amerika %>% slice_min(mae)
## df rmse mae
## 1 7 3.769022 2.814088
Plot CV
best_spline3_amerika$df <- as.factor(best_spline3_amerika$df)
ggplot(data=best_spline3_amerika, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
Knots
attr(ns(AutoAmerika$horsepower, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 82.85714 90.00000 100.00000 110.00000 145.00000 165.00000
Plot
ncs <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="red")+
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("DF =7") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(82.5, 90, 100, 110, 145, 165), col="grey50", lty=2)
ncs
Summary Model
regspline_amerika = lm(mpg ~ ns(horsepower, df=7))
summary(regspline_amerika)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 7))
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6158 -2.5868 -0.1339 2.2474 14.4829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.748 1.563 21.592 < 2e-16 ***
## ns(horsepower, df = 7)1 -7.888 1.694 -4.657 4.43e-06 ***
## ns(horsepower, df = 7)2 -7.844 1.923 -4.078 5.52e-05 ***
## ns(horsepower, df = 7)3 -14.226 1.812 -7.853 4.11e-14 ***
## ns(horsepower, df = 7)4 -12.527 2.077 -6.032 3.81e-09 ***
## ns(horsepower, df = 7)5 -24.049 1.715 -14.021 < 2e-16 ***
## ns(horsepower, df = 7)6 -19.387 3.664 -5.291 2.05e-07 ***
## ns(horsepower, df = 7)7 -20.803 1.761 -11.816 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.272 on 384 degrees of freedom
## Multiple R-squared: 0.7057, Adjusted R-squared: 0.7004
## F-statistic: 131.6 on 7 and 384 DF, p-value: < 2.2e-16
Berdasarkan Nilai RMSE dan MAE
nilai_metric_amerika <- rbind(best_poly_amerika %>% select(-1) %>% slice_min(rmse),
best_tangga_amerika %>% select(-1) %>% slice_min(rmse),
best_spline3_amerika %>% select(-1)%>% slice_min(rmse)
)
nama_model_amerika <- c("Polinomial",
"Piecewise Constant",
"Natural Cubic Splines")
komparasi_amerika <- data.frame(nama_model_amerika,nilai_metric_amerika)
komparasi_amerika
## nama_model_amerika rmse mae
## 1 Polinomial 3.781019 2.853644
## 2 Piecewise Constant 3.737522 2.856986
## 3 Natural Cubic Splines 3.769022 2.814088
Berdasarkan Plot
rl_A <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color = "red") +
theme_bw() +
labs(y = "mpg", x = "Horse Power")
poly2 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
pc9 <- ggplot(AutoAmerika,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="red") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "black",se = F)+
theme_bw()
nc7 <- ggplot(AutoAmerika, aes(x = horsepower, y = mpg), add=TRUE) +
geom_point(alpha = 0.55, color="red") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(82.5, 90, 100, 110, 145, 165)), col = "black", se = F)+
theme_bw()
plot_grid(rl_A, poly2, pc9, nc7, labels = c("rl_A", "poly2","pc9", "nc7"), label_size = 6)
Cross Validation
set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
ordo <- 2:10
best_poly_eropa <- map_dfr(ordo, function(i) {
metric_poly_eropa <- map_dfr(cross_val_eropa$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,ordo=i),
data=AutoEropa[x$in_id,])
pred <- predict(mod, newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly_eropa
# menghitung rata-rata untuk 10 folds
mean_metric_poly_eropa <- colMeans(metric_poly_eropa)
mean_metric_poly_eropa
}
)
best_poly_eropa <- cbind(ordo=ordo,best_poly_eropa)
best_poly_eropa
## ordo rmse mae
## 1 2 4.805676 3.906728
## 2 3 4.832309 3.944713
## 3 4 4.878846 3.984634
## 4 5 5.386692 4.428764
## 5 6 5.906675 4.692453
## 6 7 6.286173 4.754299
## 7 8 6.530337 4.785631
## 8 9 16.788402 8.940515
## 9 10 12.961372 7.304674
Berdasarkan rmse dan mae
best_poly_eropa %>% slice_min(rmse)
## ordo rmse mae
## 1 2 4.805676 3.906728
best_poly_eropa %>% slice_min(rmse)
## ordo rmse mae
## 1 2 4.805676 3.906728
Plot CV
best_poly_eropa$ordo <- as.factor(best_poly_eropa$ordo)
ggplot(data=best_poly_eropa, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
Plot
J <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 2") +
xlab("Horse Power") +
ylab("mile per gallon")
K <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 3") +
xlab("Horse Power") +
ylab("mile per gallon")
ggarrange(J, K, ncol = 2, nrow = 1)
Summary Model
polinom_eropa = lm(mpg ~ poly(horsepower,2))
summary(polinom_eropa)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
breaks <- 2:9
best_tangga_eropa <- map_dfr(breaks, function(i){
metric_tangga_eropa <- map_dfr(cross_val_eropa$splits,
function(x){
training <- AutoEropa[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- AutoEropa[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_eropa
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_eropa <- colMeans(metric_tangga_eropa)
mean_metric_tangga_eropa
})
best_tangga_eropa <- cbind(breaks=breaks,best_tangga_eropa)
best_tangga_eropa
## breaks rmse mae
## 1 2 5.462277 4.313951
## 2 3 5.395409 4.482648
## 3 4 5.226264 4.308287
## 4 5 4.963259 3.982939
## 5 6 5.133934 4.101179
## 6 7 4.949469 3.951007
## 7 8 5.116465 4.154967
## 8 9 5.225155 4.293021
Berdasarkan rmse dan mae
best_tangga_eropa %>% slice_min(rmse)
## breaks rmse mae
## 1 7 4.949469 3.951007
best_tangga_eropa %>% slice_min(mae)
## breaks rmse mae
## 1 7 4.949469 3.951007
Plot CV
best_tangga_eropa$breaks <- as.factor(best_tangga_eropa$breaks)
ggplot(data = best_tangga_eropa, aes(x = breaks, group = 1)) +
geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
geom_point(aes(y = rmse), color = "blue") +
geom_line(aes(y = mae), linetype = "dashed", color = "red") +
geom_point(aes(y = mae), color = "red") +
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Metric Value")
Plot
pc5<- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("K=5") +
xlab("Horse Power") + ylab("mile per gallon")
pc7<- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("K=7") +
xlab("Horse Power") + ylab("mile per gallon")
ggarrange(pc5, pc7, ncol = 2, nrow = 1)
Summary Model
tangga_eropa = lm(mpg ~ cut(horsepower,7))
summary(tangga_eropa)
##
## Call:
## lm(formula = mpg ~ cut(horsepower, 7))
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5280 -2.5530 -0.3758 2.2881 16.7482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5280 0.5069 64.17 <2e-16 ***
## cut(horsepower, 7)(72.3,98.6] -6.8162 0.6358 -10.72 <2e-16 ***
## cut(horsepower, 7)(98.6,125] -12.1523 0.7591 -16.01 <2e-16 ***
## cut(horsepower, 7)(125,151] -16.5763 0.7957 -20.83 <2e-16 ***
## cut(horsepower, 7)(151,177] -18.5020 1.0831 -17.08 <2e-16 ***
## cut(horsepower, 7)(177,204] -19.4447 1.4187 -13.71 <2e-16 ***
## cut(horsepower, 7)(204,230] -19.6280 1.5375 -12.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared: 0.6594, Adjusted R-squared: 0.6541
## F-statistic: 124.2 on 6 and 385 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val_eropa <- vfold_cv(AutoEropa,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
df <- 2:15
best_spline3_eropa <- map_dfr(df, function(i) {
metric_spline3_eropa <- map_dfr(cross_val_eropa$splits,
function(x) {
mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoEropa[x$in_id,])
pred <- predict(mod, newdata=AutoEropa[-x$in_id,])
truth <- AutoEropa[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3_eropa
# menghitung rata-rata untuk 10 folds
mean_metric_spline3_eropa <- colMeans(metric_spline3_eropa)
mean_metric_spline3_eropa
}
)
best_spline3_eropa <- cbind(df=df,best_spline3_eropa)
best_spline3_eropa
## df rmse mae
## 1 2 4.807144 3.901102
## 2 3 4.830632 3.946643
## 3 4 4.921306 4.013528
## 4 5 5.000935 4.056378
## 5 6 5.065292 4.049801
## 6 7 5.126662 4.096608
## 7 8 5.217695 4.177479
## 8 9 5.382172 4.306452
## 9 10 5.465415 4.374886
## 10 11 5.564501 4.422779
## 11 12 5.537463 4.408483
## 12 13 5.513853 4.343441
## 13 14 5.588090 4.477874
## 14 15 5.659180 4.506912
Berdasarkan rmse dan mae
best_spline3_eropa %>% slice_min(rmse)
## df rmse mae
## 1 2 4.807144 3.901102
best_spline3_eropa %>% slice_min(mae)
## df rmse mae
## 1 2 4.807144 3.901102
Plot CV
best_spline3_eropa$df <- as.factor(best_spline3_eropa$df)
ggplot(data=best_spline3_eropa, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
Knots
attr(ns(AutoEropa$horsepower, df=2),"knots")
## 50%
## 76.5
Plot
ncs <- ggplot(AutoEropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue")+
stat_smooth(method = "lm",
formula = y~ns(x, df=2),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("DF =2") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(76.5), col="black", lty=2)
ncs
Summary Model
regspline_eropa = lm(mpg ~ ns(horsepower, df=2))
summary(regspline_eropa)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8454 -2.4171 -0.1837 2.2563 15.8921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.3090 0.6779 56.52 <2e-16 ***
## ns(horsepower, df = 2)1 -41.4766 1.5831 -26.20 <2e-16 ***
## ns(horsepower, df = 2)2 -16.0003 1.2308 -13.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.354 on 389 degrees of freedom
## Multiple R-squared: 0.6904, Adjusted R-squared: 0.6888
## F-statistic: 433.7 on 2 and 389 DF, p-value: < 2.2e-16
Berdasarkan Nilai RMSE dan MAE
nilai_metric_eropa <- rbind(best_poly_eropa %>% select(-1) %>% slice_min(rmse),
best_tangga_eropa %>% select(-1) %>% slice_min(rmse),
best_spline3_eropa %>% select(-1)%>% slice_min(rmse)
)
nama_model_eropa <- c("Polinomial",
"Piecewise Constant",
"Natural Cubic Splines")
komparasi_eropa <- data.frame(nama_model_eropa,nilai_metric_eropa)
komparasi_eropa
## nama_model_eropa rmse mae
## 1 Polinomial 4.805676 3.906728
## 2 Piecewise Constant 4.949469 3.951007
## 3 Natural Cubic Splines 4.807144 3.901102
Berdasarkan Plot
rl_E <- ggplot(AutoEropa, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color = "blue") +
theme_bw() +
labs(y = "mpg", x = "Horse Power")
poly2 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
pc7 <- ggplot(AutoEropa,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "black",se = F)+
theme_bw()
nc2 <- ggplot(AutoEropa, aes(x = horsepower, y = mpg), add=TRUE) +
geom_point(alpha = 0.55, color="blue") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(76.5)), col = "black", se = F)+
theme_bw()
plot_grid(rl_E, poly2, pc7, nc2, labels = c("rl_E", "poly2","pc7", "nc2"), label_size = 6)
Cross Validation
set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
ordo <- 2:10
best_poly_jepang <- map_dfr(ordo, function(i) {
metric_poly_jepang <- map_dfr(cross_val_jepang$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower,ordo=i),
data=AutoJepang[x$in_id,])
pred <- predict(mod, newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly_jepang
# menghitung rata-rata untuk 10 folds
mean_metric_poly_jepang <- colMeans(metric_poly_jepang)
mean_metric_poly_jepang
}
)
best_poly_jepang <- cbind(ordo=ordo,best_poly_jepang)
best_poly_jepang
## ordo rmse mae
## 1 2 4.522705 3.624496
## 2 3 3.962162 3.024320
## 3 4 4.380986 3.311081
## 4 5 3.963580 3.013920
## 5 6 5.441724 3.816592
## 6 7 8.538692 5.108307
## 7 8 12.147461 6.621785
## 8 9 18.314340 9.158679
## 9 10 33.718613 15.414116
Berdasarkan rmse dan mae
best_poly_jepang %>% slice_min(rmse)
## ordo rmse mae
## 1 3 3.962162 3.02432
best_poly_jepang %>% slice_min(rmse)
## ordo rmse mae
## 1 3 3.962162 3.02432
Plot CV
best_poly_jepang$ordo <- as.factor(best_poly_jepang$ordo)
ggplot(data=best_poly_jepang, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
Plot
M <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 3") +
xlab("Horse Power") +
ylab("mile per gallon")
N <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green") +
stat_smooth(method = "lm",
formula = y~poly(x,5,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 5") +
xlab("Horse Power") +
ylab("mile per gallon")
ggarrange(M, N, ncol = 2, nrow = 1)
Summary Model
polinom_jepang = lm(mpg ~ poly(horsepower,3))
summary(polinom_jepang)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7039 -2.4491 -0.1519 2.2035 15.8159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.446 0.221 106.105 <2e-16 ***
## poly(horsepower, 3)1 -120.138 4.375 -27.460 <2e-16 ***
## poly(horsepower, 3)2 44.090 4.375 10.078 <2e-16 ***
## poly(horsepower, 3)3 -3.949 4.375 -0.903 0.367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.375 on 388 degrees of freedom
## Multiple R-squared: 0.6882, Adjusted R-squared: 0.6858
## F-statistic: 285.5 on 3 and 388 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
breaks <- 2:5
best_tangga_jepang <- map_dfr(breaks, function(i){
metric_tangga_jepang <- map_dfr(cross_val_jepang$splits,
function(x){
training <- AutoJepang[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- AutoJepang[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_jepang
# menghitung rata-rata untuk 10 folds
mean_metric_tangga_jepang <- colMeans(metric_tangga_jepang)
mean_metric_tangga_jepang
})
best_tangga_jepang <- cbind(breaks=breaks,best_tangga_jepang)
best_tangga_jepang
## breaks rmse mae
## 1 2 4.306154 3.254086
## 2 3 4.201949 3.360962
## 3 4 4.072662 3.248551
## 4 5 4.226607 3.410855
Berdasarkan rmse dan mae
best_tangga_jepang %>% slice_min(rmse)
## breaks rmse mae
## 1 4 4.072662 3.248551
best_tangga_jepang %>% slice_min(mae)
## breaks rmse mae
## 1 4 4.072662 3.248551
Plot CV
best_tangga_jepang$breaks <- as.factor(best_tangga_jepang$breaks)
ggplot(data = best_tangga_jepang, aes(x = breaks, group = 1)) +
geom_line(aes(y = rmse), linetype = "dashed", color = "blue") +
geom_point(aes(y = rmse), color = "blue") +
geom_line(aes(y = mae), linetype = "dashed", color = "red") +
geom_point(aes(y = mae), color = "red") +
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Metric Value")
Plot
pc4<- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green") +
stat_smooth(method = "lm",
formula = y~cut(x,4),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("K=4") +
xlab("Horse Power") + ylab("mile per gallon")
ggarrange(pc4, ncol = 1, nrow = 1)
Summary Model
tangga_jepang = lm(mpg ~ cut(horsepower,4))
summary(tangga_jepang)
##
## Call:
## lm(formula = mpg ~ cut(horsepower, 4))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0533 -2.7826 -0.2927 2.8593 17.5467
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.0533 0.3581 81.12 <2e-16 ***
## cut(horsepower, 4)(92,138] -8.4506 0.5946 -14.21 <2e-16 ***
## cut(horsepower, 4)(138,184] -14.2707 0.7005 -20.37 <2e-16 ***
## cut(horsepower, 4)(184,230] -16.2004 1.2647 -12.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.001 on 388 degrees of freedom
## Multiple R-squared: 0.5926, Adjusted R-squared: 0.5894
## F-statistic: 188.1 on 3 and 388 DF, p-value: < 2.2e-16
Cross Validation
set.seed(123)
cross_val_jepang <- vfold_cv(AutoJepang,v=10,strata = "mpg")
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
df <- 2:15
best_spline3_jepang <- map_dfr(df, function(i) {
metric_spline3_jepang <- map_dfr(cross_val_jepang$splits,
function(x) {
mod <- lm(mpg ~ ns(horsepower,df=i), data=AutoJepang[x$in_id,])
pred <- predict(mod, newdata=AutoJepang[-x$in_id,])
truth <- AutoJepang[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3_jepang
# menghitung rata-rata untuk 10 folds
mean_metric_spline3_jepang <- colMeans(metric_spline3_jepang)
mean_metric_spline3_jepang
}
)
best_spline3_jepang <- cbind(df=df,best_spline3_jepang)
best_spline3_jepang
## df rmse mae
## 1 2 4.537131 3.647191
## 2 3 4.143445 3.215053
## 3 4 4.215103 3.264447
## 4 5 4.182602 3.158549
## 5 6 4.186622 3.186509
## 6 7 4.189215 3.225135
## 7 8 4.341229 3.347164
## 8 9 4.389358 3.393529
## 9 10 4.405940 3.369006
## 10 11 4.474687 3.458585
## 11 12 4.429249 3.448116
## 12 13 4.445900 3.402477
## 13 14 4.560086 3.583991
## 14 15 4.540602 3.527475
Berdasarkan rmse dan mae
best_spline3_jepang %>% slice_min(rmse)
## df rmse mae
## 1 3 4.143445 3.215053
best_spline3_jepang %>% slice_min(mae)
## df rmse mae
## 1 5 4.182602 3.158549
Plot CV
best_spline3_jepang$df <- as.factor(best_spline3_jepang$df)
ggplot(data=best_spline3_jepang, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
Knots
attr(ns(AutoJepang$horsepower, df=3),"knots")
## 33.33333% 66.66667%
## 68 90
Plot
ncs <- ggplot(AutoJepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="green")+
stat_smooth(method = "lm",
formula = y~ns(x, df=3),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("DF =3") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(68, 90), col="grey50", lty=2)
ncs
Summary Model
regspline_jepang = lm(mpg ~ ns(horsepower, df=3))
summary(regspline_jepang)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8308 -2.4068 -0.1672 2.2524 15.9184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.276 1.039 36.85 <2e-16 ***
## ns(horsepower, df = 3)1 -21.311 1.054 -20.22 <2e-16 ***
## ns(horsepower, df = 3)2 -35.274 2.313 -15.25 <2e-16 ***
## ns(horsepower, df = 3)3 -19.669 1.435 -13.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.362 on 388 degrees of freedom
## Multiple R-squared: 0.6901, Adjusted R-squared: 0.6877
## F-statistic: 288 on 3 and 388 DF, p-value: < 2.2e-16
Berdasarkan Nilai RMSE dan MAE
nilai_metric_jepang <- rbind(best_poly_jepang %>% select(-1) %>% slice_min(rmse),
best_tangga_jepang %>% select(-1) %>% slice_min(rmse),
best_spline3_jepang %>% select(-1)%>% slice_min(rmse)
)
nama_model_jepang <- c("Polinomial",
"Piecewise Constant",
"Natural Cubic Splines")
komparasi_jepang <- data.frame(nama_model_jepang,nilai_metric_jepang)
komparasi_jepang
## nama_model_jepang rmse mae
## 1 Polinomial 3.962162 3.024320
## 2 Piecewise Constant 4.072662 3.248551
## 3 Natural Cubic Splines 4.143445 3.215053
Berdasarkan Plot
rl_J <- ggplot(AutoJepang, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.55, color = "green") +
theme_bw() +
labs(y = "mpg", x = "Horse Power")
poly3 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="green") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()
pc4 <- ggplot(AutoJepang,aes(x=horsepower, y=mpg), add=TRUE) +
geom_point(alpha=0.55, color="green") +
stat_smooth(method = "lm",
formula = y~cut(x,3),
lty = 1, col = "black",se = F)+
theme_bw()
nc3 <- ggplot(AutoJepang, aes(x = horsepower, y = mpg), add=TRUE) +
geom_point(alpha = 0.55, color="green") +
stat_smooth(method = "lm", formula = y~ns(x, knots = c(68, 90)), col = "black", se = FALSE) +
geom_vline(xintercept = c(68, 90), col="black", lty=2)+
theme_bw()
plot_grid(rl_J, poly3, pc4, nc3, labels = c("rl_J", "poly3","pc4", "nc3"), label_size = 6)