Beyond Linearity
library(ISLR)
library(rsample)
library(splines)
library(tidyverse)
library(ggpubr)
library(DT)
attach(Auto)
Initial Plot
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "black",se = F)+
theme_bw()
Polynomial Regression
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
ordo <- 2:10
best_poly <- map_dfr(ordo, function(i) {
metric_poly <- map_dfr(cross_val$splits,
function(x) {
mod <- lm(mpg ~ poly(horsepower, degree = i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# average the metrics over the 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
best_poly <- cbind(ordo=ordo,best_poly)
Plot CV
best_poly$ordo <- as.factor(best_poly$ordo)
ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Degree of Polynomial") +
ylab("Root Mean Square Error")
Plot
A <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 2") +
xlab("Horse Power") +
ylab("mile per gallon")
B <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Ordo 7") +
xlab("Horse Power") +
ylab("mile per gallon")
ggarrange(A, B, ncol = 2, nrow = 1)
Although the degree-7 polynomial has a slightly lower RMSE than the degree-2 polynomial, the degree-2 model is chosen as the better one: in the plots, the degree-7 fit becomes unstable near the edges of the horsepower range.
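A minimal sketch (using the full Auto data and a hypothetical grid of horsepower values near the upper end of the observed range) that makes this edge behavior visible by comparing the predictions of the two fits:
fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto)
fit7 <- lm(mpg ~ poly(horsepower, 7), data = Auto)
# hypothetical grid near the upper boundary of observed horsepower
edge_grid <- data.frame(horsepower = seq(200, 230, by = 5))
data.frame(edge_grid,
           pred_degree2 = predict(fit2, newdata = edge_grid),
           pred_degree7 = predict(fit7, newdata = edge_grid))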
Model Summary
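The data argument is omitted in the call shown below because Auto is attached; the fit being summarized can presumably be reproduced with:
summary(lm(mpg ~ poly(horsepower, 2), data = Auto))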
Call:
lm(formula = mpg ~ poly(horsepower, 2))
Residuals:
Min 1Q Median 3Q Max
-14.7135 -2.5943 -0.0859 2.2868 15.8961
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.4459 0.2209 106.13 <2e-16 ***
poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.374 on 389 degrees of freedom
Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
Piecewise Constant
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:20
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower, data=training)
# recover the numeric break points from the training-bin labels, e.g. "(58.3,70.5]"
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
# re-bin the test horsepower with the same breaks so its levels line up with the training bins
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod, newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
# test values outside the training range become NA; drop them before scoring
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# average the metrics over the 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
Plot CV
best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("Root Mean Square Error")
best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=mae, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("Breaks") +
ylab("MAE")
Plot
pc8<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("K=8") +
xlab("Horse Power") + ylab("mile per gallon")
pc15<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple") +
stat_smooth(method = "lm",
formula = y~cut(x,15),
lty = 1, col = "black",se = F)+
theme_bw()+
ggtitle("K=15") +
xlab("Horse Power") + ylab("mile per gallon")
ggarrange(pc8, pc15, ncol = 2, nrow = 1)
Model Summary
Call:
lm(formula = mpg ~ cut(horsepower, 15))
Residuals:
Min 1Q Median 3Q Max
-13.0137 -2.5309 -0.3381 2.2360 14.9800
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.1800 1.1077 30.857 < 2e-16 ***
cut(horsepower, 15)(58.3,70.5] -0.6514 1.2472 -0.522 0.602
cut(horsepower, 15)(70.5,82.8] -6.1663 1.2601 -4.894 1.47e-06 ***
cut(horsepower, 15)(82.8,95.1] -9.4160 1.1974 -7.864 3.97e-14 ***
cut(horsepower, 15)(95.1,107] -13.2778 1.2756 -10.409 < 2e-16 ***
cut(horsepower, 15)(107,120] -13.1421 1.3644 -9.632 < 2e-16 ***
cut(horsepower, 15)(120,132] -16.8400 1.5665 -10.750 < 2e-16 ***
cut(horsepower, 15)(132,144] -16.4600 1.5665 -10.507 < 2e-16 ***
cut(horsepower, 15)(144,156] -19.3439 1.3184 -14.672 < 2e-16 ***
cut(horsepower, 15)(156,169] -20.3425 1.8782 -10.831 < 2e-16 ***
cut(horsepower, 15)(169,181] -20.3800 1.5665 -13.010 < 2e-16 ***
cut(horsepower, 15)(181,193] -21.0550 2.4141 -8.722 < 2e-16 ***
cut(horsepower, 15)(193,205] -21.8467 2.7133 -8.052 1.08e-14 ***
cut(horsepower, 15)(205,218] -22.3800 2.2154 -10.102 < 2e-16 ***
cut(horsepower, 15)(218,230] -20.1800 2.2154 -9.109 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.29 on 377 degrees of freedom
Multiple R-squared: 0.7087, Adjusted R-squared: 0.6979
F-statistic: 65.51 on 14 and 377 DF, p-value: < 2.2e-16
Natural Cubic Splines
Cross Validation
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:15
best_spline3 <- map_dfr(df, function(i) {
metric_spline3 <- map_dfr(cross_val$splits,
function(x) {
mod <- lm(mpg ~ ns(horsepower,df=i), data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# average the metrics over the 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=df,best_spline3)
Plot CV
best_spline3$df <- as.factor(best_spline3$df)
ggplot(data=best_spline3, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ggtitle("K-Fold Cross Validation") +
xlab("DF") + ylab("Root Mean Square Error")
Plot
ncs <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="purple")+
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,col = "black", se=F)+
theme_bw()+
ggtitle("DF =10") +
xlab("Horse Power") +
ylab("mile per gallon") +
geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7), col="grey50", lty=2)
ncs
Knots
10% 20% 30% 40% 50% 60% 70% 80% 90%
67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
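These knot locations appear to be the deciles of horsepower: with df = 10 (and the default intercept = FALSE), ns() places its nine interior knots at quantiles of the predictor. A minimal sketch of how they can be recovered, assuming the full Auto data:
quantile(Auto$horsepower, probs = seq(0.1, 0.9, by = 0.1))
# equivalently, read the interior knots off the fitted basis
attr(ns(Auto$horsepower, df = 10), "knots")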
Model Summary
Call:
lm(formula = mpg ~ ns(horsepower, df = 10))
Residuals:
Min 1Q Median 3Q Max
-14.5003 -2.4956 -0.2086 2.2577 14.6400
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 32.462 1.680 19.324 < 2e-16 ***
ns(horsepower, df = 10)1 -4.600 1.789 -2.572 0.010495 *
ns(horsepower, df = 10)2 -3.007 2.555 -1.177 0.239996
ns(horsepower, df = 10)3 -8.663 2.043 -4.239 2.82e-05 ***
ns(horsepower, df = 10)4 -7.170 2.188 -3.278 0.001142 **
ns(horsepower, df = 10)5 -14.722 2.125 -6.927 1.84e-11 ***
ns(horsepower, df = 10)6 -8.446 2.467 -3.424 0.000684 ***
ns(horsepower, df = 10)7 -16.947 2.080 -8.148 5.38e-15 ***
ns(horsepower, df = 10)8 -21.715 1.891 -11.483 < 2e-16 ***
ns(horsepower, df = 10)9 -14.384 4.116 -3.494 0.000531 ***
ns(horsepower, df = 10)10 -22.326 1.946 -11.476 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.24 on 381 degrees of freedom
Multiple R-squared: 0.7124, Adjusted R-squared: 0.7049
F-statistic: 94.39 on 10 and 381 DF, p-value: < 2.2e-16
Model Comparison
Based on RMSE and MAE
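The object mean_metric_reg used below is not computed in this section; it is assumed to hold the 10-fold cross-validation metrics of the baseline linear model mpg ~ horsepower. A minimal sketch, mirroring the CV loops above:
metric_reg <- map_dfr(cross_val$splits, function(x) {
  mod <- lm(mpg ~ horsepower, data = Auto[x$in_id, ])
  pred <- predict(mod, newdata = Auto[-x$in_id, ])
  truth <- Auto[-x$in_id, ]$mpg
  c(rmse = mlr3measures::rmse(truth = truth, response = pred),
    mae = mlr3measures::mae(truth = truth, response = pred))
})
# average the metrics over the 10 folds
mean_metric_reg <- colMeans(metric_reg)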
nilai_metric <- rbind(mean_metric_reg,
best_poly %>% select(-1) %>% slice_min(rmse),
best_tangga %>% select(-1) %>% slice_min(rmse),
best_spline3 %>% select(-1)%>% slice_min(rmse)
)
nama_model <- c("Regresi",
"Polinomial",
"Piecewise Constant",
"Natural Cubic Splines")
komparasi <- data.frame(nama_model,nilai_metric)
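The DT package loaded at the top is presumably what renders this comparison table; a one-line sketch:
DT::datatable(komparasi, rownames = FALSE)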
Based on Plots
Data Subsets
America
Data Plot
Model Comparison Based on RMSE and MAE
Model Comparison Based on Plots
Europe
Data Plot
Model Comparison Based on RMSE and MAE
Model Comparison Based on Plots
Japan
Data Plot
rmse: 4.454860   mae: 3.568692
Model Comparison Based on RMSE and MAE
Model Comparison Based on Plots
References
Dito, G. A. (2021, April 14). Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline di R. Retrieved from https://gerrydito.github.io/Regresi-Polinomial,-Regresi-Fungsi-Tangga-dan-Regresi-Spline/
James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction to Statistical Learning with Applications in R. New York: Springer.
Sartono, B. (2020, November 1). Moving Beyond Linearity. Retrieved from https://rpubs.com/bagusco/taklinear