Tugas STA581 | Moving Beyond Linearity
Data Auto
attach(Auto)
head(cbind(mpg, horsepower))## mpg horsepower
## [1,] 18 130
## [2,] 15 165
## [3,] 18 150
## [4,] 16 150
## [5,] 17 140
## [6,] 15 198
Plot Awal
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=1, color="orange") +
theme_bw()Cross Validation
1. Regresi Linear
Secara Visual
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
Secara Empiris
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
function(x) {
mod <- lm(mpg ~ horsepower,
data=Auto[x$in_id,])
pred <- predict(mod, newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
mae <- mlr3measures::mae(truth = truth, response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 4.52 3.73
## 2 4.86 3.80
## 3 4.54 3.35
## 4 4.67 3.99
## 5 5.12 4.12
## 6 4.39 3.25
## 7 5.93 4.65
## 8 5.05 3.95
## 9 4.71 3.75
## 10 5.10 3.73
# menghitung rata-rata untuk 10 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear## rmse mae
## 4.889086 3.832328
2. Regresi Polinomial
Secara Empiris
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
ordo <- 2:10
best_poly <- map_dfr(ordo, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,ordo=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
# menampilkan hasil all ordo
best_poly <- cbind(ordo=ordo,best_poly)
best_poly## ordo rmse mae
## 1 2 4.351129 3.266460
## 2 3 4.355430 3.266945
## 3 4 4.367009 3.280452
## 4 5 4.318767 3.249003
## 5 6 4.317431 3.257272
## 6 7 4.293242 3.233376
## 7 8 4.309511 3.251758
## 8 9 4.323321 3.244919
## 9 10 4.371903 3.274887
#berdasarkan rmse
best_poly %>% slice_min(rmse)## ordo rmse mae
## 1 7 4.293242 3.233376
Plot CV
best_poly$ordo <- as.factor(best_poly$ordo)
ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Ordo") +
ylab("RMSE")+
theme_bw() Secara Visual
A <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=1, color="orange") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 2") +
xlab("horse power") +
ylab("mile per gallon")
B <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=1, color="orange") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()+
ggtitle("Regresi Polinomial Derajat 7") +
xlab("horse power") +
ylab("mile per gallon")
ggarrange(A, B, ncol = 2, nrow = 1)Jika dilihat berdasarkan plot, meskipun nilai RMSE Regresi Polinomial ordo 7 lebih kecil daripada ordo 2, namun model terbaik yang dipilih adalah Regresi Polinomial ordo 2 karena untuk ordo 7 pada bagian ujung-ujung plot cenderung tidak stabil.
Regresi Polinomial Derajat 2
mod_polinomial2 = lm(mpg ~ poly(horsepower,2))
summary(mod_polinomial2)##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw=T),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 4.06 3.03
## 2 4.62 3.59
## 3 4.28 3.22
## 4 4.19 3.13
## 5 4.97 3.94
## 6 3.53 2.51
## 7 4.92 3.66
## 8 4.07 2.96
## 9 3.79 3.07
## 10 5.09 3.55
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2## rmse mae
## 4.351129 3.266460
3. Piecewise Constant (Fungsi Tangga)
Secara Empiris
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:20
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
# menampilkan hasil all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga## breaks rmse mae
## 1 2 6.147626 4.790234
## 2 3 5.775792 4.521829
## 3 4 4.985270 3.774549
## 4 5 4.712623 3.571614
## 5 6 4.644383 3.548375
## 6 7 4.554116 3.379980
## 7 8 4.437597 3.405433
## 8 9 4.549668 3.471999
## 9 10 4.589172 3.455531
## 10 11 4.531039 3.380931
## 11 12 4.442697 3.321151
## 12 13 4.527126 3.375170
## 13 14 4.428824 3.270424
## 14 15 4.313162 3.286289
## 15 16 4.336661 3.302031
## 16 17 4.414343 3.347532
## 17 18 4.340715 3.221099
## 18 19 4.331798 3.224404
## 19 20 4.480710 3.320605
#berdasarkan rmse
best_tangga %>% slice_min(rmse)## breaks rmse mae
## 1 15 4.313162 3.286289
Plot CV
best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("Breaks") +
ylab("RMSE")+
theme_bw()Secara Visual
mod_tangga = lm(mpg ~ cut(horsepower,15))
summary(mod_tangga)##
## Call:
## lm(formula = mpg ~ cut(horsepower, 15))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.0137 -2.5309 -0.3381 2.2360 14.9800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.1800 1.1077 30.857 < 2e-16 ***
## cut(horsepower, 15)(58.3,70.5] -0.6514 1.2472 -0.522 0.602
## cut(horsepower, 15)(70.5,82.8] -6.1663 1.2601 -4.894 1.47e-06 ***
## cut(horsepower, 15)(82.8,95.1] -9.4160 1.1974 -7.864 3.97e-14 ***
## cut(horsepower, 15)(95.1,107] -13.2778 1.2756 -10.409 < 2e-16 ***
## cut(horsepower, 15)(107,120] -13.1421 1.3644 -9.632 < 2e-16 ***
## cut(horsepower, 15)(120,132] -16.8400 1.5665 -10.750 < 2e-16 ***
## cut(horsepower, 15)(132,144] -16.4600 1.5665 -10.507 < 2e-16 ***
## cut(horsepower, 15)(144,156] -19.3439 1.3184 -14.672 < 2e-16 ***
## cut(horsepower, 15)(156,169] -20.3425 1.8782 -10.831 < 2e-16 ***
## cut(horsepower, 15)(169,181] -20.3800 1.5665 -13.010 < 2e-16 ***
## cut(horsepower, 15)(181,193] -21.0550 2.4141 -8.722 < 2e-16 ***
## cut(horsepower, 15)(193,205] -21.8467 2.7133 -8.052 1.08e-14 ***
## cut(horsepower, 15)(205,218] -22.3800 2.2154 -10.102 < 2e-16 ***
## cut(horsepower, 15)(218,230] -20.1800 2.2154 -9.109 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.29 on 377 degrees of freedom
## Multiple R-squared: 0.7087, Adjusted R-squared: 0.6979
## F-statistic: 65.51 on 14 and 377 DF, p-value: < 2.2e-16
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=1, color="orange") +
stat_smooth(method = "lm",
formula = y~cut(x,15),lty = 1,
col = "blue",se = F)+
theme_bw()+
ggtitle("Regresi Fungsi Tangga dengan 15 knots") +
xlab("horse power") +
ylab("mile per gallon") 4. Natural Cubic Splines
Secara Empiris
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:15
best_spline <- map_dfr(df, function(i) {
metric_spline <- map_dfr(cross_val$splits,
function(x) {
mod <- lm(mpg ~ ns(horsepower,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
# menghitung rata-rata untuk 10 folds
mean_metric_spline <- colMeans(metric_spline)
mean_metric_spline
}
)
# menampilkan hasil all df
best_spline <- cbind(df=df,best_spline)
best_spline## df rmse mae
## 1 2 4.330108 3.252908
## 2 3 4.348006 3.261538
## 3 4 4.329461 3.257926
## 4 5 4.313312 3.246412
## 5 6 4.310011 3.237990
## 6 7 4.294671 3.229174
## 7 8 4.293369 3.232221
## 8 9 4.281877 3.211860
## 9 10 4.260316 3.192010
## 10 11 4.288667 3.220958
## 11 12 4.285063 3.192363
## 12 13 4.302071 3.220998
## 13 14 4.322821 3.244848
## 14 15 4.305928 3.250204
#berdasarkan rmse
best_spline %>% slice_min(rmse)## df rmse mae
## 1 10 4.260316 3.19201
Plot CV
best_spline$df <- as.factor(best_spline$df)
ggplot(data=best_spline, aes(x=df, y=rmse, group=1)) +
geom_line(linetype = "dashed")+
geom_point(cex=3,col="red")+
ggtitle("K-Fold CV") +
xlab("DF") +
ylab("RMSE")+
theme_bw()Secara Visual
Knots
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=10),"knots")## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
mod_ncs = lm(mpg ~ ns(horsepower, df=10))
summary(mod_ncs)##
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 10))
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5003 -2.4956 -0.2086 2.2577 14.6400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.462 1.680 19.324 < 2e-16 ***
## ns(horsepower, df = 10)1 -4.600 1.789 -2.572 0.010495 *
## ns(horsepower, df = 10)2 -3.007 2.555 -1.177 0.239996
## ns(horsepower, df = 10)3 -8.663 2.043 -4.239 2.82e-05 ***
## ns(horsepower, df = 10)4 -7.170 2.188 -3.278 0.001142 **
## ns(horsepower, df = 10)5 -14.722 2.125 -6.927 1.84e-11 ***
## ns(horsepower, df = 10)6 -8.446 2.467 -3.424 0.000684 ***
## ns(horsepower, df = 10)7 -16.947 2.080 -8.148 5.38e-15 ***
## ns(horsepower, df = 10)8 -21.715 1.891 -11.483 < 2e-16 ***
## ns(horsepower, df = 10)9 -14.384 4.116 -3.494 0.000531 ***
## ns(horsepower, df = 10)10 -22.326 1.946 -11.476 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared: 0.7124, Adjusted R-squared: 0.7049
## F-statistic: 94.39 on 10 and 381 DF, p-value: < 2.2e-16
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=1, color="orange") +
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,col = "blue", se=F)+
theme_bw()+
ggtitle("Natural Cubic Splines (df=10)") +
xlab("horse power") +
ylab("mile per gallon")+
geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140,157.7), col="grey50", lty=2)Komparasi Model
Berdasarkan Nilai RMSE dan MAE
nilai_metric <- rbind(mean_metric_linear,
mean_metric_poly2,
best_tangga %>% select(-1) %>% slice_min(rmse),
best_spline %>% select(-1)%>% slice_min(rmse)
)
nama_model <- c("Linear",
"Polinomial ordo 2",
"Fungsi Tangga (breaks=15)",
"Natural Cubic Splines (df=10)"
)
komparasi<-data.frame(nama_model,nilai_metric)
komparasi %>% datatable(komparasi, class = 'cell-border stripe')Berdasarkan Plot
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=1, color="orange") +
stat_smooth(method = "lm",
formula = y~x,
lty = 1, col = "black",se = F)+
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "red",se = F)+
stat_smooth(method = "lm",
formula = y~cut(x,15),
lty = 1, col = "green3",se = F)+
stat_smooth(method = "lm",
formula = y~ns(x, df=10),
lty = 1,col = "blue", se=F)+
ggtitle("Auto")+
theme_bw()Subset Data Amerika
Plot Awal
Komparasi Model
Berdasarkan Nilai RMSE dan MAE
Berdasarkan Plot
Subset Data Eropa
Plot Awal
Komparasi Model
Berdasarkan Nilai RMSE dan MAE
Berdasarkan Plot
Subset Data Jepang
Plot Awal
Komparasi Model
Berdasarkan Nilai RMSE dan MAE
Berdasarkan Plot
Plot model terbaik masing-masing subset negara adalah sebagai berikut:
Terima kasih