Tugas Praktikum 08 Sains Data
Regresi Non-Linier
Pada simulasi ini, kita akan melakukan simulasi regresi non-linier dengan menggunakan data Auto yang terdapat pada packages ISLR, sehingga terlebih dahulu dapat mengintal packages ISLR. Selain packages ISLR, terdapat beberapa packages lain yang akan digunakan, yaitu :
rsamplemlr3measuressplinestidyverseISLR
Load Packages
library(ISLR)
library(rsample)
library(mlr3measures)
library(splines)
library(tidyverse)
library(ggplot2)Import Data
attach(Auto)
head(Auto,10) mpg cylinders displacement horsepower weight acceleration year origin
1 18 8 307 130 3504 12.0 70 1
2 15 8 350 165 3693 11.5 70 1
3 18 8 318 150 3436 11.0 70 1
4 16 8 304 150 3433 12.0 70 1
5 17 8 302 140 3449 10.5 70 1
6 15 8 429 198 4341 10.0 70 1
7 14 8 454 220 4354 9.0 70 1
8 14 8 440 215 4312 8.5 70 1
9 14 8 455 225 4425 10.0 70 1
10 15 8 390 190 3850 8.5 70 1
name
1 chevrolet chevelle malibu
2 buick skylark 320
3 plymouth satellite
4 amc rebel sst
5 ford torino
6 ford galaxie 500
7 chevrolet impala
8 plymouth fury iii
9 pontiac catalina
10 amc ambassador dpl
Di dalam data Auto terdapat data mpg, cylinders, displacement, horsepower, weight, acceleration, year, origin, dan name. Pada simulasi ini akan kita pilih mpg sebagai peubah respon, sedangkan acceleration, horsepower, dan weight sebagai peubah prediktor.
data1<- data_frame(mpg,horsepower,acceleration,weight)
colnames(data1) <- c("mpg","horsepower","acceleration","weight")
head(data1,10)# A tibble: 10 x 4
mpg horsepower acceleration weight
<dbl> <dbl> <dbl> <dbl>
1 18 130 12 3504
2 15 165 11.5 3693
3 18 150 11 3436
4 16 150 12 3433
5 17 140 10.5 3449
6 15 198 10 4341
7 14 220 9 4354
8 14 215 8.5 4312
9 14 225 10 4425
10 15 190 8.5 3850
mpg vs horsepower
Indikasi Non-Linier?
lm1 <- lm(mpg~horsepower, data = data1 )
pred1 <- predict(lm1,
data.frame(horsepower),
interval="predict")
plot(horsepower,mpg, pch=16,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Plot Data")Jika melihat pola data pada grafik, terlihat ada indikasi non-linier karena pola data membentuk pola polinom (melengkung)
Regresi Polinomial (Ordo 2,3,4) dan Cross validation
par(mfrow=c(2,2))
# regresi linier
lm1 <- lm(mpg~horsepower, data = data1 )
pred1 <- predict(lm1,
data.frame(horsepower),
interval="predict")
plot(horsepower,mpg, pch=16,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Linier")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix],pred1[ix],
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 2
polinom2 <- lm(mpg~poly(horsepower,degree=2))
prediksi2 <- predict(polinom2,
data.frame(horsepower),
interval="predict")
plot(horsepower,mpg, pch=9,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 2")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix],prediksi2[ix],
main="Polinomial Regression",
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 3
polinom3 <- lm(mpg~poly(horsepower,degree=3))
prediksi3 <- predict(polinom3,
data.frame(horsepower),
interval="predict")
plot(horsepower,mpg, pch=9,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 3")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix],prediksi3[ix],
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 4
polinom4 <- lm(mpg~poly(horsepower,degree=4))
prediksi4 <- predict(polinom4,
data.frame(horsepower),
interval="predict")
plot(horsepower,mpg, pch=9,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 4")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix],prediksi4[ix],
col = "blue",
type = "l",
lwd = 2)# Cross Validation
set.seed(1501211014)
# Regresi Polinomial Ordo 2
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.47 3.25
2 4.99 3.67
3 3.02 2.29
4 4.54 3.12
5 4.65 3.75
6 3.75 3.10
7 4.54 3.39
8 3.39 2.73
9 5.50 3.99
10 4.43 3.41
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
#______________________________________
# # Regresi Polinomial Ordo 3
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly3# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.52 3.58
2 3.54 2.87
3 4.40 3.28
4 4.81 3.57
5 4.47 3.26
6 4.20 3.08
7 3.80 2.96
8 4.22 3.21
9 5.88 4.32
10 3.79 2.69
# menghitung rata-rata untuk 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)
#______________________________________
# # Regresi Polinomial Ordo 4
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly4# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.08 2.92
2 3.84 3.00
3 5.08 3.78
4 4.60 3.50
5 4.83 3.47
6 3.94 3.23
7 4.21 3.12
8 4.67 3.71
9 4.52 3.21
10 4.37 3.03
# menghitung rata-rata untuk 10 folds
mean_metric_poly4 <- colMeans(metric_poly4)Piecewise Constant (Fungsi Tangga) dan Cross Validation
# fungsi tangga
fungsi.tangga <- lm(mpg~cut(horsepower,12))
prediksi2 <- predict(fungsi.tangga, data.frame(horsepower),interval = "predict")
plot(horsepower,mpg,
pch=9,col="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Fungsi Tangga")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], prediksi2[ix],
col="blue",
type="l", lwd=2)# cross validation
set.seed(1501211014)
# fungsi tangga
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- data1[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- data1[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
})
best_tangga <- cbind(best_tangga)
# menampilkan hasil all breaks
best_tangga rmse mae
1 5.758574 4.520978
2 4.967702 3.788951
3 4.699976 3.583206
4 4.641994 3.558949
5 4.449726 3.361857
6 4.451573 3.419655
7 4.525785 3.480239
8 4.556936 3.451788
mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft rmse mae
4.756533 3.645703
Regresi Natural Spline dan Cross Validation
library(splines)
regresi.spline <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)))
prediksi3 <- predict(regresi.spline, data.frame(horsepower), interval = "predict")
plot(horsepower, mpg, pch=19, col="coral",
xlab="Horse Power", ylab="mile per gallon",
main="natural cubic splines dengan knots 80, 120, 160, 200")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], prediksi3[ix],
main = "Polynomial Regression", col="blue",
type="l", lwd=2)
abline(v=c(80, 120, 160, 200), col="grey50", lty=2)# cross validation
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_spline3my <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3my# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.51 3.53
2 3.48 2.81
3 4.36 3.29
4 4.82 3.61
5 4.45 3.24
6 4.30 3.10
7 3.79 2.95
8 4.19 3.17
9 5.86 4.39
10 3.81 2.73
# menghitung rata-rata untuk 10 folds
mean_metric_spline3my <- colMeans(metric_spline3my)
mean_metric_spline3my rmse mae
4.357602 3.280519
Rataan rmse setiap metode
frame.mse <- rbind(mean_metric_poly2,mean_metric_poly3,mean_metric_poly4,mean_metric_ft,mean_metric_spline3my)
frame.mse rmse mae
mean_metric_poly2 4.327517 3.270278
mean_metric_poly3 4.362364 3.280605
mean_metric_poly4 4.414156 3.298285
mean_metric_ft 4.756533 3.645703
mean_metric_spline3my 4.357602 3.280519
Kesimpulan
Dengan melihat perbadingan rmse dan mae di atas, dapat dipilih model terbaik adalah Regresi Polinomial Derajat 2 karena memiliki rmse dan mae yang lebih kecil dari model lain.
mpg vs acceleration
Indikasi Non-Linier?
lm1 <- lm(mpg~acceleration, data = data1 )
pred1 <- predict(lm1,
data.frame(acceleration),
interval="predict")
plot(acceleration,mpg, pch=16,
col ="red",
xlab = "acceleration",
ylab = "Mile Per Gallon",
main = "Plot Data")Jika melihat pola data pada grafik, terlihat ada indikasi non-linier karena pola data membentuk pola polinom (melengkung)
Regresi Polinomial (ordo 2,3,4) dan Cross Validation
par(mfrow=c(2,2))
# regresi linier
lm1 <- lm(mpg~acceleration, data = data1 )
pred1 <- predict(lm1,
data.frame(acceleration),
interval="predict")
plot(acceleration,mpg, pch=16,
col ="red",
xlab = "acceleration",
ylab = "Mile Per Gallon",
main = "Regresi Linier")
ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix],pred1[ix],
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 2
polinom2 <- lm(mpg~poly(acceleration,degree=2))
prediksi2 <- predict(polinom2,
data.frame(acceleration),
interval="predict")
plot(acceleration,mpg, pch=16,
col ="red",
xlab = "acceleration",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 2")
ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix],prediksi2[ix],
main="Polinomial Regression",
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 3
polinom3 <- lm(mpg~poly(acceleration,degree=3))
prediksi3 <- predict(polinom3,
data.frame(acceleration),
interval="predict")
plot(acceleration,mpg, pch=16,
col ="red",
xlab = "acceleration",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 3")
ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix],prediksi3[ix],
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 4
polinom4 <- lm(mpg~poly(acceleration,degree=4))
prediksi4 <- predict(polinom4,
data.frame(acceleration),
interval="predict")
plot(acceleration,mpg, pch=16,
col ="red",
xlab = "acceleration",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 4")
ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix],prediksi4[ix],
col = "blue",
type = "l",
lwd = 2)########
# CROSS VALIDATION
set.seed(1501211014)
# Regresi Polinomial Ordo 2
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly22 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(acceleration,2,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly22# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 7.15 5.40
2 6.84 5.70
3 5.97 5.02
4 6.31 5.08
5 7.26 6.17
6 6.97 6.00
7 7.26 5.79
8 6.96 5.75
9 8.22 7.07
10 7.19 5.47
# menghitung rata-rata untuk 10 folds
mean_metric_poly22 <- colMeans(metric_poly22)
#______________________________________
# # Regresi Polinomial Ordo 3
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly32 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(acceleration,3,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly32# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 6.99 5.82
2 6.85 5.75
3 6.61 5.41
4 7.31 6.02
5 8.29 6.35
6 7.08 6.29
7 6.56 5.33
8 6.67 5.78
9 7.21 5.95
10 6.91 5.33
# menghitung rata-rata untuk 10 folds
mean_metric_poly32 <- colMeans(metric_poly32)
#______________________________________
# # Regresi Polinomial Ordo 4
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly42 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(acceleration,4,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly42# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 7.45 5.88
2 5.65 4.69
3 8.22 6.39
4 6.69 5.63
5 6.83 5.52
6 6.64 5.45
7 6.85 5.61
8 8.19 6.66
9 6.86 5.79
10 5.91 5.07
# menghitung rata-rata untuk 10 folds
mean_metric_poly42 <- colMeans(metric_poly42)Piecewise Constant (Fungsi Tangga) dan Cross Validation
# fungsi tangga
fungsi.tangga <- lm(mpg~cut(acceleration,12))
prediksi2 <- predict(fungsi.tangga, data.frame(acceleration),interval = "predict")
plot(acceleration,mpg,
pch=16,col="red",
xlab = "acceleration",
ylab = "Mile Per Gallon",
main = "Fungsi Tangga")
ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix], prediksi2[ix],
col="blue",
type="l", lwd=2)# cross validation
set.seed(1501211014)
# fungsi tangga
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
breaks <- 3:10
best_tangga2 <- map_dfr(breaks, function(i){
metric_tangga2 <- map_dfr(cross_val$splits,
function(x){
training <- data1[x$in_id,]
training$acceleration <- cut(training$acceleration,i)
mod <- lm(mpg ~ acceleration,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- data1[-x$in_id,]
acceleration_new <- cut(testing$acceleration,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(acceleration= acceleration_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga2
# menghitung rata-rata untuk 10 folds
mean_metric_tangga2 <- colMeans(metric_tangga2)
})
best_tangga2 <- cbind(best_tangga2)
# menampilkan hasil all breaks
best_tangga2 rmse mae
1 6.988717 5.624703
2 7.255641 5.899551
3 7.229422 5.971461
4 6.988643 5.647889
5 7.154052 5.809326
6 7.055655 5.677527
7 6.974706 5.603037
8 7.149203 5.832346
mean_metric_ft2 <- colMeans(best_tangga2)
mean_metric_ft2 rmse mae
7.099505 5.758230
Regresi Natural Spline dan Cross Validation
library(splines)
regresi.spline <- lm(mpg~ns(acceleration,knots=c(12,16,20)))
prediksi3 <- predict(regresi.spline, data.frame(acceleration), interval = "predict")
plot(acceleration, mpg, pch=19, col="coral",
xlab="acceleration", ylab="mile per gallon",
main="natural cubic splines dengan knots 12,16,20")
ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix], prediksi3[ix],
col="blue",
type="l", lwd=2)
abline(v=c(12,16,20), col="grey50", lty=2)# cross validation
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_spline3my2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg~ns(acceleration,knots=c(12,16,20)),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3my2# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 6.84 5.71
2 7.33 6.05
3 6.19 5.04
4 7.23 5.84
5 8.21 6.08
6 7.09 6.27
7 6.51 5.25
8 6.39 5.44
9 7.14 5.80
10 6.74 5.19
# menghitung rata-rata untuk 10 folds
mean_metric_spline3my2 <- colMeans(metric_spline3my2)
mean_metric_spline3my2 rmse mae
6.967545 5.667685
Rataan mse per metode
frame.mse2 <- rbind(mean_metric_poly22,mean_metric_poly32,mean_metric_poly42,mean_metric_ft2, mean_metric_spline3my2)
frame.mse2 rmse mae
mean_metric_poly22 7.013245 5.744721
mean_metric_poly32 7.046325 5.802483
mean_metric_poly42 6.928485 5.668711
mean_metric_ft2 7.099505 5.758230
mean_metric_spline3my2 6.967545 5.667685
Kesimpulan
Dengan melihat perbadingan rmse dan mae di atas, dapat dipilih model terbaik adalah Regresi Polinomial Derajat 4 karena memiliki rmse dan mae yang lebih kecil dari model lain.
mpg vs weight
Indikasi Non-Linier?
lm1 <- lm(mpg~weight, data = data1 )
pred1 <- predict(lm1,
data.frame(weight),
interval="predict")
plot(weight,mpg, pch=16,
col ="red",
xlab = "Weight",
ylab = "Mile Per Gallon",
main = "Plot Data")Jika melihat pola data pada grafik, terlihat ada indikasi non-linier karena pola data membentuk pola polinom (melengkung)
Regresi Polinomial (Ordo 2,3,dan 4) dan Cross Validation
par(mfrow=c(2,2))
# regresi linier
lm1 <- lm(mpg~weight, data = data1 )
pred1 <- predict(lm1,
data.frame(weight),
interval="predict")
plot(weight,mpg, pch=16,
col ="red",
xlab = "Weight",
ylab = "Mile Per Gallon",
main = "Regresi Linier")
ix <- sort(weight,index.return=T)$ix
lines(weight[ix],pred1[ix],
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 2
polinom2 <- lm(mpg~poly(weight,degree=2))
prediksi2 <- predict(polinom2,
data.frame(weight),
interval="predict")
plot(weight,mpg, pch=16,
col ="red",
xlab = "Weight",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 2")
ix <- sort(weight,index.return=T)$ix
lines(weight[ix],prediksi2[ix],
main="Polinomial Regression",
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 3
polinom3 <- lm(mpg~poly(weight,degree=3))
prediksi3 <- predict(polinom3,
data.frame(weight),
interval="predict")
plot(weight,mpg, pch=16,
col ="red",
xlab = "Weight",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 3")
ix <- sort(weight,index.return=T)$ix
lines(weight[ix],prediksi3[ix],
col = "blue",
type = "l",
lwd = 2)
# Regresi Polinomial Derajat 4
polinom4 <- lm(mpg~poly(weight,degree=4))
prediksi4 <- predict(polinom4,
data.frame(weight),
interval="predict")
plot(weight,mpg, pch=16,
col ="red",
xlab = "Weight",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 4")
ix <- sort(weight,index.return=T)$ix
lines(weight[ix],prediksi4[ix],
col = "blue",
type = "l",
lwd = 2)########
# Cross Validation
set.seed(1501211014)
# Regresi Polinomial Ordo 2
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly23 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,2,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly23# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 3.72 2.72
2 4.84 3.54
3 3.98 2.93
4 3.16 2.51
5 4.37 3.20
6 3.66 2.92
7 3.74 2.85
8 3.25 2.25
9 4.89 3.50
10 5.62 4.35
# menghitung rata-rata untuk 10 folds
mean_metric_poly23 <- colMeans(metric_poly23)
#______________________________________
# # Regresi Polinomial Ordo 3
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly33 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,3,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly33# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.39 3.49
2 3.23 2.61
3 4.30 3.20
4 4.28 3.10
5 4.08 3.13
6 3.77 2.41
7 4.13 2.96
8 4.30 2.95
9 4.85 4.03
10 4.41 2.93
# menghitung rata-rata untuk 10 folds
mean_metric_poly33 <- colMeans(metric_poly33)
#______________________________________
# # Regresi Polinomial Ordo 4
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_poly43 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,4,raw = T),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly43# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.07 2.91
2 3.99 2.89
3 5.11 3.79
4 4.10 3.02
5 4.82 3.36
6 3.22 2.64
7 4.28 3.51
8 4.75 3.41
9 3.56 2.54
10 3.73 2.79
# menghitung rata-rata untuk 10 folds
mean_metric_poly43 <- colMeans(metric_poly43)Piecewise Constant (Fungsi Tangga) dan Cross Validation
# fungsi tangga
fungsi.tangga <- lm(mpg~cut(weight,12))
prediksi2 <- predict(fungsi.tangga, data.frame(weight),interval = "predict")
plot(weight,mpg,
pch=16,col="red",
xlab = "weight",
ylab = "Mile Per Gallon",
main = "Fungsi Tangga")
ix <- sort(weight,index.return=T)$ix
lines(weight[ix], prediksi2[ix],
col="blue",
type="l", lwd=2)########
# cross validation
set.seed(1501211014)
# fungsi tangga
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
breaks <- 3:10
best_tangga3 <- map_dfr(breaks, function(i){
metric_tangga3 <- map_dfr(cross_val$splits,
function(x){
training <- data1[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- data1[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga3
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga3)
})
best_tangga3 <- cbind(best_tangga3)
# menampilkan hasil all breaks
best_tangga3 rmse mae
1 4.830120 3.633944
2 4.669770 3.610406
3 4.374974 3.241580
4 4.174446 3.101118
5 4.244377 3.205605
6 4.386876 3.285414
7 4.276171 3.156466
8 4.301396 3.193385
mean_metric_ft3 <- colMeans(best_tangga3)
mean_metric_ft3 rmse mae
4.407266 3.303490
Regresi Natural Spline dan Cross Validation
regresi.spline <- lm(mpg~ns(weight,knots=c(2000,2750,3500,4250)))
prediksi3 <- predict(regresi.spline, data.frame(weight), interval = "predict")
plot(weight, mpg, pch=19, col="coral",
xlab="Weight", ylab="mile per gallon",
main="natural cubic splines dengan knots 2000,2750,3500,4250")
ix <- sort(weight,index.return=T)$ix
lines(weight[ix], prediksi3[ix],
main = "Polynomial Regression", col="blue",
type="l", lwd=2)
abline(v=c(2000,2750,3500,4250), col="grey50", lty=2)# cross validation
cross_val <- vfold_cv(data1,v=10,strata = "mpg")
metric_spline3my3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg~ns(weight,knots=c(2000,2750,3500,4250)),
data=data1[x$in_id,])
pred <- predict(mod,
newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3my3# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.43 3.49
2 3.24 2.63
3 4.28 3.18
4 4.27 3.09
5 4.12 3.16
6 3.78 2.41
7 4.17 2.97
8 4.33 2.97
9 4.84 4.03
10 4.40 2.94
# menghitung rata-rata untuk 10 folds
mean_metric_spline3my3 <- colMeans(metric_spline3my3)
mean_metric_spline3my3 rmse mae
4.186928 3.086945
Rataan mse per metode
frame.mse3 <- rbind(mean_metric_poly23,mean_metric_poly33,mean_metric_poly43,mean_metric_ft3, mean_metric_spline3my3)
frame.mse3 rmse mae
mean_metric_poly23 4.122701 3.076882
mean_metric_poly33 4.174291 3.080987
mean_metric_poly43 4.163681 3.085913
mean_metric_ft3 4.407266 3.303490
mean_metric_spline3my3 4.186928 3.086945
Kesimpulan
Dengan melihat perbadingan rmse dan mae di atas, dapat dipilih model terbaik adalah Regresi Polinomial Derajat 2 karena memiliki rmse dan mae yang lebih kecil dari model lain.