Tugas Praktikum 08 Sains Data

Regresi Non-Linier

Pada simulasi ini, kita akan melakukan simulasi regresi non-linier dengan menggunakan data Auto yang terdapat pada packages ISLR, sehingga terlebih dahulu dapat mengintal packages ISLR. Selain packages ISLR, terdapat beberapa packages lain yang akan digunakan, yaitu :

rsample
mlr3measures
splines
tidyverse
ISLR

Load Packages

library(ISLR)
library(rsample)
library(mlr3measures)
library(splines)
library(tidyverse)
library(ggplot2)

Import Data

attach(Auto)
head(Auto,10)

   mpg cylinders displacement horsepower weight acceleration year origin
1   18         8          307        130   3504         12.0   70      1
2   15         8          350        165   3693         11.5   70      1
3   18         8          318        150   3436         11.0   70      1
4   16         8          304        150   3433         12.0   70      1
5   17         8          302        140   3449         10.5   70      1
6   15         8          429        198   4341         10.0   70      1
7   14         8          454        220   4354          9.0   70      1
8   14         8          440        215   4312          8.5   70      1
9   14         8          455        225   4425         10.0   70      1
10  15         8          390        190   3850          8.5   70      1
                        name
1  chevrolet chevelle malibu
2          buick skylark 320
3         plymouth satellite
4              amc rebel sst
5                ford torino
6           ford galaxie 500
7           chevrolet impala
8          plymouth fury iii
9           pontiac catalina
10        amc ambassador dpl

Di dalam data Auto terdapat data mpg, cylinders, displacement, horsepower, weight, acceleration, year, origin, dan name. Pada simulasi ini akan kita pilih mpg sebagai peubah respon, sedangkan acceleration, horsepower, dan weight sebagai peubah prediktor.

data1<- data_frame(mpg,horsepower,acceleration,weight)

colnames(data1) <- c("mpg","horsepower","acceleration","weight")

head(data1,10)

# A tibble: 10 x 4
     mpg horsepower acceleration weight
   <dbl>      <dbl>        <dbl>  <dbl>
 1    18        130         12     3504
 2    15        165         11.5   3693
 3    18        150         11     3436
 4    16        150         12     3433
 5    17        140         10.5   3449
 6    15        198         10     4341
 7    14        220          9     4354
 8    14        215          8.5   4312
 9    14        225         10     4425
10    15        190          8.5   3850

mpg vs horsepower

Indikasi Non-Linier?

lm1 <- lm(mpg~horsepower, data = data1 )

pred1 <- predict(lm1,
                    data.frame(horsepower),
                    interval="predict")

plot(horsepower,mpg, pch=16,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Plot Data")

Jika melihat pola data pada grafik, terlihat ada indikasi non-linier karena pola data membentuk pola polinom (melengkung)

Regresi Polinomial (Ordo 2,3,4) dan Cross validation

par(mfrow=c(2,2))


# regresi linier
lm1 <- lm(mpg~horsepower, data = data1 )

pred1 <- predict(lm1,
                    data.frame(horsepower),
                    interval="predict")

plot(horsepower,mpg, pch=16,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Linier")

ix <- sort(horsepower,index.return=T)$ix

lines(horsepower[ix],pred1[ix],
      col = "blue",
      type = "l",
      lwd = 2)


# Regresi Polinomial Derajat 2

polinom2 <- lm(mpg~poly(horsepower,degree=2))

prediksi2 <- predict(polinom2,
                    data.frame(horsepower),
                    interval="predict")

plot(horsepower,mpg, pch=9,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 2")

ix <- sort(horsepower,index.return=T)$ix

lines(horsepower[ix],prediksi2[ix],
      main="Polinomial Regression",
      col = "blue",
      type = "l",
      lwd = 2)

# Regresi Polinomial Derajat 3
polinom3 <- lm(mpg~poly(horsepower,degree=3))

prediksi3 <- predict(polinom3,
                    data.frame(horsepower),
                    interval="predict")

plot(horsepower,mpg, pch=9,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 3")

ix <- sort(horsepower,index.return=T)$ix

lines(horsepower[ix],prediksi3[ix],
      col = "blue",
      type = "l",
      lwd = 2)

# Regresi Polinomial Derajat 4
polinom4 <- lm(mpg~poly(horsepower,degree=4))

prediksi4 <- predict(polinom4,
                    data.frame(horsepower),
                    interval="predict")

plot(horsepower,mpg, pch=9,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 4")

ix <- sort(horsepower,index.return=T)$ix

lines(horsepower[ix],prediksi4[ix],
      col = "blue",
      type = "l",
      lwd = 2)

# Cross Validation
set.seed(1501211014)

# Regresi Polinomial Ordo 2
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly2 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly2

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.47  3.25
 2  4.99  3.67
 3  3.02  2.29
 4  4.54  3.12
 5  4.65  3.75
 6  3.75  3.10
 7  4.54  3.39
 8  3.39  2.73
 9  5.50  3.99
10  4.43  3.41

# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)


#______________________________________

# # Regresi Polinomial Ordo 3
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly3 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(horsepower,3,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly3

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.52  3.58
 2  3.54  2.87
 3  4.40  3.28
 4  4.81  3.57
 5  4.47  3.26
 6  4.20  3.08
 7  3.80  2.96
 8  4.22  3.21
 9  5.88  4.32
10  3.79  2.69

# menghitung rata-rata untuk 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)


#______________________________________

# # Regresi Polinomial Ordo 4
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly4 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(horsepower,4,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly4

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.08  2.92
 2  3.84  3.00
 3  5.08  3.78
 4  4.60  3.50
 5  4.83  3.47
 6  3.94  3.23
 7  4.21  3.12
 8  4.67  3.71
 9  4.52  3.21
10  4.37  3.03

# menghitung rata-rata untuk 10 folds
mean_metric_poly4 <- colMeans(metric_poly4)

Piecewise Constant (Fungsi Tangga) dan Cross Validation

# fungsi tangga
fungsi.tangga <- lm(mpg~cut(horsepower,12))

prediksi2 <- predict(fungsi.tangga, data.frame(horsepower),interval = "predict")


plot(horsepower,mpg, 
     pch=9,col="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Fungsi Tangga")

ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], prediksi2[ix], 
      col="blue",
      type="l", lwd=2)

# cross validation
set.seed(1501211014)

# fungsi tangga
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

breaks <- 3:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- data1[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- data1[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

})

best_tangga <- cbind(best_tangga)

# menampilkan hasil all breaks
best_tangga

      rmse      mae
1 5.758574 4.520978
2 4.967702 3.788951
3 4.699976 3.583206
4 4.641994 3.558949
5 4.449726 3.361857
6 4.451573 3.419655
7 4.525785 3.480239
8 4.556936 3.451788

mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft

    rmse      mae 
4.756533 3.645703

Regresi Natural Spline dan Cross Validation

library(splines)

regresi.spline <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)))

prediksi3 <- predict(regresi.spline, data.frame(horsepower), interval = "predict")

plot(horsepower, mpg, pch=19, col="coral",
     xlab="Horse Power", ylab="mile per gallon",
     main="natural cubic splines dengan knots 80, 120, 160, 200")

ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], prediksi3[ix], 
      main = "Polynomial Regression", col="blue",
      type="l", lwd=2)

abline(v=c(80, 120, 160, 200), col="grey50", lty=2)

# cross validation
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_spline3my <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)),
         data=data1[x$in_id,])
pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3my

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.51  3.53
 2  3.48  2.81
 3  4.36  3.29
 4  4.82  3.61
 5  4.45  3.24
 6  4.30  3.10
 7  3.79  2.95
 8  4.19  3.17
 9  5.86  4.39
10  3.81  2.73

# menghitung rata-rata untuk 10 folds
mean_metric_spline3my <- colMeans(metric_spline3my)

mean_metric_spline3my

    rmse      mae 
4.357602 3.280519

Rataan rmse setiap metode

frame.mse <- rbind(mean_metric_poly2,mean_metric_poly3,mean_metric_poly4,mean_metric_ft,mean_metric_spline3my)
frame.mse

                          rmse      mae
mean_metric_poly2     4.327517 3.270278
mean_metric_poly3     4.362364 3.280605
mean_metric_poly4     4.414156 3.298285
mean_metric_ft        4.756533 3.645703
mean_metric_spline3my 4.357602 3.280519

Kesimpulan

Dengan melihat perbadingan rmse dan mae di atas, dapat dipilih model terbaik adalah Regresi Polinomial Derajat 2 karena memiliki rmse dan mae yang lebih kecil dari model lain.

mpg vs acceleration

Indikasi Non-Linier?

lm1 <- lm(mpg~acceleration, data = data1 )

pred1 <- predict(lm1,
                    data.frame(acceleration),
                    interval="predict")

plot(acceleration,mpg, pch=16,
     col ="red",
     xlab = "acceleration",
     ylab = "Mile Per Gallon",
     main = "Plot Data")

Jika melihat pola data pada grafik, terlihat ada indikasi non-linier karena pola data membentuk pola polinom (melengkung)

Regresi Polinomial (ordo 2,3,4) dan Cross Validation

par(mfrow=c(2,2))


# regresi linier
lm1 <- lm(mpg~acceleration, data = data1 )

pred1 <- predict(lm1,
                    data.frame(acceleration),
                    interval="predict")

plot(acceleration,mpg, pch=16,
     col ="red",
     xlab = "acceleration",
     ylab = "Mile Per Gallon",
     main = "Regresi Linier")

ix <- sort(acceleration,index.return=T)$ix

lines(acceleration[ix],pred1[ix],
      col = "blue",
      type = "l",
      lwd = 2)


# Regresi Polinomial Derajat 2

polinom2 <- lm(mpg~poly(acceleration,degree=2))

prediksi2 <- predict(polinom2,
                    data.frame(acceleration),
                    interval="predict")

plot(acceleration,mpg, pch=16,
     col ="red",
     xlab = "acceleration",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 2")

ix <- sort(acceleration,index.return=T)$ix

lines(acceleration[ix],prediksi2[ix],
      main="Polinomial Regression",
      col = "blue",
      type = "l",
      lwd = 2)

# Regresi Polinomial Derajat 3
polinom3 <- lm(mpg~poly(acceleration,degree=3))

prediksi3 <- predict(polinom3,
                    data.frame(acceleration),
                    interval="predict")

plot(acceleration,mpg, pch=16,
     col ="red",
     xlab = "acceleration",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 3")

ix <- sort(acceleration,index.return=T)$ix

lines(acceleration[ix],prediksi3[ix],
      col = "blue",
      type = "l",
      lwd = 2)

# Regresi Polinomial Derajat 4
polinom4 <- lm(mpg~poly(acceleration,degree=4))

prediksi4 <- predict(polinom4,
                    data.frame(acceleration),
                    interval="predict")

plot(acceleration,mpg, pch=16,
     col ="red",
     xlab = "acceleration",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 4")

ix <- sort(acceleration,index.return=T)$ix

lines(acceleration[ix],prediksi4[ix],
      col = "blue",
      type = "l",
      lwd = 2)

########
# CROSS VALIDATION

set.seed(1501211014)

# Regresi Polinomial Ordo 2
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly22 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(acceleration,2,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly22

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  7.15  5.40
 2  6.84  5.70
 3  5.97  5.02
 4  6.31  5.08
 5  7.26  6.17
 6  6.97  6.00
 7  7.26  5.79
 8  6.96  5.75
 9  8.22  7.07
10  7.19  5.47

# menghitung rata-rata untuk 10 folds
mean_metric_poly22 <- colMeans(metric_poly22)


#______________________________________

# # Regresi Polinomial Ordo 3
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly32 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(acceleration,3,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly32

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  6.99  5.82
 2  6.85  5.75
 3  6.61  5.41
 4  7.31  6.02
 5  8.29  6.35
 6  7.08  6.29
 7  6.56  5.33
 8  6.67  5.78
 9  7.21  5.95
10  6.91  5.33

# menghitung rata-rata untuk 10 folds
mean_metric_poly32 <- colMeans(metric_poly32)


#______________________________________

# # Regresi Polinomial Ordo 4
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly42 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(acceleration,4,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly42

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  7.45  5.88
 2  5.65  4.69
 3  8.22  6.39
 4  6.69  5.63
 5  6.83  5.52
 6  6.64  5.45
 7  6.85  5.61
 8  8.19  6.66
 9  6.86  5.79
10  5.91  5.07

# menghitung rata-rata untuk 10 folds
mean_metric_poly42 <- colMeans(metric_poly42)

Piecewise Constant (Fungsi Tangga) dan Cross Validation

# fungsi tangga
fungsi.tangga <- lm(mpg~cut(acceleration,12))

prediksi2 <- predict(fungsi.tangga, data.frame(acceleration),interval = "predict")


plot(acceleration,mpg, 
     pch=16,col="red",
     xlab = "acceleration",
     ylab = "Mile Per Gallon",
     main = "Fungsi Tangga")

ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix], prediksi2[ix], 
      col="blue",
      type="l", lwd=2)

# cross validation
set.seed(1501211014)

# fungsi tangga
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

breaks <- 3:10

best_tangga2 <- map_dfr(breaks, function(i){

metric_tangga2 <- map_dfr(cross_val$splits,
    function(x){

training <- data1[x$in_id,]
training$acceleration <- cut(training$acceleration,i)

mod <- lm(mpg ~ acceleration,data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- data1[-x$in_id,]

acceleration_new <- cut(testing$acceleration,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(acceleration= acceleration_new))

truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga2



# menghitung rata-rata untuk 10 folds
mean_metric_tangga2 <- colMeans(metric_tangga2)

})

best_tangga2 <- cbind(best_tangga2)

# menampilkan hasil all breaks
best_tangga2

      rmse      mae
1 6.988717 5.624703
2 7.255641 5.899551
3 7.229422 5.971461
4 6.988643 5.647889
5 7.154052 5.809326
6 7.055655 5.677527
7 6.974706 5.603037
8 7.149203 5.832346

mean_metric_ft2 <- colMeans(best_tangga2)
mean_metric_ft2

    rmse      mae 
7.099505 5.758230

Regresi Natural Spline dan Cross Validation

library(splines)

regresi.spline <- lm(mpg~ns(acceleration,knots=c(12,16,20)))

prediksi3 <- predict(regresi.spline, data.frame(acceleration), interval = "predict")

plot(acceleration, mpg, pch=19, col="coral",
     xlab="acceleration", ylab="mile per gallon",
     main="natural cubic splines dengan knots 12,16,20")

ix <- sort(acceleration,index.return=T)$ix
lines(acceleration[ix], prediksi3[ix], 
      col="blue",
      type="l", lwd=2)

abline(v=c(12,16,20), col="grey50", lty=2)

# cross validation
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_spline3my2 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg~ns(acceleration,knots=c(12,16,20)),
         data=data1[x$in_id,])
pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3my2

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  6.84  5.71
 2  7.33  6.05
 3  6.19  5.04
 4  7.23  5.84
 5  8.21  6.08
 6  7.09  6.27
 7  6.51  5.25
 8  6.39  5.44
 9  7.14  5.80
10  6.74  5.19

# menghitung rata-rata untuk 10 folds
mean_metric_spline3my2 <- colMeans(metric_spline3my2)

mean_metric_spline3my2

    rmse      mae 
6.967545 5.667685

Rataan mse per metode

frame.mse2 <- rbind(mean_metric_poly22,mean_metric_poly32,mean_metric_poly42,mean_metric_ft2, mean_metric_spline3my2)
frame.mse2

                           rmse      mae
mean_metric_poly22     7.013245 5.744721
mean_metric_poly32     7.046325 5.802483
mean_metric_poly42     6.928485 5.668711
mean_metric_ft2        7.099505 5.758230
mean_metric_spline3my2 6.967545 5.667685

Kesimpulan

Dengan melihat perbadingan rmse dan mae di atas, dapat dipilih model terbaik adalah Regresi Polinomial Derajat 4 karena memiliki rmse dan mae yang lebih kecil dari model lain.

mpg vs weight

Indikasi Non-Linier?

lm1 <- lm(mpg~weight, data = data1 )

pred1 <- predict(lm1,
                    data.frame(weight),
                    interval="predict")

plot(weight,mpg, pch=16,
     col ="red",
     xlab = "Weight",
     ylab = "Mile Per Gallon",
     main = "Plot Data")

Jika melihat pola data pada grafik, terlihat ada indikasi non-linier karena pola data membentuk pola polinom (melengkung)

Regresi Polinomial (Ordo 2,3,dan 4) dan Cross Validation

par(mfrow=c(2,2))


# regresi linier
lm1 <- lm(mpg~weight, data = data1 )

pred1 <- predict(lm1,
                    data.frame(weight),
                    interval="predict")

plot(weight,mpg, pch=16,
     col ="red",
     xlab = "Weight",
     ylab = "Mile Per Gallon",
     main = "Regresi Linier")

ix <- sort(weight,index.return=T)$ix

lines(weight[ix],pred1[ix],
      col = "blue",
      type = "l",
      lwd = 2)


# Regresi Polinomial Derajat 2

polinom2 <- lm(mpg~poly(weight,degree=2))

prediksi2 <- predict(polinom2,
                    data.frame(weight),
                    interval="predict")

plot(weight,mpg, pch=16,
     col ="red",
     xlab = "Weight",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 2")

ix <- sort(weight,index.return=T)$ix

lines(weight[ix],prediksi2[ix],
      main="Polinomial Regression",
      col = "blue",
      type = "l",
      lwd = 2)

# Regresi Polinomial Derajat 3
polinom3 <- lm(mpg~poly(weight,degree=3))

prediksi3 <- predict(polinom3,
                    data.frame(weight),
                    interval="predict")

plot(weight,mpg, pch=16,
     col ="red",
     xlab = "Weight",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 3")

ix <- sort(weight,index.return=T)$ix

lines(weight[ix],prediksi3[ix],
      col = "blue",
      type = "l",
      lwd = 2)

# Regresi Polinomial Derajat 4
polinom4 <- lm(mpg~poly(weight,degree=4))

prediksi4 <- predict(polinom4,
                    data.frame(weight),
                    interval="predict")

plot(weight,mpg, pch=16,
     col ="red",
     xlab = "Weight",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 4")

ix <- sort(weight,index.return=T)$ix

lines(weight[ix],prediksi4[ix],
      col = "blue",
      type = "l",
      lwd = 2)

######## 
# Cross Validation



set.seed(1501211014)

# Regresi Polinomial Ordo 2
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly23 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(weight,2,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly23

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  3.72  2.72
 2  4.84  3.54
 3  3.98  2.93
 4  3.16  2.51
 5  4.37  3.20
 6  3.66  2.92
 7  3.74  2.85
 8  3.25  2.25
 9  4.89  3.50
10  5.62  4.35

# menghitung rata-rata untuk 10 folds
mean_metric_poly23 <- colMeans(metric_poly23)


#______________________________________

# # Regresi Polinomial Ordo 3
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly33 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(weight,3,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly33

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.39  3.49
 2  3.23  2.61
 3  4.30  3.20
 4  4.28  3.10
 5  4.08  3.13
 6  3.77  2.41
 7  4.13  2.96
 8  4.30  2.95
 9  4.85  4.03
10  4.41  2.93

# menghitung rata-rata untuk 10 folds
mean_metric_poly33 <- colMeans(metric_poly33)


#______________________________________

# # Regresi Polinomial Ordo 4
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_poly43 <- map_dfr(cross_val$splits,
    function(x){

mod <- lm(mpg ~ poly(weight,4,raw = T),
         data=data1[x$in_id,])

pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly43

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.07  2.91
 2  3.99  2.89
 3  5.11  3.79
 4  4.10  3.02
 5  4.82  3.36
 6  3.22  2.64
 7  4.28  3.51
 8  4.75  3.41
 9  3.56  2.54
10  3.73  2.79

# menghitung rata-rata untuk 10 folds
mean_metric_poly43 <- colMeans(metric_poly43)

Piecewise Constant (Fungsi Tangga) dan Cross Validation

# fungsi tangga
fungsi.tangga <- lm(mpg~cut(weight,12))

prediksi2 <- predict(fungsi.tangga, data.frame(weight),interval = "predict")


plot(weight,mpg, 
     pch=16,col="red",
     xlab = "weight",
     ylab = "Mile Per Gallon",
     main = "Fungsi Tangga")

ix <- sort(weight,index.return=T)$ix
lines(weight[ix], prediksi2[ix], 
      col="blue",
      type="l", lwd=2)

########
# cross validation


set.seed(1501211014)

# fungsi tangga
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

breaks <- 3:10

best_tangga3 <- map_dfr(breaks, function(i){

metric_tangga3 <- map_dfr(cross_val$splits,
    function(x){

training <- data1[x$in_id,]
training$weight <- cut(training$weight,i)

mod <- lm(mpg ~ weight,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- data1[-x$in_id,]

weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(weight=weight_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga3



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga3)

})

best_tangga3 <- cbind(best_tangga3)

# menampilkan hasil all breaks
best_tangga3

      rmse      mae
1 4.830120 3.633944
2 4.669770 3.610406
3 4.374974 3.241580
4 4.174446 3.101118
5 4.244377 3.205605
6 4.386876 3.285414
7 4.276171 3.156466
8 4.301396 3.193385

mean_metric_ft3 <- colMeans(best_tangga3)
mean_metric_ft3

    rmse      mae 
4.407266 3.303490

Regresi Natural Spline dan Cross Validation

regresi.spline <- lm(mpg~ns(weight,knots=c(2000,2750,3500,4250)))

prediksi3 <- predict(regresi.spline, data.frame(weight), interval = "predict")

plot(weight, mpg, pch=19, col="coral",
     xlab="Weight", ylab="mile per gallon",
     main="natural cubic splines dengan knots 2000,2750,3500,4250")

ix <- sort(weight,index.return=T)$ix
lines(weight[ix], prediksi3[ix], 
      main = "Polynomial Regression", col="blue",
      type="l", lwd=2)

abline(v=c(2000,2750,3500,4250), col="grey50", lty=2)

# cross validation
cross_val <- vfold_cv(data1,v=10,strata = "mpg")

metric_spline3my3 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg~ns(weight,knots=c(2000,2750,3500,4250)),
         data=data1[x$in_id,])
pred <- predict(mod,
               newdata=data1[-x$in_id,])
truth <- data1[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3my3

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.43  3.49
 2  3.24  2.63
 3  4.28  3.18
 4  4.27  3.09
 5  4.12  3.16
 6  3.78  2.41
 7  4.17  2.97
 8  4.33  2.97
 9  4.84  4.03
10  4.40  2.94

# menghitung rata-rata untuk 10 folds
mean_metric_spline3my3 <- colMeans(metric_spline3my3)

mean_metric_spline3my3

    rmse      mae 
4.186928 3.086945

Rataan mse per metode

frame.mse3 <- rbind(mean_metric_poly23,mean_metric_poly33,mean_metric_poly43,mean_metric_ft3, mean_metric_spline3my3)
frame.mse3

                           rmse      mae
mean_metric_poly23     4.122701 3.076882
mean_metric_poly33     4.174291 3.080987
mean_metric_poly43     4.163681 3.085913
mean_metric_ft3        4.407266 3.303490
mean_metric_spline3my3 4.186928 3.086945

Kesimpulan

Dengan melihat perbadingan rmse dan mae di atas, dapat dipilih model terbaik adalah Regresi Polinomial Derajat 2 karena memiliki rmse dan mae yang lebih kecil dari model lain.