Tugas Kuliah 08 Sains Data

Load Packages

library(ISLR)
library(rsample)
library(mlr3measures)
library(splines)
library(tidyverse)
library(ggplot2)

Import Data

attach(Auto)
head(Auto,10)

   mpg cylinders displacement horsepower weight acceleration year origin
1   18         8          307        130   3504         12.0   70      1
2   15         8          350        165   3693         11.5   70      1
3   18         8          318        150   3436         11.0   70      1
4   16         8          304        150   3433         12.0   70      1
5   17         8          302        140   3449         10.5   70      1
6   15         8          429        198   4341         10.0   70      1
7   14         8          454        220   4354          9.0   70      1
8   14         8          440        215   4312          8.5   70      1
9   14         8          455        225   4425         10.0   70      1
10  15         8          390        190   3850          8.5   70      1
                        name
1  chevrolet chevelle malibu
2          buick skylark 320
3         plymouth satellite
4              amc rebel sst
5                ford torino
6           ford galaxie 500
7           chevrolet impala
8          plymouth fury iii
9           pontiac catalina
10        amc ambassador dpl

dat <- data.frame(mpg,horsepower)
dat

   mpg horsepower
1   18        130
2   15        165
3   18        150
4   16        150
5   17        140
6   15        198
7   14        220
8   14        215
9   14        225
10  15        190
11  15        170
12  14        160
13  15        150
14  14        225
15  24         95
16  22         95
17  18         97
18  21         85
19  27         88
20  26         46
21  25         87
22  24         90
23  25         95
24  26        113
25  21         90
26  10        215
27  10        200
28  11        210
29   9        193
30  27         88
31  28         90
32  25         95
33  19        100
34  16        105
35  17        100
36  19         88
37  18        100
38  14        165
39  14        175
40  14        153
41  14        150
42  12        180
43  13        170
44  13        175
45  18        110
 [ reached 'max' / getOption("max.print") -- omitted 347 rows ]

plot(Auto$horsepower,Auto$mpg,xlab = "horsepower",ylab = "mile per gallon", main = "Plot Data",col="red")

Regresi Non-Linier

Cross Validation f.t. Regresi Polinomial Ordo 2

# regresi polinomial ordo 2
polinom2 <- lm(Auto$mpg~poly(Auto$horsepower,degree=2))

prediksi2 <- predict(polinom2,
                    data.frame(Auto$horsepower),
                    interval="predict")


plot(Auto$horsepower,Auto$mpg, pch=9,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 2")

ix <- sort(Auto$horsepower,index.return=T)$ix

lines(Auto$horsepower[ix],prediksi2[ix],
      col = "blue",
      type = "l",
      lwd = 2)

# cross validation
set.seed(1501211014)

# Regresi Polinomial Ordo 2
cross_val2 <- vfold_cv(dat,v=10,strata = "mpg")

metric_poly2 <- map_dfr(cross_val2$splits,
    function(x){

mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=dat[x$in_id,])

pred <- predict(mod,
               newdata=dat[-x$in_id,])
truth <- dat[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly2

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.47  3.25
 2  4.99  3.67
 3  3.02  2.29
 4  4.54  3.12
 5  4.65  3.75
 6  3.75  3.10
 7  4.54  3.39
 8  3.39  2.73
 9  5.50  3.99
10  4.43  3.41

# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2

    rmse      mae 
4.327517 3.270278

Cross Validation f.t. Piecewise Constant (Fungsi Tangga)

# fungsi tangga
fungsi.tangga <- lm(Auto$mpg~cut(Auto$horsepower,12))

prediksi.ft <- predict(fungsi.tangga, data.frame(Auto$horsepower),interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar2.jpg")
plot(Auto$horsepower,Auto$mpg, 
     pch=9,col="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Fungsi Tangga")

ix <- sort(Auto$horsepower,index.return=T)$ix
lines(Auto$horsepower[ix], prediksi.ft[ix], 
      col="blue",
      type="l", lwd=2)

#dev.off()

# cross validation
set.seed(1501211014)

# fungsi tangga
cross_val.ft <- vfold_cv(dat,v=10,strata = "mpg")

breaks <- 3:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val.ft$splits,
    function(x){

training <- dat[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- dat[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

})

best_tangga <- cbind(best_tangga)

# menampilkan hasil all breaks
best_tangga

      rmse      mae
1 5.758574 4.520978
2 4.967702 3.788951
3 4.699976 3.583206
4 4.641994 3.558949
5 4.449726 3.361857
6 4.451573 3.419655
7 4.525785 3.480239
8 4.556936 3.451788

mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft

    rmse      mae 
4.756533 3.645703

Cross Validation f.t Regresi Natural Spline

library(splines)

regresi.spline <- lm(Auto$mpg~ns(Auto$horsepower,knots=c(80,120,160,200)))

prediksi.ns <- predict(regresi.spline, data.frame(Auto$horsepower), interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar3.jpg")
plot(Auto$horsepower, Auto$mpg, pch=19, col="coral",
     xlab="Horse Power", ylab="mile per gallon",
     main="natural cubic splines dengan knots 80, 120, 160, 200")

ix <- sort(Auto$horsepower,index.return=T)$ix
lines(Auto$horsepower[ix], prediksi.ns[ix], 
      main = "Polynomial Regression", col="blue",
      type="l", lwd=2)

abline(v=c(80, 120, 160, 200), col="grey50", lty=2)

#dev.off()

# cross validation
cross_val <- vfold_cv(dat,v=10,strata = "mpg")

metric_spline3my <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)),
         data=dat[x$in_id,])
pred <- predict(mod,
               newdata=dat[-x$in_id,])
truth <- dat[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline3my

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.51  3.53
 2  3.48  2.81
 3  4.36  3.29
 4  4.82  3.61
 5  4.45  3.24
 6  4.30  3.10
 7  3.79  2.95
 8  4.19  3.17
 9  5.86  4.39
10  3.81  2.73

# menghitung rata-rata untuk 10 folds
mean_metric_spline3my <- colMeans(metric_spline3my)

mean_metric_spline3my

    rmse      mae 
4.357602 3.280519

frame.mse <- rbind(mean_metric_poly2,mean_metric_ft,mean_metric_spline3my)

frame.mse

                          rmse      mae
mean_metric_poly2     4.327517 3.270278
mean_metric_ft        4.756533 3.645703
mean_metric_spline3my 4.357602 3.280519

Each Countries : Load Data

library(dplyr)
library(tidyverse)
library(ISLR)

data(Auto)


auto <- as_tibble(Auto)
head(auto)

# A tibble: 6 x 9
    mpg cylinders displacement horsepower weight acceleration  year origin name 
  <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl> <fct>
1    18         8          307        130   3504         12      70      1 chev~
2    15         8          350        165   3693         11.5    70      1 buic~
3    18         8          318        150   3436         11      70      1 plym~
4    16         8          304        150   3433         12      70      1 amc ~
5    17         8          302        140   3449         10.5    70      1 ford~
6    15         8          429        198   4341         10      70      1 ford~

# data khusus America
american <- auto %>% filter(origin==1)
head(american)

# A tibble: 6 x 9
    mpg cylinders displacement horsepower weight acceleration  year origin name 
  <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl> <fct>
1    18         8          307        130   3504         12      70      1 chev~
2    15         8          350        165   3693         11.5    70      1 buic~
3    18         8          318        150   3436         11      70      1 plym~
4    16         8          304        150   3433         12      70      1 amc ~
5    17         8          302        140   3449         10.5    70      1 ford~
6    15         8          429        198   4341         10      70      1 ford~

df.american <- data.frame(american$mpg,american$horsepower)
colnames(df.american) <- c("mpg","horsepower")
df.american

   mpg horsepower
1   18        130
2   15        165
3   18        150
4   16        150
5   17        140
6   15        198
7   14        220
8   14        215
9   14        225
10  15        190
11  15        170
12  14        160
13  15        150
14  14        225
15  22         95
16  18         97
17  21         85
18  21         90
19  10        215
20  10        200
21  11        210
22   9        193
23  28         90
24  19        100
25  16        105
26  17        100
27  19         88
28  18        100
29  14        165
30  14        175
31  14        153
32  14        150
33  12        180
34  13        170
35  13        175
36  18        110
37  22         72
38  19        100
39  18         88
40  23         86
41  26         70
42  25         80
43  20         90
44  21         86
45  13        165
 [ reached 'max' / getOption("max.print") -- omitted 200 rows ]

# data khusus Eropa
european <- auto %>% filter(origin==2)
head(european)

# A tibble: 6 x 9
    mpg cylinders displacement horsepower weight acceleration  year origin name 
  <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl> <fct>
1    26         4           97         46   1835         20.5    70      2 volk~
2    25         4          110         87   2672         17.5    70      2 peug~
3    24         4          107         90   2430         14.5    70      2 audi~
4    25         4          104         95   2375         17.5    70      2 saab~
5    26         4          121        113   2234         12.5    70      2 bmw ~
6    28         4          116         90   2123         14      71      2 opel~

df.european <- data.frame(european$mpg,european$horsepower)
colnames(df.european) <- c("mpg","horsepower")

# data khusus Japan
japan <- auto %>% filter(origin==3)
head(japan)

# A tibble: 6 x 9
    mpg cylinders displacement horsepower weight acceleration  year origin name 
  <dbl>     <dbl>        <dbl>      <dbl>  <dbl>        <dbl> <dbl>  <dbl> <fct>
1    24         4          113         95   2372         15      70      3 toyo~
2    27         4           97         88   2130         14.5    70      3 dats~
3    27         4           97         88   2130         14.5    71      3 dats~
4    25         4          113         95   2228         14      71      3 toyo~
5    31         4           71         65   1773         19      71      3 toyo~
6    35         4           72         69   1613         18      71      3 dats~

df.japan <- data.frame(japan$mpg,japan$horsepower)
colnames(df.japan) <- c("mpg","horsepower")

American

Regresi Polinomial

# regresi polinomial ordo 2 for American
polinom.amr <- lm(american$mpg~poly(american$horsepower,degree=2))

prediksi <- predict(polinom.amr,
                    data.frame(american$horsepower),
                    interval="predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar4.jpg")
plot(american$horsepower,american$mpg, pch=16,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 2 : Amrican")

ix <- sort(american$horsepower,index.return=T)$ix

lines(american$horsepower[ix],prediksi[ix],
      main="Polinomial Regression",
      col = "blue",
      type = "l",
      lwd = 2)

#dev.off()

# cross validation
set.seed(1501211014)

# Regresi Polinomial Ordo 2 for American
cross_val_american <- vfold_cv(df.american,v=10,strata = "mpg")

metric_american <- map_dfr(cross_val_american$splits,
    function(x){

mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=df.american[x$in_id,])

pred <- predict(mod,
               newdata=df.american[-x$in_id,])
truth <- df.american[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_american

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.43  3.42
 2  4.54  3.29
 3  3.35  2.42
 4  3.26  2.43
 5  2.98  2.30
 6  3.41  2.50
 7  4.98  3.94
 8  3.54  2.85
 9  3.38  2.76
10  4.07  3.05

# menghitung rata-rata untuk 10 folds
mean_metric_american <- colMeans(metric_american)
mean_metric_american

    rmse      mae 
3.793082 2.895665

Fungsi Tangga

fungsi.tangga <- lm(american$mpg~cut(american$horsepower,10))

prediksi_american <- predict(fungsi.tangga, data.frame(american$horsepower),interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar4.jpg")
plot(american$horsepower,american$mpg, 
     pch=9,col="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Fungsi Tangga ; American")

ix <- sort(american$horsepower,index.return=T)$ix
lines(american$horsepower[ix], prediksi_american[ix], 
      col="blue",
      type="l", lwd=2)

#dev.off()

# cross validation
set.seed(1501211014)

# fungsi tangga
cross_val <- vfold_cv(df.american,v=10,strata = "mpg")

breaks <- 3:10

best_tangga <- map_dfr(breaks, function(i){

metric_tangga_american <- map_dfr(cross_val$splits,
    function(x){

training <- df.american[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- df.american[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,newdata=list(horsepower=horsepower_new))

truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga_american



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga_american)

})

best_tangga <- cbind(best_tangga)

# menampilkan hasil all breaks
best_tangga

      rmse      mae
1 4.732655 3.688594
2 4.124001 3.076672
3 4.011764 3.108658
4 4.194509 3.181011
5 4.174749 3.165399
6 4.021932 3.032369
7 3.720188 2.835682
8 3.867501 2.983208

mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft

    rmse      mae 
4.105912 3.133949

Regresi Natural Spline

library(splines)

regresi.spline.american <- lm(df.american$mpg~ns(df.american$horsepower,knots=c(80,120,160,200)))

prediksi.ns <- predict(regresi.spline.american, data.frame(df.american$horsepower), interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar4.jpg")
plot(df.american$horsepower, df.american$mpg, pch=19, col="coral",
     xlab="Horse Power", ylab="mile per gallon",
     main="Natural Cubic Splines dengan knots 80, 120, 160, 200")

ix <- sort(df.american$horsepower,index.return=T)$ix
lines(df.american$horsepower[ix], prediksi.ns[ix], 
      main = "Polynomial Regression", col="blue",
      type="l", lwd=2)

abline(v=c(80, 120, 160, 200), col="grey50", lty=2)

#dev.off()

# cross validation
cross_val <- vfold_cv(df.american,v=10,strata = "mpg")

metric_spline_american <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)),
         data=df.american[x$in_id,])
pred <- predict(mod,
               newdata=df.american[-x$in_id,])
truth <- df.american[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline_american

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.98  3.67
 2  3.07  2.40
 3  3.44  2.71
 4  3.02  2.39
 5  4.34  3.02
 6  3.12  2.67
 7  4.65  3.55
 8  3.43  2.79
 9  4.60  3.12
10  3.44  2.77

# menghitung rata-rata untuk 10 folds
mean_metric_spline_american <- colMeans(metric_spline_american)

mean_metric_spline_american

    rmse      mae 
3.810119 2.909049

Japan

Regresi Polinomial

# regresi polinomial ordo 3 for American
polinom.japan <- lm(df.japan$mpg~poly(df.japan$horsepower,degree=3))

prediksi.japan <- predict(polinom.japan,
                    data.frame(df.japan$horsepower),
                    interval="predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar5.jpg")
plot(df.japan$horsepower,df.japan$mpg, pch=16,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 3 : Japan")

ix <- sort(df.japan$horsepower,index.return=T)$ix

lines(df.japan$horsepower[ix],prediksi.japan[ix],
      main="Polinomial Regression",
      col = "blue",
      type = "l",
      lwd = 2)

#dev.off()

# cross validation
set.seed(1501211014)

# Regresi Polinomial Ordo 2 for American
cross_val_japan <- vfold_cv(df.japan,v=10)

metric_japan <- map_dfr(cross_val_japan$splits,
    function(x){

mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=df.japan[x$in_id,])

pred <- predict(mod,
               newdata=df.japan[-x$in_id,])
truth <- df.japan[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_japan

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  6.50  5.79
 2  4.12  3.10
 3  4.16  3.63
 4  5.54  4.44
 5  3.56  2.97
 6  3.95  3.14
 7  5.30  3.79
 8  3.97  2.96
 9  8.64  5.79
10  2.27  1.78

# menghitung rata-rata untuk 10 folds
mean_metric_japan <- colMeans(metric_japan)
mean_metric_japan

    rmse      mae 
4.801797 3.739180

# fungsi tangga
fungsi.tangga.japan <- lm(df.japan$mpg~cut(df.japan$horsepower,10))

prediksi.ft.japan <- predict(fungsi.tangga.japan, data.frame(df.japan$horsepower),interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar5.jpg")
plot(df.japan$horsepower,df.japan$mpg, 
     pch=16,col="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Fungsi Tangga")

ix <- sort(df.japan$horsepower,index.return=T)$ix
lines(df.japan$horsepower[ix], prediksi.ft.japan[ix], 
      col="blue",
      type="l", lwd=2)

#dev.off()

# cross validation
set.seed(1501211014)

# fungsi tangga
cross_val.ft <- vfold_cv(df.japan,v=10)

breaks <- 2:5

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val.ft$splits,
    function(x){

training <- df.japan[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- df.japan[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

})

best_tangga <- cbind(best_tangga)

# menampilkan hasil all breaks
best_tangga

      rmse      mae
1 4.335414 3.334392
2 4.176612 3.341504
3 4.110073 3.280559
4 4.280540 3.440718

mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft

    rmse      mae 
4.225660 3.349293

Regresi Natural Spline

library(splines)

regresi.spline.japan <- lm(df.japan$mpg~ns(df.japan$horsepower,knots=c( 70, 90,110 )))

prediksi.ns <- predict(regresi.spline.japan, data.frame(df.japan$horsepower), interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar5.jpg")
plot(df.japan$horsepower, df.japan$mpg, pch=19, col="coral",
     xlab="Horse Power", ylab="mile per gallon",
     main="Natural Cubic Splines dengan knots 70, 90,110")

ix <- sort(df.japan$horsepower,index.return=T)$ix
lines(df.japan$horsepower[ix], prediksi.ns[ix], 
      main = "Polynomial Regression", col="blue",
      type="l", lwd=2)

abline(v=c(70, 90,110), col="grey50", lty=2)

#dev.off()

# cross validation
cross_val <- vfold_cv(df.japan,v=10)

metric_spline_japan <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg~ns(horsepower,knots=c(70, 90,110)),
         data=df.japan[x$in_id,])
pred <- predict(mod,
               newdata=df.japan[-x$in_id,])
truth <- df.japan[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline_japan

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  5.55  4.79
 2  4.66  2.74
 3  4.53  3.41
 4  3.64  3.29
 5  4.87  3.74
 6  4.62  2.96
 7  3.34  2.56
 8  2.79  2.13
 9  3.20  2.39
10  5.98  4.69

# menghitung rata-rata untuk 10 folds
mean_metric_spline_japan<- colMeans(metric_spline_japan)

mean_metric_spline_japan

    rmse      mae 
4.316356 3.269669

European

Regresi Polinomial

# regresi polinomial ordo 4 for American
polinom.european <- lm(df.european$mpg~poly(df.european$horsepower,degree=4))

prediksi.european <- predict(polinom.european,
                    data.frame(df.european$horsepower),
                    interval="predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar6.jpg")
plot(df.european$horsepower,df.european$mpg, pch=16,
     col ="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Regresi Polinomial Ordo 4 : European")

ix <- sort(df.european$horsepower,index.return=T)$ix

lines(df.european$horsepower[ix],prediksi.european[ix],
      col = "blue",
      type = "l",
      lwd = 2)

#dev.off()


# cross validation
set.seed(1501211014)

# Regresi Polinomial Ordo 2 for American
cross_val_european <- vfold_cv(df.european,v=10)

metric_european <- map_dfr(cross_val_european$splits,
    function(x){

mod <- lm(mpg ~ poly(horsepower,2,raw = T),
         data=df.european[x$in_id,])

pred <- predict(mod,
               newdata=df.european[-x$in_id,])
truth <- df.european[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_european

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.56  4.07
 2  4.99  3.61
 3  3.75  3.15
 4  4.09  3.33
 5  6.35  5.52
 6  5.07  3.59
 7  5.75  3.95
 8  6.07  5.16
 9  4.91  3.75
10  2.71  2.09

# menghitung rata-rata untuk 10 folds
mean_metric_european <- colMeans(metric_european)
mean_metric_european

    rmse      mae 
4.824346 3.820599

Regresi Natural Spline

library(splines)

regresi.spline.european<- lm(df.european$mpg~ns(df.european$horsepower,knots=c( 70, 90,110 )))

prediksi.ns <- predict(regresi.spline.european, data.frame(df.european$horsepower), interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar6.jpg")
plot(df.european$horsepower, df.european$mpg, pch=19, col="coral",
     xlab="Horse Power", ylab="mile per gallon",
     main="Natural Cubic Splines dengan knots 70, 90,110")

ix <- sort(df.european$horsepower,index.return=T)$ix
lines(df.european$horsepower[ix], prediksi.ns[ix], 
      main = "Polynomial Regression", col="blue",
      type="l", lwd=2)

abline(v=c(70, 90,110), col="grey50", lty=2)

#dev.off()

# cross validation
cross_val <- vfold_cv(df.european,v=10)

metric_spline_european <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg~ns(horsepower,knots=c(70, 90,110)),
         data=df.european[x$in_id,])
pred <- predict(mod,
               newdata=df.european[-x$in_id,])
truth <- df.european[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_spline_european

# A tibble: 10 x 2
    rmse   mae
   <dbl> <dbl>
 1  4.70  3.22
 2  5.50  4.55
 3  4.27  3.63
 4  3.22  2.53
 5  5.08  4.06
 6  3.95  2.92
 7  6.76  5.26
 8  5.08  4.00
 9  7.40  5.82
10  5.66  4.62

# menghitung rata-rata untuk 10 folds
mean_metric_spline_european<- colMeans(metric_spline_european)

mean_metric_spline_european

    rmse      mae 
5.161558 4.061438

# fungsi tangga
fungsi.tangga.eurpean <- lm(df.european$mpg~cut(df.european$horsepower,12))

prediksi.ft.european <- predict(fungsi.tangga.eurpean, data.frame(df.european$horsepower),interval = "predict")

#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar6.jpg")
plot(df.european$horsepower,df.european$mpg, 
     pch=16,col="red",
     xlab = "Horse Power",
     ylab = "Mile Per Gallon",
     main = "Fungsi Tangga : European")

ix <- sort(df.european$horsepower,index.return=T)$ix
lines(df.european$horsepower[ix], prediksi.ft.european[ix], 
      col="blue",
      type="l", lwd=2)

#dev.off()

# cross validation
set.seed(1501211014)

# fungsi tangga
cross_val.ft <- vfold_cv(df.european,v=10)

breaks <- 2:5

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val.ft$splits,
    function(x){

training <- df.european[x$in_id,]
training$horsepower <- cut(training$horsepower,i)

mod <- lm(mpg ~ horsepower,
         data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))


testing <- df.european[-x$in_id,]

horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))

pred <- predict(mod,
               newdata=list(horsepower=horsepower_new))
truth <- testing$mpg

data_eval <- na.omit(data.frame(truth,pred))

rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga



# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)

})

best_tangga <- cbind(best_tangga)

# menampilkan hasil all breaks
best_tangga

      rmse      mae
1 5.033635 4.034447
2 5.410103 4.459703
3 5.085279 4.123989
4 5.050154 3.978805

mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft

    rmse      mae 
5.144793 4.149236