Tugas Kuliah 08 Sains Data
Load Packages
library(ISLR)
library(rsample)
library(mlr3measures)
library(splines)
library(tidyverse)
library(ggplot2)Import Data
attach(Auto)
head(Auto,10) mpg cylinders displacement horsepower weight acceleration year origin
1 18 8 307 130 3504 12.0 70 1
2 15 8 350 165 3693 11.5 70 1
3 18 8 318 150 3436 11.0 70 1
4 16 8 304 150 3433 12.0 70 1
5 17 8 302 140 3449 10.5 70 1
6 15 8 429 198 4341 10.0 70 1
7 14 8 454 220 4354 9.0 70 1
8 14 8 440 215 4312 8.5 70 1
9 14 8 455 225 4425 10.0 70 1
10 15 8 390 190 3850 8.5 70 1
name
1 chevrolet chevelle malibu
2 buick skylark 320
3 plymouth satellite
4 amc rebel sst
5 ford torino
6 ford galaxie 500
7 chevrolet impala
8 plymouth fury iii
9 pontiac catalina
10 amc ambassador dpl
dat <- data.frame(mpg,horsepower)
dat mpg horsepower
1 18 130
2 15 165
3 18 150
4 16 150
5 17 140
6 15 198
7 14 220
8 14 215
9 14 225
10 15 190
11 15 170
12 14 160
13 15 150
14 14 225
15 24 95
16 22 95
17 18 97
18 21 85
19 27 88
20 26 46
21 25 87
22 24 90
23 25 95
24 26 113
25 21 90
26 10 215
27 10 200
28 11 210
29 9 193
30 27 88
31 28 90
32 25 95
33 19 100
34 16 105
35 17 100
36 19 88
37 18 100
38 14 165
39 14 175
40 14 153
41 14 150
42 12 180
43 13 170
44 13 175
45 18 110
[ reached 'max' / getOption("max.print") -- omitted 347 rows ]
plot(Auto$horsepower,Auto$mpg,xlab = "horsepower",ylab = "mile per gallon", main = "Plot Data",col="red")Regresi Non-Linier
Cross Validation f.t. Regresi Polinomial Ordo 2
# regresi polinomial ordo 2
polinom2 <- lm(Auto$mpg~poly(Auto$horsepower,degree=2))
prediksi2 <- predict(polinom2,
data.frame(Auto$horsepower),
interval="predict")
plot(Auto$horsepower,Auto$mpg, pch=9,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 2")
ix <- sort(Auto$horsepower,index.return=T)$ix
lines(Auto$horsepower[ix],prediksi2[ix],
col = "blue",
type = "l",
lwd = 2)# cross validation
set.seed(1501211014)
# Regresi Polinomial Ordo 2
cross_val2 <- vfold_cv(dat,v=10,strata = "mpg")
metric_poly2 <- map_dfr(cross_val2$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=dat[x$in_id,])
pred <- predict(mod,
newdata=dat[-x$in_id,])
truth <- dat[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.47 3.25
2 4.99 3.67
3 3.02 2.29
4 4.54 3.12
5 4.65 3.75
6 3.75 3.10
7 4.54 3.39
8 3.39 2.73
9 5.50 3.99
10 4.43 3.41
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2 rmse mae
4.327517 3.270278
Cross Validation f.t. Piecewise Constant (Fungsi Tangga)
# fungsi tangga
fungsi.tangga <- lm(Auto$mpg~cut(Auto$horsepower,12))
prediksi.ft <- predict(fungsi.tangga, data.frame(Auto$horsepower),interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar2.jpg")
plot(Auto$horsepower,Auto$mpg,
pch=9,col="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Fungsi Tangga")
ix <- sort(Auto$horsepower,index.return=T)$ix
lines(Auto$horsepower[ix], prediksi.ft[ix],
col="blue",
type="l", lwd=2)#dev.off()
# cross validation
set.seed(1501211014)
# fungsi tangga
cross_val.ft <- vfold_cv(dat,v=10,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val.ft$splits,
function(x){
training <- dat[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- dat[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
})
best_tangga <- cbind(best_tangga)
# menampilkan hasil all breaks
best_tangga rmse mae
1 5.758574 4.520978
2 4.967702 3.788951
3 4.699976 3.583206
4 4.641994 3.558949
5 4.449726 3.361857
6 4.451573 3.419655
7 4.525785 3.480239
8 4.556936 3.451788
mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft rmse mae
4.756533 3.645703
Cross Validation f.t Regresi Natural Spline
library(splines)
regresi.spline <- lm(Auto$mpg~ns(Auto$horsepower,knots=c(80,120,160,200)))
prediksi.ns <- predict(regresi.spline, data.frame(Auto$horsepower), interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar3.jpg")
plot(Auto$horsepower, Auto$mpg, pch=19, col="coral",
xlab="Horse Power", ylab="mile per gallon",
main="natural cubic splines dengan knots 80, 120, 160, 200")
ix <- sort(Auto$horsepower,index.return=T)$ix
lines(Auto$horsepower[ix], prediksi.ns[ix],
main = "Polynomial Regression", col="blue",
type="l", lwd=2)
abline(v=c(80, 120, 160, 200), col="grey50", lty=2)#dev.off()# cross validation
cross_val <- vfold_cv(dat,v=10,strata = "mpg")
metric_spline3my <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)),
data=dat[x$in_id,])
pred <- predict(mod,
newdata=dat[-x$in_id,])
truth <- dat[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3my# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.51 3.53
2 3.48 2.81
3 4.36 3.29
4 4.82 3.61
5 4.45 3.24
6 4.30 3.10
7 3.79 2.95
8 4.19 3.17
9 5.86 4.39
10 3.81 2.73
# menghitung rata-rata untuk 10 folds
mean_metric_spline3my <- colMeans(metric_spline3my)
mean_metric_spline3my rmse mae
4.357602 3.280519
frame.mse <- rbind(mean_metric_poly2,mean_metric_ft,mean_metric_spline3my)
frame.mse rmse mae
mean_metric_poly2 4.327517 3.270278
mean_metric_ft 4.756533 3.645703
mean_metric_spline3my 4.357602 3.280519
Each Countries : Load Data
library(dplyr)
library(tidyverse)
library(ISLR)
data(Auto)
auto <- as_tibble(Auto)
head(auto)# A tibble: 6 x 9
mpg cylinders displacement horsepower weight acceleration year origin name
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 18 8 307 130 3504 12 70 1 chev~
2 15 8 350 165 3693 11.5 70 1 buic~
3 18 8 318 150 3436 11 70 1 plym~
4 16 8 304 150 3433 12 70 1 amc ~
5 17 8 302 140 3449 10.5 70 1 ford~
6 15 8 429 198 4341 10 70 1 ford~
# data khusus America
american <- auto %>% filter(origin==1)
head(american)# A tibble: 6 x 9
mpg cylinders displacement horsepower weight acceleration year origin name
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 18 8 307 130 3504 12 70 1 chev~
2 15 8 350 165 3693 11.5 70 1 buic~
3 18 8 318 150 3436 11 70 1 plym~
4 16 8 304 150 3433 12 70 1 amc ~
5 17 8 302 140 3449 10.5 70 1 ford~
6 15 8 429 198 4341 10 70 1 ford~
df.american <- data.frame(american$mpg,american$horsepower)
colnames(df.american) <- c("mpg","horsepower")
df.american mpg horsepower
1 18 130
2 15 165
3 18 150
4 16 150
5 17 140
6 15 198
7 14 220
8 14 215
9 14 225
10 15 190
11 15 170
12 14 160
13 15 150
14 14 225
15 22 95
16 18 97
17 21 85
18 21 90
19 10 215
20 10 200
21 11 210
22 9 193
23 28 90
24 19 100
25 16 105
26 17 100
27 19 88
28 18 100
29 14 165
30 14 175
31 14 153
32 14 150
33 12 180
34 13 170
35 13 175
36 18 110
37 22 72
38 19 100
39 18 88
40 23 86
41 26 70
42 25 80
43 20 90
44 21 86
45 13 165
[ reached 'max' / getOption("max.print") -- omitted 200 rows ]
# data khusus Eropa
european <- auto %>% filter(origin==2)
head(european)# A tibble: 6 x 9
mpg cylinders displacement horsepower weight acceleration year origin name
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 26 4 97 46 1835 20.5 70 2 volk~
2 25 4 110 87 2672 17.5 70 2 peug~
3 24 4 107 90 2430 14.5 70 2 audi~
4 25 4 104 95 2375 17.5 70 2 saab~
5 26 4 121 113 2234 12.5 70 2 bmw ~
6 28 4 116 90 2123 14 71 2 opel~
df.european <- data.frame(european$mpg,european$horsepower)
colnames(df.european) <- c("mpg","horsepower")
# data khusus Japan
japan <- auto %>% filter(origin==3)
head(japan)# A tibble: 6 x 9
mpg cylinders displacement horsepower weight acceleration year origin name
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
1 24 4 113 95 2372 15 70 3 toyo~
2 27 4 97 88 2130 14.5 70 3 dats~
3 27 4 97 88 2130 14.5 71 3 dats~
4 25 4 113 95 2228 14 71 3 toyo~
5 31 4 71 65 1773 19 71 3 toyo~
6 35 4 72 69 1613 18 71 3 dats~
df.japan <- data.frame(japan$mpg,japan$horsepower)
colnames(df.japan) <- c("mpg","horsepower")American
Regresi Polinomial
# regresi polinomial ordo 2 for American
polinom.amr <- lm(american$mpg~poly(american$horsepower,degree=2))
prediksi <- predict(polinom.amr,
data.frame(american$horsepower),
interval="predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar4.jpg")
plot(american$horsepower,american$mpg, pch=16,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 2 : Amrican")
ix <- sort(american$horsepower,index.return=T)$ix
lines(american$horsepower[ix],prediksi[ix],
main="Polinomial Regression",
col = "blue",
type = "l",
lwd = 2)#dev.off()
# cross validation
set.seed(1501211014)
# Regresi Polinomial Ordo 2 for American
cross_val_american <- vfold_cv(df.american,v=10,strata = "mpg")
metric_american <- map_dfr(cross_val_american$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=df.american[x$in_id,])
pred <- predict(mod,
newdata=df.american[-x$in_id,])
truth <- df.american[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_american# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.43 3.42
2 4.54 3.29
3 3.35 2.42
4 3.26 2.43
5 2.98 2.30
6 3.41 2.50
7 4.98 3.94
8 3.54 2.85
9 3.38 2.76
10 4.07 3.05
# menghitung rata-rata untuk 10 folds
mean_metric_american <- colMeans(metric_american)
mean_metric_american rmse mae
3.793082 2.895665
Fungsi Tangga
fungsi.tangga <- lm(american$mpg~cut(american$horsepower,10))
prediksi_american <- predict(fungsi.tangga, data.frame(american$horsepower),interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar4.jpg")
plot(american$horsepower,american$mpg,
pch=9,col="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Fungsi Tangga ; American")
ix <- sort(american$horsepower,index.return=T)$ix
lines(american$horsepower[ix], prediksi_american[ix],
col="blue",
type="l", lwd=2)#dev.off()# cross validation
set.seed(1501211014)
# fungsi tangga
cross_val <- vfold_cv(df.american,v=10,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga_american <- map_dfr(cross_val$splits,
function(x){
training <- df.american[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- df.american[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga_american
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga_american)
})
best_tangga <- cbind(best_tangga)
# menampilkan hasil all breaks
best_tangga rmse mae
1 4.732655 3.688594
2 4.124001 3.076672
3 4.011764 3.108658
4 4.194509 3.181011
5 4.174749 3.165399
6 4.021932 3.032369
7 3.720188 2.835682
8 3.867501 2.983208
mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft rmse mae
4.105912 3.133949
Regresi Natural Spline
library(splines)
regresi.spline.american <- lm(df.american$mpg~ns(df.american$horsepower,knots=c(80,120,160,200)))
prediksi.ns <- predict(regresi.spline.american, data.frame(df.american$horsepower), interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar4.jpg")
plot(df.american$horsepower, df.american$mpg, pch=19, col="coral",
xlab="Horse Power", ylab="mile per gallon",
main="Natural Cubic Splines dengan knots 80, 120, 160, 200")
ix <- sort(df.american$horsepower,index.return=T)$ix
lines(df.american$horsepower[ix], prediksi.ns[ix],
main = "Polynomial Regression", col="blue",
type="l", lwd=2)
abline(v=c(80, 120, 160, 200), col="grey50", lty=2)#dev.off()# cross validation
cross_val <- vfold_cv(df.american,v=10,strata = "mpg")
metric_spline_american <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg~ns(horsepower,knots=c(80,120,160,200)),
data=df.american[x$in_id,])
pred <- predict(mod,
newdata=df.american[-x$in_id,])
truth <- df.american[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline_american# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.98 3.67
2 3.07 2.40
3 3.44 2.71
4 3.02 2.39
5 4.34 3.02
6 3.12 2.67
7 4.65 3.55
8 3.43 2.79
9 4.60 3.12
10 3.44 2.77
# menghitung rata-rata untuk 10 folds
mean_metric_spline_american <- colMeans(metric_spline_american)
mean_metric_spline_american rmse mae
3.810119 2.909049
Japan
Regresi Polinomial
# regresi polinomial ordo 3 for American
polinom.japan <- lm(df.japan$mpg~poly(df.japan$horsepower,degree=3))
prediksi.japan <- predict(polinom.japan,
data.frame(df.japan$horsepower),
interval="predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar5.jpg")
plot(df.japan$horsepower,df.japan$mpg, pch=16,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 3 : Japan")
ix <- sort(df.japan$horsepower,index.return=T)$ix
lines(df.japan$horsepower[ix],prediksi.japan[ix],
main="Polinomial Regression",
col = "blue",
type = "l",
lwd = 2)#dev.off()
# cross validation
set.seed(1501211014)
# Regresi Polinomial Ordo 2 for American
cross_val_japan <- vfold_cv(df.japan,v=10)
metric_japan <- map_dfr(cross_val_japan$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=df.japan[x$in_id,])
pred <- predict(mod,
newdata=df.japan[-x$in_id,])
truth <- df.japan[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_japan# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 6.50 5.79
2 4.12 3.10
3 4.16 3.63
4 5.54 4.44
5 3.56 2.97
6 3.95 3.14
7 5.30 3.79
8 3.97 2.96
9 8.64 5.79
10 2.27 1.78
# menghitung rata-rata untuk 10 folds
mean_metric_japan <- colMeans(metric_japan)
mean_metric_japan rmse mae
4.801797 3.739180
# fungsi tangga
fungsi.tangga.japan <- lm(df.japan$mpg~cut(df.japan$horsepower,10))
prediksi.ft.japan <- predict(fungsi.tangga.japan, data.frame(df.japan$horsepower),interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar5.jpg")
plot(df.japan$horsepower,df.japan$mpg,
pch=16,col="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Fungsi Tangga")
ix <- sort(df.japan$horsepower,index.return=T)$ix
lines(df.japan$horsepower[ix], prediksi.ft.japan[ix],
col="blue",
type="l", lwd=2)#dev.off()# cross validation
set.seed(1501211014)
# fungsi tangga
cross_val.ft <- vfold_cv(df.japan,v=10)
breaks <- 2:5
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val.ft$splits,
function(x){
training <- df.japan[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- df.japan[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
})
best_tangga <- cbind(best_tangga)
# menampilkan hasil all breaks
best_tangga rmse mae
1 4.335414 3.334392
2 4.176612 3.341504
3 4.110073 3.280559
4 4.280540 3.440718
mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft rmse mae
4.225660 3.349293
Regresi Natural Spline
library(splines)
regresi.spline.japan <- lm(df.japan$mpg~ns(df.japan$horsepower,knots=c( 70, 90,110 )))
prediksi.ns <- predict(regresi.spline.japan, data.frame(df.japan$horsepower), interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar5.jpg")
plot(df.japan$horsepower, df.japan$mpg, pch=19, col="coral",
xlab="Horse Power", ylab="mile per gallon",
main="Natural Cubic Splines dengan knots 70, 90,110")
ix <- sort(df.japan$horsepower,index.return=T)$ix
lines(df.japan$horsepower[ix], prediksi.ns[ix],
main = "Polynomial Regression", col="blue",
type="l", lwd=2)
abline(v=c(70, 90,110), col="grey50", lty=2)#dev.off()
# cross validation
cross_val <- vfold_cv(df.japan,v=10)
metric_spline_japan <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg~ns(horsepower,knots=c(70, 90,110)),
data=df.japan[x$in_id,])
pred <- predict(mod,
newdata=df.japan[-x$in_id,])
truth <- df.japan[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline_japan# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 5.55 4.79
2 4.66 2.74
3 4.53 3.41
4 3.64 3.29
5 4.87 3.74
6 4.62 2.96
7 3.34 2.56
8 2.79 2.13
9 3.20 2.39
10 5.98 4.69
# menghitung rata-rata untuk 10 folds
mean_metric_spline_japan<- colMeans(metric_spline_japan)
mean_metric_spline_japan rmse mae
4.316356 3.269669
European
Regresi Polinomial
# regresi polinomial ordo 4 for American
polinom.european <- lm(df.european$mpg~poly(df.european$horsepower,degree=4))
prediksi.european <- predict(polinom.european,
data.frame(df.european$horsepower),
interval="predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar6.jpg")
plot(df.european$horsepower,df.european$mpg, pch=16,
col ="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Regresi Polinomial Ordo 4 : European")
ix <- sort(df.european$horsepower,index.return=T)$ix
lines(df.european$horsepower[ix],prediksi.european[ix],
col = "blue",
type = "l",
lwd = 2)#dev.off()
# cross validation
set.seed(1501211014)
# Regresi Polinomial Ordo 2 for American
cross_val_european <- vfold_cv(df.european,v=10)
metric_european <- map_dfr(cross_val_european$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),
data=df.european[x$in_id,])
pred <- predict(mod,
newdata=df.european[-x$in_id,])
truth <- df.european[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_european# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.56 4.07
2 4.99 3.61
3 3.75 3.15
4 4.09 3.33
5 6.35 5.52
6 5.07 3.59
7 5.75 3.95
8 6.07 5.16
9 4.91 3.75
10 2.71 2.09
# menghitung rata-rata untuk 10 folds
mean_metric_european <- colMeans(metric_european)
mean_metric_european rmse mae
4.824346 3.820599
Regresi Natural Spline
library(splines)
regresi.spline.european<- lm(df.european$mpg~ns(df.european$horsepower,knots=c( 70, 90,110 )))
prediksi.ns <- predict(regresi.spline.european, data.frame(df.european$horsepower), interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar6.jpg")
plot(df.european$horsepower, df.european$mpg, pch=19, col="coral",
xlab="Horse Power", ylab="mile per gallon",
main="Natural Cubic Splines dengan knots 70, 90,110")
ix <- sort(df.european$horsepower,index.return=T)$ix
lines(df.european$horsepower[ix], prediksi.ns[ix],
main = "Polynomial Regression", col="blue",
type="l", lwd=2)
abline(v=c(70, 90,110), col="grey50", lty=2)#dev.off()# cross validation
cross_val <- vfold_cv(df.european,v=10)
metric_spline_european <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg~ns(horsepower,knots=c(70, 90,110)),
data=df.european[x$in_id,])
pred <- predict(mod,
newdata=df.european[-x$in_id,])
truth <- df.european[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline_european# A tibble: 10 x 2
rmse mae
<dbl> <dbl>
1 4.70 3.22
2 5.50 4.55
3 4.27 3.63
4 3.22 2.53
5 5.08 4.06
6 3.95 2.92
7 6.76 5.26
8 5.08 4.00
9 7.40 5.82
10 5.66 4.62
# menghitung rata-rata untuk 10 folds
mean_metric_spline_european<- colMeans(metric_spline_european)
mean_metric_spline_european rmse mae
5.161558 4.061438
# fungsi tangga
fungsi.tangga.eurpean <- lm(df.european$mpg~cut(df.european$horsepower,12))
prediksi.ft.european <- predict(fungsi.tangga.eurpean, data.frame(df.european$horsepower),interval = "predict")
#dev.off()
#jpeg(filename="C:/Users/user/Desktop/pictures/gambar6.jpg")
plot(df.european$horsepower,df.european$mpg,
pch=16,col="red",
xlab = "Horse Power",
ylab = "Mile Per Gallon",
main = "Fungsi Tangga : European")
ix <- sort(df.european$horsepower,index.return=T)$ix
lines(df.european$horsepower[ix], prediksi.ft.european[ix],
col="blue",
type="l", lwd=2)#dev.off()# cross validation
set.seed(1501211014)
# fungsi tangga
cross_val.ft <- vfold_cv(df.european,v=10)
breaks <- 2:5
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val.ft$splits,
function(x){
training <- df.european[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- df.european[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
})
best_tangga <- cbind(best_tangga)
# menampilkan hasil all breaks
best_tangga rmse mae
1 5.033635 4.034447
2 5.410103 4.459703
3 5.085279 4.123989
4 5.050154 3.978805
mean_metric_ft <- colMeans(best_tangga)
mean_metric_ft rmse mae
5.144793 4.149236