MOVING BEYOND LINEARITY
PROBLEM
The data for this assignment can be obtained from the ISLR package; please install the package first.
Apply the non-linear models studied so far to the Auto data, with mpg as the response variable, and choose three columns to serve as predictor variables.
Is there evidence of a non-linear relationship for the variables you chose? Create visualizations to support your answer.
Determine the best non-linear model for each pair of variables (visually and/or empirically).
DATA
The data used are the Auto data from the ISLR package, consisting of 9 variables and 392 observations. Four of these variables are used here:
mpg : miles per gallon (Y)
horsepower : engine power (X1)
displacement : engine displacement (X2)
weight : vehicle weight (X3)
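The report does not show its setup chunk, so the following is only a hedged sketch of the packages the later code appears to rely on (datatable, the pipe and select, vfold_cv, map_dfr, mlr3measures::rmse, ns, ggplot, plot_grid):
# Assumed setup (hedged sketch): packages used by the code below
library(ISLR)          # Auto data
library(dplyr)         # %>%, select(), slice_min()
library(DT)            # datatable()
library(rsample)       # vfold_cv()
library(purrr)         # map_dfr()
library(mlr3measures)  # rmse() (also callable with the :: prefix)
library(splines)       # ns()
library(ggplot2)       # ggplot(), stat_smooth()
library(cowplot)       # plot_grid()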
datatable(Auto)
Auto1 <- Auto %>% select(mpg,horsepower,displacement,weight)
datatable(Auto1)
par(mfrow=c(1,1))
hp <- plot(Auto1$horsepower, Auto$mpg,pch=19, col="brown",
xlab="Horse Power", ylab="Mile per Gallon")
dp <- plot(Auto1$displacement, Auto$mpg,pch=19, col="brown",
xlab="Displacement", ylab="Mile per Gallon")
w <- plot(Auto1$weight, Auto$mpg,pch=19, col="brown",
xlab="Weight", ylab="Mile per Gallon")
The scatter plots cover 392 vehicles, each measured on four variables: horse power (X1), displacement (X2), weight (X3), and miles per gallon (Y). Looking at the scatter plots in more detail, the patterns tend to be non-linear, so non-linear methods are appropriate. Three methods are applied here: polynomial regression, piecewise constant (step functions), and natural cubic splines.
mpg (Y) vs horsepower (X1)
1. Polynomial Regression
cv_poly1 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(horsepower,1,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly1<- colMeans(metric_poly1)
return(poly1)
}
cv_poly2 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly2<- colMeans(metric_poly2)
return(poly2)
}
cv_poly3 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(horsepower,3,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly3<- colMeans(metric_poly3)
return(poly3)
}
cv_poly4 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(horsepower,4,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly4<- colMeans(metric_poly4)
return(poly4)
}
cv_poly <- rbind(cv_poly1(Auto1),cv_poly2(Auto1),cv_poly3(Auto1),cv_poly4(Auto1))
cv_poly
## rmse
## [1,] 4.909546
## [2,] 4.353575
## [3,] 4.352157
## [4,] 4.363377
best.poly <- min(cv_poly)
best.poly
## [1] 4.352157
The best polynomial regression model for horsepower vs mpg is selected using 10-fold cross-validation (CV). Empirically, based on the RMSE values, the degree-3 polynomial has the smallest RMSE, so the degree-3 polynomial is chosen as the best polynomial regression model.
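The four cv_poly* functions above differ only in the polynomial degree, so the same computation can be written once with the degree as an argument. A hedged sketch (the name cv_poly_deg is illustrative, not part of the original code):
# Hedged sketch: 10-fold CV RMSE for a horsepower polynomial of a given degree
cv_poly_deg <- function(data, degree) {
  set.seed(2154)
  folds <- vfold_cv(data, v = 10, strata = "mpg")
  fold_rmse <- map_dbl(folds$splits, function(s) {
    mod  <- lm(mpg ~ poly(horsepower, degree, raw = TRUE), data = data[s$in_id, ])
    pred <- predict(mod, newdata = data[-s$in_id, ])
    mlr3measures::rmse(truth = data[-s$in_id, ]$mpg, response = pred)
  })
  mean(fold_rmse)   # average RMSE across the 10 folds
}
# sapply(1:4, function(d) cv_poly_deg(Auto1, d)) should reproduce cv_poly above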
poly.1 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,1,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.2 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,2,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.3 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,3,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.4 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,4,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly.1, poly.2, poly.3, poly.4, labels=c("Regresi.Linier","Polinomial2", "Polinomial3", "Polinomial4"), label_size = 6)
2. Piecewise Constant
cv_pc <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto1[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks<-cbind(lower = as.numeric(sub("\\((.+),.*","\\1",labs_x)),
upper=as.numeric(sub("[^,]*,([^]]*)\\]","\\1",
labs_x)))
testing <- Auto1[-x$in_id,]
horsepower_new<-cut(testing$horsepower,c(labs_x_breaks[1,1],
labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse<-mlr3measures::rmse(truth=data_eval$truth,response = data_eval$pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the 10-fold average
colMeans(metric_tangga)
})
# display the results for all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
rmse <- as.data.frame(best_tangga %>% slice_min(rmse))
return(rbind(rmse))
}
cv_pc(Auto1)
## breaks rmse
## 1 8 4.408006
Empirically, based on RMSE, the best number of intervals is 8.
ggplot(Auto1,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",formula = y~cut(x,8),lty = 1, col = "blue",se = F) +
theme_bw()
Empirically, the best number of intervals based on the smallest RMSE is 8. Visually, the more intervals are used, the smoother the fit becomes; with 8 intervals the step-function (piecewise constant) curve forms a pattern that follows the spread of the data.
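For reference, the selected 8-interval step function can also be fitted directly on the full data; a hedged sketch (the object names are illustrative):
# Hedged sketch: fit the chosen piecewise-constant model (8 intervals) on all data
Auto1_pc <- Auto1
Auto1_pc$hp_bin <- cut(Auto1_pc$horsepower, 8)   # 8 equal-width intervals
mod_pc8 <- lm(mpg ~ hp_bin, data = Auto1_pc)
coef(mod_pc8)   # fitted constants, expressed relative to the first interval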
3. Natural cubic splines
Determining the number of basis functions
set.seed(2154)
cross.val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 1:8
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
metric.spline3
#average over the 10 folds
mean.metric.spline3 <- colMeans(metric.spline3)
mean.metric.spline3
})
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse)))
basis
## df rmse
## 1 7 4.27459
Determining the knots from the selected number of basis functions, i.e. df = 7
#knot values determined automatically by the computer
attr(ns(Auto1$horsepower, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 68.85714 79.71429 89.57143 98.00000 113.57143 150.00000
Determining the knots in a planned way, i.e. df = 4
#planned knot values
attr(ns(Auto1$horsepower, df=4),"knots")
## 25% 50% 75%
## 75.0 93.5 126.0
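For ns() used inside a formula (no intercept column), the number of interior knots is df minus 1, which is why df = 7 gives the 6 quantile knots above and df = 4 gives 3; a hedged check:
# Hedged check: interior-knot count implied by df for a natural spline basis
length(attr(ns(Auto1$horsepower, df = 7), "knots"))   # expected: 6
length(attr(ns(Auto1$horsepower, df = 4), "knots"))   # expected: 3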
# Knots from the selected basis
mod_spline3ns = lm(mpg ~ ns(horsepower, knots =
c(68.85714,79.71429,89.57143,98,113.57143,150)),data=Auto1)
summary(mod_spline3ns)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(68.85714, 79.71429,
## 89.57143, 98, 113.57143, 150)), data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6158 -2.5868 -0.1339 2.2474 14.4829
##
## Coefficients:
## Estimate
## (Intercept) 33.748
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1 -7.888
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2 -7.844
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3 -14.226
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4 -12.527
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5 -24.049
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6 -19.387
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7 -20.803
## Std. Error
## (Intercept) 1.563
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1 1.694
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2 1.923
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3 1.812
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4 2.077
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5 1.715
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6 3.664
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7 1.761
## t value
## (Intercept) 21.592
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1 -4.657
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2 -4.078
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3 -7.853
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4 -6.032
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5 -14.021
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6 -5.291
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7 -11.816
## Pr(>|t|)
## (Intercept) < 2e-16
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1 4.43e-06
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2 5.52e-05
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3 4.11e-14
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4 3.81e-09
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5 < 2e-16
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6 2.05e-07
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7 < 2e-16
##
## (Intercept) ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))1 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))2 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))3 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))4 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))5 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))6 ***
## ns(horsepower, knots = c(68.85714, 79.71429, 89.57143, 98, 113.57143, 150))7 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.272 on 384 degrees of freedom
## Multiple R-squared: 0.7057, Adjusted R-squared: 0.7004
## F-statistic: 131.6 on 7 and 384 DF, p-value: < 2.2e-16
# Knots from the planned basis
mod_spline3ns1 = lm(mpg ~ ns(horsepower, knots = c(75,93.5,126)),data=Auto1)
summary(mod_spline3ns1)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(75, 93.5, 126)),
## data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5781 -2.4975 -0.2753 2.0543 15.7782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.019 1.326 27.167 <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))1 -15.110 1.308 -11.554 <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))2 -20.243 1.290 -15.692 <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))3 -26.848 3.019 -8.892 <2e-16
## ns(horsepower, knots = c(75, 93.5, 126))4 -21.761 1.569 -13.874 <2e-16
##
## (Intercept) ***
## ns(horsepower, knots = c(75, 93.5, 126))1 ***
## ns(horsepower, knots = c(75, 93.5, 126))2 ***
## ns(horsepower, knots = c(75, 93.5, 126))3 ***
## ns(horsepower, knots = c(75, 93.5, 126))4 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.328 on 387 degrees of freedom
## Multiple R-squared: 0.6957, Adjusted R-squared: 0.6925
## F-statistic: 221.2 on 4 and 387 DF, p-value: < 2.2e-16
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 7:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg~ ns(horsepower,df=i),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
metric_spline3
# compute the average over the 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
})
# display the results for all df values
best_spline3 <- cbind(df=df,best_spline3)
best_spline3
## df rmse
## 1 7 4.274590
## 2 8 4.289782
## 3 9 4.292602
## 4 10 4.273561
## 5 11 4.298625
## 6 12 4.296527
#based on rmse
best_spline3 %>% slice_min(rmse)
## df rmse
## 1 10 4.273561
Empirically, based on RMSE, the best fit is obtained with df = 10.
hp6 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="brown")+
stat_smooth(method = "lm",formula = y~ns(x, knots = c(68.85714,79.71429,89.57143,98,113.57143,150)),lty = 1,se=F) +
theme_bw()
hp3 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="brown")+
stat_smooth(method = "lm",formula = y~ns(x, knots =
c(75,93.5,126)),lty = 1,se=F) + theme_bw()
plot_grid(hp6,hp3)
Visually, the best fit for horsepower is the one with 3 knots, because the fitted line does not bend at the beginning and end of the plot.
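The straighter tails reflect a property of natural cubic splines: the fit is constrained to be linear beyond the boundary knots, which ns() places at the extremes of the data by default. A hedged check:
# Hedged check: ns() places its boundary knots at the range of the data
range(Auto1$horsepower)
attr(ns(Auto1$horsepower, df = 4), "Boundary.knots")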
Model Comparison
perbandinganAutohp <- rbind(best.poly, cv_pc(Auto1)[1,-1],
best_spline3 %>% select(-1) %>% slice_min(rmse))
perbandinganAutohp$metode <- c("Polinomial ordo 3", "Piecewise Constant (breaks=8)", "Natural Cubic Splines 3 Knot (df=10)")
perbandinganAutohp
## rmse metode
## 1 4.352157 Polinomial ordo 3
## 2 4.408006 Piecewise Constant (breaks=8)
## 3 4.273561 Natural Cubic Splines 3 Knot (df=10)
Of the three methods, the one with the smallest RMSE, and therefore the best, is the Natural Cubic Splines 3 Knot (df=10) model.
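A hedged sketch of how the selected model could be refitted on the full data and used for prediction (the horsepower value 150 is illustrative only):
# Hedged sketch: refit the winning natural cubic spline (df = 10) and predict
final_hp <- lm(mpg ~ ns(horsepower, df = 10), data = Auto1)
predict(final_hp, newdata = data.frame(horsepower = 150))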
mpg (Y) vs displacement (X2)
1. Polynomial Regression
cv_poly1 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(displacement,1,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly1<- colMeans(metric_poly1)
return(poly1)
}
cv_poly2 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(displacement,2,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly2<- colMeans(metric_poly2)
return(poly2)
}
cv_poly3 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(displacement,3,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly3<- colMeans(metric_poly3)
return(poly3)
}
cv_poly4 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(displacement,4,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly4<- colMeans(metric_poly4)
return(poly4)
}
cv_poly <- rbind(cv_poly1(Auto1),cv_poly2(Auto1),cv_poly3(Auto1),cv_poly4(Auto1))
cv_poly
## rmse
## [1,] 4.597766
## [2,] 4.325171
## [3,] 4.327439
## [4,] 4.337798
best.poly <- min(cv_poly)
best.poly
## [1] 4.325171
The best polynomial regression model for displacement vs mpg is selected using 10-fold cross-validation (CV). Empirically, based on the RMSE values, the degree-2 polynomial has the smallest RMSE, so the degree-2 polynomial is chosen as the best polynomial regression model.
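A side note on poly(): the raw = TRUE basis used here and the default orthogonal basis give the same fitted values and predictions; the orthogonal version is simply more numerically stable. A hedged check (object names are illustrative):
# Hedged check: raw vs orthogonal polynomial bases give identical fitted values
fit_raw  <- lm(mpg ~ poly(displacement, 2, raw = TRUE), data = Auto1)
fit_orth <- lm(mpg ~ poly(displacement, 2), data = Auto1)
all.equal(fitted(fit_raw), fitted(fit_orth))   # should be TRUE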
poly.1 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,1,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.2 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,2,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.3 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,3,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.4 <- ggplot(Auto1,aes(x=displacement, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,4,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly.1, poly.2, poly.3, poly.4, labels=c("Regresi.Linier","Polinomial2", "Polinomial3", "Polinomial4"), label_size = 6)
2. Piecewise Constant
cv_pc <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto1[x$in_id,]
training$displacement <- cut(training$displacement,i)
mod <- lm(mpg ~ displacement,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks<-cbind(lower = as.numeric(sub("\\((.+),.*","\\1",labs_x)),
upper=as.numeric(sub("[^,]*,([^]]*)\\]","\\1",
labs_x)))
testing <- Auto1[-x$in_id,]
displacement_new<-cut(testing$displacement,c(labs_x_breaks[1,1],
labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(displacement=displacement_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse<-mlr3measures::rmse(truth=data_eval$truth,response = data_eval$pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the 10-fold average
colMeans(metric_tangga)
})
# display the results for all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
rmse <- as.data.frame(best_tangga %>% slice_min(rmse))
return(rbind(rmse))
}
cv_pc(Auto1)
## breaks rmse
## 1 9 4.220435
Empirically, based on RMSE, the best number of intervals is 9.
ggplot(Auto1,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",formula = y~cut(x,9),lty = 1, col = "blue",se = F) +
theme_bw()
Empirically, the best number of intervals based on the smallest RMSE is 9. Visually, the more intervals are used, the smoother the fit becomes; with 9 intervals the step-function (piecewise constant) curve forms a pattern that follows the spread of the data.
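The na.omit() step in cv_pc() is needed because cut() codes test values that fall outside the training intervals as NA; a hedged illustration with made-up numbers:
# Hedged illustration: values outside the supplied breaks are coded NA by cut()
cut(c(100, 200, 300), breaks = c(150, 250, 350))   # 100 falls outside and becomes NA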
3. Natural cubic splines
Determining the number of basis functions
set.seed(2154)
cross.val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 2:9
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,
function(x){
mod <- lm(mpg ~ ns(displacement,df=i),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
metric.spline3
#average over the 10 folds
mean.metric.spline3 <- colMeans(metric.spline3)
mean.metric.spline3
})
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse)))
basis
## df rmse
## 1 9 4.169282
Determining the knots from the selected number of basis functions, i.e. df = 9
#knot values determined automatically by the computer
attr(ns(Auto1$displacement, df=9),"knots")
## 11.11111% 22.22222% 33.33333% 44.44444% 55.55556% 66.66667% 77.77778% 88.88889%
## 91.0000 98.0000 119.0000 140.0000 173.0000 232.0000 302.2222 350.0000
Determining the knots in a planned way, i.e. df = 4
#planned knot values
attr(ns(Auto1$displacement, df=4),"knots")
## 25% 50% 75%
## 105.00 151.00 275.75
# Knots from the selected basis
mod_spline3ns = lm(mpg ~ ns(displacement, knots =
c(91,98,119,140,173,232,302.222,350)),data=Auto1)
summary(mod_spline3ns)
##
## Call:
## lm(formula = mpg ~ ns(displacement, knots = c(91, 98, 119, 140,
## 173, 232, 302.222, 350)), data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.490 -2.441 -0.441 2.118 20.226
##
## Coefficients:
## Estimate
## (Intercept) 25.5644
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1 3.0660
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2 -0.2712
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3 0.9362
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4 -4.0868
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5 -6.0371
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6 -9.9299
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7 -15.4379
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8 -1.9057
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 -18.2925
## Std. Error
## (Intercept) 1.6984
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1 1.7769
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2 2.2002
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3 2.0436
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4 2.5610
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5 2.1810
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6 2.1083
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7 1.6746
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8 3.9042
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 1.7370
## t value
## (Intercept) 15.052
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1 1.725
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2 -0.123
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3 0.458
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4 -1.596
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5 -2.768
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6 -4.710
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7 -9.219
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8 -0.488
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 -10.531
## Pr(>|t|)
## (Intercept) < 2e-16
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1 0.08526
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2 0.90196
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3 0.64713
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4 0.11135
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5 0.00591
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6 3.47e-06
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7 < 2e-16
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8 0.62574
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 < 2e-16
##
## (Intercept) ***
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))1 .
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))2
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))3
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))4
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))5 **
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))6 ***
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))7 ***
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))8
## ns(displacement, knots = c(91, 98, 119, 140, 173, 232, 302.222, 350))9 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.163 on 382 degrees of freedom
## Multiple R-squared: 0.722, Adjusted R-squared: 0.7155
## F-statistic: 110.2 on 9 and 382 DF, p-value: < 2.2e-16
# Knots from the planned basis
mod_spline3ns1 = lm(mpg ~ ns(displacement, knots=c(105,151,275.75)),data=Auto1)
summary(mod_spline3ns1)
##
## Call:
## lm(formula = mpg ~ ns(displacement, knots = c(105, 151, 275.75)),
## data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7432 -2.3595 -0.2352 2.0952 20.4131
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 33.999 1.070 31.761
## ns(displacement, knots = c(105, 151, 275.75))1 -12.157 1.268 -9.588
## ns(displacement, knots = c(105, 151, 275.75))2 -17.094 1.346 -12.696
## ns(displacement, knots = c(105, 151, 275.75))3 -24.535 2.502 -9.808
## ns(displacement, knots = c(105, 151, 275.75))4 -18.385 1.341 -13.706
## Pr(>|t|)
## (Intercept) <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))1 <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))2 <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))3 <2e-16 ***
## ns(displacement, knots = c(105, 151, 275.75))4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.368 on 387 degrees of freedom
## Multiple R-squared: 0.6901, Adjusted R-squared: 0.6869
## F-statistic: 215.4 on 4 and 387 DF, p-value: < 2.2e-16
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 3:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg~ ns(displacement,df=i),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
metric_spline3
# compute the average over the 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
})
# display the results for all df values
best_spline3 <- cbind(df=df,best_spline3)
best_spline3
## df rmse
## 1 3 4.325494
## 2 4 4.340964
## 3 5 4.301844
## 4 6 4.226619
## 5 7 4.194683
## 6 8 4.188847
## 7 9 4.169282
## 8 10 4.160931
## 9 11 4.126660
## 10 12 4.116457
#based on rmse
best_spline3 %>% slice_min(rmse)
## df rmse
## 1 12 4.116457
Empirically, based on RMSE, the best fit is obtained with df = 12.
dp8 <- ggplot(Auto1,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="brown")+
stat_smooth(method = "lm",formula = y~ns(x, knots = c(91,98,119,140,173,232,302.222,350)),lty = 1,se=F) +
theme_bw()
dp3 <- ggplot(Auto1,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="brown")+
stat_smooth(method = "lm",formula = y~ns(x, knots =
c(105,151,275.75)),lty = 1,se=F) + theme_bw()
plot_grid(dp8,dp3)
Visually, the best fit for displacement is the one with 3 knots, because the fitted line does not bend at the beginning and end of the plot.
Model Comparison
perbandinganAutodp <- rbind(best.poly, cv_pc(Auto1)[1,-1],
best_spline3 %>% select(-1) %>% slice_min(rmse))
perbandinganAutodp$metode <- c("Polinomial ordo 2", "Piecewise Constant (breaks=9)", "Natural Cubic Splines 3 Knot (df=12)")
perbandinganAutodp
## rmse metode
## 1 4.325171 Polinomial ordo 2
## 2 4.220435 Piecewise Constant (breaks=9)
## 3 4.116457 Natural Cubic Splines 3 Knot (df=12)
Of the three methods, the one with the smallest RMSE, and therefore the best, is the Natural Cubic Splines 3 Knot (df=12) model.
mpg (Y) vs weight (X3)
1. Polynomial Regression
cv_poly1 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly1 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(weight,1,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly1<- colMeans(metric_poly1)
return(poly1)
}
cv_poly2 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly2 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(weight,2,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly2<- colMeans(metric_poly2)
return(poly2)
}
cv_poly3 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(weight,3,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly3<- colMeans(metric_poly3)
return(poly3)
}
cv_poly4 <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
metric_poly4 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(weight,4,raw = T),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the average RMSE across the 10 folds
poly4<- colMeans(metric_poly4)
return(poly4)
}
cv_poly <- rbind(cv_poly1(Auto1),cv_poly2(Auto1),cv_poly3(Auto1),cv_poly4(Auto1))
cv_poly
## rmse
## [1,] 4.300195
## [2,] 4.152781
## [3,] 4.158427
## [4,] 4.165461
best.poly <- min(cv_poly)
best.poly
## [1] 4.152781
The best polynomial regression model for weight vs mpg is selected using 10-fold cross-validation (CV). Empirically, based on the RMSE values, the degree-2 polynomial has the smallest RMSE, so the degree-2 polynomial is chosen as the best polynomial regression model.
poly.1 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,1,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.2 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,2,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.3 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,3,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
poly.4 <- ggplot(Auto1,aes(x=weight, y=mpg)) + geom_point(alpha=0.55,
color="brown") + stat_smooth(method = "lm", formula
=y~poly(x,4,raw=T), lty = 1, col = "blue",se = F)+
theme_bw()
plot_grid(poly.1, poly.2, poly.3, poly.4, labels=c("Regresi.Linier","Polinomial2", "Polinomial3", "Polinomial4"), label_size = 6)
2. Piecewise Constant
cv_pc <- function(Auto1){
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
breaks <- 3:12
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,function(x){
training <- Auto1[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks<-cbind(lower = as.numeric(sub("\\((.+),.*","\\1",labs_x)),
upper=as.numeric(sub("[^,]*,([^]]*)\\]","\\1",
labs_x)))
testing <- Auto1[-x$in_id,]
weight_new<-cut(testing$weight,c(labs_x_breaks[1,1],
labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse<-mlr3measures::rmse(truth=data_eval$truth,response = data_eval$pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
#compute the 10-fold average
colMeans(metric_tangga)
})
# display the results for all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
rmse <- as.data.frame(best_tangga %>% slice_min(rmse))
return(rbind(rmse))
}
cv_pc(Auto1)
## breaks rmse
## 1 12 4.18149
Empirically, based on RMSE, the best number of intervals is 12.
ggplot(Auto1,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",formula = y~cut(x,12),lty = 1, col = "blue",se = F) +
theme_bw()
Empirically, the best number of intervals based on the smallest RMSE is 12. Visually, the more intervals are used, the smoother the fit becomes; with 12 intervals the step-function (piecewise constant) curve forms a pattern that follows the spread of the data.
3. Natural cubic splines
Determining the number of basis functions
set.seed(2154)
cross.val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 2:12
best.spline3 <- map_dfr(df, function(i){
metric.spline3 <- map_dfr(cross.val$splits,function(x){
mod <- lm(mpg ~ ns(weight,df=i),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
metric.spline3
#average over the 10 folds
mean.metric.spline3 <- colMeans(metric.spline3)
mean.metric.spline3
})
best.spline3 <- cbind(df=df,best.spline3)
basis<-data.frame(rbind(best.spline3 %>% slice_min(rmse)))
basis
## df rmse
## 1 7 4.151979
Determining the knots from the selected number of basis functions, i.e. df = 7
#knot values determined automatically by the computer
attr(ns(Auto1$weight, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
Determining the knots in a planned way, i.e. df = 4
#planned knot values
attr(ns(Auto1$weight, df=4),"knots")
## 25% 50% 75%
## 2225.25 2803.50 3614.75
# Knots from the selected basis
mod_spline3ns = lm(mpg ~ ns(weight, knots =
c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)),
data=Auto1)
summary(mod_spline3ns)
##
## Call:
## lm(formula = mpg ~ ns(weight, knots = c(2074.857, 2285.429, 2635,
## 2986.571, 3446.143, 4096.286)), data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3314 -2.5302 -0.4222 1.8281 16.3921
##
## Coefficients:
## Estimate
## (Intercept) 31.575
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1 -6.249
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2 -4.599
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3 -10.661
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4 -13.286
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5 -19.001
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6 -15.370
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7 -22.327
## Std. Error
## (Intercept) 2.038
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1 1.933
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2 2.514
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3 2.255
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4 2.378
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5 1.727
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6 4.628
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7 1.958
## t value
## (Intercept) 15.495
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1 -3.233
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2 -1.829
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3 -4.728
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4 -5.587
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5 -11.005
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6 -3.321
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7 -11.401
## Pr(>|t|)
## (Intercept) < 2e-16
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1 0.001332
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2 0.068144
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3 3.19e-06
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4 4.39e-08
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5 < 2e-16
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6 0.000982
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7 < 2e-16
##
## (Intercept) ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))1 **
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))2 .
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))3 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))4 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))5 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))6 ***
## ns(weight, knots = c(2074.857, 2285.429, 2635, 2986.571, 3446.143, 4096.286))7 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.143 on 384 degrees of freedom
## Multiple R-squared: 0.7233, Adjusted R-squared: 0.7183
## F-statistic: 143.4 on 7 and 384 DF, p-value: < 2.2e-16
# Knots from the planned basis
mod_spline3ns1 = lm(mpg ~ ns(weight, knots = c(2225.25,2803.5,614.75)),
data=Auto1)
summary(mod_spline3ns1)
##
## Call:
## lm(formula = mpg ~ ns(weight, knots = c(2225.25, 2803.5, 614.75)),
## data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7276 -2.7113 -0.4017 1.8321 16.2330
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) -41.063 4.220 -9.730
## ns(weight, knots = c(2225.25, 2803.5, 614.75))1 57.013 3.044 18.728
## ns(weight, knots = c(2225.25, 2803.5, 614.75))2 16.079 2.332 6.895
## ns(weight, knots = c(2225.25, 2803.5, 614.75))3 149.194 9.914 15.049
## ns(weight, knots = c(2225.25, 2803.5, 614.75))4 NA NA NA
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))1 < 2e-16 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))2 2.19e-11 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))3 < 2e-16 ***
## ns(weight, knots = c(2225.25, 2803.5, 614.75))4 NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.18 on 388 degrees of freedom
## Multiple R-squared: 0.7154, Adjusted R-squared: 0.7132
## F-statistic: 325.1 on 3 and 388 DF, p-value: < 2.2e-16
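The aliased coefficient above (and the rank-deficient-fit warnings further down) most likely come from the knot 614.75, which looks like a typo for the 75% quantile 3614.75 reported earlier; a hedged refit with the quantile knots:
# Hedged refit assuming the intended third knot was 3614.75 (the 75% quantile above)
mod_spline3ns1_fix <- lm(mpg ~ ns(weight, knots = c(2225.25, 2803.5, 3614.75)), data = Auto1)
# summary(mod_spline3ns1_fix) should no longer report coefficients
# "not defined because of singularities"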
set.seed(2154)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 7:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg~ ns(weight,df=i),data=Auto1[x$in_id,])
pred <- predict(mod,newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth, response = pred)
metric <- c(rmse)
names(metric) <- c("rmse")
return(metric)
}
)
metric_spline3
# compute the average over the 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
})
# display the results for all df values
best_spline3 <- cbind(df=df,best_spline3)
best_spline3
## df rmse
## 1 7 4.151979
## 2 8 4.171000
## 3 9 4.180805
## 4 10 4.166101
## 5 11 4.168050
## 6 12 4.186651
#based on rmse
best_spline3 %>% slice_min(rmse)
## df rmse
## 1 7 4.151979
Empirically, based on RMSE, the best fit is obtained with df = 7.
w6 <- ggplot(Auto1,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="brown")+
stat_smooth(method = "lm",formula = y~ns(x, knots = c(2074.857,2285.429,2635.000,2986.571,3446.143,4096.286)),lty =
1,se=F) + theme_bw()
w3 <- ggplot(Auto1,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="brown")+
stat_smooth(method = "lm",formula = y~ns(x, knots =
c(2225.25,2803.5,614.75)),lty = 1,se=F) + theme_bw()
plot_grid(w6,w3)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
Visually, the best fit for weight is the one with 3 knots, because the fitted line does not bend at the beginning and end of the plot.
Model Comparison
perbandinganAutow <- rbind(best.poly, cv_pc(Auto1)[1,-1],
best_spline3 %>% select(-1) %>% slice_min(rmse))
perbandinganAutow$metode <- c("Polinomial ordo 2", "Piecewise Constant (breaks=12)", "Natural Cubic Splines 3 Knot (df=7)")
perbandinganAutow
## rmse metode
## 1 4.152781 Polinomial ordo 2
## 2 4.181490 Piecewise Constant (breaks=12)
## 3 4.151979 Natural Cubic Splines 3 Knot (df=7)
Of the three methods, the one with the smallest RMSE, and therefore the best, is the Natural Cubic Splines 3 Knot (df=7) model.
Plots of the Best Models
hp_ncs3 <- ggplot(Auto1,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55,color="brown") +
stat_smooth(method = "lm",formula = y ~ ns(x, knots =
c(75,93.5,126)),lty = 1,se=F) + theme_bw() +
labs(title = "NCS 3 Knots (df=10)",subtitle = "mpg vs horsepower")
dp_ncs3 <- ggplot(Auto1,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",formula = y~ns(x, knots = c(105,151,275.75)),lty = 1,se=F) + theme_bw() +
labs(title = "NCS 3 Knots (df=12)", subtitle = "mpg vs displacement")
w_ncs3 <- ggplot(Auto1,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="brown") +
stat_smooth(method = "lm",formula = y~ns(x, knots =
c(2225.25,2803.5,614.75)),lty = 1,se=F) +
theme_bw() +
labs(title = "NCS 3 Knots (df=7)",subtitle = "mpg vs weight")
plot_grid(hp_ncs3, dp_ncs3, w_ncs3)
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
Model Comparison
A <- min(perbandinganAutohp[3,-2])
B <- min(perbandinganAutodp[3,-2])
C <- min(perbandinganAutow[3,-2])
perbandinganAll <- rbind(A,B,C)
perbandinganAll$metode <- c("Natural Cubic Splines 3 Knot (df=10)", "Natural Cubic Splines 3 Knot (df=12)","Natural Cubic Splines 3 Knot (df=7)")
## Warning in perbandinganAll$metode <- c("Natural Cubic Splines 3 Knot (df=10)", :
## Coercing LHS to a list
perbandinganAll
## [[1]]
## [1] 4.273561
##
## [[2]]
## [1] 4.116457
##
## [[3]]
## [1] 4.151979
##
## $metode
## [1] "Natural Cubic Splines 3 Knot (df=10)"
## [2] "Natural Cubic Splines 3 Knot (df=12)"
## [3] "Natural Cubic Splines 3 Knot (df=7)"
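The list-coercion warning above can be avoided by collecting the results in a data frame instead; a hedged alternative (perbandinganAll_df is an illustrative name):
# Hedged alternative: assemble the final comparison as a data frame
perbandinganAll_df <- data.frame(
  rmse   = c(A, B, C),
  metode = c("Natural Cubic Splines 3 Knot (df=10)",
             "Natural Cubic Splines 3 Knot (df=12)",
             "Natural Cubic Splines 3 Knot (df=7)")
)
perbandinganAll_df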
Conclusion:
Of the three predictor variables, the smallest RMSE against mpg is obtained with X2 (displacement), with a best value of 4.116457.