library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
library(ISLR)
library(ggplot2)
attach(Auto)## The following object is masked from package:ggplot2:
##
## mpg
head(Auto)## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
Auto1 <- data.frame(cbind(mpg, weight))
ggplot(Auto1,aes(x=weight, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()mod_linear = lm(mpg~weight,data=Auto1)
summary(mod_linear)##
## Call:
## lm(formula = mpg ~ weight, data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9736 -2.7556 -0.3358 2.1379 16.5194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.216524 0.798673 57.87 <2e-16 ***
## weight -0.007647 0.000258 -29.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.333 on 390 degrees of freedom
## Multiple R-squared: 0.6926, Adjusted R-squared: 0.6918
## F-statistic: 878.8 on 1 and 390 DF, p-value: < 2.2e-16
set.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
breaks <- 2:10
best_poly <- map_dfr(breaks, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(weight,i),
data=Auto1[x$in_id,])
pred <- predict(mod,
newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
})
best_poly <- cbind(breaks=breaks, best_poly)
# menampilkan hasil all breaks
best_poly## breaks rmse mae
## 1 2 4.149977 3.060704
## 2 3 4.158482 3.068020
## 3 4 4.165585 3.067210
## 4 5 4.169760 3.079497
## 5 6 4.178027 3.098482
## 6 7 4.170375 3.100742
## 7 8 4.181430 3.108979
## 8 9 4.254235 3.164729
## 9 10 4.299992 3.167556
best_poly%>% slice_min(rmse)## breaks rmse mae
## 1 2 4.149977 3.060704
best_poly%>% slice_min(mae)## breaks rmse mae
## 1 2 4.149977 3.060704
mod_polinomial=lm(mpg~poly(weight, 2, raw=T), data = Auto1)
mod_polinomial = lm(Auto1$mpg~ poly(Auto$weight,2,raw = T),data=Auto1)
Auto1best<-ggplot(Auto,aes(x=Auto1$weight, y=Auto1$mpg)) +
ggtitle("Polinomial Ordo 2")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+ theme_bw()
Auto1bestset.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto1[x$in_id,]
training$weight <- cut(training$weight,i)
mod <- lm(mpg ~ weight,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto1[-x$in_id,]
weight_new <- cut(testing$weight,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(weight=weight_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga## breaks rmse mae
## 1 3 4.868847 3.657505
## 2 4 4.706593 3.633584
## 3 5 4.389515 3.221556
## 4 6 4.201264 3.112221
## 5 7 4.279395 3.191726
## 6 8 4.370964 3.249765
## 7 9 4.341743 3.182396
## 8 10 4.329114 3.176390
best_tangga %>% slice_min(rmse)## breaks rmse mae
## 1 6 4.201264 3.112221
best_tangga %>% slice_min(mae)## breaks rmse mae
## 1 6 4.201264 3.112221
mod_tangga = lm(mpg ~ cut(weight,5),data=Auto1)
summary(mod_tangga)##
## Call:
## lm(formula = mpg ~ cut(weight, 5), data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3101 -2.3101 -0.1055 1.8971 18.2945
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.3101 0.4048 77.34 <2e-16 ***
## cut(weight, 5)(2.32e+03,3.02e+03] -6.2046 0.5841 -10.62 <2e-16 ***
## cut(weight, 5)(3.02e+03,3.73e+03] -12.2930 0.6485 -18.96 <2e-16 ***
## cut(weight, 5)(3.73e+03,4.43e+03] -16.2148 0.6881 -23.56 <2e-16 ***
## cut(weight, 5)(4.43e+03,5.14e+03] -18.5184 0.9882 -18.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.416 on 387 degrees of freedom
## Multiple R-squared: 0.6831, Adjusted R-squared: 0.6798
## F-statistic: 208.6 on 4 and 387 DF, p-value: < 2.2e-16
Auto1abest<- ggplot(Auto1,aes(x=mpg, y=weight)) +
ggtitle("Fungsi Tangga Knot 5")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
Auto1abestset.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 6:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(weight,df=i),
data=Auto1[x$in_id,])
pred <- predict(mod,
newdata=Auto1[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2227.5, `50%` = 2792.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`25%` = 2234, `50%` = 2835, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2155, `40%` = 2577.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`20%` = 2166.8, `40%` = 2604, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2125, `33.33333%` =
## 2391, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`16.66667%` = 2130, `33.33333%` =
## 2406, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2076.42857142857, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`14.28571%` = 2102.28571428571, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2049.375, `25%` =
## 2227.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`12.5%` = 2051, `25%` = 2234, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`11.11111%` = 2020, `22.22222%` =
## 2190, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`11.11111%` = 2021.66666666667, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`10%` = 1995.5, `20%` = 2155, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(weight, degree = 3L, knots = c(`10%` = 1996, `20%` = 2166.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 6 4.169320 3.097698
## 2 7 4.182779 3.098524
## 3 8 4.182940 3.120736
## 4 9 4.140723 3.065880
## 5 10 4.156925 3.096623
## 6 11 4.168753 3.095133
## 7 12 4.172423 3.093106
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 9 4.140723 3.06588
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 9 4.140723 3.06588
mod_spline3 = lm(mpg ~ ns(weight, df=9), data=Auto1)
summary(mod_spline3)##
## Call:
## lm(formula = mpg ~ ns(weight, df = 9), data = Auto1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3376 -2.5092 -0.4385 1.8081 16.4843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.211 2.345 13.738 < 2e-16 ***
## ns(weight, df = 9)1 -2.185 2.242 -0.975 0.33028
## ns(weight, df = 9)2 -7.507 2.936 -2.557 0.01094 *
## ns(weight, df = 9)3 -5.201 2.594 -2.006 0.04561 *
## ns(weight, df = 9)4 -11.233 2.766 -4.060 5.95e-05 ***
## ns(weight, df = 9)5 -12.573 2.663 -4.721 3.29e-06 ***
## ns(weight, df = 9)6 -15.798 2.687 -5.881 8.93e-09 ***
## ns(weight, df = 9)7 -19.209 1.932 -9.944 < 2e-16 ***
## ns(weight, df = 9)8 -17.570 5.328 -3.298 0.00107 **
## ns(weight, df = 9)9 -22.259 2.202 -10.107 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.159 on 382 degrees of freedom
## Multiple R-squared: 0.7226, Adjusted R-squared: 0.7161
## F-statistic: 110.6 on 9 and 382 DF, p-value: < 2.2e-16
Auto1bbest <-ggplot(Auto1,aes(x=weight, y=mpg)) +
ggtitle("Regresi Spline df=9")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~ns(x, df=9),
lty = 1,se = F)
Auto1bbestlibrary(gridExtra)##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(Auto1best, Auto1abest, Auto1bbest, ncol=2) Model terbaik untuk data Auto adalah Regresi Polinomial Ordo 2.
Auto2 <- data.frame(cbind(mpg, acceleration))
ggplot(Auto2,aes(x=acceleration, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()mod_linear = lm(mpg~acceleration,data=Auto2)
summary(mod_linear)##
## Call:
## lm(formula = mpg ~ acceleration, data = Auto2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.989 -5.616 -1.199 4.801 23.239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.8332 2.0485 2.359 0.0188 *
## acceleration 1.1976 0.1298 9.228 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.08 on 390 degrees of freedom
## Multiple R-squared: 0.1792, Adjusted R-squared: 0.1771
## F-statistic: 85.15 on 1 and 390 DF, p-value: < 2.2e-16
set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")
breaks <- 2:10
best_poly <- map_dfr(breaks, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(acceleration,i),
data=Auto2[x$in_id,])
pred <- predict(mod,
newdata=Auto2[-x$in_id,])
truth <- Auto2[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
})
best_poly <- cbind(breaks=breaks, best_poly)
# menampilkan hasil all breaks
best_poly## breaks rmse mae
## 1 2 7.037689 5.755305
## 2 3 7.048668 5.792450
## 3 4 7.008647 5.700795
## 4 5 7.042178 5.729396
## 5 6 7.046243 5.704385
## 6 7 7.097228 5.711475
## 7 8 7.167747 5.767367
## 8 9 7.150072 5.744992
## 9 10 7.164808 5.754932
best_poly%>% slice_min(rmse)## breaks rmse mae
## 1 4 7.008647 5.700795
best_poly%>% slice_min(mae)## breaks rmse mae
## 1 4 7.008647 5.700795
mod_polinomial=lm(mpg~poly(acceleration, 4, raw=T), data = Auto2)
mod_polinomial = lm(Auto1$mpg~ poly(Auto$acceleration,4,raw = T),data=Auto2)
Auto2best<-ggplot(Auto2,aes(x=Auto2$acceleration, y=Auto2$mpg)) +
ggtitle("Polinomial Ordo 4")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,4,raw=T),
lty = 1, col = "blue",se = F)+ theme_bw()
Auto2best## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.
## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.
## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.
## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.
set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto2[x$in_id,]
training$acceleration <- cut(training$acceleration,i)
mod <- lm(mpg ~ acceleration,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto2[-x$in_id,]
acceleration_new <- cut(testing$acceleration,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(acceleration=acceleration_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga## breaks rmse mae
## 1 3 7.003365 5.637141
## 2 4 7.243047 5.879089
## 3 5 7.240624 5.948255
## 4 6 6.987921 5.640505
## 5 7 7.202002 5.846671
## 6 8 7.018944 5.692063
## 7 9 7.012388 5.626494
## 8 10 7.145911 5.803971
best_tangga %>% slice_min(rmse)## breaks rmse mae
## 1 6 6.987921 5.640505
best_tangga %>% slice_min(mae)## breaks rmse mae
## 1 9 7.012388 5.626494
mod_tangga = lm(mpg ~ cut(acceleration,5),data=Auto2)
summary(mod_tangga)##
## Call:
## lm(formula = mpg ~ cut(acceleration, 5), data = Auto2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.637 -5.879 -1.186 5.155 23.721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.186 1.564 9.708 < 2e-16 ***
## cut(acceleration, 5)(11.4,14.7] 5.694 1.682 3.386 0.000782 ***
## cut(acceleration, 5)(14.7,18.1] 10.057 1.659 6.064 3.17e-09 ***
## cut(acceleration, 5)(18.1,21.4] 10.452 1.834 5.698 2.41e-08 ***
## cut(acceleration, 5)(21.4,24.8] 16.760 2.668 6.282 9.02e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.169 on 387 degrees of freedom
## Multiple R-squared: 0.1651, Adjusted R-squared: 0.1564
## F-statistic: 19.13 on 4 and 387 DF, p-value: 2.277e-14
Auto2abest<- ggplot(Auto2,aes(x=mpg, y=acceleration)) +
ggtitle("Fungsi Tangga Knot 5")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
Auto2abestmod_tangga = lm(mpg ~ cut(acceleration,8),data=Auto2)
summary(mod_tangga)##
## Call:
## lm(formula = mpg ~ cut(acceleration, 8), data = Auto2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.4630 -5.2518 -0.7662 4.5657 24.6519
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.600 2.198 6.642 1.06e-10 ***
## cut(acceleration, 8)(10.1,12.2] 1.712 2.500 0.685 0.4940
## cut(acceleration, 8)(12.2,14.3] 5.348 2.333 2.292 0.0224 *
## cut(acceleration, 8)(14.3,16.4] 10.548 2.279 4.628 5.05e-06 ***
## cut(acceleration, 8)(16.4,18.5] 10.863 2.330 4.663 4.31e-06 ***
## cut(acceleration, 8)(18.5,20.6] 12.184 2.470 4.932 1.21e-06 ***
## cut(acceleration, 8)(20.6,22.7] 12.454 2.924 4.260 2.58e-05 ***
## cut(acceleration, 8)(22.7,24.8] 19.800 4.112 4.815 2.12e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.951 on 384 degrees of freedom
## Multiple R-squared: 0.2211, Adjusted R-squared: 0.2069
## F-statistic: 15.57 on 7 and 384 DF, p-value: < 2.2e-16
Auto2bbest<- ggplot(Auto2,aes(x=mpg, y=acceleration)) +
ggtitle("Fungsi Tangga Knot 8")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,8),
lty = 1, col = "blue",se = F)+
theme_bw()
Auto2bbestset.seed(123)
cross_val <- vfold_cv(Auto1,v=10,strata = "mpg")
df <- 6:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(acceleration,df=i),
data=Auto2[x$in_id,])
pred <- predict(mod,
newdata=Auto2[-x$in_id,])
truth <- Auto1[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)## Warning in bs(acceleration, degree = 3L, knots = c(`25%` = 13.7, `50%` = 15.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`25%` = 13.8, `50%` = 15.5, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`20%` = 13.42, `40%` =
## 14.8, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`20%` = 13.5, `40%` = 14.8, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`16.66667%` = 13, `33.33333%`
## = 14.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`16.66667%` = 13, `33.33333%`
## = 14.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`14.28571%` = 12.8,
## `28.57143%` = 14, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`14.28571%` = 12.8,
## `28.57143%` = 14, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`12.5%` = 12.5, `25%` =
## 13.7, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`12.5%` = 12.5, `25%` =
## 13.8, : some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`11.11111%` = 12.2,
## `22.22222%` = 13.5, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`11.11111%` = 12.5,
## `22.22222%` = 13.5, : some 'x' values beyond boundary knots may cause ill-
## conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`10%` = 12, `20%` = 13.42, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(acceleration, degree = 3L, knots = c(`10%` = 12.02, `20%` =
## 13.5, : some 'x' values beyond boundary knots may cause ill-conditioned bases
best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 6 7.027505 5.700351
## 2 7 7.014061 5.636830
## 3 8 7.039149 5.664311
## 4 9 7.044218 5.686154
## 5 10 7.019215 5.651353
## 6 11 7.075200 5.697487
## 7 12 7.061283 5.700106
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 7 7.014061 5.63683
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 7 7.014061 5.63683
mod_spline3 = lm(mpg ~ ns(acceleration, df=7), data=Auto2)
summary(mod_spline3)##
## Call:
## lm(formula = mpg ~ ns(acceleration, df = 7), data = Auto2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0600 -4.9927 -0.6968 4.8321 23.3231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.450 3.530 4.094 5.18e-05 ***
## ns(acceleration, df = 7)1 6.727 3.410 1.973 0.04923 *
## ns(acceleration, df = 7)2 13.110 4.114 3.187 0.00156 **
## ns(acceleration, df = 7)3 7.781 3.793 2.051 0.04091 *
## ns(acceleration, df = 7)4 13.760 3.900 3.528 0.00047 ***
## ns(acceleration, df = 7)5 7.985 3.033 2.633 0.00882 **
## ns(acceleration, df = 7)6 15.990 8.223 1.945 0.05254 .
## ns(acceleration, df = 7)7 21.223 3.944 5.381 1.29e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.915 on 384 degrees of freedom
## Multiple R-squared: 0.2292, Adjusted R-squared: 0.2151
## F-statistic: 16.31 on 7 and 384 DF, p-value: < 2.2e-16
Auto2cbest <-ggplot(Auto2,aes(x=acceleration, y=mpg)) +
ggtitle("Regresi Spline df=7")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~ns(x, df=7),
lty = 1,se = F)
Auto2cbestlibrary(gridExtra)
grid.arrange(Auto2best, Auto2abest, Auto2bbest, Auto2cbest)## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.
## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.
## Warning: Use of `Auto2$acceleration` is discouraged. Use `acceleration` instead.
## Warning: Use of `Auto2$mpg` is discouraged. Use `mpg` instead.
Model terbaik untuk data Auto adalah Regresi Polinomial Ordo 4.
Auto3 <- data.frame(cbind(mpg, year))
ggplot(Auto3,aes(x=year, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()mod_linear = lm(mpg~year,data=Auto3)
summary(mod_linear)##
## Call:
## lm(formula = mpg ~ year, data = Auto3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0212 -5.4411 -0.4412 4.9739 18.2088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -70.01167 6.64516 -10.54 <2e-16 ***
## year 1.23004 0.08736 14.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.363 on 390 degrees of freedom
## Multiple R-squared: 0.337, Adjusted R-squared: 0.3353
## F-statistic: 198.3 on 1 and 390 DF, p-value: < 2.2e-16
set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")
breaks <- 2:10
best_poly <- map_dfr(breaks, function(i){
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(year,i),
data=Auto3[x$in_id,])
pred <- predict(mod,
newdata=Auto3[-x$in_id,])
truth <- Auto3[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
})
best_poly <- cbind(breaks=breaks, best_poly)
# menampilkan hasil all breaks
best_poly## breaks rmse mae
## 1 2 6.222687 5.271367
## 2 3 6.230273 5.290035
## 3 4 6.226953 5.261184
## 4 5 6.246170 5.271508
## 5 6 6.236244 5.251166
## 6 7 6.177974 5.211158
## 7 8 6.191036 5.210099
## 8 9 6.150086 5.210250
## 9 10 6.081733 5.203132
best_poly%>% slice_min(rmse)## breaks rmse mae
## 1 10 6.081733 5.203132
best_poly%>% slice_min(mae)## breaks rmse mae
## 1 10 6.081733 5.203132
mod_polinomial=lm(mpg~poly(year, 10, raw=T), data = Auto3)
mod_polinomial = lm(Auto3$mpg~ poly(Auto3$year,10,raw = T),data=Auto3)
Auto3best<-ggplot(Auto3,aes(x=Auto3$year, y=Auto3$mpg)) +
ggtitle("Polinomial Ordo 10")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,10,raw=T),
lty = 1, col = "blue",se = F)+ theme_bw()
Auto3best## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.
## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.
## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.
## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
set.seed(123)
cross_val <- vfold_cv(Auto2,v=10,strata = "mpg")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto3[x$in_id,]
training$year <- cut(training$year,i)
mod <- lm(mpg ~ year,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto3[-x$in_id,]
year_new <- cut(testing$year,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(year=year_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga## breaks rmse mae
## 1 3 6.436057 5.346212
## 2 4 6.116266 5.159484
## 3 5 6.215227 5.233387
## 4 6 6.430025 5.386850
## 5 7 6.361801 5.339819
## 6 8 6.121254 5.204424
## 7 9 6.214099 5.259250
## 8 10 6.143947 5.216197
best_tangga %>% slice_min(rmse)## breaks rmse mae
## 1 4 6.116266 5.159484
best_tangga %>% slice_min(mae)## breaks rmse mae
## 1 4 6.116266 5.159484
mod_tangga = lm(mpg ~ cut(year,3),data=Auto3)
summary(mod_tangga)##
## Call:
## lm(formula = mpg ~ cut(year, 3), data = Auto3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.72 -5.22 -1.04 4.78 20.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.2200 0.5196 36.993 < 2e-16 ***
## cut(year, 3)(74,78] 3.1409 0.7657 4.102 4.99e-05 ***
## cut(year, 3)(78,82] 11.0046 0.7907 13.918 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.363 on 389 degrees of freedom
## Multiple R-squared: 0.3387, Adjusted R-squared: 0.3353
## F-statistic: 99.62 on 2 and 389 DF, p-value: < 2.2e-16
Auto3abest<- ggplot(Auto3,aes(x=mpg, y=year)) +
ggtitle("Fungsi Tangga Knot 3")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,3),
lty = 1, col = "blue",se = F)+
theme_bw()
Auto3abestset.seed(123)
cross_val <- vfold_cv(Auto3,v=10,strata = "mpg")
df <- 6:12
best_spline3 <- map_dfr(df, function(i){
metric_spline3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ bs(year,df=i),
data=Auto3[x$in_id,])
pred <- predict(mod,
newdata=Auto3[-x$in_id,])
truth <- Auto3[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline3
# menghitung rata-rata untuk 10 folds
mean_metric_spline3 <- colMeans(metric_spline3)
mean_metric_spline3
}
)
best_spline3 <- cbind(df=df,best_spline3)
# menampilkan hasil all breaks
best_spline3## df rmse mae
## 1 6 6.227589 5.238634
## 2 7 6.214179 5.240899
## 3 8 6.199109 5.227936
## 4 9 6.131527 5.201834
## 5 10 6.087050 5.206879
## 6 11 6.083599 5.195066
## 7 12 6.123817 5.226678
best_spline3 %>% slice_min(rmse)## df rmse mae
## 1 11 6.083599 5.195066
best_spline3 %>% slice_min(mae)## df rmse mae
## 1 11 6.083599 5.195066
mod_spline3 = lm(mpg ~ ns(year, df=11), data=Auto3)
summary(mod_spline3)##
## Call:
## lm(formula = mpg ~ ns(year, df = 11), data = Auto3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6834 -4.7129 -0.9992 4.2887 19.1824
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.6895 1.1104 15.931 < 2e-16 ***
## ns(year, df = 11)1 1.1100 2.3042 0.482 0.630259
## ns(year, df = 11)2 -3.1654 2.1301 -1.486 0.138097
## ns(year, df = 11)3 8.0610 2.3655 3.408 0.000725 ***
## ns(year, df = 11)4 0.5022 2.3002 0.218 0.827291
## ns(year, df = 11)5 5.4459 2.4630 2.211 0.027624 *
## ns(year, df = 11)6 7.2107 2.3021 3.132 0.001869 **
## ns(year, df = 11)7 4.4140 2.2389 1.972 0.049388 *
## ns(year, df = 11)8 20.5266 2.3691 8.664 < 2e-16 ***
## ns(year, df = 11)9 8.6745 2.1079 4.115 4.75e-05 ***
## ns(year, df = 11)10 16.9914 2.9507 5.758 1.75e-08 ***
## ns(year, df = 11)11 11.5737 1.5679 7.382 9.96e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.98 on 380 degrees of freedom
## Multiple R-squared: 0.4296, Adjusted R-squared: 0.413
## F-statistic: 26.01 on 11 and 380 DF, p-value: < 2.2e-16
Auto3bbest <-ggplot(Auto3,aes(x=year, y=mpg)) +
ggtitle("Regresi Spline df=11")+
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~ns(x, df=11),
lty = 1,se = F)
Auto3bbestlibrary(gridExtra)
grid.arrange(Auto3best, Auto3abest, Auto3bbest, ncol=2)## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.
## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.
## Warning: Use of `Auto3$year` is discouraged. Use `year` instead.
## Warning: Use of `Auto3$mpg` is discouraged. Use `mpg` instead.
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
Model terbaik untuk data Auto adalah Regresi Polinomial Ordo 10.