Pemodelan Non-Linear
Soal
Load Packages
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
library(mlr3measures)## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(ISLR)
library(grid)#Import data Auto Data Auto adalah data frame dengan 392 Observasi yag terdiri dari 9 Variabel
data("Auto",package="ISLR")
attach(Auto) #data("Auto",package="ISLR")## The following object is masked from package:ggplot2:
##
## mpg
head(cbind(mpg, horsepower,weight, displacement, acceleration))## mpg horsepower weight displacement acceleration
## [1,] 18 130 3504 307 12.0
## [2,] 15 165 3693 350 11.5
## [3,] 18 150 3436 318 11.0
## [4,] 16 150 3433 304 12.0
## [5,] 17 140 3449 302 10.5
## [6,] 15 198 4341 429 10.0
A. Hubungan Non-Linier
Menampilkan plot Non Linear Model untuk 3 Peubah Penjelas yaitu mpg, horsepower, dan displacement
# 1. mpg vs horsepower
lmhp <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.7, color="coral") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "100",se = F)+
labs(title="Non Linear Model mpg vs horsepower")+
xlab("HorsePower")+
ylab("mpg")
lmhp# 2. mpg vs weight
lmw <- ggplot(Auto,aes(x= weight, y=mpg)) +
geom_point(alpha=0.7, color="orange") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "100",se = F)+
labs(title="Non Linear Model mpg vs weight")+
xlab("weight")+
ylab("mpg")
lmw# 3. mpg vs displacement
lmdp <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.7, color="magenta") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "100",se = F)+
labs(title="Non Linear Model mpg vs displacement")+
xlab("displacement")+
ylab("mpg")
lmdpVisually
# Menampilkan plot ketiga variabel data
vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)
arrange <- function(..., nrow=NULL, ncol=NULL, as.table=FALSE) {
dots <- list(...)
n <- length(dots)
if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}
if(is.null(nrow)) { nrow = ceiling(n/ncol)}
if(is.null(ncol)) { ncol = ceiling(n/nrow)}
## NOTE see n2mfrow in grDevices for possible alternative
grid.newpage()
pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )
ii.p <- 1
for(ii.row in seq(1, nrow)){
ii.table.row <- ii.row
if(as.table) {ii.table.row <- nrow - ii.table.row + 1}
for(ii.col in seq(1, ncol)){
ii.table <- ii.p
if(ii.p > n) break
print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))
ii.p <- ii.p + 1
}
}
}
nlm_all <- arrange(lmdp, lmhp, lmw) Dari ketiga model yang dipilih untuk masing-masing hubungan peubah prediktor dan peubah penjelas menunjukkan model yang non-linear. Olehnya itu akan dilakukan pemodelan non linear dengan 3 cara. Pemodelan tersebut yaitu Regresi Polinomial, Regresi Fungsi Tangga, dan Regresi Kubik Spline Natural.
B. Menduga Model Non-Linier Terbaik
B.1. mpg vs horsepower
- Polynomial regression
##Empirically
##Polynomial regression dengan Cross Validation
set.seed(1501211001)
cross_val <- vfold_cv(Auto,v= 10, strata ="mpg")
ordo <- 2:8
best_poly_hp <- map_dfr (ordo, function(i) {
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,ordo=i,raw = T),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_hp <- colMeans(metric_poly)
mean_metric_hp
}
)
best_poly_hp <- cbind(ordo,best_poly_hp)
best_poly_hp## ordo rmse mae
## 1 2 4.370676 3.275163
## 2 3 4.393599 3.292301
## 3 4 4.421085 3.316888
## 4 5 4.387112 3.292114
## 5 6 4.379439 3.290104
## 6 7 4.366408 3.270198
## 7 8 4.372519 3.279792
Model yang akan digunakan adalah derajat 2 karena niali rmse untuk ordo 2 adalah yang terendah.
##Visually
##2nd degree polynomial regression
polihp <- lm( mpg ~ poly(horsepower,2,raw = T),data= Auto)
poly_hp <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "100",se = F)+
labs(title="Regresi Polinomial Derajat 2")+
xlab("HorsePower ")+
ylab("mpg")
poly_hp##2. Piecewise Constant Function (Fungsi Tangga)
### Empirically
set.seed(1001)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 2:15
best_tangga_hp <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga_hp <- cbind(breaks=breaks,best_tangga_hp)
# menampilkan hasil all breaks
best_tangga_hp## breaks rmse mae
## 1 2 6.142857 4.800847
## 2 3 5.776608 4.522114
## 3 4 4.981764 3.782906
## 4 5 4.713393 3.578057
## 5 6 4.649392 3.537827
## 6 7 4.570616 3.392555
## 7 8 4.457050 3.407758
## 8 9 4.536498 3.441833
## 9 10 4.585155 3.455845
## 10 11 4.552872 3.374287
## 11 12 4.456176 3.318251
## 12 13 4.520986 3.356401
## 13 14 4.403887 3.241835
## 14 15 4.300994 3.249859
Dari hasil rmse dan mae, saya menggunkan 7 interval untuk dimodelkan.
#Visually
tangga1 = lm(mpg ~ cut(horsepower,7),data= Auto)
summary(tangga1)##
## Call:
## lm(formula = mpg ~ cut(horsepower, 7), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5280 -2.5530 -0.3758 2.2881 16.7482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5280 0.5069 64.17 <2e-16 ***
## cut(horsepower, 7)(72.3,98.6] -6.8162 0.6358 -10.72 <2e-16 ***
## cut(horsepower, 7)(98.6,125] -12.1523 0.7591 -16.01 <2e-16 ***
## cut(horsepower, 7)(125,151] -16.5763 0.7957 -20.83 <2e-16 ***
## cut(horsepower, 7)(151,177] -18.5020 1.0831 -17.08 <2e-16 ***
## cut(horsepower, 7)(177,204] -19.4447 1.4187 -13.71 <2e-16 ***
## cut(horsepower, 7)(204,230] -19.6280 1.5375 -12.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared: 0.6594, Adjusted R-squared: 0.6541
## F-statistic: 124.2 on 6 and 385 DF, p-value: < 2.2e-16
tangga_hp <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.7, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,13),
lty = 1, col = "100",se = F)+
labs(title="Regresi Fungsi Tangga Knot 12")+
xlab("HorsePower")+
ylab("mpg")
tangga_hp#3. Natural Cubic Spline
#nilai knots yang ditentukan oleh komputer, misal df=5
attr(ns(Auto$horsepower, df=5),"knots")## 20% 40% 60% 80%
## 72 88 100 140
#Plot BSpline
ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(70, 90, 100, 140)),
lty = 1, col = "brown",se = F)+
labs(title="Regresi Cubic Spline Knot 70, 90, 100, 140")+
xlab("HorsePower")+
ylab("mpg")## Empirically
set.seed(1501211001)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:12
best_spline_hp <- map_dfr(df, function(i){
metric_spline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
# menghitung rata-rata untuk 10 folds
mean_spline1 <- colMeans(metric_spline)
mean_spline1
}
)
best_spline_hp <- cbind(df=df,best_spline_hp)
# menampilkan hasil all breaks
best_spline_hp## df rmse mae
## 1 2 4.353554 3.267611
## 2 3 4.394696 3.296135
## 3 4 4.388334 3.295952
## 4 5 4.384292 3.286203
## 5 6 4.381250 3.278629
## 6 7 4.356872 3.257715
## 7 8 4.361673 3.263892
## 8 9 4.357740 3.252422
## 9 10 4.346026 3.233591
## 10 11 4.380925 3.264800
## 11 12 4.364587 3.238694
Dari hasil rmse dan mae, saya menggunkan 13 interval untuk dimodelkan.
##Visually
#nilai knots yang ditentukan oleh komputer, misal df= 10
attr(ns(Auto$horsepower, df=10),"knots")## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
splinehp = lm(mpg ~ ns(horsepower, df =10), data= Auto)
summary(splinehp)##
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 10), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5003 -2.4956 -0.2086 2.2577 14.6400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.462 1.680 19.324 < 2e-16 ***
## ns(horsepower, df = 10)1 -4.600 1.789 -2.572 0.010495 *
## ns(horsepower, df = 10)2 -3.007 2.555 -1.177 0.239996
## ns(horsepower, df = 10)3 -8.663 2.043 -4.239 2.82e-05 ***
## ns(horsepower, df = 10)4 -7.170 2.188 -3.278 0.001142 **
## ns(horsepower, df = 10)5 -14.722 2.125 -6.927 1.84e-11 ***
## ns(horsepower, df = 10)6 -8.446 2.467 -3.424 0.000684 ***
## ns(horsepower, df = 10)7 -16.947 2.080 -8.148 5.38e-15 ***
## ns(horsepower, df = 10)8 -21.715 1.891 -11.483 < 2e-16 ***
## ns(horsepower, df = 10)9 -14.384 4.116 -3.494 0.000531 ***
## ns(horsepower, df = 10)10 -22.326 1.946 -11.476 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared: 0.7124, Adjusted R-squared: 0.7049
## F-statistic: 94.39 on 10 and 381 DF, p-value: < 2.2e-16
ncspline_hp <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(67,72,80,88,93.5, 100, 110, 140,157.7)),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline df = 10")+
xlab("HorsePower ")+
ylab("mpg")+
geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7), col="grey50", lty=2)
ncspline_hpKarena model yang dihasilkan untuk interval 13 kurang baik, maka saya mencoba df 2 untuk rmse terkecil kedua untuk dimodelkan.
#Menggunakan df=2
#nilai knots yang ditentukan oleh komputer df= 2
attr(ns(Auto$horsepower, df=2),"knots")## 50%
## 93.5
ncspline_hp2 <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~ns(x, df=2),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline df = 2")+
xlab("HorsePower ")+
ylab("mpg")+
geom_vline(xintercept = c(93,5), col="grey50", lty=2)
ncspline_hp2Hasil yang diperoleh menunjukkan bahwa model df=2 yang memiliki plot yang sesuai.
#Pembandingan Model
plot_hp <- arrange(poly_hp, tangga_hp, ncspline_hp2)B.2. mpg vs weight
- Polynomial regression
Untuk variabel selanjutnya, yaitu ‘weight’ dan ‘displacement’ menggunakan langkah dan cara yang sama dengan sebelumnya, olehnya itu, akan ditampilkan plotnya saja atau secara visual.
##Visually
##2nd degree polynomial regression
poliw <- lm( mpg ~ poly(weight,2,raw = T),data= Auto)
poly_w <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "100",se = F)+
labs(title="Regresi Polinomial Derajat 2")+
xlab("weight ")+
ylab("mpg")
poly_w- Piecewise Constant Function (Fungsi Tangga)
Dari hasil rmse dan mae, saya menggunkan 12 interval dari nilai mae terendah untuk dimodelkan
#Visually
tanggaw = lm(mpg ~ cut(weight,12),data= Auto)
summary(tanggaw)##
## Call:
## lm(formula = mpg ~ cut(weight, 12), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3889 -2.3952 -0.4377 1.7340 16.3604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.4611 0.9801 34.141 < 2e-16 ***
## cut(weight, 12)(1.91e+03,2.2e+03] -1.0722 1.0958 -0.979 0.328
## cut(weight, 12)(2.2e+03,2.49e+03] -6.4215 1.1344 -5.661 2.97e-08 ***
## cut(weight, 12)(2.49e+03,2.79e+03] -7.5468 1.1460 -6.585 1.52e-10 ***
## cut(weight, 12)(2.79e+03,3.08e+03] -10.6536 1.1802 -9.027 < 2e-16 ***
## cut(weight, 12)(3.08e+03,3.38e+03] -13.5763 1.2184 -11.143 < 2e-16 ***
## cut(weight, 12)(3.38e+03,3.67e+03] -15.2640 1.2061 -12.656 < 2e-16 ***
## cut(weight, 12)(3.67e+03,3.96e+03] -16.9315 1.2653 -13.382 < 2e-16 ***
## cut(weight, 12)(3.96e+03,4.26e+03] -18.6694 1.2965 -14.400 < 2e-16 ***
## cut(weight, 12)(4.26e+03,4.55e+03] -19.5051 1.2854 -15.175 < 2e-16 ***
## cut(weight, 12)(4.55e+03,4.85e+03] -20.8611 1.6400 -12.720 < 2e-16 ***
## cut(weight, 12)(4.85e+03,5.14e+03] -21.4611 1.9602 -10.949 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.158 on 380 degrees of freedom
## Multiple R-squared: 0.7242, Adjusted R-squared: 0.7162
## F-statistic: 90.69 on 11 and 380 DF, p-value: < 2.2e-16
tangga_w <- ggplot(Auto,aes(x=weight, y= mpg)) +
geom_point(alpha=0.7, color="orange") +
stat_smooth(method = "lm",
formula = y~cut(x,12),
lty = 1, col = "100",se = F)+
labs(title="Regresi Fungsi Tangga Knot 11")+
xlab("weight")+
ylab("mpg")
tangga_w- Natural Cubic Spline
#nilai knots yang ditentukan oleh komputer, misal df=5
attr(ns(Auto$weight, df=5),"knots")## 20% 40% 60% 80%
## 2155.0 2583.2 3113.4 3820.8
#Plot BSpline
ggplot(Auto,aes(x=weight, y= mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(2155, 2583.2, 3113.4, 3820.8)),
lty = 1, col = "brown",se = F)+
labs(title="Regresi Cubic Spline Knot 2155, 2583.2, 3113.4, 3820.8")+
xlab("weight")+
ylab("mpg")Dari hasil rmse dan mae, saya menggunkan 7 interval untuk dimodelkan
##Visually
#nilai knots yang ditentukan oleh komputer, misal df= 7
attr(ns(Auto$weight, df=7),"knots")## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%
## 2074.857 2285.429 2635.000 2986.571 3446.143 4096.286
splinew = lm(mpg ~ ns(weight, df =7), data= Auto)
summary(splinew)##
## Call:
## lm(formula = mpg ~ ns(weight, df = 7), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3314 -2.5302 -0.4222 1.8281 16.3921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.575 2.038 15.495 < 2e-16 ***
## ns(weight, df = 7)1 -6.249 1.933 -3.233 0.001332 **
## ns(weight, df = 7)2 -4.599 2.514 -1.829 0.068144 .
## ns(weight, df = 7)3 -10.661 2.255 -4.728 3.19e-06 ***
## ns(weight, df = 7)4 -13.286 2.378 -5.587 4.39e-08 ***
## ns(weight, df = 7)5 -19.001 1.727 -11.005 < 2e-16 ***
## ns(weight, df = 7)6 -15.370 4.628 -3.321 0.000982 ***
## ns(weight, df = 7)7 -22.327 1.958 -11.401 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.143 on 384 degrees of freedom
## Multiple R-squared: 0.7233, Adjusted R-squared: 0.7183
## F-statistic: 143.4 on 7 and 384 DF, p-value: < 2.2e-16
ncspline_w <- ggplot(Auto,aes(x=weight, y= mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(2074, 2285, 2635, 2986, 3446, 4096)),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline df = 10")+
xlab("weight ")+
ylab("mpg")+
geom_vline(xintercept = c(2074, 2285, 2635, 2986, 3446, 4096), col="grey50", lty=2)
ncspline_w#Menggunakan df=2
#nilai knots yang ditentukan oleh komputer df= 2
attr(ns(Auto$weight, df=2),"knots")## 50%
## 2803.5
ncspline_w2 <- ggplot(Auto,aes(x=weight, y= mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~ns(x, df=2),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline df = 2")+
xlab("weight ")+
ylab("mpg")+
geom_vline(xintercept = c(2803), col="grey50", lty=2)
ncspline_w2#Pembandingan Model
plot_w <- arrange(poly_w, tangga_w, ncspline_w2)B.3. mpg vs displacement
- Polynomial regression
##Visually
##3rd degree polynomial regression
polidp <- lm( mpg ~ poly(displacement,3,raw = T),data= Auto)
poly_dp <- ggplot(Auto,aes(x=displacement, y=mpg)) +
geom_point(alpha=0.55, color="magenta") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "100",se = F)+
labs(title="Regresi Polinomial Derajat 3")+
xlab("HorsePower ")+
ylab("mpg")
poly_dpModel yang terbaik adalah yang nilai mae nya paling rendah yaitu derajat 3.
- Piecewise Constant Function (Fungsi Tangga)
Dari hasil rmse dan mae, saya menggunkan 13 interval untuk dimodelkan
#Visually
tanggadp = lm(mpg ~ cut(displacement,13),data= Auto)
summary(tanggadp)##
## Call:
## lm(formula = mpg ~ cut(displacement, 13), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.7038 -2.5393 -0.3469 2.0850 19.4607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.7038 0.5048 62.804 < 2e-16 ***
## cut(displacement, 13)(97.8,128] -3.4955 0.7010 -4.986 9.39e-07 ***
## cut(displacement, 13)(128,157] -5.9712 0.8127 -7.347 1.25e-12 ***
## cut(displacement, 13)(157,187] -8.4948 1.4359 -5.916 7.38e-09 ***
## cut(displacement, 13)(187,217] -10.8288 1.3825 -7.833 4.84e-14 ***
## cut(displacement, 13)(217,247] -12.2851 0.9359 -13.126 < 2e-16 ***
## cut(displacement, 13)(247,276] -13.1646 0.9822 -13.403 < 2e-16 ***
## cut(displacement, 13)(276,306] -16.1311 1.0763 -14.988 < 2e-16 ***
## cut(displacement, 13)(306,336] -16.7238 1.1174 -14.966 < 2e-16 ***
## cut(displacement, 13)(336,366] -16.9780 0.9466 -17.936 < 2e-16 ***
## cut(displacement, 13)(366,395] -17.7038 2.6231 -6.749 5.58e-11 ***
## cut(displacement, 13)(395,425] -17.7423 1.3356 -13.284 < 2e-16 ***
## cut(displacement, 13)(425,455] -18.4816 1.5695 -11.775 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.458 on 379 degrees of freedom
## Multiple R-squared: 0.6837, Adjusted R-squared: 0.6737
## F-statistic: 68.28 on 12 and 379 DF, p-value: < 2.2e-16
tangga_dp <- ggplot(Auto,aes(x=displacement, y= mpg)) +
geom_point(alpha=0.7, color="magenta") +
stat_smooth(method = "lm",
formula = y~cut(x,13),
lty = 1, col = "100",se = F)+
labs(title="Regresi Fungsi Tangga Knot 12")+
xlab("HorsePower")+
ylab("mpg")
tangga_dp#3. Natural Cubic Spline
#nilai knots yang ditentukan oleh komputer, misal df=5
attr(ns(Auto$displacement, df=5),"knots")## 20% 40% 60% 80%
## 98 122 225 305
#Plot BSpline
ggplot(Auto,aes(x=displacement, y= mpg)) +
geom_point(alpha=0.55, color="magenta") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(98, 122, 225, 305)),
lty = 1, col = "brown",se = F)+
labs(title="Regresi Cubic Spline Knot 98, 122, 225, 305")+
xlab("HorsePower")+
ylab("mpg")Dari hasil rmse dan mae, saya menggunkan 11 interval untuk dimodelkan.
##Visually
#nilai knots yang ditentukan oleh komputer, misal df= 11
attr(ns(Auto$displacement, df=11),"knots")## 9.090909% 18.18182% 27.27273% 36.36364% 45.45455% 54.54545% 63.63636% 72.72727%
## 90 97 107 120 140 168 231 258
## 81.81818% 90.90909%
## 318 351
splinedp = lm(mpg ~ ns(displacement, df =11), data= Auto)
summary(splinedp)##
## Call:
## lm(formula = mpg ~ ns(displacement, df = 11), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.7869 -2.4652 -0.3588 2.0662 19.6697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.072 1.725 13.956 < 2e-16 ***
## ns(displacement, df = 11)1 4.014 1.791 2.241 0.025602 *
## ns(displacement, df = 11)2 8.300 2.382 3.485 0.000549 ***
## ns(displacement, df = 11)3 -1.989 2.148 -0.926 0.355155
## ns(displacement, df = 11)4 3.740 2.191 1.707 0.088692 .
## ns(displacement, df = 11)5 -2.691 2.825 -0.953 0.341422
## ns(displacement, df = 11)6 -4.750 2.392 -1.986 0.047757 *
## ns(displacement, df = 11)7 -5.493 2.482 -2.213 0.027497 *
## ns(displacement, df = 11)8 -9.443 2.112 -4.471 1.03e-05 ***
## ns(displacement, df = 11)9 -15.271 1.696 -9.002 < 2e-16 ***
## ns(displacement, df = 11)10 3.221 4.067 0.792 0.428812
## ns(displacement, df = 11)11 -18.798 1.766 -10.642 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.096 on 380 degrees of freedom
## Multiple R-squared: 0.7324, Adjusted R-squared: 0.7246
## F-statistic: 94.53 on 11 and 380 DF, p-value: < 2.2e-16
ncspline_dp <- ggplot(Auto,aes(x=displacement, y= mpg)) +
geom_point(alpha=0.55, color="magenta") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(90,97,107,120, 140, 168,231,258,318,351)),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline df = 11")+
xlab("HorsePower ")+
ylab("mpg")+
geom_vline(xintercept = c(90,97,107,120, 140, 168,231,258,318,351), col="grey50", lty=2)
ncspline_dp#Pembandingan Model
plot_dp <- arrange(poly_dp, tangga_dp, ncspline_dp)Kesimpulan
Model terbaik yang dihasilkan untuk regresi non-linear yaitu: 1. mpg vs horsepower dengan model terbaik yaitu natural cubic spline df 2 2. mpg vs weight dengan model terbaik yaitu natural cubic spline df 2 3. mpg vs displacement, model terbaik yaitu menggunakan Regresi Fungsi Tangga knot 12
Menampilkan perbandingan plot
plot_best <- arrange(ncspline_hp2, ncspline_w2, tangga_dp)