Polynomial Regression, Peacewise Constant Function Regression and Spline Regression in R
Load Package
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(rsample)
library(mlr3measures)
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(ISLR)
library(grid)
##import data
data("Auto",package="ISLR")
attach(Auto) #data("Auto",package="ISLR")
## The following object is masked from package:ggplot2:
##
## mpg
Data Auto merupakan data frame dengan 392 Observasi yag terdiri dari 9 Variabel
#Menampilkan sebagian data
head(cbind(mpg,horsepower,origin))
## mpg horsepower origin
## [1,] 18 130 1
## [2,] 15 165 1
## [3,] 18 150 1
## [4,] 16 150 1
## [5,] 17 140 1
## [6,] 15 198 1
# LM Plot
lmAuto <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.7, color="orange") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "brown",se = F)+
labs(title="Non Linear Model Data Auto")+
xlab("HorsePower")+
ylab("mpg")
lmAuto
#a. Polynomial regression
##Empirically
##Polynomial regression degree 3 dengan Cross Validation
set.seed(1501211001)
cross_val <- vfold_cv(Auto,v= 10, strata = "horsepower")
breaksp <- 2:8
best_poly_Auto <- map_dfr (breaksp, function(i) {
metric_poly <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ poly(horsepower,breaksp=i,raw = T),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly
# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly
}
)
best_poly_Auto <- cbind(breaksp,best_poly_Auto)
best_poly_Auto
## breaksp rmse mae
## 1 2 4.352578 3.278945
## 2 3 4.353783 3.276225
## 3 4 4.351293 3.285103
## 4 5 4.297097 3.240996
## 5 6 4.280785 3.243448
## 6 7 4.262132 3.220544
## 7 8 4.280944 3.245170
Berdasarkan nilai rmse dan mae yang diperoleh saya memilih derajat 2 untuk dimodelkan dalam regresi polinomial karena memiliki nilai yang minimum dan lebih kecil dari derajat 4.
##Visually
##2nd degree polynomial regression
polimodel <- lm( mpg ~ poly(horsepower,2,raw = T),data= Auto)
summary(polimodel)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2, raw = T), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.9000997 1.8004268 31.60 <2e-16 ***
## poly(horsepower, 2, raw = T)1 -0.4661896 0.0311246 -14.98 <2e-16 ***
## poly(horsepower, 2, raw = T)2 0.0012305 0.0001221 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
Auto_poly <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "100",se = F)+
labs(title="Regresi Polinomial Derajat 2")+
xlab("HorsePower ")+
ylab("mpg")
Auto_poly
##b. Piecewise Constant Function
### Empirically
set.seed(1001)
cross_valta <- vfold_cv(Auto,v=10,strata = "mpg")
breaks <- 3:15
best_tangga_Auto <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_valta$splits,
function(x){
training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,
data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,
newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
})
best_tangga_Auto <- cbind(breaks=breaks,best_tangga_Auto)
# menampilkan hasil all breaks
best_tangga_Auto
## breaks rmse mae
## 1 3 5.776608 4.522114
## 2 4 4.981764 3.782906
## 3 5 4.713393 3.578057
## 4 6 4.649392 3.537827
## 5 7 4.570616 3.392555
## 6 8 4.457050 3.407758
## 7 9 4.536498 3.441833
## 8 10 4.585155 3.455845
## 9 11 4.552872 3.374287
## 10 12 4.456176 3.318251
## 11 13 4.520986 3.356401
## 12 14 4.403887 3.241835
## 13 15 4.300994 3.249859
#Visually
tangga1 = lm(mpg ~ cut(horsepower,13),data= Auto)
Auto_tangga <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.7, color="orange") +
stat_smooth(method = "lm",
formula = y~cut(x,13),
lty = 1, col = "100",se = F)+
labs(title="Regresi Fungsi Tangga Knot 12")+
xlab("HorsePower")+
ylab("mpg")
Auto_tangga
#c. Natural Cubic Spline
#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=10),"knots")
## 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 67.0 72.0 80.0 88.0 93.5 100.0 110.0 140.0 157.7
#Plot BSpline
ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~bs(x, knots = c(60, 80,100, 120)),
lty = 1, col = "brown",se = F)+
labs(title="Regresi Cubic Spline Knot 60, 80,100, 120")+
xlab("HorsePower")+
ylab("mpg")
Dapat dilihat bahwa model plot yang dihasilkan memiliki ujung-ujung yang cenderung liar dalam hal ini ragamnya yg cenderung besar. Oleh karena itu, untuk mengurangi ketidakstabilan ujung-ujung model tersebut akan digunakan pemodelan Natural Cubic Spline.
Natural Cubic Spline
## Empirically
set.seed(1501211001)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")
df <- 2:15
best_spline_Auto <- map_dfr(df, function(i){
metric_spline <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ ns(horsepower,df=i),
data=Auto[x$in_id,])
pred <- predict(mod,
newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,
response = pred
)
mae <- mlr3measures::mae(truth = truth,
response = pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_spline
# menghitung rata-rata untuk 10 folds
mean_spline1 <- colMeans(metric_spline)
mean_spline1
}
)
best_spline_Auto <- cbind(df=df,best_spline_Auto)
# menampilkan hasil all breaks
best_spline_Auto
## df rmse mae
## 1 2 4.353554 3.267611
## 2 3 4.394696 3.296135
## 3 4 4.388334 3.295952
## 4 5 4.384292 3.286203
## 5 6 4.381250 3.278629
## 6 7 4.356872 3.257715
## 7 8 4.361673 3.263892
## 8 9 4.357740 3.252422
## 9 10 4.346026 3.233591
## 10 11 4.380925 3.264800
## 11 12 4.364587 3.238694
## 12 13 4.376466 3.255618
## 13 14 4.371661 3.272694
## 14 15 4.349977 3.275905
Jika merujuk pada nilai rmse dan mae, nilai terendah yaitu pada df 10 (knot 9) dan terendah kedua adalah pada df 2 (knot 1).
##Visually dengan 9 Konts
splineAns = lm(mpg ~ ns(horsepower, knots = c(67,72,80,88,94,100,110,140,160)),data = Auto)
Auto_ncspline9 <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(60,80,90,100,110,140,160,180,200)),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline Knot 60,80,90,100,110,140,160,180,200")+
xlab("HorsePower ")+
ylab("mpg")
Auto_ncspline9
##Visually dengan 1 Konts
splineAns1 = lm(mpg ~ ns(horsepower, knots = c(120)),data = Auto)
Auto_ncspline1 <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="orange") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(120)),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline Knot 120")+
xlab("HorsePower ")+
ylab("mpg")
Auto_ncspline1
Karena plot yang dihasilkan dengan 1 knot lebih mulus, maka saya menggunakan 1 knot pada pemodelan natural cubic spline.
#Perbandingan Model
vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)
arrange <- function(..., nrow=NULL, ncol=NULL, as.table=FALSE) {
dots <- list(...)
n <- length(dots)
if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}
if(is.null(nrow)) { nrow = ceiling(n/ncol)}
if(is.null(ncol)) { ncol = ceiling(n/nrow)}
## NOTE see n2mfrow in grDevices for possible alternative
grid.newpage()
pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )
ii.p <- 1
for(ii.row in seq(1, nrow)){
ii.table.row <- ii.row
if(as.table) {ii.table.row <- nrow - ii.table.row + 1}
for(ii.col in seq(1, ncol)){
ii.table <- ii.p
if(ii.p > n) break
print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))
ii.p <- ii.p + 1
}
}
}
plot_Amrik <- arrange(lmAuto, Auto_poly, Auto_tangga, Auto_ncspline1)
# 1. Data Amerika
#________________
# Menampilkan data mpg dan horsepower berdasarkan mesin asal Amerika
data_amerika <-subset(Auto,origin == 1)
auto_amerika <- data_amerika[,c("mpg","horsepower")]
Data Auto_amerika ini terdiri dari 245 Observasi.
# ScatterPlot
lmAm <- ggplot(auto_amerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.7, color="magenta") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "brown",se = F)+
labs(title="Non Linear Model Data Amerika")+
xlab("HorsePower American")+
ylab("mpg American")
lmAm
##Visually
##2nd degree polynomial regression
polinom1 <- lm( mpg ~ poly(horsepower,2,raw = T),data= auto_amerika)
Am_poly <- ggplot(auto_amerika,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="magenta") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "100",se = F)+
labs(title="Regresi Polinomial Derajat 2")+
xlab("HorsePower American")+
ylab("mpg American")
Am_poly
#Visually
tangga2 = lm(mpg ~ cut(horsepower,9),data= auto_amerika)
Am_tangga <- ggplot(auto_amerika,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.7, color="magenta") +
stat_smooth(method = "lm",
formula = y~cut(x,9),
lty = 1, col = "100",se = F)+
labs(title="Regresi Fungsi Tangga Knot 8")+
xlab("HorsePower American")+
ylab("mpg American")
Am_tangga
#c. Natural Cubic Spline
##Visually
spline1ns = lm(mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),data = auto_amerika)
Am_spline <- ggplot(auto_amerika,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="magenta") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(80, 120, 160, 200)),
lty = 1, col = "100",se = F)+
labs(title="Regresi Natural Spline Knot 80, 120, 160, 200")+
xlab("HorsePower American")+
ylab("mpg American")
Am_spline
#Perbandingan Model
plot_Amrik <- arrange(lmAm, Am_poly, Am_tangga, Am_spline)
#_______________________________________________________________________________
# 2. Data Eropa
#________________
# Menampilkan data mpg dan horsepower berdasarkan mesin asal Amerika
data_eropa <-subset(Auto,origin == 2)
auto_eropa <- data_eropa[,c("mpg","horsepower")]
# Terdiri dari 68 Observasi
# ScatterPlot
lmEr <- ggplot(auto_eropa, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "brown",se = F)+
labs(title="Linear Model Data eropa")+
xlab("HorsePower European")+
ylab("mpg European")
lmEr
#a. Polynomial regression ##2nd degree polynomial regression
# Visually
polinom2 <- lm( mpg ~ poly(horsepower,2,raw = T),data= auto_eropa)
Er_poly <- ggplot(auto_eropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "100",se = F)+
labs(title="Regresi POlinomial Derajat 2")+
xlab("HorsePower European")+
ylab("mpg European")
Er_poly
#Visually
tangga2 = lm(mpg ~ cut(horsepower,7),data= auto_eropa)
Er_tangga <- ggplot(auto_eropa,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.7, color="coral") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "100",se = F)+
labs(title="Regresi Fungsi Tangga Knot 6")+
xlab("HorsePower European")+
ylab("mpg European")
Er_tangga
#c. Natural Cubic Spline
#nilai knots yang ditentukan oleh komputer
attr(ns(auto_eropa$horsepower, df=5),"knots")
## 20% 40% 60% 80%
## 67.0 74.8 81.4 95.0
##Visually
spline2ns = lm(mpg ~ ns(horsepower, knots = c(70, 80, 100, 120)),data = auto_eropa)
summary(spline1ns)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, knots = c(80, 120, 160, 200)),
## data = auto_amerika)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.4671 -2.1836 -0.3698 2.0508 13.2883
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 35.039 1.813 19.322
## ns(horsepower, knots = c(80, 120, 160, 200))1 -18.270 1.748 -10.454
## ns(horsepower, knots = c(80, 120, 160, 200))2 -20.386 2.556 -7.975
## ns(horsepower, knots = c(80, 120, 160, 200))3 -20.800 2.173 -9.574
## ns(horsepower, knots = c(80, 120, 160, 200))4 -30.835 4.178 -7.380
## ns(horsepower, knots = c(80, 120, 160, 200))5 -14.921 1.967 -7.586
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## ns(horsepower, knots = c(80, 120, 160, 200))1 < 2e-16 ***
## ns(horsepower, knots = c(80, 120, 160, 200))2 6.29e-14 ***
## ns(horsepower, knots = c(80, 120, 160, 200))3 < 2e-16 ***
## ns(horsepower, knots = c(80, 120, 160, 200))4 2.61e-12 ***
## ns(horsepower, knots = c(80, 120, 160, 200))5 7.33e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.819 on 239 degrees of freedom
## Multiple R-squared: 0.6557, Adjusted R-squared: 0.6485
## F-statistic: 91.02 on 5 and 239 DF, p-value: < 2.2e-16
Er_ncspline <- ggplot(auto_eropa,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="coral") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(70, 80, 100, 120)),
lty = 1, col = "100",se = F)+
labs(title="Natural Cubic Spline Knot 70,80,100,120")+
xlab("HorsePower European")+
ylab("mpg European")
Er_ncspline
#Perbandingan Model
plot_Eropa <- arrange(lmEr, Er_poly, Er_tangga, Er_ncspline)
#______________________________________________________________________________
# 3. Data Jepang
#________________
# Menampilkan data mpg dan horsepower berdasarkan mesin asal Amerika
data_jepang <-subset(Auto,origin == 3)
auto_jepang <- data_jepang[,c("mpg","horsepower")]
Data auto_jepang terdiri dari 79 data
lmJ <- ggplot(auto_jepang, aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "brown",se = F)+
labs(title="Linear Model Data Jepang")+
xlab("HorsePower Japanese")+
ylab("mpg Japanese")
lmJ
#a. Polynomial regression ##1st degree polynomial regression
# Visually
polinom3 <- lm( mpg ~ poly(horsepower,1,raw = T),data= auto_jepang)
summary(polinom3)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 1, raw = T), data = auto_jepang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.112 -2.758 -0.751 2.563 14.249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.8162 2.3555 20.724 < 2e-16 ***
## poly(horsepower, 1, raw = T) -0.2300 0.0288 -7.986 1.08e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.533 on 77 degrees of freedom
## Multiple R-squared: 0.4531, Adjusted R-squared: 0.446
## F-statistic: 63.78 on 1 and 77 DF, p-value: 1.08e-11
J_poly <- ggplot(auto_jepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~poly(x,1,raw=T),
lty = 1, col = "orange",se = F)+
labs(title="Regresi POlinomial Derajat 1")+
xlab("HorsePower Japanese")+
ylab("mpg Japanese")
J_poly
#Visually
tangga3 = lm(mpg ~ cut(horsepower,7),data= auto_jepang)
summary(tangga3)
##
## Call:
## lm(formula = mpg ~ cut(horsepower, 7), data = auto_jepang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5400 -2.7379 -0.8857 2.3143 11.9143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.7900 1.3659 25.470 < 2e-16 ***
## cut(horsepower, 7)(63.4,74.9] -0.1043 1.5913 -0.066 0.947929
## cut(horsepower, 7)(74.9,86.3] -3.7150 2.0489 -1.813 0.073976 .
## cut(horsepower, 7)(86.3,97.7] -9.2500 1.6162 -5.723 2.24e-07 ***
## cut(horsepower, 7)(97.7,109] -9.5900 2.8434 -3.373 0.001201 **
## cut(horsepower, 7)(109,121] -11.0900 2.8434 -3.900 0.000214 ***
## cut(horsepower, 7)(121,132] -8.4400 3.3459 -2.523 0.013863 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.32 on 72 degrees of freedom
## Multiple R-squared: 0.5356, Adjusted R-squared: 0.4969
## F-statistic: 13.84 on 6 and 72 DF, p-value: 2.148e-10
J_tangga <- ggplot(auto_jepang,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.7, color="blue") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "orange",se = F)+
labs(title="Regresi Fungsi Tangga Knot 8")+
xlab("HorsePower Japanese")+
ylab("mpg Japanese")
J_tangga
#c. Natural Cubic Spline
##Visually
spline_ns <- lm(mpg ~ ns(horsepower, knots = c(70,110)),data = auto_jepang)
J_ncspline <- ggplot(auto_jepang,aes(x=horsepower, y= mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~ns(x, knots = c(70,110)),
lty = 1, col = "orange",se = F)+
labs(title="Regresi Natural Spline Knot 70, 110")+
xlab("HorsePower Japanese")+
ylab("mpg Japanese")
J_ncspline
#Perbandingan all model
plot_Eropa <- arrange(lmJ, J_poly, J_tangga, J_ncspline)