Pemodelan Non-Linear

Soal

Load Packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(splines)
library(rsample)
library(mlr3measures)
## In order to avoid name clashes, do not attach 'mlr3measures'. Instead, only load the namespace with `requireNamespace("mlrmeasures")` and access the measures directly via `::`, e.g. `mlr3measures::auc()`.
library(ISLR)
library(grid)

#Import data Auto Data Auto adalah data frame dengan 392 Observasi yag terdiri dari 9 Variabel

data("Auto",package="ISLR")
attach(Auto)   #data("Auto",package="ISLR")
## The following object is masked from package:ggplot2:
## 
##     mpg
head(cbind(mpg, horsepower,weight, displacement, acceleration))
##      mpg horsepower weight displacement acceleration
## [1,]  18        130   3504          307         12.0
## [2,]  15        165   3693          350         11.5
## [3,]  18        150   3436          318         11.0
## [4,]  16        150   3433          304         12.0
## [5,]  17        140   3449          302         10.5
## [6,]  15        198   4341          429         10.0

A. Hubungan Non-Linier

Menampilkan plot Non Linear Model untuk 3 Peubah Penjelas yaitu mpg, horsepower, dan displacement

# 1. mpg vs horsepower
lmhp <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.7, color="coral") + 
  stat_smooth(method = "lm", 
              formula = y~x,lty = 1,
              col = "100",se = F)+
  labs(title="Non Linear Model mpg vs horsepower")+
  xlab("HorsePower")+
  ylab("mpg")
lmhp

# 2. mpg vs weight
lmw <- ggplot(Auto,aes(x= weight, y=mpg)) +
  geom_point(alpha=0.7, color="orange") + 
  stat_smooth(method = "lm", 
              formula = y~x,lty = 1,
              col = "100",se = F)+
  labs(title="Non Linear Model mpg vs weight")+
  xlab("weight")+
  ylab("mpg")
lmw

# 3. mpg vs displacement
lmdp <- ggplot(Auto,aes(x=displacement, y=mpg)) +
  geom_point(alpha=0.7, color="magenta") + 
  stat_smooth(method = "lm", 
              formula = y~x,lty = 1,
              col = "100",se = F)+
  labs(title="Non Linear Model mpg vs displacement")+
  xlab("displacement")+
  ylab("mpg")
lmdp

Visually

# Menampilkan plot ketiga variabel data
vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)
arrange <- function(..., nrow=NULL, ncol=NULL, as.table=FALSE) {
  dots <- list(...)
  n <- length(dots)
  if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}
  if(is.null(nrow)) { nrow = ceiling(n/ncol)}
  if(is.null(ncol)) { ncol = ceiling(n/nrow)}
  ## NOTE see n2mfrow in grDevices for possible alternative
  grid.newpage()
  pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )
  ii.p <- 1
  for(ii.row in seq(1, nrow)){
    ii.table.row <- ii.row
    if(as.table) {ii.table.row <- nrow - ii.table.row + 1}
    for(ii.col in seq(1, ncol)){
      ii.table <- ii.p
      if(ii.p > n) break
      print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))
      ii.p <- ii.p + 1
    }
  }
}


nlm_all <- arrange(lmdp, lmhp, lmw)

Dari ketiga model yang dipilih untuk masing-masing hubungan peubah prediktor dan peubah penjelas menunjukkan model yang non-linear. Olehnya itu akan dilakukan pemodelan non linear dengan 3 cara. Pemodelan tersebut yaitu Regresi Polinomial, Regresi Fungsi Tangga, dan Regresi Kubik Spline Natural.

B. Menduga Model Non-Linier Terbaik

B.1. mpg vs horsepower

  1. Polynomial regression
##Empirically
##Polynomial regression  dengan Cross Validation

set.seed(1501211001)
cross_val <- vfold_cv(Auto,v= 10, strata ="mpg")

ordo <- 2:8
best_poly_hp <- map_dfr (ordo, function(i) {
  metric_poly <- map_dfr(cross_val$splits,
                         function(x){
                           mod <- lm(mpg ~ poly(horsepower,ordo=i,raw = T),
                                     data=Auto[x$in_id,])
                           pred <- predict(mod,
                                           newdata=Auto[-x$in_id,])
                           truth <- Auto[-x$in_id,]$mpg
                           rmse <- mlr3measures::rmse(truth = truth,
                                                      response = pred
                           )
                           mae <- mlr3measures::mae(truth = truth,
                                                    response = pred
                           )
                           metric <- c(rmse,mae)
                           names(metric) <- c("rmse","mae")
                           return(metric)
                         }
  )
  metric_poly
  # menghitung rata-rata untuk 10 folds
  mean_metric_hp <- colMeans(metric_poly)
  mean_metric_hp
}
)
best_poly_hp <- cbind(ordo,best_poly_hp)
best_poly_hp
##   ordo     rmse      mae
## 1    2 4.370676 3.275163
## 2    3 4.393599 3.292301
## 3    4 4.421085 3.316888
## 4    5 4.387112 3.292114
## 5    6 4.379439 3.290104
## 6    7 4.366408 3.270198
## 7    8 4.372519 3.279792

Model yang akan digunakan adalah derajat 2 karena niali rmse untuk ordo 2 adalah yang terendah.

##Visually
##2nd degree polynomial regression

polihp <-  lm( mpg ~ poly(horsepower,2,raw = T),data= Auto)

poly_hp <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "100",se = F)+
  labs(title="Regresi Polinomial Derajat 2")+
  xlab("HorsePower ")+
  ylab("mpg")
poly_hp

##2. Piecewise Constant Function (Fungsi Tangga)

### Empirically

set.seed(1001)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:15

best_tangga_hp <- map_dfr(breaks, function(i){
  
  metric_tangga <- map_dfr(cross_val$splits,
                            function(x){
                              
                              training <- Auto[x$in_id,]
                              training$horsepower <- cut(training$horsepower,i)
                              
                              mod <- lm(mpg ~ horsepower,
                                        data=training)
                              
                              labs_x <- levels(mod$model[,2])
                              
                              labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                                                     upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
                              
                              
                              testing <- Auto[-x$in_id,]
                              
                              horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                              
                              pred <- predict(mod,
                                              newdata=list(horsepower=horsepower_new))
                              truth <- testing$mpg
                              
                              data_eval <- na.omit(data.frame(truth,pred))
                              
                              rmse <- mlr3measures::rmse(truth = data_eval$truth,
                                                         response = data_eval$pred
                              )
                              mae <- mlr3measures::mae(truth = data_eval$truth,
                                                       response = data_eval$pred
                              )
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                            }
  )
  
  metric_tangga
  
  # menghitung rata-rata untuk 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  
  mean_metric_tangga
})
best_tangga_hp <- cbind(breaks=breaks,best_tangga_hp)
# menampilkan hasil all breaks
best_tangga_hp
##    breaks     rmse      mae
## 1       2 6.142857 4.800847
## 2       3 5.776608 4.522114
## 3       4 4.981764 3.782906
## 4       5 4.713393 3.578057
## 5       6 4.649392 3.537827
## 6       7 4.570616 3.392555
## 7       8 4.457050 3.407758
## 8       9 4.536498 3.441833
## 9      10 4.585155 3.455845
## 10     11 4.552872 3.374287
## 11     12 4.456176 3.318251
## 12     13 4.520986 3.356401
## 13     14 4.403887 3.241835
## 14     15 4.300994 3.249859

Dari hasil rmse dan mae, saya menggunkan 7 interval untuk dimodelkan.

#Visually
tangga1 = lm(mpg ~ cut(horsepower,7),data= Auto)
summary(tangga1)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 7), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5280  -2.5530  -0.3758   2.2881  16.7482 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    32.5280     0.5069   64.17   <2e-16 ***
## cut(horsepower, 7)(72.3,98.6]  -6.8162     0.6358  -10.72   <2e-16 ***
## cut(horsepower, 7)(98.6,125]  -12.1523     0.7591  -16.01   <2e-16 ***
## cut(horsepower, 7)(125,151]   -16.5763     0.7957  -20.83   <2e-16 ***
## cut(horsepower, 7)(151,177]   -18.5020     1.0831  -17.08   <2e-16 ***
## cut(horsepower, 7)(177,204]   -19.4447     1.4187  -13.71   <2e-16 ***
## cut(horsepower, 7)(204,230]   -19.6280     1.5375  -12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.59 on 385 degrees of freedom
## Multiple R-squared:  0.6594, Adjusted R-squared:  0.6541 
## F-statistic: 124.2 on 6 and 385 DF,  p-value: < 2.2e-16
tangga_hp <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
  geom_point(alpha=0.7, color="coral") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,13), 
              lty = 1, col = "100",se = F)+
  labs(title="Regresi Fungsi Tangga Knot 12")+
  xlab("HorsePower")+
  ylab("mpg")
tangga_hp

#3. Natural Cubic Spline 

#nilai knots yang ditentukan oleh komputer, misal df=5
attr(ns(Auto$horsepower, df=5),"knots")
## 20% 40% 60% 80% 
##  72  88 100 140
#Plot BSpline
ggplot(Auto,aes(x=horsepower, y= mpg)) +
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
              formula = y~bs(x, knots = c(70, 90, 100, 140)),
              lty = 1, col = "brown",se = F)+
  labs(title="Regresi Cubic Spline Knot 70, 90, 100, 140")+
  xlab("HorsePower")+
  ylab("mpg")

## Empirically

set.seed(1501211001)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:12

best_spline_hp <- map_dfr(df, function(i){
  metric_spline <- map_dfr(cross_val$splits,
                            function(x){
                              mod <- lm(mpg ~ ns(horsepower,df=i),
                                        data=Auto[x$in_id,])
                              pred <- predict(mod,
                                              newdata=Auto[-x$in_id,])
                              truth <- Auto[-x$in_id,]$mpg
                              
                              rmse <- mlr3measures::rmse(truth = truth,
                                                         response = pred
                              )
                              mae <- mlr3measures::mae(truth = truth,
                                                       response = pred
                              )
                              metric <- c(rmse,mae)
                              names(metric) <- c("rmse","mae")
                              return(metric)
                            }
  )
  
  metric_spline
  
  # menghitung rata-rata untuk 10 folds
  mean_spline1 <- colMeans(metric_spline)
  
  mean_spline1
}
)
best_spline_hp <- cbind(df=df,best_spline_hp)
# menampilkan hasil all breaks
best_spline_hp
##    df     rmse      mae
## 1   2 4.353554 3.267611
## 2   3 4.394696 3.296135
## 3   4 4.388334 3.295952
## 4   5 4.384292 3.286203
## 5   6 4.381250 3.278629
## 6   7 4.356872 3.257715
## 7   8 4.361673 3.263892
## 8   9 4.357740 3.252422
## 9  10 4.346026 3.233591
## 10 11 4.380925 3.264800
## 11 12 4.364587 3.238694

Dari hasil rmse dan mae, saya menggunkan 13 interval untuk dimodelkan.

##Visually

#nilai knots yang ditentukan oleh komputer, misal df= 10
attr(ns(Auto$horsepower, df=10),"knots")
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7
splinehp = lm(mpg ~ ns(horsepower, df =10), data= Auto)
summary(splinehp)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 10), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5003  -2.4956  -0.2086   2.2577  14.6400 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 32.462      1.680  19.324  < 2e-16 ***
## ns(horsepower, df = 10)1    -4.600      1.789  -2.572 0.010495 *  
## ns(horsepower, df = 10)2    -3.007      2.555  -1.177 0.239996    
## ns(horsepower, df = 10)3    -8.663      2.043  -4.239 2.82e-05 ***
## ns(horsepower, df = 10)4    -7.170      2.188  -3.278 0.001142 ** 
## ns(horsepower, df = 10)5   -14.722      2.125  -6.927 1.84e-11 ***
## ns(horsepower, df = 10)6    -8.446      2.467  -3.424 0.000684 ***
## ns(horsepower, df = 10)7   -16.947      2.080  -8.148 5.38e-15 ***
## ns(horsepower, df = 10)8   -21.715      1.891 -11.483  < 2e-16 ***
## ns(horsepower, df = 10)9   -14.384      4.116  -3.494 0.000531 ***
## ns(horsepower, df = 10)10  -22.326      1.946 -11.476  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared:  0.7124, Adjusted R-squared:  0.7049 
## F-statistic: 94.39 on 10 and 381 DF,  p-value: < 2.2e-16
ncspline_hp <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, knots = c(67,72,80,88,93.5, 100, 110, 140,157.7)),
              lty = 1, col = "100",se = F)+
  labs(title="Natural Cubic Spline df = 10")+
  xlab("HorsePower ")+
  ylab("mpg")+
  geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140, 157.7), col="grey50", lty=2)
ncspline_hp

Karena model yang dihasilkan untuk interval 13 kurang baik, maka saya mencoba df 2 untuk rmse terkecil kedua untuk dimodelkan.

#Menggunakan df=2
#nilai knots yang ditentukan oleh komputer df= 2
attr(ns(Auto$horsepower, df=2),"knots")
##  50% 
## 93.5
ncspline_hp2 <- ggplot(Auto,aes(x=horsepower, y= mpg)) +
  geom_point(alpha=0.55, color="coral") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=2),
              lty = 1, col = "100",se = F)+
  labs(title="Natural Cubic Spline df = 2")+
  xlab("HorsePower ")+
  ylab("mpg")+
  geom_vline(xintercept = c(93,5), col="grey50", lty=2)
ncspline_hp2

Hasil yang diperoleh menunjukkan bahwa model df=2 yang memiliki plot yang sesuai.

#Pembandingan Model

plot_hp <- arrange(poly_hp, tangga_hp, ncspline_hp2)

B.2. mpg vs weight

  1. Polynomial regression

Untuk variabel selanjutnya, yaitu ‘weight’ dan ‘displacement’ menggunakan langkah dan cara yang sama dengan sebelumnya, olehnya itu, akan ditampilkan plotnya saja atau secara visual.

##Visually
##2nd degree polynomial regression

poliw <-  lm( mpg ~ poly(weight,2,raw = T),data= Auto)

poly_w <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "100",se = F)+
  labs(title="Regresi Polinomial Derajat 2")+
  xlab("weight ")+
  ylab("mpg")
poly_w

  1. Piecewise Constant Function (Fungsi Tangga)

Dari hasil rmse dan mae, saya menggunkan 12 interval dari nilai mae terendah untuk dimodelkan

#Visually
tanggaw = lm(mpg ~ cut(weight,12),data= Auto)
summary(tanggaw)
## 
## Call:
## lm(formula = mpg ~ cut(weight, 12), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3889  -2.3952  -0.4377   1.7340  16.3604 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         33.4611     0.9801  34.141  < 2e-16 ***
## cut(weight, 12)(1.91e+03,2.2e+03]   -1.0722     1.0958  -0.979    0.328    
## cut(weight, 12)(2.2e+03,2.49e+03]   -6.4215     1.1344  -5.661 2.97e-08 ***
## cut(weight, 12)(2.49e+03,2.79e+03]  -7.5468     1.1460  -6.585 1.52e-10 ***
## cut(weight, 12)(2.79e+03,3.08e+03] -10.6536     1.1802  -9.027  < 2e-16 ***
## cut(weight, 12)(3.08e+03,3.38e+03] -13.5763     1.2184 -11.143  < 2e-16 ***
## cut(weight, 12)(3.38e+03,3.67e+03] -15.2640     1.2061 -12.656  < 2e-16 ***
## cut(weight, 12)(3.67e+03,3.96e+03] -16.9315     1.2653 -13.382  < 2e-16 ***
## cut(weight, 12)(3.96e+03,4.26e+03] -18.6694     1.2965 -14.400  < 2e-16 ***
## cut(weight, 12)(4.26e+03,4.55e+03] -19.5051     1.2854 -15.175  < 2e-16 ***
## cut(weight, 12)(4.55e+03,4.85e+03] -20.8611     1.6400 -12.720  < 2e-16 ***
## cut(weight, 12)(4.85e+03,5.14e+03] -21.4611     1.9602 -10.949  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.158 on 380 degrees of freedom
## Multiple R-squared:  0.7242, Adjusted R-squared:  0.7162 
## F-statistic: 90.69 on 11 and 380 DF,  p-value: < 2.2e-16
tangga_w <- ggplot(Auto,aes(x=weight, y= mpg)) +
  geom_point(alpha=0.7, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,12), 
              lty = 1, col = "100",se = F)+
  labs(title="Regresi Fungsi Tangga Knot 11")+
  xlab("weight")+
  ylab("mpg")
tangga_w

  1. Natural Cubic Spline
#nilai knots yang ditentukan oleh komputer, misal df=5
attr(ns(Auto$weight, df=5),"knots")
##    20%    40%    60%    80% 
## 2155.0 2583.2 3113.4 3820.8
#Plot BSpline
ggplot(Auto,aes(x=weight, y= mpg)) +
  geom_point(alpha=0.55, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~bs(x, knots = c(2155, 2583.2, 3113.4, 3820.8)),
              lty = 1, col = "brown",se = F)+
  labs(title="Regresi Cubic Spline Knot 2155, 2583.2, 3113.4, 3820.8")+
  xlab("weight")+
  ylab("mpg")

Dari hasil rmse dan mae, saya menggunkan 7 interval untuk dimodelkan

##Visually

#nilai knots yang ditentukan oleh komputer, misal df= 7
attr(ns(Auto$weight, df=7),"knots")
## 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429% 
##  2074.857  2285.429  2635.000  2986.571  3446.143  4096.286
splinew = lm(mpg ~ ns(weight, df =7), data= Auto)
summary(splinew)
## 
## Call:
## lm(formula = mpg ~ ns(weight, df = 7), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3314  -2.5302  -0.4222   1.8281  16.3921 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           31.575      2.038  15.495  < 2e-16 ***
## ns(weight, df = 7)1   -6.249      1.933  -3.233 0.001332 ** 
## ns(weight, df = 7)2   -4.599      2.514  -1.829 0.068144 .  
## ns(weight, df = 7)3  -10.661      2.255  -4.728 3.19e-06 ***
## ns(weight, df = 7)4  -13.286      2.378  -5.587 4.39e-08 ***
## ns(weight, df = 7)5  -19.001      1.727 -11.005  < 2e-16 ***
## ns(weight, df = 7)6  -15.370      4.628  -3.321 0.000982 ***
## ns(weight, df = 7)7  -22.327      1.958 -11.401  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.143 on 384 degrees of freedom
## Multiple R-squared:  0.7233, Adjusted R-squared:  0.7183 
## F-statistic: 143.4 on 7 and 384 DF,  p-value: < 2.2e-16
ncspline_w <- ggplot(Auto,aes(x=weight, y= mpg)) +
  geom_point(alpha=0.55, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, knots = c(2074, 2285, 2635, 2986, 3446, 4096)),
              lty = 1, col = "100",se = F)+
  labs(title="Natural Cubic Spline df = 10")+
  xlab("weight ")+
  ylab("mpg")+
  geom_vline(xintercept = c(2074, 2285, 2635, 2986, 3446, 4096), col="grey50", lty=2)
ncspline_w

#Menggunakan df=2
#nilai knots yang ditentukan oleh komputer df= 2
attr(ns(Auto$weight, df=2),"knots")
##    50% 
## 2803.5
ncspline_w2 <- ggplot(Auto,aes(x=weight, y= mpg)) +
  geom_point(alpha=0.55, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, df=2),
              lty = 1, col = "100",se = F)+
  labs(title="Natural Cubic Spline df = 2")+
  xlab("weight ")+
  ylab("mpg")+
  geom_vline(xintercept = c(2803), col="grey50", lty=2)
ncspline_w2

#Pembandingan Model

plot_w <- arrange(poly_w, tangga_w, ncspline_w2)

B.3. mpg vs displacement

  1. Polynomial regression
##Visually
##3rd degree polynomial regression

polidp <-  lm( mpg ~ poly(displacement,3,raw = T),data= Auto)

poly_dp <- ggplot(Auto,aes(x=displacement, y=mpg)) + 
  geom_point(alpha=0.55, color="magenta") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,3,raw=T), 
              lty = 1, col = "100",se = F)+
  labs(title="Regresi Polinomial Derajat 3")+
  xlab("HorsePower ")+
  ylab("mpg")
poly_dp

Model yang terbaik adalah yang nilai mae nya paling rendah yaitu derajat 3.

  1. Piecewise Constant Function (Fungsi Tangga)

Dari hasil rmse dan mae, saya menggunkan 13 interval untuk dimodelkan

#Visually
tanggadp = lm(mpg ~ cut(displacement,13),data= Auto)
summary(tanggadp)
## 
## Call:
## lm(formula = mpg ~ cut(displacement, 13), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.7038  -2.5393  -0.3469   2.0850  19.4607 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      31.7038     0.5048  62.804  < 2e-16 ***
## cut(displacement, 13)(97.8,128]  -3.4955     0.7010  -4.986 9.39e-07 ***
## cut(displacement, 13)(128,157]   -5.9712     0.8127  -7.347 1.25e-12 ***
## cut(displacement, 13)(157,187]   -8.4948     1.4359  -5.916 7.38e-09 ***
## cut(displacement, 13)(187,217]  -10.8288     1.3825  -7.833 4.84e-14 ***
## cut(displacement, 13)(217,247]  -12.2851     0.9359 -13.126  < 2e-16 ***
## cut(displacement, 13)(247,276]  -13.1646     0.9822 -13.403  < 2e-16 ***
## cut(displacement, 13)(276,306]  -16.1311     1.0763 -14.988  < 2e-16 ***
## cut(displacement, 13)(306,336]  -16.7238     1.1174 -14.966  < 2e-16 ***
## cut(displacement, 13)(336,366]  -16.9780     0.9466 -17.936  < 2e-16 ***
## cut(displacement, 13)(366,395]  -17.7038     2.6231  -6.749 5.58e-11 ***
## cut(displacement, 13)(395,425]  -17.7423     1.3356 -13.284  < 2e-16 ***
## cut(displacement, 13)(425,455]  -18.4816     1.5695 -11.775  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.458 on 379 degrees of freedom
## Multiple R-squared:  0.6837, Adjusted R-squared:  0.6737 
## F-statistic: 68.28 on 12 and 379 DF,  p-value: < 2.2e-16
tangga_dp <- ggplot(Auto,aes(x=displacement, y= mpg)) +
  geom_point(alpha=0.7, color="magenta") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,13), 
              lty = 1, col = "100",se = F)+
  labs(title="Regresi Fungsi Tangga Knot 12")+
  xlab("HorsePower")+
  ylab("mpg")
tangga_dp

#3. Natural Cubic Spline 

#nilai knots yang ditentukan oleh komputer, misal df=5
attr(ns(Auto$displacement, df=5),"knots")
## 20% 40% 60% 80% 
##  98 122 225 305
#Plot BSpline
ggplot(Auto,aes(x=displacement, y= mpg)) +
  geom_point(alpha=0.55, color="magenta") +
  stat_smooth(method = "lm", 
              formula = y~bs(x, knots = c(98, 122, 225, 305)),
              lty = 1, col = "brown",se = F)+
  labs(title="Regresi Cubic Spline Knot 98, 122, 225, 305")+
  xlab("HorsePower")+
  ylab("mpg")

Dari hasil rmse dan mae, saya menggunkan 11 interval untuk dimodelkan.

##Visually

#nilai knots yang ditentukan oleh komputer, misal df= 11
attr(ns(Auto$displacement, df=11),"knots")
## 9.090909% 18.18182% 27.27273% 36.36364% 45.45455% 54.54545% 63.63636% 72.72727% 
##        90        97       107       120       140       168       231       258 
## 81.81818% 90.90909% 
##       318       351
splinedp = lm(mpg ~ ns(displacement, df =11), data= Auto)
summary(splinedp)
## 
## Call:
## lm(formula = mpg ~ ns(displacement, df = 11), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.7869  -2.4652  -0.3588   2.0662  19.6697 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   24.072      1.725  13.956  < 2e-16 ***
## ns(displacement, df = 11)1     4.014      1.791   2.241 0.025602 *  
## ns(displacement, df = 11)2     8.300      2.382   3.485 0.000549 ***
## ns(displacement, df = 11)3    -1.989      2.148  -0.926 0.355155    
## ns(displacement, df = 11)4     3.740      2.191   1.707 0.088692 .  
## ns(displacement, df = 11)5    -2.691      2.825  -0.953 0.341422    
## ns(displacement, df = 11)6    -4.750      2.392  -1.986 0.047757 *  
## ns(displacement, df = 11)7    -5.493      2.482  -2.213 0.027497 *  
## ns(displacement, df = 11)8    -9.443      2.112  -4.471 1.03e-05 ***
## ns(displacement, df = 11)9   -15.271      1.696  -9.002  < 2e-16 ***
## ns(displacement, df = 11)10    3.221      4.067   0.792 0.428812    
## ns(displacement, df = 11)11  -18.798      1.766 -10.642  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.096 on 380 degrees of freedom
## Multiple R-squared:  0.7324, Adjusted R-squared:  0.7246 
## F-statistic: 94.53 on 11 and 380 DF,  p-value: < 2.2e-16
ncspline_dp <- ggplot(Auto,aes(x=displacement, y= mpg)) +
  geom_point(alpha=0.55, color="magenta") +
  stat_smooth(method = "lm", 
              formula = y~ns(x, knots = c(90,97,107,120, 140, 168,231,258,318,351)),
              lty = 1, col = "100",se = F)+
  labs(title="Natural Cubic Spline df = 11")+
  xlab("HorsePower ")+
  ylab("mpg")+
  geom_vline(xintercept = c(90,97,107,120, 140, 168,231,258,318,351), col="grey50", lty=2)
ncspline_dp

#Pembandingan Model

plot_dp <- arrange(poly_dp, tangga_dp, ncspline_dp)

Kesimpulan

Model terbaik yang dihasilkan untuk regresi non-linear yaitu: 1. mpg vs horsepower dengan model terbaik yaitu natural cubic spline df 2 2. mpg vs weight dengan model terbaik yaitu natural cubic spline df 2 3. mpg vs displacement, model terbaik yaitu menggunakan Regresi Fungsi Tangga knot 12

Menampilkan perbandingan plot

plot_best <- arrange(ncspline_hp2, ncspline_w2, tangga_dp)