Nonlinear Regression

Library

#install.packages("MultiKink")
library(MultiKink)
library(MatrixModels)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6      v purrr   0.3.4 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.1      v stringr 1.4.1 
## v readr   2.1.2      v forcats 0.5.2 
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
# mlr3measures is called via :: below, so loading its namespace is enough
requireNamespace("mlr3measures")
library(splines)
library(cowplot)  
library(DT) 

Data

set.seed(120)
peubah.x <- rnorm(1000,2,1)
error <- rnorm(1000)
y <- 2+1*peubah.x+4*peubah.x^2+error
plot(peubah.x,y)

Fitting a Linear Model

lin.mod <- lm(y ~ peubah.x)
plot(peubah.x,y)
lines(peubah.x,lin.mod$fitted.values,col="red")

summary(lin.mod)
## 
## Call:
## lm(formula = y ~ peubah.x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.656 -3.689 -2.091  1.478 59.779 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.1075     0.4217  -26.34   <2e-16 ***
## peubah.x     17.6424     0.1857   94.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6 on 998 degrees of freedom
## Multiple R-squared:  0.9004, Adjusted R-squared:  0.9003 
## F-statistic:  9022 on 1 and 998 DF,  p-value: < 2.2e-16

Fitting a Polynomial Model

pol.mod <- lm(y ~ peubah.x + I(peubah.x^2)) # second-order polynomial
ix <- sort(peubah.x, index.return = TRUE)$ix # ix = ordering of the observations, for plotting sorted lines
plot(peubah.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(peubah.x[ix], pol.mod$fitted.values[ix],col="blue", cex=2)

summary(pol.mod)
## 
## Call:
## lm(formula = y ~ peubah.x + I(peubah.x^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.84314 -0.64438 -0.00555  0.63566  2.87286 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.80844    0.09535   18.97   <2e-16 ***
## peubah.x       1.13236    0.09051   12.51   <2e-16 ***
## I(peubah.x^2)  3.98865    0.02063  193.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9678 on 997 degrees of freedom
## Multiple R-squared:  0.9974, Adjusted R-squared:  0.9974 
## F-statistic: 1.921e+05 on 2 and 997 DF,  p-value: < 2.2e-16

Fitting a Step-Function Model

# step-function regression
range(peubah.x) # min and max values
## [1] -1.083235  6.089670
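Before fitting, it can help to see what cut() actually does. This quick check (an addition, not part of the original analysis) shows the five intervals and how many observations fall into each:

table(cut(peubah.x, 5))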
step.mod <- lm(y~ cut(peubah.x,5))
plot(peubah.x,y)
lines(peubah.x,lin.mod$fitted.values,col="red")
lines(peubah.x[ix], step.mod$fitted.values[ix],col="Blue")

summary(step.mod)
## 
## Call:
## lm(formula = y ~ cut(peubah.x, 5))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.669  -5.373  -0.731   4.152  42.954 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.274      1.108   2.053   0.0403 *  
## cut(peubah.x, 5)(0.351,1.79]    7.360      1.173   6.272 5.32e-10 ***
## cut(peubah.x, 5)(1.79,3.22]    26.045      1.160  22.443  < 2e-16 ***
## cut(peubah.x, 5)(3.22,4.66]    58.630      1.293  45.345  < 2e-16 ***
## cut(peubah.x, 5)(4.66,6.1]    110.880      3.876  28.604  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.43 on 995 degrees of freedom
## Multiple R-squared:  0.8477, Adjusted R-squared:  0.8471 
## F-statistic:  1385 on 4 and 995 DF,  p-value: < 2.2e-16

Model Comparison

MSE <- function(pred, actual){
  mean((pred - actual)^2)
}
model <- data.frame(Model = c("Linear", "Polynomial (2)", "Step (5)"),
                    AIC = c(AIC(lin.mod), AIC(pol.mod), AIC(step.mod)),
                    MSE = c(MSE(predict(lin.mod), y), MSE(predict(pol.mod), y),
                            MSE(predict(step.mod), y)))
knitr::kable(model)
|Model          |      AIC|        MSE|
|:--------------|--------:|----------:|
|Linear         | 6425.359| 35.9267479|
|Polynomial (2) | 2777.312|  0.9337327|
|Step (5)       | 6855.818| 54.9234939|

The best model is the one with the smallest AIC and MSE. Of the three fits above, the degree-2 polynomial has the smallest values of both (AIC = 2777.312 and MSE = 0.9337327), which also confirms that the underlying relationship is not linear.
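As a sanity check on the criterion itself, the AIC of a Gaussian lm fit can be reproduced from the residual sum of squares. The helper aic_lm() below is a hypothetical sketch, not part of the original analysis; it should agree with AIC():

# AIC = -2*logLik + 2*(p + 1); the +1 counts the estimated error variance
aic_lm <- function(mod) {
  rss <- sum(residuals(mod)^2)
  n <- length(residuals(mod))
  p <- length(coef(mod))
  n * (log(2 * pi) + log(rss / n) + 1) + 2 * (p + 1)
}
aic_lm(pol.mod) # should match AIC(pol.mod)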

Piecewise Polynomial

# Split the data into two parts at the cutpoint x = 3
data <- cbind(peubah.x, y)
data1 <- data[peubah.x <= 3,]
data2 <- data[peubah.x > 3,]

# Plotting
plot(peubah.x, y, main="Piecewise Cubic Polynomial\nSimulated Data")
abline(v=3, col="red", lty=3)
cub.mod.data1 <- lm(data1[,2] ~ data1[,1]+I(data1[,1]^2)+I(data1[,1]^3))
ix.data1 <- sort(data1[,1], index.return=T)$ix
lines(data1[ix.data1,1],cub.mod.data1$fitted.values[ix.data1],col="blue", lwd=2)
cub.mod.data2 <- lm(data2[,2] ~ data2[,1]+I(data2[,1]^2)+I(data2[,1]^3))
ix.data2 <- sort(data2[,1], index.return=T)$ix
lines(data2[ix.data2,1],cub.mod.data2$fitted.values[ix.data2],col="blue", lwd=2)
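The two cubics above are fitted independently, so the curve is free to jump at x = 3. A minimal sketch (an addition, not part of the original analysis) of forcing the pieces to join smoothly is to add a truncated cubic term, which turns the piecewise fit into a cubic spline with one knot at 3:

# truncated power basis term: zero left of the knot, (x - 3)^3 to the right
tp <- pmax(0, peubah.x - 3)^3
cub.spline <- lm(y ~ peubah.x + I(peubah.x^2) + I(peubah.x^3) + tp)
plot(peubah.x, y, main = "Cubic Spline with One Knot at x = 3")
lines(sort(peubah.x), fitted(cub.spline)[order(peubah.x)], col = "red", lwd = 2)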

Spline Regression

# 1. Determine the number of basis functions and the knots
data <- as.data.frame(data)
knots.val <- attr(bs(data$peubah.x, df=6), "knots")
knots.val
##      25%      50%      75% 
## 1.320316 2.036762 2.667127
With df = 6 and a cubic basis, bs() places the three interior knots shown above at the quartiles of peubah.x.

# 2. Fit the regression spline with the knots found above
spline <- lm(y ~ bs(peubah.x, knots=knots.val), data=data)
summary(spline)
## 
## Call:
## lm(formula = y ~ bs(peubah.x, knots = knots.val), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.84657 -0.65390 -0.00086  0.64618  2.89459 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        5.4530     0.6168   8.841  < 2e-16 ***
## bs(peubah.x, knots = knots.val)1  -6.2864     0.9744  -6.452 1.73e-10 ***
## bs(peubah.x, knots = knots.val)2  -3.9714     0.5611  -7.077 2.78e-12 ***
## bs(peubah.x, knots = knots.val)3  14.0370     0.6469  21.700  < 2e-16 ***
## bs(peubah.x, knots = knots.val)4  45.9965     0.6324  72.732  < 2e-16 ***
## bs(peubah.x, knots = knots.val)5  94.0344     0.8746 107.522  < 2e-16 ***
## bs(peubah.x, knots = knots.val)6 151.0685     1.0978 137.613  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9691 on 993 degrees of freedom
## Multiple R-squared:  0.9974, Adjusted R-squared:  0.9974 
## F-statistic: 6.385e+04 on 6 and 993 DF,  p-value: < 2.2e-16
ggplot(data, aes(x = peubah.x, y = y)) +
  geom_point(color = "blue") +
  stat_smooth(method = "lm", formula = y ~ bs(x, knots = knots.val), lty = 1, se = FALSE)

Smoothing Spline

mod<- smooth.spline(peubah.x, y, all.knots=T)
mod1 <- with(data=data, smooth.spline(peubah.x, y))
mod1
## Call:
## smooth.spline(x = peubah.x, y = y)
## 
## Smoothing Parameter  spar= 0.9182199  lambda= 0.0004650269 (16 iterations)
## Equivalent Degrees of Freedom (Df): 12.31604
## Penalized Criterion (RSS): 924.07
## GCV: 0.9497684
plot(peubah.x, y, pch=19, main="Smoothing Spline, Simulated Data")
lines(mod, col="orange", lwd=1.5)
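By default smooth.spline() picks the smoothing parameter by generalized cross-validation (the GCV value printed above). As a sketch of an alternative, and assuming no duplicated x values, cv = TRUE switches to leave-one-out cross-validation, so the two choices of lambda can be compared:

mod.cv <- smooth.spline(peubah.x, y, cv = TRUE) # leave-one-out CV
c(GCV = mod1$lambda, LOOCV = mod.cv$lambda)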

LOESS Function

loess.mod <- loess(formula = y ~ peubah.x, data = data) # named loess.mod to avoid masking stats::loess
summary(loess.mod)
## Call:
## loess(formula = y ~ peubah.x, data = data)
## 
## Number of Observations: 1000 
## Equivalent Number of Parameters: 5.4 
## Residual Standard Error: 0.9682 
## Trace of smoother matrix: 5.91  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
loess1 <- data.frame(span=seq(0.1,2,by=0.1)) %>% 
  group_by(span) %>% 
  do(broom::augment(loess(y~peubah.x, data=data,span=.$span)))
ggplot(loess1, aes(x=peubah.x, y=y)) + geom_line(aes(y=.fitted), col="blue", lty=1) + facet_wrap(~span)
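For a chosen span, predict() on a loess fit can also return pointwise standard errors, which helps judge where the curve is uncertain. A small sketch, not part of the original analysis:

pr <- predict(loess.mod, se = TRUE) # fitted values plus standard errors
head(data.frame(fit = pr$fit, se = pr$se.fit))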

Comparing All the Models

ggplot(data, aes(x = peubah.x, y = y)) +
  geom_point(color = "black") +
  stat_smooth(method = "lm", formula = y ~ x, col = "blue", show.legend = TRUE) +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2, raw = TRUE), col = "red", lwd = 3.5, show.legend = TRUE) +
  stat_smooth(method = "lm", formula = y ~ cut(x, 5), col = "darkgreen", show.legend = TRUE) +
  stat_smooth(method = "lm", formula = y ~ bs(x, knots = knots.val), col = "green", lwd = 2, show.legend = TRUE) +
  stat_smooth(method = "loess", formula = y ~ x, col = "orange", show.legend = TRUE)

model.data2 <- data.frame(Model = c("Linear", "Polynomial (2)", "Step-Function (5)",
                                    "Spline Regression", "LOESS Function"),
                          MSE = c(MSE(predict(lin.mod), y), MSE(predict(pol.mod), y),
                                  MSE(predict(step.mod), y), MSE(predict(spline), y),
                                  MSE(predict(loess.mod), y)))
knitr::kable(model.data2)
|Model             |        MSE|
|:-----------------|----------:|
|Linear            | 35.9267479|
|Polynomial (2)    |  0.9337327|
|Step-Function (5) | 54.9234939|
|Spline Regression |  0.9325520|
|LOESS Function    |  0.9314191|

The best model here is the LOESS fit, which has the smallest MSE, although the spline regression and the degree-2 polynomial are very close behind.

Example: the ISLR Auto Data

library(ISLR)
AutoData = Auto %>% select(mpg, horsepower, origin)
data_Auto <- tibble(AutoData)
head(data_Auto)
## # A tibble: 6 x 3
##     mpg horsepower origin
##   <dbl>      <dbl>  <dbl>
## 1    18        130      1
## 2    15        165      1
## 3    18        150      1
## 4    16        150      1
## 5    17        140      1
## 6    15        198      1
mpg <- data_Auto$mpg
horsepower <- data_Auto$horsepower
origin <- data_Auto$origin

Shown as a scatterplot:

ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="blue") + 
  theme_bw()

X = horsepower, Y = mpg (miles per gallon).

Based on the scatterplot, vehicles with high horsepower also consume fuel at a high rate: the lower the horsepower, the farther the vehicle travels on each gallon of fuel.

The next step is to find a model that represents this relationship well.

ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
   geom_point(alpha=0.55, color="blue") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "black",se = F)+
  theme_bw()

Linear Regression

set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
  function(x){
    # fit on the analysis (training) part of the fold
    mod <- lm(mpg ~ horsepower, data = data_Auto[x$in_id,])
    # predict on the held-out assessment part
    pred <- predict(mod, newdata = data_Auto[-x$in_id,])
    truth <- data_Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  }
)
metric_linear
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  4.52  3.73
##  2  4.86  3.80
##  3  4.54  3.35
##  4  4.67  3.99
##  5  5.12  4.12
##  6  4.39  3.25
##  7  5.93  4.65
##  8  5.05  3.95
##  9  4.71  3.75
## 10  5.10  3.73
# average over the 10 folds
mean_metric_lin <- colMeans(metric_linear)
mean_metric_lin
##     rmse      mae 
## 4.889086 3.832328
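The indexing by x$in_id above works, but rsample also provides accessors that make the train/test split explicit. A sketch of the equivalent access for the first fold:

fold1 <- cross_val$splits[[1]]
train <- rsample::analysis(fold1)   # the 9/10 of rows used for fitting
test  <- rsample::assessment(fold1) # the held-out 1/10
dim(train); dim(test)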

Polynomial Regression

set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")

df <- 1:10

best_polinomial <- map_dfr(df, function(i){
  metric_polinomial <- map_dfr(cross_val$splits, function(x){
    # fit a polynomial of degree i on the training fold
    mod <- lm(mpg ~ poly(horsepower, degree = i), data = data_Auto[x$in_id,])
    # predict on the test fold
    pred <- predict(mod, newdata = data_Auto[-x$in_id,])
    truth <- data_Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  })
  # average over the 10 folds
  colMeans(metric_polinomial)
})
best_polinomial <- cbind(df = df, best_polinomial)
best_polinomial
##    df     rmse      mae
## 1   1 4.889086 3.832328
## 2   2 4.351129 3.266460
## 3   3 4.355430 3.266945
## 4   4 4.367009 3.280452
## 5   5 4.318767 3.249003
## 6   6 4.317431 3.257272
## 7   7 4.293242 3.233376
## 8   8 4.309511 3.251758
## 9   9 4.323321 3.244919
## 10 10 4.371903 3.274887

From the output above, the smallest RMSE and MAE occur at df = 7, namely 4.293242 and 3.233376.
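Plotting the averaged CV error against the degree makes the flattening after degree 2 and the minimum at 7 easier to see. A small sketch, not part of the original analysis:

ggplot(best_polinomial, aes(x = df, y = rmse)) +
  geom_line() +
  geom_point() +
  labs(x = "polynomial degree", y = "10-fold CV RMSE")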

ggplot(data_Auto,aes(x=horsepower,y=mpg)) +
  geom_point(alpha=0.55,color="blue") +
  stat_smooth(method = "lm",
              formula = y~poly(x,7,raw=T),
              lty = 1,col="black",se=F) +
  ggtitle("Regresi Polinomial Ordo = 7") +
  ylab("mpg") +
  xlab("horsepower") +
  theme(plot.title = element_text(hjust = 0.5))

The plot above shows the fitted polynomial regression curve of order 7. The relationship is negative: the higher the horsepower, the shorter the distance the vehicle travels per gallon of fuel.

Step Function

set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")
breaks <- 3:20
best_tangga <- map_dfr(breaks, function(i){
  metric_tangga <- map_dfr(cross_val$splits, function(x){
    # bin the training predictor into i intervals
    training <- data_Auto[x$in_id,]
    training$horsepower <- cut(training$horsepower, i)
    mod <- lm(mpg ~ horsepower, data = training)
    # recover the numeric break points from the factor labels "(a,b]"
    labs_x <- levels(mod$model[,2])
    labs_x_breaks <- cbind(lower = as.numeric(sub("\\((.+),.*", "\\1", labs_x)),
                           upper = as.numeric(sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
    # bin the test predictor with the SAME breaks as the training data
    testing <- data_Auto[-x$in_id,]
    horsepower_new <- cut(testing$horsepower, c(labs_x_breaks[1,1], labs_x_breaks[,2]))
    pred <- predict(mod, newdata = list(horsepower = horsepower_new))
    truth <- testing$mpg
    # test points outside the training range become NA; drop them
    data_eval <- na.omit(data.frame(truth, pred))
    rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
    mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  })
  # average over the 10 folds
  colMeans(metric_tangga)
})

best_tangga <- cbind(breaks=breaks,best_tangga)
# show the results for all breaks
best_tangga
##    breaks     rmse      mae
## 1       3 5.775792 4.521829
## 2       4 4.985270 3.774549
## 3       5 4.712623 3.571614
## 4       6 4.644383 3.548375
## 5       7 4.554116 3.379980
## 6       8 4.437597 3.405433
## 7       9 4.549668 3.471999
## 8      10 4.589172 3.455531
## 9      11 4.531039 3.380931
## 10     12 4.442697 3.321151
## 11     13 4.527126 3.375170
## 12     14 4.428824 3.270424
## 13     15 4.313162 3.286289
## 14     16 4.336661 3.302031
## 15     17 4.414343 3.347532
## 16     18 4.340715 3.221099
## 17     19 4.331798 3.224404
## 18     20 4.480710 3.320605
# by smallest rmse
best_tangga %>% slice_min(rmse)
##   breaks     rmse      mae
## 1     15 4.313162 3.286289
# by smallest mae
best_tangga %>% slice_min(mae)
##   breaks     rmse      mae
## 1     18 4.340715 3.221099

The best step-function model is the one with the smallest RMSE and MAE. Here the smallest RMSE occurs at 15 breaks, while the smallest MAE occurs at 18 breaks.
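Before plotting the chosen fits, this sketch (an addition) overlays both metrics so the disagreement between the RMSE and MAE optima is visible:

ggplot(best_tangga, aes(x = breaks)) +
  geom_line(aes(y = rmse, col = "rmse")) +
  geom_line(aes(y = mae, col = "mae")) +
  labs(y = "10-fold CV error", col = "metric")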

ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm",formula = y~cut(x,15),
              lty = 1, col = "black",se = F)+
  ggtitle("Fungsi Tangga breaks=15") +
  ylab("mpg") +
  xlab("horsepower") +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm",formula = y~cut(x,18),
              lty = 1, col = "black",se = F)+
  ggtitle("Fungsi Tangga breaks=18") +
  ylab("mpg") +
  xlab("horsepower") +
  theme(plot.title = element_text(hjust = 0.5))

The plots above show the step-function curves with 15 and 18 breaks. The fit with 15 breaks (14 interior cutpoints) looks noticeably smoother than the one with 18 breaks (17 cutpoints), so we choose the step function with 15 intervals.

Natural Spline Regression

set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")

df <- 1:10

best_nspline <- map_dfr(df, function(i){
  nspline <- map_dfr(cross_val$splits, function(x){
    # fit a natural spline with i degrees of freedom on the training fold;
    # df must go to ns(), not to lm()
    mod <- lm(mpg ~ ns(horsepower, df = i), data = data_Auto[x$in_id,])
    pred <- predict(mod, newdata = data_Auto[-x$in_id,])
    truth <- data_Auto[-x$in_id,]$mpg
    rmse <- mlr3measures::rmse(truth = truth, response = pred)
    mae <- mlr3measures::mae(truth = truth, response = pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  })
  # average over the 10 folds
  colMeans(nspline)
})
best_nspline <- cbind(df = df, best_nspline)
best_nspline
##    df     rmse      mae
## 1   1 4.889086 3.832328
## 2   2 4.889086 3.832328
## 3   3 4.889086 3.832328
## 4   4 4.889086 3.832328
## 5   5 4.889086 3.832328
## 6   6 4.889086 3.832328
## 7   7 4.889086 3.832328
## 8   8 4.889086 3.832328
## 9   9 4.889086 3.832328
## 10 10 4.889086 3.832328

Every value of df returns exactly the same RMSE and MAE as the simple linear regression. That is the fingerprint of a bug in the run that produced this table: df was passed to lm() instead of to ns(), so R warned "extra argument 'df' will be disregarded" for every fit and the spline basis was never applied. With the formula written as mpg ~ ns(horsepower, df = i), as above, the metrics vary with df and the best df can be picked with slice_min() exactly as in the other sections.
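As a quick confirmation that the corrected formula behaves differently from the linear fit, this sketch (with hypothetical object names ns.mod and lin.auto, not part of the original analysis) compares in-sample R-squared on the full data:

ns.mod <- lm(mpg ~ ns(horsepower, df = 4), data = data_Auto)
lin.auto <- lm(mpg ~ horsepower, data = data_Auto)
c(ns = summary(ns.mod)$r.squared, linear = summary(lin.auto)$r.squared)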
# inspect where ns() places the knots for df = 10
knots <- attr(ns(data_Auto$horsepower, df = 10), "knots")
ggplot(data_Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.55, color = "black") +
  stat_smooth(method = "lm",
              formula = y ~ ns(x, df = 10),
              lty = 1, se = FALSE) +
  ggtitle("Natural Spline Regression, df = 10") +
  ylab("mpg") +
  xlab("horsepower") +
  theme(plot.title = element_text(hjust = 0.5))

LOESS

set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")

span <- seq(0.1, 1, length.out = 50)

best_loess <- map_dfr(span, function(i){
  metric_loess <- map_dfr(cross_val$splits, function(x){
    # fit a local regression with span i on the training fold
    mod <- loess(mpg ~ horsepower, span = i, data = data_Auto[x$in_id,])
    pred <- predict(mod, newdata = data_Auto[-x$in_id,])
    truth <- data_Auto[-x$in_id,]$mpg
    # predict() returns NA outside the training range; drop those points
    data_eval <- na.omit(data.frame(pred = pred, truth = truth))
    rmse <- mlr3measures::rmse(truth = data_eval$truth, response = data_eval$pred)
    mae <- mlr3measures::mae(truth = data_eval$truth, response = data_eval$pred)
    metric <- c(rmse, mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  })
  # average over the 10 folds
  colMeans(metric_loess)
})
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 89
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## (similar near-singularity warnings, repeated for the smallest span values, omitted)
best_loess <- cbind(span=span,best_loess)

# show the results for all spans
best_loess
##         span     rmse      mae
## 1  0.1000000 4.279961 3.212481
## 2  0.1183673 4.279930 3.199382
## 3  0.1367347 4.282181 3.206301
## 4  0.1551020 4.271935 3.206052
## 5  0.1734694 4.280129 3.216263
## 6  0.1918367 4.291612 3.230315
## 7  0.2102041 4.304326 3.241927
## 8  0.2285714 4.306384 3.242725
## 9  0.2469388 4.299509 3.226658
## 10 0.2653061 4.294184 3.215434
## 11 0.2836735 4.295722 3.222993
## 12 0.3020408 4.289119 3.217525
## 13 0.3204082 4.292602 3.222667
## 14 0.3387755 4.284627 3.213171
## 15 0.3571429 4.284646 3.210649
## 16 0.3755102 4.286424 3.214093
## 17 0.3938776 4.286272 3.214380
## 18 0.4122449 4.290260 3.218785
## 19 0.4306122 4.295797 3.225056
## 20 0.4489796 4.291814 3.224207
## 21 0.4673469 4.293737 3.225118
## 22 0.4857143 4.298144 3.228398
## 23 0.5040816 4.301433 3.232241
## 24 0.5224490 4.302167 3.232329
## 25 0.5408163 4.303161 3.232579
## 26 0.5591837 4.304828 3.234132
## 27 0.5775510 4.307357 3.236528
## 28 0.5959184 4.308655 3.237146
## 29 0.6142857 4.307372 3.236599
## 30 0.6326531 4.306337 3.235772
## 31 0.6510204 4.305561 3.234744
## 32 0.6693878 4.306828 3.235237
## 33 0.6877551 4.307049 3.236061
## 34 0.7061224 4.308953 3.237984
## 35 0.7244898 4.310339 3.239400
## 36 0.7428571 4.318304 3.244223
## 37 0.7612245 4.324715 3.247615
## 38 0.7795918 4.330090 3.250112
## 39 0.7979592 4.331711 3.249172
## 40 0.8163265 4.335098 3.249201
## 41 0.8346939 4.338779 3.250354
## 42 0.8530612 4.338608 3.249875
## 43 0.8714286 4.339232 3.249140
## 44 0.8897959 4.340280 3.249661
## 45 0.9081633 4.345057 3.253641
## 46 0.9265306 4.349224 3.257563
## 47 0.9448980 4.352061 3.259012
## 48 0.9632653 4.354034 3.260748
## 49 0.9816327 4.355639 3.260118
## 50 1.0000000 4.355334 3.260488
# by smallest rmse
best_loess %>% slice_min(rmse)
##       span     rmse      mae
## 1 0.155102 4.271935 3.206052
# by smallest mae
best_loess %>% slice_min(mae)
##        span    rmse      mae
## 1 0.1183673 4.27993 3.199382
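A quick sketch (an addition) of the CV error across the span grid shows how flat the curve is once the span exceeds roughly 0.3:

ggplot(best_loess, aes(x = span, y = rmse)) +
  geom_line() +
  geom_point() +
  labs(y = "10-fold CV RMSE")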
ggplot(data_Auto, aes(horsepower, mpg)) +
  geom_point(alpha = 0.5, color = "black") +
  stat_smooth(method = 'loess',
              formula = y ~ x,
              span = 0.155102, # the span with the smallest CV RMSE
              col = "blue",
              lty = 1,
              se = FALSE) +
  ggtitle("Local Regression, span = 0.155")

Comparing the Best Models

nilai_metric <- rbind(best_polinomial %>% select(-1) %>% slice_min(rmse, with_ties = FALSE),
                      best_tangga %>% select(-1) %>% slice_min(rmse, with_ties = FALSE),
                      best_nspline %>% select(-1) %>% slice_min(rmse, with_ties = FALSE))

nama_model <- c("Polinomial (df=7)",
                "Fungsi Tangga (breaks=15)",
                "Natural Spline (df=10)")

gabungan_model <- data.frame(nama_model,nilai_metric)
gabungan_model
##                  nama_model     rmse      mae
## 1         Polinomial (df=7) 4.293242 3.233376
## 2 Fungsi Tangga (breaks=15) 4.313162 3.286289
## 3    Natural Spline (df=10) 4.889086 3.832328
ggplot(data_Auto, aes(x = horsepower, y = mpg)) +
  geom_point(alpha = 0.05, color = "blue") +
  stat_smooth(aes(col = "Polynomial (df=7)"), method = "lm",
              formula = y ~ poly(x, 7, raw = TRUE), lty = 1, se = FALSE) +
  stat_smooth(aes(col = "Step function (breaks=15)"), method = "lm",
              formula = y ~ cut(x, 15), lty = 1, se = FALSE) +
  stat_smooth(aes(col = "Natural spline (df=10)"), method = "lm",
              formula = y ~ ns(x, df = 10), lty = 1, se = FALSE) +
  scale_color_manual(values = c("Polynomial (df=7)" = "red",
                                "Step function (breaks=15)" = "darkblue",
                                "Natural spline (df=10)" = "orange")) +
  ggtitle("Model Comparison")

Based on the table above, the best model is the degree-7 polynomial, which has the smallest RMSE (4.293242) and MAE (3.233376). The natural-spline row still reflects the plain linear fit, for the reason explained in the natural spline section.

Conclusion

The selected regression model for mpg as a function of horsepower is the degree-7 polynomial, the model with the smallest cross-validated RMSE and MAE.