Individual Assignment
Simulated Data
Libraries
#install.packages("MultiKink")
library(MultiKink)
library(MatrixModels)
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(mlr3measures)
library(splines)
library(cowplot)
library(DT)
Data
set.seed(100)
x <- rnorm(1000, 3, 2)          # predictor
error <- rnorm(1000)            # Gaussian noise
y <- 3 + 2*x + 4*x^2 + error    # true model is quadratic in x
plot(x, y)
Linear Regression Modeling
lin.mod <- lm(y ~ x)
plot(x, y)
lines(x, lin.mod$fitted.values, col="red")
The simulated data above are clearly non-linear. In the visualization, the fitted straight line does not represent the spread of the data well.
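A residuals-versus-fitted plot makes the unmodeled curvature explicit; a minimal diagnostic sketch in base R (not part of the original write-up):
# A systematic curved trend in the residuals indicates the linear fit is misspecified.
plot(lin.mod$fitted.values, lin.mod$residuals,
     xlab = "Fitted values", ylab = "Residuals",
     main = "Residuals vs Fitted (linear model)")
abline(h = 0, col = "red", lty = 2)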
summary(lin.mod)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.964 -15.097 -9.668 5.385 158.414
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.8200 1.3600 -11.63 <2e-16 ***
## x 25.9359 0.3709 69.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.16 on 998 degrees of freedom
## Multiple R-squared: 0.8305, Adjusted R-squared: 0.8303
## F-statistic: 4890 on 1 and 998 DF, p-value: < 2.2e-16
The model summary also indicates that linear regression represents the data poorly: with a residual standard error of 24.16 and Multiple and Adjusted R-squared of only 0.8305 and 0.8303, the linear fit leaves substantial variation unexplained. A more flexible model is needed.
Polynomial Regression Modeling
pol.mod <- lm(y ~ x + I(x^2))  # degree 2
ix <- sort(x, index.return=T)$ix  # ordering index of the observations
plot(x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(x[ix], pol.mod$fitted.values[ix],col="blue", cex=2)
After fitting a polynomial regression of degree 2 (matching the quadratic data-generating process), the fitted curve represents the data much better.
summary(pol.mod)
##
## Call:
## lm(formula = y ~ x + I(x^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.91797 -0.69702 0.00792 0.62214 3.15126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.955106 0.060281 49.02 <2e-16 ***
## x 2.043222 0.034230 59.69 <2e-16 ***
## I(x^2) 3.993896 0.005138 777.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9812 on 997 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
## F-statistic: 1.785e+06 on 2 and 997 DF, p-value: < 2.2e-16
The model summary likewise indicates that the polynomial regression represents the data far better: the residual standard error drops to 0.9812, while Multiple R-squared and Adjusted R-squared rise to 0.9997.
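Since the linear model is nested in the quadratic one, the improvement can also be tested formally; a quick optional check with a standard nested-model F-test:
# F-test of the nested models: a significant result supports adding I(x^2)
anova(lin.mod, pol.mod)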
Step-Function Modeling
# step-function regression
range(x)  # min and max of x
## [1] -3.641564 9.608302
step.mod <- lm(y~ cut(x,5))
plot(x,y)
lines(x,lin.mod$fitted.values,col="red")
lines(x[ix], step.mod$fitted.values[ix], col="green")
summary(step.mod)
##
## Call:
## lm(formula = y ~ cut(x, 5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.745 -15.419 -2.259 10.556 128.359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.531 4.173 3.243 0.00122 **
## cut(x, 5)(-0.992,1.66] -5.495 4.435 -1.239 0.21561
## cut(x, 5)(1.66,4.31] 33.027 4.292 7.695 3.39e-14 ***
## cut(x, 5)(4.31,6.96] 112.200 4.406 25.468 < 2e-16 ***
## cut(x, 5)(6.96,9.62] 247.801 5.850 42.360 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.08 on 995 degrees of freedom
## Multiple R-squared: 0.8589, Adjusted R-squared: 0.8583
## F-statistic: 1514 on 4 and 995 DF, p-value: < 2.2e-16
Model Comparison
MSE <- function(pred, actual){
  mean((pred - actual)^2)
}
model <- data.frame("Model" = c("Linear", "Polynomial (2)", "Step (5)"),
                    "AIC" = c(AIC(lin.mod), AIC(pol.mod), AIC(step.mod)),
                    "MSE" = c(MSE(predict(lin.mod), y),
                              MSE(predict(pol.mod), y),
                              MSE(predict(step.mod), y)))
knitr::kable(model)
| Model | AIC | MSE |
|---|---|---|
| Linear | 9211.470 | 582.6538465 |
| Polynomial (2) | 2804.925 | 0.9598752 |
| Step (5) | 9034.176 | 485.0726489 |
The best model so far (Nonlinear Regression I material) is the one with the lowest AIC and MSE. Across the three fits, the relationship is clearly not linear: the degree-2 polynomial has by far the smallest AIC and MSE (2804.925 and 0.9599), which is expected since the data were generated from a quadratic model.
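Note that these MSE values are computed on the training data itself. As an optional sanity check (a sketch using rsample, which is already loaded; the 80/20 split is our choice), the quadratic model can be scored on a held-out set:
set.seed(100)
dat <- data.frame(x = x, y = y)
split <- initial_split(dat, prop = 0.8)  # 80/20 train/test split
train <- training(split)
test <- testing(split)
pol.tr <- lm(y ~ x + I(x^2), data = train)
MSE(predict(pol.tr, newdata = test), test$y)  # out-of-sample MSE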
Piecewise Polynomial Modelling
# Split the data into two pieces at the mean of x (mean = 3)
tr <- cbind(x, y)
tr1 <- tr[x <= 3,]
tr2 <- tr[x > 3,]
# Plotting
plot(x, y, main="Piecewise Cubic Polynomial\nData Bangkitan")
abline(v=3, col="orange", lty=3)
cub.mod.tr1 <- lm(tr1[,2] ~ tr1[,1]+I(tr1[,1]^2)+I(tr1[,1]^3))
ix.tr1 <- sort(tr1[,1], index.return=T)$ix
lines(tr1[ix.tr1,1],cub.mod.tr1$fitted.values[ix.tr1],col="red", lwd=2)
cub.mod.tr2 <- lm(tr2[,2] ~ tr2[,1]+I(tr2[,1]^2)+I(tr2[,1]^3))
ix.tr2 <- sort(tr2[,1], index.return=T)$ix
lines(tr2[ix.tr2,1], cub.mod.tr2$fitted.values[ix.tr2], col="red", lwd=2)
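Fitted independently like this, the two cubics need not join up at x = 3. A cubic spline fixes that; as an optional sketch, the same fit can be written with a truncated power basis, which forces the curve and its first two derivatives to be continuous at the knot:
# Cubic spline with one knot at x = 3 via the truncated power basis
tp <- function(x, knot) pmax(0, x - knot)^3
spline1 <- lm(y ~ x + I(x^2) + I(x^3) + tp(x, 3))
lines(x[ix], spline1$fitted.values[ix], col = "blue", lwd = 2, lty = 2)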
Spline Regression
# 1. Determine the number of basis functions and the knot locations
tr <- as.data.frame(tr)
knots <- attr(bs(tr$x, df=6), "knots")
knots
## 25% 50% 75%
## 1.700603 3.073804 4.419178
# 2. Fit the spline regression using the knots above (from df = 6)
spline <- lm(y ~ bs(x, knots=knots), data=tr)
summary(spline)
##
## Call:
## lm(formula = y ~ bs(x, knots = knots), data = tr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.87248 -0.69901 -0.00065 0.61581 3.13784
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.1527 0.6741 71.428 < 2e-16 ***
## bs(x, knots = knots)1 -47.7787 1.0475 -45.612 < 2e-16 ***
## bs(x, knots = knots)2 -60.5310 0.6178 -97.982 < 2e-16 ***
## bs(x, knots = knots)3 -3.9264 0.7058 -5.563 3.41e-08 ***
## bs(x, knots = knots)4 80.2959 0.6803 118.032 < 2e-16 ***
## bs(x, knots = knots)5 207.4230 0.8582 241.704 < 2e-16 ***
## bs(x, knots = knots)6 342.2277 0.9255 369.786 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9815 on 993 degrees of freedom
## Multiple R-squared: 0.9997, Adjusted R-squared: 0.9997
## F-statistic: 5.946e+05 on 6 and 993 DF, p-value: < 2.2e-16
ggplot(tr, aes(x=x, y=y)) + geom_point(color="red") + stat_smooth(method="lm", formula=y~bs(x, knots=knots), lty=1, se=F)
## Smooting Spline
ss <- smooth.spline(x, y, all.knots=T)
ss1 <- with(data=tr, smooth.spline(x, y))
ss1
## Call:
## smooth.spline(x = x, y = y)
##
## Smoothing Parameter spar= 0.789453 lambda= 7.429145e-05 (11 iterations)
## Equivalent Degrees of Freedom (Df): 19.7076
## Penalized Criterion (RSS): 946.0417
## GCV: 6950.123
plot(x, y, pch=19, main="Smoothing Spline, Simulated Data")
lines(ss, col="orange", lwd=1.5)
LOESS Function
loess.mod <- loess(formula=y~x, data=tr)  # named loess.mod to avoid shadowing loess()
summary(loess.mod)
## Call:
## loess(formula = y ~ x, data = tr)
##
## Number of Observations: 1000
## Equivalent Number of Parameters: 5.31
## Residual Standard Error: 0.9811
## Trace of smoother matrix: 5.82 (exact)
##
## Control settings:
## span : 0.75
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
loess1 <- data.frame(span=seq(0.1,2,by=0.1)) %>%
group_by(span) %>%
do(broom::augment(loess(y~x, data=tr,span=.$span)))
ggplot(loess1, aes(x=x, y=y)) +
  geom_line(aes(y=.fitted), col="blue", lty=1) +
  facet_wrap(~span)
Comparison of All Models
ggplot(tr, aes(x=x, y=y)) + geom_point(color="black") + stat_smooth(method="lm", formula=y~x, col="red", show.legend=T) + stat_smooth(method="lm", formula=y~poly(x,2,raw=T), col="blue", lwd=3.5, show.legend=T) + stat_smooth(method="lm", formula=y~cut(x,6), col="darkgreen", show.legend=T) + stat_smooth(method="lm", formula=y~bs(x, knots=knots), col="green", lwd=2, show.legend=T) + stat_smooth(method="loess", formula=y~x, col="orange", show.legend=T)MSE=function(pred,actual){
mean((pred-actual)^2)
}
model.tr2 <- data.frame(Model = c("Linear", "Polynomial (2)", "Step-Function (5)",
                                  "Spline Regression", "LOESS Function"),
                        MSE = c(MSE(predict(lin.mod), y),
                                MSE(predict(pol.mod), y),
                                MSE(predict(step.mod), y),
                                MSE(predict(spline), y),
                                MSE(predict(loess.mod), y)))
knitr::kable(model.tr2)
| Model | MSE |
|---|---|
| Linear | 582.6538465 |
| Polynomial (2) | 0.9598752 |
| Step-Function (5) | 485.0726489 |
| Spline Regression | 0.9565396 |
| LOESS Function | 0.9564997 |
By MSE, the LOESS fit is marginally the best (0.9565), with the spline regression essentially tied and the degree-2 polynomial very close behind; the polynomial remains the most parsimonious of the three and matches the true data-generating process.
Example: the ISLR Auto Data
The data used in this non-linear regression analysis are the Auto data from the ISLR package. Auto contains 392 observations on 9 variables.
The variables are:
- mpg: miles per gallon
- cylinders: Number of cylinders between 4 and 8
- displacement: Engine displacement (cu. inches)
- horsepower: Engine horsepower
- weight: Vehicle weight (lbs.)
- acceleration: Time to accelerate from 0 to 60 mph (sec.)
- year: Model year (modulo 100)
- origin: Origin of car (1. American, 2. European, 3. Japanese)
- name: Vehicle name
library(ISLR)
AutoData = Auto %>% select(mpg, horsepower, origin)
data_Auto <- tibble(AutoData)
head(data_Auto)
## # A tibble: 6 x 3
## mpg horsepower origin
## <dbl> <dbl> <dbl>
## 1 18 130 1
## 2 15 165 1
## 3 18 150 1
## 4 16 150 1
## 5 17 140 1
## 6 15 198 1
mpg <- data_Auto$mpg
horsepower <- data_Auto$horsepower
origin <- data_Auto$origin
If we draw the data as a scatterplot:
ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
theme_bw()
The figure above is a scatter plot with horsepower (engine power) on the X axis and mpg (miles per gallon) on the Y axis. It suggests that vehicles with more horsepower consume more fuel: the higher the horsepower, the shorter the distance the vehicle covers per gallon, and conversely, the lower the horsepower, the farther it travels per gallon. Based on the pattern visible in the scatterplot, we will look for a model that represents this relationship well.
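As an optional numeric companion to the scatterplot (the values are not reported in the original), the strength of the inverse relationship can be quantified:
# Pearson and Spearman correlations between horsepower and mpg;
# both should be strongly negative for this monotone-decreasing pattern.
cor(data_Auto$horsepower, data_Auto$mpg)
cor(data_Auto$horsepower, data_Auto$mpg, method = "spearman")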
ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "blue",se = F)+
theme_bw()
Exploratively, when a linear regression line is added to the plot, the linear model does a poor job of representing the relationship between mpg and horsepower: the straight line does not follow the curved spread of the data. We therefore proceed to select the best non-linear regression model.
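Each model class below is tuned with the same 10-fold cross-validation loop. As a sketch (the helper name cv_metrics and its interface are ours, not part of the original code), the repeated pattern could be factored out:
# Hypothetical helper: average RMSE/MAE over the folds of `cv` for a model
# produced by `fit_fun(training_data)`.
cv_metrics <- function(cv, fit_fun){
  res <- map_dfr(cv$splits, function(x){
    mod <- fit_fun(data_Auto[x$in_id, ])
    pred <- predict(mod, newdata = data_Auto[-x$in_id, ])
    truth <- data_Auto[-x$in_id, ]$mpg
    c(rmse = mlr3measures::rmse(truth, pred),
      mae  = mlr3measures::mae(truth, pred))
  })
  colMeans(res)
}
# Example: reproduces the linear-regression metrics below
# cv_metrics(cross_val, function(d) lm(mpg ~ horsepower, data = d))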
Linear Regression
set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")
metric_linear <- map_dfr(cross_val$splits,
function(x){
mod <- lm(mpg ~ horsepower,data=data_Auto[x$in_id,])
pred <- predict(mod,newdata=data_Auto[-x$in_id,])
truth <- data_Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear
## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 4.52 3.73
## 2 4.86 3.80
## 3 4.54 3.35
## 4 4.67 3.99
## 5 5.12 4.12
## 6 4.39 3.25
## 7 5.93 4.65
## 8 5.05 3.95
## 9 4.71 3.75
## 10 5.10 3.73
The average RMSE and MAE of the linear regression across the 10 folds:
# average over the 10 folds
mean_metric_lin <- colMeans(metric_linear)
mean_metric_lin
## rmse mae
## 4.889086 3.832328
Polynomial Regression
set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")
df <- 1:10
best_polinomial <- map_dfr(df,function(i){
metric_polinomial <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ poly(horsepower, degree=i), data=data_Auto[x$in_id,])
# predict on the held-out fold
pred <- predict(mod,newdata=data_Auto[-x$in_id,])
truth <- data_Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_polinomial
# average over the 10 folds
mean_Polinomial<-colMeans(metric_polinomial)
mean_Polinomial
})
best_polinomial<-cbind(df=df,best_polinomial)
best_polinomial
## df rmse mae
## 1 1 4.889086 3.832328
## 2 2 4.351129 3.266460
## 3 3 4.355430 3.266945
## 4 4 4.367009 3.280452
## 5 5 4.318767 3.249003
## 6 6 4.317431 3.257272
## 7 7 4.293242 3.233376
## 8 8 4.309511 3.251758
## 9 9 4.323321 3.244919
## 10 10 4.371903 3.274887
Based on the output above, degree 7 gives the best polynomial regression among degrees 1 through 10: it has the smallest cross-validated RMSE and MAE.
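The margin between degrees is thin; an optional plot of the cross-validated RMSE against the degree (using the best_polinomial table built above) makes this visible:
# RMSE drops sharply from degree 1 to 2, then is nearly flat:
# the gain beyond degree 2 is marginal.
ggplot(best_polinomial, aes(x = df, y = rmse)) +
  geom_line() +
  geom_point() +
  labs(x = "polynomial degree", y = "10-fold CV RMSE")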
ggplot(data_Auto,aes(x=horsepower,y=mpg)) +
geom_point(alpha=0.55,color="darkgreen") +
stat_smooth(method = "lm",
formula = y~poly(x,7,raw=T),
lty = 1,col="black",se=F) +
ggtitle("Regresi Polinomial Ordo = 7") +
ylab("mpg") +
xlab("horsepower") +
theme(plot.title = element_text(hjust = 0.5))
The plot above shows the fitted polynomial curve of degree 7. As before, it indicates that the higher the horsepower, the fewer miles the vehicle travels per gallon.
Step Function
set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")
breaks <- 3:20
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- data_Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
mod <- lm(mpg ~ horsepower,data=training)
labs_x <- levels(mod$model[,2])
# recover the numeric cut points from interval labels of the form "(a,b]"
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                       upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- data_Auto[-x$in_id,]
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(horsepower=horsepower_new))
truth <- testing$mpg
data_eval <- na.omit(data.frame(truth,pred))  # drop test points outside the training bins
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# average over the 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)
# show the results for all break counts
best_tangga
## breaks rmse mae
## 1 3 5.775792 4.521829
## 2 4 4.985270 3.774549
## 3 5 4.712623 3.571614
## 4 6 4.644383 3.548375
## 5 7 4.554116 3.379980
## 6 8 4.437597 3.405433
## 7 9 4.549668 3.471999
## 8 10 4.589172 3.455531
## 9 11 4.531039 3.380931
## 10 12 4.442697 3.321151
## 11 13 4.527126 3.375170
## 12 14 4.428824 3.270424
## 13 15 4.313162 3.286289
## 14 16 4.336661 3.302031
## 15 17 4.414343 3.347532
## 16 18 4.340715 3.221099
## 17 19 4.331798 3.224404
## 18 20 4.480710 3.320605
# best by RMSE
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 15 4.313162 3.286289
# best by MAE
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 18 4.340715 3.221099
Based on the output above, over 3 to 20 breaks, the best step-function model by RMSE uses 15 breaks, while by MAE the 18-break model scores best.
ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",formula = y~cut(x,15),
lty = 1, col = "black",se = F)+
ggtitle("Fungsi Tangga breaks=15") +
ylab("mpg") +
xlab("horsepower") +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data_Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="darkgreen") +
stat_smooth(method = "lm",formula = y~cut(x,18),
lty = 1, col = "black",se = F)+
ggtitle("Fungsi Tangga breaks=18") +
ylab("mpg") +
xlab("horsepower") +
theme(plot.title = element_text(hjust = 0.5))
The plots above show the step-function curves with 15 and with 18 breaks. The fit with 15 breaks (14 interior cut points) is visibly smoother than the one with 18 breaks (17 cut points), so we choose the step function with 15 intervals.
Natural Spline Regression
set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")
df <- 1:10
best_nspline <- map_dfr(df, function(i){
nspline <- map_dfr(cross_val$splits,function(x){
mod <- lm(mpg ~ horsepower,df=i,data=data_Auto[x$in_id,])  # NB: 'df' is not an lm() argument and is silently dropped; see the note below
pred <- predict(mod,newdata=data_Auto[-x$in_id,])
truth <- data_Auto[-x$in_id,]$mpg
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
nspline
# menghitung rata-rata untuk 10 folds
mean_nspline <- colMeans(nspline)
mean_nspline
}
)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'df' will be disregarded
## (this warning is repeated for every fold and every df value)
best_nspline <- cbind(df=df,best_nspline)
best_nspline
## df rmse mae
## 1 1 4.889086 3.832328
## 2 2 4.889086 3.832328
## 3 3 4.889086 3.832328
## 4 4 4.889086 3.832328
## 5 5 4.889086 3.832328
## 6 6 4.889086 3.832328
## 7 7 4.889086 3.832328
## 8 8 4.889086 3.832328
## 9 9 4.889086 3.832328
## 10 10 4.889086 3.832328
# best by RMSE
best_nspline %>% slice_min(rmse)
## df rmse mae
## 1 1 4.889086 3.832328
## 2 2 4.889086 3.832328
## 3 3 4.889086 3.832328
## 4 4 4.889086 3.832328
## 5 5 4.889086 3.832328
## 6 6 4.889086 3.832328
## 7 7 4.889086 3.832328
## 8 8 4.889086 3.832328
## 9 9 4.889086 3.832328
## 10 10 4.889086 3.832328
# best by MAE
best_nspline %>% slice_min(mae)
## df rmse mae
## 1 1 4.889086 3.832328
## 2 2 4.889086 3.832328
## 3 3 4.889086 3.832328
## 4 4 4.889086 3.832328
## 5 5 4.889086 3.832328
## 6 6 4.889086 3.832328
## 7 7 4.889086 3.832328
## 8 8 4.889086 3.832328
## 9 9 4.889086 3.832328
## 10 10 4.889086 3.832328
The output above does not distinguish the df values at all: every row reproduces the linear-regression metrics (RMSE 4.889, MAE 3.832). The warnings explain why: df was passed to lm() directly rather than to a spline basis, so it was silently dropped and each "natural spline" fit is in fact the plain linear model. The fitting code needs to build the basis explicitly with ns(), as sketched below.
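A corrected version of the loop (a sketch; rerunning it would give df-dependent metrics rather than the constant values above):
best_nspline2 <- map_dfr(df, function(i){
  nspline <- map_dfr(cross_val$splits, function(x){
    # natural-spline basis with i degrees of freedom, built by ns()
    mod <- lm(mpg ~ ns(horsepower, df = i), data = data_Auto[x$in_id, ])
    pred <- predict(mod, newdata = data_Auto[-x$in_id, ])
    truth <- data_Auto[-x$in_id, ]$mpg
    c(rmse = mlr3measures::rmse(truth, pred),
      mae  = mlr3measures::mae(truth, pred))
  })
  colMeans(nspline)  # average over the 10 folds
})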
library(splines)
# let the computer choose the knot locations
knots <- attr(ns(data_Auto$horsepower, df=10), "knots")
ggplot(data_Auto, aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="darkgreen") +
  stat_smooth(method = "lm",
              formula = y~ns(x, knots = knots),
              lty = 1, se = F) +
  ggtitle("Natural Spline Regression, df=10") +
  ylab("mpg") +
  xlab("horsepower") +
  theme(plot.title = element_text(hjust = 0.5))
The plot above shows the natural cubic spline curve with df = 10. It again shows mpg decreasing as horsepower increases.
LOESS
set.seed(123)
cross_val <- vfold_cv(data_Auto,v=10,strata = "mpg")
span <- seq(0.1,1,length.out=50)
best_loess <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
function(x){
mod <- loess(mpg ~ horsepower,span = i,
data=data_Auto[x$in_id,])
pred <- predict(mod,
newdata=data_Auto[-x$in_id,])
truth <- data_Auto[-x$in_id,]$mpg
data_eval <- na.omit(data.frame(pred=pred,
truth=truth))
rmse <- mlr3measures::rmse(truth = data_eval$truth,
response = data_eval$pred
)
mae <- mlr3measures::mae(truth = data_eval$truth,
response = data_eval$pred
)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
})
head(metric_loess,20)
# average over the 10 folds
mean_metric_loess <- colMeans(metric_loess)
mean_metric_loess
})
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 89
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## (similar pseudoinverse / near-singularity warnings repeat for the smallest span values)
best_loess <- cbind(span=span,best_loess)
# show the results for all span values
best_loess
## span rmse mae
## 1 0.1000000 4.279961 3.212481
## 2 0.1183673 4.279930 3.199382
## 3 0.1367347 4.282181 3.206301
## 4 0.1551020 4.271935 3.206052
## 5 0.1734694 4.280129 3.216263
## 6 0.1918367 4.291612 3.230315
## 7 0.2102041 4.304326 3.241927
## 8 0.2285714 4.306384 3.242725
## 9 0.2469388 4.299509 3.226658
## 10 0.2653061 4.294184 3.215434
## 11 0.2836735 4.295722 3.222993
## 12 0.3020408 4.289119 3.217525
## 13 0.3204082 4.292602 3.222667
## 14 0.3387755 4.284627 3.213171
## 15 0.3571429 4.284646 3.210649
## 16 0.3755102 4.286424 3.214093
## 17 0.3938776 4.286272 3.214380
## 18 0.4122449 4.290260 3.218785
## 19 0.4306122 4.295797 3.225056
## 20 0.4489796 4.291814 3.224207
## 21 0.4673469 4.293737 3.225118
## 22 0.4857143 4.298144 3.228398
## 23 0.5040816 4.301433 3.232241
## 24 0.5224490 4.302167 3.232329
## 25 0.5408163 4.303161 3.232579
## 26 0.5591837 4.304828 3.234132
## 27 0.5775510 4.307357 3.236528
## 28 0.5959184 4.308655 3.237146
## 29 0.6142857 4.307372 3.236599
## 30 0.6326531 4.306337 3.235772
## 31 0.6510204 4.305561 3.234744
## 32 0.6693878 4.306828 3.235237
## 33 0.6877551 4.307049 3.236061
## 34 0.7061224 4.308953 3.237984
## 35 0.7244898 4.310339 3.239400
## 36 0.7428571 4.318304 3.244223
## 37 0.7612245 4.324715 3.247615
## 38 0.7795918 4.330090 3.250112
## 39 0.7979592 4.331711 3.249172
## 40 0.8163265 4.335098 3.249201
## 41 0.8346939 4.338779 3.250354
## 42 0.8530612 4.338608 3.249875
## 43 0.8714286 4.339232 3.249140
## 44 0.8897959 4.340280 3.249661
## 45 0.9081633 4.345057 3.253641
## 46 0.9265306 4.349224 3.257563
## 47 0.9448980 4.352061 3.259012
## 48 0.9632653 4.354034 3.260748
## 49 0.9816327 4.355639 3.260118
## 50 1.0000000 4.355334 3.260488
# best by RMSE
best_loess %>% slice_min(rmse)
## span rmse mae
## 1 0.155102 4.271935 3.206052
# best by MAE
best_loess %>% slice_min(mae)
## span rmse mae
## 1 0.1183673 4.27993 3.199382
ggplot(data_Auto, aes(horsepower,mpg)) +
  geom_point(alpha=0.5, color="black") +
  stat_smooth(method='loess',
              formula=y~x,
              span = 0.155102,  # the span with the lowest CV RMSE
              col="blue",
              lty=1,
              se=F) +
  ggtitle("Local Regression")
Comparison of the Best Models
nilai_metric <- rbind(best_polinomial %>% select(-1) %>% slice_min(rmse),
                      best_tangga %>% select(-1) %>% slice_min(rmse),
                      # slice_min() returns all 10 tied nspline rows, hence the recycled names below
                      best_nspline %>% select(-1) %>% slice_min(rmse))
nama_model <- c("Polinomial (df=7)",
"Fungsi Tangga (breaks=15)",
"Natural Spline (df=10)")
gabungan_model <- data.frame(nama_model,nilai_metric)
gabungan_model
## nama_model rmse mae
## 1 Polynomial (df=7) 4.293242 3.233376
## 2 Step Function (breaks=15) 4.313162 3.286289
## 3 Natural Spline (df=10) 4.889086 3.832328
## 4 Polynomial (df=7) 4.889086 3.832328
## 5 Step Function (breaks=15) 4.889086 3.832328
## 6 Natural Spline (df=10) 4.889086 3.832328
## 7 Polynomial (df=7) 4.889086 3.832328
## 8 Step Function (breaks=15) 4.889086 3.832328
## 9 Natural Spline (df=10) 4.889086 3.832328
## 10 Polynomial (df=7) 4.889086 3.832328
## 11 Step Function (breaks=15) 4.889086 3.832328
## 12 Natural Spline (df=10) 4.889086 3.832328
ggplot(data_Auto, aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.05, color="darkgreen") +
  stat_smooth(aes(col="Polynomial Regression"), method="lm",
              formula=y~poly(x,7,raw=T), lty=1, se=F) +
  stat_smooth(aes(col="Step Function"), method="lm",
              formula=y~cut(x,15), lty=1, se=F) +
  stat_smooth(aes(col="Natural Spline Regression"), method="lm",
              formula=y~ns(x,df=10), lty=1, se=F) +
  scale_color_manual(values = c("Polynomial Regression"="red",
                                "Step Function"="darkblue",
                                "Natural Spline Regression"="green")) +
  ggtitle("Model Comparison")
From the comparison table, the degree-7 polynomial has the smallest cross-validated RMSE and MAE of the three models. The repeated 4.889/3.832 rows come from the degenerate natural-spline run, whose ten tied "best" rows are all just the linear fit.
Conclusion
For mpg versus horsepower, the degree-7 polynomial regression is the best of the models compared above (CV RMSE 4.293, MAE 3.233), with the LOESS fit at span of about 0.155 slightly better still (CV RMSE 4.272). The natural spline cannot be judged from this run, because the misspecified lm() call reduced it to the linear model.