Tugas Individu 1 STA1381 - Pengantar Sains Data

Library

library("MultiKink")
library(tidyverse)
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(mlr3measures)
library(tidyverse)
library(ggplot2)
library(splines)

Data Bangkitan

Membangkitkan data dengan menggunakan syntax berikut :

set.seed(321)
data.x <- rnorm(1000,3,2)
err <- rnorm(1000)
y <- 5+3*data.x+2*data.x^2+err ## Model Polynomial Ordo 2
plot(data.x,y)

Regresi linier

lin.mod <-lm(y~data.x)
plot(data.x,y)
lines(data.x,lin.mod$fitted.values,col="red")

summary(lin.mod)

## 
## Call:
## lm(formula = y ~ data.x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.854  -6.607  -3.948   3.221  57.784 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.4595     0.6084  -10.62   <2e-16 ***
## data.x       15.4341     0.1662   92.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.41 on 998 degrees of freedom
## Multiple R-squared:  0.8963, Adjusted R-squared:  0.8962 
## F-statistic:  8623 on 1 and 998 DF,  p-value: < 2.2e-16

Polynomial

pol.mod <- lm(y~data.x+I(data.x^2)) #ordo 2
ix <- sort(data.x,index.return=T)$ix
plot(data.x,y)
lines(data.x[ix], pol.mod$fitted.values[ix],col="blue", cex=2)

summary(pol.mod)

## 
## Call:
## lm(formula = y ~ data.x + I(data.x^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.03219 -0.74520  0.00675  0.71680  3.11017 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.006600   0.069069   72.49   <2e-16 ***
## data.x      3.013992   0.041731   72.22   <2e-16 ***
## I(data.x^2) 1.997457   0.006186  322.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.014 on 997 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 5.068e+05 on 2 and 997 DF,  p-value: < 2.2e-16

Fungsi Tangga

#regresi fungsi tangga
range(data.x)

## [1] -2.594879  8.822139

c1 <- as.factor(ifelse(data.x<=0,1,0))
c2 <- as.factor(ifelse(data.x<=2 & data.x>0,1,0))
c3 <- as.factor(ifelse(data.x>2,1,0))
head(data.frame(y,c1,c2,c3))

##           y c1 c2 c3
## 1 104.90426  0  0  1
## 2  13.40932  0  1  0
## 3  25.68133  0  0  1
## 4  28.55372  0  0  1
## 5  30.30488  0  0  1
## 6  39.50984  0  0  1

step.mod <- lm(y~c1+c2+c3)
plot(data.x,y)
lines(data.x,lin.mod$fitted.values,col="red")
lines(data.x[ix], step.mod$fitted.values[ix],col="green")

summary(step.mod)

## 
## Call:
## lm(formula = y ~ c1 + c2 + c3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.224 -17.311  -1.781   6.077 131.966 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  54.1863     0.9565   56.65   <2e-16 ***
## c11         -49.0936     3.6051  -13.62   <2e-16 ***
## c21         -42.6593     1.8728  -22.78   <2e-16 ***
## c31               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.31 on 997 degrees of freedom
## Multiple R-squared:  0.3879, Adjusted R-squared:  0.3867 
## F-statistic: 315.9 on 2 and 997 DF,  p-value: < 2.2e-16

Komparasi Model

nilai_AIC <- rbind(AIC(lin.mod),
                   AIC(pol.mod),
                   AIC(step.mod))

nama_model <- c("Linear","Poly (ordo=2)","Tangga (breaks=3)")
data.frame(nama_model,nilai_AIC)

##          nama_model nilai_AIC
## 1            Linear  7527.860
## 2     Poly (ordo=2)  2870.430
## 3 Tangga (breaks=3)  9304.906

MSE = function(pred,actual){
  mean((pred-actual)^2)
}

MSE(predict(lin.mod),y)

## [1] 108.2002

MSE(predict(pol.mod),y)

## [1] 1.024857

MSE(predict(step.mod),y)

## [1] 638.4405

model.com <- data.frame("Model" = c("Linear", "Poly (ordo=2)", "Tangga (breaks=3)"), "AIC" = c(AIC(lin.mod), AIC(pol.mod), AIC(step.mod)), "MSE" = c(MSE(predict(lin.mod), y), MSE(predict(pol.mod), y), MSE(predict(step.mod), y)))
knitr::kable(model.com)

Model	AIC	MSE
Linear	7527.860	108.200170
Poly (ordo=2)	2870.430	1.024857
Tangga (breaks=3)	9304.906	638.440474

Berdasarkan ketiga hasil pemodelan, dicari nilai AIC dan MSE yang paling rendah. Model terbaik pada data bangkitan saat ini adalah model polynomial dengan derajat 2 Poly (ordo=2) karena pada model tersebut nilai AIC dan MSE-nya yang paling kecil yaitu 2870.430 dan 1.024857 dibandingkan model lainnya.

Piecewise cubic polynomial

dt.all <- cbind(y,data.x)

##knots=1
dt1 <- dt.all[data.x <=1,]
dim(dt1)

## [1] 151   2

dt2 <- dt.all[data.x >1,]
dim(dt2)

## [1] 849   2

plot(data.x,y, pch=16, col="orange")

cub.mod1 <- lm(dt1[,1]~dt1[,2]+I(dt1[,2]^2)+I(dt1[,2]^3))
ix <- sort(dt1[,2], index.return=T)$ix
lines(dt1[ix,2],cub.mod1$fitted.values[ix],col="blue", lwd=2)

cub.mod2 <- lm(dt2[,1]~dt2[,2]+I(dt2[,2]^2)+I(dt2[,2]^3))
ix <- sort(dt2[,2], index.return=T)$ix
lines(dt2[ix,2],cub.mod2$fitted.values[ix],col="blue", lwd=2)

Truncated power basis

#dengan menggunakan truncated power basis
plot(data.x,y, pch=16,col="orange")
abline(v=1,col="red", lty=2)

hx <- ifelse( data.x>1,(data.x-1)^3,0)

cubspline.mod <- lm( y~data.x+I(data.x^2)+I(data.x^3)+hx)
ix <- sort( data.x,index.return=T)$ix
lines( data.x[ix], cubspline.mod$fitted.values[ix],col="green", lwd=2)

Perbandingan dengan k-fold CV

plot( data.x,y, pch=16,col="orange")
abline(v=0,col="red", lty=2)
abline(v=2,col="red", lty=2)

hx1 <- ifelse( data.x>0,(data.x-0)^3,0)
hx2 <- ifelse( data.x>2,(data.x-2)^3,0)

cubspline.mod2 <- lm( y~data.x+I(data.x^2)+I(data.x^3)+hx1+hx2)
ix <- sort( data.x,index.return=T)$ix
lines( data.x[ix],cubspline.mod2$fitted.values[ix],col="green", lwd=2)

Perbandingan 1 knot dan 2 knot dengan 5-fold CV

##1 knot
data.all <- cbind( y,data.x,hx)
set.seed(456)
ind <- sample(1:5,length( data.x),replace=T)
res <- c()
for( i in 1:5){
  dt.train <- data.all[ ind!=i,]
  x.test <- data.all[ ind==i, -1]
  y.test <- data.all[ ind==i,1]
  mod1 <- lm( dt.train[,1]~dt.train[,2]+I( dt.train[,2]^2)+I( dt.train[,2]^3)+dt.train[,3])
  x.test.olah <- cbind(1,x.test[,1], x.test[,1]^2,x.test[,1]^3,x.test[,2])
  beta <- coefficients(mod1)
  prediksi <- x.test.olah%*%beta
  res <- c( res,mean(( y.test-prediksi)^2))
}
res

## [1] 0.9895305 1.0322377 1.0089171 1.1336757 1.0315655

mean(res)

## [1] 1.039185

##2 knot (knot = 0, 2)
data.all2 <- cbind(y,data.x,hx1,hx2)
set.seed(456)
ind2 <- sample(1:5,length( data.x),replace=T)
res2 <- c()
for( i in 1:5){
  dt.train2 <- data.all2[ind2!=i,]
  x.test2 <- data.all2[ind2==i,-1]
  y.test2 <- data.all2[ind2==i,1]
  mod2 <- lm(dt.train2[,1]~dt.train2[,2]+I(dt.train2[,2]^2)+I(dt.train2[,2]^3)+dt.train2[,3]+dt.train2[,4])
  x.test.olah2 <- cbind(1,x.test2[,1],x.test2[,1]^2,x.test2[,1]^3,x.test2[,2],x.test2[,3])
  beta2 <- coefficients(mod2)
  prediksi2 <- x.test.olah2%*%beta2
  res2 <- c(res2,mean((y.test2-prediksi2)^2))
}
res2

## [1] 0.985706 1.036767 1.009072 1.131794 1.026744

mean(res2)

## [1] 1.038017

Smoothing Spline

ss1 <- smooth.spline(data.x,y,all.knots = T)
plot(data.x,y, pch=16,col="orange")
lines(ss1,col="purple", lwd=2)

Data AutoData

Pemodelan nonlinear ini menggunakan data spesifikasi kualitas mesin dalam standar horsepower dan tingkat kehematan penggunaan bensin dalam standar miles per galon (mpg) pada 392 kendaraan. Data diambil dari library(ISLR) pada software R. Data ini adalah data yang diambil dan diolah oleh Carnegie Mellon University dan digunakan dalam 1983 American Statistical Association Exposition.

library(ISLR)
AutoData = Auto %>% select(mpg, horsepower)
tibble(AutoData)

## # A tibble: 392 x 2
##      mpg horsepower
##    <dbl>      <dbl>
##  1    18        130
##  2    15        165
##  3    18        150
##  4    16        150
##  5    17        140
##  6    15        198
##  7    14        220
##  8    14        215
##  9    14        225
## 10    15        190
## # ... with 382 more rows

ggplot(AutoData,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw()

Berdasarkan pola hubungan yang terlihat pada scatterplot, kita akan mencoba untuk mencari model yang bisa merepresentasikan pola hubungan tersebut dengan baik.

Regresi Linear

mod_linear = lm(horsepower~mpg,data=AutoData)
summary(mod_linear)

## 
## Call:
## lm(formula = horsepower ~ mpg, data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.892 -15.716  -2.094  13.108  96.947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 194.4756     3.8732   50.21   <2e-16 ***
## mpg          -3.8389     0.1568  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.19 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

Berdasarkan output di atas, seluruh parameter regresi linear signifikan pada taraf 5% sehingga dapat diperoleh model regresi linear sebagai berikut:

y=194.4756-3.8389x

Berdasarkan model regresi linear di atas, dapat dimaknai bahwa perubahan nilai besar mpg sebesar satu satuan akan menurunkan dugaan rataan rata-rata besar horsepower sebesar 3.8389 satuan pada tiap kendaraan. Interpretasi ini dapat diamati secara eksploratif melalui grafik sebagai berikut,

Jika kita gambarkan dalam bentuk scatterplot ;

ggplot(AutoData,aes(x=mpg, y=horsepower)) +
   geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "blue",se = F)+
  theme_bw()

ringkasan_linear <- summary(mod_linear)
ringkasan_linear$r.squared

## [1] 0.6059483

AIC(mod_linear)

## [1] 3614.324

tabel_data <- rbind(data.frame("y_aktual"=AutoData$horsepower,
                               "y_pred"=predict(mod_linear),
                               "residuals"=residuals(mod_linear)))
head(tabel_data)

##   y_aktual   y_pred residuals
## 1      130 125.3757  4.624341
## 2      165 136.8923 28.107677
## 3      150 125.3757 24.624341
## 4      150 133.0534 16.946565
## 5      140 129.2145 10.785453
## 6      198 136.8923 61.107677

Regresi Polinomial Derajat 2 (ordo 2)

mod_polinomial2 = lm(horsepower ~ poly(mpg,2,raw = T),
                     data=AutoData)
summary(mod_polinomial2)

## 
## Call:
## lm(formula = horsepower ~ poly(mpg, 2, raw = T), data = AutoData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -72.52 -11.24  -0.11   9.47  92.98 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            302.06824    9.25689   32.63   <2e-16 ***
## poly(mpg, 2, raw = T)1 -13.32406    0.77456  -17.20   <2e-16 ***
## poly(mpg, 2, raw = T)2   0.18804    0.01513   12.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.49 on 389 degrees of freedom
## Multiple R-squared:  0.718,  Adjusted R-squared:  0.7165 
## F-statistic: 495.1 on 2 and 389 DF,  p-value: < 2.2e-16

Berdasarkan output di atas, seluruh parameter regresi polinomial dengan ordo 2 signifikan pada taraf 5% sehingga dapat diperoleh model regresi polinomial dengan ordo 2 untuk gugus data terkait sebagai berikut:

y=302.06824 + 0.18804x^2 − 13.32406x

Berdasarkan model regresi polinomial ordo dua di atas, dapat dimaknai bahwa perubahan nilai besar mpg sebesar satu satuan akan menurunkan dugaan rataan rata-rata besar horsepower sebesar 13.32406 satuan dan pada saat yang bersamaan meningkatkan dugaan rataannya secara kuadratik sebesar 0.18804 pada tiap kendaraan.

Interpretasi ini dapat diamati secara eksploratif melalui grafik sebagai berikut:

ggplot(AutoData,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = T)+
  theme_bw()

AIC(mod_polinomial2)

## [1] 3485.217

Regresi Polinomial Derajat 3 (ordo 3)

mod_polinomial3 = lm(horsepower ~ poly(mpg,3,raw = T),data=AutoData)
summary(mod_polinomial3)

## 
## Call:
## lm(formula = horsepower ~ poly(mpg, 3, raw = T), data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.935 -11.251  -1.338   9.324  95.459 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            429.581461  23.823018  18.032  < 2e-16 ***
## poly(mpg, 3, raw = T)1 -30.131244   3.006538 -10.022  < 2e-16 ***
## poly(mpg, 3, raw = T)2   0.866793   0.118533   7.313 1.51e-12 ***
## poly(mpg, 3, raw = T)3  -0.008506   0.001474  -5.770 1.62e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.69 on 388 degrees of freedom
## Multiple R-squared:  0.7403, Adjusted R-squared:  0.7382 
## F-statistic: 368.6 on 3 and 388 DF,  p-value: < 2.2e-16

Berdasarkan output di atas, seluruh parameter regresi polinomial dengan ordo 3 signifikan pada taraf 5% sehingga model secara keseluruhan signifikan untuk menggambarkan pola hubungan antara spesifikasi horsepower dengan tingkat miles per gallon. Meskipun demikian, secara eksploratif, model regresi ini dapat diamati melalui plot sebagai berikut:

ggplot(AutoData,aes(x=mpg, y=horsepower)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = T)+
  theme_bw()

AIC <- cbind(AIC(mod_linear),AIC(mod_polinomial2),AIC(mod_polinomial3))

MSE = function(pred,actual)
  {
  mean((pred-actual)^2)
}

MSE(predict(mod_linear),AutoData$horsepower)

## [1] 582.3257

MSE(predict(mod_polinomial2),AutoData$horsepower)

## [1] 416.7871

MSE(predict(mod_polinomial3),AutoData$horsepower)

## [1] 383.8523

Regresi Fungsi Tangga (5)

range(AutoData$horsepower)

## [1]  46 230

Gugus data yang diamati memiliki range data dari 46 hingga 230 sehingga secara keseluruhan, data dapat dibagi menjadi delapan selang yang masing-masing selangnya terbagi dalam nilai yang bulat.

mod_tangga5 = lm(horsepower ~ cut(mpg,5),data=AutoData)
summary(mod_tangga5)

## 
## Call:
## lm(formula = horsepower ~ cut(mpg, 5), data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -86.242 -11.378  -3.000   8.832  71.758 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             158.242      2.260   70.03   <2e-16 ***
## cut(mpg, 5)(16.5,24]    -55.242      2.942  -18.78   <2e-16 ***
## cut(mpg, 5)(24,31.6]    -77.073      3.116  -24.74   <2e-16 ***
## cut(mpg, 5)(31.6,39.1]  -85.971      3.603  -23.86   <2e-16 ***
## cut(mpg, 5)(39.1,46.6]  -98.542      7.182  -13.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.56 on 387 degrees of freedom
## Multiple R-squared:  0.6896, Adjusted R-squared:  0.6863 
## F-statistic: 214.9 on 4 and 387 DF,  p-value: < 2.2e-16

Berdasarkan output di atas, seluruh parameter regresi fungsi tangga(5) signifikan pada taraf 5%. Secara eksploratif, pola hubungan data yang digambarkan oleh model regresi ini dapat diamati melalui grafik sebagai berikut:

ggplot(AutoData,aes(x=mpg, y=horsepower)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Fungsi Tangga (7)

mod_tangga7 = lm(horsepower ~ cut(mpg,7),data=AutoData)
summary(mod_tangga7)

## 
## Call:
## lm(formula = horsepower ~ cut(mpg, 7), data = AutoData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52.48 -12.96  -2.25  10.31 105.52 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             169.692      2.960   57.32   <2e-16 ***
## cut(mpg, 7)(14.4,19.7]  -45.208      3.669  -12.32   <2e-16 ***
## cut(mpg, 7)(19.7,25.1]  -74.908      3.734  -20.06   <2e-16 ***
## cut(mpg, 7)(25.1,30.5]  -88.317      3.885  -22.73   <2e-16 ***
## cut(mpg, 7)(30.5,35.9]  -96.865      4.186  -23.14   <2e-16 ***
## cut(mpg, 7)(35.9,41.2] -100.442      5.268  -19.07   <2e-16 ***
## cut(mpg, 7)(41.2,46.6] -111.978      8.594  -13.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.35 on 385 degrees of freedom
## Multiple R-squared:  0.6972, Adjusted R-squared:  0.6924 
## F-statistic: 147.7 on 6 and 385 DF,  p-value: < 2.2e-16

Berdasarkan output di atas, seluruh parameter regresi fungsi tangga(7) signifikan pada taraf 5%. Secara eksploratif, pola hubungan data yang digambarkan oleh model regresi ini dapat diamati melalui grafik sebagai berikut:

ggplot(AutoData,aes(x=mpg, y=horsepower)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Berdasarkan fungsi tangga yang berhasil dibuat pada grafik di atas, dapat diamati bahwa terdapat hubungan negatif antara mpg dengan horsepower. Hal ini dapat dimaknai bahwa peningkatan tingkat kehematan penggunaan bensin dalam standar miles per gallon akan menurunkan dugaan rataan kualitas kendaraan berdasarkan standar horsepower.

Perbandingan Model

MSE = function(pred,actual){
  mean((pred-actual)^2)
}

Membandingkan model

nilai_MSE <- rbind(MSE(predict(mod_linear),AutoData$horsepower),
              MSE(predict(mod_polinomial2),AutoData$horsepower),
              MSE(predict(mod_polinomial3),AutoData$horsepower),
              MSE(predict(mod_tangga5),AutoData$horsepower),
              MSE(predict(mod_tangga7),AutoData$horsepower))
nama_model <- c("Linear","Poly (ordo=2)", "Poly (ordo=3)",
                "Tangga (breaks=5)", "Tangga (breaks=7)")
knitr::kable(data.frame(nama_model,nilai_MSE))

nama_model	nilai_MSE
Linear	582.3257
Poly (ordo=2)	416.7871
Poly (ordo=3)	383.8523
Tangga (breaks=5)	458.7770
Tangga (breaks=7)	447.5318

Berdasarkan perbandingan model nilai MSE diatas, Pemodelan terbaik untuk data AutoData berdasarkan nilai MSE paling kecil adalah model Poly (ordo=3) dengan nilai MSE sebesar 383.8523.

Evaluasi Model AutoData Menggunakan Cross Validation

Regresi Linear

set.seed(321)
cross_val <- vfold_cv(AutoData,v=10,strata = "horsepower")
metric_linear <- map_dfr(cross_val$splits,
    function(x){
    mod <- lm(horsepower ~ mpg,data=AutoData[x$in_id,])
    pred <- predict(mod,newdata=AutoData[-x$in_id,])
    truth <- AutoData[-x$in_id,]$horsepower
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    mae <- mlr3measures::mae(truth = truth,response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  }
)
metric_linear

## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  27.6  19.8
##  2  20.5  16.4
##  3  24.6  19.6
##  4  22.3  17.9
##  5  23.8  19.5
##  6  26.9  18.2
##  7  26.7  19.6
##  8  20.8  16.3
##  9  24.7  19.3
## 10  23.9  18.4

# menghitung rata-rata untuk 10 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear

##     rmse      mae 
## 24.17563 18.49880

Polynomial Derajat 2 (ordo 2)

set.seed(321)
cross_val <- vfold_cv(AutoData,v=10,strata = "horsepower")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(horsepower ~ poly(mpg,2,raw = T),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$horsepower
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly2

## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  22.6  15.8
##  2  18.8  15.1
##  3  21.6  15.9
##  4  21.5  16.6
##  5  19.4  14.7
##  6  23.8  15.1
##  7  22.6  15.9
##  8  17.4  12.8
##  9  18.5  13.3
## 10  18.7  13.7

# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2

##     rmse      mae 
## 20.48613 14.89612

Polynomial Derajat 3 (ordo 3)

set.seed(321)
cross_val <- vfold_cv(AutoData,v=10,strata = "horsepower")

metric_poly3 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(horsepower ~ poly(mpg,3,raw = T),data=AutoData[x$in_id,])
  pred <- predict(mod,newdata=AutoData[-x$in_id,])
  truth <- AutoData[-x$in_id,]$horsepower
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly3

## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  21.4  14.4
##  2  16.8  13.5
##  3  20.1  14.1
##  4  20.8  16.1
##  5  18.4  13.6
##  6  23.5  15.0
##  7  23.2  16.9
##  8  17.0  13.1
##  9  18.3  13.5
## 10  16.7  12.8

# menghitung rata-rata untuk 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3

##     rmse      mae 
## 19.61264 14.31697

Fungsi Tangga

set.seed(321)
cross_val <- vfold_cv(AutoData,v=10,strata = "horsepower")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,
    function(x){
        training <- AutoData[x$in_id,]
        training$mpg <- cut(training$mpg,i)
        mod <- lm(horsepower ~ mpg,data=training)
        labs_x <- levels(mod$model[,2])
        labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
        testing <- AutoData[-x$in_id,]
        mpg_new <- cut(testing$mpg,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
        pred <- predict(mod,newdata=list(mpg=mpg_new))
        truth <- testing$horsepower
        data_eval <- na.omit(data.frame(truth,pred))
        rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
        mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
  metric_tangga
  # menghitung rata-rata untuk 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  mean_metric_tangga
  }
)

best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga

##   breaks     rmse      mae
## 1      3 27.44423 20.78447
## 2      4 23.71931 16.50031
## 3      5 21.56049 15.60556
## 4      6 22.36664 16.09863
## 5      7 21.27573 15.94281
## 6      8 21.82000 15.76356
## 7      9 20.75449 14.64103
## 8     10 20.11574 14.32368

Berdasarkan nilai RMSE dan MAE didapatkan jumlah breaks terbaik adalah 10 breaks karena nilai RMSE dan MAE nya lebih kecil dari breaks yang lainnya yaitu bernilai 20.11574 dan 14.32368.

#berdasarkan rmse
best_tangga %>% slice_min(rmse)

##   breaks     rmse      mae
## 1     10 20.11574 14.32368

#berdasarkan mae
best_tangga %>% slice_min(mae)

##   breaks     rmse      mae
## 1     10 20.11574 14.32368

Perbandingan Model

nilai_metric <- rbind(mean_metric_linear,
                      mean_metric_poly2,
                      mean_metric_poly3,
                      best_tangga %>% select(-1) %>% slice_min(mae))

nama_model <- c("Linear","Poly2","Poly3","Tangga (breaks=10)")
knitr::kable(data.frame(nama_model,nilai_metric))

nama_model	rmse	mae
Linear	24.17563	18.49880
Poly2	20.48613	14.89612
Poly3	19.61264	14.31697
Tangga (breaks=10)	20.11574	14.32368

Berdasarkan hasil pemodelan, model terbaik pada data AutoData ini adalah model polynomial dengan derajat 3 Poly3 karena pada model tersebut nilai RMSE dan MAE paling rendah yaitu 19.61264 dan 14.31697 dibandingkan model lainnya.

Regresi Spline

#nilai knots yang ditentukan oleh komputer
knot_value_pc_df_6 <- attr(bs(AutoData$mpg, df=6),"knots")
knot_value_pc_df_6

##   25%   50%   75% 
## 17.00 22.75 29.00

b-spline

Menggunakan knot yang ditentukan fungsi komputer

knot_value_pc_df_6 = attr(bs(AutoData$mpg, df=6),"knots")
mod_spline_1 = lm(horsepower ~ bs(mpg, knots =knot_value_pc_df_6 ),data=AutoData)
summary(mod_spline_1)

## 
## Call:
## lm(formula = horsepower ~ bs(mpg, knots = knot_value_pc_df_6), 
##     data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.043 -10.103  -0.818   8.075  95.778 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           195.012     13.181  14.795  < 2e-16 ***
## bs(mpg, knots = knot_value_pc_df_6)1    2.792     18.879   0.148    0.882    
## bs(mpg, knots = knot_value_pc_df_6)2  -80.488     12.649  -6.363 5.62e-10 ***
## bs(mpg, knots = knot_value_pc_df_6)3 -103.774     14.645  -7.086 6.62e-12 ***
## bs(mpg, knots = knot_value_pc_df_6)4 -126.115     14.600  -8.638  < 2e-16 ***
## bs(mpg, knots = knot_value_pc_df_6)5 -127.143     18.836  -6.750 5.45e-11 ***
## bs(mpg, knots = knot_value_pc_df_6)6 -141.413     18.226  -7.759 7.77e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.56 on 385 degrees of freedom
## Multiple R-squared:  0.7457, Adjusted R-squared:  0.7417 
## F-statistic: 188.1 on 6 and 385 DF,  p-value: < 2.2e-16

ggplot(AutoData,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = knot_value_pc_df_6), 
               lty = 1,se = F)

Natural Spline

mod_spline3ns = lm(horsepower ~ ns(mpg, knots = knot_value_pc_df_6),data=AutoData)
summary(mod_spline3ns)

## 
## Call:
## lm(formula = horsepower ~ ns(mpg, knots = knot_value_pc_df_6), 
##     data = AutoData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -72.461 -10.731  -1.433   8.759  95.650 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           218.000      6.786   32.13   <2e-16 ***
## ns(mpg, knots = knot_value_pc_df_6)1 -130.511      6.470  -20.17   <2e-16 ***
## ns(mpg, knots = knot_value_pc_df_6)2 -109.368      6.183  -17.69   <2e-16 ***
## ns(mpg, knots = knot_value_pc_df_6)3 -237.134     15.815  -14.99   <2e-16 ***
## ns(mpg, knots = knot_value_pc_df_6)4 -115.116      8.615  -13.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.62 on 387 degrees of freedom
## Multiple R-squared:  0.7429, Adjusted R-squared:  0.7403 
## F-statistic: 279.6 on 4 and 387 DF,  p-value: < 2.2e-16

ggplot(AutoData,aes(x=mpg, y=horsepower)) +
                 geom_point(alpha=0.55, color="black")+
      stat_smooth(method = "lm", 
                 formula = y~ns(x, knots = knot_value_pc_df_6), 
                 lty = 1,se=F)

Smoothing Splin

Fungsi smooth.spline() dapat digunakan untuk melakukan smoothing spline pada R

model_sms <- with(data = AutoData,smooth.spline(mpg,horsepower))
model_sms

## Call:
## smooth.spline(x = mpg, y = horsepower)
## 
## Smoothing Parameter  spar= 0.8528728  lambda= 0.003422279 (12 iterations)
## Equivalent Degrees of Freedom (Df): 6.989153
## Penalized Criterion (RSS): 35962.53
## GCV: 386.3516

Fungsi smooth.spline secara otomatis memilih paramter lambda terbaik dengan menggunakan Cross-Validation (CV) dan Generalized Cross-Validation (GCV). Perbedaan utama antara keduanya adalah pada ukuran kebaikan model yang digunakan. Kalau CV menggunakan MSE sebagai ukuran kebaikan model, sedangkan GCV menggunakan Weighted-MSE. Banyaknya folds (lipatan) pada CV dan GCV ini adalah sejumlah obserbasinya.

pred_data <- broom::augment(model_sms)

ggplot(pred_data,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("mpg")+
  ylab("horsepower")+
  theme_bw()

Pengaruh parameter lambda terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:

model_sms_lambda <- data.frame(lambda=seq(0,5,by=0.5)) %>% 
  group_by(lambda) %>% 
  do(broom::augment(with(data = AutoData,smooth.spline(mpg,horsepower,lambda = .$lambda))))

p <- ggplot(model_sms_lambda,
       aes(x=x,y=y))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~lambda)
p

Jika kita tentukan df=7, maka hasil kurva model smooth.spline akan lebih merepresentasikan data

model_sms <- with(data = AutoData,smooth.spline(mpg,horsepower,df=7))
model_sms

## Call:
## smooth.spline(x = mpg, y = horsepower, df = 7)
## 
## Smoothing Parameter  spar= 0.8524981  lambda= 0.003401012 (13 iterations)
## Equivalent Degrees of Freedom (Df): 6.998735
## Penalized Criterion (RSS): 35955.27
## GCV: 386.3517

pred_data <- broom::augment(model_sms)

ggplot(pred_data,aes(x=x,y=y))+
  geom_point(alpha=0.55, color="black")+
  geom_line(aes(y=.fitted),col="blue",
            lty=1)+
  xlab("mpg")+
  ylab("horsepower")+
  theme_bw()

LOESS

Masih menggunakan data triceps seperti pada ilustrasi sebelumnya, kali ini kita akan mencoba melakukan pendekatan local regression dengan fungsi loess().

model_loess <- loess(horsepower~mpg,
                     data = AutoData)
summary(model_loess)

## Call:
## loess(formula = horsepower ~ mpg, data = AutoData)
## 
## Number of Observations: 392 
## Equivalent Number of Parameters: 4.8 
## Residual Standard Error: 19.58 
## Trace of smoother matrix: 5.24  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE

Pengaruh parameter span terhadap tingkat smoothness kurva bisa dilihat dari ilustrasi berikut:

model_loess_span <- data.frame(span=seq(0.1,5,by=0.5)) %>% 
  group_by(span) %>% 
  do(broom::augment(loess(horsepower~mpg,
                     data = AutoData,span=.$span)))

p2 <- ggplot(model_loess_span,
       aes(x=mpg,y=horsepower))+
  geom_line(aes(y=.fitted),
            col="blue",
            lty=1
            )+
  facet_wrap(~span)
p2

Pendekatan ini juga dapat dilakukan dengan fungsi stat_smooth() pada package ggplot2.

library(ggplot2)
ggplot(AutoData, aes(mpg,horsepower)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
               formula=y~x,
              span = 0.75,
              col="blue",
              lty=1,
              se=F)

Tuning span dapat dilakukan dengan menggunakan cross-validation:

set.seed(321)
cross_val <- vfold_cv(AutoData,v=10,strata = "horsepower")

span <- seq(0.1,1,length.out=50)

best_loess <- map_dfr(span, function(i){
metric_loess <- map_dfr(cross_val$splits,
    function(x){
mod <- loess(horsepower ~ mpg,span = i,
         data=AutoData[x$in_id,])
pred <- predict(mod,
               newdata=AutoData[-x$in_id,])
truth <- AutoData[-x$in_id,]$horsepower

data_eval <- na.omit(data.frame(pred=pred,
                                truth=truth))


rmse <- mlr3measures::rmse(truth = data_eval$truth,
                           response = data_eval$pred
                           )
mae <- mlr3measures::mae(truth = data_eval$truth,
                           response = data_eval$pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

head(metric_loess,20)

# menghitung rata-rata untuk 10 folds
mean_metric_loess <- colMeans(metric_loess)

mean_metric_loess
}
)

best_loess <- cbind(span=span,best_loess)
# menampilkan hasil all breaks
best_loess

##         span     rmse      mae
## 1  0.1000000 20.06658 14.50343
## 2  0.1183673 19.98018 14.45827
## 3  0.1367347 20.02960 14.50068
## 4  0.1551020 19.83222 14.33130
## 5  0.1734694 19.76067 14.25341
## 6  0.1918367 19.69695 14.20192
## 7  0.2102041 19.60227 14.06374
## 8  0.2285714 19.68914 13.99014
## 9  0.2469388 19.61269 13.90335
## 10 0.2653061 19.57755 13.88404
## 11 0.2836735 19.58542 13.88573
## 12 0.3020408 19.54519 13.87337
## 13 0.3204082 19.49646 13.82199
## 14 0.3387755 19.50335 13.85751
## 15 0.3571429 19.49059 13.84485
## 16 0.3755102 19.47745 13.84261
## 17 0.3938776 19.48092 13.84017
## 18 0.4122449 19.46175 13.83984
## 19 0.4306122 19.44309 13.83248
## 20 0.4489796 19.43226 13.82679
## 21 0.4673469 19.41030 13.82051
## 22 0.4857143 19.41848 13.82088
## 23 0.5040816 19.40417 13.82529
## 24 0.5224490 19.40444 13.81642
## 25 0.5408163 19.40158 13.82037
## 26 0.5591837 19.39331 13.81911
## 27 0.5775510 19.40145 13.83342
## 28 0.5959184 19.40865 13.85073
## 29 0.6142857 19.41877 13.86244
## 30 0.6326531 19.41516 13.87345
## 31 0.6510204 19.42004 13.88835
## 32 0.6693878 19.42951 13.90725
## 33 0.6877551 19.43720 13.92326
## 34 0.7061224 19.44473 13.93961
## 35 0.7244898 19.44736 13.94615
## 36 0.7428571 19.45931 13.97391
## 37 0.7612245 19.46561 13.98878
## 38 0.7795918 19.46900 14.00013
## 39 0.7979592 19.47484 14.01445
## 40 0.8163265 19.48400 14.02772
## 41 0.8346939 19.49541 14.05544
## 42 0.8530612 19.50050 14.06533
## 43 0.8714286 19.51227 14.07338
## 44 0.8897959 19.53289 14.10543
## 45 0.9081633 19.54367 14.12423
## 46 0.9265306 19.57023 14.13567
## 47 0.9448980 19.57647 14.14780
## 48 0.9632653 19.61259 14.19896
## 49 0.9816327 19.67852 14.26367
## 50 1.0000000 19.85137 14.39736

#berdasarkan rmse
best_loess %>% slice_min(rmse)

##        span     rmse      mae
## 1 0.5591837 19.39331 13.81911

#berdasarkan mae
best_loess %>% slice_min(mae)

##       span     rmse      mae
## 1 0.522449 19.40444 13.81642

library(ggplot2)
ggplot(AutoData, aes(mpg,horsepower)) +
  geom_point(alpha=0.5,color="black") +
  stat_smooth(method='loess',
              formula=y~x,
              span = 0.4306122,
              col="blue",
              lty=1,
              se=F)