Tugas STA581 | Moving Beyond Linearity

Data Auto

attach(Auto)
head(cbind(mpg, horsepower))
##      mpg horsepower
## [1,]  18        130
## [2,]  15        165
## [3,]  18        150
## [4,]  16        150
## [5,]  17        140
## [6,]  15        198

Plot Awal

ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=1, color="orange") +
  theme_bw()

Cross Validation

1. Regresi Linear

Secara Visual

## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

Secara Empiris

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

metric_linear <- map_dfr(cross_val$splits,
    function(x) {
      
mod <- lm(mpg ~ horsepower, 
          data=Auto[x$in_id,])

pred <- predict(mod, newdata=Auto[-x$in_id,])

truth <- Auto[-x$in_id,]$mpg
                              
rmse <- mlr3measures::rmse(truth = truth, response = pred)

mae <- mlr3measures::mae(truth = truth, response = pred)

metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_linear
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  4.52  3.73
##  2  4.86  3.80
##  3  4.54  3.35
##  4  4.67  3.99
##  5  5.12  4.12
##  6  4.39  3.25
##  7  5.93  4.65
##  8  5.05  3.95
##  9  4.71  3.75
## 10  5.10  3.73
# menghitung rata-rata untuk 10 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
##     rmse      mae 
## 4.889086 3.832328

2. Regresi Polinomial

Secara Empiris

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

ordo <- 2:10

best_poly <- map_dfr(ordo, function(i){
metric_poly <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(horsepower,ordo=i), 
          data=Auto[x$in_id,])
pred <- predict(mod, 
                newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly

# menghitung rata-rata untuk 10 folds
mean_metric_poly <- colMeans(metric_poly)
mean_metric_poly

}
)

# menampilkan hasil all ordo
best_poly <- cbind(ordo=ordo,best_poly)
best_poly
##   ordo     rmse      mae
## 1    2 4.351129 3.266460
## 2    3 4.355430 3.266945
## 3    4 4.367009 3.280452
## 4    5 4.318767 3.249003
## 5    6 4.317431 3.257272
## 6    7 4.293242 3.233376
## 7    8 4.309511 3.251758
## 8    9 4.323321 3.244919
## 9   10 4.371903 3.274887
#berdasarkan rmse
best_poly %>% slice_min(rmse)
##   ordo     rmse      mae
## 1    7 4.293242 3.233376

Plot CV

best_poly$ordo <- as.factor(best_poly$ordo)
ggplot(data=best_poly, aes(x=ordo, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Ordo") + 
  ylab("RMSE")+
  theme_bw()

Secara Visual

A <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=1, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 2") +
  xlab("horse power") + 
  ylab("mile per gallon") 

B <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=1, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~poly(x,7,raw=T), 
              lty = 1, col = "blue",se = F)+
  theme_bw()+
  ggtitle("Regresi Polinomial Derajat 7") +
  xlab("horse power") + 
  ylab("mile per gallon")

ggarrange(A, B, ncol = 2, nrow = 1)

Jika dilihat berdasarkan plot, meskipun nilai RMSE Regresi Polinomial ordo 7 lebih kecil daripada ordo 2, namun model terbaik yang dipilih adalah Regresi Polinomial ordo 2 karena untuk ordo 7 pada bagian ujung-ujung plot cenderung tidak stabil.

Regresi Polinomial Derajat 2

mod_polinomial2 = lm(mpg ~ poly(horsepower,2))
summary(mod_polinomial2)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.7135  -2.5943  -0.0859   2.2868  15.8961 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.4459     0.2209  106.13   <2e-16 ***
## poly(horsepower, 2)1 -120.1377     4.3739  -27.47   <2e-16 ***
## poly(horsepower, 2)2   44.0895     4.3739   10.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.686 
## F-statistic:   428 on 2 and 389 DF,  p-value: < 2.2e-16
set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

metric_poly2 <- map_dfr(cross_val$splits,
    function(x){
mod <- lm(mpg ~ poly(horsepower,2,raw=T), 
          data=Auto[x$in_id,])
pred <- predict(mod, 
                newdata=Auto[-x$in_id,])
truth <- Auto[-x$in_id,]$mpg

rmse <- mlr3measures::rmse(truth = truth,
                           response = pred
                           )
mae <- mlr3measures::mae(truth = truth,
                           response = pred
                           )
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_poly2
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  4.06  3.03
##  2  4.62  3.59
##  3  4.28  3.22
##  4  4.19  3.13
##  5  4.97  3.94
##  6  3.53  2.51
##  7  4.92  3.66
##  8  4.07  2.96
##  9  3.79  3.07
## 10  5.09  3.55
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
##     rmse      mae 
## 4.351129 3.266460

3. Piecewise Constant (Fungsi Tangga)

Secara Empiris

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

breaks <- 2:20

best_tangga <- map_dfr(breaks, function(i){

metric_tangga <- map_dfr(cross_val$splits,
    function(x){

training <- Auto[x$in_id,]
training$horsepower <- cut(training$horsepower,i)
                             
mod <- lm(mpg ~ horsepower,
          data=training)

labs_x <- levels(mod$model[,2])

labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                 upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x)))
                             
testing <- Auto[-x$in_id,]
                             
horsepower_new <- cut(testing$horsepower,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
                             
pred <- predict(mod,   
                newdata=list(horsepower=horsepower_new))
                             
truth <- testing$mpg
                             
data_eval <- na.omit(data.frame(truth,pred))
                             
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
                             
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
                             
metric <- c(rmse,mae)
                             
names(metric) <- c("rmse","mae")
return(metric)
}
)

metric_tangga

# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)

# menampilkan hasil all breaks
best_tangga <- cbind(breaks=breaks,best_tangga)
best_tangga
##    breaks     rmse      mae
## 1       2 6.147626 4.790234
## 2       3 5.775792 4.521829
## 3       4 4.985270 3.774549
## 4       5 4.712623 3.571614
## 5       6 4.644383 3.548375
## 6       7 4.554116 3.379980
## 7       8 4.437597 3.405433
## 8       9 4.549668 3.471999
## 9      10 4.589172 3.455531
## 10     11 4.531039 3.380931
## 11     12 4.442697 3.321151
## 12     13 4.527126 3.375170
## 13     14 4.428824 3.270424
## 14     15 4.313162 3.286289
## 15     16 4.336661 3.302031
## 16     17 4.414343 3.347532
## 17     18 4.340715 3.221099
## 18     19 4.331798 3.224404
## 19     20 4.480710 3.320605
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
##   breaks     rmse      mae
## 1     15 4.313162 3.286289

Plot CV

best_tangga$breaks <- as.factor(best_tangga$breaks)
ggplot(data=best_tangga, aes(x=breaks, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("Breaks") + 
  ylab("RMSE")+
  theme_bw()

Secara Visual

mod_tangga = lm(mpg ~ cut(horsepower,15))
summary(mod_tangga)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 15))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.0137  -2.5309  -0.3381   2.2360  14.9800 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     34.1800     1.1077  30.857  < 2e-16 ***
## cut(horsepower, 15)(58.3,70.5]  -0.6514     1.2472  -0.522    0.602    
## cut(horsepower, 15)(70.5,82.8]  -6.1663     1.2601  -4.894 1.47e-06 ***
## cut(horsepower, 15)(82.8,95.1]  -9.4160     1.1974  -7.864 3.97e-14 ***
## cut(horsepower, 15)(95.1,107]  -13.2778     1.2756 -10.409  < 2e-16 ***
## cut(horsepower, 15)(107,120]   -13.1421     1.3644  -9.632  < 2e-16 ***
## cut(horsepower, 15)(120,132]   -16.8400     1.5665 -10.750  < 2e-16 ***
## cut(horsepower, 15)(132,144]   -16.4600     1.5665 -10.507  < 2e-16 ***
## cut(horsepower, 15)(144,156]   -19.3439     1.3184 -14.672  < 2e-16 ***
## cut(horsepower, 15)(156,169]   -20.3425     1.8782 -10.831  < 2e-16 ***
## cut(horsepower, 15)(169,181]   -20.3800     1.5665 -13.010  < 2e-16 ***
## cut(horsepower, 15)(181,193]   -21.0550     2.4141  -8.722  < 2e-16 ***
## cut(horsepower, 15)(193,205]   -21.8467     2.7133  -8.052 1.08e-14 ***
## cut(horsepower, 15)(205,218]   -22.3800     2.2154 -10.102  < 2e-16 ***
## cut(horsepower, 15)(218,230]   -20.1800     2.2154  -9.109  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.29 on 377 degrees of freedom
## Multiple R-squared:  0.7087, Adjusted R-squared:  0.6979 
## F-statistic: 65.51 on 14 and 377 DF,  p-value: < 2.2e-16
ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=1, color="orange") +
  stat_smooth(method = "lm", 
              formula = y~cut(x,15),lty = 1,
              col = "blue",se = F)+
  theme_bw()+
  ggtitle("Regresi Fungsi Tangga dengan 15 knots") +
  xlab("horse power") + 
  ylab("mile per gallon") 

4. Natural Cubic Splines

Secara Empiris

set.seed(123)
cross_val <- vfold_cv(Auto,v=10,strata = "mpg")

df <- 2:15

best_spline <- map_dfr(df, function(i) {
metric_spline <- map_dfr(cross_val$splits,
    function(x) {
                              
mod <- lm(mpg ~ ns(horsepower,df=i), 
          data=Auto[x$in_id,])
                              
pred <- predict(mod, 
                newdata=Auto[-x$in_id,])
                             
truth <- Auto[-x$in_id,]$mpg
                              
rmse <- mlr3measures::rmse(truth = truth, 
                           response = pred
                           )
                              
mae <- mlr3measures::mae(truth = truth, 
                         response = pred
                         )
                              
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
  
metric_spline

# menghitung rata-rata untuk 10 folds
mean_metric_spline <- colMeans(metric_spline)
mean_metric_spline
}
)

# menampilkan hasil all df
best_spline <- cbind(df=df,best_spline)
best_spline
##    df     rmse      mae
## 1   2 4.330108 3.252908
## 2   3 4.348006 3.261538
## 3   4 4.329461 3.257926
## 4   5 4.313312 3.246412
## 5   6 4.310011 3.237990
## 6   7 4.294671 3.229174
## 7   8 4.293369 3.232221
## 8   9 4.281877 3.211860
## 9  10 4.260316 3.192010
## 10 11 4.288667 3.220958
## 11 12 4.285063 3.192363
## 12 13 4.302071 3.220998
## 13 14 4.322821 3.244848
## 14 15 4.305928 3.250204
#berdasarkan rmse
best_spline %>% slice_min(rmse)
##   df     rmse     mae
## 1 10 4.260316 3.19201

Plot CV

best_spline$df <- as.factor(best_spline$df)
ggplot(data=best_spline, aes(x=df, y=rmse, group=1)) +
  geom_line(linetype = "dashed")+
  geom_point(cex=3,col="red")+
  ggtitle("K-Fold CV") +
  xlab("DF") + 
  ylab("RMSE")+
  theme_bw()

Secara Visual

Knots

#nilai knots yang ditentukan oleh komputer
attr(ns(Auto$horsepower, df=10),"knots")
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##  67.0  72.0  80.0  88.0  93.5 100.0 110.0 140.0 157.7
mod_ncs = lm(mpg ~ ns(horsepower, df=10))
summary(mod_ncs)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, df = 10))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5003  -2.4956  -0.2086   2.2577  14.6400 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 32.462      1.680  19.324  < 2e-16 ***
## ns(horsepower, df = 10)1    -4.600      1.789  -2.572 0.010495 *  
## ns(horsepower, df = 10)2    -3.007      2.555  -1.177 0.239996    
## ns(horsepower, df = 10)3    -8.663      2.043  -4.239 2.82e-05 ***
## ns(horsepower, df = 10)4    -7.170      2.188  -3.278 0.001142 ** 
## ns(horsepower, df = 10)5   -14.722      2.125  -6.927 1.84e-11 ***
## ns(horsepower, df = 10)6    -8.446      2.467  -3.424 0.000684 ***
## ns(horsepower, df = 10)7   -16.947      2.080  -8.148 5.38e-15 ***
## ns(horsepower, df = 10)8   -21.715      1.891 -11.483  < 2e-16 ***
## ns(horsepower, df = 10)9   -14.384      4.116  -3.494 0.000531 ***
## ns(horsepower, df = 10)10  -22.326      1.946 -11.476  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared:  0.7124, Adjusted R-squared:  0.7049 
## F-statistic: 94.39 on 10 and 381 DF,  p-value: < 2.2e-16
ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=1, color="orange") +
stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1,col = "blue", se=F)+
  theme_bw()+
  ggtitle("Natural Cubic Splines (df=10)") +
  xlab("horse power") + 
  ylab("mile per gallon")+
  geom_vline(xintercept = c(67, 72, 80, 88, 93.5, 100, 110, 140,157.7), col="grey50", lty=2)

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

nilai_metric <- rbind(mean_metric_linear,
                      mean_metric_poly2,
                      best_tangga %>% select(-1) %>% slice_min(rmse),
                      best_spline %>%  select(-1)%>% slice_min(rmse)
                      )

nama_model <- c("Linear",
                "Polinomial ordo 2",
                "Fungsi Tangga (breaks=15)",
                "Natural Cubic Splines (df=10)"
                )

komparasi<-data.frame(nama_model,nilai_metric)
komparasi %>%  datatable(komparasi, class = 'cell-border stripe')

Berdasarkan Plot

ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=1, color="orange") +
    stat_smooth(method = "lm", 
              formula = y~x,
              lty = 1, col = "black",se = F)+
  
    stat_smooth(method = "lm", 
              formula = y~poly(x,2,raw=T), 
              lty = 1, col = "red",se = F)+
  
  
    stat_smooth(method = "lm", 
              formula = y~cut(x,15), 
              lty = 1, col = "green3",se = F)+
  
  
    stat_smooth(method = "lm", 
               formula = y~ns(x, df=10), 
               lty = 1,col = "blue", se=F)+
  ggtitle("Auto")+
  theme_bw()

Subset Data Amerika

Plot Awal

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

Berdasarkan Plot

Subset Data Eropa

Plot Awal

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

Berdasarkan Plot

Subset Data Jepang

Plot Awal

Komparasi Model

Berdasarkan Nilai RMSE dan MAE

Berdasarkan Plot

Plot model terbaik masing-masing subset negara adalah sebagai berikut:

Terima kasih