STA1381: Nonlinear Regression Part 1

Library

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.5      v purrr   0.3.4 
## v tibble  3.1.8      v dplyr   1.0.10
## v tidyr   1.2.0      v stringr 1.4.0 
## v readr   2.1.2      v forcats 0.5.1 
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(MultiKink)

Contoh Pada Kuliah

set.seed(123)
data.x <- rnorm(1000,1,1)
err <- rnorm(1000)
y <- 5+2*data.x+3*data.x^2+err
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))

# Regresi linier

lin.mod <-lm( y~data.x)
plot( data.x,y,xlim=c( -2,5), ylim=c( -10,70))
lines(data.x,lin.mod$fitted.values,col="red")

summary(lin.mod)
## 
## Call:
## lm(formula = y ~ data.x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.686 -2.574 -1.428  1.195 27.185 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6056     0.1902   24.22   <2e-16 ***
## data.x        8.3790     0.1340   62.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.2 on 998 degrees of freedom
## Multiple R-squared:  0.7967, Adjusted R-squared:  0.7965 
## F-statistic:  3911 on 1 and 998 DF,  p-value: < 2.2e-16

Polynomial

pol.mod <- lm( y~data.x+I(data.x^2))
ix <- sort( data.x,index.return=T)$ix
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x[ix], pol.mod$fitted.values[ix],col="blue", cex=2)

summary(pol.mod)
## 
## Call:
## lm(formula = y ~ data.x + I(data.x^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0319 -0.6942  0.0049  0.7116  3.2855 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.95193    0.04568  108.41   <2e-16 ***
## data.x       2.10732    0.05861   35.95   <2e-16 ***
## I(data.x^2)  2.99081    0.02338  127.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.007 on 997 degrees of freedom
## Multiple R-squared:  0.9883, Adjusted R-squared:  0.9883 
## F-statistic: 4.221e+04 on 2 and 997 DF,  p-value: < 2.2e-16

Fungsi Tangga

#regresi fungsi tangga
range( data.x)
## [1] -1.809775  4.241040
c1 <- as.factor(ifelse(data.x<=0,1,0))
c2 <- as.factor(ifelse(data.x<=2 & data.x>0,1,0))
c3 <- as.factor(ifelse(data.x>2,1,0))
step.mod <- lm(y~c1+c2+c3)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x,lin.mod$fitted.values,col="red")
lines(data.x[ix], step.mod$fitted.values[ix],col="green")

summary(step.mod)
## 
## Call:
## lm(formula = y ~ c1 + c2 + c3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.395  -3.534  -0.530   2.527  36.876 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.4499     0.4087   74.50   <2e-16 ***
## c11         -25.2395     0.5710  -44.21   <2e-16 ***
## c21         -19.4184     0.4536  -42.81   <2e-16 ***
## c31               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.121 on 997 degrees of freedom
## Multiple R-squared:  0.698,  Adjusted R-squared:  0.6974 
## F-statistic:  1152 on 2 and 997 DF,  p-value: < 2.2e-16

Komparasi Model

nilai_AIC <- rbind(AIC(lin.mod),
                   AIC(pol.mod),
                   AIC(step.mod))

nama_model <- c("Linear","Poly (ordo=2)","Tangga (breaks=3)")
data.frame(nama_model,nilai_AIC)
##          nama_model nilai_AIC
## 1            Linear  5711.836
## 2     Poly (ordo=2)  2856.470
## 3 Tangga (breaks=3)  6109.609

Contoh data TRICEPS

Data yang digunakan untuk ilustrasi berasal dari studi antropometri terhadap 892 perempuan di bawah 50 tahun di tiga desa Gambia di Afrika Barat. Data terdiri dari 3 Kolom yaitu Age, Intriceps dan tricepts. Berikut adalah penjelasannya pada masing-masing kolom:

age : umur responden Intriceps : logaritma dari ketebalan lipatan kulit triceps triceps: ketebalan lipatan kulit triceps Lipatan kulit trisep diperlukan untuk menghitung lingkar otot lengan atas. Ketebalannya memberikan informasi tentang cadangan lemak tubuh, sedangkan massa otot yang dihitung memberikan informasi tentang cadangan protein

data("triceps", package="MultiKink")

Jika kita gambarkan dalam bentuk scatterplot

ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw() 

## Berdasarkan pola hubungan yang terlihat pada scatterplot, kita akan mencoba untuk mencari model yang bisa merepresentasikan pola hubungan tersebut dengan baik.

Regresi Linear

mod_linear = lm(triceps~age,data=triceps)
summary(mod_linear)
## 
## Call:
## lm(formula = triceps ~ age, data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9512  -2.3965  -0.5154   1.5822  25.1233 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.19717    0.21244   29.17   <2e-16 ***
## age          0.21584    0.01014   21.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.007 on 890 degrees of freedom
## Multiple R-squared:  0.3372, Adjusted R-squared:  0.3365 
## F-statistic: 452.8 on 1 and 890 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "blue",se = F)+
  theme_bw()

Regresi Polinomial Derajat 2 (ordo 2)

mod_polinomial2 = lm(triceps ~ poly(age,2,raw = T),
                     data=triceps)
summary(mod_polinomial2)
## 
## Call:
## lm(formula = triceps ~ poly(age, 2, raw = T), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5677  -2.4401  -0.4587   1.6368  24.9961 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.0229191  0.3063806  19.658  < 2e-16 ***
## poly(age, 2, raw = T)1  0.2434733  0.0364403   6.681 4.17e-11 ***
## poly(age, 2, raw = T)2 -0.0006257  0.0007926  -0.789     0.43    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.008 on 889 degrees of freedom
## Multiple R-squared:  0.3377, Adjusted R-squared:  0.3362 
## F-statistic: 226.6 on 2 and 889 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

ggplot(triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = T)+
  theme_bw()

Regresi Polinomial Derajat 3 (ordo 3)

mod_polinomial3 = lm(triceps ~ poly(age,3,raw = T),data=triceps)
summary(mod_polinomial3)
## 
## Call:
## lm(formula = triceps ~ poly(age, 3, raw = T), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5832  -1.9284  -0.5415   1.3283  24.4440 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             8.004e+00  3.831e-01  20.893  < 2e-16 ***
## poly(age, 3, raw = T)1 -3.157e-01  7.721e-02  -4.089 4.73e-05 ***
## poly(age, 3, raw = T)2  3.101e-02  3.964e-03   7.824 1.45e-14 ***
## poly(age, 3, raw = T)3 -4.566e-04  5.612e-05  -8.135 1.38e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.868 on 888 degrees of freedom
## Multiple R-squared:  0.3836, Adjusted R-squared:  0.3815 
## F-statistic: 184.2 on 3 and 888 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,3,raw=T), 
               lty = 1, col = "blue",se = T)+
  theme_bw()

Regresi Fungsi Tangga (5)

mod_tangga5 = lm(triceps ~ cut(age,5),data=triceps)
summary(mod_tangga5)
## 
## Call:
## lm(formula = triceps ~ cut(age, 5), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5474  -2.0318  -0.4465   1.3682  23.3759 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.2318     0.1994  36.260  < 2e-16 ***
## cut(age, 5)(10.6,20.9]   1.6294     0.3244   5.023 6.16e-07 ***
## cut(age, 5)(20.9,31.2]   5.9923     0.4222  14.192  < 2e-16 ***
## cut(age, 5)(31.2,41.5]   7.5156     0.4506  16.678  < 2e-16 ***
## cut(age, 5)(41.5,51.8]   7.4561     0.5543  13.452  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.939 on 887 degrees of freedom
## Multiple R-squared:  0.3617, Adjusted R-squared:  0.3588 
## F-statistic: 125.7 on 4 and 887 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Fungsi Tangga (7)

mod_tangga7 = lm(triceps ~ cut(age,7),data=triceps)
summary(mod_tangga7)
## 
## Call:
## lm(formula = triceps ~ cut(age, 7), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8063  -1.7592  -0.4366   1.2894  23.1461 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.5592     0.2219  34.060  < 2e-16 ***
## cut(age, 7)(7.62,15]    -0.6486     0.3326  -1.950   0.0515 .  
## cut(age, 7)(15,22.3]     3.4534     0.4239   8.146 1.27e-15 ***
## cut(age, 7)(22.3,29.7]   5.8947     0.4604  12.804  < 2e-16 ***
## cut(age, 7)(29.7,37]     7.8471     0.5249  14.949  < 2e-16 ***
## cut(age, 7)(37,44.4]     6.9191     0.5391  12.835  < 2e-16 ***
## cut(age, 7)(44.4,51.8]   6.3013     0.6560   9.606  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.805 on 885 degrees of freedom
## Multiple R-squared:  0.4055, Adjusted R-squared:  0.4014 
## F-statistic: 100.6 on 6 and 885 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
       geom_point(alpha=0.55, color="black") +
       stat_smooth(method = "lm", 
               formula = y~cut(x,7), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Perbandingan Model

MSE = function(pred,actual){
  mean((pred-actual)^2)
}

Membandingkan model

nilai_MSE <- rbind(MSE(predict(mod_linear),triceps$triceps),
              MSE(predict(mod_polinomial2),triceps$triceps),
              MSE(predict(mod_polinomial3),triceps$triceps),
              MSE(predict(mod_tangga5),triceps$triceps),
              MSE(predict(mod_tangga7),triceps$triceps))
nama_model <- c("Linear","Poly (ordo=2)", "Poly (ordo=3)",
                "Tangga (breaks=5)", "Tangga (breaks=7)")
data.frame(nama_model,nilai_MSE)
##          nama_model nilai_MSE
## 1            Linear  16.01758
## 2     Poly (ordo=2)  16.00636
## 3     Poly (ordo=3)  14.89621
## 4 Tangga (breaks=5)  15.42602
## 5 Tangga (breaks=7)  14.36779

Evaluasi Model Menggunakan Cross Validation

Regresi Linear

set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
metric_linear <- map_dfr(cross_val$splits,
    function(x){
    mod <- lm(triceps ~ age,data=triceps[x$in_id,])
    pred <- predict(mod,newdata=triceps[-x$in_id,])
    truth <- triceps[-x$in_id,]$triceps
    rmse <- mlr3measures::rmse(truth = truth,response = pred)
    mae <- mlr3measures::mae(truth = truth,response = pred)
    metric <- c(rmse,mae)
    names(metric) <- c("rmse","mae")
    return(metric)
  }
)
metric_linear
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  3.65  2.82
##  2  4.62  3.22
##  3  4.38  3.00
##  4  3.85  2.80
##  5  3.08  2.36
##  6  3.83  2.81
##  7  3.59  2.78
##  8  4.66  3.06
##  9  3.50  2.56
## 10  4.59  2.93
# menghitung rata-rata untuk 10 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
##     rmse      mae 
## 3.973249 2.833886

Polynomial Derajat 2 (ordo 2)

set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(triceps ~ poly(age,2,raw = T),data=triceps[x$in_id,])
  pred <- predict(mod,newdata=triceps[-x$in_id,])
  truth <- triceps[-x$in_id,]$triceps
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly2
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  3.64  2.82
##  2  4.62  3.26
##  3  4.39  3.02
##  4  3.85  2.81
##  5  3.10  2.38
##  6  3.82  2.81
##  7  3.62  2.83
##  8  4.65  3.07
##  9  3.50  2.57
## 10  4.59  2.94
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
##     rmse      mae 
## 3.977777 2.851787

Polynomial Derajat 3 (ordo 3)

set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")

metric_poly3 <- map_dfr(cross_val$splits,
function(x){
  mod <- lm(triceps ~ poly(age,3,raw = T),data=triceps[x$in_id,])
  pred <- predict(mod,newdata=triceps[-x$in_id,])
  truth <- triceps[-x$in_id,]$triceps
  rmse <- mlr3measures::rmse(truth = truth,response = pred)
  mae <- mlr3measures::mae(truth = truth,response = pred)
  metric <- c(rmse,mae)
  names(metric) <- c("rmse","mae")
  return(metric)
  }
)
metric_poly3
## # A tibble: 10 x 2
##     rmse   mae
##    <dbl> <dbl>
##  1  3.49  2.60
##  2  4.48  2.99
##  3  4.21  2.85
##  4  4.02  2.75
##  5  3.03  2.09
##  6  3.63  2.59
##  7  3.53  2.52
##  8  4.54  2.91
##  9  3.27  2.33
## 10  4.27  2.68
# menghitung rata-rata untuk 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3
##     rmse      mae 
## 3.845976 2.632125

Fungsi Tangga

set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
    metric_tangga <- map_dfr(cross_val$splits,
    function(x){
        training <- triceps[x$in_id,]
        training$age <- cut(training$age,i)
        mod <- lm(triceps ~ age,data=training)
        labs_x <- levels(mod$model[,2])
        labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
                  upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
        testing <- triceps[-x$in_id,]
        age_new <- cut(testing$age,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
        pred <- predict(mod,newdata=list(age=age_new))
        truth <- testing$triceps
        data_eval <- na.omit(data.frame(truth,pred))
        rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
        mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
        metric <- c(rmse,mae)
        names(metric) <- c("rmse","mae")
        return(metric)
      }
    )
  metric_tangga
  # menghitung rata-rata untuk 10 folds
  mean_metric_tangga <- colMeans(metric_tangga)
  mean_metric_tangga
  }
)

best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
##   breaks     rmse      mae
## 1      3 3.835357 2.618775
## 2      4 3.882932 2.651911
## 3      5 3.917840 2.724368
## 4      6 3.836068 2.622939
## 5      7 3.789715 2.555062
## 6      8 3.812789 2.555563
## 7      9 3.781720 2.518706
## 8     10 3.795877 2.529479
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
##   breaks    rmse      mae
## 1      9 3.78172 2.518706
#berdasarkan mae
best_tangga %>% slice_min(mae)
##   breaks    rmse      mae
## 1      9 3.78172 2.518706

Perbandingan Model

nilai_metric <- rbind(mean_metric_linear,
                      mean_metric_poly2,
                      mean_metric_poly3,
                      best_tangga %>% select(-1) %>% slice_min(mae))

nama_model <- c("Linear","Poly2","Poly3","Tangga (breaks=9)")
data.frame(nama_model,nilai_metric)
##          nama_model     rmse      mae
## 1            Linear 3.973249 2.833886
## 2             Poly2 3.977777 2.851787
## 3             Poly3 3.845976 2.632125
## 4 Tangga (breaks=9) 3.781720 2.518706