STA1381: Nonlinear Regression Part 1
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.10
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
library(purrr)
library(rsample)
library(MultiKink)
set.seed(123)
data.x <- rnorm(1000,1,1)
err <- rnorm(1000)
y <- 5+2*data.x+3*data.x^2+err
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
# Regresi linier
lin.mod <-lm( y~data.x)
plot( data.x,y,xlim=c( -2,5), ylim=c( -10,70))
lines(data.x,lin.mod$fitted.values,col="red")
summary(lin.mod)
##
## Call:
## lm(formula = y ~ data.x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.686 -2.574 -1.428 1.195 27.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6056 0.1902 24.22 <2e-16 ***
## data.x 8.3790 0.1340 62.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.2 on 998 degrees of freedom
## Multiple R-squared: 0.7967, Adjusted R-squared: 0.7965
## F-statistic: 3911 on 1 and 998 DF, p-value: < 2.2e-16
pol.mod <- lm( y~data.x+I(data.x^2))
ix <- sort( data.x,index.return=T)$ix
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x[ix], pol.mod$fitted.values[ix],col="blue", cex=2)
summary(pol.mod)
##
## Call:
## lm(formula = y ~ data.x + I(data.x^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0319 -0.6942 0.0049 0.7116 3.2855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.95193 0.04568 108.41 <2e-16 ***
## data.x 2.10732 0.05861 35.95 <2e-16 ***
## I(data.x^2) 2.99081 0.02338 127.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.007 on 997 degrees of freedom
## Multiple R-squared: 0.9883, Adjusted R-squared: 0.9883
## F-statistic: 4.221e+04 on 2 and 997 DF, p-value: < 2.2e-16
Fungsi Tangga
#regresi fungsi tangga
range( data.x)
## [1] -1.809775 4.241040
c1 <- as.factor(ifelse(data.x<=0,1,0))
c2 <- as.factor(ifelse(data.x<=2 & data.x>0,1,0))
c3 <- as.factor(ifelse(data.x>2,1,0))
step.mod <- lm(y~c1+c2+c3)
plot(data.x,y,xlim=c(-2,5), ylim=c(-10,70))
lines(data.x,lin.mod$fitted.values,col="red")
lines(data.x[ix], step.mod$fitted.values[ix],col="green")
summary(step.mod)
##
## Call:
## lm(formula = y ~ c1 + c2 + c3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.395 -3.534 -0.530 2.527 36.876
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.4499 0.4087 74.50 <2e-16 ***
## c11 -25.2395 0.5710 -44.21 <2e-16 ***
## c21 -19.4184 0.4536 -42.81 <2e-16 ***
## c31 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.121 on 997 degrees of freedom
## Multiple R-squared: 0.698, Adjusted R-squared: 0.6974
## F-statistic: 1152 on 2 and 997 DF, p-value: < 2.2e-16
Komparasi Model
nilai_AIC <- rbind(AIC(lin.mod),
AIC(pol.mod),
AIC(step.mod))
nama_model <- c("Linear","Poly (ordo=2)","Tangga (breaks=3)")
data.frame(nama_model,nilai_AIC)
## nama_model nilai_AIC
## 1 Linear 5711.836
## 2 Poly (ordo=2) 2856.470
## 3 Tangga (breaks=3) 6109.609
Contoh data TRICEPS
age : umur responden Intriceps : logaritma dari ketebalan lipatan kulit triceps triceps: ketebalan lipatan kulit triceps Lipatan kulit trisep diperlukan untuk menghitung lingkar otot lengan atas. Ketebalannya memberikan informasi tentang cadangan lemak tubuh, sedangkan massa otot yang dihitung memberikan informasi tentang cadangan protein
data("triceps", package="MultiKink")
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
theme_bw()
## Berdasarkan pola hubungan yang terlihat pada scatterplot, kita akan mencoba untuk mencari model yang bisa merepresentasikan pola hubungan tersebut dengan baik.
Regresi Linear
mod_linear = lm(triceps~age,data=triceps)
summary(mod_linear)
##
## Call:
## lm(formula = triceps ~ age, data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9512 -2.3965 -0.5154 1.5822 25.1233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.19717 0.21244 29.17 <2e-16 ***
## age 0.21584 0.01014 21.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.007 on 890 degrees of freedom
## Multiple R-squared: 0.3372, Adjusted R-squared: 0.3365
## F-statistic: 452.8 on 1 and 890 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "blue",se = F)+
theme_bw()
Regresi Polinomial Derajat 2 (ordo 2)
mod_polinomial2 = lm(triceps ~ poly(age,2,raw = T),
data=triceps)
summary(mod_polinomial2)
##
## Call:
## lm(formula = triceps ~ poly(age, 2, raw = T), data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.5677 -2.4401 -0.4587 1.6368 24.9961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0229191 0.3063806 19.658 < 2e-16 ***
## poly(age, 2, raw = T)1 0.2434733 0.0364403 6.681 4.17e-11 ***
## poly(age, 2, raw = T)2 -0.0006257 0.0007926 -0.789 0.43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.008 on 889 degrees of freedom
## Multiple R-squared: 0.3377, Adjusted R-squared: 0.3362
## F-statistic: 226.6 on 2 and 889 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = F)+
theme_bw()
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,2,raw=T),
lty = 1, col = "blue",se = T)+
theme_bw()
Regresi Polinomial Derajat 3 (ordo 3)
mod_polinomial3 = lm(triceps ~ poly(age,3,raw = T),data=triceps)
summary(mod_polinomial3)
##
## Call:
## lm(formula = triceps ~ poly(age, 3, raw = T), data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5832 -1.9284 -0.5415 1.3283 24.4440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.004e+00 3.831e-01 20.893 < 2e-16 ***
## poly(age, 3, raw = T)1 -3.157e-01 7.721e-02 -4.089 4.73e-05 ***
## poly(age, 3, raw = T)2 3.101e-02 3.964e-03 7.824 1.45e-14 ***
## poly(age, 3, raw = T)3 -4.566e-04 5.612e-05 -8.135 1.38e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.868 on 888 degrees of freedom
## Multiple R-squared: 0.3836, Adjusted R-squared: 0.3815
## F-statistic: 184.2 on 3 and 888 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~poly(x,3,raw=T),
lty = 1, col = "blue",se = T)+
theme_bw()
Regresi Fungsi Tangga (5)
mod_tangga5 = lm(triceps ~ cut(age,5),data=triceps)
summary(mod_tangga5)
##
## Call:
## lm(formula = triceps ~ cut(age, 5), data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5474 -2.0318 -0.4465 1.3682 23.3759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2318 0.1994 36.260 < 2e-16 ***
## cut(age, 5)(10.6,20.9] 1.6294 0.3244 5.023 6.16e-07 ***
## cut(age, 5)(20.9,31.2] 5.9923 0.4222 14.192 < 2e-16 ***
## cut(age, 5)(31.2,41.5] 7.5156 0.4506 16.678 < 2e-16 ***
## cut(age, 5)(41.5,51.8] 7.4561 0.5543 13.452 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.939 on 887 degrees of freedom
## Multiple R-squared: 0.3617, Adjusted R-squared: 0.3588
## F-statistic: 125.7 on 4 and 887 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,5),
lty = 1, col = "blue",se = F)+
theme_bw()
Regresi Fungsi Tangga (7)
mod_tangga7 = lm(triceps ~ cut(age,7),data=triceps)
summary(mod_tangga7)
##
## Call:
## lm(formula = triceps ~ cut(age, 7), data = triceps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8063 -1.7592 -0.4366 1.2894 23.1461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.5592 0.2219 34.060 < 2e-16 ***
## cut(age, 7)(7.62,15] -0.6486 0.3326 -1.950 0.0515 .
## cut(age, 7)(15,22.3] 3.4534 0.4239 8.146 1.27e-15 ***
## cut(age, 7)(22.3,29.7] 5.8947 0.4604 12.804 < 2e-16 ***
## cut(age, 7)(29.7,37] 7.8471 0.5249 14.949 < 2e-16 ***
## cut(age, 7)(37,44.4] 6.9191 0.5391 12.835 < 2e-16 ***
## cut(age, 7)(44.4,51.8] 6.3013 0.6560 9.606 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.805 on 885 degrees of freedom
## Multiple R-squared: 0.4055, Adjusted R-squared: 0.4014
## F-statistic: 100.6 on 6 and 885 DF, p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
geom_point(alpha=0.55, color="black") +
stat_smooth(method = "lm",
formula = y~cut(x,7),
lty = 1, col = "blue",se = F)+
theme_bw()
Perbandingan Model
MSE = function(pred,actual){
mean((pred-actual)^2)
}
nilai_MSE <- rbind(MSE(predict(mod_linear),triceps$triceps),
MSE(predict(mod_polinomial2),triceps$triceps),
MSE(predict(mod_polinomial3),triceps$triceps),
MSE(predict(mod_tangga5),triceps$triceps),
MSE(predict(mod_tangga7),triceps$triceps))
nama_model <- c("Linear","Poly (ordo=2)", "Poly (ordo=3)",
"Tangga (breaks=5)", "Tangga (breaks=7)")
data.frame(nama_model,nilai_MSE)
## nama_model nilai_MSE
## 1 Linear 16.01758
## 2 Poly (ordo=2) 16.00636
## 3 Poly (ordo=3) 14.89621
## 4 Tangga (breaks=5) 15.42602
## 5 Tangga (breaks=7) 14.36779
Evaluasi Model Menggunakan Cross Validation
Regresi Linear
set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
metric_linear <- map_dfr(cross_val$splits,
function(x){
mod <- lm(triceps ~ age,data=triceps[x$in_id,])
pred <- predict(mod,newdata=triceps[-x$in_id,])
truth <- triceps[-x$in_id,]$triceps
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_linear
## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.65 2.82
## 2 4.62 3.22
## 3 4.38 3.00
## 4 3.85 2.80
## 5 3.08 2.36
## 6 3.83 2.81
## 7 3.59 2.78
## 8 4.66 3.06
## 9 3.50 2.56
## 10 4.59 2.93
# menghitung rata-rata untuk 10 folds
mean_metric_linear <- colMeans(metric_linear)
mean_metric_linear
## rmse mae
## 3.973249 2.833886
Polynomial Derajat 2 (ordo 2)
set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
metric_poly2 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(triceps ~ poly(age,2,raw = T),data=triceps[x$in_id,])
pred <- predict(mod,newdata=triceps[-x$in_id,])
truth <- triceps[-x$in_id,]$triceps
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly2
## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.64 2.82
## 2 4.62 3.26
## 3 4.39 3.02
## 4 3.85 2.81
## 5 3.10 2.38
## 6 3.82 2.81
## 7 3.62 2.83
## 8 4.65 3.07
## 9 3.50 2.57
## 10 4.59 2.94
# menghitung rata-rata untuk 10 folds
mean_metric_poly2 <- colMeans(metric_poly2)
mean_metric_poly2
## rmse mae
## 3.977777 2.851787
Polynomial Derajat 3 (ordo 3)
set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
metric_poly3 <- map_dfr(cross_val$splits,
function(x){
mod <- lm(triceps ~ poly(age,3,raw = T),data=triceps[x$in_id,])
pred <- predict(mod,newdata=triceps[-x$in_id,])
truth <- triceps[-x$in_id,]$triceps
rmse <- mlr3measures::rmse(truth = truth,response = pred)
mae <- mlr3measures::mae(truth = truth,response = pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_poly3
## # A tibble: 10 x 2
## rmse mae
## <dbl> <dbl>
## 1 3.49 2.60
## 2 4.48 2.99
## 3 4.21 2.85
## 4 4.02 2.75
## 5 3.03 2.09
## 6 3.63 2.59
## 7 3.53 2.52
## 8 4.54 2.91
## 9 3.27 2.33
## 10 4.27 2.68
# menghitung rata-rata untuk 10 folds
mean_metric_poly3 <- colMeans(metric_poly3)
mean_metric_poly3
## rmse mae
## 3.845976 2.632125
Fungsi Tangga
set.seed(123)
cross_val <- vfold_cv(triceps,v=10,strata = "triceps")
breaks <- 3:10
best_tangga <- map_dfr(breaks, function(i){
metric_tangga <- map_dfr(cross_val$splits,
function(x){
training <- triceps[x$in_id,]
training$age <- cut(training$age,i)
mod <- lm(triceps ~ age,data=training)
labs_x <- levels(mod$model[,2])
labs_x_breaks <- cbind(lower = as.numeric( sub("\\((.+),.*", "\\1", labs_x) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs_x) ))
testing <- triceps[-x$in_id,]
age_new <- cut(testing$age,c(labs_x_breaks[1,1],labs_x_breaks[,2]))
pred <- predict(mod,newdata=list(age=age_new))
truth <- testing$triceps
data_eval <- na.omit(data.frame(truth,pred))
rmse <- mlr3measures::rmse(truth = data_eval$truth,response = data_eval$pred)
mae <- mlr3measures::mae(truth = data_eval$truth,response = data_eval$pred)
metric <- c(rmse,mae)
names(metric) <- c("rmse","mae")
return(metric)
}
)
metric_tangga
# menghitung rata-rata untuk 10 folds
mean_metric_tangga <- colMeans(metric_tangga)
mean_metric_tangga
}
)
best_tangga <- cbind(breaks=breaks,best_tangga)
# menampilkan hasil all breaks
best_tangga
## breaks rmse mae
## 1 3 3.835357 2.618775
## 2 4 3.882932 2.651911
## 3 5 3.917840 2.724368
## 4 6 3.836068 2.622939
## 5 7 3.789715 2.555062
## 6 8 3.812789 2.555563
## 7 9 3.781720 2.518706
## 8 10 3.795877 2.529479
#berdasarkan rmse
best_tangga %>% slice_min(rmse)
## breaks rmse mae
## 1 9 3.78172 2.518706
#berdasarkan mae
best_tangga %>% slice_min(mae)
## breaks rmse mae
## 1 9 3.78172 2.518706
Perbandingan Model
nilai_metric <- rbind(mean_metric_linear,
mean_metric_poly2,
mean_metric_poly3,
best_tangga %>% select(-1) %>% slice_min(mae))
nama_model <- c("Linear","Poly2","Poly3","Tangga (breaks=9)")
data.frame(nama_model,nilai_metric)
## nama_model rmse mae
## 1 Linear 3.973249 2.833886
## 2 Poly2 3.977777 2.851787
## 3 Poly3 3.845976 2.632125
## 4 Tangga (breaks=9) 3.781720 2.518706