Regresi Polinomial, Regresi Fungsi Tangga dan Regresi Spline

Gerry Alfa Dito1

Rahma Anisa2

Package

Silahkan install jika belum ada

install.packages("tidyverse")
install.packages("MultiKink")
install.packages("splines")
library(tidyverse)
library(splines)

Data

data("triceps",package="MultiKink")
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") + 
                 theme_bw() 

Regresi linear

mod_linear = lm(triceps~age,data=triceps)
summary(mod_linear)
## 
## Call:
## lm(formula = triceps ~ age, data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9512  -2.3965  -0.5154   1.5822  25.1233 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.19717    0.21244   29.17   <2e-16 ***
## age          0.21584    0.01014   21.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.007 on 890 degrees of freedom
## Multiple R-squared:  0.3372, Adjusted R-squared:  0.3365 
## F-statistic: 452.8 on 1 and 890 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
   stat_smooth(method = "lm", 
               formula = y~x,lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi polinomial

mod_polinomial2 = lm(triceps ~ poly(age,2,raw = T),data=triceps)
summary(mod_polinomial2)
## 
## Call:
## lm(formula = triceps ~ poly(age, 2, raw = T), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5677  -2.4401  -0.4587   1.6368  24.9961 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.0229191  0.3063806  19.658  < 2e-16 ***
## poly(age, 2, raw = T)1  0.2434733  0.0364403   6.681 4.17e-11 ***
## poly(age, 2, raw = T)2 -0.0006257  0.0007926  -0.789     0.43    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.008 on 889 degrees of freedom
## Multiple R-squared:  0.3377, Adjusted R-squared:  0.3362 
## F-statistic: 226.6 on 2 and 889 DF,  p-value: < 2.2e-16
  ggplot(triceps,aes(x=age, y=triceps)) + 
  geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~poly(x,2,raw=T), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

mod_polinomial3 = lm(triceps ~ poly(age,3,raw = T),data=triceps)
summary(mod_polinomial3)
## 
## Call:
## lm(formula = triceps ~ poly(age, 3, raw = T), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5832  -1.9284  -0.5415   1.3283  24.4440 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             8.004e+00  3.831e-01  20.893  < 2e-16 ***
## poly(age, 3, raw = T)1 -3.157e-01  7.721e-02  -4.089 4.73e-05 ***
## poly(age, 3, raw = T)2  3.101e-02  3.964e-03   7.824 1.45e-14 ***
## poly(age, 3, raw = T)3 -4.566e-04  5.612e-05  -8.135 1.38e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.868 on 888 degrees of freedom
## Multiple R-squared:  0.3836, Adjusted R-squared:  0.3815 
## F-statistic: 184.2 on 3 and 888 DF,  p-value: < 2.2e-16

Regresi Fungsi Tangga

mod_tangga = lm(triceps ~ cut(age,5),data=triceps)
summary(mod_tangga)
## 
## Call:
## lm(formula = triceps ~ cut(age, 5), data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5474  -2.0318  -0.4465   1.3682  23.3759 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.2318     0.1994  36.260  < 2e-16 ***
## cut(age, 5)(10.6,20.9]   1.6294     0.3244   5.023 6.16e-07 ***
## cut(age, 5)(20.9,31.2]   5.9923     0.4222  14.192  < 2e-16 ***
## cut(age, 5)(31.2,41.5]   7.5156     0.4506  16.678  < 2e-16 ***
## cut(age, 5)(41.5,51.8]   7.4561     0.5543  13.452  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.939 on 887 degrees of freedom
## Multiple R-squared:  0.3617, Adjusted R-squared:  0.3588 
## F-statistic: 125.7 on 4 and 887 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~cut(x,5), 
               lty = 1, col = "blue",se = F)+
  theme_bw()

Regresi Spline

mod_spline3 = lm(triceps ~ bs(age, knots = c(5, 10, 20, 30, 40)),data=triceps)
summary(mod_spline3)
## 
## Call:
## lm(formula = triceps ~ bs(age, knots = c(5, 10, 20, 30, 40)), 
##     data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.5234  -1.6912  -0.2917   1.1356  23.0922 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                              6.9598     0.9729   7.154 1.77e-12 ***
## bs(age, knots = c(5, 10, 20, 30, 40))1   2.5367     1.7154   1.479   0.1396    
## bs(age, knots = c(5, 10, 20, 30, 40))2  -0.3032     0.9629  -0.315   0.7529    
## bs(age, knots = c(5, 10, 20, 30, 40))3  -1.9092     1.2993  -1.469   0.1421    
## bs(age, knots = c(5, 10, 20, 30, 40))4   7.4056     1.2179   6.081 1.78e-09 ***
## bs(age, knots = c(5, 10, 20, 30, 40))5   6.1050     1.4043   4.347 1.54e-05 ***
## bs(age, knots = c(5, 10, 20, 30, 40))6  10.1770     1.5427   6.597 7.23e-11 ***
## bs(age, knots = c(5, 10, 20, 30, 40))7   3.9428     1.9082   2.066   0.0391 *  
## bs(age, knots = c(5, 10, 20, 30, 40))8  10.1473     1.7545   5.784 1.01e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.743 on 883 degrees of freedom
## Multiple R-squared:  0.4261, Adjusted R-squared:  0.4209 
## F-statistic: 81.94 on 8 and 883 DF,  p-value: < 2.2e-16
mod_spline3ns = lm(triceps ~ ns(age, knots = c(5, 10, 20, 30, 40)),data=triceps)
summary(mod_spline3ns)
## 
## Call:
## lm(formula = triceps ~ ns(age, knots = c(5, 10, 20, 30, 40)), 
##     data = triceps)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4875  -1.6873  -0.3665   1.1146  22.8643 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                              8.3811     0.5219  16.059  < 2e-16 ***
## ns(age, knots = c(5, 10, 20, 30, 40))1  -3.5592     0.6712  -5.303 1.44e-07 ***
## ns(age, knots = c(5, 10, 20, 30, 40))2   5.7803     1.0379   5.569 3.39e-08 ***
## ns(age, knots = c(5, 10, 20, 30, 40))3   5.5118     0.9416   5.853 6.78e-09 ***
## ns(age, knots = c(5, 10, 20, 30, 40))4   6.9070     0.9050   7.632 5.99e-14 ***
## ns(age, knots = c(5, 10, 20, 30, 40))5   5.4136     1.3783   3.928 9.24e-05 ***
## ns(age, knots = c(5, 10, 20, 30, 40))6   6.6460     1.0829   6.137 1.27e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.759 on 885 degrees of freedom
## Multiple R-squared:  0.4199, Adjusted R-squared:  0.416 
## F-statistic: 106.8 on 6 and 885 DF,  p-value: < 2.2e-16
ggplot(triceps,aes(x=age, y=triceps)) +
                 geom_point(alpha=0.55, color="black") +
  stat_smooth(method = "lm", 
               formula = y~bs(x, knots = c(5, 10, 20, 30, 40)), 
               lty = 1, aes(col = "Cubic Spline"),se = F)+
    stat_smooth(method = "lm", 
               formula = y~ns(x, knots = c(5, 10, 20, 30, 40)), 
               lty = 1, aes(col = "Natural Cubic Spline"),se = F)+labs(color="Tipe Spline")+
  scale_color_manual(values = c("Natural Cubic Spline"="red","Cubic Spline"="blue"))+theme_bw()

## Komparasi model

membuat fungsi MSE

MSE = function(pred,actual){
  mean((pred-actual)^2)
}

Membandingkan model

data.frame(Linear=MSE(predict(mod_linear),triceps$triceps),
           Poly2=MSE(predict(mod_polinomial2),triceps$triceps),
           Poly3=MSE(predict(mod_polinomial3),triceps$triceps),
           Tangga=MSE(predict(mod_tangga),triceps$triceps),
           Spline=MSE(predict(mod_spline3),triceps$triceps),
           NSpline=MSE(predict(mod_spline3ns),triceps$triceps)
           )
##     Linear    Poly2    Poly3   Tangga   Spline  NSpline
## 1 16.01758 16.00636 14.89621 15.42602 13.87026 14.01904