Non-Linear Regression


Problem 1: Performing Cross-Validation

Perform cross-validation to select the optimal model of mpg vs. horsepower using the following three methods (a generic sketch of the cross-validation workflow is given right after this list):
* Polynomial Regression
* Piecewise Constant
* Natural Cubic Splines
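
The same cross-validation recipe is reused for all three model families: fit a candidate model with glm(), hand it to boot::cv.glm() with K = 5, and read the estimated test MSE from delta. The chunk below is only a minimal sketch of that pattern for a single candidate (a degree-2 polynomial, chosen purely as an illustration), not part of the analysis itself.

library(ISLR)   # Auto data
library(boot)   # cv.glm()

set.seed(123)
fit <- glm(mpg ~ poly(horsepower, 2), data = Auto)
# delta[1] is the raw 5-fold CV estimate of test MSE, delta[2] the bias-corrected one
cv.glm(Auto, fit, K = 5)$delta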

Polynomial Regression

Initial Data Plot

library(ISLR)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(splines)
library(boot)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
attach(Auto)
## The following object is masked from package:ggplot2:
## 
##     mpg
head(cbind(mpg, horsepower))
##      mpg horsepower
## [1,]  18        130
## [2,]  15        165
## [3,]  18        150
## [4,]  16        150
## [5,]  17        140
## [6,]  15        198
ggplot(Auto,aes(x=horsepower, y=mpg)) +
  geom_point(alpha=0.55, color="blue") + 
  stat_smooth(method = "lm", 
               formula = y~x,lty = 1,
               col = "red",se = F)+
  ggtitle("Horse Power VS mile per gallon") +
  xlab("Horse Power")+
  ylab("mile per gallon")+
  theme_grey() 

Polynomial Regression

#plot of polynomial fits, degree 1-9
poly.plots <- lapply(1:9, function(d) {
  ggplot(Auto, aes(x = horsepower, y = mpg)) +
    geom_point(alpha = 0.55, color = "blue") +
    stat_smooth(method = "lm",
                formula = as.formula(paste0("y ~ poly(x, ", d, ", raw = TRUE)")),
                lty = 1, col = "red", se = FALSE) +
    ggtitle(paste("Derajat", d)) +
    theme_classic()
})

grid.arrange(grobs = poly.plots, nrow = 4, ncol = 3,
             top = textGrob("Plot Regresi Polinomial",gp=gpar(fontsize=20,font=2)))

#CV k-fold for degrees 1-10 (delta[1] = raw CV estimate, delta[2] = bias-corrected;
#note that each cv.glm() call in the loop draws its own 5-fold split)
degree=1:10
cv.error1=rep(0,10)
cv.error2=rep(0,10)
set.seed(123)
for(d in degree){
  glm.fit = glm(mpg~poly(horsepower, d), data=Auto)
  cv.error1[d] = cv.glm(Auto,glm.fit,K=5)$delta[1]
  cv.error2[d] = cv.glm(Auto,glm.fit,K=5)$delta[2]
}

cv.error1
##  [1] 24.29291 19.43357 19.26214 19.18035 19.05298 18.89108 18.84472 19.21623
##  [9] 18.77669 19.14110
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-10",col="blue")

#plot of the polynomial regression with the best model (degree 5)
d5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm",formula = y~poly(x,5,raw=T), 
              lty = 1, col = "red",se = F)+
  ggtitle("Derajat 5")+
  theme_classic()
grid.arrange(d5, top = textGrob("Regresi Polinomial Derajat 5",gp=gpar(fontsize=20,font=2)))

#Comparing the models (training MSE for degrees 1-10)
poly1 = lm(mpg ~ poly(horsepower,1,raw = T),data=Auto)
poly2 = lm(mpg ~ poly(horsepower,2,raw = T),data=Auto)
poly3 = lm(mpg ~ poly(horsepower,3,raw = T),data=Auto)
poly4 = lm(mpg ~ poly(horsepower,4,raw = T),data=Auto)
poly5 = lm(mpg ~ poly(horsepower,5,raw = T),data=Auto)
poly6 = lm(mpg ~ poly(horsepower,6,raw = T),data=Auto)
poly7 = lm(mpg ~ poly(horsepower,7,raw = T),data=Auto)
poly8 = lm(mpg ~ poly(horsepower,8,raw = T),data=Auto)
poly9 = lm(mpg ~ poly(horsepower,9,raw = T),data=Auto)
poly10 = lm(mpg ~ poly(horsepower,10,raw = T),data=Auto)

MSE = function(pred,actual){
  mean((pred-actual)^2)
}

hasil<-data.frame(MSE1=MSE(predict(poly1),Auto$mpg),
           MSE2=MSE(predict(poly2),Auto$mpg),
           MSE3=MSE(predict(poly3),Auto$mpg),
           MSE4=MSE(predict(poly4),Auto$mpg),
           MSE5=MSE(predict(poly5),Auto$mpg),
           MSE6=MSE(predict(poly6),Auto$mpg),
           MSE7=MSE(predict(poly7),Auto$mpg),
           MSE8=MSE(predict(poly8),Auto$mpg),
           MSE9=MSE(predict(poly9),Auto$mpg),
           MSE10=MSE(predict(poly10),Auto$mpg))
hasil 
##       MSE1     MSE2     MSE3     MSE4     MSE5     MSE6     MSE7     MSE8
## 1 23.94366 18.98477 18.94499 18.87633 18.42697 18.24065 18.07817 18.06613
##       MSE9    MSE10
## 1 18.02697 18.00953
plot(c(1:10),hasil,type="b",col="blue",xlab="Derajat Polinomial", ylab="MSE", main="MSE Regresi Polinomial")

summary(poly5)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 5, raw = T), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.4326  -2.5285  -0.2925   2.1750  15.9730 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                   -3.223e+01  2.857e+01  -1.128  0.26003   
## poly(horsepower, 5, raw = T)1  3.700e+00  1.303e+00   2.840  0.00475 **
## poly(horsepower, 5, raw = T)2 -7.142e-02  2.253e-02  -3.170  0.00164 **
## poly(horsepower, 5, raw = T)3  5.931e-04  1.850e-04   3.206  0.00146 **
## poly(horsepower, 5, raw = T)4 -2.281e-06  7.243e-07  -3.150  0.00176 **
## poly(horsepower, 5, raw = T)5  3.330e-09  1.085e-09   3.068  0.00231 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.326 on 386 degrees of freedom
## Multiple R-squared:  0.6967, Adjusted R-squared:  0.6928 
## F-statistic: 177.4 on 5 and 386 DF,  p-value: < 2.2e-16

Piecewise Constant

#plot of step-function fits, 2-10 intervals
#(cut() has no 'raw' argument, so the spurious raw = T is dropped in these formulas)
cut.plots <- lapply(2:10, function(k) {
  ggplot(Auto, aes(x = horsepower, y = mpg)) +
    geom_point(alpha = 0.55, color = "blue") +
    stat_smooth(method = "lm",
                formula = as.formula(paste0("y ~ cut(x, ", k, ")")),
                lty = 1, lwd = 2, col = "red", se = FALSE) +
    ggtitle(paste("Interval", k)) +
    theme_classic()
})

grid.arrange(grobs = cut.plots, nrow = 3, ncol = 3,
             top = textGrob("Plot Regresi Fungsi Tangga",gp=gpar(fontsize=20,font=3)))

#CV k fold
t=2:10
cv.error1=rep(0,9)
cv.error2=rep(0,9)
set.seed(123)
for(i in t){
  Auto$tangga <- cut(Auto$horsepower,i)
  glm.fit = glm(mpg~tangga, data=Auto)
  cv.error1[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[1]
  cv.error2[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[2]
}
cv.error1
## [1] 38.18664 33.52055 24.98593 22.46398 22.03468 21.29707 20.18401 20.87612
## [9] 21.21782
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue") 

#Comparing the models (training MSE for 2-10 intervals); note that cut() simply ignores the raw = T argument
cut2 = lm(mpg ~ cut(horsepower,2,raw = T),data=Auto)
cut3 = lm(mpg ~ cut(horsepower,3,raw = T),data=Auto)
cut4 = lm(mpg ~ cut(horsepower,4,raw = T),data=Auto)
cut5 = lm(mpg ~ cut(horsepower,5,raw = T),data=Auto)
cut6 = lm(mpg ~ cut(horsepower,6,raw = T),data=Auto)
cut7 = lm(mpg ~ cut(horsepower,7,raw = T),data=Auto)
cut8 = lm(mpg ~ cut(horsepower,8,raw = T),data=Auto)
cut9 = lm(mpg ~ cut(horsepower,9,raw = T),data=Auto)
cut10 = lm(mpg ~ cut(horsepower,10,raw = T),data=Auto)

#####


hasil<-data.frame(MSE2=MSE(predict(cut2),Auto$mpg),
           MSE3=MSE(predict(cut3),Auto$mpg),
           MSE4=MSE(predict(cut4),Auto$mpg),
           MSE5=MSE(predict(cut5),Auto$mpg),
           MSE6=MSE(predict(cut6),Auto$mpg),
           MSE7=MSE(predict(cut7),Auto$mpg),
           MSE8=MSE(predict(cut8),Auto$mpg),
           MSE9=MSE(predict(cut9),Auto$mpg),
           MSE10=MSE(predict(cut10),Auto$mpg))
hasil
##       MSE2     MSE3     MSE4     MSE5     MSE6     MSE7     MSE8     MSE9
## 1 37.77106 33.26797 24.75519 21.98227 21.49764 20.69428 19.24845 20.21836
##      MSE10
## 1 20.39147
plot(c(2:10),hasil,type="b",col="blue",xlab="Jumlah Interval", ylab="MSE", main="MSE Regresi Fungsi Tangga")

#plot of the step-function regression with the best model (8 intervals)
d8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm",formula = y~cut(x,8), 
              lty=1,lwd=2, col = "red",se = F)+
  ggtitle("Interval 8")+
  theme_classic()
grid.arrange(d8, top = textGrob("Plot Piecewise Constant Regression",gp=gpar(fontsize=20,font=3)))

summary(cut8)
## 
## Call:
## lm(formula = mpg ~ cut(horsepower, 8, raw = T), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9471  -2.6757  -0.1533   2.4015  14.5529 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           33.9085     0.5771   58.76   <2e-16 ***
## cut(horsepower, 8, raw = T)(69,92]    -6.9614     0.6910  -10.07   <2e-16 ***
## cut(horsepower, 8, raw = T)(92,115]  -12.7551     0.7425  -17.18   <2e-16 ***
## cut(horsepower, 8, raw = T)(115,138] -15.6656     1.1264  -13.91   <2e-16 ***
## cut(horsepower, 8, raw = T)(138,161] -18.7799     0.8568  -21.92   <2e-16 ***
## cut(horsepower, 8, raw = T)(161,184] -19.9735     1.1470  -17.41   <2e-16 ***
## cut(horsepower, 8, raw = T)(184,207] -21.1228     1.7720  -11.92   <2e-16 ***
## cut(horsepower, 8, raw = T)(207,230] -21.0085     1.5159  -13.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.433 on 384 degrees of freedom
## Multiple R-squared:  0.6832, Adjusted R-squared:  0.6774 
## F-statistic: 118.3 on 7 and 384 DF,  p-value: < 2.2e-16

Natural Cubic Splines

hplimits=range(horsepower)
horsepower.grid = seq(from=hplimits[1],to=hplimits[2])

fit<-lm(mpg ~ ns(horsepower,knots = c(90,120,150,180)),data = Auto )
pred<-predict(fit,newdata=list(horsepower=horsepower.grid),se=T)

plot(Auto$horsepower,Auto$mpg,col="blue")
lines(horsepower.grid,pred$fit,lty=1,lwd=2,col="red")
abline(v = c(90,120,150,180),lty="dashed")   # knot locations
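
A note on the cross-validation loop that follows: it varies the df argument of ns() rather than explicit knot locations. When ns() is given df, it places df - 1 interior knots at quantiles of the predictor, so the "number of knots" in the labels below is really the degrees of freedom. A quick way to inspect the interior knots implied by a given df (a minimal check, assuming Auto is attached as above):

attr(ns(horsepower, df = 4), "knots")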

glm.fit = glm(mpg~horsepower, data=Auto)
t=2:25
cv.error1=rep(0,24)
cv.error2=rep(0,24)
set.seed(123)
for(i in t){
  Auto$tangga <- ns(Auto$horsepower,df=i)   # natural-spline basis with i degrees of freedom
  glm.fit = glm(mpg~tangga, data=Auto)
  cv.error1[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[1]
  cv.error2[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[2]
}
cv.error1
##  [1] 18.97653 19.43084 18.75576 18.71510 19.35841 18.50553 18.95521 19.20868
##  [9] 18.53729 18.98381 19.12881 19.44986 19.78876 19.11963 19.50886 19.56679
## [17] 19.25412 19.87808 19.90940 19.19221 19.26636 19.43675 19.49779 18.51231
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang", 
     xlab="Jumlah Knot (df)",main="Galat Validasi Silang df 2-25",col="blue")

#Comparing the models (training MSE for df = 2-25)
ns2 = lm(mpg ~ ns(horsepower,df=2),data=Auto)
ns3 = lm(mpg ~ ns(horsepower,3),data=Auto)
ns4 = lm(mpg ~ ns(horsepower,4),data=Auto)
ns5 = lm(mpg ~ ns(horsepower,5),data=Auto)
ns6 = lm(mpg ~ ns(horsepower,6),data=Auto)
ns7 = lm(mpg ~ ns(horsepower,7),data=Auto)
ns8 = lm(mpg ~ ns(horsepower,8),data=Auto)
ns9 = lm(mpg ~ ns(horsepower,9),data=Auto)
ns10 = lm(mpg ~ ns(horsepower,10),data=Auto)
ns11 = lm(mpg ~ ns(horsepower,11),data=Auto)
ns12 = lm(mpg ~ ns(horsepower,12),data=Auto)
ns13 = lm(mpg ~ ns(horsepower,13),data=Auto)
ns14 = lm(mpg ~ ns(horsepower,14),data=Auto)
ns15 = lm(mpg ~ ns(horsepower,15),data=Auto)
ns16 = lm(mpg ~ ns(horsepower,16),data=Auto)
ns17 = lm(mpg ~ ns(horsepower,17),data=Auto)
ns18 = lm(mpg ~ ns(horsepower,18),data=Auto)
ns19 = lm(mpg ~ ns(horsepower,19),data=Auto)
ns20 = lm(mpg ~ ns(horsepower,20),data=Auto)
ns21 = lm(mpg ~ ns(horsepower,21),data=Auto)
ns22 = lm(mpg ~ ns(horsepower,22),data=Auto)
ns23 = lm(mpg ~ ns(horsepower,23),data=Auto)
ns24 = lm(mpg ~ ns(horsepower,24),data=Auto)
ns25 = lm(mpg ~ ns(horsepower,25),data=Auto)

#####


hasil<-data.frame(MSE2=MSE(predict(ns2),Auto$mpg),
           MSE3=MSE(predict(ns3),Auto$mpg),
           MSE4=MSE(predict(ns4),Auto$mpg),
           MSE5=MSE(predict(ns5),Auto$mpg),
           MSE6=MSE(predict(ns6),Auto$mpg),
           MSE7=MSE(predict(ns7),Auto$mpg),
           MSE8=MSE(predict(ns8),Auto$mpg),
           MSE9=MSE(predict(ns9),Auto$mpg),
           MSE10=MSE(predict(ns10),Auto$mpg),
           MSE11=MSE(predict(ns11),Auto$mpg),
           MSE12=MSE(predict(ns12),Auto$mpg),
           MSE13=MSE(predict(ns13),Auto$mpg),
           MSE14=MSE(predict(ns14),Auto$mpg),
           MSE15=MSE(predict(ns15),Auto$mpg),
           MSE16=MSE(predict(ns16),Auto$mpg),
           MSE17=MSE(predict(ns17),Auto$mpg),
           MSE18=MSE(predict(ns18),Auto$mpg),
           MSE19=MSE(predict(ns19),Auto$mpg),
           MSE20=MSE(predict(ns20),Auto$mpg),
           MSE21=MSE(predict(ns21),Auto$mpg),
           MSE22=MSE(predict(ns22),Auto$mpg),
           MSE23=MSE(predict(ns23),Auto$mpg),
           MSE24=MSE(predict(ns24),Auto$mpg),
           MSE25=MSE(predict(ns25),Auto$mpg))
hasil
##      MSE2     MSE3     MSE4     MSE5     MSE6     MSE7     MSE8     MSE9
## 1 18.8135 18.83133 18.49131 18.29669 18.17391 17.88157 17.88026 17.74008
##      MSE10    MSE11    MSE12    MSE13   MSE14    MSE15    MSE16    MSE17
## 1 17.47398 17.67903 17.47017 17.43161 17.3897 17.16821 17.06234 17.09691
##     MSE18    MSE19    MSE20    MSE21   MSE22    MSE23    MSE24    MSE25
## 1 17.1029 17.08081 16.75519 16.77222 16.7328 17.04762 16.23641 15.96954
plot(c(2:25),hasil,type="b",col="blue",xlab="Jumlah knot", ylab="MSE", main="MSE Natural Cubic Splines")

summary(ns10)
## 
## Call:
## lm(formula = mpg ~ ns(horsepower, 10), data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5003  -2.4956  -0.2086   2.2577  14.6400 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            32.462      1.680  19.324  < 2e-16 ***
## ns(horsepower, 10)1    -4.600      1.789  -2.572 0.010495 *  
## ns(horsepower, 10)2    -3.007      2.555  -1.177 0.239996    
## ns(horsepower, 10)3    -8.663      2.043  -4.239 2.82e-05 ***
## ns(horsepower, 10)4    -7.170      2.188  -3.278 0.001142 ** 
## ns(horsepower, 10)5   -14.722      2.125  -6.927 1.84e-11 ***
## ns(horsepower, 10)6    -8.446      2.467  -3.424 0.000684 ***
## ns(horsepower, 10)7   -16.947      2.080  -8.148 5.38e-15 ***
## ns(horsepower, 10)8   -21.715      1.891 -11.483  < 2e-16 ***
## ns(horsepower, 10)9   -14.384      4.116  -3.494 0.000531 ***
## ns(horsepower, 10)10  -22.326      1.946 -11.476  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared:  0.7124, Adjusted R-squared:  0.7049 
## F-statistic: 94.39 on 10 and 381 DF,  p-value: < 2.2e-16
plot(horsepower, mpg, pch=19, col="blue",
     xlab="Horse Power", ylab="Mile per Gallon",
     main="Model Terbaik Natural Cubic Spline Regression Knot 10")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], predict(ns10)[ix], col="red", lty=1, lwd=2)

Conclusions:
1. Best polynomial regression model: degree 5
2. Best step-function (piecewise constant) model: 8 intervals
3. Best natural cubic spline model: df 10

MSE.gab<-c(MSE(predict(poly5),Auto$mpg),
            MSE(predict(cut8),Auto$mpg),
            MSE(predict(ns10),Auto$mpg))
MSE.gab
## [1] 18.42697 19.24845 17.47398
plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 5","fs tangga interval 8","spline knot 10"))
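
The comparison above is based on in-sample (training) MSE, which tends to favor the more flexible fits. As a cross-check, the three selected models can also be compared on their 5-fold cross-validation error, reusing cv.glm() exactly as in the loops above. This is only a sketch: the cut8 column name and the seed are arbitrary choices, and the bins are fixed on the full data just as in the step-function CV loop above.

set.seed(123)
Auto$cut8 <- cut(Auto$horsepower, 8)   # bins fixed on the full data, as in the CV loop above
cv.poly5 <- cv.glm(Auto, glm(mpg ~ poly(horsepower, 5), data = Auto), K = 5)$delta[1]
cv.cut8  <- cv.glm(Auto, glm(mpg ~ cut8, data = Auto), K = 5)$delta[1]
cv.ns10  <- cv.glm(Auto, glm(mpg ~ ns(horsepower, 10), data = Auto), K = 5)$delta[1]
c(poly5 = cv.poly5, cut8 = cv.cut8, ns10 = cv.ns10)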

Problem 2: Each Country of Origin

United States

fil1=Auto$origin==1
data.AS=Auto[fil1,]

glm.fit = glm(mpg~horsepower, data=data.AS)
degree=1:10
cv.error1=rep(0,10)
set.seed(123)
for(d in degree){
  glm.fit = glm(mpg~poly(horsepower, d), data=data.AS)
  cv.error1[d] = cv.glm(data.AS,glm.fit,K=5)$delta[1]
}

cv.error1
##  [1] 18.28028 14.77245 15.07391 15.25180 14.82463 16.31074 14.60477 22.45411
##  [9] 15.03355 14.90820
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-10",col="blue")

glm.fit = glm(mpg~horsepower, data=data.AS)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
  data.AS$tangga <- cut(data.AS$horsepower,i)
  glm.fit = glm(mpg~tangga, data=data.AS)
  cv.error1[i-1] = cv.glm(data.AS,glm.fit,K=5)$delta[1]
}

plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue")

glm.fit = glm(mpg~horsepower, data=data.AS)
t=2:10
cv.error1=rep(0,9)
set.seed(1)
for(i in t){
  data.AS$tangga <- ns(data.AS$horsepower,df=i)
  glm.fit = glm(mpg~tangga, data=data.AS)
  cv.error1[i-1] = cv.glm(data.AS,glm.fit,K=5)$delta[1]
}

plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Jumlah Knot",main="Galat Validasi Silang Jumlah Knot 2-10",col="blue")

Conclusions:
1. Best polynomial regression model: degree 7
2. Best step-function model: 9 intervals
3. Best natural cubic spline model: df 7

poly7 = lm(mpg ~ poly(horsepower,7,raw = T),data=data.AS)
cut9 = lm(mpg ~ cut(horsepower,9,raw = T),data=data.AS)
ns7 = lm(mpg ~ ns(horsepower,7),data=data.AS)

MSE = function(pred,actual){
  mean((pred-actual)^2)
}
MSE.gab<-c(MSE(predict(poly7),data.AS$mpg),
            MSE(predict(cut9),data.AS$mpg),
            MSE(predict(ns7),data.AS$mpg))

plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 7","fs tangga interval 9","spline knot 7"))

#plot of the step-function regression (United States) as the best model
d9 <- ggplot(data.AS,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm",formula = y~cut(x,9), 
              lty=1,lwd=2, col = "red",se = F)+
  ggtitle("Interval 9")+
  theme_classic()
grid.arrange(d9, top = textGrob("Model Terbaik : Amerika",gp=gpar(fontsize=20,font=3)))

Europe

fil1=Auto$origin==2
data.Eropa=Auto[fil1,]

glm.fit = glm(mpg~horsepower, data=data.Eropa)
degree=1:5
cv.error1=rep(0,5)
set.seed(123)
for(d in degree){
  glm.fit = glm(mpg~poly(horsepower, d), data=data.Eropa)
  cv.error1[d] = cv.glm(data.Eropa,glm.fit,K=5)$delta[1]
}

cv.error1
## [1] 26.82990 24.98542 28.02877 37.53260 28.86751
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-5",col="blue")

glm.fit = glm(mpg~horsepower, data=data.Eropa)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
  data.Eropa$tangga <- cut(data.Eropa$horsepower,i)
  glm.fit = glm(mpg~tangga, data=data.Eropa)
  cv.error1[i-1] = cv.glm(data.Eropa,glm.fit,K=5)$delta[1]
}

plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue")

glm.fit = glm(mpg~horsepower, data=data.Eropa)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
  data.Eropa$tangga <- ns(data.Eropa$horsepower,df=i)
  glm.fit = glm(mpg~tangga, data=data.Eropa)
  cv.error1[i-1] = cv.glm(data.Eropa,glm.fit,K=5)$delta[1]
}

plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Jumlah Knot",main="Galat Validasi Silang Jumlah Knot 2-10",col="blue")

Conclusions:
1. Best polynomial regression model: degree 2
2. Best step-function model: 8 intervals
3. Best natural cubic spline model: df 3

poly2 = lm(mpg ~ poly(horsepower,2,raw = T),data=data.Eropa)
cut8 = lm(mpg ~ cut(horsepower,8,raw = T),data=data.Eropa)
ns3 = lm(mpg ~ ns(horsepower,3),data=data.Eropa)

MSE = function(pred,actual){
  mean((pred-actual)^2)
}
MSE.gab<-c(MSE(predict(poly2),data.Eropa$mpg),
            MSE(predict(cut8),data.Eropa$mpg),
            MSE(predict(ns3),data.Eropa$mpg))

MSE.gab
## [1] 22.94428 22.73396 22.87518
plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 2","fs tangga interval 8","spline knot 3"))

#plot of the step-function regression (Europe) as the best model
d8 <- ggplot(data.Eropa,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm",formula = y~cut(x,8), 
              lty=1,lwd=2, col = "red",se = F)+
  ggtitle("Interval 8")+
  theme_classic()
grid.arrange(d8, top = textGrob("Model Terbaik : Eropa",gp=gpar(fontsize=20,font=3)))

Japan

fil1=Auto$origin==3
data.Jepang=Auto[fil1,]

glm.fit = glm(mpg~horsepower, data=data.Jepang)
degree=1:5
cv.error1=rep(0,5)
set.seed(123)
for(d in degree){
  glm.fit = glm(mpg~poly(horsepower, d), data=data.Jepang)
  cv.error1[d] = cv.glm(data.Jepang,glm.fit,K=5)$delta[1]
}

cv.error1
## [1] 21.82064 23.23935 16.57910 20.55315 17.83392
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-5",col="blue")

glm.fit = glm(mpg~horsepower, data=data.Jepang)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
  data.Jepang$tangga <- cut(data.Jepang$horsepower,i)
  glm.fit = glm(mpg~tangga, data=data.Jepang)
  cv.error1[i-1] = cv.glm(data.Jepang,glm.fit,K=5)$delta[1]
}

plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue")

glm.fit = glm(mpg~horsepower, data=data.Jepang)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
  data.Jepang$tangga <- ns(data.Jepang$horsepower,df=i)
  glm.fit = glm(mpg~tangga, data=data.Jepang)
  cv.error1[i-1] = cv.glm(data.Jepang,glm.fit,K=5)$delta[1]
}

plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
     xlab="Jumlah Knot",main="Galat Validasi Silang Jumlah Knot 2-10",col="blue")

Conclusions:
1. Best polynomial regression model: degree 3
2. Best step-function model: 8 intervals
3. Best natural cubic spline model: df 9

poly3 = lm(mpg ~ poly(horsepower,3,raw = T),data=data.Jepang)
cut8 = lm(mpg ~ cut(horsepower,8,raw = T),data=data.Jepang)
ns9 = lm(mpg ~ ns(horsepower,9),data=data.Jepang)

MSE = function(pred,actual){
  mean((pred-actual)^2)
}
MSE.gab<-c(MSE(predict(poly3),data.Jepang$mpg),
            MSE(predict(cut8),data.Jepang$mpg),
            MSE(predict(ns9),data.Jepang$mpg))

plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 3","fs tangga interval 8","spline knot 9"))

#MSE of the best model for Japan (step function, 8 intervals)
MSE(predict(cut8),data.Jepang$mpg)

#plot of the step-function regression (Japan) as the best model
d8 <- ggplot(data.Jepang,aes(x=horsepower, y=mpg)) + 
  geom_point(alpha=0.55, color="blue") +
  stat_smooth(method = "lm",formula = y~cut(x,8), 
              lty=1,lwd=2, col = "red",se = F)+
  ggtitle("Interval 8")+
  theme_classic()
grid.arrange(d8, top = textGrob("Model Terbaik : Jepang",gp=gpar(fontsize=20,font=3)))

Combining the Plots for Each Country

#fitted values from the best model for each country
data.AS$prediksi.cut9 = predict( glm(mpg ~ cut(horsepower,9), data=data.AS) )
data.Eropa$prediksi.cut8 = predict( glm(mpg ~ cut(horsepower,8), data=data.Eropa) )
data.Jepang$prediksi.cut8 = predict( glm(mpg ~ cut(horsepower,8), data=data.Jepang) )

#data frames holding the best-model fits for each country
Amerika <-data.frame(data.AS$prediksi.cut9,data.AS$horsepower)
Eropa <-data.frame(data.Eropa$prediksi.cut8,data.Eropa$horsepower)
Jepang <-data.frame(data.Jepang$prediksi.cut8,data.Jepang$horsepower)
ggplot(Auto, aes(x=horsepower, y=mpg,col=as.factor(origin))) +  
  geom_point()+
  labs(
    x = "Horsepower",
    y = "Mile per Gallon",
    color = "Negara Produsen",
    title = "Plot Regresi Masing-Masing Negara") +
  theme_minimal()+
  geom_line(color='red',data = Amerika, aes(x=data.AS$horsepower, y=data.AS$prediksi.cut9))+
  geom_line(color='green',data = Eropa, aes(x=data.Eropa$horsepower, y=data.Eropa$prediksi.cut8))+
  geom_line(color='blue',data = Jepang, aes(x=data.Jepang$horsepower, y=data.Jepang$prediksi.cut8))
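
As a possible simplification (a sketch, not the approach used above): ggplot2 can fit and draw a per-country step function directly with geom_smooth(), grouping by origin through the colour aesthetic. The formula below assumes 8 intervals for every country, whereas the models above use 9 intervals for the United States, so it is only an approximation of the combined plot.

ggplot(Auto, aes(x = horsepower, y = mpg, color = as.factor(origin))) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ cut(x, 8), se = FALSE) +
  labs(x = "Horsepower", y = "Mile per Gallon", color = "Country of Origin",
       title = "Step-Function Fit per Country") +
  theme_minimal()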