Non-Linear Regression
Problem 1: Performing Cross Validation
Perform cross validation to obtain the optimal mpg vs horsepower model using the following three methods (a minimal sketch of the shared cross-validation setup follows the list):
* Polynomial Regression
* Piecewise Constant
* Natural Cubic Splines
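All three searches below use 5-fold cross validation through cv.glm() from the boot package. A minimal sketch of that shared setup (illustrative only; the actual loops over polynomial degree, interval count, and spline df follow in each subsection):
library(ISLR)
library(boot)
# Fit a glm and read off its 5-fold cross-validated MSE:
# delta[1] is the raw CV estimate, delta[2] the bias-corrected one.
set.seed(123)
fit <- glm(mpg ~ horsepower, data = Auto)
cv.glm(Auto, fit, K = 5)$delta[1]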
Polynomial Regression
Initial Data Plot
library(ISLR)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(splines)
library(boot)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
head(cbind(mpg, horsepower))
## mpg horsepower
## [1,] 18 130
## [2,] 15 165
## [3,] 18 150
## [4,] 16 150
## [5,] 17 140
## [6,] 15 198
ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",
formula = y~x,lty = 1,
col = "red",se = F)+
ggtitle("Horse Power VS mile per gallon") +
xlab("Horse Power")+
ylab("mile per gallon")+
theme_grey()
Polynomial Regression
#plot degree 1-9
d1 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,1,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 1")+
theme_classic()
d2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,2,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 2")+
theme_classic()
d3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,3,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 3")+
theme_classic()
d4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,4,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 4")+
theme_classic()
d5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,5,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 5")+
theme_classic()
d6 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,6,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 6")+
theme_classic()
d7 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,7,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 7")+
theme_classic()
d8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,8,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 8")+
theme_classic()
d9 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,9,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 9")+
theme_classic()
grid.arrange(d1,d2,d3,d4,d5,d6,d7,d8,d9, nrow=4,ncol=3,
top = textGrob("Plot Regresi Polinomial",gp=gpar(fontsize=20,font=2)))#CV k-fold
degree=1:10
cv.error1=rep(0,10)
cv.error2=rep(0,10)
set.seed(123)
for(d in degree){
glm.fit = glm(mpg~poly(horsepower, d), data=Auto)
cv.error1[d] = cv.glm(Auto,glm.fit,K=5)$delta[1]
cv.error2[d] = cv.glm(Auto,glm.fit,K=5)$delta[2]
}
cv.error1
## [1] 24.29291 19.43357 19.26214 19.18035 19.05298 18.89108 18.84472 19.21623
## [9] 18.77669 19.14110
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-10",col="blue")#plot regresi polinomial dengan model terbaik
d5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~poly(x,5,raw=T),
lty = 1, col = "red",se = F)+
ggtitle("Derajat 5")+
theme_classic()
grid.arrange(d5, top = textGrob("Regresi Polinomial Derajat 5",gp=gpar(fontsize=20,font=2)))
#Comparing the models
poly1 = lm(mpg ~ poly(horsepower,1,raw = T),data=Auto)
poly2 = lm(mpg ~ poly(horsepower,2,raw = T),data=Auto)
poly3 = lm(mpg ~ poly(horsepower,3,raw = T),data=Auto)
poly4 = lm(mpg ~ poly(horsepower,4,raw = T),data=Auto)
poly5 = lm(mpg ~ poly(horsepower,5,raw = T),data=Auto)
poly6 = lm(mpg ~ poly(horsepower,6,raw = T),data=Auto)
poly7 = lm(mpg ~ poly(horsepower,7,raw = T),data=Auto)
poly8 = lm(mpg ~ poly(horsepower,8,raw = T),data=Auto)
poly9 = lm(mpg ~ poly(horsepower,9,raw = T),data=Auto)
poly10 = lm(mpg ~ poly(horsepower,10,raw = T),data=Auto)
MSE = function(pred,actual){
mean((pred-actual)^2)
}
hasil<-data.frame(MSE1=MSE(predict(poly1),Auto$mpg),
MSE2=MSE(predict(poly2),Auto$mpg),
MSE3=MSE(predict(poly3),Auto$mpg),
MSE4=MSE(predict(poly4),Auto$mpg),
MSE5=MSE(predict(poly5),Auto$mpg),
MSE6=MSE(predict(poly6),Auto$mpg),
MSE7=MSE(predict(poly7),Auto$mpg),
MSE8=MSE(predict(poly8),Auto$mpg),
MSE9=MSE(predict(poly9),Auto$mpg),
MSE10=MSE(predict(poly10),Auto$mpg))
hasil
## MSE1 MSE2 MSE3 MSE4 MSE5 MSE6 MSE7 MSE8
## 1 23.94366 18.98477 18.94499 18.87633 18.42697 18.24065 18.07817 18.06613
## MSE9 MSE10
## 1 18.02697 18.00953
plot(c(1:10),unlist(hasil),type="b",col="blue",xlab="Derajat Polinomial", ylab="MSE", main="MSE Regresi Polinomial")
summary(poly5)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 5, raw = T), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4326 -2.5285 -0.2925 2.1750 15.9730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.223e+01 2.857e+01 -1.128 0.26003
## poly(horsepower, 5, raw = T)1 3.700e+00 1.303e+00 2.840 0.00475 **
## poly(horsepower, 5, raw = T)2 -7.142e-02 2.253e-02 -3.170 0.00164 **
## poly(horsepower, 5, raw = T)3 5.931e-04 1.850e-04 3.206 0.00146 **
## poly(horsepower, 5, raw = T)4 -2.281e-06 7.243e-07 -3.150 0.00176 **
## poly(horsepower, 5, raw = T)5 3.330e-09 1.085e-09 3.068 0.00231 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.326 on 386 degrees of freedom
## Multiple R-squared: 0.6967, Adjusted R-squared: 0.6928
## F-statistic: 177.4 on 5 and 386 DF, p-value: < 2.2e-16
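As a complement to cross validation, the nested polynomial fits defined above can be compared with sequential F-tests (a brief check using the lm objects from this section):
# Sequential F-tests between nested polynomial fits of degree 1 to 5
anova(poly1, poly2, poly3, poly4, poly5)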
Piecewise Constant
#plot intervals 2-10
d2 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,2,raw=T),
lty = 1,lwd=2, col = "red",se = F)+
ggtitle("Interval 2")+
theme_classic()
d3 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,3,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 3")+
theme_classic()
d4 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,4,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 4")+
theme_classic()
d5 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,5,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 5")+
theme_classic()
d6 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,6,raw=T),
lty = 1,lwd=2, col = "red",se = F)+
ggtitle("Interval 6")+
theme_classic()
d7 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,7,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 7")+
theme_classic()
d8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,8,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 8")+
theme_classic()
d9 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,9,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 9")+
theme_classic()
d10<- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,10,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 10")+
theme_classic()
grid.arrange(d2,d3,d4,d5,d6,d7,d8,d9,d10,nrow=3,ncol=3,
top = textGrob("Plot Regresi Fungsi Tangga",gp=gpar(fontsize=20,font=3)))#CV k fold
t=2:10
cv.error1=rep(0,9)
cv.error2=rep(0,9)
set.seed(123)
for(i in t){
Auto$tangga <- cut(Auto$horsepower,i)
glm.fit = glm(mpg~tangga, data=Auto)
cv.error1[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[1]
cv.error2[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[2]
}
cv.error1
## [1] 38.18664 33.52055 24.98593 22.46398 22.03468 21.29707 20.18401 20.87612
## [9] 21.21782
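Reading the best interval count off the printed errors can be automated (a small check, assuming t and cv.error1 from the loop above):
# Interval count with the lowest 5-fold CV error; with the output above this is 8
t[which.min(cv.error1)]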
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue") #Membandingkan model
cut2 = lm(mpg ~ cut(horsepower,2,raw = T),data=Auto)
cut3 = lm(mpg ~ cut(horsepower,3,raw = T),data=Auto)
cut4 = lm(mpg ~ cut(horsepower,4,raw = T),data=Auto)
cut5 = lm(mpg ~ cut(horsepower,5,raw = T),data=Auto)
cut6 = lm(mpg ~ cut(horsepower,6,raw = T),data=Auto)
cut7 = lm(mpg ~ cut(horsepower,7,raw = T),data=Auto)
cut8 = lm(mpg ~ cut(horsepower,8,raw = T),data=Auto)
cut9 = lm(mpg ~ cut(horsepower,9,raw = T),data=Auto)
cut10 = lm(mpg ~ cut(horsepower,10,raw = T),data=Auto)
#####
hasil<-data.frame(MSE2=MSE(predict(cut2),Auto$mpg),
MSE3=MSE(predict(cut3),Auto$mpg),
MSE4=MSE(predict(cut4),Auto$mpg),
MSE5=MSE(predict(cut5),Auto$mpg),
MSE6=MSE(predict(cut6),Auto$mpg),
MSE7=MSE(predict(cut7),Auto$mpg),
MSE8=MSE(predict(cut8),Auto$mpg),
MSE9=MSE(predict(cut9),Auto$mpg),
MSE10=MSE(predict(cut10),Auto$mpg))
hasil
## MSE2 MSE3 MSE4 MSE5 MSE6 MSE7 MSE8 MSE9
## 1 37.77106 33.26797 24.75519 21.98227 21.49764 20.69428 19.24845 20.21836
## MSE10
## 1 20.39147
plot(c(2:10),unlist(hasil),type="b",col="blue",xlab="Jumlah Interval", ylab="MSE", main="MSE Regresi Fungsi Tangga")
#plot the step-function regression fit for the selected model
d8 <- ggplot(Auto,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,8,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 8")+
theme_classic()
grid.arrange(d8, top = textGrob("Plot Piecewise Constant Regression",gp=gpar(fontsize=20,font=3)))summary(cut8)##
## Call:
## lm(formula = mpg ~ cut(horsepower, 8, raw = T), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9471 -2.6757 -0.1533 2.4015 14.5529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.9085 0.5771 58.76 <2e-16 ***
## cut(horsepower, 8, raw = T)(69,92] -6.9614 0.6910 -10.07 <2e-16 ***
## cut(horsepower, 8, raw = T)(92,115] -12.7551 0.7425 -17.18 <2e-16 ***
## cut(horsepower, 8, raw = T)(115,138] -15.6656 1.1264 -13.91 <2e-16 ***
## cut(horsepower, 8, raw = T)(138,161] -18.7799 0.8568 -21.92 <2e-16 ***
## cut(horsepower, 8, raw = T)(161,184] -19.9735 1.1470 -17.41 <2e-16 ***
## cut(horsepower, 8, raw = T)(184,207] -21.1228 1.7720 -11.92 <2e-16 ***
## cut(horsepower, 8, raw = T)(207,230] -21.0085 1.5159 -13.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.433 on 384 degrees of freedom
## Multiple R-squared: 0.6832, Adjusted R-squared: 0.6774
## F-statistic: 118.3 on 7 and 384 DF, p-value: < 2.2e-16
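The coefficient names in the summary above are the horsepower bins produced by cut(). A quick way to inspect the bin boundaries and how many observations fall in each (a sketch):
# The eight equal-width horsepower bins used by the piecewise-constant fit
table(cut(Auto$horsepower, 8))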
Natural Cubic Splines
hplimits=range(horsepower)
horsepower.grid = seq(from=hplimits[1],to=hplimits[2])
fit<-lm(mpg ~ ns(horsepower,knots = c(90,120,150,180)),data = Auto )
pred<-predict(fit,newdata=list(horsepower=horsepower.grid),se=T)
plot(Auto$horsepower,Auto$mpg,col="blue")
lines(horsepower.grid,pred$fit,lty=1,lwd=2,col="red")
abline(v = c(0,90,120,150,180),lty="dashed")
glm.fit = glm(mpg~horsepower, data=Auto)
t=2:25
cv.error1=rep(0,24)
cv.error2=rep(0,24)
set.seed(123)
for(i in t){
Auto$tangga <- ns(Auto$horsepower,df=i)
glm.fit = glm(mpg~tangga, data=Auto)
cv.error1[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[1]
cv.error2[i-1] = cv.glm(Auto,glm.fit,K=5)$delta[2]
}
cv.error1
## [1] 18.97653 19.43084 18.75576 18.71510 19.35841 18.50553 18.95521 19.20868
## [9] 18.53729 18.98381 19.12881 19.44986 19.78876 19.11963 19.50886 19.56679
## [17] 19.25412 19.87808 19.90940 19.19221 19.26636 19.43675 19.49779 18.51231
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Knot",main="Galat Validasi Silang Jumlah Knot 2-10",col="blue")#Membandingkan model
ns2 = lm(mpg ~ ns(horsepower,df=2),data=Auto)
ns3 = lm(mpg ~ ns(horsepower,3),data=Auto)
ns4 = lm(mpg ~ ns(horsepower,4),data=Auto)
ns5 = lm(mpg ~ ns(horsepower,5),data=Auto)
ns6 = lm(mpg ~ ns(horsepower,6),data=Auto)
ns7 = lm(mpg ~ ns(horsepower,7),data=Auto)
ns8 = lm(mpg ~ ns(horsepower,8),data=Auto)
ns9 = lm(mpg ~ ns(horsepower,9),data=Auto)
ns10 = lm(mpg ~ ns(horsepower,10),data=Auto)
ns11 = lm(mpg ~ ns(horsepower,11),data=Auto)
ns12 = lm(mpg ~ ns(horsepower,12),data=Auto)
ns13 = lm(mpg ~ ns(horsepower,13),data=Auto)
ns14 = lm(mpg ~ ns(horsepower,14),data=Auto)
ns15 = lm(mpg ~ ns(horsepower,15),data=Auto)
ns16 = lm(mpg ~ ns(horsepower,16),data=Auto)
ns17 = lm(mpg ~ ns(horsepower,17),data=Auto)
ns18 = lm(mpg ~ ns(horsepower,18),data=Auto)
ns19 = lm(mpg ~ ns(horsepower,19),data=Auto)
ns20 = lm(mpg ~ ns(horsepower,20),data=Auto)
ns21 = lm(mpg ~ ns(horsepower,21),data=Auto)
ns22 = lm(mpg ~ ns(horsepower,22),data=Auto)
ns23 = lm(mpg ~ ns(horsepower,23),data=Auto)
ns24 = lm(mpg ~ ns(horsepower,24),data=Auto)
ns25 = lm(mpg ~ ns(horsepower,25),data=Auto)
#####
hasil<-data.frame(MSE2=MSE(predict(ns2),Auto$mpg),
MSE3=MSE(predict(ns3),Auto$mpg),
MSE4=MSE(predict(ns4),Auto$mpg),
MSE5=MSE(predict(ns5),Auto$mpg),
MSE6=MSE(predict(ns6),Auto$mpg),
MSE7=MSE(predict(ns7),Auto$mpg),
MSE8=MSE(predict(ns8),Auto$mpg),
MSE9=MSE(predict(ns9),Auto$mpg),
MSE10=MSE(predict(ns10),Auto$mpg),
MSE11=MSE(predict(ns11),Auto$mpg),
MSE12=MSE(predict(ns12),Auto$mpg),
MSE13=MSE(predict(ns13),Auto$mpg),
MSE14=MSE(predict(ns14),Auto$mpg),
MSE15=MSE(predict(ns15),Auto$mpg),
MSE16=MSE(predict(ns16),Auto$mpg),
MSE17=MSE(predict(ns17),Auto$mpg),
MSE18=MSE(predict(ns18),Auto$mpg),
MSE19=MSE(predict(ns19),Auto$mpg),
MSE20=MSE(predict(ns20),Auto$mpg),
MSE21=MSE(predict(ns21),Auto$mpg),
MSE22=MSE(predict(ns22),Auto$mpg),
MSE23=MSE(predict(ns23),Auto$mpg),
MSE24=MSE(predict(ns24),Auto$mpg),
MSE25=MSE(predict(ns25),Auto$mpg))
hasil
## MSE2 MSE3 MSE4 MSE5 MSE6 MSE7 MSE8 MSE9
## 1 18.8135 18.83133 18.49131 18.29669 18.17391 17.88157 17.88026 17.74008
## MSE10 MSE11 MSE12 MSE13 MSE14 MSE15 MSE16 MSE17
## 1 17.47398 17.67903 17.47017 17.43161 17.3897 17.16821 17.06234 17.09691
## MSE18 MSE19 MSE20 MSE21 MSE22 MSE23 MSE24 MSE25
## 1 17.1029 17.08081 16.75519 16.77222 16.7328 17.04762 16.23641 15.96954
plot(c(2:25),unlist(hasil),type="b",col="blue",xlab="Jumlah knot", ylab="MSE", main="MSE Natural Cubic Splines")
summary(ns10)
##
## Call:
## lm(formula = mpg ~ ns(horsepower, 10), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5003 -2.4956 -0.2086 2.2577 14.6400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.462 1.680 19.324 < 2e-16 ***
## ns(horsepower, 10)1 -4.600 1.789 -2.572 0.010495 *
## ns(horsepower, 10)2 -3.007 2.555 -1.177 0.239996
## ns(horsepower, 10)3 -8.663 2.043 -4.239 2.82e-05 ***
## ns(horsepower, 10)4 -7.170 2.188 -3.278 0.001142 **
## ns(horsepower, 10)5 -14.722 2.125 -6.927 1.84e-11 ***
## ns(horsepower, 10)6 -8.446 2.467 -3.424 0.000684 ***
## ns(horsepower, 10)7 -16.947 2.080 -8.148 5.38e-15 ***
## ns(horsepower, 10)8 -21.715 1.891 -11.483 < 2e-16 ***
## ns(horsepower, 10)9 -14.384 4.116 -3.494 0.000531 ***
## ns(horsepower, 10)10 -22.326 1.946 -11.476 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.24 on 381 degrees of freedom
## Multiple R-squared: 0.7124, Adjusted R-squared: 0.7049
## F-statistic: 94.39 on 10 and 381 DF, p-value: < 2.2e-16
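When a natural spline is specified through its degrees of freedom, ns() places the interior knots automatically at quantiles of horsepower; they can be read off the basis object (a sketch):
# Interior knot locations implied by ns(horsepower, df = 10)
attr(ns(Auto$horsepower, df = 10), "knots")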
plot(horsepower, mpg, pch=19, col="blue",
xlab="Horse Power", ylab="Mile per Gallon",
main="Model Terbaik Natural Cubic Spline Regression Knot 10")
ix <- sort(horsepower,index.return=T)$ix
lines(horsepower[ix], predict(ns10)[ix],
main = "Natural Cubic Spline Regression Knot=10", col="red",
lty=1, lwd=2)
Conclusions:
1. Best polynomial regression model: degree 5
2. Best step-function (piecewise constant) model: 8 intervals
3. Best natural cubic spline model: df = 10
MSE.gab<-c(MSE(predict(poly5),Auto$mpg),
MSE(predict(cut8),Auto$mpg),
MSE(predict(ns10),Auto$mpg))
MSE.gab
## [1] 18.42697 19.24845 17.47398
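MSE.gab compares training MSEs, which tend to favour the more flexible fit; the same three finalists can also be compared on 5-fold CV error, using pre-computed basis columns as in the loops above (a sketch; the column names cut8.bin and ns10.basis are illustrative):
# 5-fold CV error for the three selected models
set.seed(123)
cv.poly5 <- cv.glm(Auto, glm(mpg ~ poly(horsepower, 5), data = Auto), K = 5)$delta[1]
Auto$cut8.bin <- cut(Auto$horsepower, 8)
cv.cut8 <- cv.glm(Auto, glm(mpg ~ cut8.bin, data = Auto), K = 5)$delta[1]
Auto$ns10.basis <- ns(Auto$horsepower, df = 10)
cv.ns10 <- cv.glm(Auto, glm(mpg ~ ns10.basis, data = Auto), K = 5)$delta[1]
c(poly5 = cv.poly5, cut8 = cv.cut8, ns10 = cv.ns10)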
plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 5","fs tangga interval 8","spline knot 10"))Soal 2 : Setiap Negara
United States
fil1=Auto$origin==1
data.AS=Auto[fil1,]
glm.fit = glm(mpg~horsepower, data=data.AS)
degree=1:10
cv.error1=rep(0,10)
set.seed(123)
for(d in degree){
glm.fit = glm(mpg~poly(horsepower, d), data=data.AS)
cv.error1[d] = cv.glm(data.AS,glm.fit,K=5)$delta[1]
}
cv.error1
## [1] 18.28028 14.77245 15.07391 15.25180 14.82463 16.31074 14.60477 22.45411
## [9] 15.03355 14.90820
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-10",col="blue")glm.fit = glm(mpg~horsepower, data=data.AS)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
data.AS$tangga <- cut(data.AS$horsepower,i)
glm.fit = glm(mpg~tangga, data=data.AS)
cv.error1[i-1] = cv.glm(data.AS,glm.fit,K=5)$delta[1]
}
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue")glm.fit = glm(mpg~horsepower, data=data.AS)
t=2:10
cv.error1=rep(0,9)
set.seed(1)
for(i in t){
data.AS$tangga <- ns(data.AS$horsepower,df=i)
glm.fit = glm(mpg~tangga, data=data.AS)
cv.error1[i-1] = cv.glm(data.AS,glm.fit,K=5)$delta[1]
}
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Knot",main="Galat Validasi Silang Jumlah Knot 2-10",col="blue") Kesimpulan :
1. Best polynomial regression model: degree 7
2. Best step-function (piecewise constant) model: 9 intervals
3. Best natural cubic spline model: df = 7
poly7 = lm(mpg ~ poly(horsepower,7,raw = T),data=data.AS)
cut9 = lm(mpg ~ cut(horsepower,9,raw = T),data=data.AS)
ns7 = lm(mpg ~ ns(horsepower,7),data=data.AS)
MSE = function(pred,actual){
mean((pred-actual)^2)
}
MSE.gab<-c(MSE(predict(poly3),data.AS$mpg),
MSE(predict(cut9),data.AS$mpg),
MSE(predict(ns7),data.AS$mpg))
plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 7","fs tangga interval 9","spline knot 7"))#plot regresi tangga (amerika) sebagai model terbaik
d9 <- ggplot(data.AS,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,9,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 9")+
theme_classic()
grid.arrange(d9, top = textGrob("Model Terbaik : Amerika",gp=gpar(fontsize=20,font=3)))Eropa
fil1=Auto$origin==2
data.Eropa=Auto[fil1,]
glm.fit = glm(mpg~horsepower, data=data.Eropa)
degree=1:5
cv.error1=rep(0,5)
set.seed(123)
for(d in degree){
glm.fit = glm(mpg~poly(horsepower, d), data=data.Eropa)
cv.error1[d] = cv.glm(data.Eropa,glm.fit,K=5)$delta[1]
}
cv.error1
## [1] 26.82990 24.98542 28.02877 37.53260 28.86751
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-5",col="blue")glm.fit = glm(mpg~horsepower, data=data.Eropa)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
data.Eropa$tangga <- cut(data.Eropa$horsepower,i)
glm.fit = glm(mpg~tangga, data=data.Eropa)
cv.error1[i-1] = cv.glm(data.Eropa,glm.fit,K=5)$delta[1]
}
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue")glm.fit = glm(mpg~horsepower, data=data.Eropa)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
data.Eropa$tangga <- ns(data.Eropa$horsepower,df=i)
glm.fit = glm(mpg~tangga, data=data.Eropa)
cv.error1[i-1] = cv.glm(data.Eropa,glm.fit,K=5)$delta[1]
}
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Knot",main="Galat Validasi Silang Jumlah Knot 2-10",col="blue") Kesimpulan :
1. Best polynomial regression model: degree 2
2. Best step-function (piecewise constant) model: 8 intervals
3. Best natural cubic spline model: df = 3
poly2 = lm(mpg ~ poly(horsepower,2,raw = T),data=data.Eropa)
cut8 = lm(mpg ~ cut(horsepower,8,raw = T),data=data.Eropa)
ns3 = lm(mpg ~ ns(horsepower,3),data=data.Eropa)
MSE = function(pred,actual){
mean((pred-actual)^2)
}
MSE.gab<-c(MSE(predict(poly2),data.Eropa$mpg),
MSE(predict(cut8),data.Eropa$mpg),
MSE(predict(ns3),data.Eropa$mpg))
MSE.gab
## [1] 22.94428 22.73396 22.87518
plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 2","fs tangga interval 8","spline knot 3"))#plot regresi tangga (eropa) sebagai model terbaik
d8 <- ggplot(data.Eropa,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,8,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 8")+
theme_classic()
grid.arrange(d8, top = textGrob("Model Terbaik : Eropa",gp=gpar(fontsize=20,font=3)))Jepang
fil1=Auto$origin==3
data.Jepang=Auto[fil1,]
glm.fit = glm(mpg~horsepower, data=data.Jepang)
degree=1:5
cv.error1=rep(0,5)
set.seed(123)
for(d in degree){
glm.fit = glm(mpg~poly(horsepower, d), data=data.Jepang)
cv.error1[d] = cv.glm(data.Jepang,glm.fit,K=5)$delta[1]
}
cv.error1
## [1] 21.82064 23.23935 16.57910 20.55315 17.83392
plot(degree,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Derajat Polinomial",main="Galat Validasi Silang Derajat 1-5",col="blue")glm.fit = glm(mpg~horsepower, data=data.Jepang)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
data.Jepang$tangga <- cut(data.Jepang$horsepower,i)
glm.fit = glm(mpg~tangga, data=data.Jepang)
cv.error1[i-1] = cv.glm(data.Jepang,glm.fit,K=5)$delta[1]
}
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Interval",main="Galat Validasi Silang Jumlah Interval 2-10",col="blue")glm.fit = glm(mpg~horsepower, data=data.Jepang)
t=2:10
cv.error1=rep(0,9)
set.seed(123)
for(i in t){
data.Jepang$tangga <- ns(data.Jepang$horsepower,df=i)
glm.fit = glm(mpg~tangga, data=data.Jepang)
cv.error1[i-1] = cv.glm(data.Jepang,glm.fit,K=5)$delta[1]
}
plot(t,cv.error1,type="b",ylab="Galat Validasi Silang",
xlab="Jumlah Knot",main="Galat Validasi Silang Jumlah Knot 2-10",col="blue") Kesimpulan :
1. Best polynomial regression model: degree 3
2. Best step-function (piecewise constant) model: 8 intervals
3. Best natural cubic spline model: df = 9
poly3 = lm(mpg ~ poly(horsepower,3,raw = T),data=data.Jepang)
cut8 = lm(mpg ~ cut(horsepower,8,raw = T),data=data.Jepang)
ns9 = lm(mpg ~ ns(horsepower,9),data=data.Jepang)
MSE = function(pred,actual){
mean((pred-actual)^2)
}
MSE.gab<-c(MSE(predict(poly3),data.Jepang$mpg),
MSE(predict(cut8),data.Jepang$mpg),
MSE(predict(ns9),data.Jepang$mpg))
plot(MSE.gab,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 3","fs tangga interval 8","spline knot 9"))x <- MSE(predict(cut8),data.Jepang$mpg)
plot(x,xaxt = "n",type="b",col="blue",xlab="Metode Regresi", ylab="MSE", main="MSE Regresi Non Linear")
axis(1, at=1:3, labels=c("polinomial derajat 3","fs tangga interval 8","spline knot 9"))#plot regresi tangga (amerika) sebagai model terbaik
d8 <- ggplot(data.Jepang,aes(x=horsepower, y=mpg)) +
geom_point(alpha=0.55, color="blue") +
stat_smooth(method = "lm",formula = y~cut(x,8,raw=T),
lty=1,lwd=2, col = "red",se = F)+
ggtitle("Interval 8")+
theme_classic()
grid.arrange(d8, top = textGrob("Model Terbaik : Jepang",gp=gpar(fontsize=20,font=3)))Gabungkan Grafik Setiap Negara
#predictions from the best model for each country
data.AS$prediksi.cut9 = predict( glm(data.AS$mpg ~ cut(data.AS$horsepower,9)), data.frame(data.AS$horsepower), interval="predict" )
data.Eropa$prediksi.cut8 = predict( glm(data.Eropa$mpg ~ cut(data.Eropa$horsepower,8)), data.frame(data.Eropa$horsepower), interval="predict" )
data.Jepang$prediksi.cut8 = predict( glm(data.Jepang$mpg ~ cut(data.Jepang$horsepower,8)), data.frame(data.Jepang$horsepower), interval="predict" )
#data frames holding the best-model predictions
Amerika <-data.frame(data.AS$prediksi.cut9,data.AS$horsepower)
Eropa <-data.frame(data.Eropa$prediksi.cut8,data.Eropa$horsepower)
Jepang <-data.frame(data.Jepang$prediksi.cut8,data.Jepang$horsepower)ggplot(Auto, aes(x=horsepower, y=mpg,col=as.factor(origin))) +
geom_point()+
labs(
x = "Horsepower",
y = "Mile per Gallon",
color = "Negara Produsen",
title = "Plot Regresi Masing-Masing Negara") +
theme_minimal()+
geom_line(color='red',data = Amerika, aes(x=data.AS$horsepower, y=data.AS$prediksi.cut9))+
geom_line(color='green',data = Eropa, aes(x=data.Eropa$horsepower, y=data.Eropa$prediksi.cut8))+
geom_line(color='blue',data = Jepang, aes(x=data.Jepang$horsepower, y=data.Jepang$prediksi.cut8))