knitr::opts_chunk$set(echo = TRUE)
library(dLagM) #bisa otomatis timeseries datanya
library(dynlm) #data harus timeseries
library(MLmetrics) #MAPE
library(lmtest)
library(car)
library(readxl)
library(caTools)
library(ggplot2)
sumber data: “https://www.kaggle.com/code/sandhyakrishnan02/simple-linear-regression-salary-vs-experience/data”
data1 <- read.csv("TugasIndividu_Salary_Data.csv")
str(data1)
## 'data.frame': 30 obs. of 2 variables:
## $ YearsExperience: num 1.1 1.3 1.5 2 2.2 2.9 3 3.2 3.2 3.7 ...
## $ Salary : num 39343 46205 37731 43525 39891 ...
summary(data1$Salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37731 56721 65237 76003 100545 122391
data1$Salary
## [1] 39343 46205 37731 43525 39891 56642 60150 54445 64445 57189
## [11] 63218 55794 56957 57081 61111 67938 66029 83088 81363 93940
## [21] 91738 98273 101302 113812 109431 105582 116969 112635 122391 121872
set.seed(123)
split = sample.split(data1$Salary, SplitRatio = 2/3)
training_set = subset(data1, split == TRUE)
test_set = subset(data1, split == FALSE)
regressor = lm(formula = Salary ~ YearsExperience,
data = training_set)
summary(regressor)
##
## Call:
## lm(formula = Salary ~ YearsExperience, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7325.1 -3814.4 427.7 3559.7 8884.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25592 2646 9.672 1.49e-08 ***
## YearsExperience 9365 421 22.245 1.52e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5391 on 18 degrees of freedom
## Multiple R-squared: 0.9649, Adjusted R-squared: 0.963
## F-statistic: 494.8 on 1 and 18 DF, p-value: 1.524e-14
Berdasarkan hasil di atas, “***” menyatakan bahwa terdapat hubungan linear positif yang kuat pada kedua peubah yang digunakan. Bukti lainnya melalui p-value (1.52e-14) < alpha(0.05) menyatakan bahwa terdapat hubungan yang siginifikan antara variabel independen terhadap variabel dependen.
y_predict = predict(regressor, newdata = test_set)
y_predict
## 2 4 5 8 11 16 20 21
## 37766.77 44322.33 46195.35 55560.43 62115.99 71481.07 81782.66 89274.72
## 24 26
## 102385.84 109877.90
model.koyck = dLagM::koyckDlm(x = training_set$YearsExperience, y = training_set$Salary)
summary(model.koyck)
##
## Call:
## "Y ~ (Intercept) + Y.1 + X.t"
##
## Residuals:
## Min 1Q Median 3Q Max
## -6706 -4177 -2312 3207 10132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.592e+04 6.754e+03 2.357 0.0315 *
## Y.1 4.446e-01 2.882e-01 1.543 0.1425
## X.t 5.223e+03 2.763e+03 1.890 0.0770 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5855 on 16 degrees of freedom
## Multiple R-Squared: 0.9589, Adjusted R-squared: 0.9537
## Wald test: 182.4 on 2 and 16 DF, p-value: 9.733e-12
##
## Diagnostic tests:
## NULL
##
## alpha beta phi
## Geometric coefficients: 28659.23 5223.063 0.4446081
AIC(model.koyck)
## [1] 388.306
BIC(model.koyck)
## [1] 392.0838
(fore.koyck <- forecast(model = model.koyck, x=training_set$YearsExperience, h=10))
## $forecasts
## [1] 75847.75 57474.23 56617.49 56758.89 57866.37 60970.29 63917.24 65227.48
## [9] 66332.33 68912.78
##
## $call
## forecast.koyckDlm(model = model.koyck, x = training_set$YearsExperience,
## h = 10)
##
## attr(,"class")
## [1] "forecast.koyckDlm" "dLagM"
mape.koyck <- MAPE(fore.koyck$forecasts, test_set$Salary)
mape_train <- dLagM::GoF(model.koyck)["MAPE"]
c("MAPE_testing" = mape.koyck, "MAPE_training" = mape_train)
## $MAPE_testing
## [1] 0.2984112
##
## $MAPE_training.MAPE
## [1] 0.06879367
model.dlm = dLagM::dlm(x = training_set$YearsExperience, y = training_set$Salary , q = 2)
summary(model.dlm)
##
## Call:
## lm(formula = model.formula, data = design)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7302.2 -2838.9 -305.3 2454.8 11725.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22173 3474 6.383 1.7e-05 ***
## x.t 13304 3230 4.118 0.00104 **
## x.1 1200 4745 0.253 0.80404
## x.2 -5389 3411 -1.580 0.13648
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5281 on 14 degrees of freedom
## Multiple R-squared: 0.9659, Adjusted R-squared: 0.9586
## F-statistic: 132.1 on 3 and 14 DF, p-value: 1.667e-10
##
## AIC and BIC values for the model:
## AIC BIC
## 1 365.1454 369.5973
AIC(model.dlm)
## [1] 365.1454
BIC(model.dlm)
## [1] 369.5973
(fore.dlm <- forecast(model = model.dlm, x = test_set$YearsExperience, h=10)) #meramalkan 10 periode ke depan
## $forecasts
## [1] -3437.032 -6239.950 46836.855 56609.250 66044.495 74800.191
## [7] 86862.895 93437.614 107096.231 115108.636
##
## $call
## forecast.dlm(model = model.dlm, x = test_set$YearsExperience,
## h = 10)
##
## attr(,"class")
## [1] "forecast.dlm" "dLagM"
mape.dlm <- MAPE(fore.dlm$forecasts, test_set$Salary)
mape_train <- GoF(model.dlm)["MAPE"]
c("MAPE_testing" = mape.dlm, "MAPE_training" = mape_train)
## $MAPE_testing
## [1] 0.2820441
##
## $MAPE_training.MAPE
## [1] 0.05029993
finiteDLMauto(formula = Salary ~ YearsExperience,
data = data.frame(training_set), q.min = 1, q.max = 8,
model.type = "dlm", error.type = "AIC", trace = TRUE) ##q max lag maksimum (h-2)
## q - k MASE AIC BIC GMRAE MBRAE R.Adj.Sq Ljung-Box
## 8 8 0.30262 245.1115 250.4454 0.35615 0.73902 0.92683 0.05479065
## 7 7 0.34171 262.3364 267.9859 0.53493 -0.59068 0.96234 0.08297241
## 6 6 0.36979 282.1952 287.9467 0.40582 0.75023 0.96811 0.24902126
## 5 5 0.38991 299.1424 304.8068 0.57787 0.99742 0.97495 0.44428710
## 4 4 0.51182 323.7723 329.1804 0.56456 -0.19951 0.96453 0.94671760
## 3 3 0.56825 341.3768 346.3761 0.73932 14.45538 0.96791 0.73518966
## 2 2 0.62295 365.1454 369.5973 0.75649 2.18333 0.95856 0.64008157
## 1 1 0.57794 385.0460 388.8238 0.57643 0.68726 0.96101 0.17946708
model.dlm2 = dLagM::dlm(x = training_set$YearsExperience,y = training_set$Salary , q = 8) #terdapat lag yang tidak signifikan sehingga dapat dikurangi jumlah lagnya
summary(model.dlm2)
##
## Call:
## lm(formula = model.formula, data = design)
##
## Residuals:
## 1 2 3 4 5 6 7 8 9 10
## 755.2 557.3 -5836.0 4387.2 -1835.1 2721.1 1782.9 -2005.6 -1534.4 -344.3
## 11 12
## 2662.5 -1310.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25567 18561 1.377 0.302
## x.t 4598 12777 0.360 0.753
## x.1 8084 16525 0.489 0.673
## x.2 -6226 16395 -0.380 0.741
## x.3 13213 29878 0.442 0.702
## x.4 -24841 32961 -0.754 0.530
## x.5 14319 26653 0.537 0.645
## x.6 -6806 23799 -0.286 0.802
## x.7 4144 9926 0.418 0.717
## x.8 5130 9990 0.513 0.659
##
## Residual standard error: 6459 on 2 degrees of freedom
## Multiple R-squared: 0.9867, Adjusted R-squared: 0.9268
## F-statistic: 16.48 on 9 and 2 DF, p-value: 0.05849
##
## AIC and BIC values for the model:
## AIC BIC
## 1 245.1115 250.4454
AIC(model.dlm2)
## [1] 245.1115
BIC(model.dlm2)
## [1] 250.4454
(fore.dlm2 <- forecast(model = model.dlm2, x=test_set$YearsExperience, h=10)) #meramalkan 10 periode ke depan dengan lag optimum
## $forecasts
## [1] 73653.56 23497.61 76015.63 -31895.16 218570.10 78174.27 172255.58
## [8] 124005.03 94552.44 101897.98
##
## $call
## forecast.dlm(model = model.dlm2, x = test_set$YearsExperience,
## h = 10)
##
## attr(,"class")
## [1] "forecast.dlm" "dLagM"
mape.dlm2 <- MAPE(fore.dlm2$forecasts, test_set$Salary)
mape_train <- GoF(model.dlm2)["MAPE"]
c("MAPE_testing" = mape.dlm2, "MAPE_training" = mape_train)
## $MAPE_testing
## [1] 0.7543198
##
## $MAPE_training.MAPE
## [1] 0.02486609
Apabila peubah dependen dipengaruhi oleh peubah independen pada waktu sekarang, serta dipengaruhii juga oleh peubah dependen itu sendiri pada satu waktu yang lalu maka model tersebut disebut autoregressive (Gujarati, 2004)
model.ardl = ardlDlm(x = training_set$YearsExperience, y = training_set$Salary, p = 1 , q = 1) #p:lag x, q:lag y
#model untuk p=1, q=1: y = b0 + b1y-1 + b2x + b3x -1
#model untuk p=2, q=3:y = b0 + b1y-1 + b2y-2 + b3x + b4x-1 + b5x-2
summary(model.ardl)
##
## Time series regression with "ts" data:
## Start = 2, End = 20
##
## Call:
## dynlm(formula = as.formula(model.text), data = data, start = 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6052 -2894 -1309 1282 11720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14547.0283 6315.4285 2.303 0.03599 *
## X.t 12821.3567 3180.1200 4.032 0.00109 **
## X.1 -6818.5784 3590.9373 -1.899 0.07699 .
## Y.1 0.3581 0.2290 1.563 0.13884
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5146 on 15 degrees of freedom
## Multiple R-squared: 0.9702, Adjusted R-squared: 0.9642
## F-statistic: 162.8 on 3 and 15 DF, p-value: 1.153e-11
AIC(model.ardl)
## [1] 384.1783
BIC(model.ardl)
## [1] 388.9005
(fore.ardl <- forecast(model = model.ardl, x=test_set$YearsExperience, h=10))
## $forecasts
## [1] 3256.692 32491.667 40750.685 55165.515 62483.200 73151.691
## [7] 84256.520 90989.317 105895.070 111943.237
##
## $call
## forecast.ardlDlm(model = model.ardl, x = test_set$YearsExperience,
## h = 10)
##
## attr(,"class")
## [1] "forecast.ardlDlm" "dLagM"
mape.ardl <- MAPE(fore.ardl$forecasts, test_set$Salary)
mape_train <- GoF(model.ardl)["MAPE"]
c("MAPE_testing" = mape.ardl, "MAPE_training" = mape_train)
## $MAPE_testing
## [1] 0.1547214
##
## $MAPE_training.MAPE
## [1] 0.05372245
note: yang digunakan harusnya data train, tetapi karena keterbatasan data jika menggunakan data train menyebabkan error sehingga dicontohkan menggunakan keseluruhan data seperti kode di bawah ini
ardlBoundOrders(data = data.frame(data1) , formula = Salary ~ YearsExperience ) #
## $p
## YearsExperience
## 1 11
##
## $q
## [1] 1
##
## $Stat.table
## q = 1 q = 2 q = 3 q = 4 q = 5 q = 6 q = 7 q = 8
## p = 1 566.3544 547.5660 525.6416 505.7084 477.3989 456.8015 439.0627 413.9533
## p = 2 548.9209 549.3876 527.6412 507.5017 479.3767 458.4930 435.0941 411.4029
## p = 3 528.8994 528.8994 526.6374 507.1503 477.3448 459.0383 436.2863 413.0752
## p = 4 502.9566 504.6683 504.6683 506.0194 477.6071 458.6948 434.3159 406.3868
## p = 5 489.7947 491.6794 489.6931 489.6931 479.4583 460.6076 431.3379 407.1324
## p = 6 461.3920 460.2315 460.4720 461.0696 461.0696 459.4034 425.8025 407.2366
## p = 7 446.0449 447.8654 448.8347 449.3867 438.4742 438.4742 427.2948 396.2064
## p = 8 423.2051 422.6934 424.6790 396.3301 391.9981 390.4516 390.4516 375.0602
## p = 9 394.2130 392.0228 370.1293 367.9575 337.9889 NA NA NA
## p = 10 377.1738 379.1689 356.7796 NA NA NA NA NA
## p = 11 288.3299 NA NA NA NA NA NA NA
## q = 9 q = 10 q = 11 q = 12 q = 13 q = 14 q = 15
## p = 1 390.7190 375.0049 344.2562 NA NA NA NA
## p = 2 360.1913 344.6047 317.2517 NA NA NA NA
## p = 3 362.0708 346.5691 NA NA NA NA NA
## p = 4 354.6035 338.2928 NA NA NA NA NA
## p = 5 356.6024 NA NA NA NA NA NA
## p = 6 305.3155 NA NA NA NA NA NA
## p = 7 NA NA NA NA NA NA NA
## p = 8 NA NA NA NA NA NA NA
## p = 9 NA NA NA NA NA NA NA
## p = 10 NA NA NA NA NA NA NA
## p = 11 NA NA NA NA NA NA NA
##
## $min.Stat
## [1] 288.3299
note: yang digunakan harusnya data train, tetapi karena keterbatasan data jika menggunakan data train menyebabkan error sehingga dicontohkan menggunakan keseluruhan data
###PEMODELAN DLM dan ARDL dengan library dynlm
#sama dengan model dlm p=1
cons_lm1 <- dynlm(Salary ~ YearsExperience+L(YearsExperience),data = training_set)
#sama dengan model ardl p=0 q=1
cons_lm2 <- dynlm(Salary ~ YearsExperience+L(Salary),data = training_set)
#sama dengan ardl p=1 q=1
cons_lm3 <- dynlm(Salary ~ YearsExperience+L(YearsExperience)+L(Salary),data = training_set)
#sama dengan dlm p=2
cons_lm4 <- dynlm(Salary ~ YearsExperience+L(YearsExperience)+L(YearsExperience,2),data = training_set)
summary(cons_lm1)
##
## Time series regression with "numeric" data:
## Start = 1, End = 20
##
## Call:
## dynlm(formula = Salary ~ YearsExperience + L(YearsExperience),
## data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7325.1 -3814.4 427.7 3559.7 8884.6
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25592 2646 9.672 1.49e-08 ***
## YearsExperience 9365 421 22.245 1.52e-14 ***
## L(YearsExperience) NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5391 on 18 degrees of freedom
## Multiple R-squared: 0.9649, Adjusted R-squared: 0.963
## F-statistic: 494.8 on 1 and 18 DF, p-value: 1.524e-14
summary(cons_lm2)
##
## Time series regression with "numeric" data:
## Start = 1, End = 20
##
## Call:
## dynlm(formula = Salary ~ YearsExperience + L(Salary), data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.033e-12 4.460e-14 1.382e-13 5.163e-13 1.606e-12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.955e-12 1.693e-12 4.108e+00 0.000734 ***
## YearsExperience 3.409e-12 5.776e-13 5.903e+00 1.74e-05 ***
## L(Salary) 1.000e+00 6.059e-17 1.651e+16 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.386e-12 on 17 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.881e+33 on 2 and 17 DF, p-value: < 2.2e-16
summary(cons_lm3)
##
## Time series regression with "numeric" data:
## Start = 1, End = 20
##
## Call:
## dynlm(formula = Salary ~ YearsExperience + L(YearsExperience) +
## L(Salary), data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.033e-12 4.460e-14 1.382e-13 5.163e-13 1.606e-12
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.955e-12 1.693e-12 4.108e+00 0.000734 ***
## YearsExperience 3.409e-12 5.776e-13 5.903e+00 1.74e-05 ***
## L(YearsExperience) NA NA NA NA
## L(Salary) 1.000e+00 6.059e-17 1.651e+16 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.386e-12 on 17 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.881e+33 on 2 and 17 DF, p-value: < 2.2e-16
summary(cons_lm4)
##
## Time series regression with "numeric" data:
## Start = 1, End = 20
##
## Call:
## dynlm(formula = Salary ~ YearsExperience + L(YearsExperience) +
## L(YearsExperience, 2), data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7325.1 -3814.4 427.7 3559.7 8884.6
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25592 2646 9.672 1.49e-08 ***
## YearsExperience 9365 421 22.245 1.52e-14 ***
## L(YearsExperience) NA NA NA NA
## L(YearsExperience, 2) NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5391 on 18 degrees of freedom
## Multiple R-squared: 0.9649, Adjusted R-squared: 0.963
## F-statistic: 494.8 on 1 and 18 DF, p-value: 1.524e-14
#SSE
deviance(cons_lm1)
## [1] 523049701
deviance(cons_lm2)
## [1] 3.263809e-23
deviance(cons_lm3)
## [1] 3.263809e-23
deviance(cons_lm4)
## [1] 523049701
#Diagnostik
#durbin watson
dwtest(cons_lm1)
##
## Durbin-Watson test
##
## data: cons_lm1
## DW = 1.1823, p-value = 1.878e-11
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(cons_lm2)
##
## Durbin-Watson test
##
## data: cons_lm2
## DW = 2.372, p-value = 0.6845
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(cons_lm3)
##
## Durbin-Watson test
##
## data: cons_lm3
## DW = 2.372, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(cons_lm4)
##
## Durbin-Watson test
##
## data: cons_lm4
## DW = 1.1823, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# bptest
bptest(cons_lm1)
##
## studentized Breusch-Pagan test
##
## data: cons_lm1
## BP = 3.5005, df = 1, p-value = 0.06135
bptest(cons_lm2)
##
## studentized Breusch-Pagan test
##
## data: cons_lm2
## BP = 2.2382, df = 2, p-value = 0.3266
bptest(cons_lm3)
##
## studentized Breusch-Pagan test
##
## data: cons_lm3
## BP = 2.2382, df = 2, p-value = 0.3266
bptest(cons_lm4)
##
## studentized Breusch-Pagan test
##
## data: cons_lm4
## BP = 3.5005, df = 1, p-value = 0.06135
#shapiro wilk
shapiro.test(residuals(cons_lm1))
##
## Shapiro-Wilk normality test
##
## data: residuals(cons_lm1)
## W = 0.94087, p-value = 0.249
shapiro.test(residuals(cons_lm2))
##
## Shapiro-Wilk normality test
##
## data: residuals(cons_lm2)
## W = 0.63996, p-value = 7.865e-06
shapiro.test(residuals(cons_lm3))
##
## Shapiro-Wilk normality test
##
## data: residuals(cons_lm3)
## W = 0.63996, p-value = 7.865e-06
shapiro.test(residuals(cons_lm4))
##
## Shapiro-Wilk normality test
##
## data: residuals(cons_lm4)
## W = 0.94087, p-value = 0.249
#Perbandingan
akurasi <- matrix(c(mape.koyck, mape.dlm, mape.dlm2, mape.ardl))
row.names(akurasi)<- c("Koyck","DLM 1","DLM Optimum","Autoregressive")
colnames(akurasi) <- c("MAPE")
akurasi
## MAPE
## Koyck 0.2984112
## DLM 1 0.2820441
## DLM Optimum 0.7543198
## Autoregressive 0.1547214
#PLOT
par(mfrow=c(1,1))
plot(test_set$YearsExperience, test_set$Salary, type="b", col="black", ylim=c(35000,130000))
points(test_set$YearsExperience, fore.koyck$forecasts,col="red")
lines(test_set$YearsExperience, fore.koyck$forecasts,col="red")
points(test_set$YearsExperience, fore.dlm$forecasts,col="blue")
lines(test_set$YearsExperience, fore.dlm$forecasts,col="blue")
points(test_set$YearsExperience, fore.ardl$forecasts,col="green")
lines(test_set$YearsExperience, fore.ardl$forecasts,col="green")
legend("topleft",c("aktual", "koyck","DLM 1", "autoregressive"), lty=1, col=c("black","red","blue","green"), cex=0.8)
###note: model DLM2 dihapus dari plot di bawah ini