library(tidyverse) # data manipulation
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom) # clustering algorithms
## Warning: package 'broom' was built under R version 4.4.3
library(datarium) # clustering algorithms & visualization
## Warning: package 'datarium' was built under R version 4.4.3
library(ggplot2)
data("marketing")
head(marketing, 3)
## youtube facebook newspaper sales
## 1 276.12 45.36 83.04 26.52
## 2 53.40 47.16 54.12 12.48
## 3 20.64 55.08 83.16 11.16
tail(marketing, 3)
## youtube facebook newspaper sales
## 198 212.40 11.16 7.68 15.36
## 199 340.32 50.40 79.44 30.60
## 200 278.52 10.32 10.44 16.08
model <- lm(sales ~ youtube, data = marketing)
model
##
## Call:
## lm(formula = sales ~ youtube, data = marketing)
##
## Coefficients:
## (Intercept) youtube
## 8.43911 0.04754
Maka didapat sebuah model persamaan linier sebagai berikut Sales = 8.439 + 0.047Youtube
#Plot Residual
model.diag.metrics <- augment(model)
ggplot(model.diag.metrics, aes(youtube, sales)) +
geom_point() +
stat_smooth(method = lm, se = FALSE) +
geom_segment(aes(xend = youtube, yend = .fitted), color = "blue", linewidth = 0.3)+
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
Garis-garis vertikal merah yang menunjukkan selisih antara nilai prediksi dan nilai aktual (residual) semakin panjang seiring berjalannya waktu, yang mengindikasikan adanya heteroskedastisitas, yaitu variansi residual yang tidak konstan. Hal ini merupakan pelanggaran terhadap salah satu asumsi dasar dalam regresi linier dan menunjukkan bahwa model cenderung kurang akurat dalam memprediksi nilai penjualan pada periode dengan volume yang lebih tinggi.
plot(model, 1)
Dari grafik ini, terlihat bahwa seiring meningkatnya nilai prediksi
(fitted values), variasi atau sebaran residual juga semakin besar. Hal
ini mengindikasikan adanya masalah heteroskedastisitas, yaitu
pelanggaran terhadap asumsi dasar regresi yang menyatakan bahwa residual
harus memiliki varian yang konstan (homoskedastisitas)
plot(model, 3)
Terlihat bahwa variasi (varian) dari titik-titik residual meningkat seiring dengan nilai variabel hasil yang diprediksi, yang menunjukkan adanya varian residual yang tidak konstan (atau terdapat heteroskedastisitas).
#Without Packages
error2 <- residuals(model)^2
model.error <- lm(error2 ~ marketing$youtube)
chi_stat <- dim(marketing)[1]*summary(model.error)$r.square
p_value <- 1 - pchisq(chi_stat, df = 1)
cat("BP =",chi_stat, "; p-value =", p_value)
## BP = 48.03797 ; p-value = 4.180434e-12
#With Packages in R
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 48.038, df = 1, p-value = 4.18e-12
Berdasarkan hasil uji kedua code diatas menunjukkan bahwa p-value sangat kecil (jauh di bawah 0.05), sehingga kita dapat menolak Hâ‚€ (hipotesis nol) yang artinya terdapat bukti kuat bahwa model mengalami heteroskedastisitas, yaitu varian residual berubah-ubah tergantung pada nilai prediktor.
log.model <- lm(log(sales) ~ youtube, data = marketing)
log.model
##
## Call:
## lm(formula = log(sales) ~ youtube, data = marketing)
##
## Coefficients:
## (Intercept) youtube
## 2.189514 0.003156
model persamaan linier sebagai berikut Sales = 2.189 + 0.003Youtube
bptest(log.model)
##
## studentized Breusch-Pagan test
##
## data: log.model
## BP = 1.1959, df = 1, p-value = 0.2742
dapat dilihat di p-value telah lebih dari 0.05 artinya model tidak menunjukkan gejala heteroskedastisitas atau model bersifat homoskedastisitas
plot(log.model, 2)
Dapat dilihat semua titik jatuh kira-kira sepanjang garis, sehingga kita dapat mengasumsikan bahwa data yang telah di transformasi berdistribusi normal.
#Without Packages
error <- residuals(log.model)
e_cdf <- ecdf(error)
t_cdf <- pnorm(error, mean = 0, sd = sd(error))
D <- max(abs(e_cdf(error) - t_cdf))
print(D)
## [1] 0.0658037
#With Packages from R
ks.test(error, "pnorm", 0, sd(error))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: error
## D = 0.070804, p-value = 0.2686
## alternative hypothesis: two-sided
Karena nilai p-value lebih dari alfa 0.05 maka dapat disimpulkan data berdistribusi normal
ks.test(error, "pnorm", 0, sd(residuals(model)))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: error
## D = 0.45656, p-value < 2.2e-16
## alternative hypothesis: two-sided
Dapat dilihat Karena nilai p-value kurang dari alfa 0.05 maka dapat disimpulkan data tidak berdistribusi normal
#Without Function
n <- dim(marketing)[1]
r <- cor(error[-1], error[-n])
DW <- 2*(1-r)
DW
## [1] 1.819105
dU <- 1.778
if(DW > dU && DW < 4-dU){
cat("No Autocorrelation, DW = ", DW)
} else {
cat("Autocorrelation exists, DW = ", DW)
}
## No Autocorrelation, DW = 1.819105
#With Function From R
dwtest(log.model)
##
## Durbin-Watson test
##
## data: log.model
## DW = 1.81, p-value = 0.08847
## alternative hypothesis: true autocorrelation is greater than 0
Karena nilia p-value lebih dari 0.05 maka dapat disimpulkan tidak terdapat autocorrelation
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 1.9347, p-value = 0.3213
## alternative hypothesis: true autocorrelation is greater than 0
Karena nilia p-value lebih dari 0.05 maka dapat disimpulkan tidak terdapat pula autocorrelation pada model asli
model.mlr <- lm(sales ~ youtube + facebook + newspaper, data = marketing)
model.mlr
##
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
##
## Coefficients:
## (Intercept) youtube facebook newspaper
## 3.526667 0.045765 0.188530 -0.001037
Model persamaan linier sales = 3.527 + 0.046youtube + 0.188facebook − 0.001newspaper
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
marketing$fit.r <- model.mlr$residuals
marketing$fit.p <- model.mlr$fitted.values
marketing %>%
melt(measure.vars = c("youtube", "facebook", "newspaper", "fit.p")) %>%
ggplot(aes(value, fit.r, group = variable)) +
geom_point(shape = 1) +
geom_smooth(method = loess) +
geom_hline(yintercept = 0) +
facet_wrap(~ variable, scales = "free")
## `geom_smooth()` using formula = 'y ~ x'
Pada plot-plot diatas terlihat bahwa Asumsi linearitas belum sepenuhnya
terpenuhi maka dapat dilakukan uji test lain untuk memastikan linieritas
nya
marketing$fit.r <- model.mlr$residuals
marketing$fit.p <- model.mlr$fitted.values
marketing$sqrt.fit.r <- sqrt(abs(marketing$fit.r))
marketing_long <- marketing %>%
melt(measure.vars = c("youtube", "facebook", "newspaper", "fit.p"))
ggplot(marketing_long, aes(value, sqrt.fit.r, group = variable)) +
geom_point(shape = 1) +
geom_smooth(method = "loess") +
geom_hline(yintercept = 0, linetype = "dashed") +
facet_wrap(~ variable, scales = "free") +
labs(x = "Fitted values / Original Variable", y = "Square Root of |Residuals|") +
theme_minimal() +
ggtitle("Scale-Location Plot")
## `geom_smooth()` using formula = 'y ~ x'
#Without Packages
error2 <- residuals(model.mlr)^2
model.error <- lm(error2 ~ youtube + facebook + newspaper, data = marketing)
chi_stat <- dim(marketing)[1]*summary(model.error)$r.square
p_value <- 1 - pchisq(chi_stat, df = 3)
cat("BP =",chi_stat, "; p-value =", p_value)
## BP = 5.132872 ; p-value = 0.1623222
#With Packages in R
library(lmtest)
bptest(model.mlr)
##
## studentized Breusch-Pagan test
##
## data: model.mlr
## BP = 5.1329, df = 3, p-value = 0.1623
Karena p-value lebih dari 0.05 maka dapat disimpulkan dapat bersifat homosekdastisitas
plot(model.mlr, 2)
### Kolmogorov-Smirnov Uji
#Without Packages
error <- residuals(model.mlr)
e_cdf <- ecdf(error)
t_cdf <- pnorm(error, mean = 0, sd = sd(error))
D <- max(abs(e_cdf(error) - t_cdf))
print(D)
## [1] 0.1269655
#With Packages from R
ks.test(error, "pnorm", 0, sd(error))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: error
## D = 0.13197, p-value = 0.001887
## alternative hypothesis: two-sided
#Without Function
n <- dim(marketing)[1]
r <- cor(error[-1], error[-n])
DW <- 2*(1-r)
DW
## [1] 2.094184
dU <- 1.778
if(DW > dU && DW < 4-dU){
cat("No Autocorrelation, DW = ", DW)
} else {
cat("Autocorrelation exists, DW = ", DW)
}
## No Autocorrelation, DW = 2.094184
#With Function From R
dwtest(model.mlr)
##
## Durbin-Watson test
##
## data: model.mlr
## DW = 2.0836, p-value = 0.7236
## alternative hypothesis: true autocorrelation is greater than 0
Karena nilia p value lebih dari 0.05 dapat disimpulkan bahwa tidak terjadi autocorrelation pada model ini ### Multicorrelation
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
corrplot.mixed(cor(marketing[,1:4]), upper = "square")
### Variance Inflation Factor (VIF)
#Without Packages
m_youtube <- lm(youtube ~ facebook + newspaper, data = marketing)
m_facebook <- lm(facebook ~ youtube + newspaper, data = marketing)
m_newspaper <- lm(newspaper ~ youtube + facebook, data = marketing)
R2_youtube <- summary(m_youtube)$r.sq
R2_facebook <- summary(m_facebook)$r.sq
R2_newspaper <- summary(m_newspaper)$r.sq
VIF_y <- 1/(1-R2_youtube)
VIF_f <- 1/(1-R2_facebook)
VIF_n <- 1/(1-R2_newspaper)
data.frame(Youtube = VIF_y,
Facebook = VIF_f,
Newspaper = VIF_n)
## Youtube Facebook Newspaper
## 1 1.004611 1.144952 1.145187
#With Packages from R
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(model.mlr)
## youtube facebook newspaper
## 1.004611 1.144952 1.145187
Terlihat bahwa nilai vif semua variabel kurang dari 4, mengindikasikan tidak ada multikorelasi yang terjadi
set.seed(123)
marketing$X4 <- 0.5 * marketing$facebook + marketing$newspaper + rnorm(n, 0, 2)
model.update <- lm(sales ~ youtube + facebook + newspaper + X4, data = marketing)
summary(model.update)
##
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper + X4, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2145 -0.9580 0.3256 1.4265 3.5258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.528939 0.371577 9.497 < 2e-16 ***
## youtube 0.045543 0.001389 32.780 < 2e-16 ***
## facebook 0.264243 0.039402 6.706 2.1e-10 ***
## newspaper 0.148471 0.076176 1.949 0.0527 .
## X4 -0.149359 0.075876 -1.968 0.0504 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.008 on 195 degrees of freedom
## Multiple R-squared: 0.8992, Adjusted R-squared: 0.8971
## F-statistic: 434.9 on 4 and 195 DF, p-value: < 2.2e-16
m_x4 <- lm(X4 ~ newspaper + youtube + facebook, data = marketing)
R2_x4 <- summary(m_x4)$r.square
1/(1 - R2_x4)
## [1] 266.0315
vif(model.update)
## youtube facebook newspaper X4
## 1.01127 24.32257 195.61739 266.03155
Setelah menambahkan variabel baru (X4), VIF dan R-squared menunjukkan bahwa tidak ada masalah multikolinearitas yang signifikan.