Memanggil packages yang akan digunakan

library(tidyverse) # data manipulation
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom) # clustering algorithms
## Warning: package 'broom' was built under R version 4.3.3
library(datarium) # clustering algorithms & visualization
## Warning: package 'datarium' was built under R version 4.3.3
library(ggplot2) 

Data Preparation

data("marketing")

Inspect the Data

head(marketing, 3)
##   youtube facebook newspaper sales
## 1  276.12    45.36     83.04 26.52
## 2   53.40    47.16     54.12 12.48
## 3   20.64    55.08     83.16 11.16
tail(marketing, 3)
##     youtube facebook newspaper sales
## 198  212.40    11.16      7.68 15.36
## 199  340.32    50.40     79.44 30.60
## 200  278.52    10.32     10.44 16.08

Fit the LM Model

Sales = β0 + β1 Youtube + ε

model <- lm(sales ~ youtube, data = marketing)
model
## 
## Call:
## lm(formula = sales ~ youtube, data = marketing)
## 
## Coefficients:
## (Intercept)      youtube  
##     8.43911      0.04754

Visualisasi Regresi Lineara

model.diag.metrics <- augment(model)
ggplot(model.diag.metrics, aes(youtube, sales)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_segment(aes(xend = youtube, yend = .fitted), color = "green", size = 0.3)+
  theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

Visualisasi Linearitas

plot(model, 3)

Homogenitas dari Varians

Breush-Pagan Test Manual

Menghitung ϵ²

error2 <- residuals(model)^2

Sesuaikan model regresi baru dengan ϵ² sebagai variabel dependen

ϵ² = α0 + α1Y outube

model.error <- lm(error2 ~ marketing$youtube)

Calculate the test statistic

X²stat = n × R²new

chi_stat <- dim(marketing)[1]*summary(model.error)$r.square

Compute the p-value

p_value <- 1 - pchisq(chi_stat, df = 1)
cat("BP =",chi_stat, "; p-value =", p_value)
## BP = 48.03797 ; p-value = 4.180434e-12

Breusch-Pagan Test Function

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 48.038, df = 1, p-value = 4.18e-12
p_value <- 1 - pchisq(chi_stat, df = 1)
cat("BP =",chi_stat, "; p-value =", p_value)
## BP = 48.03797 ; p-value = 4.180434e-12

Mentransformasi pada sales

log(sales) = β0 + β1youtube + ε

log.model<-lm(log(sales) ~ youtube, data = marketing)
log.model
## 
## Call:
## lm(formula = log(sales) ~ youtube, data = marketing)
## 
## Coefficients:
## (Intercept)      youtube  
##    2.189514     0.003156

Memeriksa

Visualisasi pemeriksaan

plot(log.model, 3)

periksa lagi Homogenitas dari Varians

bptest(log.model)
## 
##  studentized Breusch-Pagan test
## 
## data:  log.model
## BP = 1.1959, df = 1, p-value = 0.2742

Normalitas

Visualisasi

plot(log.model, 2)

Normalitas manual

error <- residuals(log.model)
ecdf_function <- ecdf(error)
t_cdf <- pnorm(error, mean = 0, sd = sd(error))
e_cdf <- ecdf_function(error)
D <- max(abs(e_cdf - t_cdf))
print(D)
## [1] 0.0658037

Normalitas Function Kolmogorov Smirnov

ks.test(error, "pnorm", 0, sd(error))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  error
## D = 0.070804, p-value = 0.2686
## alternative hypothesis: two-sided

Normalitas dengan model umum KS

ks.test(error, "pnorm", 0, sd(residuals(model)))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  error
## D = 0.45656, p-value < 2.2e-16
## alternative hypothesis: two-sided

Autocorrelation

plot(log.model, 1)

Autocorrelation check Durbin - Watson Manual

DW ≈ 2(1 − ρ)
n <- dim(marketing)[1]
r <- cor(error[-1], error[-n])
DW <- 2*(1-r)
DW
## [1] 1.819105
dU <- 1.778
if(DW > dU && DW < 4-dU){
cat("No Autocorrelation, DW = ", DW)
} else {
cat("Autocorrelation exists, DW = ", DW)
}
## No Autocorrelation, DW =  1.819105

Autocorrelation check Durbin - Watson Function

dwtest(log.model)
## 
##  Durbin-Watson test
## 
## data:  log.model
## DW = 1.81, p-value = 0.08847
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.9347, p-value = 0.3213
## alternative hypothesis: true autocorrelation is greater than 0

Regresi Linier Berganda

#####Menambahkan Variabel Kedalam Model ##### sales = β0 + β1youtube + β2facebook + β3newspaper + ε

model.mlr <- lm(sales ~ youtube + facebook + newspaper, data = marketing)

Linearitas

library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
marketing$fit.r <- model.mlr$residuals
marketing$fit.p <- model.mlr$fitted.values

marketing %>%
  melt(measure.vars = c("youtube", "facebook", "newspaper", "fit.p")) %>%
  ggplot(aes(value, fit.r, group = variable)) +
  geom_point(shape = 1) +
  geom_smooth(method = loess) +
  geom_hline(yintercept = 0) +
  facet_wrap(~ variable, scales = "free")
## `geom_smooth()` using formula = 'y ~ x'

Setelah linieritas, kita juga dapat memeriksa apakah model ini mengalami heteroskedasititas atau homoskedasititas.

marketing$fit.r <- model.mlr$residuals
marketing$fit.p <- model.mlr$fitted.values
marketing$sqrt.fit.r <- sqrt(abs(marketing$fit.r))
marketing_long <- marketing %>%
  melt(measure.vars = c("youtube", "facebook", "newspaper", "fit.p"))
ggplot(marketing_long, aes(value, sqrt.fit.r, group = variable)) +
  geom_point(shape = 1) +                     
  geom_smooth(method = "loess") +              
  geom_hline(yintercept = 0, linetype = "dashed") +  
  facet_wrap(~ variable, scales = "free") +   
  labs(x = "Fitted values / Original Variable", y = "Square Root of |Residuals|") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Breusch-Pagan Test (Function)

library(lmtest)
bptest(model.mlr)
## 
##  studentized Breusch-Pagan test
## 
## data:  model.mlr
## BP = 5.1329, df = 3, p-value = 0.1623

Karena p-value lebih dari 0.05 maka dapat disimpulkan dapat bersifat homosekdastisitas.

Setelah itu, kita dapat memeriksa normalitas dari model.

plot(model.mlr, 2)

Berdasarkan Q-Q plot, sebagian besar residual model mengikuti distribusi normal dengan baik, terutama di bagian tengah, meskipun ada penyimpangan kecil di bagian ekor kiri dan kanan yang menunjukkan kemungkinan keberadaan outlier atau heavy tails. Agar kita tahu, kita dapat lakukan tes Kolmogrov-Smirnov.

Kolmogrov-Smirnvov (Manual)

error <- residuals(model.mlr)
e_cdf <- ecdf(error)
t_cdf <- pnorm(error, mean = 0, sd = sd(error))
D <- max(abs(e_cdf(error) - t_cdf))
print(D)
## [1] 0.1269655

Kolmogrov-Smirnov (Function)

ks.test(error, "pnorm", 0, sd(error))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  error
## D = 0.13197, p-value = 0.001887
## alternative hypothesis: two-sided

Berdasarkan uji Kolmogorov-Smirnov, diperoleh nilai p-value sebesar 0.001887, yang lebih kecil dari 0.05, sehingga dapat disimpulkan bahwa residual tidak berdistribusi normal secara statistik.

Selanjutnya kita akan memeriksa apa terjadi autokorelasi pada model kita.

Autokorelasi dengan Durbin-Watson (Manual)

n <- dim(marketing)[1]
r <- cor(error[-1], error[-n])
DW <- 2*(1-r)
DW
## [1] 2.094184
dU <- 1.788 #n= 200 dan k= 3
if(DW > dU && DW < 4-dU){
cat("No Autocorrelation, DW = ", DW)
} else {
cat("Autocorrelation exists, DW = ", DW)
}
## No Autocorrelation, DW =  2.094184

Autokorelasi dengan Durbin-Watson (Function)

dwtest(model.mlr)
## 
##  Durbin-Watson test
## 
## data:  model.mlr
## DW = 2.0836, p-value = 0.7236
## alternative hypothesis: true autocorrelation is greater than 0

Karena nilai p value lebih dari 0.05 dapat disimpulkan bahwa tidak terjadi autocorrelation pada model ini.

Multicolinierity

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
corrplot.mixed(cor(marketing[,1:4]), upper = "square")

Variance Inflation Factor (VIF)

m_youtube <- lm(youtube ~ facebook + newspaper, data = marketing)
m_facebook <- lm(facebook ~ youtube + newspaper, data = marketing)
m_newspaper <- lm(newspaper ~ youtube + facebook, data = marketing)

VIF

R2_youtube <- summary(m_youtube)$r.sq
R2_facebook <- summary(m_facebook)$r.sq
R2_newspaper <- summary(m_newspaper)$r.sq
VIF_y <- 1/(1-R2_youtube)
VIF_f <- 1/(1-R2_facebook)
VIF_n <- 1/(1-R2_newspaper)

#Let's check the vIF
data.frame(Youtube = VIF_y,
            Facebook = VIF_f,
  Newspaper = VIF_n)
##    Youtube Facebook Newspaper
## 1 1.004611 1.144952  1.145187

Membandingkan dengan ‘vif’ dari ‘R’

library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif(model.mlr)
##   youtube  facebook newspaper 
##  1.004611  1.144952  1.145187

Menambahkan colinear variable

marketing$X4 <- 0.5*marketing$facebook + marketing$newspaper + rnorm(n,0,2)
model.update <- lm(sales ~ youtube + facebook + newspaper + X4, data = marketing)
model.update
## 
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper + X4, data = marketing)
## 
## Coefficients:
## (Intercept)      youtube     facebook    newspaper           X4  
##     3.52509      0.04575      0.17358     -0.02948      0.02881

Memeriksa VIF dari X4

m_x4 <- lm(X4 ~ newspaper + youtube + facebook, data = marketing)
R2_x4 <- summary(m_x4)$r.square
1/(1- R2_x4)
## [1] 260.4106
#Bandingkan with vif dari car:
vif(model.update)
##    youtube   facebook  newspaper         X4 
##   1.005493  25.232403 188.792241 260.410587

Setelah menambahkan variabel baru (X4), VIF dan R-squared menunjukkan bahwa tidak ada masalah multikolinearitas yang signifikan.

Kesimpulan

Dengan memanfaatkan fungsi bawaan dari R, kita sudah bisa mendapatkan gambaran atau analisis yang cukup baik untuk mengevaluasi model. Namun, jika menggunakan sintaks manual, hasil yang diperoleh bisa lebih rinci, meskipun memerlukan pengecekan berulang untuk memastikan bahwa sintaks yang digunakan sudah benar.