# CODE VERSI PACKAGES

# Load packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
library(datarium)
## Warning: package 'datarium' was built under R version 4.3.3
library(ggplot2)
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
# Load and inspect data
data("marketing")
head(marketing)
##   youtube facebook newspaper sales
## 1  276.12    45.36     83.04 26.52
## 2   53.40    47.16     54.12 12.48
## 3   20.64    55.08     83.16 11.16
## 4  181.80    49.56     70.20 22.20
## 5  216.96    12.96     70.08 15.48
## 6   10.44    58.68     90.00  8.64
tail(marketing)
##     youtube facebook newspaper sales
## 195  179.64    42.72      7.20 20.76
## 196   45.84     4.44     16.56  9.12
## 197  113.04     5.88      9.72 11.64
## 198  212.40    11.16      7.68 15.36
## 199  340.32    50.40     79.44 30.60
## 200  278.52    10.32     10.44 16.08
# Simple Linear Regression
model <- lm(sales ~ youtube, data = marketing)
summary(model)
## 
## Call:
## lm(formula = sales ~ youtube, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0632  -2.3454  -0.2295   2.4805   8.6548 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.439112   0.549412   15.36   <2e-16 ***
## youtube     0.047537   0.002691   17.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.91 on 198 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6099 
## F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16
# Diagnostic Plot
model.diag.metrics <- augment(model)
ggplot(model.diag.metrics, aes(youtube, sales)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE) +
  geom_segment(aes(xend = youtube, yend = .fitted), color = "red", linewidth = 0.3) +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

# Breusch-Pagan test manual
error2 <- residuals(model)^2
model.error <- lm(error2 ~ marketing$youtube)
chi_stat <- dim(marketing)[1] * summary(model.error)$r.square
p_value <- 1 - pchisq(chi_stat, df = 1)
cat("BP =", chi_stat, "; p-value =", p_value)
## BP = 48.03797 ; p-value = 4.180434e-12
# Breusch-Pagan test dengan package
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 48.038, df = 1, p-value = 4.18e-12
# Log Transformasi
log.model <- lm(log(sales) ~ youtube, data = marketing)
summary(log.model)
## 
## Call:
## lm(formula = log(sales) ~ youtube, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53984 -0.13125  0.03008  0.16696  0.42554 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.1895138  0.0361853   60.51   <2e-16 ***
## youtube     0.0031555  0.0001772   17.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2575 on 198 degrees of freedom
## Multiple R-squared:  0.6156, Adjusted R-squared:  0.6137 
## F-statistic: 317.1 on 1 and 198 DF,  p-value: < 2.2e-16
# Breusch-Pagan setelah transformasi
bptest(log.model)
## 
##  studentized Breusch-Pagan test
## 
## data:  log.model
## BP = 1.1959, df = 1, p-value = 0.2742
# Normality manual
error <- residuals(log.model)
e_cdf <- ecdf(error)(error)
t_cdf <- pnorm(error, mean = 0, sd = sd(error))
D <- max(abs(e_cdf - t_cdf))
print(D)
## [1] 0.0658037
# Normality dengan ks.test
ks.test(error, "pnorm", 0, sd(error))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  error
## D = 0.070804, p-value = 0.2686
## alternative hypothesis: two-sided
# Autocorrelation manual
n <- dim(marketing)[1]
r <- cor(error[-1], error[-n])
DW <- 2 * (1 - r)
DW
## [1] 1.819105
# Autocorrelation rule
dU <- 1.778
if (DW > dU && DW < 4 - dU) {
  cat("No Autocorrelation, DW = ", DW)
} else {
  cat("Autocorrelation exists, DW = ", DW)
}
## No Autocorrelation, DW =  1.819105
# Autocorrelation test dengan dwtest
dwtest(log.model)
## 
##  Durbin-Watson test
## 
## data:  log.model
## DW = 1.81, p-value = 0.08847
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.9347, p-value = 0.3213
## alternative hypothesis: true autocorrelation is greater than 0
# Multiple Linear Regression
model.mlr <- lm(sales ~ youtube + facebook + newspaper, data = marketing)
summary(model.mlr)
## 
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5932  -1.0690   0.2902   1.4272   3.3951 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.526667   0.374290   9.422   <2e-16 ***
## youtube      0.045765   0.001395  32.809   <2e-16 ***
## facebook     0.188530   0.008611  21.893   <2e-16 ***
## newspaper   -0.001037   0.005871  -0.177     0.86    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8956 
## F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16
# Residual plot
marketing$fit.r <- model.mlr$residuals
marketing$fit.p <- model.mlr$fitted.values

marketing %>%
  melt(measure.vars = c("youtube", "facebook", "newspaper", "fit.p")) %>%
  ggplot(aes(value, fit.r, group = variable)) +
  geom_point(shape = 1) +
  geom_smooth(method = loess) +
  geom_hline(yintercept = 0) +
  facet_wrap(~ variable, scales = "free")
## `geom_smooth()` using formula = 'y ~ x'

# Correlation plot
corrplot.mixed(cor(marketing[, 1:4]), upper = "square")

# VIF manual
m_youtube <- lm(youtube ~ facebook + newspaper, data = marketing)
m_facebook <- lm(facebook ~ youtube + newspaper, data = marketing)
m_newspaper <- lm(newspaper ~ youtube + facebook, data = marketing)

R2_youtube <- summary(m_youtube)$r.sq
R2_facebook <- summary(m_facebook)$r.sq
R2_newspaper <- summary(m_newspaper)$r.sq

VIF_y <- 1 / (1 - R2_youtube)
VIF_f <- 1 / (1 - R2_facebook)
VIF_n <- 1 / (1 - R2_newspaper)

data.frame(Youtube = VIF_y, Facebook = VIF_f, Newspaper = VIF_n)
##    Youtube Facebook Newspaper
## 1 1.004611 1.144952  1.145187
# VIF package
vif(model.mlr)
##   youtube  facebook newspaper 
##  1.004611  1.144952  1.145187
# Tambahkan variabel X4 (pastikan set.seed)
set.seed(123)
marketing$X4 <- 0.5 * marketing$facebook + marketing$newspaper + rnorm(n, 0, 2)

model.update <- lm(sales ~ youtube + facebook + newspaper + X4, data = marketing)
summary(model.update)
## 
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper + X4, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2145  -0.9580   0.3256   1.4265   3.5258 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.528939   0.371577   9.497  < 2e-16 ***
## youtube      0.045543   0.001389  32.780  < 2e-16 ***
## facebook     0.264243   0.039402   6.706  2.1e-10 ***
## newspaper    0.148471   0.076176   1.949   0.0527 .  
## X4          -0.149359   0.075876  -1.968   0.0504 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.008 on 195 degrees of freedom
## Multiple R-squared:  0.8992, Adjusted R-squared:  0.8971 
## F-statistic: 434.9 on 4 and 195 DF,  p-value: < 2.2e-16
# VIF untuk model.update manual
m_x4 <- lm(X4 ~ newspaper + youtube + facebook, data = marketing)
R2_x4 <- summary(m_x4)$r.square
1 / (1 - R2_x4)
## [1] 266.0315
# VIF package
vif(model.update)
##   youtube  facebook newspaper        X4 
##   1.01127  24.32257 195.61739 266.03155

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# CODE VERSI MANUAL

data("marketing")
head(marketing)
##   youtube facebook newspaper sales
## 1  276.12    45.36     83.04 26.52
## 2   53.40    47.16     54.12 12.48
## 3   20.64    55.08     83.16 11.16
## 4  181.80    49.56     70.20 22.20
## 5  216.96    12.96     70.08 15.48
## 6   10.44    58.68     90.00  8.64
tail(marketing)
##     youtube facebook newspaper sales
## 195  179.64    42.72      7.20 20.76
## 196   45.84     4.44     16.56  9.12
## 197  113.04     5.88      9.72 11.64
## 198  212.40    11.16      7.68 15.36
## 199  340.32    50.40     79.44 30.60
## 200  278.52    10.32     10.44 16.08
model <- lm(sales ~ youtube, data = marketing)
summary(model)
## 
## Call:
## lm(formula = sales ~ youtube, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0632  -2.3454  -0.2295   2.4805   8.6548 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.439112   0.549412   15.36   <2e-16 ***
## youtube     0.047537   0.002691   17.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.91 on 198 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6099 
## F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16
error2 <- residuals(model)^2
model.error <- lm(error2 ~ marketing$youtube)
chi_stat <- dim(marketing)[1] * summary(model.error)$r.square
p_value <- 1 - pchisq(chi_stat, df = 1)

cat("BP =", chi_stat, "; p-value =", p_value)
## BP = 48.03797 ; p-value = 4.180434e-12
log.model <- lm(log(sales) ~ youtube, data = marketing)
summary(log.model)
## 
## Call:
## lm(formula = log(sales) ~ youtube, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53984 -0.13125  0.03008  0.16696  0.42554 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.1895138  0.0361853   60.51   <2e-16 ***
## youtube     0.0031555  0.0001772   17.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2575 on 198 degrees of freedom
## Multiple R-squared:  0.6156, Adjusted R-squared:  0.6137 
## F-statistic: 317.1 on 1 and 198 DF,  p-value: < 2.2e-16
error2_log <- residuals(log.model)^2
model.error.log <- lm(error2_log ~ marketing$youtube)
chi_stat_log <- dim(marketing)[1] * summary(model.error.log)$r.square
p_value_log <- 1 - pchisq(chi_stat_log, df = 1)

cat("BP (log-model) =", chi_stat_log, "; p-value =", p_value_log)
## BP (log-model) = 1.195861 ; p-value = 0.2741504
error <- residuals(log.model)
e_cdf <- ecdf(error)(error)
t_cdf <- pnorm(error, mean = 0, sd = sd(error))
D <- max(abs(e_cdf - t_cdf))
print(D)
## [1] 0.0658037
n <- dim(marketing)[1]
r <- cor(error[-1], error[-n])
DW <- 2 * (1 - r)
DW
## [1] 1.819105
model.mlr <- lm(sales ~ youtube + facebook + newspaper, data = marketing)
summary(model.mlr)
## 
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5932  -1.0690   0.2902   1.4272   3.3951 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.526667   0.374290   9.422   <2e-16 ***
## youtube      0.045765   0.001395  32.809   <2e-16 ***
## facebook     0.188530   0.008611  21.893   <2e-16 ***
## newspaper   -0.001037   0.005871  -0.177     0.86    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8956 
## F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16
m_youtube <- lm(youtube ~ facebook + newspaper, data = marketing)
m_facebook <- lm(facebook ~ youtube + newspaper, data = marketing)
m_newspaper <- lm(newspaper ~ youtube + facebook, data = marketing)

R2_youtube <- summary(m_youtube)$r.sq
R2_facebook <- summary(m_facebook)$r.sq
R2_newspaper <- summary(m_newspaper)$r.sq

VIF_y <- 1 / (1 - R2_youtube)
VIF_f <- 1 / (1 - R2_facebook)
VIF_n <- 1 / (1 - R2_newspaper)

data.frame(Youtube = VIF_y, Facebook = VIF_f, Newspaper = VIF_n)
##    Youtube Facebook Newspaper
## 1 1.004611 1.144952  1.145187
set.seed(123)
marketing$X4 <- 0.5 * marketing$facebook + marketing$newspaper + rnorm(n, 0, 2)

model.update <- lm(sales ~ youtube + facebook + newspaper + X4, data = marketing)
summary(model.update)
## 
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper + X4, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2145  -0.9580   0.3256   1.4265   3.5258 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.528939   0.371577   9.497  < 2e-16 ***
## youtube      0.045543   0.001389  32.780  < 2e-16 ***
## facebook     0.264243   0.039402   6.706  2.1e-10 ***
## newspaper    0.148471   0.076176   1.949   0.0527 .  
## X4          -0.149359   0.075876  -1.968   0.0504 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.008 on 195 degrees of freedom
## Multiple R-squared:  0.8992, Adjusted R-squared:  0.8971 
## F-statistic: 434.9 on 4 and 195 DF,  p-value: < 2.2e-16
m_x4 <- lm(X4 ~ newspaper + youtube + facebook, data = marketing)
R2_x4 <- summary(m_x4)$r.square
VIF_x4 <- 1 / (1 - R2_x4)
VIF_x4
## [1] 266.0315