library(tidyverse) # data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom) # clustering algorithms
library(datarium) # clustering algorithms & visualization
library(ggplot2)
data("marketing")
head(marketing, 3)
## youtube facebook newspaper sales
## 1 276.12 45.36 83.04 26.52
## 2 53.40 47.16 54.12 12.48
## 3 20.64 55.08 83.16 11.16
tail(marketing, 3)
## youtube facebook newspaper sales
## 198 212.40 11.16 7.68 15.36
## 199 340.32 50.40 79.44 30.60
## 200 278.52 10.32 10.44 16.08
model.mlr <- lm(sales ~ youtube + facebook + newspaper, data = marketing)
model.mlr
##
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
##
## Coefficients:
## (Intercept) youtube facebook newspaper
## 3.526667 0.045765 0.188530 -0.001037
Model persamaan linier sales = 3.527 + 0.046youtube + 0.188facebook − 0.001newspaper
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
marketing$fit.r <- model.mlr$residuals
marketing$fit.p <- model.mlr$fitted.values
marketing %>%
melt(measure.vars = c("youtube", "facebook", "newspaper", "fit.p")) %>%
ggplot(aes(value, fit.r, group = variable)) +
geom_point(shape = 1) +
geom_smooth(method = loess) +
geom_hline(yintercept = 0) +
facet_wrap(~ variable, scales = "free")
## `geom_smooth()` using formula = 'y ~ x'
Pada plot-plot diatas terlihat bahwa Asumsi linearitas belum sepenuhnya
terpenuhi maka dapat dilakukan uji test lain untuk memastikan linieritas
nya
marketing$fit.r <- model.mlr$residuals
marketing$fit.p <- model.mlr$fitted.values
marketing$sqrt.fit.r <- sqrt(abs(marketing$fit.r))
marketing_long <- marketing %>%
melt(measure.vars = c("youtube", "facebook", "newspaper", "fit.p"))
ggplot(marketing_long, aes(value, sqrt.fit.r, group = variable)) +
geom_point(shape = 1) +
geom_smooth(method = "loess") +
geom_hline(yintercept = 0, linetype = "dashed") +
facet_wrap(~ variable, scales = "free") +
labs(x = "Fitted values / Original Variable", y = "Square Root of |Residuals|") +
theme_minimal() +
ggtitle("Scale-Location Plot")
## `geom_smooth()` using formula = 'y ~ x'
#Without Packages
error2 <- residuals(model.mlr)^2
model.error <- lm(error2 ~ youtube + facebook + newspaper, data = marketing)
chi_stat <- dim(marketing)[1]*summary(model.error)$r.square
p_value <- 1 - pchisq(chi_stat, df = 3)
cat("BP =",chi_stat, "; p-value =", p_value)
## BP = 5.132872 ; p-value = 0.1623222
#With Packages in R
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(model.mlr)
##
## studentized Breusch-Pagan test
##
## data: model.mlr
## BP = 5.1329, df = 3, p-value = 0.1623
Karena p-value lebih dari 0.05 maka dapat disimpulkan dapat bersifat homosekdastisitas
plot(model.mlr, 2)
### Kolmogorov-Smirnov Uji
#Without Packages
error <- residuals(model.mlr)
e_cdf <- ecdf(error)
t_cdf <- pnorm(error, mean = 0, sd = sd(error))
D <- max(abs(e_cdf(error) - t_cdf))
print(D)
## [1] 0.1269655
#With Packages from R
ks.test(error, "pnorm", 0, sd(error))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: error
## D = 0.13197, p-value = 0.001887
## alternative hypothesis: two-sided
#Without Function
n <- dim(marketing)[1]
r <- cor(error[-1], error[-n])
DW <- 2*(1-r)
DW
## [1] 2.094184
dU <- 1.778
if(DW > dU && DW < 4-dU){
cat("No Autocorrelation, DW = ", DW)
} else {
cat("Autocorrelation exists, DW = ", DW)
}
## No Autocorrelation, DW = 2.094184
#With Function From R
dwtest(model.mlr)
##
## Durbin-Watson test
##
## data: model.mlr
## DW = 2.0836, p-value = 0.7236
## alternative hypothesis: true autocorrelation is greater than 0
Karena nilia p value lebih dari 0.05 dapat disimpulkan bahwa tidak terjadi autocorrelation pada model ini ### Multicorrelation
library(corrplot)
## corrplot 0.95 loaded
corrplot.mixed(cor(marketing[,1:4]), upper = "square")
### Variance Inflation Factor (VIF)
#Without Packages
m_youtube <- lm(youtube ~ facebook + newspaper, data = marketing)
m_facebook <- lm(facebook ~ youtube + newspaper, data = marketing)
m_newspaper <- lm(newspaper ~ youtube + facebook, data = marketing)
R2_youtube <- summary(m_youtube)$r.sq
R2_facebook <- summary(m_facebook)$r.sq
R2_newspaper <- summary(m_newspaper)$r.sq
VIF_y <- 1/(1-R2_youtube)
VIF_f <- 1/(1-R2_facebook)
VIF_n <- 1/(1-R2_newspaper)
data.frame(Youtube = VIF_y,
Facebook = VIF_f,
Newspaper = VIF_n)
## Youtube Facebook Newspaper
## 1 1.004611 1.144952 1.145187
#With Packages from R
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(model.mlr)
## youtube facebook newspaper
## 1.004611 1.144952 1.145187
Terlihat bahwa nilai vif semua variabel kurang dari 4, mengindikasikan tidak ada multikorelasi yang terjadi
set.seed(123)
marketing$X4 <- 0.5 * marketing$facebook + marketing$newspaper + rnorm(n, 0, 2)
model.update <- lm(sales ~ youtube + facebook + newspaper + X4, data = marketing)
summary(model.update)
##
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper + X4, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2145 -0.9580 0.3256 1.4265 3.5258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.528939 0.371577 9.497 < 2e-16 ***
## youtube 0.045543 0.001389 32.780 < 2e-16 ***
## facebook 0.264243 0.039402 6.706 2.1e-10 ***
## newspaper 0.148471 0.076176 1.949 0.0527 .
## X4 -0.149359 0.075876 -1.968 0.0504 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.008 on 195 degrees of freedom
## Multiple R-squared: 0.8992, Adjusted R-squared: 0.8971
## F-statistic: 434.9 on 4 and 195 DF, p-value: < 2.2e-16
m_x4 <- lm(X4 ~ newspaper + youtube + facebook, data = marketing)
R2_x4 <- summary(m_x4)$r.square
1/(1 - R2_x4)
## [1] 266.0315
vif(model.update)
## youtube facebook newspaper X4
## 1.01127 24.32257 195.61739 266.03155
Setelah menambahkan variabel baru (X4), VIF dan R-squared menunjukkan bahwa tidak ada masalah multikolinearitas yang signifikan.