library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
advert <- read.csv("C:/Users/kkayl/Downloads/Advertising.csv")
str(advert)
## 'data.frame': 200 obs. of 4 variables:
## $ TV : num 230.1 44.5 17.2 151.5 180.8 ...
## $ Radio : num 37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
## $ Newspaper: num 69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
## $ Sales : num 22.1 10.4 12 16.5 17.9 7.2 11.8 13.2 4.8 15.6 ...
head(advert)
## TV Radio Newspaper Sales
## 1 230.1 37.8 69.2 22.1
## 2 44.5 39.3 45.1 10.4
## 3 17.2 45.9 69.3 12.0
## 4 151.5 41.3 58.5 16.5
## 5 180.8 10.8 58.4 17.9
## 6 8.7 48.9 75.0 7.2
par(mfrow=c(2,2))
hist(advert$Sales, col="red")
hist(advert$TV, col="blue")
hist(advert$Radio, col="green")
hist(advert$Newspaper, col="yellow")
From the histograms, Sales is the only variable whose distribution looks approximately normal. The distributions of TV and Radio are roughly uniform, while Newspaper is skewed to the right.
boxplot(advert, col=c("blue", "green", "yellow", "red"))
The boxplots of all variables are roughly symmetric, except for Newspaper, which has two upper outliers.
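As a quick check (a minimal sketch, not part of the original output), the values drawn as upper outliers for Newspaper can be listed with the default 1.5 × IQR rule used by boxplot():
# Values flagged as outliers by the default boxplot rule (beyond 1.5 * IQR from the quartiles)
boxplot.stats(advert$Newspaper)$out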
advert.model <- lm(Sales ~ ., data=advert)
summary(advert.model)
##
## Call:
## lm(formula = Sales ~ ., data = advert)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3034 -0.8244 -0.0008 0.8976 3.7473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6251241 0.3075012 15.041 <2e-16 ***
## TV 0.0544458 0.0013752 39.592 <2e-16 ***
## Radio 0.1070012 0.0084896 12.604 <2e-16 ***
## Newspaper 0.0003357 0.0057881 0.058 0.954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.662 on 196 degrees of freedom
## Multiple R-squared: 0.9026, Adjusted R-squared: 0.9011
## F-statistic: 605.4 on 3 and 196 DF, p-value: < 2.2e-16
The fitted model is
\[ \hat{y} = 4.6251241 + 0.0544458\,TV + 0.1070012\,Radio + 0.0003357\,Newspaper \]
with an R-squared of \(0.9026\) and an adjusted R-squared of \(0.9011\). The output also shows that Newspaper does not have a statistically significant effect.
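For illustration (the budget values below are hypothetical, not taken from the data), the fitted equation can be used for prediction, and the same number can be reproduced directly from the coefficients:
# Hypothetical budgets: TV = 100, Radio = 20, Newspaper = 30 (illustrative values only)
new.budget <- data.frame(TV = 100, Radio = 20, Newspaper = 30)
predict(advert.model, newdata = new.budget)
# The same prediction computed by hand from the fitted equation
coef(advert.model) %*% c(1, 100, 20, 30)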
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
corPlot(advert, cex = 1.2)
The correlation matrix shows that Sales has a strong positive relationship with TV (\(0.90\)), while its weakest positive relationship is with Newspaper (\(0.16\)).
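The quoted values can be verified against the numeric correlation matrix (a sketch; corPlot displays the same matrix graphically):
# Correlation matrix behind the corPlot above
round(cor(advert), 2)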
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.2.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_vif_tol(advert.model)
## Variables Tolerance VIF
## 1 TV 0.9954104 1.004611
## 2 Radio 0.8733991 1.144952
## 3 Newspaper 0.8732195 1.145187
Since all VIF values are below 10, there is no indication of multicollinearity.
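As a sketch of where these numbers come from, the VIF of a predictor is \(1/(1 - R_j^2)\), where \(R_j^2\) is the R-squared of the auxiliary regression of that predictor on the other predictors; for TV:
# Auxiliary regression of TV on the other predictors
aux <- lm(TV ~ Radio + Newspaper, data = advert)
1 / (1 - summary(aux)$r.squared)  # should match the VIF for TV reported above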
par(mfrow=c(2,2))
plot(advert.model, 1) # residuals vs fitted values
plot(advert.model,2) # qq-plot
plot(x = 1:dim(advert)[1],
y = advert.model$residuals,
type = 'b',
ylab = "Residuals",
xlab = "Observation") # plot sisaan vs urutan
# Expected value of the residuals equals zero
t.test(advert.model$residuals,
mu = 0,
conf.level = 0.95)
##
## One Sample t-test
##
## data: advert.model$residuals
## t = 3.5802e-16, df = 199, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.2299509 0.2299509
## sample estimates:
## mean of x
## 4.174931e-17
\(H_0\): the expected value of the residuals is zero.
\(H_1\): the expected value of the residuals is not zero.
Since the p-value exceeds \(\alpha\) (5%), we fail to reject \(H_0\): the expected value of the residuals can be taken to be zero.
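Note that when the model includes an intercept, OLS residuals sum to zero by construction, so this test is expected to pass; a quick check:
# OLS residuals sum to (numerically) zero when an intercept is included
sum(advert.model$residuals)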
# Residual independence
# Can be checked with the Runs Test or the Durbin-Watson test
library(randtests)
runs.test(advert.model$residuals)
##
## Runs Test
##
## data: advert.model$residuals
## statistic = 0.28356, runs = 103, n1 = 100, n2 = 100, n = 200, p-value =
## 0.7768
## alternative hypothesis: nonrandomness
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dwtest(advert.model)
##
## Durbin-Watson test
##
## data: advert.model
## DW = 2.2506, p-value = 0.9625
## alternative hypothesis: true autocorrelation is greater than 0
\(H_0\): the residuals are not autocorrelated.
\(H_1\): the residuals are autocorrelated.
Since the p-value exceeds \(\alpha\) (5%), we fail to reject \(H_0\): there is no autocorrelation in the residuals.
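As a cross-check (a sketch, not part of the original output), the Durbin-Watson statistic can be reproduced from its definition \(DW = \sum_t (e_t - e_{t-1})^2 / \sum_t e_t^2\):
# Durbin-Watson statistic computed directly from the residuals; should match DW = 2.2506 above
e <- residuals(advert.model)
sum(diff(e)^2) / sum(e^2)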
# Homogeneous residual variance
library(lmtest)
bptest(advert.model)
##
## studentized Breusch-Pagan test
##
## data: advert.model
## BP = 3.9785, df = 3, p-value = 0.2638
library(car)
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
ncvTest(advert.model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.1332549, Df = 1, p = 0.71508
\(H_0\): the residual variance is homogeneous.
\(H_1\): the residual variance is not homogeneous.
Since the p-value exceeds \(\alpha\) (5%), we fail to reject \(H_0\): the residual variance is homogeneous.
library(car)
shapiro.test(advert.model$residuals)
##
## Shapiro-Wilk normality test
##
## data: advert.model$residuals
## W = 0.97581, p-value = 0.001576
\(H_0\): the residuals are normally distributed.
\(H_1\): the residuals are not normally distributed.
Since the p-value is less than \(\alpha\), we reject \(H_0\): the residuals are not normally distributed.
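As an optional cross-check (not part of the original analysis, and only approximate because the mean and standard deviation are estimated from the residuals themselves), a Kolmogorov-Smirnov test on the standardized residuals could be run:
# Approximate KS test of the standardized residuals against a standard normal
ks.test(as.numeric(scale(advert.model$residuals)), "pnorm")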
library(olsrr)
advert.plot <- ols_plot_resid_lev(advert.model)
#Cook's Distance
advert.di <- cooks.distance(advert.model)
advert.f <- qf(0.05, 3, 197, lower.tail = F) # qf(p, df1, df2, lower.tail = F): with lower.tail = F, p is the significance level (alpha); otherwise use p = 1 - alpha
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
advert.cooks_crit = advert.f
advert.model_cooks <- cooks.distance(advert.model)
advert.df <- data.frame(obs = names(advert.model_cooks),
cooks = advert.model_cooks)
ggplot(advert.df, aes(y = cooks, x = obs)) +
geom_point() +
geom_hline(yintercept = advert.cooks_crit, linetype="dashed") +
labs(title = "Cook's Distance",
subtitle = "Influential Observation ",
x = "Observation Number",
y = "Cook's")
The Cook's distance plot above shows no influential observations (no point exceeds the cutoff), so the flagged outlier and high-leverage points can be removed.
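For reference, a sketch of how similar points could be flagged programmatically, using the common rules of thumb |studentized residual| > 2 and leverage > 2p/n (this is not necessarily the exact selection used below, which was read off the residual-vs-leverage plot):
# Rule-of-thumb flags for outliers and high-leverage points (illustrative only)
p <- length(coef(advert.model))  # number of estimated parameters
n <- nrow(advert)
flag <- abs(rstudent(advert.model)) > 2 | hatvalues(advert.model) > 2 * p / n
which(flag)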
advert.new <- advert[-c(151, 131, 17, 6, 76, 37, 129, 98, 155, 166, 102, 11, 197),]
advert.model1 <- lm(Sales ~ ., data=advert.new)
summary(advert.model1)
##
## Call:
## lm(formula = Sales ~ ., data = advert.new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5064 -0.8194 -0.0463 0.7233 3.1477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.494475 0.276061 16.281 <2e-16 ***
## TV 0.053560 0.001255 42.678 <2e-16 ***
## Radio 0.109201 0.007797 14.006 <2e-16 ***
## Newspaper 0.007442 0.005854 1.271 0.205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.436 on 183 degrees of freedom
## Multiple R-squared: 0.9232, Adjusted R-squared: 0.922
## F-statistic: 733.8 on 3 and 183 DF, p-value: < 2.2e-16
The model fitted to the data without the outliers is
\[ \hat{y} = 4.494475 + 0.053560\,TV + 0.109201\,Radio + 0.007442\,Newspaper \]
with an R-squared of \(0.9232\) and an adjusted R-squared of \(0.922\).
shapiro.test(advert.model1$residuals)
##
## Shapiro-Wilk normality test
##
## data: advert.model1$residuals
## W = 0.98616, p-value = 0.06309
For the model without outliers, the residuals are now normally distributed (Shapiro-Wilk p-value \(> 0.05\)).
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
##
## cement
#---Stepwise Regression Model----
advert.step <- stepAIC(advert.model1, direction = "both")
## Start: AIC=139.37
## Sales ~ TV + Radio + Newspaper
##
## Df Sum of Sq RSS AIC
## - Newspaper 1 3.3 380.9 139.02
## <none> 377.5 139.37
## - Radio 1 404.7 782.2 273.60
## - TV 1 3757.6 4135.1 584.98
##
## Step: AIC=139.02
## Sales ~ TV + Radio
##
## Df Sum of Sq RSS AIC
## <none> 380.9 139.02
## + Newspaper 1 3.3 377.5 139.37
## - Radio 1 506.5 887.4 295.20
## - TV 1 3786.0 4166.9 584.41
#---Backward Regression Model----
advert.back <- stepAIC(advert.model1, direction = "backward")
## Start: AIC=139.37
## Sales ~ TV + Radio + Newspaper
##
## Df Sum of Sq RSS AIC
## - Newspaper 1 3.3 380.9 139.02
## <none> 377.5 139.37
## - Radio 1 404.7 782.2 273.60
## - TV 1 3757.6 4135.1 584.98
##
## Step: AIC=139.02
## Sales ~ TV + Radio
##
## Df Sum of Sq RSS AIC
## <none> 380.9 139.02
## - Radio 1 506.5 887.4 295.20
## - TV 1 3786.0 4166.9 584.41
#---Forward Regression Model----
advert.fwd <- stepAIC(advert.model1, direction = "forward")
## Start: AIC=139.37
## Sales ~ TV + Radio + Newspaper
The model with the lowest AIC contains TV and Radio:
\[ \hat{y} = 4.613542 + 0.053659\,TV + 0.112974\,Radio \]
with an R-squared of \(0.9226\) and an adjusted R-squared of \(0.9217\).
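The AIC values shown in the stepAIC trace can be reproduced with extractAIC(), which uses the same scale as the trace (a sketch):
# AIC of the full no-outlier model vs. the selected TV + Radio model
extractAIC(advert.model1)  # Sales ~ TV + Radio + Newspaper
extractAIC(advert.step)    # Sales ~ TV + Radio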
summary(advert.step)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = advert.new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7249 -0.8557 -0.0257 0.7834 3.2013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.613542 0.260120 17.74 <2e-16 ***
## TV 0.053659 0.001255 42.77 <2e-16 ***
## Radio 0.112974 0.007222 15.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.439 on 184 degrees of freedom
## Multiple R-squared: 0.9226, Adjusted R-squared: 0.9217
## F-statistic: 1096 on 2 and 184 DF, p-value: < 2.2e-16
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
## Loaded glmnet 4.1-8
advert.x <- data.matrix(advert.new[,-4])
advert.y <- advert.new$Sales
advert.cv.r <- cv.glmnet(advert.x, advert.y, alpha=0);plot(advert.cv.r)
advert.best.lr <- advert.cv.r$lambda.min
advert.bestridge <- glmnet(advert.x, advert.y , alpha = 0,lambda = advert.best.lr);coef(advert.bestridge)
## 4 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 5.24608806
## TV 0.04916380
## Radio 0.10044343
## Newspaper 0.01066241
advert.rsq <- function(bestmodel, bestlambda, x, y){
  # fitted values from the penalized model
  y.duga <- predict(bestmodel, s = bestlambda, newx = x)
  # total sum of squares (jkt) and residual sum of squares (jkg)
  jkt <- sum((y - mean(y))^2)
  jkg <- sum((y.duga - y)^2)
  # R-squared = 1 - SSE/SST
  advert.rsq <- 1 - jkg/jkt
  return(advert.rsq)
}
advert.rsq(advert.bestridge, advert.best.lr, advert.x, advert.y)
## [1] 0.9173818
The R-squared for this ridge model is \(0.9173818\).
advert.cv.l <- cv.glmnet(advert.x, advert.y, alpha=1);plot(advert.cv.l)
advert.best.ll <- advert.cv.l$lambda.min
advert.bestlasso <- glmnet(advert.x, advert.y, alpha = 1, lambda = advert.best.ll);coef(advert.bestlasso)
## 4 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 4.570646300
## TV 0.053337660
## Radio 0.108230692
## Newspaper 0.006725392
advert.rsq(advert.bestlasso, advert.best.ll, advert.x, advert.y)
## [1] 0.9232096
The R-squared for this lasso model is \(0.9232096\).
library(lmridge)
##
## Attaching package: 'lmridge'
## The following object is masked from 'package:car':
##
## vif
advert.lmr <- lmridge(Sales ~ ., data = advert.new, scaling="centered");plot(advert.lmr);vif(advert.lmr)
## TV Radio Newspaper
## k=0 0 3e-05 2e-05
# Residual standard error for a glmnet fit: sqrt(SSE / df)
rse <- function(bestmodel, x, y){
  train_predictions <- predict(bestmodel, newx = x)  # fitted values on the training data
  residuals <- y - train_predictions
  df <- length(y) - length(bestmodel$beta)           # n minus the number of slope coefficients
  residual_variance <- sum(residuals^2) / df
  rse <- sqrt(residual_variance)
  return(rse)
}
#RSE
advert.awal.rse <- 1.662
advert.nopencilan.rse <- 1.436
advert.fwd.rse <- 1.436
advert.lasso.rse <- rse(advert.bestlasso, advert.x, advert.y)
advert.ridge.rse <- rse(advert.bestridge, advert.x, advert.y)
advert.awal.rsq <- 0.9026
advert.nopencilan.rsq <- 0.9232
advert.fwd.rsq <- 0.9266
advert.lasso.rsq <- advert.rsq(advert.bestlasso, advert.best.ll, advert.x, advert.y)
advert.ridge.rsq <- advert.rsq(advert.bestridge, advert.best.lr, advert.x, advert.y)
advert.best <- matrix(c(advert.awal.rse, advert.awal.rsq, advert.nopencilan.rse, advert.nopencilan.rsq, advert.fwd.rse, advert.fwd.rsq, advert.lasso.rse, advert.lasso.rsq, advert.ridge.rse, advert.ridge.rsq), 2, 5)
row.names(advert.best)<- c("RSE", "RSQ")
colnames(advert.best) <- c("Awal", "Tanpa Pencilan", "Stepwise", "Lasso", "Ridge")
advert.best
## Awal Tanpa Pencilan Stepwise Lasso Ridge
## RSE 1.6620 1.4360 1.4360 1.4327493 1.4861230
## RSQ 0.9026 0.9232 0.9266 0.9232096 0.9173818
Based on these results, the best model is the stepwise model, which has the highest R-squared (92.66%) with an RSE of 1.436.
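As a final illustration (the budget values below are hypothetical, not from the data), the selected stepwise model can be used to predict Sales together with a prediction interval:
# Hypothetical budgets: TV = 150, Radio = 25 (illustrative values only)
predict(advert.step, newdata = data.frame(TV = 150, Radio = 25), interval = "prediction")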