# How does advertising spending on TV, radio, and newspapers impact sales performance?
library(readxl)
mydata <- read_excel("C:/Users/PC/Desktop/Advertising.xlsx")
## New names:
## • `` -> `...1`
library(dplyr)
# Next we will show the first 10 observations.
head(mydata, 10)
## # A tibble: 10 × 5
## ...1 TV radio newspaper sales
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 230. 37.8 69.2 22.1
## 2 2 44.5 39.3 45.1 10.4
## 3 3 17.2 45.9 69.3 9.3
## 4 4 152. 41.3 58.5 18.5
## 5 5 181. 10.8 58.4 12.9
## 6 6 8.7 48.9 75 7.2
## 7 7 57.5 32.8 23.5 11.8
## 8 8 120. 19.6 11.6 13.2
## 9 9 8.6 2.1 1 4.8
## 10 10 200. 2.6 21.2 10.6
library(pastecs)
summary_stats <- round(stat.desc(mydata[-1]), 2)
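# The full stat.desc() output is wide, so as a quick look one might print only a couple of its rows; a minimal sketch, assuming the default pastecs row names "mean" and "std.dev":
summary_stats[c("mean", "std.dev"), ]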
# Now we create a scatterplot matrix to inspect the relationships between the explanatory variables. Later we will also check for multicollinearity and whether the errors are normally distributed.
pairs(mydata[-1], main = "Scatterplot Matrix")
#Correlation matrix
library(Hmisc)
rcorr(as.matrix(mydata[-1]))
## TV radio newspaper sales
## TV 1.00 0.05 0.06 0.78
## radio 0.05 1.00 0.35 0.58
## newspaper 0.06 0.35 1.00 0.23
## sales 0.78 0.58 0.23 1.00
##
## n= 200
##
##
## P
## TV radio newspaper sales
## TV 0.4408 0.4256 0.0000
## radio 0.4408 0.0000 0.0000
## newspaper 0.4256 0.0000 0.0011
## sales 0.0000 0.0000 0.0011
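# For reference, the coefficients and p-values discussed below can also be extracted programmatically; a minimal sketch, relying on rcorr() returning a list with components r, n and P:
cor_res <- rcorr(as.matrix(mydata[-1]))
round(cor_res$r, 2)  # correlation coefficients
round(cor_res$P, 4)  # p-values (the diagonal is NA)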
# The explanatory variable TV is only weakly and positively correlated with radio (0.05) and newspaper (0.06), and the corresponding p-values (0.44 and 0.43) mean we cannot reject the null hypothesis of no correlation; radio and newspaper, however, show a moderate positive correlation (0.35, p < 0.001). All three predictors are positively correlated with sales.
# Next we fit the regression model to examine the relationship between the dependent and the independent variables.
fit1 <- lm(sales ~ TV + radio + newspaper, data = mydata)
library(car)
vif_values <- vif(fit1)
vif(fit1)
## TV radio newspaper
## 1.004611 1.144952 1.145187
mean(vif(fit1))
## [1] 1.09825
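# As a cross-check, each VIF equals 1 / (1 - R^2) from regressing that predictor on the other predictors; a minimal sketch for TV (purely illustrative):
r2_tv <- summary(lm(TV ~ radio + newspaper, data = mydata))$r.squared
1 / (1 - r2_tv)  # should match vif(fit1)["TV"]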
# All VIF values are well below 5, so multicollinearity is not a concern; the mean VIF is also close to 1, which is in our favor.
# Next we check whether there are any outliers or highly influential units.
mydata$stdresid <- round(rstandard(fit1), 3)
mydata$CooksD <- round(cooks.distance(fit1), 3)
hist(mydata$stdresid,
xlab = "Standardized Residuals",
ylab = "Frequency",
main = "Histogram of Standardized Residuals")
shapiro.test(mydata$stdresid)
##
## Shapiro-Wilk normality test
##
## data: mydata$stdresid
## W = 0.91661, p-value = 3.31e-09
# The standardized residuals include values below -3, and the Shapiro-Wilk test (p < 0.001) lets us reject the null hypothesis that the errors are normally distributed.
# Now we show the histogram of Cook's distances.
hist(mydata$CooksD,
xlab = "Cook's Distance",
ylab = "Frequency",
main = "Histogram of Cook's Distance")
# The histogram shows a gap between roughly 0.15 and 0.25, which points to one unusually influential observation and is a problem for us.
# Now we show the three units with the lowest standardized residuals.
head(mydata[order(mydata$stdresid),], 3)
## # A tibble: 3 × 7
## ...1 TV radio newspaper sales stdresid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 131 0.7 39.6 8.7 1.6 -5.34 0.273
## 2 6 8.7 48.9 75 7.2 -3.21 0.128
## 3 36 291. 4.1 8.5 12.8 -2.54 0.051
# Now we show the six units with the highest Cook's distance.
head(mydata[order(-mydata$CooksD),], 6)
## # A tibble: 6 × 7
## ...1 TV radio newspaper sales stdresid CooksD
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 131 0.7 39.6 8.7 1.6 -5.34 0.273
## 2 6 8.7 48.9 75 7.2 -3.21 0.128
## 3 76 16.9 43.7 89.4 8.7 -1.93 0.056
## 4 36 291. 4.1 8.5 12.8 -2.54 0.051
## 5 179 277. 2.3 23.7 11.8 -2.53 0.046
## 6 127 7.8 38.9 50.6 6.6 -2.39 0.04
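# The flagged rows can also be selected programmatically using common rule-of-thumb cutoffs; a minimal sketch, assuming |standardized residual| > 3 and Cook's distance > 4/n as (non-universal) thresholds:
n <- nrow(mydata)
subset(mydata, abs(stdresid) > 3)  # extreme standardized residuals
subset(mydata, CooksD > 4 / n)     # influential points by the 4/n rule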
# Based on the results we remove rows 131, 6 and 76.
rows_to_remove <- c(131, 6, 76)
mydata <- mydata[-rows_to_remove, ]
# Now we refit the model on the reduced data and re-estimate the diagnostics.
fit1 <- lm(sales ~ TV + radio + newspaper, data = mydata)
mydata$stdresid <- round(rstandard(fit1), 3)
mydata$CooksD <- round(cooks.distance(fit1), 3)
hist(mydata$stdresid,
xlab = "Standardized Residuals",
ylab = "Frequency",
main = "Histogram of Standardized Residuals")
shapiro.test(mydata$stdresid)
##
## Shapiro-Wilk normality test
##
## data: mydata$stdresid
## W = 0.91661, p-value = 3.31e-09
hist(mydata$CooksD,
xlab = "Cook's Distance",
ylab = "Frequency",
main = "Histogram of Cook's Distance")
# Since we have a large sample, we are not concerned about the remaining units with slightly elevated standardized residuals or Cook's distances.
# Now we check the relationship between the standardized residuals and the standardized fitted values; this shows whether the homoscedasticity and linearity assumptions are met.
library(car)
mydata$Stdfitted <- scale(fit1$fitted.values)
scatterplot(y = mydata$stdresid, x = mydata$Stdfitted,
            ylab = "Standardized Residuals",
            xlab = "Standardized Fitted Values",
            boxplot = FALSE, regLine = FALSE, smooth = FALSE)
# Next we run the Breusch-Pagan test to check for heteroskedasticity.
library(olsrr)
ols_test_breusch_pagan(fit1)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## ---------------------------------
## Response : sales
## Variables: fitted values of sales
##
## Test Summary
## -----------------------------
## DF = 1
## Chi2 = 5.355982
## Prob > Chi2 = 0.02065131
# The p-value (0.021) is below 0.05, so at the 5% significance level we reject the null hypothesis of constant variance; there is some evidence of heteroskedasticity (though not at the 1% level).
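# As a cross-check, a minimal sketch of the Koenker (studentized) Breusch-Pagan statistic, n * R^2 from regressing the squared residuals on the fitted values; depending on which variant olsrr computes, it may differ slightly from the Chi2 reported above:
aux <- lm(I(resid(fit1)^2) ~ fitted(fit1))
bp_stat <- length(resid(fit1)) * summary(aux)$r.squared
pchisq(bp_stat, df = 1, lower.tail = FALSE)  # p-value of the chi-squared test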
summary(fit1)
##
## Call:
## lm(formula = sales ~ TV + radio + newspaper, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## radio 0.188530 0.008611 21.893 <2e-16 ***
## newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# The regression results show that only newspaper is not statistically significant (p = 0.86), so its effect cannot be distinguished from zero. The TV coefficient (0.0458, p < 0.001) implies that for every 100 euros spent on TV ads the company sells on average about 4.6 more units, all other factors held constant. The F-statistic (p < 2.2e-16) and the high R-squared (0.897) indicate that at least one coefficient differs from zero and that the model explains most of the variation in sales.
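# A minimal sketch of putting the coefficients to use; the budget below is made up purely for illustration:
coef(fit1)["TV"] * 100  # expected extra sales from 100 more spent on TV, other factors fixed
new_spend <- data.frame(TV = 150, radio = 25, newspaper = 20)  # hypothetical budget
predict(fit1, newdata = new_spend, interval = "confidence")    # predicted sales with a confidence interval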
sqrt(summary(fit1)$r.squared)
## [1] 0.947212
# The square root of R-squared is the multiple correlation coefficient (0.95), indicating a strong linear relationship between the dependent and independent variables.
library(lm.beta)
lm.beta(fit1)
##
## Call:
## lm(formula = sales ~ TV + radio + newspaper, data = mydata)
##
## Standardized Coefficients::
## (Intercept) TV radio newspaper
## NA 0.753065912 0.536481550 -0.004330686
# The standardized coefficients show which variables contribute the most to R-squared: the larger the absolute coefficient, the more important the variable is in explaining sales. In our case TV contributes the most, followed by radio.
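# As a cross-check, the standardized coefficients can be reproduced by refitting the model on z-scored variables; a minimal sketch:
std_data <- as.data.frame(scale(mydata[, c("sales", "TV", "radio", "newspaper")]))
fit_std <- lm(sales ~ TV + radio + newspaper, data = std_data)
coef(fit_std)  # should match the lm.beta() output (the intercept is ~0 by construction)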