#How do advertising spendings in TV, radio, and newspapers impact sales performance?

library(readxl)

## Warning: package 'readxl' was built under R version 4.3.3

Advertising <- read_excel("C:/Users/PC/Desktop/Advertising.xlsx")

## New names:
## • `` -> `...1`

mydata <- read_excel("C:/Users/PC/Desktop/Advertising.xlsx")

## New names:
## • `` -> `...1`

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Next we will show the first 10 units

head(mydata, 10)

## # A tibble: 10 × 5
##     ...1    TV radio newspaper sales
##    <dbl> <dbl> <dbl>     <dbl> <dbl>
##  1     1 230.   37.8      69.2  22.1
##  2     2  44.5  39.3      45.1  10.4
##  3     3  17.2  45.9      69.3   9.3
##  4     4 152.   41.3      58.5  18.5
##  5     5 181.   10.8      58.4  12.9
##  6     6   8.7  48.9      75     7.2
##  7     7  57.5  32.8      23.5  11.8
##  8     8 120.   19.6      11.6  13.2
##  9     9   8.6   2.1       1     4.8
## 10    10 200.    2.6      21.2  10.6

ID = Compnay, TV = Amount of money spent on TV ads (hundreds),Radio = Amount of money spent on radio ads (hundreds), newspaper = amount of money spent on newspaper ads (hundreds), sales = the units of products they sell.

library(pastecs)

## Warning: package 'pastecs' was built under R version 4.3.3

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

summary_stats <- round(stat.desc(mydata[-1]), 2)

#Now we will create a scatterplot matrix to check the relationship between explanatory variables. Additionally we will check for multicoliniarity and whether the errors are normally distributed.

pairs(mydata[-1], main = "Scatterplot Matrix")

#Correlation matrix

library(Hmisc)

## Warning: package 'Hmisc' was built under R version 4.3.3

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

rcorr(as.matrix(mydata[,c(-1,-1)]))

##             TV radio newspaper sales
## TV        1.00  0.05      0.06  0.78
## radio     0.05  1.00      0.35  0.58
## newspaper 0.06  0.35      1.00  0.23
## sales     0.78  0.58      0.23  1.00
## 
## n= 200 
## 
## 
## P
##           TV     radio  newspaper sales 
## TV               0.4408 0.4256    0.0000
## radio     0.4408        0.0000    0.0000
## newspaper 0.4256 0.0000           0.0011
## sales     0.0000 0.0000 0.0011

#Here we can see that there is positive and weak correlation between explanatory variables radio and TV. Also the p-value is showing us that we cannot reject the null hypothesis which states that there is no correlation. #Next we are going to fit the regression model to see the relationship between the dependent and the independent variables.

fit1 <- lm(sales ~ TV + radio + newspaper, data = mydata)
library(car)

## Warning: package 'car' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

vif_values <- vif(fit1)
vif(fit1)

##        TV     radio newspaper 
##  1.004611  1.144952  1.145187

mean(vif(fit1))

## [1] 1.09825

#In our case we got good values beacuase we checked for multicolinearity and our values were all less than 5. On the other hand our mean is close to 1 which is in our favor. #Next we will do tests to check whether we have any outliers or any high value units.

mydata$stdresid <- round(rstandard(fit1), 3)
mydata$CooksD <- round(cooks.distance(fit1), 3)
hist(mydata$stdresid, 
      xlab = "Standardized Residuals", 
      ylab = "Frequency", 
      main = "Histogram of Standardized Residuals")

shapiro.test(mydata$stdresid)

## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$stdresid
## W = 0.91661, p-value = 3.31e-09

#Based on the standardized residuals we can see that we have values that are under -3, and shapiro wilk-test shows us that we can reject the null hypothesis which is ” errors are normally distributed” #Now we will show the histogram of Cooks Distances.

 hist(mydata$CooksD, 
      xlab = "Cook's Distance", 
      ylab = "Frequency", 
      main = "Histogram of Cook's Distance")

#In this test we can see a gap between 0.15 and 0.25 which is a problem for us. #Now we will show the three units with the lowest st. residuals.

head(mydata[order(mydata$stdresid),], 3)

## # A tibble: 3 × 7
##    ...1    TV radio newspaper sales stdresid CooksD
##   <dbl> <dbl> <dbl>     <dbl> <dbl>    <dbl>  <dbl>
## 1   131   0.7  39.6       8.7   1.6    -5.34  0.273
## 2     6   8.7  48.9      75     7.2    -3.21  0.128
## 3    36 291.    4.1       8.5  12.8    -2.54  0.051

#Now showing the six units with the highest cooks distance.

head(mydata[order(-mydata$CooksD),], 6)

## # A tibble: 6 × 7
##    ...1    TV radio newspaper sales stdresid CooksD
##   <dbl> <dbl> <dbl>     <dbl> <dbl>    <dbl>  <dbl>
## 1   131   0.7  39.6       8.7   1.6    -5.34  0.273
## 2     6   8.7  48.9      75     7.2    -3.21  0.128
## 3    76  16.9  43.7      89.4   8.7    -1.93  0.056
## 4    36 291.    4.1       8.5  12.8    -2.54  0.051
## 5   179 277.    2.3      23.7  11.8    -2.53  0.046
## 6   127   7.8  38.9      50.6   6.6    -2.39  0.04

#Based on the results we want to remove row number 131, 6 and 76.

row_to_remove <- 131
row_to_remove <- 6
row_to_remove <- 76

#Now we are going to fit the model again so we can reestimate.

fit1 <- lm(sales ~ TV + radio + newspaper, data = mydata)
mydata$stdresid <- round(rstandard(fit1), 3)
 mydata$CooksD <- round(cooks.distance(fit1), 3)
 hist(mydata$stdresid, 
      xlab = "Standardized Residuals", 
      ylab = "Frequency", 
      main = "Histogram of Standardized Residuals")

 shapiro.test(mydata$stdresid)

## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$stdresid
## W = 0.91661, p-value = 3.31e-09

hist(mydata$CooksD, 
     xlab = "Cook's Distance", 
     ylab = "Frequency", 
     main = "Histogram of Cook's Distance")

#Since we have a large sample we are not concerned about the other values that might have a slightly higher st. residuals or cooks distance. #Now we are going to check the relationship between standardized residuals and standardized fitted values. This will show us whether homoscedasticity and linearity is met.

library(car)
mydata$Stdfitted <- scale(fit1$fitted.values)
scatterplot(y = mydata$stdresid, x = mydata$Stdfitted, ylab = "Standardized Residuals", xlab = "Standardized Fitted Values", boxplot = FALSE, regLine = FALSE, smooth = FALSE)

#Next we are going to run the breusch pagan test to check for heteroscadasticity with the p-value.

library(olsrr)

## Warning: package 'olsrr' was built under R version 4.3.3

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

ols_test_breusch_pagan(fit1)

## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data                
##  ---------------------------------
##  Response : sales 
##  Variables: fitted values of sales 
## 
##         Test Summary          
##  -----------------------------
##  DF            =    1 
##  Chi2          =    5.355982 
##  Prob > Chi2   =    0.02065131

#Based on the p-value we cannot reject the null hypothesis.So the variance is constant.

summary(fit1)

## 
## Call:
## lm(formula = sales ~ TV + radio + newspaper, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8277 -0.8908  0.2418  1.1893  2.8292 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.938889   0.311908   9.422   <2e-16 ***
## TV           0.045765   0.001395  32.809   <2e-16 ***
## radio        0.188530   0.008611  21.893   <2e-16 ***
## newspaper   -0.001037   0.005871  -0.177     0.86    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8956 
## F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

lm(formula = sales ~ TV + radio + newspaper, data = mydata)

## 
## Call:
## lm(formula = sales ~ TV + radio + newspaper, data = mydata)
## 
## Coefficients:
## (Intercept)           TV        radio    newspaper  
##    2.938889     0.045765     0.188530    -0.001037

#The results of the regression is showing us that only newspaper cannot be explained because it is not significant p-value > 0.1. The model shows that for every 100 euros spent on TV ads, the company on average will sell 4 more units, all other factors the same. P<0.001. Based on the Multiple R-squared and p-value we can say that at least one coefficient is different from zero.

sqrt(summary(fit1)$r.squared)

## [1] 0.947212

#We have a linear relationship between the dependent and independent variables.

library(lm.beta)

## Warning: package 'lm.beta' was built under R version 4.3.3

lm.beta(fit1)

## 
## Call:
## lm(formula = sales ~ TV + radio + newspaper, data = mydata)
## 
## Standardized Coefficients::
##  (Intercept)           TV        radio    newspaper 
##           NA  0.753065912  0.536481550 -0.004330686

#With this function we showed which variables contributed the most to r2. The larger the coefficient the more important on explaining. In our case TV is the one that variable that contributed the most. ```{r

Advertisment

Granit Uka

2024-07-23

ID = Compnay, TV = Amount of money spent on TV ads (hundreds),Radio = Amount of money spent on radio ads (hundreds), newspaper = amount of money spent on newspaper ads (hundreds), sales = the units of products they sell.