This analysis investigates how advertising budgets, market size, and holiday weeks influence weekly sales performance for an e-commerce company. Using linear regression models in R, we evaluate both simple and multiple regression approaches to identify actionable insights.
# Read the sales data and encode the two categorical predictors as factors
# in a single step, so the modelling functions treat them as categorical.
sales_data <- transform(
  read.csv("e_commerce_sales_data.csv"),
  MarketSize = as.factor(MarketSize),
  HolidayWeek = as.factor(HolidayWeek)
)
# Inspect the resulting column types and factor levels
str(sales_data)
## 'data.frame': 52 obs. of 5 variables:
## $ Week : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Advertising: num 28.8 22 24.9 31.2 29.3 ...
## $ MarketSize : Factor w/ 3 levels "Large","Medium",..: 2 3 2 1 3 2 1 3 1 3 ...
## $ HolidayWeek: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 1 1 2 2 ...
## $ Sales : num 69.3 46.6 60.2 68.3 60.8 ...
# Descriptive statistics per column: min/quartiles/mean/max for the numeric
# columns and level counts for the factor columns (see the output below).
summary(sales_data)
## Week Advertising MarketSize HolidayWeek Sales
## Min. : 1.00 Min. : 7.24 Large :25 No :24 Min. :17.01
## 1st Qu.:13.75 1st Qu.:17.16 Medium:12 Yes:28 1st Qu.:37.86
## Median :26.50 Median :20.75 Small :15 Median :47.27
## Mean :26.50 Mean :20.63 Mean :46.23
## 3rd Qu.:39.25 3rd Qu.:24.43 3rd Qu.:53.84
## Max. :52.00 Max. :31.35 Max. :77.11
# Scatter plot of Sales against Advertising with a fitted least-squares
# line (no confidence band) to visualise the simple linear relationship.
ggplot(data = sales_data, mapping = aes(x = Advertising, y = Sales)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  ggtitle("Sales vs Advertising Budget") +
  xlab("Advertising (in $1000s)") +
  ylab("Sales (in $1000s)")
## `geom_smooth()` using formula = 'y ~ x'
# Simple linear regression: Sales explained by Advertising alone.
model_simple <- lm(
  formula = Sales ~ Advertising,
  data = sales_data
)
# Coefficient table, residual summary and fit statistics
summary(model_simple)
##
## Call:
## lm(formula = Sales ~ Advertising, data = sales_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.0493 -5.8952 0.7356 7.4668 16.7441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.8572 4.8823 1.814 0.0757 .
## Advertising 1.8121 0.2285 7.929 2.14e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.173 on 50 degrees of freedom
## Multiple R-squared: 0.557, Adjusted R-squared: 0.5482
## F-statistic: 62.88 on 1 and 50 DF, p-value: 2.139e-10
# Multiple linear regression: Advertising plus the two categorical
# predictors. The factor coefficients are reported relative to each factor's
# first level ("Large" for MarketSize, "No" for HolidayWeek).
model_multiple <- lm(
  formula = Sales ~ Advertising + MarketSize + HolidayWeek,
  data = sales_data
)
# Coefficient table, residual summary and fit statistics
summary(model_multiple)
##
## Call:
## lm(formula = Sales ~ Advertising + MarketSize + HolidayWeek,
## data = sales_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.237 -5.153 1.178 5.034 14.213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0106 4.8336 1.244 0.21985
## Advertising 1.9287 0.2178 8.855 1.39e-11 ***
## MarketSizeMedium -6.7073 3.5317 -1.899 0.06369 .
## MarketSizeSmall -8.8684 3.2713 -2.711 0.00934 **
## HolidayWeekYes 8.4428 2.9542 2.858 0.00634 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.571 on 47 degrees of freedom
## Multiple R-squared: 0.6364, Adjusted R-squared: 0.6055
## F-statistic: 20.57 on 4 and 47 DF, p-value: 7.537e-10
# Report goodness of fit for both models. The adjusted R-squared is printed
# for the multiple model, which has more than one predictor.
cat(
  "Simple Model R-squared: ",
  summary(model_simple)[["r.squared"]],
  "\n"
)
## Simple Model R-squared: 0.557034
cat(
  "Multiple Model R-squared: ",
  summary(model_multiple)[["r.squared"]],
  "\n"
)
## Multiple Model R-squared: 0.6364134
cat(
  "Multiple Model Adjusted R-squared: ",
  summary(model_multiple)[["adj.r.squared"]],
  "\n"
)
## Multiple Model Adjusted R-squared: 0.6054699
# Store the multiple-regression predictions alongside the observed data.
# Passing `newdata` keeps the result aligned row-for-row with `sales_data`:
# a row with a missing predictor yields NA instead of being dropped, which
# would otherwise make this assignment fail with a length mismatch.
sales_data$PredictedSales <- predict(model_multiple, newdata = sales_data)

# Observed sales (points) with the model's fitted values (dashed lines).
# The model is additive in MarketSize and HolidayWeek, so the fitted values
# lie on one parallel line per MarketSize x HolidayWeek combination. The
# lines are grouped by that interaction; grouping by colour (MarketSize)
# alone would join holiday and non-holiday predictions into a zigzag.
ggplot(sales_data, aes(x = Advertising, y = Sales, color = MarketSize)) +
  geom_point() +
  geom_line(
    aes(y = PredictedSales, group = interaction(MarketSize, HolidayWeek)),
    linetype = "dashed"
  ) +
  labs(title = "Multiple Regression Predictions by Market Size", x = "Advertising", y = "Sales")
# Export week, observed sales and model predictions to CSV, without the
# row-name column.
write.csv(
  x = data.frame(
    Week = sales_data[["Week"]],
    ActualSales = sales_data[["Sales"]],
    PredictedSales = sales_data[["PredictedSales"]]
  ),
  file = "predicted_sales_results.csv",
  row.names = FALSE
)