We start by loading the necessary libraries and importing the dataset obtained from Kaggle.
## Loading required libraries
library(readr) # For reading CSV files
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(modelr) # For working with models and residuals
library(tseries) # Statistical tests
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lmtest) # Heteroscedasticity tests
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # VIF calculation
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2) # For plots
## Loading the dataset
df <- read.csv("sales_data.csv")
head(df)
## Product_ID Sale_Date Sales_Rep Region Sales_Amount Quantity_Sold
## 1 1052 2023-02-03 Bob North 5053.97 18
## 2 1093 2023-04-21 Bob West 4384.02 17
## 3 1015 2023-09-21 David South 4631.23 30
## 4 1072 2023-08-24 Bob South 2167.94 39
## 5 1061 2023-03-24 Charlie East 3750.20 13
## 6 1021 2023-02-11 Charlie West 3761.15 32
## Product_Category Unit_Cost Unit_Price Customer_Type Discount Payment_Method
## 1 Furniture 152.75 267.22 Returning 0.09 Cash
## 2 Furniture 3816.39 4209.44 Returning 0.11 Cash
## 3 Food 261.56 371.40 Returning 0.20 Bank Transfer
## 4 Clothing 4330.03 4467.75 New 0.02 Credit Card
## 5 Electronics 637.37 692.71 New 0.08 Credit Card
## 6 Food 900.79 1106.51 New 0.21 Cash
## Sales_Channel Region_and_Sales_Rep
## 1 Online North-Bob
## 2 Retail West-Bob
## 3 Retail South-David
## 4 Retail South-Bob
## 5 Online East-Charlie
## 6 Online West-Charlie
We examine the dataset structure and check for missing values.
## Dataset structure
glimpse(df)
## Rows: 1,000
## Columns: 14
## $ Product_ID <int> 1052, 1093, 1015, 1072, 1061, 1021, 1083, 1087, 1…
## $ Sale_Date <chr> "2023-02-03", "2023-04-21", "2023-09-21", "2023-0…
## $ Sales_Rep <chr> "Bob", "Bob", "David", "Bob", "Charlie", "Charlie…
## $ Region <chr> "North", "West", "South", "South", "East", "West"…
## $ Sales_Amount <dbl> 5053.97, 4384.02, 4631.23, 2167.94, 3750.20, 3761…
## $ Quantity_Sold <int> 18, 17, 30, 39, 13, 32, 29, 46, 30, 18, 13, 43, 2…
## $ Product_Category <chr> "Furniture", "Furniture", "Food", "Clothing", "El…
## $ Unit_Cost <dbl> 152.75, 3816.39, 261.56, 4330.03, 637.37, 900.79,…
## $ Unit_Price <dbl> 267.22, 4209.44, 371.40, 4467.75, 692.71, 1106.51…
## $ Customer_Type <chr> "Returning", "Returning", "Returning", "New", "Ne…
## $ Discount <dbl> 0.09, 0.11, 0.20, 0.02, 0.08, 0.21, 0.14, 0.12, 0…
## $ Payment_Method <chr> "Cash", "Cash", "Bank Transfer", "Credit Card", "…
## $ Sales_Channel <chr> "Online", "Retail", "Retail", "Retail", "Online",…
## $ Region_and_Sales_Rep <chr> "North-Bob", "West-Bob", "South-David", "South-Bo…
## Checking for missing values
colSums(is.na(df))
## Product_ID Sale_Date Sales_Rep
## 0 0 0
## Region Sales_Amount Quantity_Sold
## 0 0 0
## Product_Category Unit_Cost Unit_Price
## 0 0 0
## Customer_Type Discount Payment_Method
## 0 0 0
## Sales_Channel Region_and_Sales_Rep
## 0 0
We generate summary statistics for all numeric variables to identify ranges, averages, and possible outliers.
summary(df)
## Product_ID Sale_Date Sales_Rep Region
## Min. :1001 Length:1000 Length:1000 Length:1000
## 1st Qu.:1024 Class :character Class :character Class :character
## Median :1051 Mode :character Mode :character Mode :character
## Mean :1050
## 3rd Qu.:1075
## Max. :1100
## Sales_Amount Quantity_Sold Product_Category Unit_Cost
## Min. : 100.1 Min. : 1.00 Length:1000 Min. : 60.28
## 1st Qu.:2550.3 1st Qu.:13.00 Class :character 1st Qu.:1238.38
## Median :5019.3 Median :25.00 Mode :character Median :2467.24
## Mean :5019.3 Mean :25.36 Mean :2475.30
## 3rd Qu.:7507.4 3rd Qu.:38.00 3rd Qu.:3702.86
## Max. :9989.0 Max. :49.00 Max. :4995.30
## Unit_Price Customer_Type Discount Payment_Method
## Min. : 167.1 Length:1000 Min. :0.0000 Length:1000
## 1st Qu.:1509.1 Class :character 1st Qu.:0.0800 Class :character
## Median :2696.4 Mode :character Median :0.1500 Mode :character
## Mean :2728.4 Mean :0.1524
## 3rd Qu.:3958.0 3rd Qu.:0.2300
## Max. :5442.1 Max. :0.3000
## Sales_Channel Region_and_Sales_Rep
## Length:1000 Length:1000
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Since the dataset did not include a profit variable, we define:
\[ Profit = Sales\_Amount - (Unit\_Cost \times Quantity\_Sold) \]
df$Profit <- df$Sales_Amount - (df$Unit_Cost * df$Quantity_Sold)
## Initial linear regression model
model <- lm(Profit ~ Quantity_Sold + Discount + Sales_Amount + Unit_Cost + Unit_Price,
data = df)
## Summary
summary(model)
##
## Call:
## lm(formula = Profit ~ Quantity_Sold + Discount + Sales_Amount +
## Unit_Cost + Unit_Price, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57644 -11145 663 11459 57897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59597.9687 2582.3124 23.079 < 2e-16 ***
## Quantity_Sold -2487.0545 45.2188 -55.000 < 2e-16 ***
## Discount -2159.8801 7323.1587 -0.295 0.768
## Sales_Amount 1.1332 0.2245 5.047 5.34e-07 ***
## Unit_Cost -32.7155 4.5359 -7.213 1.09e-12 ***
## Unit_Price 7.4257 4.5320 1.638 0.102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20170 on 994 degrees of freedom
## Multiple R-squared: 0.8688, Adjusted R-squared: 0.8681
## F-statistic: 1316 on 5 and 994 DF, p-value: < 2.2e-16
We calculate the Variance Inflation Factor (VIF) to detect redundant predictors.
vif(model)
## Quantity_Sold Discount Sales_Amount Unit_Cost Unit_Price
## 1.006190 1.000931 1.002920 101.527482 101.569515
Due to high collinearity between Unit_Cost and
Unit_Price, we create a simplified model:
model2 <- lm(Profit ~ Quantity_Sold + Unit_Price,
data = df)
summary(model2)
##
## Call:
## lm(formula = Profit ~ Quantity_Sold + Unit_Price, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58835 -11658 742 12097 58706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72605.2317 1816.8238 39.96 <2e-16 ***
## Quantity_Sold -2486.3554 46.8448 -53.08 <2e-16 ***
## Unit_Price -25.0643 0.4673 -53.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20930 on 997 degrees of freedom
## Multiple R-squared: 0.8583, Adjusted R-squared: 0.858
## F-statistic: 3020 on 2 and 997 DF, p-value: < 2.2e-16
We calculate residuals and visualize their distribution.
par(mfrow=c(2,2))
plot(model2)
df <- df %>%
add_residuals(model2)
## Summary of residuals
summary(df$resid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -58834.8 -11657.9 741.9 0.0 12096.8 58705.8
## Histogram of residuals
hist(model2$resid,
main = "Histogram of Residuals",
xlab = "Residuals",
col = "lightblue",
breaks = 15)
## Scatterplots with regression line
ggplot(df, aes(x = Quantity_Sold, y = Profit)) +
geom_point(alpha = 0.4, color = "darkblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Profit vs Quantity Sold", x = "Quantity Sold", y = "Profit")
## `geom_smooth()` using formula = 'y ~ x'
jarque.bera.test(df$resid)
##
## Jarque Bera Test
##
## data: df$resid
## X-squared = 2.0763, df = 2, p-value = 0.3541
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 1.3417, df = 2, p-value = 0.5113
The adjusted linear regression model predicts Profit
using Quantity_Sold and Unit_Price. The
regression equation is:
\[ \text{Profit} = 72,605.23 - 2,486.36 \times \text{Quantity\_Sold} - 25.06 \times \text{Unit\_Price} \]
Key interpretations:
Residuals summary:
Model diagnostics (graphs):
Conclusion:
Overall, the adjusted linear regression model satisfies classical
assumptions (linearity, normality of residuals, homoscedasticity),
exhibits high explanatory power (Adjusted R² ≈ 0.86), and provides a
reliable predictive relationship for Profit based on
sales quantity and unit price. The combination of coefficient
significance and visual diagnostics supports the use of this model for
managerial decision-making regarding pricing and sales strategies, while
minor deviations suggest monitoring potential outliers or non-linear
effects in future analyses.