This project analyzes the Superstore Sales dataset to understand business performance and identify key insights related to Sales, Profit, Discount, and Product Categories. Using statistical analysis, machine learning models, and visualizations, we explored how various factors affect profit and sales patterns.
Removed missing values and ensured numeric columns were properly formatted.
Found overall distribution and correlation between Sales, Quantity, Discount, and Profit.
Studied how Sales, Quantity, and Discount impact Profit.
Checked whether different Product Categories show a significant difference in average Profit.
Grouped data into clusters based on Sales and Profit to identify similar sales-performance patterns.
Predicted whether a transaction’s Profit will be High or Low based on other factors.
Created multiple charts showing which categories and sub-categories generate the highest sales and profit, how discount affects profit, and the overall distribution of sales.
# Load Required Libraries
packages <- c("tidyverse","ggplot2","cluster","factoextra",
"class","caret","corrplot","dplyr","gridExtra")
for(p in packages){
if(!require(p, character.only=TRUE)){
install.packages(p, dependencies = TRUE)
library(p, character.only=TRUE)
}
}
# Load Dataset
df <- read.csv("~/Downloads/Sample - Superstore.csv", stringsAsFactors = FALSE)
# Data Cleaning
df$Sales <- as.numeric(df$Sales)
df$Quantity <- as.numeric(df$Quantity)
df$Discount <- as.numeric(df$Discount)
df$Profit <- as.numeric(df$Profit)
df <- df %>% filter(!is.na(Sales), !is.na(Profit))
cat("\n\n Q1: What is the overall distribution and summary of Sales, Quantity, Discount, and Profit?\n\n")
##
##
## Q1: What is the overall distribution and summary of Sales, Quantity, Discount, and Profit?
print(summary(df[,c("Sales","Quantity","Discount","Profit")]))
## Sales Quantity Discount Profit
## Min. : 0.444 Min. : 1.00 Min. :0.0000 Min. :-6599.978
## 1st Qu.: 17.280 1st Qu.: 2.00 1st Qu.:0.0000 1st Qu.: 1.729
## Median : 54.490 Median : 3.00 Median :0.2000 Median : 8.666
## Mean : 229.858 Mean : 3.79 Mean :0.1562 Mean : 28.657
## 3rd Qu.: 209.940 3rd Qu.: 5.00 3rd Qu.:0.2000 3rd Qu.: 29.364
## Max. :22638.480 Max. :14.00 Max. :0.8000 Max. : 8399.976
corrplot(cor(df[,c("Sales","Quantity","Discount","Profit")]), method="color")
cat("\n\n Q2: How does Sales, Quantity, and Discount affect Profit? (Regression)\n\n")
##
##
## Q2: How does Sales, Quantity, and Discount affect Profit? (Regression)
lm_model <- lm(Profit ~ Sales + Quantity + Discount, data=df)
print(summary(lm_model))
##
## Call:
## lm(formula = Profit ~ Sales + Quantity + Discount, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7266.8 -23.8 -0.4 25.6 5229.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.497e+01 4.218e+00 8.291 < 2e-16 ***
## Sales 1.800e-01 3.275e-03 54.961 < 2e-16 ***
## Quantity -2.962e+00 9.171e-01 -3.230 0.00124 **
## Discount -2.335e+02 9.686e+00 -24.101 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 199.8 on 9990 degrees of freedom
## Multiple R-squared: 0.2727, Adjusted R-squared: 0.2725
## F-statistic: 1249 on 3 and 9990 DF, p-value: < 2.2e-16
cat("\n\n Q3: Does Profit vary significantly across different Product Categories? (ANOVA)\n\n")
##
##
## Q3: Does Profit vary significantly across different Product Categories? (ANOVA)
anova_model <- aov(Profit ~ Category, data=df)
print(summary(anova_model))
## Df Sum Sq Mean Sq F value Pr(>F)
## Category 2 5898009 2949004 54.31 <2e-16 ***
## Residuals 9991 542495827 54298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n\n Q4: Can we cluster customers based on Sales and Profit? (K-Means)\n\n")
##
##
## Q4: Can we cluster customers based on Sales and Profit? (K-Means)
cluster_df <- scale(df[,c("Sales","Profit")])
set.seed(123)
km <- kmeans(cluster_df, centers=3)
df$Cluster <- as.factor(km$cluster)
ggplot(df, aes(Sales, Profit, color=Cluster)) +
geom_point(alpha=0.7) + theme_minimal() +
labs(title="K-Means Clustering (Sales vs Profit)")
cat("\n\n Q5: Can we predict if Profit will be High or Low using KNN?\n\n")
##
##
## Q5: Can we predict if Profit will be High or Low using KNN?
df$ProfitLevel <- factor(ifelse(df$Profit > median(df$Profit),"High","Low"))
set.seed(123)
idx <- sample(1:nrow(df), 0.7*nrow(df))
train <- df[idx, ]; test <- df[-idx, ]
knn_pred <- knn(train[,c("Sales","Quantity","Discount")],
test[,c("Sales","Quantity","Discount")],
train$ProfitLevel, k=5)
print(confusionMatrix(knn_pred, test$ProfitLevel))
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 1255 466
## Low 229 1049
##
## Accuracy : 0.7683
## 95% CI : (0.7527, 0.7833)
## No Information Rate : 0.5052
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5372
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8457
## Specificity : 0.6924
## Pos Pred Value : 0.7292
## Neg Pred Value : 0.8208
## Prevalence : 0.4948
## Detection Rate : 0.4185
## Detection Prevalence : 0.5739
## Balanced Accuracy : 0.7690
##
## 'Positive' Class : High
##
cat("\n\n Q6: Which Category generates the highest Sales?\n\n")
##
##
## Q6: Which Category generates the highest Sales?
sales_summary <- df %>% group_by(Category) %>%
summarise(TotalSales=sum(Sales)) %>% arrange(desc(TotalSales))
p1 <- ggplot(sales_summary, aes(Category, TotalSales, fill=Category)) +
geom_col() + theme_minimal() + labs(title="Total Sales by Category")
grid.arrange(p1, tableGrob(sales_summary), ncol=2)
cat("\n\n Q7: Which Sub-Category contributes most to Profit?\n\n")
##
##
## Q7: Which Sub-Category contributes most to Profit?
profit_summary <- df %>% group_by(Sub.Category) %>%
summarise(TotalProfit=sum(Profit)) %>% arrange(desc(TotalProfit))
p2 <- ggplot(profit_summary, aes(Sub.Category, TotalProfit, fill=Sub.Category)) +
geom_col() + theme_minimal() +
theme(axis.text.x = element_text(angle=90)) +
labs(title="Total Profit by Sub-Category")
grid.arrange(p2, tableGrob(head(profit_summary,10)), ncol=2)
cat("\n\n Q8: What is the relationship between Discount and Profit?\n\n")
##
##
## Q8: What is the relationship between Discount and Profit?
p3 <- ggplot(df, aes(Discount, Profit)) +
geom_point(alpha=0.5, color="darkblue") +
geom_smooth(color="red") + theme_minimal() +
labs(title="Impact of Discount on Profit")
p3
cat("\n\n Q9: How are Sales distributed across transactions?\n\n")
##
##
## Q9: How are Sales distributed across transactions?
p4 <- ggplot(df, aes(Sales)) +
geom_histogram(bins=40, fill="lightgreen", color="black") +
theme_minimal() + labs(title="Distribution of Sales")
p4
cat("\n\n Q10: What are the correlations between numerical variables?\n\n")
##
##
## Q10: What are the correlations between numerical variables?
corrplot(cor(df[,c("Sales","Quantity","Discount","Profit")]), method="circle")
# =============================== END
==========================================