Project Overview: Superstore Complete Analysis

This project analyzes the Superstore Sales dataset to understand business performance and identify key insights related to Sales, Profit, Discount, and Product Categories. Using statistical analysis, machine learning models, and visualizations, we explored how various factors affect profit and sales patterns.

Data Cleaning

Removed missing values and ensured numeric columns were properly formatted.

Descriptive Analysis

Found overall distribution and correlation between Sales, Quantity, Discount, and Profit.

Regression Analysis

Studied how Sales, Quantity, and Discount impact Profit.

ANOVA Test

Checked whether different Product Categories show a significant difference in average Profit.

K-Means Clustering

Grouped data into clusters based on Sales and Profit to identify similar sales-performance patterns.

KNN Classification

Predicted whether a transaction’s Profit will be High or Low based on other factors.

Visualization

Created multiple charts showing which categories and sub-categories generate the highest sales and profit, how discount affects profit, and the overall distribution of sales.

Load Required Libraries

# Load Required Libraries
packages <- c("tidyverse","ggplot2","cluster","factoextra",
              "class","caret","corrplot","dplyr","gridExtra")
for(p in packages){
  if(!require(p, character.only=TRUE)){
    install.packages(p, dependencies = TRUE)
    library(p, character.only=TRUE)
  }
}

Load Dataset & Data Cleaning

# Load Dataset
df <- read.csv("~/Downloads/Sample - Superstore.csv", stringsAsFactors = FALSE)

# Data Cleaning
df$Sales <- as.numeric(df$Sales)
df$Quantity <- as.numeric(df$Quantity)
df$Discount <- as.numeric(df$Discount)
df$Profit <- as.numeric(df$Profit)
df <- df %>% filter(!is.na(Sales), !is.na(Profit))

====== START ANALYSIS ======

Q1: What is the overall distribution and summary of Sales, Quantity, Discount, and Profit?

cat("\n\n Q1: What is the overall distribution and summary of Sales, Quantity, Discount, and Profit?\n\n")

## 
## 
##  Q1: What is the overall distribution and summary of Sales, Quantity, Discount, and Profit?

print(summary(df[,c("Sales","Quantity","Discount","Profit")]))

##      Sales              Quantity        Discount          Profit         
##  Min.   :    0.444   Min.   : 1.00   Min.   :0.0000   Min.   :-6599.978  
##  1st Qu.:   17.280   1st Qu.: 2.00   1st Qu.:0.0000   1st Qu.:    1.729  
##  Median :   54.490   Median : 3.00   Median :0.2000   Median :    8.666  
##  Mean   :  229.858   Mean   : 3.79   Mean   :0.1562   Mean   :   28.657  
##  3rd Qu.:  209.940   3rd Qu.: 5.00   3rd Qu.:0.2000   3rd Qu.:   29.364  
##  Max.   :22638.480   Max.   :14.00   Max.   :0.8000   Max.   : 8399.976

corrplot(cor(df[,c("Sales","Quantity","Discount","Profit")]), method="color")

Q2: How does Sales, Quantity, and Discount affect Profit? (Regression)

cat("\n\n Q2: How does Sales, Quantity, and Discount affect Profit? (Regression)\n\n")

## 
## 
##  Q2: How does Sales, Quantity, and Discount affect Profit? (Regression)

lm_model <- lm(Profit ~ Sales + Quantity + Discount, data=df)
print(summary(lm_model))

## 
## Call:
## lm(formula = Profit ~ Sales + Quantity + Discount, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7266.8   -23.8    -0.4    25.6  5229.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.497e+01  4.218e+00   8.291  < 2e-16 ***
## Sales        1.800e-01  3.275e-03  54.961  < 2e-16 ***
## Quantity    -2.962e+00  9.171e-01  -3.230  0.00124 ** 
## Discount    -2.335e+02  9.686e+00 -24.101  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 199.8 on 9990 degrees of freedom
## Multiple R-squared:  0.2727, Adjusted R-squared:  0.2725 
## F-statistic:  1249 on 3 and 9990 DF,  p-value: < 2.2e-16

Q3: Does Profit vary significantly across different Product Categories? (ANOVA)

cat("\n\n Q3: Does Profit vary significantly across different Product Categories? (ANOVA)\n\n")

## 
## 
##  Q3: Does Profit vary significantly across different Product Categories? (ANOVA)

anova_model <- aov(Profit ~ Category, data=df)
print(summary(anova_model))

##               Df    Sum Sq Mean Sq F value Pr(>F)    
## Category       2   5898009 2949004   54.31 <2e-16 ***
## Residuals   9991 542495827   54298                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Q4: Can we cluster customers based on Sales and Profit? (K-Means)

cat("\n\n Q4: Can we cluster customers based on Sales and Profit? (K-Means)\n\n")

## 
## 
##  Q4: Can we cluster customers based on Sales and Profit? (K-Means)

cluster_df <- scale(df[,c("Sales","Profit")])
set.seed(123)
km <- kmeans(cluster_df, centers=3)
df$Cluster <- as.factor(km$cluster)
ggplot(df, aes(Sales, Profit, color=Cluster)) +
  geom_point(alpha=0.7) + theme_minimal() +
  labs(title="K-Means Clustering (Sales vs Profit)")

Q5: Can we predict if Profit will be High or Low using KNN?

cat("\n\n Q5: Can we predict if Profit will be High or Low using KNN?\n\n")

## 
## 
##  Q5: Can we predict if Profit will be High or Low using KNN?

df$ProfitLevel <- factor(ifelse(df$Profit > median(df$Profit),"High","Low"))
set.seed(123)
idx <- sample(1:nrow(df), 0.7*nrow(df))
train <- df[idx, ]; test <- df[-idx, ]
knn_pred <- knn(train[,c("Sales","Quantity","Discount")],
                test[,c("Sales","Quantity","Discount")],
                train$ProfitLevel, k=5)
print(confusionMatrix(knn_pred, test$ProfitLevel))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High  Low
##       High 1255  466
##       Low   229 1049
##                                           
##                Accuracy : 0.7683          
##                  95% CI : (0.7527, 0.7833)
##     No Information Rate : 0.5052          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5372          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8457          
##             Specificity : 0.6924          
##          Pos Pred Value : 0.7292          
##          Neg Pred Value : 0.8208          
##              Prevalence : 0.4948          
##          Detection Rate : 0.4185          
##    Detection Prevalence : 0.5739          
##       Balanced Accuracy : 0.7690          
##                                           
##        'Positive' Class : High            
##

Q6: Which Category generates the highest Sales?

cat("\n\n Q6: Which Category generates the highest Sales?\n\n")

## 
## 
##  Q6: Which Category generates the highest Sales?

sales_summary <- df %>% group_by(Category) %>%
  summarise(TotalSales=sum(Sales)) %>% arrange(desc(TotalSales))
p1 <- ggplot(sales_summary, aes(Category, TotalSales, fill=Category)) +
  geom_col() + theme_minimal() + labs(title="Total Sales by Category")
grid.arrange(p1, tableGrob(sales_summary), ncol=2)

Q7: Which Sub-Category contributes most to Profit?

cat("\n\n Q7: Which Sub-Category contributes most to Profit?\n\n")

## 
## 
##  Q7: Which Sub-Category contributes most to Profit?

profit_summary <- df %>% group_by(Sub.Category) %>%
  summarise(TotalProfit=sum(Profit)) %>% arrange(desc(TotalProfit))
p2 <- ggplot(profit_summary, aes(Sub.Category, TotalProfit, fill=Sub.Category)) +
  geom_col() + theme_minimal() +
  theme(axis.text.x = element_text(angle=90)) +
  labs(title="Total Profit by Sub-Category")
grid.arrange(p2, tableGrob(head(profit_summary,10)), ncol=2)

Q8: What is the relationship between Discount and Profit?

cat("\n\n Q8: What is the relationship between Discount and Profit?\n\n")

## 
## 
##  Q8: What is the relationship between Discount and Profit?

p3 <- ggplot(df, aes(Discount, Profit)) +
  geom_point(alpha=0.5, color="darkblue") +
  geom_smooth(color="red") + theme_minimal() +
  labs(title="Impact of Discount on Profit")
p3

Q9: How are Sales distributed across transactions?

cat("\n\n Q9: How are Sales distributed across transactions?\n\n")

## 
## 
##  Q9: How are Sales distributed across transactions?

p4 <- ggplot(df, aes(Sales)) +
  geom_histogram(bins=40, fill="lightgreen", color="black") +
  theme_minimal() + labs(title="Distribution of Sales")
p4

Q10: What are the correlations between numerical variables?

cat("\n\n Q10: What are the correlations between numerical variables?\n\n")

## 
## 
##  Q10: What are the correlations between numerical variables?

corrplot(cor(df[,c("Sales","Quantity","Discount","Profit")]), method="circle")

# =============================== END ==========================================

Superstore Complete Analysis

Ankit | Reg no 12317222 | Roll no 50