edataCA4.knit

title: “E-commerce Sales Data Analysis in R” author: “Shruti Sahu” date: “2026-04-25”

output: html_document: toc: true toc_float: true number_sections: true

1 Load Libraries

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.5.3

library(lubridate)

## Warning: package 'lubridate' was built under R version 4.5.3

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(readr) 
library(scales)

## Warning: package 'scales' was built under R version 4.5.3

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.5.3

## corrplot 0.95 loaded

2 Import Data

df <- read.csv("C:/Users/SHRUTI SAHU/Downloads/data.csv", fileEncoding = "latin1") 

head(df)

##   InvoiceNo StockCode                         Description Quantity
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
## 2    536365     71053                 WHITE METAL LANTERN        6
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
##      InvoiceDate UnitPrice CustomerID        Country
## 1 12/1/2010 8:26      2.55      17850 United Kingdom
## 2 12/1/2010 8:26      3.39      17850 United Kingdom
## 3 12/1/2010 8:26      2.75      17850 United Kingdom
## 4 12/1/2010 8:26      3.39      17850 United Kingdom
## 5 12/1/2010 8:26      3.39      17850 United Kingdom
## 6 12/1/2010 8:26      7.65      17850 United Kingdom

str(df)

## 'data.frame':    541909 obs. of  8 variables:
##  $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
##  $ StockCode  : chr  "85123A" "71053" "84406B" "84029G" ...
##  $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
##  $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
##  $ InvoiceDate: chr  "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" ...
##  $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
##  $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...

dim(df)

## [1] 541909      8

summary(df)

##   InvoiceNo          StockCode         Description           Quantity         
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.000  
##  Class :character   Class :character   Class :character   1st Qu.:     1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.000  
##                                                           Mean   :     9.552  
##                                                           3rd Qu.:    10.000  
##                                                           Max.   : 80995.000  
##                                                                               
##  InvoiceDate          UnitPrice            CustomerID       Country         
##  Length:541909      Min.   :-11062.060   Min.   :12346    Length:541909     
##  Class :character   1st Qu.:     1.250   1st Qu.:13953    Class :character  
##  Mode  :character   Median :     2.080   Median :15152    Mode  :character  
##                     Mean   :     4.611   Mean   :15288                      
##                     3rd Qu.:     4.130   3rd Qu.:16791                      
##                     Max.   : 38970.000   Max.   :18287                      
##                                          NA's   :135080

3 Data Cleaning & Preprocessing

4 Data Cleaning

# Fix date column 
df$InvoiceDate <- as.POSIXct(df$InvoiceDate, format = "%m/%d/%Y %H:%M") 

# Remove missing CustomerID 
df_clean <- df[!is.na(df$CustomerID), ]

# Remove cancelled orders
df_clean <- df_clean[!grepl("^C", df_clean$InvoiceNo), ] 

# Remove negative quantities 
df_clean <- df_clean[df_clean$Quantity > 0, ] 

# Remove duplicates 
df_clean <- df_clean[!duplicated(df_clean), ]

# Create Revenue column 

df_clean$Revenue <- df_clean$Quantity * df_clean$UnitPrice


nrow(df_clean)

## [1] 392732

5 Create Revenue Column

df_clean$Revenue <-
  df_clean$Quantity *
  df_clean$UnitPrice

head(df_clean)

##   InvoiceNo StockCode                         Description Quantity
## 1    536365    85123A  WHITE HANGING HEART T-LIGHT HOLDER        6
## 2    536365     71053                 WHITE METAL LANTERN        6
## 3    536365    84406B      CREAM CUPID HEARTS COAT HANGER        8
## 4    536365    84029G KNITTED UNION FLAG HOT WATER BOTTLE        6
## 5    536365    84029E      RED WOOLLY HOTTIE WHITE HEART.        6
## 6    536365     22752        SET 7 BABUSHKA NESTING BOXES        2
##           InvoiceDate UnitPrice CustomerID        Country Revenue
## 1 2010-12-01 08:26:00      2.55      17850 United Kingdom   15.30
## 2 2010-12-01 08:26:00      3.39      17850 United Kingdom   20.34
## 3 2010-12-01 08:26:00      2.75      17850 United Kingdom   22.00
## 4 2010-12-01 08:26:00      3.39      17850 United Kingdom   20.34
## 5 2010-12-01 08:26:00      3.39      17850 United Kingdom   20.34
## 6 2010-12-01 08:26:00      7.65      17850 United Kingdom   15.30

nrow(df_clean)

## [1] 392732

6 Exploratory Data Analysis

summary(df_clean$Revenue)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.00      4.95     12.39     22.63     19.80 168469.60

mean(df_clean$Revenue)

## [1] 22.62919

median(df_clean$Revenue)

## [1] 12.39

sd(df_clean$Revenue)

## [1] 311.0835

7 Unique Customers Count

length(unique(df_clean$CustomerID))

## [1] 4339

8 Country Frequency

as.data.frame(table(df_clean$Country))

##                    Var1   Freq
## 1             Australia   1184
## 2               Austria    398
## 3               Bahrain     17
## 4               Belgium   2031
## 5                Brazil     32
## 6                Canada    151
## 7       Channel Islands    747
## 8                Cyprus    603
## 9        Czech Republic     25
## 10              Denmark    380
## 11                 EIRE   7228
## 12   European Community     60
## 13              Finland    685
## 14               France   8327
## 15              Germany   9027
## 16               Greece    145
## 17              Iceland    182
## 18               Israel    245
## 19                Italy    758
## 20                Japan    321
## 21              Lebanon     45
## 22            Lithuania     35
## 23                Malta    112
## 24          Netherlands   2363
## 25               Norway   1072
## 26               Poland    330
## 27             Portugal   1453
## 28                  RSA     58
## 29         Saudi Arabia      9
## 30            Singapore    222
## 31                Spain   2480
## 32               Sweden    450
## 33          Switzerland   1842
## 34 United Arab Emirates     68
## 35       United Kingdom 349227
## 36          Unspecified    241
## 37                  USA    179

9 Revenue Distribution Analysis

hist(
  df_clean$Revenue[df_clean$Revenue < 500],
  main = "Revenue Distribution",
  xlab = "Revenue (£)",
  col = "steelblue"
)

plot(
  density(df_clean$Revenue),
  lwd = 3,
  main = "Revenue Density Plot"
)

hist(
  df_clean$Revenue,
  probability = TRUE,
  col = "lightgreen",
  breaks = 40,
  main = "Revenue Distribution"
)

lines(
  density(df_clean$Revenue),
  lwd = 3,
  col = "red"
)

## Boxplot for Outlier Detection

boxplot(
  df_clean$Revenue,
  main = "Revenue Distribution",
  col = "pink"
)

10 Outlier Detection using IQR

Q1 <- quantile(df_clean$Revenue, 0.25)

Q3 <- quantile(df_clean$Revenue, 0.75)

IQR_val <- IQR(df_clean$Revenue)

outliers <- df_clean[
  df_clean$Revenue >
    (Q3 + 1.5 * IQR_val),
]

nrow(outliers)

## [1] 31231

11 Country-wise Revenue Analysis

12 Group-wise Revenue Analysis

country_revenue <- aggregate(Revenue ~ Country,
                             data = df_clean,
                             FUN = sum)

head(country_revenue)

##     Country   Revenue
## 1 Australia 138453.81
## 2   Austria  10198.68
## 3   Bahrain    548.40
## 4   Belgium  41196.34
## 5    Brazil   1143.60
## 6    Canada   3666.38

13 Top 10 Countries by Revenue

top10_country <-
  country_revenue[
    order(-country_revenue$Revenue),
  ][1:10, ]

top10_country

##           Country    Revenue
## 35 United Kingdom 7285024.64
## 24    Netherlands  285446.34
## 11           EIRE  265262.46
## 15        Germany  228678.40
## 14         France  208934.31
## 1       Australia  138453.81
## 31          Spain   61558.56
## 33    Switzerland   56443.95
## 4         Belgium   41196.34
## 32         Sweden   38367.83

14 Barplot of Top Countries

barplot(
  top10_country$Revenue,
  names.arg = top10_country$Country,
  las = 2,
  col = "steelblue",
  main = "Top 10 Countries by Revenue"
)

15 Product Analysis

16 Products with Highest Revenue

product_rev <- aggregate(
  Revenue ~ Description,
  data = df_clean,
  sum
)

top_products <-
  product_rev[
    order(-product_rev$Revenue),
  ][1:10, ]

top_products

##                             Description   Revenue
## 2320        PAPER CRAFT , LITTLE BIRDIE 168469.60
## 2768           REGENCY CAKESTAND 3 TIER 142264.75
## 3699 WHITE HANGING HEART T-LIGHT HOLDER 100392.10
## 1763            JUMBO BAG RED RETROSPOT  85040.54
## 1994     MEDIUM CERAMIC TOP STORAGE JAR  81416.73
## 2612                            POSTAGE  77803.96
## 2346                      PARTY BUNTING  68785.23
## 217       ASSORTED COLOUR BIRD ORNAMENT  56413.03
## 1985                             Manual  53419.93
## 2657                 RABBIT NIGHT LIGHT  51251.24

17 Customer Behavior Analysis

18 Repeat Customers

cust_orders <- table(df_clean$CustomerID)

sum(cust_orders > 1)

## [1] 4267

19 Revenue per Customer

cust_rev <- aggregate(
  Revenue ~ CustomerID,
  data = df_clean,
  sum
)

head(cust_rev)

##   CustomerID  Revenue
## 1      12346 77183.60
## 2      12347  4310.00
## 3      12348  1797.24
## 4      12349  1757.55
## 5      12350   334.40
## 6      12352  2506.04

20 Customer Spending Distribution

hist(
  cust_rev$Revenue,
  col = "steelblue",
  main = "Customer Spending Distribution",
  xlab = "Revenue"
)

21 Countries with High-value Customers

country_avg <- aggregate(
  Revenue ~ Country,
  df_clean,
  mean
)

top10_avg <-
  country_avg[
    order(-country_avg$Revenue),
  ][1:10, ]

top10_avg

##        Country   Revenue
## 24 Netherlands 120.79828
## 1    Australia 116.93734
## 20       Japan 116.56190
## 30   Singapore  95.85266
## 32      Sweden  85.26184
## 10     Denmark  49.88247
## 22   Lithuania  47.45886
## 21     Lebanon  37.64178
## 11        EIRE  36.69929
## 5       Brazil  35.73750

22 Time-based Sales Analysis

23 Monthly Sales Analysis

df_clean$Month <-
  format(df_clean$InvoiceDate, "%Y-%m")

monthly <- aggregate(
  Revenue ~ Month,
  data = df_clean,
  sum
)

monthly

##      Month   Revenue
## 1  2010-12  570422.7
## 2  2011-01  568101.3
## 3  2011-02  446084.9
## 4  2011-03  594081.8
## 5  2011-04  468374.3
## 6  2011-05  677355.2
## 7  2011-06  660046.1
## 8  2011-07  598962.9
## 9  2011-08  644051.0
## 10 2011-09  950690.2
## 11 2011-10 1035642.4
## 12 2011-11 1156205.6
## 13 2011-12  517190.4

##Month with Highest Sales

monthly[
  which.max(monthly$Revenue),
]

##      Month Revenue
## 12 2011-11 1156206

24 Monthly Revenue Trend

plot(
  monthly$Revenue,
  type = "l",
  main = "Monthly Revenue Trend",
  xlab = "Month",
  ylab = "Revenue",
  col = "steelblue"
)

axis(
  1,
  at = 1:nrow(monthly),
  labels = monthly$Month,
  las = 2,
  cex.axis = 0.7
)

## Day-wise Revenue Analysis

df_clean$Day <-
  weekdays(df_clean$InvoiceDate)

aggregate(
  Revenue ~ Day,
  df_clean,
  sum
)

##         Day   Revenue
## 1    Friday 1483080.8
## 2    Monday 1363604.4
## 3    Sunday  785490.3
## 4  Thursday 1973015.7
## 5   Tuesday 1697733.8
## 6 Wednesday 1584283.8

df_clean$Day <- weekdays(df_clean$InvoiceDate)

day_sales <- aggregate(
  Revenue ~ Day,
  df_clean,
  sum
)

plot(day_sales$Revenue,
     type = "o",
     col = "blue",
     pch = 16,
     xaxt = "n",
     main = "Day-wise Revenue Trend",
     xlab = "Day",
     ylab = "Revenue")

axis(1,
     at = 1:nrow(day_sales),
     labels = day_sales$Day)

25 Correlation between UnitPrice and Revenue

cor(
  df_clean$UnitPrice,
  df_clean$Revenue
)

## [1] 0.08161959

26 Correlation Heatmap

num_data <- df_clean[
  ,
  c(
    "Quantity",
    "UnitPrice",
    "Revenue"
  )
]

cor_matrix <- cor(num_data)

corrplot(
  cor_matrix,
  method = "color",
  addCoef.col = "black",
  tl.col = "black"
)

27 Which country appears most frequently?

country_count <- 
  as.data.frame(table(df_clean$Country))
country_count

##                    Var1   Freq
## 1             Australia   1184
## 2               Austria    398
## 3               Bahrain     17
## 4               Belgium   2031
## 5                Brazil     32
## 6                Canada    151
## 7       Channel Islands    747
## 8                Cyprus    603
## 9        Czech Republic     25
## 10              Denmark    380
## 11                 EIRE   7228
## 12   European Community     60
## 13              Finland    685
## 14               France   8327
## 15              Germany   9027
## 16               Greece    145
## 17              Iceland    182
## 18               Israel    245
## 19                Italy    758
## 20                Japan    321
## 21              Lebanon     45
## 22            Lithuania     35
## 23                Malta    112
## 24          Netherlands   2363
## 25               Norway   1072
## 26               Poland    330
## 27             Portugal   1453
## 28                  RSA     58
## 29         Saudi Arabia      9
## 30            Singapore    222
## 31                Spain   2480
## 32               Sweden    450
## 33          Switzerland   1842
## 34 United Arab Emirates     68
## 35       United Kingdom 349227
## 36          Unspecified    241
## 37                  USA    179

28 Regression Analysis

29 Simple Linear Regression

model_simple <- lm(
  Revenue ~ Quantity,
  data = df_clean
)

summary(model_simple)

## 
## Call:
## lm(formula = Revenue ~ Quantity, data = df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38372     -5     -1      6  42357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.14869    0.20758   10.35   <2e-16 ***
## Quantity     1.55701    0.00114 1365.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129.7 on 392730 degrees of freedom
## Multiple R-squared:  0.8261, Adjusted R-squared:  0.8261 
## F-statistic: 1.865e+06 on 1 and 392730 DF,  p-value: < 2.2e-16

30 Scatterplot with Regression Line

plot(
  df_clean$Quantity,
  df_clean$Revenue,
  col = "steelblue",
  pch = 16,
  main = "Quantity vs Revenue",
  xlab = "Quantity",
  ylab = "Revenue"
)

abline(
  model_simple,
  col = "red",
  lwd = 2
)

31 Revenue Prediction

predict(
  model_simple,
  newdata = data.frame(
    Quantity = 10
  )
)

##        1 
## 17.71882

32 Polynomial Regression

lm(
  Revenue ~ Quantity +
    I(Quantity^2),
  data = df_clean
)

## 
## Call:
## lm(formula = Revenue ~ Quantity + I(Quantity^2), data = df_clean)
## 
## Coefficients:
##   (Intercept)       Quantity  I(Quantity^2)  
##     1.437e+01      5.942e-01      1.322e-05

33 Does Day Affect Revenue?

lm(
  Revenue ~ Day,
  data = df_clean
)

## 
## Call:
## lm(formula = Revenue ~ Day, data = df_clean)
## 
## Coefficients:
##  (Intercept)     DayMonday     DaySunday   DayThursday    DayTuesday  
##       27.347        -6.120       -14.516        -2.455        -1.525  
## DayWednesday  
##       -4.064

34 Advanced ggplot Line Chart

ggplot(
  monthly,
  aes(
    x = Month,
    y = Revenue,
    group = 1
  )
) +
  geom_line(
    color = "blue",
    linewidth = 1.2
  ) +
  geom_point(
    color = "red",
    size = 3
  ) +
  theme(
    axis.text.x =
      element_text(angle = 90)
  )

35 Scatterplot with Transparency

plot(
  df_clean$Quantity,
  df_clean$Revenue,
  pch = 16,
  col = rgb(0,0,1,0.3),
  main = "Quantity vs Revenue"
)

abline(
  model_simple,
  col = "red",
  lwd = 3
)

## Bubble Chart

symbols(
  df_clean$Quantity[1:200],
  df_clean$Revenue[1:200],
  circles =
    df_clean$UnitPrice[1:200],
  inches = 0.3,
  main = "Bubble Chart"
)