title: “E-commerce Sales Data Analysis in R” author: “Shruti Sahu” date: “2026-04-25”
output: html_document: toc: true toc_float: true number_sections: true
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.5.3
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(scales)
## Warning: package 'scales' was built under R version 4.5.3
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
df <- read.csv("C:/Users/SHRUTI SAHU/Downloads/data.csv", fileEncoding = "latin1")
head(df)
## InvoiceNo StockCode Description Quantity
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2 536365 71053 WHITE METAL LANTERN 6
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## InvoiceDate UnitPrice CustomerID Country
## 1 12/1/2010 8:26 2.55 17850 United Kingdom
## 2 12/1/2010 8:26 3.39 17850 United Kingdom
## 3 12/1/2010 8:26 2.75 17850 United Kingdom
## 4 12/1/2010 8:26 3.39 17850 United Kingdom
## 5 12/1/2010 8:26 3.39 17850 United Kingdom
## 6 12/1/2010 8:26 7.65 17850 United Kingdom
str(df)
## 'data.frame': 541909 obs. of 8 variables:
## $ InvoiceNo : chr "536365" "536365" "536365" "536365" ...
## $ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" "12/1/2010 8:26" ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
dim(df)
## [1] 541909 8
summary(df)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.000
## Class :character Class :character Class :character 1st Qu.: 1.000
## Mode :character Mode :character Mode :character Median : 3.000
## Mean : 9.552
## 3rd Qu.: 10.000
## Max. : 80995.000
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Min. :-11062.060 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.250 1st Qu.:13953 Class :character
## Mode :character Median : 2.080 Median :15152 Mode :character
## Mean : 4.611 Mean :15288
## 3rd Qu.: 4.130 3rd Qu.:16791
## Max. : 38970.000 Max. :18287
## NA's :135080
# Fix date column
df$InvoiceDate <- as.POSIXct(df$InvoiceDate, format = "%m/%d/%Y %H:%M")
# Remove missing CustomerID
df_clean <- df[!is.na(df$CustomerID), ]
# Remove cancelled orders
df_clean <- df_clean[!grepl("^C", df_clean$InvoiceNo), ]
# Remove negative quantities
df_clean <- df_clean[df_clean$Quantity > 0, ]
# Remove duplicates
df_clean <- df_clean[!duplicated(df_clean), ]
# Create Revenue column
df_clean$Revenue <- df_clean$Quantity * df_clean$UnitPrice
nrow(df_clean)
## [1] 392732
df_clean$Revenue <-
df_clean$Quantity *
df_clean$UnitPrice
head(df_clean)
## InvoiceNo StockCode Description Quantity
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
## 2 536365 71053 WHITE METAL LANTERN 6
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2
## InvoiceDate UnitPrice CustomerID Country Revenue
## 1 2010-12-01 08:26:00 2.55 17850 United Kingdom 15.30
## 2 2010-12-01 08:26:00 3.39 17850 United Kingdom 20.34
## 3 2010-12-01 08:26:00 2.75 17850 United Kingdom 22.00
## 4 2010-12-01 08:26:00 3.39 17850 United Kingdom 20.34
## 5 2010-12-01 08:26:00 3.39 17850 United Kingdom 20.34
## 6 2010-12-01 08:26:00 7.65 17850 United Kingdom 15.30
nrow(df_clean)
## [1] 392732
summary(df_clean$Revenue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 4.95 12.39 22.63 19.80 168469.60
mean(df_clean$Revenue)
## [1] 22.62919
median(df_clean$Revenue)
## [1] 12.39
sd(df_clean$Revenue)
## [1] 311.0835
length(unique(df_clean$CustomerID))
## [1] 4339
as.data.frame(table(df_clean$Country))
## Var1 Freq
## 1 Australia 1184
## 2 Austria 398
## 3 Bahrain 17
## 4 Belgium 2031
## 5 Brazil 32
## 6 Canada 151
## 7 Channel Islands 747
## 8 Cyprus 603
## 9 Czech Republic 25
## 10 Denmark 380
## 11 EIRE 7228
## 12 European Community 60
## 13 Finland 685
## 14 France 8327
## 15 Germany 9027
## 16 Greece 145
## 17 Iceland 182
## 18 Israel 245
## 19 Italy 758
## 20 Japan 321
## 21 Lebanon 45
## 22 Lithuania 35
## 23 Malta 112
## 24 Netherlands 2363
## 25 Norway 1072
## 26 Poland 330
## 27 Portugal 1453
## 28 RSA 58
## 29 Saudi Arabia 9
## 30 Singapore 222
## 31 Spain 2480
## 32 Sweden 450
## 33 Switzerland 1842
## 34 United Arab Emirates 68
## 35 United Kingdom 349227
## 36 Unspecified 241
## 37 USA 179
hist(
df_clean$Revenue[df_clean$Revenue < 500],
main = "Revenue Distribution",
xlab = "Revenue (£)",
col = "steelblue"
)
plot(
density(df_clean$Revenue),
lwd = 3,
main = "Revenue Density Plot"
)
hist(
df_clean$Revenue,
probability = TRUE,
col = "lightgreen",
breaks = 40,
main = "Revenue Distribution"
)
lines(
density(df_clean$Revenue),
lwd = 3,
col = "red"
)
## Boxplot for Outlier Detection
boxplot(
df_clean$Revenue,
main = "Revenue Distribution",
col = "pink"
)
Q1 <- quantile(df_clean$Revenue, 0.25)
Q3 <- quantile(df_clean$Revenue, 0.75)
IQR_val <- IQR(df_clean$Revenue)
outliers <- df_clean[
df_clean$Revenue >
(Q3 + 1.5 * IQR_val),
]
nrow(outliers)
## [1] 31231
country_revenue <- aggregate(Revenue ~ Country,
data = df_clean,
FUN = sum)
head(country_revenue)
## Country Revenue
## 1 Australia 138453.81
## 2 Austria 10198.68
## 3 Bahrain 548.40
## 4 Belgium 41196.34
## 5 Brazil 1143.60
## 6 Canada 3666.38
top10_country <-
country_revenue[
order(-country_revenue$Revenue),
][1:10, ]
top10_country
## Country Revenue
## 35 United Kingdom 7285024.64
## 24 Netherlands 285446.34
## 11 EIRE 265262.46
## 15 Germany 228678.40
## 14 France 208934.31
## 1 Australia 138453.81
## 31 Spain 61558.56
## 33 Switzerland 56443.95
## 4 Belgium 41196.34
## 32 Sweden 38367.83
barplot(
top10_country$Revenue,
names.arg = top10_country$Country,
las = 2,
col = "steelblue",
main = "Top 10 Countries by Revenue"
)
product_rev <- aggregate(
Revenue ~ Description,
data = df_clean,
sum
)
top_products <-
product_rev[
order(-product_rev$Revenue),
][1:10, ]
top_products
## Description Revenue
## 2320 PAPER CRAFT , LITTLE BIRDIE 168469.60
## 2768 REGENCY CAKESTAND 3 TIER 142264.75
## 3699 WHITE HANGING HEART T-LIGHT HOLDER 100392.10
## 1763 JUMBO BAG RED RETROSPOT 85040.54
## 1994 MEDIUM CERAMIC TOP STORAGE JAR 81416.73
## 2612 POSTAGE 77803.96
## 2346 PARTY BUNTING 68785.23
## 217 ASSORTED COLOUR BIRD ORNAMENT 56413.03
## 1985 Manual 53419.93
## 2657 RABBIT NIGHT LIGHT 51251.24
cust_orders <- table(df_clean$CustomerID)
sum(cust_orders > 1)
## [1] 4267
cust_rev <- aggregate(
Revenue ~ CustomerID,
data = df_clean,
sum
)
head(cust_rev)
## CustomerID Revenue
## 1 12346 77183.60
## 2 12347 4310.00
## 3 12348 1797.24
## 4 12349 1757.55
## 5 12350 334.40
## 6 12352 2506.04
hist(
cust_rev$Revenue,
col = "steelblue",
main = "Customer Spending Distribution",
xlab = "Revenue"
)
country_avg <- aggregate(
Revenue ~ Country,
df_clean,
mean
)
top10_avg <-
country_avg[
order(-country_avg$Revenue),
][1:10, ]
top10_avg
## Country Revenue
## 24 Netherlands 120.79828
## 1 Australia 116.93734
## 20 Japan 116.56190
## 30 Singapore 95.85266
## 32 Sweden 85.26184
## 10 Denmark 49.88247
## 22 Lithuania 47.45886
## 21 Lebanon 37.64178
## 11 EIRE 36.69929
## 5 Brazil 35.73750
df_clean$Month <-
format(df_clean$InvoiceDate, "%Y-%m")
monthly <- aggregate(
Revenue ~ Month,
data = df_clean,
sum
)
monthly
## Month Revenue
## 1 2010-12 570422.7
## 2 2011-01 568101.3
## 3 2011-02 446084.9
## 4 2011-03 594081.8
## 5 2011-04 468374.3
## 6 2011-05 677355.2
## 7 2011-06 660046.1
## 8 2011-07 598962.9
## 9 2011-08 644051.0
## 10 2011-09 950690.2
## 11 2011-10 1035642.4
## 12 2011-11 1156205.6
## 13 2011-12 517190.4
##Month with Highest Sales
monthly[
which.max(monthly$Revenue),
]
## Month Revenue
## 12 2011-11 1156206
plot(
monthly$Revenue,
type = "l",
main = "Monthly Revenue Trend",
xlab = "Month",
ylab = "Revenue",
col = "steelblue"
)
axis(
1,
at = 1:nrow(monthly),
labels = monthly$Month,
las = 2,
cex.axis = 0.7
)
## Day-wise Revenue Analysis
df_clean$Day <-
weekdays(df_clean$InvoiceDate)
aggregate(
Revenue ~ Day,
df_clean,
sum
)
## Day Revenue
## 1 Friday 1483080.8
## 2 Monday 1363604.4
## 3 Sunday 785490.3
## 4 Thursday 1973015.7
## 5 Tuesday 1697733.8
## 6 Wednesday 1584283.8
df_clean$Day <- weekdays(df_clean$InvoiceDate)
day_sales <- aggregate(
Revenue ~ Day,
df_clean,
sum
)
plot(day_sales$Revenue,
type = "o",
col = "blue",
pch = 16,
xaxt = "n",
main = "Day-wise Revenue Trend",
xlab = "Day",
ylab = "Revenue")
axis(1,
at = 1:nrow(day_sales),
labels = day_sales$Day)
cor(
df_clean$UnitPrice,
df_clean$Revenue
)
## [1] 0.08161959
num_data <- df_clean[
,
c(
"Quantity",
"UnitPrice",
"Revenue"
)
]
cor_matrix <- cor(num_data)
corrplot(
cor_matrix,
method = "color",
addCoef.col = "black",
tl.col = "black"
)
country_count <-
as.data.frame(table(df_clean$Country))
country_count
## Var1 Freq
## 1 Australia 1184
## 2 Austria 398
## 3 Bahrain 17
## 4 Belgium 2031
## 5 Brazil 32
## 6 Canada 151
## 7 Channel Islands 747
## 8 Cyprus 603
## 9 Czech Republic 25
## 10 Denmark 380
## 11 EIRE 7228
## 12 European Community 60
## 13 Finland 685
## 14 France 8327
## 15 Germany 9027
## 16 Greece 145
## 17 Iceland 182
## 18 Israel 245
## 19 Italy 758
## 20 Japan 321
## 21 Lebanon 45
## 22 Lithuania 35
## 23 Malta 112
## 24 Netherlands 2363
## 25 Norway 1072
## 26 Poland 330
## 27 Portugal 1453
## 28 RSA 58
## 29 Saudi Arabia 9
## 30 Singapore 222
## 31 Spain 2480
## 32 Sweden 450
## 33 Switzerland 1842
## 34 United Arab Emirates 68
## 35 United Kingdom 349227
## 36 Unspecified 241
## 37 USA 179
model_simple <- lm(
Revenue ~ Quantity,
data = df_clean
)
summary(model_simple)
##
## Call:
## lm(formula = Revenue ~ Quantity, data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38372 -5 -1 6 42357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.14869 0.20758 10.35 <2e-16 ***
## Quantity 1.55701 0.00114 1365.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129.7 on 392730 degrees of freedom
## Multiple R-squared: 0.8261, Adjusted R-squared: 0.8261
## F-statistic: 1.865e+06 on 1 and 392730 DF, p-value: < 2.2e-16
plot(
df_clean$Quantity,
df_clean$Revenue,
col = "steelblue",
pch = 16,
main = "Quantity vs Revenue",
xlab = "Quantity",
ylab = "Revenue"
)
abline(
model_simple,
col = "red",
lwd = 2
)
predict(
model_simple,
newdata = data.frame(
Quantity = 10
)
)
## 1
## 17.71882
lm(
Revenue ~ Quantity +
I(Quantity^2),
data = df_clean
)
##
## Call:
## lm(formula = Revenue ~ Quantity + I(Quantity^2), data = df_clean)
##
## Coefficients:
## (Intercept) Quantity I(Quantity^2)
## 1.437e+01 5.942e-01 1.322e-05
lm(
Revenue ~ Day,
data = df_clean
)
##
## Call:
## lm(formula = Revenue ~ Day, data = df_clean)
##
## Coefficients:
## (Intercept) DayMonday DaySunday DayThursday DayTuesday
## 27.347 -6.120 -14.516 -2.455 -1.525
## DayWednesday
## -4.064
ggplot(
monthly,
aes(
x = Month,
y = Revenue,
group = 1
)
) +
geom_line(
color = "blue",
linewidth = 1.2
) +
geom_point(
color = "red",
size = 3
) +
theme(
axis.text.x =
element_text(angle = 90)
)
plot(
df_clean$Quantity,
df_clean$Revenue,
pch = 16,
col = rgb(0,0,1,0.3),
main = "Quantity vs Revenue"
)
abline(
model_simple,
col = "red",
lwd = 3
)
## Bubble Chart
symbols(
df_clean$Quantity[1:200],
df_clean$Revenue[1:200],
circles =
df_clean$UnitPrice[1:200],
inches = 0.3,
main = "Bubble Chart"
)