install.packages('readxl',repos="http://cran.rstudio.com/")
## Installing package into 'C:/Users/tuhin/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\tuhin\AppData\Local\Temp\RtmpYR9kfS\downloaded_packages
library('readxl')
library(readxl)
retail <- read_excel("C:\\Users\\tuhin\\MY PRACTICE\\R-Programming\\R ML Models\\1.0 DataSet ML Model\\Retail\\Online Retail.xlsx")
head(retail)
## # A tibble: 6 x 8
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## <chr> <chr> <chr> <dbl> <dttm> <dbl>
## 1 536365 85123A WHITE HANGING HEAR~ 6 2010-12-01 08:26:00 2.55
## 2 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39
## 3 536365 84406B CREAM CUPID HEARTS~ 8 2010-12-01 08:26:00 2.75
## 4 536365 84029G KNITTED UNION FLAG~ 6 2010-12-01 08:26:00 3.39
## 5 536365 84029E RED WOOLLY HOTTIE ~ 6 2010-12-01 08:26:00 3.39
## 6 536365 22752 SET 7 BABUSHKA NES~ 2 2010-12-01 08:26:00 7.65
## # ... with 2 more variables: CustomerID <dbl>, Country <chr>
# NA value checking
colSums(is.na(retail))
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 1454 0 0 0
## CustomerID Country
## 135080 0
#remove all na value
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0 v purrr 0.3.4
## v tibble 3.1.6 v stringr 1.5.0
## v tidyr 1.2.1 v forcats 0.5.1
## v readr 2.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
retail <- retail %>%
drop_na(CustomerID)
#remove duplicated data
retail <- retail[!duplicated(retail),]
#check overall data
summary(retail)
## InvoiceNo StockCode Description Quantity
## Length:401604 Length:401604 Length:401604 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 5.00
## Mean : 12.18
## 3rd Qu.: 12.00
## Max. : 80995.00
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.00 Min. :12346
## 1st Qu.:2011-04-06 15:02:00 1st Qu.: 1.25 1st Qu.:13939
## Median :2011-07-29 15:40:00 Median : 1.95 Median :15145
## Mean :2011-07-10 12:08:23 Mean : 3.47 Mean :15281
## 3rd Qu.:2011-10-20 11:58:30 3rd Qu.: 3.75 3rd Qu.:16784
## Max. :2011-12-09 12:50:00 Max. :38970.00 Max. :18287
## Country
## Length:401604
## Class :character
## Mode :character
##
##
##
retail <- retail %>%
filter(Quantity>0 & UnitPrice>0)
summary(retail)
## InvoiceNo StockCode Description Quantity
## Length:392692 Length:392692 Length:392692 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 2.00
## Mode :character Mode :character Mode :character Median : 6.00
## Mean : 13.12
## 3rd Qu.: 12.00
## Max. :80995.00
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00 Min. : 0.001 Min. :12346
## 1st Qu.:2011-04-07 11:12:00 1st Qu.: 1.250 1st Qu.:13955
## Median :2011-07-31 12:02:00 Median : 1.950 Median :15150
## Mean :2011-07-10 19:13:07 Mean : 3.126 Mean :15288
## 3rd Qu.:2011-10-20 12:53:00 3rd Qu.: 3.750 3rd Qu.:16791
## Max. :2011-12-09 12:50:00 Max. :8142.750 Max. :18287
## Country
## Length:392692
## Class :character
## Mode :character
##
##
##
library(lubridate)
## Loading required package: timechange
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Creating cohort group
retail_cohort <- retail %>%
group_by(CustomerID) %>%
summarise(CohortMonth = floor_date(min(InvoiceDate), unit = "month"))
# Extracting InvoiceMonth from InvoiceDate
retail <- retail %>%
mutate(InvoiceMonth = floor_date(InvoiceDate, unit = "month"))
# Merge the cohort group data frame to the prior dataframe
retail_merge <- merge(retail, retail_cohort, by.x = "CustomerID", by.y = "CustomerID")
head(retail_merge)
## CustomerID InvoiceNo StockCode Description Quantity
## 1 12346 541431 23166 MEDIUM CERAMIC TOP STORAGE JAR 74215
## 2 12347 581180 21731 RED TOADSTOOL LED NIGHT LIGHT 24
## 3 12347 573511 23506 MINI PLAYING CARDS SPACEBOY 20
## 4 12347 573511 22131 FOOD CONTAINER SET 3 LOVE HEART 6
## 5 12347 573511 22726 ALARM CLOCK BAKELIKE GREEN 8
## 6 12347 549222 22423 REGENCY CAKESTAND 3 TIER 3
## InvoiceDate UnitPrice Country InvoiceMonth CohortMonth
## 1 2011-01-18 10:01:00 1.04 United Kingdom 2011-01-01 2011-01-01
## 2 2011-12-07 15:52:00 1.65 Iceland 2011-12-01 2010-12-01
## 3 2011-10-31 12:25:00 0.42 Iceland 2011-10-01 2010-12-01
## 4 2011-10-31 12:25:00 1.95 Iceland 2011-10-01 2010-12-01
## 5 2011-10-31 12:25:00 3.75 Iceland 2011-10-01 2010-12-01
## 6 2011-04-07 10:43:00 12.75 Iceland 2011-04-01 2010-12-01
# Making date transaction into hourly
retail_fc <- retail %>%
mutate(datetime = floor_date(InvoiceDate,unit = "hour"))
# Summarise data to get the number of leads hourly
retail_fc <- retail_fc %>%
group_by(datetime) %>%
summarise(total_purchased = n_distinct(InvoiceNo))
#time padding
library(padr)
min_date <- min(retail_fc$datetime)
max_date <- max(retail_fc$datetime)
retail_fc <- retail_fc %>%
pad(start_val = make_datetime(year = year(min_date),month = month(min_date), day= day(min_date), hour = 0), end_val =make_datetime(year = year(max_date),month = month(max_date), day= day(max_date), hour = 23) )
## pad applied on the interval: hour
retail_fc
## # A tibble: 8,976 x 2
## datetime total_purchased
## <dttm> <int>
## 1 2010-12-01 00:00:00 NA
## 2 2010-12-01 01:00:00 NA
## 3 2010-12-01 02:00:00 NA
## 4 2010-12-01 03:00:00 NA
## 5 2010-12-01 04:00:00 NA
## 6 2010-12-01 05:00:00 NA
## 7 2010-12-01 06:00:00 NA
## 8 2010-12-01 07:00:00 NA
## 9 2010-12-01 08:00:00 6
## 10 2010-12-01 09:00:00 16
## # ... with 8,966 more rows
retail_fc %>%
mutate(hour = as.factor(hour(datetime)),
total_purchased=replace_na(total_purchased,0)) %>%
group_by(hour) %>%
summarise(total = sum(total_purchased))
## # A tibble: 24 x 2
## hour total
## <fct> <int>
## 1 0 0
## 2 1 0
## 3 2 0
## 4 3 0
## 5 4 0
## 6 5 0
## 7 6 1
## 8 7 29
## 9 8 555
## 10 9 1393
## # ... with 14 more rows
retail_fc <- retail_fc %>%
filter(hour(datetime)>=6 & hour(datetime)<=20) %>%
mutate(total_purchased=replace_na(total_purchased,0))
retail_fc
## # A tibble: 5,610 x 2
## datetime total_purchased
## <dttm> <int>
## 1 2010-12-01 06:00:00 0
## 2 2010-12-01 07:00:00 0
## 3 2010-12-01 08:00:00 6
## 4 2010-12-01 09:00:00 16
## 5 2010-12-01 10:00:00 11
## 6 2010-12-01 11:00:00 12
## 7 2010-12-01 12:00:00 22
## 8 2010-12-01 13:00:00 12
## 9 2010-12-01 14:00:00 8
## 10 2010-12-01 15:00:00 14
## # ... with 5,600 more rows
# Converting dataframe into time series data
retail_ts <- ts(retail_fc$total_purchased, start = c(1,1), frequency = 15)
# Visualization
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
retail_ts %>%
autoplot()+
theme_minimal()

retail_ts %>%
tail(13*7*4) %>%
stl(s.window = "periodic") %>%
autoplot()

#Assigning the decomposed single seasonality model
retail_single_decompose <- decompose(retail_ts)
# Create multiple seasonality time series
retail_msts <- msts(data = retail_fc$total_purchased,seasonal.periods = c(15,15*7))
# Decomposing
retail_msts %>%
tail(15*7*4) %>%
stl(s.window = "periodic") %>%
autoplot()

# Assigning decomposed multiple seasonality model
retail_double_decompose <- mstl(retail_msts)
# Single seasonality
retail_fc %>%
mutate(
seasonal = retail_single_decompose$seasonal,
hour = hour(datetime)
) %>%
distinct(hour, seasonal) %>%
ggplot(mapping = aes(x = hour, y = seasonal)) +
geom_col() +
theme_minimal() +
scale_x_continuous(breaks = seq(6,20,1)) +
labs(
title = "Single Seasonality Analysis",
subtitle = "Daily"
)

# Multiple Seasonality
as.data.frame(retail_double_decompose) %>%
mutate(datetime = retail_fc$datetime) %>%
mutate(
dow = wday(datetime, label = TRUE, abbr = FALSE),
hour = as.factor(hour(datetime))
) %>%
group_by(dow, hour) %>%
summarise(seasonal = sum(Seasonal15 + Seasonal105)) %>%
ggplot(mapping = aes(x = hour, y = seasonal)) +
geom_col(aes(fill = dow)) +
scale_fill_viridis_d(option = "cream") +
theme_minimal() +
labs(
title = "Multiple Seasonality Analysis",
subtitle = "Daily & Weekly"
)
## `summarise()` has grouped output by 'dow'. You can override using the `.groups`
## argument.
## Warning in viridisLite::viridis(n, alpha, begin, end, direction, option): Option
## 'cream' does not exist. Defaulting to 'viridis'.

salesData <- retail
salesData$Amount <- salesData$Quantity*salesData$UnitPrice
sapply(salesData, function(k) sum(is.na(k)))
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 0 0 0 0
## CustomerID Country InvoiceMonth Amount
## 0 0 0 0
salesData <- na.omit(salesData)
salesData <- salesData[!is.na(salesData$CustomerID),]
salesData <- salesData[!is.na(salesData$Description),]
nrow(salesData)
## [1] 392692
salesData$Day <- format(as.Date(salesData$InvoiceDate),"%d")
salesData$Month <- format(as.Date(salesData$InvoiceDate),"%m")
salesData$Year <- format(as.Date(salesData$InvoiceDate),"%Y")
cncldSalesData <- salesData[startsWith(salesData$InvoiceNo,"C"),]
salesData <- salesData[!startsWith(salesData$InvoiceNo,"C"),]
boxplot(salesData$Amount)

salesData <- salesData[salesData$Amount<20000,]
Top_2010 <- aggregate(Amount~Description+Year,salesData,sum)
Top_2011 <- subset(Top_2010, Year=="2011")
Top_2010 <- subset(Top_2010,Year=="2010")
Top_2010 <- head(Top_2010[with(Top_2010,order(-Amount)),])
Top_2011 <- head(Top_2011[with(Top_2011,order(-Amount)),])
Top_2010$Description <- factor(Top_2010$Description,levels=Top_2010$Description[order(Top_2010$Amount)])
Top_2011$Description <- factor(Top_2011$Description,levels=Top_2011$Description[order(Top_2011$Amount)])
ggplot(Top_2010,aes(x=Description,y=Amount)) + geom_bar(stat = "identity") + coord_flip()

ggplot(Top_2011,aes(x=Description,y=Amount)) + geom_bar(stat = "identity") + coord_flip()

library(imager)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following object is masked from 'package:padr':
##
## pad
## The following object is masked from 'package:stringr':
##
## boundary
## The following object is masked from 'package:tidyr':
##
## fill
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
im<-load.image("C:\\Users\\tuhin\\MY PRACTICE\\R-Programming\\R ML Models\\1.0 DataSet ML Model\\Retail\\rpus_pic.PNG")
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
rasterImage(im,0,0,1,1)
