install.packages('readxl',repos="http://cran.rstudio.com/")
## Installing package into 'C:/Users/tuhin/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\tuhin\AppData\Local\Temp\RtmpYR9kfS\downloaded_packages
library('readxl')

library(readxl)
retail <- read_excel("C:\\Users\\tuhin\\MY PRACTICE\\R-Programming\\R ML Models\\1.0 DataSet ML Model\\Retail\\Online Retail.xlsx")
head(retail)
## # A tibble: 6 x 8
##   InvoiceNo StockCode Description         Quantity InvoiceDate         UnitPrice
##   <chr>     <chr>     <chr>                  <dbl> <dttm>                  <dbl>
## 1 536365    85123A    WHITE HANGING HEAR~        6 2010-12-01 08:26:00      2.55
## 2 536365    71053     WHITE METAL LANTERN        6 2010-12-01 08:26:00      3.39
## 3 536365    84406B    CREAM CUPID HEARTS~        8 2010-12-01 08:26:00      2.75
## 4 536365    84029G    KNITTED UNION FLAG~        6 2010-12-01 08:26:00      3.39
## 5 536365    84029E    RED WOOLLY HOTTIE ~        6 2010-12-01 08:26:00      3.39
## 6 536365    22752     SET 7 BABUSHKA NES~        2 2010-12-01 08:26:00      7.65
## # ... with 2 more variables: CustomerID <dbl>, Country <chr>
# NA value checking
colSums(is.na(retail))
##   InvoiceNo   StockCode Description    Quantity InvoiceDate   UnitPrice 
##           0           0        1454           0           0           0 
##  CustomerID     Country 
##      135080           0
#remove all na value
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0     v purrr   0.3.4
## v tibble  3.1.6     v stringr 1.5.0
## v tidyr   1.2.1     v forcats 0.5.1
## v readr   2.1.2     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
retail <- retail %>% 
  drop_na(CustomerID)


#remove duplicated data
retail <- retail[!duplicated(retail),]



#check overall data
summary(retail)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:401604      Length:401604      Length:401604      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     5.00  
##                                                           Mean   :    12.18  
##                                                           3rd Qu.:    12.00  
##                                                           Max.   : 80995.00  
##   InvoiceDate                    UnitPrice          CustomerID   
##  Min.   :2010-12-01 08:26:00   Min.   :    0.00   Min.   :12346  
##  1st Qu.:2011-04-06 15:02:00   1st Qu.:    1.25   1st Qu.:13939  
##  Median :2011-07-29 15:40:00   Median :    1.95   Median :15145  
##  Mean   :2011-07-10 12:08:23   Mean   :    3.47   Mean   :15281  
##  3rd Qu.:2011-10-20 11:58:30   3rd Qu.:    3.75   3rd Qu.:16784  
##  Max.   :2011-12-09 12:50:00   Max.   :38970.00   Max.   :18287  
##    Country         
##  Length:401604     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
retail <- retail %>% 
  filter(Quantity>0 & UnitPrice>0)


summary(retail)
##   InvoiceNo          StockCode         Description           Quantity       
##  Length:392692      Length:392692      Length:392692      Min.   :    1.00  
##  Class :character   Class :character   Class :character   1st Qu.:    2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    6.00  
##                                                           Mean   :   13.12  
##                                                           3rd Qu.:   12.00  
##                                                           Max.   :80995.00  
##   InvoiceDate                    UnitPrice          CustomerID   
##  Min.   :2010-12-01 08:26:00   Min.   :   0.001   Min.   :12346  
##  1st Qu.:2011-04-07 11:12:00   1st Qu.:   1.250   1st Qu.:13955  
##  Median :2011-07-31 12:02:00   Median :   1.950   Median :15150  
##  Mean   :2011-07-10 19:13:07   Mean   :   3.126   Mean   :15288  
##  3rd Qu.:2011-10-20 12:53:00   3rd Qu.:   3.750   3rd Qu.:16791  
##  Max.   :2011-12-09 12:50:00   Max.   :8142.750   Max.   :18287  
##    Country         
##  Length:392692     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Creating cohort group
retail_cohort <- retail %>% 
  group_by(CustomerID) %>% 
  summarise(CohortMonth = floor_date(min(InvoiceDate), unit = "month"))

# Extracting InvoiceMonth from InvoiceDate
retail <- retail %>% 
  mutate(InvoiceMonth = floor_date(InvoiceDate, unit = "month"))


# Merge the cohort group data frame to the prior dataframe
retail_merge <- merge(retail, retail_cohort, by.x = "CustomerID", by.y = "CustomerID")
head(retail_merge)
##   CustomerID InvoiceNo StockCode                     Description Quantity
## 1      12346    541431     23166  MEDIUM CERAMIC TOP STORAGE JAR    74215
## 2      12347    581180     21731   RED TOADSTOOL LED NIGHT LIGHT       24
## 3      12347    573511     23506     MINI PLAYING CARDS SPACEBOY       20
## 4      12347    573511     22131 FOOD CONTAINER SET 3 LOVE HEART        6
## 5      12347    573511     22726      ALARM CLOCK BAKELIKE GREEN        8
## 6      12347    549222     22423        REGENCY CAKESTAND 3 TIER        3
##           InvoiceDate UnitPrice        Country InvoiceMonth CohortMonth
## 1 2011-01-18 10:01:00      1.04 United Kingdom   2011-01-01  2011-01-01
## 2 2011-12-07 15:52:00      1.65        Iceland   2011-12-01  2010-12-01
## 3 2011-10-31 12:25:00      0.42        Iceland   2011-10-01  2010-12-01
## 4 2011-10-31 12:25:00      1.95        Iceland   2011-10-01  2010-12-01
## 5 2011-10-31 12:25:00      3.75        Iceland   2011-10-01  2010-12-01
## 6 2011-04-07 10:43:00     12.75        Iceland   2011-04-01  2010-12-01
# Making date transaction into hourly
retail_fc <- retail %>% 
  mutate(datetime = floor_date(InvoiceDate,unit = "hour"))

# Summarise data to get the number of leads hourly
retail_fc <- retail_fc %>% 
  group_by(datetime) %>% 
  summarise(total_purchased = n_distinct(InvoiceNo))

#time padding
library(padr)
min_date <- min(retail_fc$datetime)
max_date <- max(retail_fc$datetime)
retail_fc <- retail_fc %>% 
  pad(start_val = make_datetime(year = year(min_date),month = month(min_date), day= day(min_date), hour = 0), end_val =make_datetime(year = year(max_date),month = month(max_date), day= day(max_date), hour = 23) )
## pad applied on the interval: hour
retail_fc
## # A tibble: 8,976 x 2
##    datetime            total_purchased
##    <dttm>                        <int>
##  1 2010-12-01 00:00:00              NA
##  2 2010-12-01 01:00:00              NA
##  3 2010-12-01 02:00:00              NA
##  4 2010-12-01 03:00:00              NA
##  5 2010-12-01 04:00:00              NA
##  6 2010-12-01 05:00:00              NA
##  7 2010-12-01 06:00:00              NA
##  8 2010-12-01 07:00:00              NA
##  9 2010-12-01 08:00:00               6
## 10 2010-12-01 09:00:00              16
## # ... with 8,966 more rows
retail_fc %>% 
  mutate(hour = as.factor(hour(datetime)),
         total_purchased=replace_na(total_purchased,0)) %>% 
  group_by(hour) %>% 
  summarise(total = sum(total_purchased))
## # A tibble: 24 x 2
##    hour  total
##    <fct> <int>
##  1 0         0
##  2 1         0
##  3 2         0
##  4 3         0
##  5 4         0
##  6 5         0
##  7 6         1
##  8 7        29
##  9 8       555
## 10 9      1393
## # ... with 14 more rows
retail_fc <- retail_fc %>% 
  filter(hour(datetime)>=6 & hour(datetime)<=20) %>% 
  mutate(total_purchased=replace_na(total_purchased,0))

retail_fc
## # A tibble: 5,610 x 2
##    datetime            total_purchased
##    <dttm>                        <int>
##  1 2010-12-01 06:00:00               0
##  2 2010-12-01 07:00:00               0
##  3 2010-12-01 08:00:00               6
##  4 2010-12-01 09:00:00              16
##  5 2010-12-01 10:00:00              11
##  6 2010-12-01 11:00:00              12
##  7 2010-12-01 12:00:00              22
##  8 2010-12-01 13:00:00              12
##  9 2010-12-01 14:00:00               8
## 10 2010-12-01 15:00:00              14
## # ... with 5,600 more rows
# Converting dataframe into time series data
retail_ts <- ts(retail_fc$total_purchased, start = c(1,1), frequency = 15)

# Visualization
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
retail_ts %>% 
  autoplot()+
  theme_minimal()

retail_ts %>% 
  tail(13*7*4) %>% 
  stl(s.window = "periodic") %>% 
  autoplot()

#Assigning the decomposed single seasonality model
retail_single_decompose <- decompose(retail_ts)




# Create multiple seasonality time series
retail_msts <- msts(data = retail_fc$total_purchased,seasonal.periods = c(15,15*7))

# Decomposing
retail_msts %>% 
  tail(15*7*4) %>% 
  stl(s.window = "periodic") %>% 
  autoplot() 

# Assigning decomposed multiple seasonality model
retail_double_decompose <- mstl(retail_msts)





# Single seasonality

retail_fc %>% 
  mutate(
    seasonal = retail_single_decompose$seasonal,
    hour = hour(datetime)
  ) %>% 
  distinct(hour, seasonal) %>% 
  ggplot(mapping = aes(x = hour, y = seasonal)) +
  geom_col() +
  theme_minimal() +
  scale_x_continuous(breaks = seq(6,20,1)) +
  labs(
    title = "Single Seasonality Analysis",
    subtitle = "Daily"
  )

# Multiple Seasonality

as.data.frame(retail_double_decompose) %>% 
  mutate(datetime = retail_fc$datetime) %>% 
  mutate(
    dow = wday(datetime, label = TRUE, abbr = FALSE),
    hour = as.factor(hour(datetime))
  ) %>% 
  group_by(dow, hour) %>% 
  summarise(seasonal = sum(Seasonal15 + Seasonal105)) %>% 
  ggplot(mapping = aes(x = hour, y = seasonal)) +
  geom_col(aes(fill = dow)) +
  scale_fill_viridis_d(option = "cream") +
  theme_minimal() +
  labs(
    title = "Multiple Seasonality Analysis",
    subtitle = "Daily & Weekly"
  )
## `summarise()` has grouped output by 'dow'. You can override using the `.groups`
## argument.
## Warning in viridisLite::viridis(n, alpha, begin, end, direction, option): Option
## 'cream' does not exist. Defaulting to 'viridis'.

salesData <- retail

salesData$Amount <- salesData$Quantity*salesData$UnitPrice


sapply(salesData, function(k) sum(is.na(k)))
##    InvoiceNo    StockCode  Description     Quantity  InvoiceDate    UnitPrice 
##            0            0            0            0            0            0 
##   CustomerID      Country InvoiceMonth       Amount 
##            0            0            0            0
salesData <- na.omit(salesData)
salesData <- salesData[!is.na(salesData$CustomerID),]
salesData <- salesData[!is.na(salesData$Description),]
nrow(salesData)
## [1] 392692
salesData$Day <- format(as.Date(salesData$InvoiceDate),"%d")
salesData$Month <- format(as.Date(salesData$InvoiceDate),"%m")
salesData$Year <- format(as.Date(salesData$InvoiceDate),"%Y")

cncldSalesData <- salesData[startsWith(salesData$InvoiceNo,"C"),]
salesData <- salesData[!startsWith(salesData$InvoiceNo,"C"),]

boxplot(salesData$Amount)

salesData <- salesData[salesData$Amount<20000,]



Top_2010 <- aggregate(Amount~Description+Year,salesData,sum)
Top_2011 <- subset(Top_2010, Year=="2011")
Top_2010 <- subset(Top_2010,Year=="2010")
Top_2010 <- head(Top_2010[with(Top_2010,order(-Amount)),])
Top_2011 <- head(Top_2011[with(Top_2011,order(-Amount)),])
Top_2010$Description <- factor(Top_2010$Description,levels=Top_2010$Description[order(Top_2010$Amount)])
Top_2011$Description <- factor(Top_2011$Description,levels=Top_2011$Description[order(Top_2011$Amount)])

ggplot(Top_2010,aes(x=Description,y=Amount)) + geom_bar(stat = "identity") + coord_flip()

ggplot(Top_2011,aes(x=Description,y=Amount)) + geom_bar(stat = "identity") + coord_flip()

library(imager)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following object is masked from 'package:padr':
## 
##     pad
## The following object is masked from 'package:stringr':
## 
##     boundary
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
im<-load.image("C:\\Users\\tuhin\\MY PRACTICE\\R-Programming\\R ML Models\\1.0 DataSet ML Model\\Retail\\rpus_pic.PNG")
plot(0:1,0:1,type="n",ann=FALSE,axes=FALSE)
rasterImage(im,0,0,1,1)