Introduction

This project compares how often four e-commerce keywords (Tiki, Sendo, Shopee, and Lazada) were searched, in order to better understand the activity of e-commerce users over one week (7 days).

Set up the packages and standardize the datasets for visualization

Load packages

suppressMessages(library(tidyverse))
## Warning: package 'tidyverse' was built under R version 3.6.2
suppressMessages(library(viridis))
## Warning: package 'viridis' was built under R version 3.6.2
suppressMessages(library(ggsci))
## Warning: package 'ggsci' was built under R version 3.6.3
suppressMessages(library(lubridate))
suppressMessages(library(stringr))
suppressMessages(library(stringi))
suppressMessages(library(explore))
## Warning: package 'explore' was built under R version 3.6.2
suppressMessages(library(DT))
## Warning: package 'DT' was built under R version 3.6.2
suppressMessages(library(Amelia))
## Warning: package 'Amelia' was built under R version 3.6.3
suppressMessages(library(gridExtra))
## Warning: package 'gridExtra' was built under R version 3.6.2
theme_set(theme_minimal())

Load datasets

The approach is to collect the datasets in a list and then use `lapply` to apply the same operation to each of them, which avoids repeated code and saves processing time.

# Read every CSV file in the working directory into a list of data frames.
# The paths are sorted explicitly because the order returned by Sys.glob() is
# system-specific, while the code below picks the tables by position.
csv_paths <- sort(Sys.glob("*.csv"))
if (length(csv_paths) == 0) {
  stop("No CSV files found in ", getwd(), call. = FALSE)
}
dataset <- lapply(csv_paths, read.csv, stringsAsFactors = FALSE)

"Built function to standardize"
## [1] "Built function to standardize"
# Convert the `date` column of a search-count table to POSIXct.
#
# Every "T" in the raw text is replaced by a space and the result is parsed
# with the format "%Y-%m-%d %H", i.e. with a resolution of one hour.
#
# Args:
#   dt: data frame with a `date` column (character or factor).
#   tz: time zone handed to as.POSIXct(). The default "" is the session's
#       local time zone, which is what the function used implicitly before.
#
# Returns:
#   `dt` with `date` replaced by a POSIXct vector; other columns are unchanged.
clean_function <- function(dt, tz = "") {
  if (!"date" %in% names(dt)) {
    stop("`dt` must contain a `date` column", call. = FALSE)
  }

  # as.character() also covers factor columns (the read.csv() default in R < 4).
  raw_date <- as.character(dt$date)
  parsed <- as.POSIXct(
    gsub("T", " ", raw_date, fixed = TRUE),
    tz = tz,
    format = "%Y-%m-%d %H"
  )

  # Text that does not match the format becomes NA; report it rather than
  # passing NA timestamps on silently.
  n_failed <- sum(is.na(parsed) & !is.na(raw_date))
  if (n_failed > 0) {
    warning(n_failed, " `date` value(s) could not be parsed", call. = FALSE)
  }

  dt$date <- parsed
  dt
}
# Parse the timestamps of every loaded table.
dataset <- lapply(dataset, clean_function)

# The tables are picked by position: 1 = lazada, 2 = sendo, 3 = shopee,
# 4 = tiki. NOTE(review): this assumes the CSV files were loaded in
# alphabetical order and that no other CSV sorts before them.
stopifnot(length(dataset) >= 4L)

# Add the hour of day and the day of week as factor columns.
# With lubridate's default week start, wday() returns 1 for Sunday.
add_time_parts <- function(dt) {
  dt %>%
    mutate(
      hour = factor(lubridate::hour(date)),
      wday = factor(lubridate::wday(date))
    )
}

# `[[` extracts the data frame itself instead of a one-element list.
lazada <- add_time_parts(dataset[[1]])
sendo <- add_time_parts(dataset[[2]])
shopee <- add_time_parts(dataset[[3]])
tiki <- add_time_parts(dataset[[4]])

Let's have a look at a sample of the Lazada dataset.

# Render the first rows of the Lazada table as a formatted table.
library(knitr)
lazada %>%
  head() %>%
  kable()
date n hour wday
2020-03-03 09:00:00 86 9 3
2020-03-03 10:00:00 87 10 3
2020-03-03 11:00:00 92 11 3
2020-03-03 12:00:00 81 12 3
2020-03-03 13:00:00 84 13 3
2020-03-03 14:00:00 86 14 3

From this table, you can easily see that the main components are the time (`date`) and `n`, where `n` is the total number of searches in each record (row).

I also standardized the datasets and extracted the weekday and the hour for the analyses below.

## Check NA values

Once again, make sure that the datasets are free of NA values.

# Check every dataset for missing values.
# Each map gets its own title so that the four plots can be told apart.
missmap(tiki, main = "Missingness Map - Tiki")
missmap(lazada, main = "Missingness Map - Lazada")

missmap(sendo, main = "Missingness Map - Sendo")
missmap(shopee, main = "Missingness Map - Shopee")

These results are quite good.

# Visualize the datasets

## Histogram of each dataset

# Density-scaled histogram (35 bins) of the search counts `n` of one site,
# overlaid with a kernel density estimate.
#   dt:   data frame with a numeric column `n`
#   site: site name shown in the plot title
plot_search_histogram <- function(dt, site) {
  ggplot(dt, aes(x = n, y = ..density..)) +
    geom_histogram(bins = 35, fill = "steelblue", col = "white") +
    geom_density(fill = "green", alpha = 0.3, col = "red") +
    # The old titles read "Number of Suffering ... Website" (typo) and
    # capitalised the site names inconsistently.
    ggtitle(label = paste("Number of Searches for", site))
}

tiki_plot <- plot_search_histogram(tiki, "Tiki")

lazada_plot <- plot_search_histogram(lazada, "Lazada")

sendo_plot <- plot_search_histogram(sendo, "Sendo")

shopee_plot <- plot_search_histogram(shopee, "Shopee")

grid.arrange(tiki_plot, sendo_plot, shopee_plot, lazada_plot)

From these plots I can conclude that Shopee has the largest number of keyword searches. Note, however, that the bins are not the same across the plots. Lazada's frequency distribution clearly separates into two parts, while Tiki and Sendo are fairly consistent from day to day.

Hour vs Weekday

Tiki

# Tiki: totals per hour, totals per weekday, hour x weekday heatmap, and the
# raw series over time, arranged in a 2 x 2 grid.
# Fixes over the previous version: labs() takes `x`/`y` (not `xlabs`/`ylabs`),
# the viridis scale is applied to the `fill` aesthetic that is actually mapped,
# and the constant alpha is set on the geom instead of inside aes().

# Total searches per hour of day.
plot1 <- tiki %>%
  group_by(hour) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = hour, y = total, fill = hour)) +
  geom_col(alpha = 0.5) +
  labs(
    title = "Searching numbers per hour of tiki",
    x = "Hour",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Total searches per day of week.
plot2 <- tiki %>%
  group_by(wday) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = wday, y = total, fill = wday)) +
  geom_col() +
  labs(
    title = "Searching numbers per wday of tiki",
    x = "Wday",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Heatmap of total searches for every hour / weekday combination.
plot3 <- tiki %>%
  group_by(hour, wday) %>%
  summarise(total = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = hour, y = wday)) +
  geom_tile(aes(fill = total)) +
  scale_fill_viridis(option = "A") +
  labs(
    title = "Searching numbers per hour vs wday of tiki",
    x = "Hour",
    y = "Wday"
  )

# Search counts over time (the old title was copied from the heatmap).
plot4 <- tiki %>%
  ggplot(aes(x = date, y = n)) +
  geom_line(aes(group = 1), color = "blue", lwd = 3) +
  labs(
    title = "Searching numbers over time of tiki",
    x = "Date",
    y = "Frequency"
  )

grid.arrange(plot1, plot2, plot3, plot4, nrow = 2, ncol = 2)

Sendo

# Sendo: totals per hour, totals per weekday, hour x weekday heatmap, and the
# raw series over time, arranged in a 2 x 2 grid.
# Fixes over the previous version: labs() takes `x`/`y` (not `xlabs`/`ylabs`),
# the viridis scale is applied to the `fill` aesthetic that is actually mapped,
# and the constant alpha is set on the geom instead of inside aes().

# Total searches per hour of day.
plot1 <- sendo %>%
  group_by(hour) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = hour, y = total, fill = hour)) +
  geom_col(alpha = 0.5) +
  labs(
    title = "Searching numbers per hour of sendo",
    x = "Hour",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Total searches per day of week.
plot2 <- sendo %>%
  group_by(wday) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = wday, y = total, fill = wday)) +
  geom_col() +
  labs(
    title = "Searching numbers per wday of sendo",
    x = "Wday",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Heatmap of total searches for every hour / weekday combination.
plot3 <- sendo %>%
  group_by(hour, wday) %>%
  summarise(total = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = hour, y = wday)) +
  geom_tile(aes(fill = total)) +
  scale_fill_viridis(option = "A") +
  labs(
    title = "Searching numbers per hour vs wday of sendo",
    x = "Hour",
    y = "Wday"
  )

# Search counts over time (the old title was copied from the heatmap).
plot4 <- sendo %>%
  ggplot(aes(x = date, y = n)) +
  geom_line(aes(group = 1), color = "blue", lwd = 3) +
  labs(
    title = "Searching numbers over time of sendo",
    x = "Date",
    y = "Frequency"
  )

grid.arrange(plot1, plot2, plot3, plot4, nrow = 2, ncol = 2)

Lazada

# Lazada: totals per hour, totals per weekday, hour x weekday heatmap, and the
# raw series over time, arranged in a 2 x 2 grid.
# Fixes over the previous version: labs() takes `x`/`y` (not `xlabs`/`ylabs`),
# the viridis scale is applied to the `fill` aesthetic that is actually mapped,
# and the constant alpha is set on the geom instead of inside aes().

# Total searches per hour of day.
plot1 <- lazada %>%
  group_by(hour) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = hour, y = total, fill = hour)) +
  geom_col(alpha = 0.5) +
  labs(
    title = "Searching numbers per hour of lazada",
    x = "Hour",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Total searches per day of week.
plot2 <- lazada %>%
  group_by(wday) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = wday, y = total, fill = wday)) +
  geom_col() +
  labs(
    title = "Searching numbers per wday of lazada",
    x = "Wday",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Heatmap of total searches for every hour / weekday combination.
plot3 <- lazada %>%
  group_by(hour, wday) %>%
  summarise(total = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = hour, y = wday)) +
  geom_tile(aes(fill = total)) +
  scale_fill_viridis(option = "A") +
  labs(
    title = "Searching numbers per hour vs wday of lazada",
    x = "Hour",
    y = "Wday"
  )

# Search counts over time (the old title was copied from the heatmap).
plot4 <- lazada %>%
  ggplot(aes(x = date, y = n)) +
  geom_line(aes(group = 1), color = "blue", lwd = 3) +
  labs(
    title = "Searching numbers over time of lazada",
    x = "Date",
    y = "Frequency"
  )

grid.arrange(plot1, plot2, plot3, plot4, nrow = 2, ncol = 2)

Shopee

# Shopee: totals per hour, totals per weekday, hour x weekday heatmap, and the
# raw series over time, arranged in a 2 x 2 grid.
# Fixes over the previous version: labs() takes `x`/`y` (not `xlabs`/`ylabs`),
# the viridis scale is applied to the `fill` aesthetic that is actually mapped,
# and the constant alpha is set on the geom instead of inside aes().

# Total searches per hour of day.
plot1 <- shopee %>%
  group_by(hour) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = hour, y = total, fill = hour)) +
  geom_col(alpha = 0.5) +
  labs(
    title = "Searching numbers per hour of shopee",
    x = "Hour",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Total searches per day of week.
plot2 <- shopee %>%
  group_by(wday) %>%
  summarise(total = sum(n)) %>%
  ggplot(aes(x = wday, y = total, fill = wday)) +
  geom_col() +
  labs(
    title = "Searching numbers per wday of shopee",
    x = "Wday",
    y = "Frequency"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "A")

# Heatmap of total searches for every hour / weekday combination.
plot3 <- shopee %>%
  group_by(hour, wday) %>%
  summarise(total = sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = hour, y = wday)) +
  geom_tile(aes(fill = total)) +
  scale_fill_viridis(option = "A") +
  labs(
    title = "Searching numbers per hour vs wday of shopee",
    x = "Hour",
    y = "Wday"
  )

# Search counts over time (the old title was copied from the heatmap).
plot4 <- shopee %>%
  ggplot(aes(x = date, y = n)) +
  geom_line(aes(group = 1), color = "blue", lwd = 3) +
  labs(
    title = "Searching numbers over time of shopee",
    x = "Date",
    y = "Frequency"
  )

grid.arrange(plot1, plot2, plot3, plot4, nrow = 2, ncol = 2)