The assignment is for testing the analytcal skills. I was asked to use data science concepts and try find insights for mother care stores. For this reason I have divided the assignment into three section:
P:S. The first three section only includes the codes.
The store that I’ve analyzed are the following
setwd("Z:/Ferguson 13 march/other assignment/DS/ai mir/sorted data")
library(rJava)
options(java.parameters = "-Xmx16000m")
#http://stackoverflow.com/questions/19147884/importing-a-big-xlsx-file-into-r
library("openxlsx")
library("XLConnect")
dha <- readWorksheetFromFile("combined3.xlsx",
sheet = 1, check.names = FALSE)
dmc <- readWorksheetFromFile("combined3.xlsx",
sheet = 4, check.names = FALSE)
dmt <- readWorksheetFromFile("combined3.xlsx",
sheet= 5, check.names = FALSE)
kmg <- readWorksheetFromFile("combined3.xlsx",
sheet = 6, check.names = FALSE)
xin <- dha <- readWorksheetFromFile("combined3.xlsx",
sheet = 7, check.names = FALSE)
colnames(dmc)[1] <- "date"
colnames(dmc)[2] <- "day"
colnames(dha)[1] <- "date"
colnames(dha)[2] <- "day"
colnames(xin)[1] <- "date"
colnames(xin)[2] <- "day"
dmc <- dmc[,-c(123:143)]
dmc <- dmc[, c(1:120,122,121)]
totals <- c("total_customers", "total_trans", "total_units_sold",
"total_amount", "conversion_ratio", "ATV", "UPT_IPC")
colnames(dha)[(ncol(dha)- 6): ncol(dha)] <- totals
colnames(dmc)[(ncol(dmc)- 6): ncol(dmc)] <- totals
colnames(dmt)[(ncol(dmt)- 6): ncol(dmt)] <- totals
colnames(kmg)[(ncol(kmg)- 6): ncol(kmg)] <- totals
colnames(xin)[(ncol(xin)- 6): ncol(xin)] <- totals
common_cols <- intersect(colnames(dha), colnames(dmc))
combine1 <- rbind(dha[,common_cols], dmc[,common_cols], kmg[,common_cols])
common_cols <- intersect(colnames(combine1), colnames(dmt))
combine2 <- rbind(combine1[,common_cols], dmt[,common_cols])
common_cols <- intersect(colnames(combine2), colnames(xin))
combine3 <- rbind(combine2[,common_cols], xin[,common_cols])
combo <- combine3[apply(combine3, 1,function(x) sum(!is.na(x))/length(x) >= .1),]
library(plyr)
combo$day <- trimws(combo$day)
combo$day <- tolower(combo$day)
combo$day <- mapvalues(combo$day, c("mon", "tue", "tuseday", "wed",
"thr", "thu", "fri", "sat", "sun", "thur",
"thuesday"),
c("monday", "tuesday", "tuesday", "wednesday",
"thursday", "thursday", "friday", "saturday", "sunday",
"thursday", "tuesday"))
library(tidyr)
combo2 <-gather(combo, factor, value, -c(date, day, store,total_customers, total_trans,
total_units_sold, total_amount,
conversion_ratio, ATV, UPT_IPC))
combo3 <- separate(data = combo2, col = factor,into = c("Time_Interval", "factor"),
sep = "-")
combo3$date<- combo3$date[order(as.Date(combo3$date, format = "%Y-%m-%d"))]
combo4 <- subset(combo3, date > 1901-03-11)
combo5 <- combo4[complete.cases(combo4$value),]
combo5 <- combo5[combo5$day != 42094,]
combo5 <- combo5[combo5$store != "dmc",]
combo5 <- combo5[combo5$total_customers < 300,]
combo5$day <- factor(combo5$day, levels = c("monday","tuesday","wednesday","thursday",
"friday", "saturday","sunday"))
library(dplyr)
#ATV by day in each store
atv_store <- combo5 %>% group_by(store, day) %>% summarise(avg = mean(na.omit(ATV)))
#and ATV for time interval
atv_time <- combo5 %>% group_by(store, Time_Interval) %>% summarise(avg = mean(na.omit(ATV)))
#UPT_IPC by day in each store
combo5$UPT_IPC <- as.numeric(as.character(combo5$UPT_IPC))
upt_store <- combo5 %>% group_by(store, day) %>% summarise(avg = mean(na.omit(UPT_IPC)))
upt_time <- combo5 %>% group_by(store, Time_Interval) %>% summarise(avg = mean(na.omit(UPT_IPC)))
#Conversion rate by in each store
con_store <- combo5 %>% filter(conversion_ratio < 100) %>% group_by(store, day) %>%
summarise(avg = mean(na.omit(conversion_ratio)))
con_time <- combo5 %>% filter(conversion_ratio < 100) %>% group_by(store, Time_Interval) %>%
summarise(avg = mean(na.omit(conversion_ratio)))
#Total customers by day in each store
cus_store <- combo5 %>% filter(total_customers < 200) %>% group_by(store, day) %>% summarise(avg = mean(na.omit(total_customers)))
cus_time <- combo5 %>% filter(total_customers <200) %>% group_by(store, Time_Interval) %>% summarise(avg = mean(na.omit(total_customers)))
# total transaction by day in each store
trans_store <- combo5 %>% group_by(store, day) %>% summarise(avg = mean(na.omit(total_trans)))
trans_time <- combo5 %>% group_by(store, Time_Interval) %>% summarise(avg = mean(na.omit(total_trans)))
# total units by day in each store
unit_store <- combo5 %>% group_by(store, day) %>% summarise(avg = mean(na.omit(total_units_sold)))
unit_time <- combo5 %>% group_by(store, Time_Interval) %>% summarise(avg = mean(na.omit(total_units_sold)))
# amount by day in each store
amount_store <- combo5 %>% group_by(store, day) %>% summarise(avg = mean(na.omit(total_amount)))
amount_time <- combo5 %>% group_by(store, Time_Interval) %>% summarise(avg = mean(na.omit(total_amount)))
This section includes plots the relation between the different factors in the mother care data.
library(ggthemes); library(ggplot2)
#total number of transation and total customers by store
ggplot(combo5, aes(total_trans, total_customers)) +
geom_point(aes(colour = store))+
theme_economist() + labs(title = "Total Number Transaction and Total Customers",
x = "Total Number of Transaction",
y = "Total Customers") +
theme(legend.key = element_rect(colour = 'blue', fill = 'white', size = 0.8))+
theme(legend.title = element_blank())
The plot above shows the relation between Total Number of customers and Total number of transaction. The scatter plot shows the there is a linear positive correlation between the two variable. How dichotimizing on the basis on store gives a better insight where the pink polka dots represents store “DMT”, green represents “KMG” and blue represents “XIN”. Inspecting the these stores shows that at “KMG store” the customers are concentrated at the origin meaning there is low activity at both levels(Total customers and number of transacitons). However, at “DMT” store there is high economic activity where the dots are scattter along the 45 degree line. Finally, there is the “XIN” store which somehow suggests that the store is at the high end area because of the flat relation. i.e although there are few customers yet the number of transaction are greater propotionally to the other stores.
# Total Customers and Total Amount by store
ggplot(combo5, aes(total_amount, total_customers)) + geom_point(aes(colour = store)) +
theme_economist() + labs(title = "Total Amout and Total Customers", x = "Total Amount",
y = "Total Customers") + theme(legend.key = element_rect(colour = "blue",
fill = "white", size = 0.8)) + theme(legend.title = element_blank())
The plot shows the similar results as the top one where the holistic view that the relationship is lineally positive. The pink Polka dot representing the “dmt” store shows the most economic activity. One thing that is pertinent is the the pink dot plot are steeper than the kmg and xin store suggesting that the visitors at dmt have lower purchasing power than the the other stores
#total number of transation and total customers by Time interval
ggplot(combo5, aes(total_trans, total_customers)) +
geom_jitter(aes(alpha = Time_Interval))+
theme_economist() + labs(title = "Total Number Transaction and Total Customers",
x = "Total Number of Transaction",
y = "Total Customers") +
theme(legend.key = element_rect(colour = 'blue', fill = 'white', size = 0.8))+
theme(legend.title = element_blank())
The above plot shows the the relationship between total number of customers and total number of transaction for different time periods irrespective of stores. As shown in the plot legends, the transparent dots represent the after noon period and with the passage of time the dots get darker. As evident most of the economic activity has taken place in the the evening and night.
#Total Customers and Total Amount by time_interval
ggplot(combo5, aes(total_amount, total_customers)) +
geom_jitter(aes(alpha = Time_Interval))+
theme_economist() + labs(title = "Total Amout and Total Customers",
x = "Total Amount",
y = "Total Customers") +
theme(legend.key = element_rect(colour = 'blue', fill = 'white', size = 0.8))+
theme(legend.title = element_blank())
The above plot shows the relationship between Total customer and Total Amount. The relationship is positive as above and tell the same story that most of the economic activity took place in the evening and the night.
###atv_store and atv time
p <- ggplot(atv_store, aes(day, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_solarized() + ggtitle("Average ATV in a week for three stores")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold"))
The above plot shows the average ATV for three store over the week. The average ATV is on the Y-axis and days of the week are on the x-axis. It shows that on average the lowest ATV is for kmg store. The ATV is highest on saturday for dmt store while on the week days xin store has shown the highest ATV
p <- ggplot(atv_time, aes(Time_Interval, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_wsj() + ggtitle("Average ATV for three stores")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold")) +
coord_flip()
The above plot shows average ATV for a particular day.The plot shows that ATV is similar for dmt and kmg store while ATV is lowest for kmg for any particular day on average.
###upt_store and upt time
p <- ggplot(upt_store, aes(day, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_solarized() + ggtitle("Average UPT in a week for three stores")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold"))
The above plot shows average UPT for three stores in week.The average ATV is on the Y-axis and days of the week are on the x-axis.The average UPT has remained below 3 for almost all three store over a week with the only exception kmg store on tuesday. DMT stores have lower UPT compared to other stores.
p <- ggplot(upt_time, aes(Time_Interval, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_wsj() + ggtitle("Average UPT for different Periods")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold")) +
coord_flip()
The above plot shows average UPT for a particular day.The plot shows that UPT is highest for KMG stores and lowest for DMT stores on a particular day.
###con_store and con time
p <- ggplot(con_store, aes(day, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_solarized() + ggtitle("Average Conversion Ratio in a week")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold"))
The above plot shows the average conversion of three stores in a particular week. The conversion ratio is lowest for dmt store and highest for the xin stores
p <- ggplot(con_time, aes(Time_Interval, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_wsj() + ggtitle("Average conversion ratio over a Day")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold")) +
coord_flip()
The plot avove shows the average conversion ratio over a particular day. The conversion ratio is highest for the xin and lowest for the dmt indicating customer cant buy thing they came for.
###cus_store and cus time
p <- ggplot(cus_store, aes(day, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_solarized() + ggtitle("Average Number of Customers in a week")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold"))
The above plot shows the average number of customers in a week for three stores. It is evident that dmt has the highest number of customers peaking over the weekend while kmg has the lowest number of customers on average in a particular week.
p <- ggplot(cus_time, aes(Time_Interval, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_wsj() + ggtitle("Average Number of Customers in a particular day")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold")) +
coord_flip()
This graphs endorse the the first graph where dmt has the highest number of customer and kmg store has the lowest number of customers. The number of customers arrival at the particular time slots is similar for all three stores in a particular day.
###trans_store and trnas time
p <- ggplot(trans_store, aes(day, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_solarized() + ggtitle("Average Number of transactions in a week")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold"))
The above graph shows the number of transaction in a particular week. The highest number of transaction are for xin store while the lowest are for kmg store. The highest number of transaction peaks on saturday for all three stores
p <- ggplot(trans_time, aes(Time_Interval, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_wsj() + ggtitle("Average Number of transaction for different Time")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold")) +
coord_flip()
This graphs endorse the the first graph where dmt has the highest number of transaction and kmg store has the lowest number of transaction. The number of transaction at the particular time slots is similar for all three stores in a particular day.
###upt_store and upt time
p <- ggplot(unit_store, aes(day, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_solarized() + ggtitle("Average Number of Units sold in a week")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold"))
The above graph shows the number units sold in a particular week. The highest number of transaction are for xin store while the lowest are for kmg store. The highest number of transaction peaks on saturday for all three stores.
p <- ggplot(unit_time, aes(Time_Interval, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_wsj() + ggtitle("Average number of Units Sold for different Periods")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold")) +
coord_flip()
This graphs endorse the the first graph where dmt has the highest number of units sold and kmg store has the lowest number of units sold. The number of transaction at the particular time slots is similar for all three stores in a particular day.
###amount_store and amount time
p <- ggplot(amount_store, aes(day, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_solarized() + ggtitle("Average Revenue in a week for days")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold"))
The above graph shows the average revenuw in a particular week. The highest revenue store is xin store while the lowest is for kmg store. The highest number of transaction peaks on saturday for all three stores.
p <- ggplot(amount_time, aes(Time_Interval, avg)) +
geom_point(aes(colour = store, size = 2))+
theme_wsj() + ggtitle("Average Revenue for different Periods")
p + scale_size(guide = "none") + theme(legend.title = element_blank(),
legend.text = element_text(colour = "black",
size = 16, face = "bold"))+
theme(plot.title = element_text(lineheight = .8, size = 20,face = "bold")) +
coord_flip()
This graphs endorse the the first graph where dmt has the highest revenue and kmg store has the lowest. The revenue at the particular time slots is similar for all three stores in a particular day.