#set working directory
getwd()
## [1] "C:/Users/Thinithi/Desktop/PORTFOLIO/Customer Analysis"
setwd("C:/Users/Thinithi/Desktop/PORTFOLIO/Customer Analysis")
#Load the Libraries
library(ggplot2)
library(RColorBrewer)
library(ggfortify) # convert time series data to a dataframe
## Warning: package 'ggfortify' was built under R version 3.5.3
library(doBy) # To use the ordrBY function
library(leaflet) # For the map
library(widgetframe) # To add interactivity to the map
## Loading required package: htmlwidgets
library(forecast) # Convert data to time series
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Load the data
store<-read.csv("orders.csv")
str(store)
## 'data.frame': 9994 obs. of 21 variables:
## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Order.ID : Factor w/ 5009 levels "CA-2014-100006",..: 2501 2501 2297 4373 4373 202 202 202 202 202 ...
## $ Order.Date : Factor w/ 1237 levels "1/1/2017","1/10/2014",..: 305 305 836 94 94 922 922 922 922 922 ...
## $ Ship.Date : Factor w/ 1334 levels "1/1/2015","1/1/2016",..: 220 220 907 129 129 897 897 897 897 897 ...
## $ Ship.Mode : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4 ...
## $ Customer.ID : Factor w/ 793 levels "AA-10315","AA-10375",..: 144 144 240 706 706 89 89 89 89 89 ...
## $ Customer.Name: Factor w/ 793 levels "Aaron Bergman",..: 167 167 202 688 688 114 114 114 114 114 ...
## $ Segment : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ Country : Factor w/ 1 level "United States": 1 1 1 1 1 1 1 1 1 1 ...
## $ city : Factor w/ 531 levels "Aberdeen","Abilene",..: 195 195 267 154 154 267 267 267 267 267 ...
## $ State : Factor w/ 49 levels "Alabama","Arizona",..: 16 16 4 9 9 4 4 4 4 4 ...
## $ Postal.Code : int 42420 42420 90036 33311 33311 90032 90032 90032 90032 90032 ...
## $ Region : Factor w/ 4 levels "Central","East",..: 3 3 4 3 3 4 4 4 4 4 ...
## $ Product.ID : Factor w/ 1862 levels "FUR-BO-10000112",..: 13 56 947 320 1317 186 563 1762 795 438 ...
## $ Category : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1 2 1 2 1 2 3 2 2 ...
## $ Sub.Category : Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3 14 4 2 ...
## $ Product.Name : Factor w/ 1850 levels "\"While you Were Out\" Message Book, One Form per Page",..: 387 833 1440 368 574 570 1137 1099 535 295 ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...
#Testing for missing values
missing<-sapply(store, function(x) sum(is.na(x)))
missing
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode
## 0 0 0 0 0
## Customer.ID Customer.Name Segment Country city
## 0 0 0 0 0
## State Postal.Code Region Product.ID Category
## 0 0 0 0 0
## Sub.Category Product.Name Sales Quantity Discount
## 0 0 0 0 0
## Profit
## 0
#Transactions per customer
customers<- aggregate(store$Customer.Name, by=list(store$Customer.Name), FUN=length)
colnames(customers)<-c("Customer","Count")
head(customers)
## Customer Count
## 1 Aaron Bergman 6
## 2 Aaron Hawkins 11
## 3 Aaron Smayling 10
## 4 Adam Bellavance 18
## 5 Adam Hart 20
## 6 Adam Shillingsburg 25
#Profitability by customer
profits<-aggregate(store$Profit, by=list(Category=store$Customer.Name), FUN=sum)
colnames(profits)<-c("Customer","profit")
head(profits)
## Customer profit
## 1 Aaron Bergman 129.3465
## 2 Aaron Hawkins 365.2152
## 3 Aaron Smayling -253.5746
## 4 Adam Bellavance 2054.5885
## 5 Adam Hart 281.1890
## 6 Adam Shillingsburg 64.5374
##Number of orders from each city
citypop<-aggregate(store$city, by=list(store$city), FUN=length)
colnames(citypop)<-c("City","Count")
head(citypop)
## City Count
## 1 Aberdeen 1
## 2 Abilene 1
## 3 Akron 21
## 4 Albuquerque 14
## 5 Alexandria 16
## 6 Allen 4
#cutomers prefered shipping mode
shipmode<-store[,5]
#assign integer to shipmode
shipdata<-data.frame(store$Customer.Name,store$Ship.Mode)
colnames(shipdata)<-c("customer","shipmode")
head(shipdata)
## customer shipmode
## 1 Claire Gute Second Class
## 2 Claire Gute Second Class
## 3 Darrin Van Huff Second Class
## 4 Sean O'Donnell Standard Class
## 5 Sean O'Donnell Standard Class
## 6 Brosina Hoffman Standard Class
#Number of times each type of shipmode was used by each customer
ship1<-count(shipdata,customer,shipmode)
head(ship1)
## # A tibble: 6 x 3
## customer shipmode n
## <fct> <fct> <int>
## 1 Aaron Bergman First Class 5
## 2 Aaron Bergman Standard Class 1
## 3 Aaron Hawkins First Class 3
## 4 Aaron Hawkins Second Class 3
## 5 Aaron Hawkins Standard Class 5
## 6 Aaron Smayling First Class 6
#Extract the shipmode mostly used by each customer
ship2 <- ship1 %>%
group_by(customer) %>%
filter(n==max(n))
shipp<-ddply(ship2, .(customer), head, n = 1)
head(shipp)
## customer shipmode n
## 1 Aaron Bergman First Class 5
## 2 Aaron Hawkins Standard Class 5
## 3 Aaron Smayling First Class 6
## 4 Adam Bellavance Standard Class 10
## 5 Adam Hart First Class 9
## 6 Adam Shillingsburg Standard Class 16
#Dataframe of the 793 customers
Ptable<-data.frame(customers,profits$profit,shipp$shipmode)
head(Ptable)
## Customer Count profits.profit shipp.shipmode
## 1 Aaron Bergman 6 129.3465 First Class
## 2 Aaron Hawkins 11 365.2152 Standard Class
## 3 Aaron Smayling 10 -253.5746 First Class
## 4 Adam Bellavance 18 2054.5885 Standard Class
## 5 Adam Hart 20 281.1890 First Class
## 6 Adam Shillingsburg 25 64.5374 Standard Class
#Regular customers in descending order
c<-customers[order(customers$Count,decreasing = TRUE),]
head(c)
## Customer Count
## 788 William Brown 37
## 388 John Lee 34
## 503 Matt Abelman 34
## 596 Paul Prost 34
## 146 Chloris Kastensmidt 32
## 250 Edward Hooks 32
#Top 45 regular customers
top45<-customers[customers$Count > quantile(customers$Count,prob=1-0.063),]
top45_ordered<-top45[order(top45$Count,decreasing = TRUE),]
head(top45_ordered)
## Customer Count
## 788 William Brown 37
## 388 John Lee 34
## 503 Matt Abelman 34
## 596 Paul Prost 34
## 146 Chloris Kastensmidt 32
## 250 Edward Hooks 32
#Plot the number of transactions of the top 45 regular customers
ggplot(top45_ordered,aes(x=Count,colour='blue'))+geom_histogram(fill='blue',binwidth = 1,boundary = 0)+ggtitle("Top 45 regular customers(1/3/2014-30/12/2017)")+geom_vline(aes(xintercept=mean(Count)),linetype="dashed")+geom_text(aes(x=mean(top45_ordered$Count ),y=15),label="Mean 27.76",hjust=1, size=3,col=1)+geom_text(aes(x=median(top45_ordered$Count ),y=10),label="Median 27",hjust=1, size=3,col=1)+xlab("Number of Transactions")+ylab("Frequency")+geom_vline(aes(xintercept=median(Count)))+ scale_x_continuous(breaks = seq(0,40,1))
During the period March 2014- December 2017, 15 out of the 45 regular customers, each have 25 transactions at the superstore, while 50% of the customers have 27 or less transactions. In addition, the histogram appears to be slightly positively skewed (median < mean) with an average of 27.76.
#Order the customers by frequency of transactions and profitability
profit_ordered<-Ptable[order(Ptable$Count,Ptable$profits.profit,decreasing = TRUE),]
head(profit_ordered)
## Customer Count profits.profit shipp.shipmode
## 788 William Brown 37 714.3311 Standard Class
## 596 Paul Prost 34 1495.0854 Standard Class
## 503 Matt Abelman 34 1240.2266 Second Class
## 388 John Lee 34 228.9070 Standard Class
## 250 Edward Hooks 32 1393.5154 Standard Class
## 690 Seth Vernon 32 1199.4242 Standard Class
#Top 45 customers
profit45<-profit_ordered[profit_ordered$Count > quantile(profit_ordered$Count,prob=1-0.063),]
colnames(profit45)<-c("Customer","Count","Profit","shipmode")
head(profit45)
## Customer Count Profit shipmode
## 788 William Brown 37 714.3311 Standard Class
## 596 Paul Prost 34 1495.0854 Standard Class
## 503 Matt Abelman 34 1240.2266 Second Class
## 388 John Lee 34 228.9070 Standard Class
## 250 Edward Hooks 32 1393.5154 Standard Class
## 690 Seth Vernon 32 1199.4242 Standard Class
#Top customers with Transaction >20 and Profit > $2000 for the period March2014-Dec2017
best<-list()
for(i in c(1:793)){if((Ptable$Count[i]>20)&(Ptable$profits.profit[i]>2000) ){best<-c(best,Ptable[i,])}}
head(best)
## $Customer
## [1] Brian Moss
## 793 Levels: Aaron Bergman Aaron Hawkins Aaron Smayling ... Zuschuss Donatelli
##
## $Count
## [1] 29
##
## $profits.profit
## [1] 2199.278
##
## $shipp.shipmode
## [1] Standard Class
## Levels: First Class Same Day Second Class Standard Class
##
## $Customer
## [1] Greg Tran
## 793 Levels: Aaron Bergman Aaron Hawkins Aaron Smayling ... Zuschuss Donatelli
##
## $Count
## [1] 29
attributes(best[4])
## $names
## [1] "shipp.shipmode"
d<-list()
e<-list()
f<-list()
g<-list()
for(i in c(1:24)){if(attributes(best[i])=="Customer"){d<-c(d,best[i])}}
for(i in c(1:24)){if(attributes(best[i])=="Count"){e<-c(e,best[i])}}
for(i in c(1:24)){if(attributes(best[i])=="profits.profit"){f<-c(f,best[i])}}
for(i in c(1:24)){if(attributes(best[i])=="shipp.shipmode"){g<-c(g,best[i])}}
D<-unlist(d)
E<-unlist(e)
J<-unlist(f)
G<-unlist(g)
Best<-data.frame(D,E,J,G)
colnames(Best)<-c("Customer","Count","Profit","Prefered_Shipmode")
Best
## Customer Count Profit Prefered_Shipmode
## 1 Brian Moss 29 2199.278 Standard Class
## 2 Greg Tran 29 2163.427 Standard Class
## 3 Keith Dawkins 28 3038.625 Standard Class
## 4 Laura Armstrong 26 2059.120 Standard Class
## 5 Pete Kriz 25 2038.268 Standard Class
## 6 Sanjit Chand 22 5757.412 Standard Class
ggplot(data=Best, aes(x=Customer, y=Profit,fill=Prefered_Shipmode))+geom_bar(stat="identity",show.legend = TRUE)+theme(axis.text.x = element_text(angle = 45, hjust = 1))+ggtitle("Top priority customers")+ylab("Profit (USD)")
## Create a daily Date object
dates <- as.Date(store$Order.Date, "%m/%d/%Y")
head(dates)
## [1] "2016-11-08" "2016-11-08" "2016-06-12" "2015-10-11" "2015-10-11"
## [6] "2014-06-09"
#Extract days from the data
days<-weekdays(as.Date(dates,'%Y-%m-%d')) #weekdays function takes into account leap years as well
head(days)
## [1] "Tuesday" "Tuesday" "Sunday" "Sunday" "Sunday" "Monday"
#Extract the year from the data
Years<-substring(dates ,1,4)
head(Years)
## [1] "2016" "2016" "2016" "2015" "2015" "2014"
salesD<-data.frame(days,store$Sales,dates,Years)
head(salesD)
## days store.Sales dates Years
## 1 Tuesday 261.9600 2016-11-08 2016
## 2 Tuesday 731.9400 2016-11-08 2016
## 3 Sunday 14.6200 2016-06-12 2016
## 4 Sunday 957.5775 2015-10-11 2015
## 5 Sunday 22.3680 2015-10-11 2015
## 6 Monday 48.8600 2014-06-09 2014
#Edit level order
salesD$days <- factor(salesD$days, levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
ggplot(data=salesD, aes(x=days, y=store.Sales))+geom_bar(stat="identity",show.legend = TRUE,fill="Red")+theme(axis.text.x = element_text(angle = 45, hjust = 1))+ggtitle("Total sales:Monday to sunday")+facet_wrap(~Years,ncol=2)+ylab("Sales (USD)")
From 2015 onwards, wednesdays appear to have the lowest sales anually and Tuesdays continue to be comparatively lower than the rest of the days.
date<- format(as.Date(dates), "%d/%m/%Y")
head(date)
## [1] "08/11/2016" "08/11/2016" "12/06/2016" "11/10/2015" "11/10/2015"
## [6] "09/06/2014"
salest<-data.frame(date,store$Sales)
salests<-ts(salest[,-1],frequency =12,start=c(2014,03),end=c(2017,12))
head(salests)
## Mar Apr May Jun Jul Aug
## 2014 261.9600 731.9400 14.6200 957.5775 22.3680 48.8600
plot(salests,xlab="Year",ylab="Sales (USD)",main="Superstore Sales,March2014-December2017")
According to the sales plot above, there is no seasonal or cyclical pattern to be observed, but further analysis is required to confirm the lack of seasonality. There is no trend either.
seasonplot(salests,ylab="Sales (USD)",col=c(1,2,3,4),main="Seasonalplot:Superstore Sales,March2014-December2017",year.labels=TRUE)
Acf(salests)
If there is significant seasonality, the autocorrelation plot should show large spikes at lags equal to the period. Since there are no significantly large lags present in the ACF plot, this confirms the lack of seasonality.
##Futher analysis on the superstore sales
#convert timeseries data to a dataframe
salesm<-fortify(salests)
head(salesm)
## Index Data
## 1 2014-03-01 261.9600
## 2 2014-04-01 731.9400
## 3 2014-05-01 14.6200
## 4 2014-06-01 957.5775
## 5 2014-07-01 22.3680
## 6 2014-08-01 48.8600
#Extract months from the data
months<-months(as.Date(salesm$Index,'%Y-%m-%d'))
head(months)
## [1] "March" "April" "May" "June" "July" "August"
#Extract years from the data
years<-substring(salesm$Index,1,4)
head(years)
## [1] "2014" "2014" "2014" "2014" "2014" "2014"
df<-data.frame(salesm,months,years)
#Order the factors
df$months <- factor(df$months, levels = c("January","February","March","April","May","June","July","August","September","October","November","December"))
head(df)
## Index Data months years
## 1 2014-03-01 261.9600 March 2014
## 2 2014-04-01 731.9400 April 2014
## 3 2014-05-01 14.6200 May 2014
## 4 2014-06-01 957.5775 June 2014
## 5 2014-07-01 22.3680 July 2014
## 6 2014-08-01 48.8600 August 2014
#Detecting Outliers
A<-boxplot(df$Data)
outliers<-A$out
outliers
## [1] 957.5775 907.1520 1706.1840 911.4240 1044.6300 3083.4300 1097.5440
#Analyzing outliers
out1<-df[df$Data==957.5775,]
out2<-df[df$Data==907.1520,]
out3<-df[df$Data==1706.1840,]
out4<-df[df$Data==911.4240,]
out5<-df[df$Data==1044.6300,]
out6<-df[df$Data==3083.4300 ,]
out7<-df[df$Data==1097.5440,]
outdata<-rbind(out1,out2,out3,out4,out5,out6,out7)
outdata
## Index Data months years
## 4 2014-06-01 957.5775 June 2014
## 8 2014-10-01 907.1520 October 2014
## 11 2015-01-01 1706.1840 January 2015
## 12 2015-02-01 911.4240 February 2015
## 25 2016-03-01 1044.6300 March 2016
## 28 2016-06-01 3083.4300 June 2016
## 36 2017-02-01 1097.5440 February 2017
out11<-store[store$Sales==957.5775,]
out21<-store[store$Sales==907.1520,]
out31<-store[store$Sales==1706.1840,]
out41<-store[store$Sales==911.4240,]
out51<-store[store$Sales==1044.6300,]
out61<-store[store$Sales==3083.4300 ,]
out71<-store[store$Sales==1097.5440,]
outdata2<-rbind(out11,out21,out31,out41,out51,out61,out71)
outdata2
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode
## 4 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class
## 8 8 CA-2014-115812 6/9/2014 6/14/2014 Standard Class
## 11 11 CA-2014-115812 6/9/2014 6/14/2014 Standard Class
## 12 12 CA-2014-115812 6/9/2014 6/14/2014 Standard Class
## 25 25 CA-2015-106320 9/25/2015 9/30/2015 Standard Class
## 1563 1563 US-2017-102890 6/30/2017 6/30/2017 Same Day
## 28 28 US-2015-150630 9/17/2015 9/21/2015 Standard Class
## 36 36 CA-2016-117590 12/8/2016 12/10/2016 First Class
## Customer.ID Customer.Name Segment Country city
## 4 SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale
## 8 BH-11710 Brosina Hoffman Consumer United States Los Angeles
## 11 BH-11710 Brosina Hoffman Consumer United States Los Angeles
## 12 BH-11710 Brosina Hoffman Consumer United States Los Angeles
## 25 EB-13870 Emily Burns Consumer United States Orem
## 1563 SG-20470 Sheri Gordon Consumer United States New York City
## 28 TB-21520 Tracy Blumstein Consumer United States Philadelphia
## 36 GH-14485 Gene Hale Corporate United States Richardson
## State Postal.Code Region Product.ID Category
## 4 Florida 33311 South FUR-TA-10000577 Furniture
## 8 California 90032 West TEC-PH-10002275 Technology
## 11 California 90032 West FUR-TA-10001539 Furniture
## 12 California 90032 West TEC-PH-10002033 Technology
## 25 Utah 84057 West FUR-TA-10000577 Furniture
## 1563 New York 10011 East FUR-TA-10000577 Furniture
## 28 Pennsylvania 19140 East FUR-BO-10004834 Furniture
## 36 Texas 75080 Central TEC-PH-10004977 Technology
## Sub.Category
## 4 Tables
## 8 Phones
## 11 Tables
## 12 Phones
## 25 Tables
## 1563 Tables
## 28 Bookcases
## 36 Phones
## Product.Name
## 4 Bretford CR4500 Series Slim Rectangular Table
## 8 Mitel 5320 IP Phone VoIP phone
## 11 Chromcraft Rectangular Conference Tables
## 12 Konftel 250 Conference phone - Charcoal black
## 25 Bretford CR4500 Series Slim Rectangular Table
## 1563 Bretford CR4500 Series Slim Rectangular Table
## 28 Riverside Palais Royal Lawyers Bookcase, Royale Cherry Finish
## 36 GE 30524EE4
## Sales Quantity Discount Profit
## 4 957.5775 5 0.45 -383.0310
## 8 907.1520 6 0.20 90.7152
## 11 1706.1840 9 0.20 85.3092
## 12 911.4240 4 0.20 68.3568
## 25 1044.6300 3 0.00 240.2649
## 1563 1044.6300 5 0.40 -295.9785
## 28 3083.4300 7 0.50 -1665.0522
## 36 1097.5440 7 0.20 123.4737
#Treating Outliers
qn<-quantile(df$Data, c(0.05, 0.85), na.rm = TRUE)
df<- within(df,{Data = ifelse(Data < qn[1], qn[1], Data)
Data = ifelse(Data> qn[2], qn[2], Data)})
#Re-testing for Outliers
boxplot(df)
#Plot sales for each year catagories by the month
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73")
ggplot(data=df, aes(x=years, y=Data,fill=years))+geom_bar(stat="identity",show.legend = TRUE)+theme(axis.text.x = element_text(angle = 45, hjust = 1))+ggtitle("Monthly sales from March2014-December2017") + facet_grid(. ~ months)+theme(axis.text.x=element_blank())+ylab("Sales (USD)")
The extreme outliers are due to items of high value, such as furniture and phones, bought in large quantities. The outliers are replaced with quantile values to avoid them from making the usual sales seem insignificant. August, September,November and December seem to have low sales throughout the 4 years.
monthplot(salests,xlab="Month",ylab="Sales (USD)",main="Monthplot:Superstore Sales,March2014-December2017")
From the monthly plot above, it is clear that the sales for the months January, April and October have been reducing thru out the years, only to show a slight improvement during 2017.
#set working directory
getwd()
## [1] "C:/Users/Thinithi/Desktop/PORTFOLIO/Customer Analysis"
setwd("C:/Users/Thinithi/Desktop/PORTFOLIO")
#Load the coordinates data
cities<-read.csv("USAcities.csv")
head(cities)
## city lat lng
## 1 Henderson 36.03950 -114.98170
## 2 Henderson 36.03950 -114.98170
## 3 Los Angeles 33.98998 -118.17998
## 4 Fort Lauderdale 26.13606 -80.14179
## 5 Fort Lauderdale 26.13606 -80.14179
## 6 Los Angeles 33.98998 -118.17998
##Creating a Dataframe
#Extract unique rows
unique<-unique(cities[,1:3]) #Must speicify columns
AB<-orderBy(~city,unique)
#Extract the cordinates of cities in "citypop" rom the file "cities"
t<-list()
#test whether city names are the same
for(i in c(1:531)){if(citypop$City[i]==AB$city[i] ){t<-c(t,0)}else{t<-c(t,1)}}
ABC<-AB[-196,]
t<-list()
for(i in c(1:531)){if(citypop$City[i]==ABC$city[i] ){t<-c(t,0)}else{t<-c(t,1)}}
cord<-data.frame(citypop,ABC$lng,ABC$lat)
colnames(cord)<-c("City","Orders","lng","lat")
head(cord)
## City Orders lng lat
## 1 Aberdeen 1 -98.48640 45.46512
## 2 Abilene 1 -99.73279 32.44863
## 3 Akron 21 -81.52000 41.07040
## 4 Albuquerque 14 -106.64133 35.10497
## 5 Alexandria 16 -77.09998 38.82043
## 6 Allen 4 -96.67350 33.10870
map <- leaflet() %>%
setView(lat = 36, lng = -99.7129, zoom=4) %>%
addTiles(group="OSM") %>%
addProviderTiles("Esri.NatGeoWorldMap") %>%
addCircleMarkers(data =cord, ~lng, ~lat,weight = 0.5, col = 'black',fillColor = "purple", radius =~ Orders/25 , fillOpacity = 0.6, stroke = T, label = ~paste0(as.character(City),' ',as.character(Orders)),group = 'Points')
map