Importing dataset

library(readxl)
dataset<- read_excel("C:/Users/pavan/Downloads/DATA SCIENCE SET.25.26 (1).xlsx")

Data Exploration

summary(dataset)
##     User id            Category          Order Date       
##  Min.   :9.000e+09   Length:221         Length:221        
##  1st Qu.:9.123e+09   Class :character   Class :character  
##  Median :1.000e+10   Mode  :character   Mode  :character  
##  Mean   :9.674e+09                                        
##  3rd Qu.:1.000e+10                                        
##  Max.   :1.000e+10                                        
##    Order Time                    bill value         Age       
##  Min.   :1899-12-31 03:33:55   Min.   :   15   Min.   :13.00  
##  1st Qu.:1899-12-31 12:25:00   1st Qu.:  549   1st Qu.:23.00  
##  Median :1899-12-31 16:05:06   Median : 1500   Median :26.00  
##  Mean   :1899-12-31 15:47:52   Mean   : 3309   Mean   :30.17  
##  3rd Qu.:1899-12-31 20:00:00   3rd Qu.: 3000   3rd Qu.:32.00  
##  Max.   :1899-12-31 23:40:00   Max.   :50000   Max.   :62.00  
##      Income          Gender           Profession        Home Address      
##  Min.   :     0   Length:221         Length:221         Length:221        
##  1st Qu.: 22000   Class :character   Class :character   Class :character  
##  Median : 30000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 30448                                                           
##  3rd Qu.: 45000                                                           
##  Max.   :100000                                                           
##  Office Address     Family members Languages Known   
##  Length:221         Min.   :1.0    Length:221        
##  Class :character   1st Qu.:3.0    Class :character  
##  Mode  :character   Median :4.0    Mode  :character  
##                     Mean   :4.1                      
##                     3rd Qu.:5.0                      
##                     Max.   :8.0
str(dataset)
## Classes 'tbl_df', 'tbl' and 'data.frame':    221 obs. of  13 variables:
##  $ User id        : num  9.12e+09 1.00e+10 9.12e+09 1.00e+10 9.12e+09 ...
##  $ Category       : chr  "Others" "Apparel" "Jewellery" "Movies/Entertainment" ...
##  $ Order Date     : chr  "43101" "43104" "43110" "43112" ...
##  $ Order Time     : POSIXct, format: "1899-12-31 21:30:00" "1899-12-31 22:55:00" ...
##  $ bill value     : num  2000 2650 25000 750 8700 1500 22000 2200 1400 20000 ...
##  $ Age            : num  26 30 24 24 28 21 26 24 39 24 ...
##  $ Income         : num  45000 22000 50000 23000 60000 23000 45000 38000 45000 50000 ...
##  $ Gender         : chr  "Female" "Female" "Female" "Female" ...
##  $ Profession     : chr  "Doctor" "Entreprenuer" "Engineer" "IT Professional" ...
##  $ Home Address   : chr  "17.4886661,78.3897115,17z(J.N.T.U)" "17.400205, 78.487070" "17.5182524,78.4732573,13z(Bolarum)" "17.424879, 78.507825" ...
##  $ Office Address : chr  "17.4743872,78.2975193,14z(Serilingampally)" "17.408753, 78.438758" "17.3722658,78.48243,15z(Malakpet)" "17.485128, 78.511785" ...
##  $ Family members : num  7 2 4 5 4 5 7 4 5 4 ...
##  $ Languages Known: chr  "Malyalam,Telugu and English" "English, Telugu" "Tamil,English and Telugu" "English, Telugu" ...
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
c<-dataset %>% group_by(`User id`,Income,Profession,Gender,`Family members`,) %>% summarise(count=n(),bills_made=list(`bill value`),total = sum(`bill value`),categories=list(Category))
x<-c[,c('Income','total')]
wcss<-vector()
for(i in 1:10) wcss[i]<-sum(kmeans(x,i)$withinss)
plot(1:10,wcss)

kmeans<-kmeans(x,4,iter.max = 300,nstart = 10)
library(cluster)
clusplot(x,
         kmeans$cluster,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = paste('cluster of clients'),
         xlab = "Total bill value",
         ylab = "income")

From above cluster plot we can infer that cluster 1,4 are standard,they only purchase when needed.

Whereas cluster 3 we can call them as “Targets” as we target those who have high income and high spenders.

Coming to cluster 2 they have less income but they spend higher.

Individual consumption representation - Buyers consumption categories

library(ggplot2)
ggplot(c,aes(Profession))+geom_bar()+coord_flip()

By seeing above plot,Buyer whose profession is “Business”" purchase so many times compared to others.

Similary students,Engineers,IT Professionals and doctors.

ggplot(dataset,aes(Category))+geom_bar()+coord_flip()

Seeing this representation you can clearly say certain categories like Grocery,Others,Events,Apparel are purchased frequently.

gender_dis<-dataset %>% group_by(Gender) %>% summarise(total = sum(`bill value`))
gender_dis
## # A tibble: 2 x 2
##   Gender  total
##   <chr>   <dbl>
## 1 Female 420269
## 2 Male   311028

Female purchase more products than male. Hence proved

library(ggplot2)
ggplot(dataset,aes(x=Age,y=`bill value`,colour = "blue"))+geom_point()+xlab("Age") + ylab("Amount per purchase")+theme_dark()

Age between 20 and 30 are the consumers who purchase a lot.

Area wise purchase distribution- Heat map

library(ggmap)
library(tidyr)
library(dplyr)

address<-separate(dataset,'Home Address',sep=",",c("latitude","longitude","nameofplace"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 140 rows
## [2, 4, 13, 22, 23, 27, 30, 32, 33, 36, 38, 39, 40, 42, 43, 44, 46, 49, 51,
## 52, ...].
d<-address %>% group_by(`User id`,latitude,longitude) %>% summarise(total = sum(`bill value`),count = n())
d$longitude<-as.numeric(d$longitude)
d$latitude<-as.numeric(d$latitude)
hyd_center = as.numeric(geocode("Hyderabad"))
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Hyderabad&sensor=false
hydmap = ggmap(get_googlemap(center= c(lon = 78.45667, lat = 17.45504), scale=2, zoom=12), extent="panel")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=17.45504,78.45667&zoom=12&size=640x640&scale=2&maptype=terrain&sensor=false
hydmap + geom_point(aes(x=longitude, y=latitude), data= d, col="orange", alpha=0.4,size = d$count) + scale_size_continuous(range=range(d$count))+ggtitle("Area Wise Purchase Distribution")
## Warning: Removed 3 rows containing missing values (geom_point).

From above near Banjara Hills and kukatpally area has more number of purchases.

Time of Purchase

library(tidyr)
time<-separate(dataset,'Order Time',sep = " ",c("date","order_time"))
t<-time[,c("bill value","order_time")]
t<-separate(t,order_time,sep = ":",c("hours","min","sec"))
t$hours<-as.numeric(t$hours)
t$min<-as.numeric(t$min)
t<-select(t,-c("sec"))
t$hours<-ifelse(t$min>=30,t$hours+1,t$min)
t<-select(t,-c("min"))

clustering

wcss2<-vector()
for(i in 1:10) wcss2[i]<-sum(kmeans(t,i)$withinss)
plot(1:10,wcss2)

kmeans2<-kmeans(t,3,iter.max=300,nstart =10)
library(cluster)
clusplot(t,
         kmeans2$cluster,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 4,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Time of purchasing cluster'))

age_wise<-time[,c("Age","order_time")]
age_wise<-separate(age_wise,order_time,sep = ":",c("hours","min","sec"))
age_wise$hours<-as.numeric(age_wise$hours)
age_wise$min<-as.numeric(age_wise$min)
age_wise<-select(age_wise,-c("sec"))
age_wise$hours<-ifelse(age_wise$min>=30,age_wise$hours+1,age_wise$min)
age_wise<-select(age_wise,-c("min"))
wcss3<-vector()
for(i in 1:10) wcss3[i]<-sum(kmeans(age_wise,i)$withinss)
plot(1:10,wcss3)

kmeans3<-kmeans(age_wise,4,iter.max=300,nstart =10)
library(cluster)
clusplot(age_wise,
         kmeans3$cluster,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 4,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Time of purchasing cluster'))

Correlation between Categories

library(vcd)
## Loading required package: grid
assocstats(table(dataset$Category,dataset$Gender))
##                     X^2 df   P(> X^2)
## Likelihood Ratio 45.425 14 3.4793e-05
## Pearson          41.475 14 1.4970e-04
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.398 
## Cramer's V        : 0.433

There is a strong correlation between category and gender

ggplot(dataset,aes(Gender,Category))+geom_jitter()

Female purchases Apparel,Grocery category products more compared to male. While Male purchase Movies/Entertainment,Transit category products more.