# Load the Black Friday training data and print basic summaries of it.
# NOTE(review): setwd() ties the script to one machine's directory layout; it
# is left in place because later code may rely on the working directory.
setwd("F:/Work_tech/Dataset/black-friday")
# stringsAsFactors = TRUE keeps text columns as factors, matching the str()
# output recorded below; since R 4.0.0 read.csv() defaults to FALSE.
train <- read.csv("train.csv", stringsAsFactors = TRUE)
dim(train)
## [1] 550068 12
library(psych)
## Warning: package 'psych' was built under R version 3.4.4
# Descriptive statistics for every column. The package load and this call
# were previously repeated back to back; one pass is enough.
describe(train)
str(train)
## 'data.frame': 550068 obs. of 12 variables:
## $ User_ID : int 1000001 1000001 1000001 1000001 1000002 1000003 1000004 1000004 1000004 1000005 ...
## $ Product_ID : Factor w/ 3631 levels "P00000142","P00000242",..: 673 2377 853 829 2735 1832 1746 3321 3605 2632 ...
## $ Gender : Factor w/ 2 levels "F","M": 1 1 1 1 2 2 2 2 2 2 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 1 1 1 1 7 3 5 5 5 3 ...
## $ Occupation : int 10 10 10 10 16 15 7 7 7 20 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 1 1 1 1 3 1 2 2 2 1 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 3 3 3 3 5 4 3 3 3 2 ...
## $ Marital_Status : int 0 0 0 0 0 0 1 1 1 1 ...
## $ Product_Category_1 : int 3 1 12 12 8 1 1 1 1 8 ...
## $ Product_Category_2 : int NA 6 NA 14 NA 2 8 15 16 NA ...
## $ Product_Category_3 : int NA 14 NA NA NA NA 17 NA NA NA ...
## $ Purchase : int 8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
One-way contingency table
# One-way frequency table of rows by Gender, drawn as a labelled pie chart.
mytable <- with(train, table(Gender))
mytable
## Gender
## F M
## 135809 414259
# table() orders its cells by level ("F" before "M"), so the display names
# are looked up by cell name instead of being listed positionally. The
# earlier positional c("Male", "Female") put each name on the other
# gender's slice.
gender_names <- c(F = "Female", M = "Male")
lbls <- unname(gender_names[names(mytable)])
# Share of each gender, rounded to a whole percent.
pct <- round(mytable / sum(mytable) * 100)
# Final slice labels, e.g. "Female 25%".
lbls <- paste0(lbls, " ", pct, "%")
pie(mytable, labels = lbls, col = c("white", "green"))
# Frequency of rows by Marital_Status, drawn as a pie chart. The slice for
# cell 0 is labelled "Single" and the slice for cell 1 "Married".
mytable1 <- table(Marital_Status = train$Marital_Status)
mytable1
## Marital_Status
## 0 1
## 324731 225337
# Whole-percent share of each status.
pct1 <- round(prop.table(mytable1) * 100)
# Labels of the form "<name> <pct>%", in the same order as the table cells.
lbls1 <- paste0(c("Single", "Married"), " ", pct1, "%")
pie(mytable1, labels = lbls1, col = c("white", "green"))
# Frequency of rows by City_Category, drawn as a three-slice pie chart.
mytable2 <- table(City_Category = train$City_Category)
mytable2
## City_Category
## A B C
## 147720 231173 171175
# Whole-percent share of each city category.
pct2 <- round(prop.table(mytable2) * 100)
# Build "<category> <pct>" first, then append the percent sign.
lbls2 <- paste(c("A", "B", "C"), pct2)
lbls2 <- paste0(lbls2, "%")
pie(
  mytable2,
  labels = lbls2,
  col = c("white", "green", "blue")
)
library(plyr)
## Warning: package 'plyr' was built under R version 3.4.4
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.4.4
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(data.table)
## Warning: package 'data.table' was built under R version 3.4.4
# Total purchase amount per primary product category, plotted as one series.
# The aggregate used to sum the Product_Category_1 codes themselves (row
# count times category id), which is not a purchase total; it now sums the
# Purchase column. Conclusions read off the earlier chart should be
# re-checked against this one.
# as.numeric() makes sum() accumulate in double precision: Purchase is an
# integer column, and an integer sum() returns NA (with a warning) once a
# total exceeds .Machine$integer.max.
data_product <- ddply(
  train, "Product_Category_1", summarise,
  Totalpurchase = sum(as.numeric(Purchase))
)
# Category values used as x-axis labels, one per row of data_product.
product_catagory <- data.frame(table(data_product$Product_Category_1))
grapha <- highchart() %>%
  hc_xAxis(categories = product_catagory$Var1) %>%
  hc_add_series(
    name = "Category",
    data = data_product$Totalpurchase,
    color = "green"
  )
grapha
Products from categories 5 and 8 are in high demand.
# Total purchase amount per primary product category, split by gender.
# The aggregate used to sum the Product_Category_1 codes rather than the
# Purchase column; it now sums Purchase. as.numeric() keeps the sum in
# double precision so large totals cannot overflow the integer range.
data_gender <- ddply(
  train, c("Product_Category_1", "Gender"), summarise,
  Totalpurchase = sum(as.numeric(Purchase))
)
data_new_male <- subset(data_gender, Gender == "M")
data_new_female <- subset(data_gender, Gender == "F")
# NOTE(review): both series are matched to the axis categories by position,
# which assumes every category has rows for both genders - confirm.
graphb <- highchart() %>%
  hc_xAxis(categories = product_catagory$Var1) %>%
  hc_add_series(name = "Male", data = data_new_male$Totalpurchase) %>%
  hc_add_series(name = "Female", data = data_new_female$Totalpurchase)
graphb
In most cases the purchase amount for males is higher, except in a few categories.