Introduction Dataset contain the information about numbers of the shoppers that have engaged in shopping on black friday from 2005 to 2013. Our data is stored based on product id i.e. if a person buying 10 products then his data will be stored in 10 observation and hence there will berepetition of many people age group. Research Question: The challenge is to predict the purchase amount of various products by users across categories given historic data of purchase amounts.
Data fields Most of the fields are self-explanatory. The following are descriptions for those that aren’t.
Loading the required packages.
#{echo=FALSE, cache=FALSE, results=FALSE, warning=FALSE, comment=FALSE, Message=FALSE}
library(data.table)
## Warning: package 'data.table' was built under R version 3.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.1
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.1
## corrplot 0.84 loaded
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.1
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(stepPlr)
library(C50)
## Warning: package 'C50' was built under R version 3.5.1
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.1
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.5.1
library(caretEnsemble)
## Warning: package 'caretEnsemble' was built under R version 3.5.1
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Loading Data
train_set <- read.csv("D:\\Projects\\Level2\\BlackFriday\\train.csv", stringsAsFactors = FALSE, header = TRUE, na.strings = c('NA',''))
test_set<- read.csv("D:\\Projects\\Level2\\BlackFriday\\test.csv", stringsAsFactors = FALSE, header = TRUE, na.strings = c('NA',''))
#Structre of dataset
str(train_set)
## 'data.frame': 550068 obs. of 12 variables:
## $ User_ID : int 1000001 1000001 1000001 1000001 1000002 1000003 1000004 1000004 1000004 1000005 ...
## $ Product_ID : chr "P00069042" "P00248942" "P00087842" "P00085442" ...
## $ Gender : chr "F" "F" "F" "F" ...
## $ Age : chr "0-17" "0-17" "0-17" "0-17" ...
## $ Occupation : int 10 10 10 10 16 15 7 7 7 20 ...
## $ City_Category : chr "A" "A" "A" "A" ...
## $ Stay_In_Current_City_Years: chr "2" "2" "2" "2" ...
## $ Marital_Status : int 0 0 0 0 0 0 1 1 1 1 ...
## $ Product_Category_1 : int 3 1 12 12 8 1 1 1 1 8 ...
## $ Product_Category_2 : int NA 6 NA 14 NA 2 8 15 16 NA ...
## $ Product_Category_3 : int NA 14 NA NA NA NA 17 NA NA NA ...
## $ Purchase : int 8370 15200 1422 1057 7969 15227 19215 15854 15686 7871 ...
#Checking missing data
sapply(train_set, function(x) sum(is.na(x)))
## User_ID Product_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## Product_Category_1 Product_Category_2
## 0 173638
## Product_Category_3 Purchase
## 383247 0
#Structre of Test dataset
str(test_set)
## 'data.frame': 233599 obs. of 11 variables:
## $ User_ID : int 1000004 1000009 1000010 1000010 1000011 1000013 1000013 1000013 1000015 1000022 ...
## $ Product_ID : chr "P00128942" "P00113442" "P00288442" "P00145342" ...
## $ Gender : chr "M" "M" "F" "F" ...
## $ Age : chr "46-50" "26-35" "36-45" "36-45" ...
## $ Occupation : int 7 17 1 1 1 1 1 1 7 15 ...
## $ City_Category : chr "B" "C" "B" "B" ...
## $ Stay_In_Current_City_Years: chr "2" "0" "4+" "4+" ...
## $ Marital_Status : int 1 0 1 1 0 1 1 1 0 0 ...
## $ Product_Category_1 : int 1 3 5 4 4 2 1 2 10 5 ...
## $ Product_Category_2 : int 11 5 14 9 5 3 11 4 13 14 ...
## $ Product_Category_3 : int NA NA NA NA 12 15 15 9 16 NA ...
#Checking missing data
sapply(test_set, function(x) sum(is.na(x)))
## User_ID Product_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## Product_Category_1 Product_Category_2
## 0 72344
## Product_Category_3
## 162562
# Unique Data For EDA & Data type modification
train_set$User_ID <- as.factor(train_set$User_ID)
train_set$Product_ID <- as.factor(train_set$Product_ID)
train_set$Marital_Status <- as.factor(ifelse(train_set$Marital_Status == 1, 'Married', 'Single'))
train_set$Age <- as.factor(train_set$Age)
train_set$Gender <- as.factor(ifelse(train_set$Gender=='M', 'Male', 'Female'))
train_set$Occupation <- as.factor(train_set$Occupation)
train_set$City_Category <- as.factor(train_set$City_Category)
train_set$Stay_In_Current_City_Years <- as.factor(train_set$Stay_In_Current_City_Years)
test_set$User_ID <- as.factor(test_set$User_ID)
test_set$Product_ID <- as.factor(test_set$Product_ID)
test_set$Marital_Status <- as.factor(ifelse(test_set$Marital_Status == 1, 'Married', 'Single'))
test_set$Age <- as.factor(test_set$Age)
test_set$Gender <- as.factor(ifelse(test_set$Gender=='M', 'Male', 'Female'))
test_set$Occupation <- as.factor(test_set$Occupation)
test_set$City_Category <- as.factor(test_set$City_Category)
test_set$Stay_In_Current_City_Years <- as.factor(test_set$Stay_In_Current_City_Years)
#str(train_set)
#str(test_set)
EDA_Distinct <- distinct(train_set, User_ID, Age, Gender, Marital_Status, Occupation, City_Category, Stay_In_Current_City_Years)
#str(EDA_Distinct)
head(EDA_Distinct)
## User_ID Gender Age Occupation City_Category Stay_In_Current_City_Years
## 1 1000001 Female 0-17 10 A 2
## 2 1000002 Male 55+ 16 C 4+
## 3 1000003 Male 26-35 15 A 3
## 4 1000004 Male 46-50 7 B 2
## 5 1000005 Male 26-35 20 A 1
## 6 1000006 Female 51-55 9 A 1
## Marital_Status
## 1 Single
## 2 Single
## 3 Single
## 4 Married
## 5 Married
## 6 Single
#This feature is a have the data on user distinction Id
head(train_set$User_ID,40)
## [1] 1000001 1000001 1000001 1000001 1000002 1000003 1000004 1000004
## [9] 1000004 1000005 1000005 1000005 1000005 1000005 1000006 1000006
## [17] 1000006 1000006 1000007 1000008 1000008 1000008 1000008 1000008
## [25] 1000008 1000009 1000009 1000009 1000009 1000010 1000010 1000010
## [33] 1000010 1000010 1000010 1000010 1000010 1000010 1000010 1000010
## 5891 Levels: 1000001 1000002 1000003 1000004 1000005 1000006 ... 1006040
head(test_set$User_ID,40)
## [1] 1000004 1000009 1000010 1000010 1000011 1000013 1000013 1000013
## [9] 1000015 1000022 1000026 1000026 1000026 1000026 1000028 1000029
## [17] 1000033 1000033 1000034 1000035 1000036 1000036 1000042 1000045
## [25] 1000045 1000045 1000048 1000048 1000053 1000053 1000053 1000053
## [33] 1000053 1000059 1000062 1000063 1000063 1000065 1000068 1000069
## 5891 Levels: 1000001 1000002 1000003 1000004 1000005 1000006 ... 1006040
#creating a new data frame to stor the number of purchase made by each user
userIDCount <- as.data.frame(table(train_set$User_ID))
names(userIDCount) <- c("User_ID","User_Purchase_Count")
head(userIDCount)
## User_ID User_Purchase_Count
## 1 1000001 35
## 2 1000002 77
## 3 1000003 29
## 4 1000004 14
## 5 1000005 106
## 6 1000006 47
train_set <- merge(x = train_set, y = userIDCount, by = "User_ID", all.x = TRUE)
str(train_set)
## 'data.frame': 550068 obs. of 13 variables:
## $ User_ID : Factor w/ 5891 levels "1000001","1000002",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_ID : Factor w/ 3631 levels "P00000142","P00000242",..: 673 2377 853 829 250 723 2041 1560 1028 834 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Occupation : Factor w/ 21 levels "0","1","2","3",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Marital_Status : Factor w/ 2 levels "Married","Single": 2 2 2 2 2 2 2 2 2 2 ...
## $ Product_Category_1 : int 3 1 12 12 1 3 14 8 8 2 ...
## $ Product_Category_2 : int NA 6 NA 14 2 4 NA NA NA 4 ...
## $ Product_Category_3 : int NA 14 NA NA 9 12 NA NA NA 8 ...
## $ Purchase : int 8370 15200 1422 1057 15416 10572 11011 10003 8094 12842 ...
## $ User_Purchase_Count : int 35 35 35 35 35 35 35 35 35 35 ...
test_set <- merge(x = test_set, y = userIDCount, by = "User_ID", all.x = TRUE)
#Now we can remove the UserIDCount dataframe
rm(userIDCount)
'%!in%' <- function(x,y)!('%in%'(x,y))
#if(test_set$User_ID %!in% train_set$User_ID){ assign(test_set$userIDCount, 0)}
test_set[is.na(test_set$User_Purchase_Count), "User_Purchase_Count"] <- 1
class(test_set$User_Purchase_Count)
## [1] "numeric"
str(test_set)
## 'data.frame': 233599 obs. of 12 variables:
## $ User_ID : Factor w/ 5891 levels "1000001","1000002",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_ID : Factor w/ 3491 levels "P00000142","P00000242",..: 2679 1016 1976 559 663 553 3457 1942 2393 966 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Occupation : Factor w/ 21 levels "0","1","2","3",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Marital_Status : Factor w/ 2 levels "Married","Single": 2 2 2 2 2 2 2 2 2 2 ...
## $ Product_Category_1 : int 8 8 3 3 3 3 4 5 5 8 ...
## $ Product_Category_2 : int NA NA 4 4 5 4 5 NA 9 NA ...
## $ Product_Category_3 : int NA NA 5 12 16 5 12 NA NA NA ...
## $ User_Purchase_Count : num 35 35 35 35 35 35 35 35 35 35 ...
test_set$User_Purchase_Count <- as.integer(test_set$User_Purchase_Count)
#Updating EDA_Distinct dataframe
EDA_Distinct <- distinct(train_set, User_ID, Age, Gender, Marital_Status, Occupation, City_Category, Stay_In_Current_City_Years, User_Purchase_Count)
d1 <- summary(EDA_Distinct$User_Purchase_Count)
p1 <- ggplot(EDA_Distinct, aes(x=User_Purchase_Count)) +geom_density(fill="red", col="black", alpha=0.80) + annotate(geom = "text", x = 6, y = 0.0125, label = "Min") + annotate(geom = "text", x = 24, y = 0.013, label = "1st Qu.") + annotate(geom = "text", x = 50, y = 0.0125, label = "Median") + annotate(geom = "text", x = 90, y = 0.013, label = "Mean") + annotate(geom = "text", x = 112, y = 0.0125, label = "3rd Qu.") + annotate(geom = "text", x = 1015, y = 0.0125, label = "Max") + geom_vline(xintercept = c(6, 26, 54, 93.37, 117, 1026), size = 0.2, col = 'black') #+ lims(x = )
p2 <- ggplot(EDA_Distinct, aes(x=User_Purchase_Count)) +geom_histogram(fill="red", col="black", alpha=0.80)
p3 <- ggplot(EDA_Distinct,aes(x= Age,y=User_Purchase_Count, fill=Age)) + geom_boxplot() + facet_grid(Gender~Marital_Status) + labs(x="Age",y="Customer Purchase Count")
p4 <- ggplot(EDA_Distinct,aes(x= Occupation,y=User_Purchase_Count, fill=Occupation)) + geom_boxplot() + facet_grid(Gender~Marital_Status) + labs(x="Occupation",y="Customer Purchase Count")
p5 <- ggplot(EDA_Distinct,aes(x=Age,y=User_Purchase_Count,fill=Stay_In_Current_City_Years))+geom_boxplot()+facet_grid(City_Category~ Stay_In_Current_City_Years) + labs(x="Age",y="Customer Purchase Count")
p5i <- ggplot(EDA_Distinct,aes(x=Age,y=User_Purchase_Count,fill=Stay_In_Current_City_Years))+geom_boxplot()+facet_grid( Stay_In_Current_City_Years ~ City_Category) + labs(x="Age",y="Customer Purchase Count")
p6 <- ggplot(EDA_Distinct,aes(x=Age,y=User_Purchase_Count,fill=Marital_Status))+geom_boxplot()+facet_grid(Gender~City_Category) + scale_fill_manual(values=c("tan4","limegreen")) + labs(x="Age",y="Customer Purchase Count")
#grid.arrange(p1, p2, p3, p4 ,p5i ,p6, ncol = 1, nrow = 6);
d1;p1;p2;p3;p4;p5;p5i;p6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 26.00 54.00 93.37 117.00 1026.00
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#This feature is a have the data on user distinction Id
head(train_set$Product_ID,15)
## [1] P00069042 P00248942 P00087842 P00085442 P00025442 P00074142 P00214842
## [8] P00165942 P00111842 P00085942 P00297042 P00117942 P00258742 P00142242
## [15] P00178242
## 3631 Levels: P00000142 P00000242 P00000342 P00000442 P00000542 ... P0099942
#creating a new data frame to stor the number of purchase made by each user
ProductIDCount <- as.data.frame(table(train_set$Product_ID))
names(ProductIDCount) <- c("Product_ID","Product_Sold_Count")
head(ProductIDCount)
## Product_ID Product_Sold_Count
## 1 P00000142 1152
## 2 P00000242 376
## 3 P00000342 244
## 4 P00000442 92
## 5 P00000542 149
## 6 P00000642 512
# joining i.e. storing the Product sold count in original data frame
train_set <- merge(x = train_set, y = ProductIDCount, by = "Product_ID", all.x = TRUE)
str(train_set)
## 'data.frame': 550068 obs. of 14 variables:
## $ Product_ID : Factor w/ 3631 levels "P00000142","P00000242",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ User_ID : Factor w/ 5891 levels "1000001","1000002",..: 3000 3309 235 5439 2386 612 1907 4255 4639 4588 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 1 2 2 2 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 3 4 2 2 3 1 2 4 4 2 ...
## $ Occupation : Factor w/ 21 levels "0","1","2","3",..: 8 21 20 2 1 11 2 1 18 21 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 2 2 1 3 2 3 2 3 3 3 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 2 5 2 2 1 3 1 2 2 2 ...
## $ Marital_Status : Factor w/ 2 levels "Married","Single": 1 2 2 1 1 2 1 2 2 2 ...
## $ Product_Category_1 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Product_Category_2 : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Product_Category_3 : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Purchase : int 13642 8335 13374 10883 10843 11041 13206 10568 13578 10784 ...
## $ User_Purchase_Count : int 197 171 39 51 282 129 287 73 62 175 ...
## $ Product_Sold_Count : int 1152 1152 1152 1152 1152 1152 1152 1152 1152 1152 ...
d2 <- summary(train_set$Product_Sold_Count)
p7 <- ggplot(train_set, aes(x=Product_Sold_Count)) +geom_density(fill="red", col="black", alpha=0.80) + annotate(geom = "text", x = 1, y = 0.004, label = "Min") + annotate(geom = "text", x = 174, y = 0.00385, label = "1st Qu.") + annotate(geom = "text", x = 357, y = 0.004, label = "Median") + annotate(geom = "text", x = 450, y = 0.00385, label = "Mean") + annotate(geom = "text", x = 620, y = 0.004, label = "3rd Qu.") + annotate(geom = "text", x = 1880, y = 0.004, label = "Max") + geom_vline(xintercept = c(1,174,357,450.5,620,1880), size = 0.2, col = 'black')
d2;p7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 174.0 357.0 450.5 620.0 1880.0
# Product which are sold most frequent and least frequent
head(ProductIDCount[order(-ProductIDCount$Product_Sold_Count),])
## Product_ID Product_Sold_Count
## 2537 P00265242 1880
## 250 P00025442 1615
## 1017 P00110742 1612
## 1031 P00112142 1562
## 566 P00057642 1470
## 1746 P00184942 1440
tail(ProductIDCount[order(-ProductIDCount$Product_Sold_Count),])
## Product_ID Product_Sold_Count
## 3397 P00353742 1
## 3435 P00357542 1
## 3501 P00364342 1
## 3503 P00364542 1
## 3517 P00365942 1
## 3609 P0097642 1
# writing code such that if a new user comes for the first time his count is set to one in test dataset
test_set <- merge(x = test_set, y = ProductIDCount, by = "Product_ID", all.x = TRUE)
#Now we can remove the UserIDCount dataframe
rm(ProductIDCount)
test_set[is.na(test_set$User_Purchase_Count), "User_Purchase_Count"] <- 1
#str(test_set)
test_set$User_Purchase_Count <- as.integer(test_set$User_Purchase_Count)
#ggplot(EDA_Distinct, aes(x=User_Purchase_Count, y=Purchase)) + geom_point()
#Gender and Martial_Status
#This feature is a have the data on user distinction Id
head(train_set$Gender); head(train_set$Marital_Status)
## [1] Male Male Male Female Male Female
## Levels: Female Male
## [1] Married Single Single Married Married Single
## Levels: Married Single
d3 <- table(EDA_Distinct$Gender, EDA_Distinct$Marital_Status)
p8 <- ggplot(EDA_Distinct, aes(x=Gender, fill= Marital_Status)) + geom_bar(position = "dodge") + ggtitle("") + labs(x="Gender",y="No. of distinct Sales") + annotate(geom = "text", x = 0.775, y = 619, label = "719") + annotate(geom = "text", x = 1.225, y = 847, label = "947") + annotate(geom = "text", x = 1.775, y = 1655, label = "1755") + annotate(geom = "text", x = 2.225, y = 2370, label = "2470") + scale_fill_manual(values=c("tan4","limegreen"))
d3; p8
##
## Married Single
## Female 719 947
## Male 1755 2470
Discussion It can be interpretted that the customer buying the most number of products are unmarried males and the least are unmarried women.It can also be observed from the graph that the ummarried people generaly buy more number of products than unmarried people.
# Age
#This feature represent the age group of the customer
head(train_set, 10)
## Product_ID User_ID Gender Age Occupation City_Category
## 1 P00000142 1003081 Male 26-35 7 B
## 2 P00000142 1003402 Male 36-45 20 B
## 3 P00000142 1000239 Male 18-25 19 A
## 4 P00000142 1005578 Female 18-25 1 C
## 5 P00000142 1002457 Male 26-35 0 B
## 6 P00000142 1000629 Female 0-17 10 C
## 7 P00000142 1001962 Female 18-25 1 B
## 8 P00000142 1004367 Male 36-45 0 C
## 9 P00000142 1004757 Male 36-45 17 C
## 10 P00000142 1004704 Male 18-25 20 C
## Stay_In_Current_City_Years Marital_Status Product_Category_1
## 1 1 Married 3
## 2 4+ Single 3
## 3 1 Single 3
## 4 1 Married 3
## 5 0 Married 3
## 6 2 Single 3
## 7 0 Married 3
## 8 1 Single 3
## 9 1 Single 3
## 10 1 Single 3
## Product_Category_2 Product_Category_3 Purchase User_Purchase_Count
## 1 4 5 13642 197
## 2 4 5 8335 171
## 3 4 5 13374 39
## 4 4 5 10883 51
## 5 4 5 10843 282
## 6 4 5 11041 129
## 7 4 5 13206 287
## 8 4 5 10568 73
## 9 4 5 13578 62
## 10 4 5 10784 175
## Product_Sold_Count
## 1 1152
## 2 1152
## 3 1152
## 4 1152
## 5 1152
## 6 1152
## 7 1152
## 8 1152
## 9 1152
## 10 1152
#Now Lets see the distribution of distinct customers
d4 <- table(EDA_Distinct$Age)
p9 <- ggplot(EDA_Distinct, aes(x=Age)) + geom_bar(fill=rainbow(7), col="black") + ggtitle("") + labs(x="Age Group",y="No. of distinct buyer") + annotate(geom = "text", x = 1, y = 168, label = "218") + annotate(geom = "text", x = 2, y = 1019, label = "1069") + annotate(geom = "text", x = 3, y = 2000, label = "2053") + annotate(geom = "text", x = 4, y = 1117, label = "1167") + annotate(geom = "text", x = 5, y = 481, label = "531") + annotate(geom = "text", x = 6, y = 431, label = "481") + annotate(geom = "text", x = 7, y = 322, label = "372")
d5 <- table(EDA_Distinct$Marital_Status, EDA_Distinct$Gender, EDA_Distinct$Age)
p10 <- ggplot(EDA_Distinct, aes(x= Age,fill= Gender, col= Marital_Status)) + geom_bar(position = "dodge", size=1.25) + labs(x="Age Group",y="No. of distinct buyer") + scale_fill_manual(values=c("hotpink", "royalblue")) + scale_color_manual(values=c("tan4","limegreen")) + ggtitle("")
p11 <- ggplot(EDA_Distinct,aes(x=Age,fill=Marital_Status))+geom_bar(position = "dodge")+facet_grid(Gender~.) + scale_fill_manual(values=c("tan4","limegreen"))
#grid.arrange(p9, p10, p11, ncol = 5, nrow = 1); wastrying to create thumbnail
d4; p9; d5; p10; p11
##
## 0-17 18-25 26-35 36-45 46-50 51-55 55+
## 218 1069 2053 1167 531 481 372
## , , = 0-17
##
##
## Female Male
## Married 0 0
## Single 78 140
##
## , , = 18-25
##
##
## Female Male
## Married 70 174
## Single 217 608
##
## , , = 26-35
##
##
## Female Male
## Married 225 584
## Single 320 924
##
## , , = 36-45
##
##
## Female Male
## Married 131 331
## Single 202 503
##
## , , = 46-50
##
##
## Female Male
## Married 133 242
## Single 49 107
##
## , , = 51-55
##
##
## Female Male
## Married 93 252
## Single 49 87
##
## , , = 55+
##
##
## Female Male
## Married 67 172
## Single 32 101
#This feature represent the age group of the customer
head(train_set$Occupation, 10)
## [1] 7 20 19 1 0 10 1 0 17 20
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
d6 <- table(EDA_Distinct$Occupation)
d7 <- table(EDA_Distinct$Gender, EDA_Distinct$Occupation)
p12 <- ggplot(EDA_Distinct, aes(x=Occupation, fill=Gender)) + geom_bar( col="black") + ggtitle("") + labs(x="Occupation",y="No. of distinct people") + scale_fill_manual(values=c("hotpink", "royalblue"))
d8 <- table(EDA_Distinct$Marital_Status, EDA_Distinct$Occupation)
p13 <- ggplot(EDA_Distinct, aes(x=Occupation, fill=Marital_Status)) + geom_bar( col="black") + ggtitle("") + labs(x="Occupation",y="No. of distinct people") + scale_fill_manual(values=c("tan4","limegreen"))
p14 <- ggplot(EDA_Distinct,aes(x=Occupation, fill=Age))+geom_bar()+facet_grid(Gender~Marital_Status)
#grid.arrange(p12, p13, p14, ncol = 5, nrow = 1);
d6; d7; p12; d8; p13; p14
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 688 517 256 170 740 111 228 669 17 88 192 128 376 140 294 140 235 491
## 18 19 20
## 67 71 273
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## Female 226 203 88 98 228 31 99 137 3 85 66 22 46 33 78 28
## Male 462 314 168 72 512 80 129 532 14 3 126 106 330 107 216 112
##
## 16 17 18 19 20
## Female 49 50 4 15 77
## Male 186 441 63 56 196
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## Married 286 238 112 80 196 46 126 314 6 47 8 55 157 90 143 67
## Single 402 279 144 90 544 65 102 355 11 41 184 73 219 50 151 73
##
## 16 17 18 19 20
## Married 125 203 32 14 129
## Single 110 288 35 57 144
#City_Category and Stay_In_Current_City_Years
#This feature represent the age group of the customer
head(train_set$Stay_In_Current_City_Years, 10); head(train_set$City_Category, 10)
## [1] 1 4+ 1 1 0 2 0 1 1 1
## Levels: 0 1 2 3 4+
## [1] B B A C B C B C C C
## Levels: A B C
d9 <- table(EDA_Distinct$City_Category, EDA_Distinct$Stay_In_Current_City_Years)
p15 <- ggplot(EDA_Distinct, aes(x=Stay_In_Current_City_Years, fill=City_Category)) + geom_bar( col="black") + ggtitle("") + labs(x="Stay in Current City (Years)",y="No. of distinct people")
p16 <- ggplot(EDA_Distinct,aes(City_Category,fill=Age))+geom_bar()
p17 <- ggplot(EDA_Distinct,aes(x=Age,fill=Stay_In_Current_City_Years))+geom_bar()+facet_grid(City_Category~ Stay_In_Current_City_Years)
p18 <- ggplot(EDA_Distinct,aes(x=Age,fill=Marital_Status))+geom_bar()+facet_grid(Gender~City_Category) + scale_fill_manual(values=c("tan4","limegreen"))
#grid.arrange(p15, p16, p17, p18, ncol = 5, nrow = 1);
d9; p15; p16; p17; p18
##
## 0 1 2 3 4+
## A 147 370 183 180 165
## B 211 608 342 295 251
## C 414 1108 620 504 493
# Product Category
#This feature is a unique Id for each store. Now lets first observe this features.
head(as.factor(train_set$Product_Category_1))
## [1] 3 3 3 3 3 3
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
head(as.factor(train_set$Product_Category_2))
## [1] 4 4 4 4 4 4
## Levels: 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
head(as.factor(train_set$Product_Category_3))
## [1] 5 5 5 5 5 5
## Levels: 3 4 5 6 8 9 10 11 12 13 14 15 16 17 18
Discussion It can be seen that there are 20 product categories for the product and to deal with the missing data we can do either of the two things described below: 1. We can create 20 binary variables for the 20 producgt type. 2. We can assign the missing value equal to product category of 1 i.e. same product category in the missing position.
quite possibly we would have some problem when using the 2 method when one of the value is missing out of three. the problem would lie which catgory to be pasted category 1 or 2.
I have planned to use 1st solution first.
#Feature manuplation and Creation - Training Set
# Converting Variable to Factor type
train_set$Product_Category_1 <- as.factor(train_set$Product_Category_1)
train_set$Product_Category_2 <- as.factor(train_set$Product_Category_2)
train_set$Product_Category_3 <- as.factor(train_set$Product_Category_3)
#Adding another factor level for missing value
train_set$Product_Category_2 <- factor(train_set$Product_Category_2, levels=c(levels(train_set$Product_Category_2), "0"))
train_set[is.na(train_set$Product_Category_2), "Product_Category_2"] <-"0"
#head(train_set$Product_Category_2, 10)
train_set$Product_Category_3 <- factor(train_set$Product_Category_3, levels=c(levels(train_set$Product_Category_3), "0"))
train_set[is.na(train_set$Product_Category_3), "Product_Category_3"] <-"0"
#head(train_set$Product_Category_3, 10)
# Creating new binary variable based on product category
train_set$Cat_1 <- as.factor(ifelse((train_set$Product_Category_1=='1' | train_set$Product_Category_2=='1' | train_set$Product_Category_3=='1'), 1,0))
for(i in 2:20)
{
assign(paste("Cat_", as.character(i), sep=""),as.factor(ifelse((train_set$Product_Category_1==i | train_set$Product_Category_2==i | train_set$Product_Category_3==i), 1,0)))
}
train_set <- cbind(train_set, Cat_2, Cat_3, Cat_4, Cat_5, Cat_6, Cat_7, Cat_8, Cat_9, Cat_10, Cat_11, Cat_12, Cat_13, Cat_14, Cat_15, Cat_16, Cat_17, Cat_18, Cat_19, Cat_20)
# Dropping the unnecessary variables
to_drop <- c("Product_Category_1", "Product_Category_2", "Product_Category_3")
train_set <- train_set[,!names(train_set)%in% to_drop]
rm(Cat_2, Cat_3, Cat_4, Cat_5, Cat_6, Cat_7, Cat_8, Cat_9, Cat_10, Cat_11, Cat_12, Cat_13, Cat_14, Cat_15, Cat_16, Cat_17, Cat_18, Cat_19, Cat_20)
#Checking the final structure of the dataset
dim(train_set)
## [1] 550068 31
as.matrix(sapply(train_set, function(x) class(x)))
## [,1]
## Product_ID "factor"
## User_ID "factor"
## Gender "factor"
## Age "factor"
## Occupation "factor"
## City_Category "factor"
## Stay_In_Current_City_Years "factor"
## Marital_Status "factor"
## Purchase "integer"
## User_Purchase_Count "integer"
## Product_Sold_Count "integer"
## Cat_1 "factor"
## Cat_2 "factor"
## Cat_3 "factor"
## Cat_4 "factor"
## Cat_5 "factor"
## Cat_6 "factor"
## Cat_7 "factor"
## Cat_8 "factor"
## Cat_9 "factor"
## Cat_10 "factor"
## Cat_11 "factor"
## Cat_12 "factor"
## Cat_13 "factor"
## Cat_14 "factor"
## Cat_15 "factor"
## Cat_16 "factor"
## Cat_17 "factor"
## Cat_18 "factor"
## Cat_19 "factor"
## Cat_20 "factor"
#Feature manuplation and Creation - Test Set
# Converting Variable to Factor type
test_set$Product_Category_1 <- as.factor(test_set$Product_Category_1)
test_set$Product_Category_2 <- as.factor(test_set$Product_Category_2)
test_set$Product_Category_3 <- as.factor(test_set$Product_Category_3)
#Adding another factor level for missing value
test_set$Product_Category_2 <- factor(test_set$Product_Category_2, levels=c(levels(test_set$Product_Category_2), "0"))
test_set[is.na(test_set$Product_Category_2), "Product_Category_2"] <-"0"
#head(test_set$Product_Category_2, 10)
test_set$Product_Category_3 <- factor(test_set$Product_Category_3, levels=c(levels(test_set$Product_Category_3), "0"))
test_set[is.na(test_set$Product_Category_3), "Product_Category_3"] <-"0"
#head(test_set$Product_Category_3, 10)
# Creating new binary variable based on product category
for(i in 1:20)
{
assign(paste("Cat_", as.character(i), sep=""),as.factor(ifelse((test_set$Product_Category_1==i | test_set$Product_Category_2==i | test_set$Product_Category_3==i), 1,0)))
}
test_set <- cbind(test_set, Cat_1, Cat_2, Cat_3, Cat_4, Cat_5, Cat_6, Cat_7, Cat_8, Cat_9, Cat_10, Cat_11, Cat_12, Cat_13, Cat_14, Cat_15, Cat_16, Cat_17, Cat_18, Cat_19, Cat_20)
# Dropping the unnecessary variables
to_drop <- c("Product_Category_1", "Product_Category_2", "Product_Category_3")
test_set <- test_set[,!names(test_set)%in% to_drop]
rm(Cat_1, Cat_2, Cat_3, Cat_4, Cat_5, Cat_6, Cat_7, Cat_8, Cat_9, Cat_10, Cat_11, Cat_12, Cat_13, Cat_14, Cat_15, Cat_16, Cat_17, Cat_18, Cat_19, Cat_20)
#Checking the final structure of the dataset
dim(test_set)
## [1] 233599 30
as.matrix(sapply(test_set, function(x) class(x)))
## [,1]
## Product_ID "factor"
## User_ID "factor"
## Gender "factor"
## Age "factor"
## Occupation "factor"
## City_Category "factor"
## Stay_In_Current_City_Years "factor"
## Marital_Status "factor"
## User_Purchase_Count "integer"
## Product_Sold_Count "integer"
## Cat_1 "factor"
## Cat_2 "factor"
## Cat_3 "factor"
## Cat_4 "factor"
## Cat_5 "factor"
## Cat_6 "factor"
## Cat_7 "factor"
## Cat_8 "factor"
## Cat_9 "factor"
## Cat_10 "factor"
## Cat_11 "factor"
## Cat_12 "factor"
## Cat_13 "factor"
## Cat_14 "factor"
## Cat_15 "factor"
## Cat_16 "factor"
## Cat_17 "factor"
## Cat_18 "factor"
## Cat_19 "factor"
## Cat_20 "factor"
#Clearing some work space
rm(d1,d2,d3,d4,d5,d6,d7,d8,d9,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18,i,to_drop)
# Final View
#str(train_set, list.len = 6);str(test_set, list.len = 6)
str(train_set); str(test_set)
## 'data.frame': 550068 obs. of 31 variables:
## $ Product_ID : Factor w/ 3631 levels "P00000142","P00000242",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ User_ID : Factor w/ 5891 levels "1000001","1000002",..: 3000 3309 235 5439 2386 612 1907 4255 4639 4588 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 1 2 2 2 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 3 4 2 2 3 1 2 4 4 2 ...
## $ Occupation : Factor w/ 21 levels "0","1","2","3",..: 8 21 20 2 1 11 2 1 18 21 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 2 2 1 3 2 3 2 3 3 3 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 2 5 2 2 1 3 1 2 2 2 ...
## $ Marital_Status : Factor w/ 2 levels "Married","Single": 1 2 2 1 1 2 1 2 2 2 ...
## $ Purchase : int 13642 8335 13374 10883 10843 11041 13206 10568 13578 10784 ...
## $ User_Purchase_Count : int 197 171 39 51 282 129 287 73 62 175 ...
## $ Product_Sold_Count : int 1152 1152 1152 1152 1152 1152 1152 1152 1152 1152 ...
## $ Cat_1 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_3 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Cat_4 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Cat_5 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Cat_6 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_7 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_8 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_9 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_10 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_11 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_12 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_13 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_14 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_15 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_16 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_17 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_18 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_19 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_20 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## 'data.frame': 233599 obs. of 30 variables:
## $ Product_ID : Factor w/ 3491 levels "P00000142","P00000242",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ User_ID : Factor w/ 5891 levels "1000001","1000002",..: 3697 2856 4519 2608 3201 5509 1549 1753 3190 5405 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 1 2 2 1 1 2 2 2 2 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 5 4 1 2 6 4 3 3 3 3 ...
## $ Occupation : Factor w/ 21 levels "0","1","2","3",..: 15 21 1 5 7 13 17 3 18 18 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 3 2 2 3 2 2 3 2 3 1 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 5 4 2 4 4 3 2 2 4 3 ...
## $ Marital_Status : Factor w/ 2 levels "Married","Single": 1 2 2 1 2 1 1 2 2 2 ...
## $ User_Purchase_Count : int 120 363 46 56 502 166 112 149 93 190 ...
## $ Product_Sold_Count : int 1152 1152 1152 1152 1152 1152 1152 1152 1152 1152 ...
## $ Cat_1 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_3 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Cat_4 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Cat_5 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Cat_6 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_7 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_8 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_9 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_10 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_11 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_12 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_13 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_14 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_15 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_16 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_17 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_18 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_19 : Factor w/ 1 level "0": 1 1 1 1 1 1 1 1 1 1 ...
## $ Cat_20 : Factor w/ 1 level "0": 1 1 1 1 1 1 1 1 1 1 ...
sapply(train_set, function(x) sum(is.na(x))); sapply(test_set, function(x) sum(is.na(x)))
## Product_ID User_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## Purchase User_Purchase_Count
## 0 0
## Product_Sold_Count Cat_1
## 0 0
## Cat_2 Cat_3
## 0 0
## Cat_4 Cat_5
## 0 0
## Cat_6 Cat_7
## 0 0
## Cat_8 Cat_9
## 0 0
## Cat_10 Cat_11
## 0 0
## Cat_12 Cat_13
## 0 0
## Cat_14 Cat_15
## 0 0
## Cat_16 Cat_17
## 0 0
## Cat_18 Cat_19
## 0 0
## Cat_20
## 0
## Product_ID User_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## User_Purchase_Count Product_Sold_Count
## 0 61
## Cat_1 Cat_2
## 0 0
## Cat_3 Cat_4
## 0 0
## Cat_5 Cat_6
## 0 0
## Cat_7 Cat_8
## 0 0
## Cat_9 Cat_10
## 0 0
## Cat_11 Cat_12
## 0 0
## Cat_13 Cat_14
## 0 0
## Cat_15 Cat_16
## 0 0
## Cat_17 Cat_18
## 0 0
## Cat_19 Cat_20
## 0 0
Conclusion From the various plot, plotted various kind of association and the inferences from the plot are easily observed.