Reading Transaction Dataset
trans <- read.csv("Retail data set//Transactions.csv")
head(trans)## transaction_id cust_id tran_date prod_subcat_code prod_cat_code Qty Rate
## 1 80712190438 270351 28-02-2014 1 1 -5 -772
## 2 29258453508 270384 27-02-2014 5 3 -5 -1497
## 3 51750724947 273420 24-02-2014 6 5 -2 -791
## 4 93274880719 271509 24-02-2014 11 6 -3 -1363
## 5 51750724947 273420 23-02-2014 6 5 -2 -791
## 6 97439039119 272357 23-02-2014 8 3 -2 -824
## Tax total_amt Store_type
## 1 405.300 -4265.300 e-Shop
## 2 785.925 -8270.925 e-Shop
## 3 166.110 -1748.110 TeleShop
## 4 429.345 -4518.345 e-Shop
## 5 166.110 -1748.110 TeleShop
## 6 173.040 -1821.040 TeleShop
Reading Customer Dataset
cust <- read.csv("Retail data set//Customer.csv")
head(cust)## customer_Id DOB Gender city_code
## 1 268408 02-01-1970 M 4
## 2 269696 07-01-1970 F 8
## 3 268159 08-01-1970 F 8
## 4 270181 10-01-1970 F 2
## 5 268073 11-01-1970 M 1
## 6 273216 15-01-1970 F 5
Reading Product Category Dataset
prod_cat_info <- read.csv("Retail data set//prod_cat_info.csv")
head(prod_cat_info)## prod_cat_code prod_cat prod_sub_cat_code prod_subcat
## 1 1 Clothing 4 Mens
## 2 1 Clothing 1 Women
## 3 1 Clothing 3 Kids
## 4 2 Footwear 1 Mens
## 5 2 Footwear 3 Women
## 6 2 Footwear 4 Kids
Merging Transaction and Customer dataset
trx_cust=merge(trans,cust,by.x ="cust_id" ,by.y ="customer_Id")
head(trx_cust)## cust_id transaction_id tran_date prod_subcat_code prod_cat_code Qty Rate
## 1 266783 25890929042 23-09-2011 1 2 4 1321
## 2 266783 16999552161 9/2/2013 10 5 2 835
## 3 266783 98477711300 21-10-2012 4 1 3 93
## 4 266783 25890929042 24-09-2011 1 2 -4 -1321
## 5 266783 8410316370 20-02-2013 4 1 1 869
## 6 266784 54234600611 23-08-2012 10 5 3 1291
## Tax total_amt Store_type DOB Gender city_code
## 1 554.820 5838.820 e-Shop 01-05-1974 M 4
## 2 175.350 1845.350 e-Shop 01-05-1974 M 4
## 3 29.295 308.295 TeleShop 01-05-1974 M 4
## 4 554.820 -5838.820 e-Shop 01-05-1974 M 4
## 5 91.245 960.245 e-Shop 01-05-1974 M 4
## 6 406.665 4279.665 TeleShop 13-12-1991 F 10
Merging trx_cust data with product category data
full_data=merge(trx_cust,prod_cat_info,by.x =c("prod_cat_code","prod_subcat_code"),by.y=c("prod_cat_code","prod_sub_cat_code"))
head(full_data)## prod_cat_code prod_subcat_code cust_id transaction_id tran_date Qty Rate
## 1 1 1 271847 788442271 11/10/2011 2 368
## 2 1 1 268161 44627697269 24-07-2012 3 1309
## 3 1 1 271803 41310269217 15-10-2011 4 624
## 4 1 1 267316 9044824427 14-10-2013 4 1386
## 5 1 1 271834 87754053788 30-03-2013 4 741
## 6 1 1 274956 70997907379 13-11-2013 4 1352
## Tax total_amt Store_type DOB Gender city_code prod_cat
## 1 77.280 813.280 e-Shop 24-12-1989 F 10 Clothing
## 2 412.335 4339.335 Flagship store 25-06-1979 M 4 Clothing
## 3 262.080 2758.080 TeleShop 28-05-1976 F 7 Clothing
## 4 582.120 6126.120 MBR 22-02-1992 M 1 Clothing
## 5 311.220 3275.220 TeleShop 12-05-1980 M 9 Clothing
## 6 567.840 5975.840 e-Shop 02-09-1981 M 10 Clothing
## prod_subcat
## 1 Women
## 2 Women
## 3 Women
## 4 Women
## 5 Women
## 6 Women
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)Q2.a)- Analyze which product categories are more popular among females vs male customers.
grp_gender <- full_data %>% group_by(Gender,prod_cat) %>%
summarise(category.popularity = length((transaction_id))) %>%
arrange(desc(category.popularity))## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.
grp_gender## # A tibble: 16 x 3
## # Groups: Gender [3]
## Gender prod_cat category.popularity
## <chr> <chr> <int>
## 1 "M" Books 3116
## 2 "F" Books 2949
## 3 "M" Electronics 2570
## 4 "F" Electronics 2328
## 5 "M" Home and kitchen 2134
## 6 "F" Home and kitchen 1994
## 7 "F" Footwear 1529
## 8 "M" Clothing 1518
## 9 "M" Footwear 1469
## 10 "F" Clothing 1439
## 11 "M" Bags 1004
## 12 "F" Bags 994
## 13 "" Books 4
## 14 "" Clothing 3
## 15 "" Footwear 1
## 16 "" Home and kitchen 1
Hence, “Books” is the most popular category among male and female customers followed by “Electronics” & “Home & Kitchen”.
Q2.b)- Which City code has the maximum customers and what was the percentage of customers from that city?
max_cust <- full_data %>% group_by(city_code) %>%
summarise(unique_cust = n_distinct(cust_id)) %>%
mutate(percent1=round(unique_cust/sum(unique_cust),3))%>%
arrange(desc(unique_cust))
max_cust## # A tibble: 11 x 3
## city_code unique_cust percent1
## <int> <int> <dbl>
## 1 3 576 0.105
## 2 5 570 0.104
## 3 4 569 0.103
## 4 7 563 0.102
## 5 8 551 0.1
## 6 2 546 0.099
## 7 10 546 0.099
## 8 1 535 0.097
## 9 9 532 0.097
## 10 6 516 0.094
## 11 NA 2 0
City code 3 has maximum number of unique customer
Q2.c)- Which store type sells the maximum products by value and by quantity?
max_prod <- full_data %>% group_by(Store_type) %>%
summarise(total_qty = sum(Qty),total_val= sum(total_amt)) %>%
arrange(desc(total_qty, total_val))
max_prod## # A tibble: 4 x 3
## Store_type total_qty total_val
## <chr> <int> <dbl>
## 1 e-Shop 22763 19824816.
## 2 MBR 11194 9674486.
## 3 Flagship store 11133 9715688.
## 4 TeleShop 10984 9364781.
e-Shop sell maximum products by value and by quantity.
Q2.d)- What was the total amount earned from the ”Electronics” and ”Clothing” categories from Flagship Stores?
Store_cat <- full_data %>% group_by(Store_type,prod_cat) %>%
summarise(total_val= sum(total_amt)) %>%
filter(Store_type== "Flagship store") %>%
filter(prod_cat== "Electronics" | prod_cat== "Clothing")## `summarise()` has grouped output by 'Store_type'. You can override using the `.groups` argument.
Store_cat## # A tibble: 2 x 3
## # Groups: Store_type [1]
## Store_type prod_cat total_val
## <chr> <chr> <dbl>
## 1 Flagship store Clothing 1194423.
## 2 Flagship store Electronics 2215136.
total_flag_elec_cloth=sum(Store_cat$total_val)total_flag_elec_cloth## [1] 3409559
the total amount earned from the ”Electronics” and ”Clothing” categories from Flagship Stores was 34,09,559.
Q2.e)-What was the total amount earned from ”Male” customers under the ”Electronics” category?
Male_Elec <- full_data %>% group_by(Gender,prod_cat) %>%
summarise(total_val= sum(total_amt)) %>%
filter(prod_cat== "Electronics" & Gender== "M")## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.
Male_Elec## # A tibble: 1 x 3
## # Groups: Gender [1]
## Gender prod_cat total_val
## <chr> <chr> <dbl>
## 1 M Electronics 5703109.
The total amount earned from ”Male” customers under the ”Electronics” category was 57,03,109.
Q2.f)- How many customers have more than 10 unique transactions, after removing all transactions which have any negative amounts? For all customers aged between 25 - 35, find out: a. What was the total amount spent for “Electronics” and “Books” product categories? b. What was the total amount spent by these customers between 1st Jan, 2014 to 1st Mar, 2014?
Filtering out negative data
non_neg_data= full_data %>% filter(total_amt>0)
summary(non_neg_data$total_amt)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 77.35 1030.69 2089.55 2608.44 3825.51 8287.50
Morethan10_tran <- non_neg_data %>% group_by(cust_id) %>%
summarise(unique_tran= length((transaction_id)))%>%
filter(unique_tran>10)
Morethan10_tran## # A tibble: 6 x 2
## cust_id unique_tran
## <int> <int>
## 1 266794 11
## 2 270535 11
## 3 270803 11
## 4 272741 11
## 5 273014 11
## 6 274227 11
count(Morethan10_tran)## # A tibble: 1 x 1
## n
## <int>
## 1 6
There are customer with 6 customers with more than 10 unique transactions.
Creating age column by subtracting DOB from today’s date
non_neg_data$DOB<- as.Date(non_neg_data$DOB,"%d-%m-%Y")
non_neg_data$age <- round((as.numeric(difftime(Sys.Date(),non_neg_data$DOB))/365.25),0)
head(non_neg_data)## prod_cat_code prod_subcat_code cust_id transaction_id tran_date Qty Rate
## 1 1 1 271847 788442271 11/10/2011 2 368
## 2 1 1 268161 44627697269 24-07-2012 3 1309
## 3 1 1 271803 41310269217 15-10-2011 4 624
## 4 1 1 267316 9044824427 14-10-2013 4 1386
## 5 1 1 271834 87754053788 30-03-2013 4 741
## 6 1 1 274956 70997907379 13-11-2013 4 1352
## Tax total_amt Store_type DOB Gender city_code prod_cat
## 1 77.280 813.280 e-Shop 1989-12-24 F 10 Clothing
## 2 412.335 4339.335 Flagship store 1979-06-25 M 4 Clothing
## 3 262.080 2758.080 TeleShop 1976-05-28 F 7 Clothing
## 4 582.120 6126.120 MBR 1992-02-22 M 1 Clothing
## 5 311.220 3275.220 TeleShop 1980-05-12 M 9 Clothing
## 6 567.840 5975.840 e-Shop 1981-09-02 M 10 Clothing
## prod_subcat age
## 1 Women 33
## 2 Women 43
## 3 Women 46
## 4 Women 30
## 5 Women 42
## 6 Women 41
Substituting “/” in tran_date column to “-”. And Further transforming it into date format
non_neg_data$tran_date <- gsub("/","-",non_neg_data$tran_date)
non_neg_data$tran_date<- as.Date(non_neg_data$tran_date,"%d-%m-%Y")Amount Spend on Electronic and Books by Age group 25-35…
Amount_spend <- non_neg_data %>% group_by(tran_date,age,prod_cat) %>%
summarise(total_val= sum(total_amt)) %>%
filter(age>=25 & age<=35)%>%
filter(prod_cat== "Electronics" | prod_cat== "Books")## `summarise()` has grouped output by 'tran_date', 'age'. You can override using the `.groups` argument.
sum(Amount_spend$total_val)## [1] 6702254
Customers between age group 25-35 spent 73,60,910 on “Electronics and Books.
Total amount spent by these customers between 1st Jan, 2014 to 1st Mar, 2014
Between_date_spend <- Amount_spend %>% group_by(tran_date) %>%
filter(tran_date>="2014-01-25" & tran_date<="2014-03-01")
sum(Between_date_spend$total_val)## [1] 162956.6
Total amount spent by these customers between 1st Jan, 2014 to 1st Mar, 2014 was 1,71,338. </font