Foundation Assignment R- Q2

Reading Transaction Dataset

trans <- read.csv("Retail data set//Transactions.csv")
head(trans)

##   transaction_id cust_id  tran_date prod_subcat_code prod_cat_code Qty  Rate
## 1    80712190438  270351 28-02-2014                1             1  -5  -772
## 2    29258453508  270384 27-02-2014                5             3  -5 -1497
## 3    51750724947  273420 24-02-2014                6             5  -2  -791
## 4    93274880719  271509 24-02-2014               11             6  -3 -1363
## 5    51750724947  273420 23-02-2014                6             5  -2  -791
## 6    97439039119  272357 23-02-2014                8             3  -2  -824
##       Tax total_amt Store_type
## 1 405.300 -4265.300     e-Shop
## 2 785.925 -8270.925     e-Shop
## 3 166.110 -1748.110   TeleShop
## 4 429.345 -4518.345     e-Shop
## 5 166.110 -1748.110   TeleShop
## 6 173.040 -1821.040   TeleShop

Reading Customer Dataset

cust <- read.csv("Retail data set//Customer.csv")
head(cust)

##   customer_Id        DOB Gender city_code
## 1      268408 02-01-1970      M         4
## 2      269696 07-01-1970      F         8
## 3      268159 08-01-1970      F         8
## 4      270181 10-01-1970      F         2
## 5      268073 11-01-1970      M         1
## 6      273216 15-01-1970      F         5

Reading Product Category Dataset

prod_cat_info <- read.csv("Retail data set//prod_cat_info.csv")
head(prod_cat_info)

##   prod_cat_code prod_cat prod_sub_cat_code prod_subcat
## 1             1 Clothing                 4        Mens
## 2             1 Clothing                 1       Women
## 3             1 Clothing                 3        Kids
## 4             2 Footwear                 1        Mens
## 5             2 Footwear                 3       Women
## 6             2 Footwear                 4        Kids

Merging Transaction and Customer dataset

trx_cust=merge(trans,cust,by.x ="cust_id" ,by.y ="customer_Id")
head(trx_cust)

##   cust_id transaction_id  tran_date prod_subcat_code prod_cat_code Qty  Rate
## 1  266783    25890929042 23-09-2011                1             2   4  1321
## 2  266783    16999552161   9/2/2013               10             5   2   835
## 3  266783    98477711300 21-10-2012                4             1   3    93
## 4  266783    25890929042 24-09-2011                1             2  -4 -1321
## 5  266783     8410316370 20-02-2013                4             1   1   869
## 6  266784    54234600611 23-08-2012               10             5   3  1291
##       Tax total_amt Store_type        DOB Gender city_code
## 1 554.820  5838.820     e-Shop 01-05-1974      M         4
## 2 175.350  1845.350     e-Shop 01-05-1974      M         4
## 3  29.295   308.295   TeleShop 01-05-1974      M         4
## 4 554.820 -5838.820     e-Shop 01-05-1974      M         4
## 5  91.245   960.245     e-Shop 01-05-1974      M         4
## 6 406.665  4279.665   TeleShop 13-12-1991      F        10

Merging trx_cust data with product category data

full_data=merge(trx_cust,prod_cat_info,by.x =c("prod_cat_code","prod_subcat_code"),by.y=c("prod_cat_code","prod_sub_cat_code"))
head(full_data)

##   prod_cat_code prod_subcat_code cust_id transaction_id  tran_date Qty Rate
## 1             1                1  271847      788442271 11/10/2011   2  368
## 2             1                1  268161    44627697269 24-07-2012   3 1309
## 3             1                1  271803    41310269217 15-10-2011   4  624
## 4             1                1  267316     9044824427 14-10-2013   4 1386
## 5             1                1  271834    87754053788 30-03-2013   4  741
## 6             1                1  274956    70997907379 13-11-2013   4 1352
##       Tax total_amt     Store_type        DOB Gender city_code prod_cat
## 1  77.280   813.280         e-Shop 24-12-1989      F        10 Clothing
## 2 412.335  4339.335 Flagship store 25-06-1979      M         4 Clothing
## 3 262.080  2758.080       TeleShop 28-05-1976      F         7 Clothing
## 4 582.120  6126.120            MBR 22-02-1992      M         1 Clothing
## 5 311.220  3275.220       TeleShop 12-05-1980      M         9 Clothing
## 6 567.840  5975.840         e-Shop 02-09-1981      M        10 Clothing
##   prod_subcat
## 1       Women
## 2       Women
## 3       Women
## 4       Women
## 5       Women
## 6       Women

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

Q2.a)- Analyze which product categories are more popular among females vs male customers.

grp_gender <- full_data %>% group_by(Gender,prod_cat)  %>%
                    summarise(category.popularity = length((transaction_id)))   %>%
                    arrange(desc(category.popularity))

## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.

grp_gender

## # A tibble: 16 x 3
## # Groups:   Gender [3]
##    Gender prod_cat         category.popularity
##    <chr>  <chr>                          <int>
##  1 "M"    Books                           3116
##  2 "F"    Books                           2949
##  3 "M"    Electronics                     2570
##  4 "F"    Electronics                     2328
##  5 "M"    Home and kitchen                2134
##  6 "F"    Home and kitchen                1994
##  7 "F"    Footwear                        1529
##  8 "M"    Clothing                        1518
##  9 "M"    Footwear                        1469
## 10 "F"    Clothing                        1439
## 11 "M"    Bags                            1004
## 12 "F"    Bags                             994
## 13 ""     Books                              4
## 14 ""     Clothing                           3
## 15 ""     Footwear                           1
## 16 ""     Home and kitchen                   1

Hence, “Books” is the most popular category among male and female customers followed by “Electronics” & “Home & Kitchen”.

Q2.b)- Which City code has the maximum customers and what was the percentage of customers from that city?

max_cust <- full_data %>% group_by(city_code)  %>%
                    summarise(unique_cust = n_distinct(cust_id)) %>%
                    mutate(percent1=round(unique_cust/sum(unique_cust),3))%>%
                    arrange(desc(unique_cust)) 
                    
max_cust

## # A tibble: 11 x 3
##    city_code unique_cust percent1
##        <int>       <int>    <dbl>
##  1         3         576    0.105
##  2         5         570    0.104
##  3         4         569    0.103
##  4         7         563    0.102
##  5         8         551    0.1  
##  6         2         546    0.099
##  7        10         546    0.099
##  8         1         535    0.097
##  9         9         532    0.097
## 10         6         516    0.094
## 11        NA           2    0

City code 3 has maximum number of unique customer

Q2.c)- Which store type sells the maximum products by value and by quantity?

max_prod <- full_data %>% group_by(Store_type)  %>%
                    summarise(total_qty = sum(Qty),total_val= sum(total_amt)) %>%
                    arrange(desc(total_qty, total_val)) 
                    
max_prod

## # A tibble: 4 x 3
##   Store_type     total_qty total_val
##   <chr>              <int>     <dbl>
## 1 e-Shop             22763 19824816.
## 2 MBR                11194  9674486.
## 3 Flagship store     11133  9715688.
## 4 TeleShop           10984  9364781.

e-Shop sell maximum products by value and by quantity.

Q2.d)- What was the total amount earned from the ”Electronics” and ”Clothing” categories from Flagship Stores?

Store_cat <- full_data %>% group_by(Store_type,prod_cat)  %>%
                    summarise(total_val= sum(total_amt)) %>%
                    filter(Store_type== "Flagship store") %>% 
                    filter(prod_cat== "Electronics" | prod_cat== "Clothing")

## `summarise()` has grouped output by 'Store_type'. You can override using the `.groups` argument.

Store_cat

## # A tibble: 2 x 3
## # Groups:   Store_type [1]
##   Store_type     prod_cat    total_val
##   <chr>          <chr>           <dbl>
## 1 Flagship store Clothing     1194423.
## 2 Flagship store Electronics  2215136.

total_flag_elec_cloth=sum(Store_cat$total_val)

total_flag_elec_cloth

## [1] 3409559

the total amount earned from the ”Electronics” and ”Clothing” categories from Flagship Stores was 34,09,559.

Q2.e)-What was the total amount earned from ”Male” customers under the ”Electronics” category?

Male_Elec <- full_data %>% group_by(Gender,prod_cat)  %>%
                    summarise(total_val= sum(total_amt)) %>%
                    filter(prod_cat== "Electronics" & Gender== "M")

## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.

Male_Elec

## # A tibble: 1 x 3
## # Groups:   Gender [1]
##   Gender prod_cat    total_val
##   <chr>  <chr>           <dbl>
## 1 M      Electronics  5703109.

The total amount earned from ”Male” customers under the ”Electronics” category was 57,03,109.

Q2.f)- How many customers have more than 10 unique transactions, after removing all transactions which have any negative amounts? For all customers aged between 25 - 35, find out: a. What was the total amount spent for “Electronics” and “Books” product categories? b. What was the total amount spent by these customers between 1st Jan, 2014 to 1st Mar, 2014?

Filtering out negative data

non_neg_data= full_data %>% filter(total_amt>0)
summary(non_neg_data$total_amt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   77.35 1030.69 2089.55 2608.44 3825.51 8287.50

Morethan10_tran <- non_neg_data %>% group_by(cust_id)  %>%
                    summarise(unique_tran= length((transaction_id)))%>%
                    filter(unique_tran>10)
Morethan10_tran

## # A tibble: 6 x 2
##   cust_id unique_tran
##     <int>       <int>
## 1  266794          11
## 2  270535          11
## 3  270803          11
## 4  272741          11
## 5  273014          11
## 6  274227          11

count(Morethan10_tran)

## # A tibble: 1 x 1
##       n
##   <int>
## 1     6

There are customer with 6 customers with more than 10 unique transactions.

Creating age column by subtracting DOB from today’s date

non_neg_data$DOB<- as.Date(non_neg_data$DOB,"%d-%m-%Y")

non_neg_data$age <- round((as.numeric(difftime(Sys.Date(),non_neg_data$DOB))/365.25),0)
head(non_neg_data)

##   prod_cat_code prod_subcat_code cust_id transaction_id  tran_date Qty Rate
## 1             1                1  271847      788442271 11/10/2011   2  368
## 2             1                1  268161    44627697269 24-07-2012   3 1309
## 3             1                1  271803    41310269217 15-10-2011   4  624
## 4             1                1  267316     9044824427 14-10-2013   4 1386
## 5             1                1  271834    87754053788 30-03-2013   4  741
## 6             1                1  274956    70997907379 13-11-2013   4 1352
##       Tax total_amt     Store_type        DOB Gender city_code prod_cat
## 1  77.280   813.280         e-Shop 1989-12-24      F        10 Clothing
## 2 412.335  4339.335 Flagship store 1979-06-25      M         4 Clothing
## 3 262.080  2758.080       TeleShop 1976-05-28      F         7 Clothing
## 4 582.120  6126.120            MBR 1992-02-22      M         1 Clothing
## 5 311.220  3275.220       TeleShop 1980-05-12      M         9 Clothing
## 6 567.840  5975.840         e-Shop 1981-09-02      M        10 Clothing
##   prod_subcat age
## 1       Women  33
## 2       Women  43
## 3       Women  46
## 4       Women  30
## 5       Women  42
## 6       Women  41

Substituting “/” in tran_date column to “-”. And Further transforming it into date format

non_neg_data$tran_date <- gsub("/","-",non_neg_data$tran_date)
non_neg_data$tran_date<- as.Date(non_neg_data$tran_date,"%d-%m-%Y")

Amount Spend on Electronic and Books by Age group 25-35…

Amount_spend <- non_neg_data %>% group_by(tran_date,age,prod_cat)  %>%
                    summarise(total_val= sum(total_amt)) %>%
                    filter(age>=25 & age<=35)%>%
                    filter(prod_cat== "Electronics" | prod_cat== "Books")

## `summarise()` has grouped output by 'tran_date', 'age'. You can override using the `.groups` argument.

sum(Amount_spend$total_val)

## [1] 6702254

Customers between age group 25-35 spent 73,60,910 on “Electronics and Books.

Total amount spent by these customers between 1st Jan, 2014 to 1st Mar, 2014

Between_date_spend <- Amount_spend %>% group_by(tran_date)  %>%
                    filter(tran_date>="2014-01-25" & tran_date<="2014-03-01")
sum(Between_date_spend$total_val)

## [1] 162956.6

Total amount spent by these customers between 1st Jan, 2014 to 1st Mar, 2014 was 1,71,338. </font