df <- read.csv('marketing_one.csv')
df$date_served <- mdy(df$date_served)
df$date_subscribed <- mdy(df$date_subscribed)
df$date_canceled <- mdy(df$date_canceled)
head(df,5)
##      user_id date_served marketing_channel         variant converted
## 1 a100000029  2018-01-01         House Ads personalization      TRUE
## 2 a100000030  2018-01-01         House Ads personalization      TRUE
## 3 a100000031  2018-01-01         House Ads personalization      TRUE
## 4 a100000032  2018-01-01         House Ads personalization      TRUE
## 5 a100000033  2018-01-01         House Ads personalization      TRUE
##   language_displayed language_preferred   age_group date_subscribed
## 1            English            English  0-18 years      2018-01-01
## 2            English            English 19-24 years      2018-01-01
## 3            English            English 24-30 years      2018-01-01
## 4            English            English 30-36 years      2018-01-01
## 5            English            English 36-45 years      2018-01-01
##   date_canceled subscribing_channel is_retained
## 1          <NA>           House Ads        TRUE
## 2          <NA>           House Ads        TRUE
## 3          <NA>           House Ads        TRUE
## 4          <NA>           House Ads        TRUE
## 5          <NA>           House Ads        TRUE
str(df)
## 'data.frame':    10037 obs. of  12 variables:
##  $ user_id            : chr  "a100000029" "a100000030" "a100000031" "a100000032" ...
##  $ date_served        : Date, format: "2018-01-01" "2018-01-01" ...
##  $ marketing_channel  : chr  "House Ads" "House Ads" "House Ads" "House Ads" ...
##  $ variant            : chr  "personalization" "personalization" "personalization" "personalization" ...
##  $ converted          : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ language_displayed : chr  "English" "English" "English" "English" ...
##  $ language_preferred : chr  "English" "English" "English" "English" ...
##  $ age_group          : chr  "0-18 years" "19-24 years" "24-30 years" "30-36 years" ...
##  $ date_subscribed    : Date, format: "2018-01-01" "2018-01-01" ...
##  $ date_canceled      : Date, format: NA NA ...
##  $ subscribing_channel: chr  "House Ads" "House Ads" "House Ads" "House Ads" ...
##  $ is_retained        : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
factor = c("marketing_channel",
                  "subscribing_channel",
                  "variant",
                  "language_preferred",
                  "language_displayed")

df <- df %>%
  mutate(marketing_channel = ifelse(marketing_channel == '', 'Organic', marketing_channel)) %>% 
  mutate(subscribing_channel = ifelse(subscribing_channel == '', 'Organic', subscribing_channel)) %>% 
  mutate_at(vars(factor), as.factor)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(factor)
## 
##   # Now:
##   data %>% select(all_of(factor))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Jan summary
summary(df)
##    user_id           date_served         marketing_channel
##  Length:10037       Min.   :2018-01-01   Email    : 565   
##  Class :character   1st Qu.:2018-01-08   Facebook :1860   
##  Mode  :character   Median :2018-01-15   House Ads:4733   
##                     Mean   :2018-01-15   Instagram:1871   
##                     3rd Qu.:2018-01-22   Organic  :  15   
##                     Max.   :2018-01-31   Push     : 993   
##                     NA's   :16                            
##             variant     converted       language_displayed language_preferred
##  control        :5091   Mode :logical   Arabic :  27       Arabic : 145      
##  personalization:4946   FALSE:8946      English:9793       English:9275      
##                         TRUE :1076      German :  81       German : 167      
##                         NA's :15        Spanish: 136       Spanish: 450      
##                                                                              
##                                                                              
##                                                                              
##   age_group         date_subscribed      date_canceled       
##  Length:10037       Min.   :2018-01-01   Min.   :2018-01-05  
##  Class :character   1st Qu.:2018-01-07   1st Qu.:2018-02-07  
##  Mode  :character   Median :2018-01-15   Median :2018-03-04  
##                     Mean   :2018-01-14   Mean   :2018-03-03  
##                     3rd Qu.:2018-01-19   3rd Qu.:2018-04-01  
##                     Max.   :2018-01-31   Max.   :2018-05-09  
##                     NA's   :8181         NA's   :9460        
##  subscribing_channel is_retained    
##  Email    : 290      Mode :logical  
##  Facebook : 445      FALSE:577      
##  House Ads: 354      TRUE :1279     
##  Instagram: 600      NA's :8181     
##  Organic  :8181                     
##  Push     : 167                     
## 
#Percentage of NA values in each row
round((colSums(is.na(df))/ nrow(df)),2)
##             user_id         date_served   marketing_channel             variant 
##                0.00                0.00                0.00                0.00 
##           converted  language_displayed  language_preferred           age_group 
##                0.00                0.00                0.00                0.00 
##     date_subscribed       date_canceled subscribing_channel         is_retained 
##                0.82                0.94                0.00                0.82
# Conversion Rate for January
converion_rate = round((1076/nrow(df)),2)
print(converion_rate)
## [1] 0.11
# Retention Rate for January
retention_rate = round((1279/nrow(df)),2)
print(retention_rate)
## [1] 0.13
# NA values dropped 
jsr <- df %>% select(c(user_id,date_subscribed)) %>% 
              group_by(date_subscribed) %>% 
              summarise(user_id_count = n_distinct(user_id)) %>% 
              drop_na()

jsr_plot <- ggplot(jsr, aes(x=date_subscribed, y=user_id_count)) 
(jsr_plot + geom_line()  +
    ylab("Count") + xlab("Date of Subscription") + theme_bw() + 
    ggtitle("January Subcription Rate"))

age_sub <- df %>%
  group_by(age_group, subscribing_channel) %>%
  summarise(user_count = n()) %>%
  drop_na()
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
age_mark <- df %>%
  group_by(age_group, marketing_channel) %>%
  summarise(user_count = n()) %>%
  drop_na()
## `summarise()` has grouped output by 'age_group'. You can override using the
## `.groups` argument.
age_sub_plot <- ggplot(age_sub, aes(x=subscribing_channel, y= user_count, color = age_group))

p1 <- (age_sub_plot + geom_jitter(aes(color= age_group),position = position_dodge(width = .75))
  + xlab('Subscribing Channel') + ylab('Count') +
    ggtitle('Customer Capture January') + theme(legend.position = "none") )
age_mark_plot <- ggplot(age_mark, aes(x=marketing_channel, y= user_count, color = age_group))

p2 <- (age_mark_plot + geom_jitter(aes(color= age_group),position = position_dodge(width = .75))
  + xlab('Marketing Channel') + ylab(''))

cowplot::plot_grid(p1, p2, label_size = 7)