# This script PayPal study case, using two data sets provided pp_cust_data.csv and subscriber_data_sample.csv
# Objective: perform a first-pass exploratory data analysis and evaluate whether the vendor offering is worth discussing with your boss and why or why not.
# Author Irina Max. Principal Data Scientist
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.84 loaded
library(ggplot2)
setwd('/Users/irinamax/Documents/R/Experiments/vendor_case_study_updated/')
pp <- read.csv('/Users/irinamax/Documents/R/Experiments/vendor_case_study_updated/pp_cust_data.csv')
pp %>% head
##             email_address active_send active_receive
## 1 UQZXVWHPAR@GSLJXDZV.com           1              0
## 2  RLKSLDNYTDW@SIDPRQ.com           1              0
## 3 ONVGUCDMEACP@HMXNGE.com           1              1
## 4      DSMIX686@gmail.com           1              0
## 5    QHVKQNOGRG@DHQWP.com           1              1
## 6         FGA@hotmail.com           0              0
pp %>% dim       # Data has 2172 rows and 3 col
## [1] 2172    3
pp %>% str       # active_send and active_receive has binary structure
## 'data.frame':    2172 obs. of  3 variables:
##  $ email_address : chr  "UQZXVWHPAR@GSLJXDZV.com" "RLKSLDNYTDW@SIDPRQ.com" "ONVGUCDMEACP@HMXNGE.com" "DSMIX686@gmail.com" ...
##  $ active_send   : int  1 1 1 1 1 0 1 1 1 1 ...
##  $ active_receive: int  0 0 1 0 1 0 0 0 1 1 ...
pp %>% summary   # but different mean, where we can see senders more active
##  email_address       active_send     active_receive  
##  Length:2172        Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:1.0000   1st Qu.:0.0000  
##  Mode  :character   Median :1.0000   Median :0.0000  
##                     Mean   :0.7813   Mean   :0.4848  
##                     3rd Qu.:1.0000   3rd Qu.:1.0000  
##                     Max.   :1.0000   Max.   :1.0000
sum(is.na(pp))   # not missing value
## [1] 0
#How many active users in PP list who buying and selling
pp_a<- pp %>%  filter((active_send %in% 1), (active_receive %in% 1))
pp_a %>% head(10)
##              email_address active_send active_receive
## 1  ONVGUCDMEACP@HMXNGE.com           1              1
## 2     QHVKQNOGRG@DHQWP.com           1              1
## 3   MEAVGBKPVB@GYAEKFE.com           1              1
## 4  URHEUJIWCRCI@LZKSZZ.com           1              1
## 5  TODRSMYDRZFY@UPRWCL.com           1              1
## 6    ZQVCDHAZPD@GCPJBA.com           1              1
## 7  ZKWYJEDSDX@QBJAROOY.com           1              1
## 8  WMLMCJWWMRBK@EFHITG.com           1              1
## 9    JZTNIAMISZ@NXXLFW.com           1              1
## 10  ZRDIQGIEEZNT@FEKKT.com           1              1
pp_a %>% dim  ##  here is 832  active used in the PP list
## [1] 832   3
# How many not active?
pp_ch<- pp %>%  filter((active_send %in% 0), (active_receive %in% 0))
pp_ch %>% head(10)
##                   email_address active_send active_receive
## 1               FGA@hotmail.com           0              0
## 2           UHSUDS380@gmail.com           0              0
## 3       SJYBRYLMKF@NTYDLZQV.com           0              0
## 4  outdoorlivingGLUBC@gmail.com           0              0
## 5               LSQII@gmail.com           0              0
## 6        XAFWRZEDUMJ@DSHTPY.com           0              0
## 7         PYVPSNVECUM@CJGHL.com           0              0
## 8      RFTCLEKUEDXI@RZVROQO.com           0              0
## 9           MBXA.MBXA@gmail.com           0              0
## 10    FDDCEANLVZHJ@HKFKIJUO.com           0              0
pp_ch %>% dim   # 254 unactive users, or I would call them churned
## [1] 254   3
# ratio in PP list is 254/832 = 0.3052885
paste("Ratio of Churn users in pp table during last year: ", length( pp_ch[,1])/length(pp_a[,1]))
## [1] "Ratio of Churn users in pp table during last year:  0.305288461538462"
# In histogram of PP list I can see magority are sending users active and there churn is not big
qplot(x =pp$active_send, fill=..count.., geom="histogram",main = 'Distribution of active senders in PP' )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(x =pp$active_receive, fill=..count.., geom="histogram", main = 'Distribution of active receivers in PP')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Most of the people churning are user who making receiving transection

#Uploading and explore sc Vendor list
sc <- read.csv('/Users/irinamax/Documents/R/Experiments/vendor_case_study_updated/subscriber_data_sample.csv')
sc %>% head
##                               email_address            industry
## 1                            POYZ@yahoo.com              garden
## 2                       VGID.VGID@yahoo.com  landscape engineer
## 3                KTCGW@homeandgardenXYT.com     home and garden
## 4 YOQUFSG.YOQUFSG@EVTlandscapearchitect.biz landscape architect
## 5              SOSNEJAL.SOSNEJAL@DXKDhg.net                    
## 6             GGYDNEE3@homeandgardenBAHG.co                    
##   relationship_length site_visits
## 1                   2          86
## 2                  30        2019
## 3                  30         225
## 4                   4          80
## 5                   4          20
## 6                  12          48
sc %>% summary
##  email_address        industry         relationship_length  site_visits     
##  Length:679         Length:679         Min.   : 1.000      Min.   :    0.0  
##  Class :character   Class :character   1st Qu.: 2.000      1st Qu.:   28.0  
##  Mode  :character   Mode  :character   Median : 5.000      Median :   97.0  
##                                        Mean   : 8.931      Mean   :  434.2  
##                                        3rd Qu.:13.000      3rd Qu.:  301.0  
##                                        Max.   :30.000      Max.   :16551.0
#We have 679 users in Vendor list, but we dont know who of them had PayPal transections.

#How much subcsrubers in every industry on the Vendor list
sc_count <- sc %>% group_by(sc$industry ) %>% count 
sc_count   # The list of the 20 industries and numbers of users in every one
## # A tibble: 20 x 2
## # Groups:   sc$industry [20]
##    `sc$industry`             n
##    <chr>                 <int>
##  1 ""                      103
##  2 "architect"              27
##  3 "designer"               28
##  4 "garden"                 34
##  5 "gardening"              31
##  6 "grower"                 29
##  7 "hg"                     27
##  8 "home and garden"        44
##  9 "landscape architect"    29
## 10 "landscape designer"     28
## 11 "landscape engineer"     39
## 12 "landscaper"             15
## 13 "landscaping"            17
## 14 "nursery"                39
## 15 "orchard"                22
## 16 "outdoor"                44
## 17 "outdoor living"         38
## 18 "plants"                 25
## 19 "supply"                 36
## 20 "vineyard"               24
ggplot(sc, aes(sc$industry,  fill= factor(industry))) +geom_bar(stat = "count")+coord_flip()+
  ggtitle("Number of customers by industry in Vendor list")
## Warning: Use of `sc$industry` is discouraged. Use `industry` instead.

#On the Boxplot we can see the mean of site visits by industry on the Vendor list
ggplot(sc, aes(x = industry, y = site_visits, fill =as.factor(industry))) +
  geom_boxplot() +scale_y_log10()+
  ggtitle("Vendor customers site visits by industry")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 23 rows containing non-finite values (stat_boxplot).

# We have list of 20 idustries include one '' empty or Unknown spot, but this spot is not NA
sum(is.na(sc))  # we have not any missing value
## [1] 0
# I want to know how many subscribers are in the Vendor list using PayPal.
# For this purpose I will merge tables by emails, and R going to catch it automatically
df <- merge(sc, pp)
df %>% dim  
## [1] 135   6
# only 135 subscribes from Vendor list using PayPal with 6 columns
# I can verify outcome with another possible way just for to check out dimentions
df_check <- sc %>% inner_join(pp, by ='email_address')   
df_check %>% dim
## [1] 135   6
# the same result 135 with 6 columns

# Total, max and min site visits of Active senders using PP payment by industry, sorted descending order
df %>%group_by(df$industry, active_send=1 ) %>% summarise(total_visit = sum(site_visits )) %>% arrange(-total_visit)
## `summarise()` has grouped output by 'df$industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   df$industry [20]
##    `df$industry`         active_send total_visit
##    <chr>                       <dbl>       <int>
##  1 "outdoor"                       1       19385
##  2 "gardening"                     1       18367
##  3 "landscape architect"           1       14050
##  4 "landscape engineer"            1        8724
##  5 "designer"                      1        8336
##  6 "nursery"                       1        6084
##  7 ""                              1        4673
##  8 "garden"                        1        4052
##  9 "outdoor living"                1        3573
## 10 "landscape designer"            1        2894
## 11 "plants"                        1        2346
## 12 "grower"                        1        2223
## 13 "landscaping"                   1        1804
## 14 "home and garden"               1        1761
## 15 "supply"                        1        1302
## 16 "architect"                     1        1224
## 17 "orchard"                       1        1209
## 18 "vineyard"                      1         902
## 19 "landscaper"                    1         195
## 20 "hg"                            1         134
df %>% group_by(industry, active_send=1) %>% summarise(max = max(site_visits )) %>% arrange(-max)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_send   max
##    <chr>                       <dbl> <int>
##  1 "outdoor"                       1 16551
##  2 "gardening"                     1 15437
##  3 "landscape architect"           1 11610
##  4 "designer"                      1  7924
##  5 "nursery"                       1  4569
##  6 "landscape engineer"            1  4350
##  7 ""                              1  2106
##  8 "garden"                        1  1571
##  9 "outdoor living"                1  1517
## 10 "grower"                        1  1384
## 11 "landscape designer"            1  1374
## 12 "plants"                        1  1003
## 13 "landscaping"                   1   939
## 14 "home and garden"               1   922
## 15 "orchard"                       1   806
## 16 "supply"                        1   543
## 17 "architect"                     1   496
## 18 "vineyard"                      1   477
## 19 "landscaper"                    1   171
## 20 "hg"                            1   114
df %>% group_by(industry, active_send=1) %>% summarise(min = min(site_visits )) %>% arrange(-min)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_send   min
##    <chr>                       <dbl> <int>
##  1 "landscape engineer"            1   105
##  2 "designer"                      1    55
##  3 "architect"                     1    47
##  4 "landscaping"                   1    34
##  5 "supply"                        1    25
##  6 "landscaper"                    1    24
##  7 "outdoor living"                1    24
##  8 "nursery"                       1    21
##  9 "hg"                            1    20
## 10 "gardening"                     1    15
## 11 "grower"                        1    14
## 12 "vineyard"                      1    11
## 13 "outdoor"                       1     9
## 14 "orchard"                       1     8
## 15 "landscape designer"            1     5
## 16 ""                              1     0
## 17 "garden"                        1     0
## 18 "home and garden"               1     0
## 19 "landscape architect"           1     0
## 20 "plants"                        1     0
# Total,max and min site visits of Not active senders PP payment by industry, sorted descending order
df %>%group_by(df$industry, active_send=0 ) %>% summarise(total_visit = sum(site_visits )) %>% arrange(-total_visit)
## `summarise()` has grouped output by 'df$industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   df$industry [20]
##    `df$industry`         active_send total_visit
##    <chr>                       <dbl>       <int>
##  1 "outdoor"                       0       19385
##  2 "gardening"                     0       18367
##  3 "landscape architect"           0       14050
##  4 "landscape engineer"            0        8724
##  5 "designer"                      0        8336
##  6 "nursery"                       0        6084
##  7 ""                              0        4673
##  8 "garden"                        0        4052
##  9 "outdoor living"                0        3573
## 10 "landscape designer"            0        2894
## 11 "plants"                        0        2346
## 12 "grower"                        0        2223
## 13 "landscaping"                   0        1804
## 14 "home and garden"               0        1761
## 15 "supply"                        0        1302
## 16 "architect"                     0        1224
## 17 "orchard"                       0        1209
## 18 "vineyard"                      0         902
## 19 "landscaper"                    0         195
## 20 "hg"                            0         134
df %>% group_by(industry, active_send=0) %>% summarise(max = max(site_visits )) %>% arrange(-max)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_send   max
##    <chr>                       <dbl> <int>
##  1 "outdoor"                       0 16551
##  2 "gardening"                     0 15437
##  3 "landscape architect"           0 11610
##  4 "designer"                      0  7924
##  5 "nursery"                       0  4569
##  6 "landscape engineer"            0  4350
##  7 ""                              0  2106
##  8 "garden"                        0  1571
##  9 "outdoor living"                0  1517
## 10 "grower"                        0  1384
## 11 "landscape designer"            0  1374
## 12 "plants"                        0  1003
## 13 "landscaping"                   0   939
## 14 "home and garden"               0   922
## 15 "orchard"                       0   806
## 16 "supply"                        0   543
## 17 "architect"                     0   496
## 18 "vineyard"                      0   477
## 19 "landscaper"                    0   171
## 20 "hg"                            0   114
df %>% group_by(industry, active_send=0) %>% summarise(min = min(site_visits )) %>% arrange(-min)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_send   min
##    <chr>                       <dbl> <int>
##  1 "landscape engineer"            0   105
##  2 "designer"                      0    55
##  3 "architect"                     0    47
##  4 "landscaping"                   0    34
##  5 "supply"                        0    25
##  6 "landscaper"                    0    24
##  7 "outdoor living"                0    24
##  8 "nursery"                       0    21
##  9 "hg"                            0    20
## 10 "gardening"                     0    15
## 11 "grower"                        0    14
## 12 "vineyard"                      0    11
## 13 "outdoor"                       0     9
## 14 "orchard"                       0     8
## 15 "landscape designer"            0     5
## 16 ""                              0     0
## 17 "garden"                        0     0
## 18 "home and garden"               0     0
## 19 "landscape architect"           0     0
## 20 "plants"                        0     0
# Active receivers paiment total, max and min
df %>% group_by(industry, active_receive=1) %>% summarise(total = sum(site_visits , na.rm = T)) %>% arrange(-total)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_receive total
##    <chr>                          <dbl> <int>
##  1 "outdoor"                          1 19385
##  2 "gardening"                        1 18367
##  3 "landscape architect"              1 14050
##  4 "landscape engineer"               1  8724
##  5 "designer"                         1  8336
##  6 "nursery"                          1  6084
##  7 ""                                 1  4673
##  8 "garden"                           1  4052
##  9 "outdoor living"                   1  3573
## 10 "landscape designer"               1  2894
## 11 "plants"                           1  2346
## 12 "grower"                           1  2223
## 13 "landscaping"                      1  1804
## 14 "home and garden"                  1  1761
## 15 "supply"                           1  1302
## 16 "architect"                        1  1224
## 17 "orchard"                          1  1209
## 18 "vineyard"                         1   902
## 19 "landscaper"                       1   195
## 20 "hg"                               1   134
df %>%  group_by(industry, active_receive=1) %>% summarise(max = max(site_visits , na.rm = T)) %>% arrange(-max)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_receive   max
##    <chr>                          <dbl> <int>
##  1 "outdoor"                          1 16551
##  2 "gardening"                        1 15437
##  3 "landscape architect"              1 11610
##  4 "designer"                         1  7924
##  5 "nursery"                          1  4569
##  6 "landscape engineer"               1  4350
##  7 ""                                 1  2106
##  8 "garden"                           1  1571
##  9 "outdoor living"                   1  1517
## 10 "grower"                           1  1384
## 11 "landscape designer"               1  1374
## 12 "plants"                           1  1003
## 13 "landscaping"                      1   939
## 14 "home and garden"                  1   922
## 15 "orchard"                          1   806
## 16 "supply"                           1   543
## 17 "architect"                        1   496
## 18 "vineyard"                         1   477
## 19 "landscaper"                       1   171
## 20 "hg"                               1   114
df %>%  group_by(industry, active_receive=1) %>% summarise(min = min(site_visits , na.rm = T)) %>% arrange(-min)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_receive   min
##    <chr>                          <dbl> <int>
##  1 "landscape engineer"               1   105
##  2 "designer"                         1    55
##  3 "architect"                        1    47
##  4 "landscaping"                      1    34
##  5 "supply"                           1    25
##  6 "landscaper"                       1    24
##  7 "outdoor living"                   1    24
##  8 "nursery"                          1    21
##  9 "hg"                               1    20
## 10 "gardening"                        1    15
## 11 "grower"                           1    14
## 12 "vineyard"                         1    11
## 13 "outdoor"                          1     9
## 14 "orchard"                          1     8
## 15 "landscape designer"               1     5
## 16 ""                                 1     0
## 17 "garden"                           1     0
## 18 "home and garden"                  1     0
## 19 "landscape architect"              1     0
## 20 "plants"                           1     0
# Not active receivers paiment total, max and min
df %>% group_by(industry, active_receive=0) %>% summarise(total = sum(site_visits , na.rm = T)) %>% arrange(-total)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_receive total
##    <chr>                          <dbl> <int>
##  1 "outdoor"                          0 19385
##  2 "gardening"                        0 18367
##  3 "landscape architect"              0 14050
##  4 "landscape engineer"               0  8724
##  5 "designer"                         0  8336
##  6 "nursery"                          0  6084
##  7 ""                                 0  4673
##  8 "garden"                           0  4052
##  9 "outdoor living"                   0  3573
## 10 "landscape designer"               0  2894
## 11 "plants"                           0  2346
## 12 "grower"                           0  2223
## 13 "landscaping"                      0  1804
## 14 "home and garden"                  0  1761
## 15 "supply"                           0  1302
## 16 "architect"                        0  1224
## 17 "orchard"                          0  1209
## 18 "vineyard"                         0   902
## 19 "landscaper"                       0   195
## 20 "hg"                               0   134
df %>%  group_by(industry, active_receive=0) %>% summarise(max = max(site_visits , na.rm = T)) %>% arrange(-max)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_receive   max
##    <chr>                          <dbl> <int>
##  1 "outdoor"                          0 16551
##  2 "gardening"                        0 15437
##  3 "landscape architect"              0 11610
##  4 "designer"                         0  7924
##  5 "nursery"                          0  4569
##  6 "landscape engineer"               0  4350
##  7 ""                                 0  2106
##  8 "garden"                           0  1571
##  9 "outdoor living"                   0  1517
## 10 "grower"                           0  1384
## 11 "landscape designer"               0  1374
## 12 "plants"                           0  1003
## 13 "landscaping"                      0   939
## 14 "home and garden"                  0   922
## 15 "orchard"                          0   806
## 16 "supply"                           0   543
## 17 "architect"                        0   496
## 18 "vineyard"                         0   477
## 19 "landscaper"                       0   171
## 20 "hg"                               0   114
df %>%  group_by(industry, active_receive=0) %>% summarise(min = min(site_visits , na.rm = T)) %>% arrange(-min)
## `summarise()` has grouped output by 'industry'. You can override using the `.groups` argument.
## # A tibble: 20 x 3
## # Groups:   industry [20]
##    industry              active_receive   min
##    <chr>                          <dbl> <int>
##  1 "landscape engineer"               0   105
##  2 "designer"                         0    55
##  3 "architect"                        0    47
##  4 "landscaping"                      0    34
##  5 "supply"                           0    25
##  6 "landscaper"                       0    24
##  7 "outdoor living"                   0    24
##  8 "nursery"                          0    21
##  9 "hg"                               0    20
## 10 "gardening"                        0    15
## 11 "grower"                           0    14
## 12 "vineyard"                         0    11
## 13 "outdoor"                          0     9
## 14 "orchard"                          0     8
## 15 "landscape designer"               0     5
## 16 ""                                 0     0
## 17 "garden"                           0     0
## 18 "home and garden"                  0     0
## 19 "landscape architect"              0     0
## 20 "plants"                           0     0
# What industry mostly loosing PayPal subscrubers from Vendor list, Visualisation of churm PayPal users in Vendor by industry
ggplot(df) +geom_bar(aes(x = df$industry, fill = active_send, position = "dodge")) + coord_flip()+  # senders
  ggtitle("Churn PayPal senders  during last yearby industry")
## Warning: Ignoring unknown aesthetics: position
## Warning: Use of `df$industry` is discouraged. Use `industry` instead.

ggplot(df) +geom_bar(aes(x = df$industry, fill = active_receive, position = "dodge"))+ coord_flip() + # receivers
  ggtitle("Churn PayPal receivers during last yearby industry")
## Warning: Ignoring unknown aesthetics: position

## Warning: Use of `df$industry` is discouraged. Use `industry` instead.

# Find all submitters in Vendor list who not active in PayPal during last year and churned
# create list of Active senders and receivers inPP list
pp_ch<- pp %>%  filter((active_send %in% 0), (active_receive %in% 0)) 
# Merge with Vendor list
df_ppch <-  merge(sc, pp_ch)
df_ppch %>% dim       # There are 55 submitters from Vendor list  who used to use PayPal but not use it anymore
## [1] 55  6
df_ppch %>%  summary  # We can see in summary some of them have long relationship with PayPay but churned for some reason
##  email_address        industry         relationship_length  site_visits     
##  Length:55          Length:55          Min.   : 1.000      Min.   :    0.0  
##  Class :character   Class :character   1st Qu.: 2.000      1st Qu.:   43.5  
##  Mode  :character   Mode  :character   Median : 5.000      Median :  121.0  
##                                        Mean   : 9.345      Mean   :  969.6  
##                                        3rd Qu.:15.000      3rd Qu.:  790.0  
##                                        Max.   :30.000      Max.   :15437.0  
##   active_send active_receive
##  Min.   :0    Min.   :0     
##  1st Qu.:0    1st Qu.:0     
##  Median :0    Median :0     
##  Mean   :0    Mean   :0     
##  3rd Qu.:0    3rd Qu.:0     
##  Max.   :0    Max.   :0
df_ppch$site_visits %>% sum  
## [1] 53326
# That's mean PayPal missing 53326 site visits, because this transectionsnot did not us PayPal last year

# 1. I would recomeded to send them some News letter or promotion to return them to business
# 2. Investgate: why long time submitters like more then 5 years churned last year and May be 
# PayPal need to use agressive marketing tools or direct contact, or call to return them back 
# 3. Optional: investigate the reason why they moved or using the other website for transactions.

# Churned PayPal cusomers during last year by industry from Vendor list in table and Visualization
df_ppch %>%  group_by(industry) %>% count 
## # A tibble: 19 x 2
## # Groups:   industry [19]
##    industry                  n
##    <chr>                 <int>
##  1 ""                        9
##  2 "architect"               1
##  3 "designer"                2
##  4 "garden"                  4
##  5 "gardening"               3
##  6 "grower"                  2
##  7 "hg"                      1
##  8 "home and garden"         1
##  9 "landscape architect"     2
## 10 "landscape designer"      1
## 11 "landscape engineer"      3
## 12 "landscaping"             3
## 13 "nursery"                 3
## 14 "orchard"                 3
## 15 "outdoor"                 5
## 16 "outdoor living"          6
## 17 "plants"                  2
## 18 "supply"                  2
## 19 "vineyard"                2
ggplot(df_ppch) +geom_bar(aes(x = industry, fill = "count", position = "dodge"))+coord_flip()+
  ggtitle("Churn PayPal customers during last yearby industry")
## Warning: Ignoring unknown aesthetics: position

# Fild submiters in Vendor list, who are activly sending and receiving using PayPal
pp_a<- pp %>%  filter((active_send %in% 1), (active_receive %in% 1))
df_ppa <- merge(sc, pp_a)   # all submitters using PayPal in the vendor list

df_ppa %>% dim  #  only 8 submitters still using PayPal
## [1] 8 6
ggplot(df_ppa) +geom_bar(aes(x = industry, fill = active_receive, position = "dodge"))+coord_flip()+
  ggtitle("Barplot who actively Sending and receiving")
## Warning: Ignoring unknown aesthetics: position

# PayPal must be happy to keep them and better stimulate/appreciate these users with loyalry rewards and etc.
pp_s<- pp %>%  filter((active_send %in% 1), (active_receive %in% 0))
pp_s %>% dim  # 865 only senders using PP for sending payment though PP but not resiving 
## [1] 865   3
df_pps <- merge(sc, pp_s)
df_pps %>% dim   # we have 39 submitters from Vendor list sender on the vendor list who no recived anything in last year
## [1] 39  6
df_pps %>% head 
##               email_address           industry relationship_length site_visits
## 1     ACFRXBMV928@yahoo.com             grower                   8        1384
## 2     AHXEWFME354@yahoo.com landscape engineer                  12        4350
## 3 AZFAMOK.AZFAMOK@yahoo.com    home and garden                  15         763
## 4            BRRK@gmail.com            outdoor                  19       16551
## 5          BTBKXQ@yahoo.com          gardening                   3          65
## 6         DGF.DGF@gmail.com             plants                  30         817
##   active_send active_receive
## 1           1              0
## 2           1              0
## 3           1              0
## 4           1              0
## 5           1              0
## 6           1              0
#We can sum out all sending transections using PayPal in Vendor list
df_pps$site_visits %>% sum  # 33136
## [1] 33136
#Visualisation
ggplot(df_pps) +geom_bar(aes(x = industry, fill = active_send, position = "dodge"))+coord_flip() +
  ggtitle("Barplot PayPal Active Senders on Vendor list ")
## Warning: Ignoring unknown aesthetics: position

pp_r<- pp %>%  filter((active_send %in% 0), (active_receive %in% 1))  # filter only recivers fro PP
pp_r %>% dim  # we have 221 in the Vendor list 
## [1] 221   3
df_ppr <-  merge(sc,pp_r)
df_ppr %>% dim # 33 subscriber from the vendor list only reciving payment using PP
## [1] 33  6
#We can sum out all receiving transections using PayPal in Vendor list
df_ppr$site_visits %>% sum  #  10677
## [1] 10677
#Visualisation
ggplot(df_ppr) +geom_bar(aes(x = industry, fill = active_receive, position = "dodge"))+coord_flip()+ 
  ggtitle("Barplot Active Receivers by industry on Vendor list")
## Warning: Ignoring unknown aesthetics: position

# I can Calculate Attrition rate base of the data I explored
55/135
## [1] 0.4074074
#[1] 0.4074074   # 40% is very high Attrition rate
paste("Churn Rate or Attrittion Rate of PayPal users in Vendor list during last year: ", length( df_ppch[,1])/length(df[,1]))
## [1] "Churn Rate or Attrittion Rate of PayPal users in Vendor list during last year:  0.407407407407407"
#"Ratio of Churn PayPal users in Vendor list during last year:  0.407407407407407"
# This number can be improved by Churn prevention with counting Prabability to Churn, but here is not enouch information.
#
# Creating churn column where not active subscribers are "0"
df$churn <- ifelse(df$active_send == 0 & df$active_receive==0, 0, 1)
df %>% head
##                email_address           industry relationship_length site_visits
## 1      ACFRXBMV928@yahoo.com             grower                   8        1384
## 2      AHXEWFME354@yahoo.com landscape engineer                  12        4350
## 3       ALCNGHDT@hotmail.com landscape designer                   2           5
## 4 architectBPNFEBC@yahoo.com          architect                   6          47
## 5 architectYCUDSQT@yahoo.com          architect                   8         339
## 6            AUBXC@yahoo.com           vineyard                   7         269
##   active_send active_receive churn
## 1           1              0     1
## 2           1              0     1
## 3           1              1     1
## 4           0              0     0
## 5           0              1     1
## 6           0              0     0
df %>%  group_by(churn) %>% count
## # A tibble: 2 x 2
## # Groups:   churn [2]
##   churn     n
##   <dbl> <int>
## 1     0    55
## 2     1    80
ggplot(df) + geom_bar(aes(x = churn)) +ggtitle("Churn subscribers")  #We can see how big actual churn PayPal users according Vendor information

library(corrplot)
df$industry %>% str
##  chr [1:135] "grower" "landscape engineer" "landscape designer" "architect" ...
#df %>% select_if((is.numeric)) %>%  cor %>% corrplot::corrplot()
# try spearman
cor.m <-  data.matrix(df)
df.cor <-  cor(cor.m, use = "pairwise.complete.obs", method= "spearman")            
df.cor %>% corrplot::corrplot()

# Corralation shows how Churn column depended from Senders and receivers

# Creating Logistic Regression model for predict Churn in Vendor list
#install.packages('rms')
library (rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
set.seed(55)
ind <- sample(2, nrow(df), replace = T, prob = c(0.8, 0.2))
train <- df[ind == 1,]
test <- df[ind == 2,]
logModel <- glm(churn ~ site_visits  +relationship_length +active_send+active_receive, family = binomial, train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
logModel        
## 
## Call:  glm(formula = churn ~ site_visits + relationship_length + active_send + 
##     active_receive, family = binomial, data = train)
## 
## Coefficients:
##         (Intercept)          site_visits  relationship_length  
##          -2.621e+01           -1.941e-05            7.672e-03  
##         active_send       active_receive  
##           5.213e+01            5.196e+01  
## 
## Degrees of Freedom: 98 Total (i.e. Null);  94 Residual
## Null Deviance:       135.5 
## Residual Deviance: 9.292e-10     AIC: 10
#Prediction on 20% test
pred <- predict(logModel,type = "response", test, na.action =na.exclude )
head(pred)
##           11           18           22           28           33           35 
## 1.000000e+00 1.000000e+00 1.000000e+00 4.650429e-12 5.118100e-12 1.000000e+00
pred <- round(pred)
(tab1 <-  table(test$churn,  pred))
##    pred
##      0  1
##   0 12  0
##   1  0 24
1 - sum(diag(tab1/sum(tab1)))  # missclassification error is "0", accuracy 100%
## [1] 0
# Also can calculate accuracy by recall and F1 value, which just confirm the model is good.
retrieved <- sum(pred)
precision <- sum(pred & test$churn) / retrieved
recall <- sum(pred & test$churn) / sum(test$churn)
F1 <- 2 * precision * recall / (precision + recall)
F1
## [1] 1
recall
## [1] 1
# Model is very accurately predicting Churn on test data but in reality with bigger data set 
# Missclassification error can be slightly different
# Boxplot by site visiting separating by Churn factor
ggplot(df, aes(x = industry, y = site_visits, fill =as.factor(churn))) +
  geom_boxplot() +coord_flip() +scale_y_log10()+facet_grid(.~churn) +
  ggtitle("Boxplot of Churn by industry, number of site visits")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

#The same with barplot, picture shows alert of churn visually in red color
ggplot(df, aes(x = industry, y = site_visits, fill =as.factor(churn))) +
  geom_bar(stat="identity")+coord_flip() +ggtitle("Barplot Churn by industry, number of site visits")

# Subscribers by industry base on long time of relationship
ggplot(df, aes(x = industry, y =relationship_length , fill = as.factor(churn))) +
  geom_boxplot()+coord_flip() +ggtitle("Churn subscribers by industry base on long time of relationship")

#  ---------------------------Part 6. Using churn factor as class---------------------------------
# Based on the Part 3, where I merged all users separately I got idea to make another column
# So I add another column "churn_s" where we can see not only churn as 1/0 but separated by class 
#  0 = Not active or churned
#  1 = only sender active
#  2 = only receiver active
#  3 = sender and receiver active
# within(df, df$churn_s <- ifelse((df$active_send == 0 & df$active_receive==0), 0,
#                                 ifelse((df$active_send == 1 & df$active_receive==0), 1,
#                                        ifelse((df$active_send == 0 & df$active_receive==1), 2, 3))) )

df1 <- df  # I will create new data frame for this purpose
df1$churn_s <- ifelse((df$active_send == 0 & df$active_receive==0), 0,
                      ifelse((df$active_send == 1 & df$active_receive==0), 1,
                             ifelse((df$active_send == 0 & df$active_receive==1), 2, 3)))

df1 %>% head
##                email_address           industry relationship_length site_visits
## 1      ACFRXBMV928@yahoo.com             grower                   8        1384
## 2      AHXEWFME354@yahoo.com landscape engineer                  12        4350
## 3       ALCNGHDT@hotmail.com landscape designer                   2           5
## 4 architectBPNFEBC@yahoo.com          architect                   6          47
## 5 architectYCUDSQT@yahoo.com          architect                   8         339
## 6            AUBXC@yahoo.com           vineyard                   7         269
##   active_send active_receive churn churn_s
## 1           1              0     1       1
## 2           1              0     1       1
## 3           1              1     1       3
## 4           0              0     0       0
## 5           0              1     1       2
## 6           0              0     0       0
##  Now we can see all of them by facet
# churn=0 with senders=1,  receivers=2 ,  active  senders and receivers=3
# by relationship
ggplot(df1, aes(x = industry, y =relationship_length , color = as.factor(churn_s))) +
  geom_boxplot()+facet_grid(.~churn_s)+coord_flip() +
  ggtitle("Boxplot Relationship years. Facet: churn=0, senders=1, receivers=2, senders and receivers=3")

# by site visits
ggplot(df1, aes(x = industry, y = site_visits, fill =as.factor(churn_s))) +
  geom_boxplot() +coord_flip() +scale_y_log10()+facet_grid(.~churn_s)+
  ggtitle("Boxplot Site Visitors: churn=0, senders=1, receivers=2, senders and receivers=3")
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 6 rows containing non-finite values (stat_boxplot).

#Barplot by site visits
ggplot(df1, aes(x = industry, y = site_visits, fill =as.factor(df1$churn_s))) +
  geom_bar(stat="identity")+coord_flip()+
  ggtitle("Site Visitors: churn=0, senders=1, receivers=2, senders and receivers=3")
## Warning: Use of `df1$churn_s` is discouraged. Use `churn_s` instead.

#Barplot by site visit separated by facet
ggplot(df1, aes(x = industry, y = site_visits, fill =as.factor(df1$churn_s))) +
  geom_bar(stat="identity")+coord_flip() +facet_grid(.~churn_s)+
  ggtitle("Site Visitors: churn=0, senders=1, receivers=2, senders and receivers=3")
## Warning: Use of `df1$churn_s` is discouraged. Use `churn_s` instead.

# Part of the Vendor data with subscrubers who never used PayPal must have big interest for PayPal manager.
# Users in this list can be potential PayPal customers.
# Now I anti join this tables
df_rest <- sc %>% anti_join(pp, by ='email_address')   
df_rest %>% dim   # there is 544 potental new customers for PayPal!!!
## [1] 544   4
# We can see how potential customers distributes across of industries
df_rest %>%  group_by(industry) %>% count 
## # A tibble: 20 x 2
## # Groups:   industry [20]
##    industry                  n
##    <chr>                 <int>
##  1 ""                       84
##  2 "architect"              23
##  3 "designer"               25
##  4 "garden"                 22
##  5 "gardening"              24
##  6 "grower"                 22
##  7 "hg"                     25
##  8 "home and garden"        39
##  9 "landscape architect"    23
## 10 "landscape designer"     20
## 11 "landscape engineer"     31
## 12 "landscaper"             13
## 13 "landscaping"            13
## 14 "nursery"                32
## 15 "orchard"                17
## 16 "outdoor"                33
## 17 "outdoor living"         31
## 18 "plants"                 18
## 19 "supply"                 29
## 20 "vineyard"               20
qplot(x =df_rest$relationship_length, fill=..count.., geom="histogram")+ggtitle("Potential customers by industry in Vendor list, X = time of relationship in years")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# How much potectial transection PayPal can have form this new customers
df_rest %>% group_by(industry) %>% summarise(total = sum(site_visits , na.rm = T)) %>% arrange(-total)
## # A tibble: 20 x 2
##    industry              total
##    <chr>                 <int>
##  1 ""                    25776
##  2 "landscape designer"  24729
##  3 "landscaping"         22179
##  4 "landscape architect" 19567
##  5 "home and garden"     13614
##  6 "designer"            12026
##  7 "nursery"             10510
##  8 "supply"               8823
##  9 "outdoor living"       8620
## 10 "outdoor"              8212
## 11 "architect"            6066
## 12 "landscape engineer"   5213
## 13 "hg"                   5024
## 14 "grower"               4555
## 15 "garden"               4349
## 16 "gardening"            4095
## 17 "orchard"              2856
## 18 "vineyard"             2441
## 19 "plants"               1798
## 20 "landscaper"           1113
# I sorted the rest of the vendor data by activity visiting site and pretty sure the first hundred or may be even more 
# will be interesting for PayPal Managment to recruit them with sending some AD letter or promotion letters to invite to PayPal
df_rest[ order(-df_rest$site_visits),] %>% head
##                               email_address            industry
## 28               landscapingCKBNQ@yahoo.com         landscaping
## 350       landscapedesignerSWKPTS@yahoo.com  landscape designer
## 289      RNYZOEZN@landscapearchitectSGT.net landscape architect
## 408                   nurseryTQVU@gmail.com             nursery
## 88  WAHWMBQC.WAHWMBQC@JOCBhomeandgarden.com     home and garden
## 369                           JAL@gmail.com                    
##     relationship_length site_visits
## 28                   30       16227
## 350                  30       14585
## 289                  30       14041
## 408                  30        5862
## 88                   30        3789
## 369                  30        3750
potential_200 <- df_rest[ order(-df_rest$site_visits),] %>% head(100)
potential_200 %>% head(20)
##                               email_address            industry
## 28               landscapingCKBNQ@yahoo.com         landscaping
## 350       landscapedesignerSWKPTS@yahoo.com  landscape designer
## 289      RNYZOEZN@landscapearchitectSGT.net landscape architect
## 408                   nurseryTQVU@gmail.com             nursery
## 88  WAHWMBQC.WAHWMBQC@JOCBhomeandgarden.com     home and garden
## 369                           JAL@gmail.com                    
## 462         EHQKF@HMFUlandscapedesigner.net  landscape designer
## 216             HOOQ1@LXGKoutdoorliving.com      outdoor living
## 297                  outdoorBFVIM@gmail.com             outdoor
## 366                  UJE@QSXlandscaping.biz         landscaping
## 110                  designerTUBU@gmail.com            designer
## 136                  designerMGDU@gmail.com            designer
## 479                   JFK.JFK@supplyUIHY.co              supply
## 536               VJXVKHarchitect@gmail.com           architect
## 17           GNTlandscapedesigner@gmail.com                    
## 207         KQKHlandscapedesigner@yahoo.com  landscape designer
## 309                 TCIQarchitect@yahoo.com           architect
## 258         DTQXlandscapedesigner@yahoo.com  landscape designer
## 7       UXHPKDAlandscapearchitect@yahoo.com landscape architect
## 111     DAHPAM.DAHPAM@NJSVhomeandgarden.biz     home and garden
##     relationship_length site_visits
## 28                   30       16227
## 350                  30       14585
## 289                  30       14041
## 408                  30        5862
## 88                   30        3789
## 369                  30        3750
## 462                  30        3587
## 216                  30        3476
## 297                  30        3072
## 366                  30        2858
## 110                  24        2849
## 136                  21        2787
## 479                  19        2680
## 536                  30        1739
## 17                   11        1709
## 207                   4        1657
## 309                  30        1623
## 258                  23        1564
## 7                    24        1516
## 111                  20        1504