This document contains a log of exploring the insurance datasets. There were two main goals: (1) to get a feel for the data; (2) to understand how retention can be measured. The results of some calculations are available in an interactive Shiny dashboard.
All procedures were conducted in the R language. The source code is available in a GitLab repository: https://gitlab.com/Matrunich/insurance_customers_retention. The Shiny dashboard is published on the Shinyapps.io server: https://malexan.shinyapps.io/customer_retention/. This RMarkdown document itself, in knitted form, is provided through the RPubs service: http://rpubs.com/malexan/retention.
ds %>%
  iwalk(~{
    # For each dataset: visualise column types with vis_dat(), then print summary() and glimpse()
    print(vis_dat(.x, warn_large_data = FALSE) + ggtitle(.y))
    print(summary(.x))
    glimpse(.x)
    # print(skim(.x))
  })
## community_id community_name district region
## Min. : 695042 Gyangvuur : 3 Ejura-Sekyedumase: 222 GH-UE: 631
## 1st Qu.: 696060 Beposo : 3 Yendi Municipal : 188 GH-NP:1949
## Median : 697084 Tambona : 2 Sene West : 180 GH-UW: 752
## Mean :1063680 Cheshe : 2 Wa West : 159 GH-BA: 510
## 3rd Qu.:1729606 Tindugu : 2 East Mamprusi : 147 GH-AH: 224
## Max. :1731513 Nyong-Guma: 2 Garu-Tempane : 138 GH-TV: 1
## (Other) :4053 (Other) :3033
## country iso_code latitude longitude
## Ghana:4067 GH:4067 Min. :-1.576 Min. :-2.9268
## 1st Qu.: 9.125 1st Qu.:-1.3789
## Median : 9.655 Median :-0.8312
## Mean : 9.544 Mean :-0.9295
## 3rd Qu.:10.480 3rd Qu.:-0.2380
## Max. :52.190 Max. :69.1316
## NA's :530 NA's :528
## Observations: 4,067
## Variables: 8
## $ community_id <int> 695087, 696669, 696638, 696634, 1373566, 696652, …
## $ community_name <fct> Sinyangsa-Abuluk 1, Azoungo, Abugudabogo, Bobalan…
## $ district <fct> Builsa, Bawku West, Bawku West, Sawla-Tuna-Kalba,…
## $ region <fct> GH-UE, GH-UE, GH-UE, GH-NP, GH-UW, GH-UE, GH-NP, …
## $ country <fct> Ghana, Ghana, Ghana, Ghana, Ghana, Ghana, Ghana, …
## $ iso_code <fct> GH, GH, GH, GH, GH, GH, GH, GH, GH, GH, GH, GH, G…
## $ latitude <dbl> 10.626207, 10.815525, 10.714040, 9.609643, 9.3964…
## $ longitude <dbl> -1.253698, -0.507512, -0.438528, -2.386182, -0.83…
## community_payout_id community_id paid_date paid_by
## Min. : 1.0 Min. : 695042 Min. :2018-10-08 Min. : 15
## 1st Qu.:157.8 1st Qu.: 696327 1st Qu.:2018-11-13 1st Qu.:1262
## Median :314.5 Median :1349779 Median :2018-12-07 Median :1262
## Mean :315.2 Mean :1229340 Mean :2018-12-18 Mean :1238
## 3rd Qu.:471.2 3rd Qu.:1730174 3rd Qu.:2019-01-02 3rd Qu.:1264
## Max. :659.0 Max. :1731491 Max. :2019-06-19 Max. :1403
## paid_to transaction_amount confirmation_code
## Min. : 35 Min. : 1.0 Length:628
## 1st Qu.:11109 1st Qu.: 18.5 Class :character
## Median :13486 Median : 50.5 Mode :character
## Mean :12468 Mean : 127.4
## 3rd Qu.:16609 3rd Qu.: 120.4
## Max. :17744 Max. :2623.0
## created_at status season
## Min. :2019-02-05 23:19:24 payout due:217 Length:628
## 1st Qu.:2019-02-05 23:19:31 paid :158 Class :character
## Median :2019-02-05 23:19:40 renewed :253 Mode :character
## Mean :2019-02-17 13:26:25
## 3rd Qu.:2019-02-05 23:19:48
## Max. :2019-06-21 16:49:30
## Observations: 628
## Variables: 10
## $ community_payout_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ community_id <int> 782645, 1349779, 695163, 696672, 1729543, 17…
## $ paid_date <date> 2018-10-08, 2018-10-17, 2018-12-24, 2018-12…
## $ paid_by <int> 1262, 1262, 1262, 1262, 1262, 1262, 1262, 12…
## $ paid_to <int> 4323, 7789, 13462, 6641, 14055, 14055, 11969…
## $ transaction_amount <dbl> 1292.0, 1992.0, 122.0, 46.0, 106.0, 36.0, 29…
## $ confirmation_code <chr> "4423366468", "4486647010", "5002509062", "4…
## $ created_at <dttm> 2019-02-05 23:19:24, 2019-02-05 23:19:25, 2…
## $ status <fct> payout due, payout due, payout due, payout d…
## $ season <chr> "2018 major", "2018 major", "2018 major", "2…
## customer_policy_id date_issued created_at
## Min. : 4725 Min. :2015-01-01 Min. :2018-02-12 22:13:58
## 1st Qu.:19098 1st Qu.:2018-02-16 1st Qu.:2018-04-11 00:09:32
## Median :27143 Median :2018-04-27 Median :2018-05-01 00:13:52
## Mean :28356 Mean :2018-01-14 Mean :2018-05-30 16:34:26
## 3rd Qu.:38346 3rd Qu.:2018-06-05 3rd Qu.:2018-06-19 09:55:50
## Max. :50437 Max. :2019-12-04 Max. :2019-07-19 12:17:22
##
## updated_at customer_id date_planted
## Min. :2018-07-24 11:22:37 Min. : 1 Min. :2018-03-03
## 1st Qu.:2018-11-14 02:16:18 1st Qu.: 22492 1st Qu.:2018-05-28
## Median :2018-11-22 07:10:06 Median :176574 Median :2018-06-19
## Mean :2019-01-18 18:19:40 Mean :122641 Mean :2018-07-06
## 3rd Qu.:2019-06-12 17:24:18 3rd Qu.:191146 3rd Qu.:2018-07-10
## Max. :2019-07-18 08:12:20 Max. :206875 Max. :2019-12-31
## NA's :1387 NA's :15507
## date_priced status crop
## Min. :2018-06-04 expired :19780 Maize :17203
## 1st Qu.:2018-07-26 pending : 1632 Groundnut: 4452
## Median :2018-09-13 paid out : 1579 Sorghum : 1217
## Mean :2018-09-08 payout due: 678 Rice : 1312
## 3rd Qu.:2018-10-08 active : 210 Soybean : 11
## Max. :2019-07-04 planted : 156
## NA's :7467 (Other) : 160
## season
## Length:24195
## Class :character
## Mode :character
##
##
##
##
## Observations: 24,195
## Variables: 10
## $ customer_policy_id <int> 50324, 49257, 48940, 50314, 50398, 7569, 5654…
## $ date_issued <date> 2019-02-12, 2019-01-16, 2018-12-14, 2019-06-…
## $ created_at <dttm> 2019-06-13 07:04:00, 2019-01-19 07:04:29, 20…
## $ updated_at <dttm> NA, 2019-06-13 07:04:01, 2019-06-21 17:55:52…
## $ customer_id <int> 187920, 187933, 205040, 204826, 5768, 92800, …
## $ date_planted <date> NA, NA, 2019-04-11, 2019-02-26, NA, 2018-05-…
## $ date_priced <date> NA, NA, 2019-06-17, 2019-06-17, NA, 2018-10-…
## $ status <fct> pending, pending, active, expired, pending, e…
## $ crop <fct> Maize, Groundnut, Maize, Maize, Maize, Maize,…
## $ season <chr> "2019 minor", "2019 minor", "2019 major", "20…
## customer_id community_id gender literacy
## Min. : 1 Min. : 695042 M : 4924 native_rw :15473
## 1st Qu.: 27824 1st Qu.: 695752 F : 2904 none : 4680
## Median :178639 Median : 696720 NA's:12613 national_r : 11
## Mean :126204 Mean : 941286 native_r : 265
## 3rd Qu.:191322 3rd Qu.:1062790 national_rw: 12
## Max. :206875 Max. :1731513
##
## has_mobile_money registration_date farm_size
## Mode :logical Min. :2016-11-09 Min. : 1.00
## FALSE:3305 1st Qu.:2017-03-24 1st Qu.:13.00
## TRUE :923 Median :2018-03-26 Median :13.00
## NA's :16213 Mean :2017-11-21 Mean :12.88
## 3rd Qu.:2018-05-31 3rd Qu.:13.00
## Max. :2019-07-01 Max. :60.00
## NA's :3
## created_at updated_at
## Min. :2018-02-15 21:43:14 Min. :2018-02-17 00:03:14
## 1st Qu.:2018-02-15 21:43:14 1st Qu.:2018-02-18 00:20:23
## Median :2018-03-26 15:24:37 Median :2018-02-18 02:37:08
## Mean :2018-04-19 05:22:53 Mean :2018-05-19 19:21:38
## 3rd Qu.:2018-05-31 19:38:04 3rd Qu.:2018-08-06 22:51:37
## Max. :2019-07-18 08:10:44 Max. :2019-07-18 08:10:47
## NA's :10143
## ussd_created has_phone
## Mode :logical Mode :logical
## FALSE:20439 FALSE:16582
## TRUE :2 TRUE :3859
##
##
##
##
## Observations: 20,441
## Variables: 11
## $ customer_id <int> 30, 29, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26…
## $ community_id <int> 695118, 695118, 697075, 696863, 696863, 696863…
## $ gender <fct> M, M, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ literacy <fct> native_rw, native_rw, native_rw, native_rw, na…
## $ has_mobile_money <lgl> FALSE, TRUE, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ registration_date <date> 2016-11-09, 2016-11-09, 2016-11-09, 2016-11-0…
## $ farm_size <dbl> 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13…
## $ created_at <dttm> 2018-02-15 21:43:14, 2018-02-15 21:43:14, 201…
## $ updated_at <dttm> 2018-11-28 07:02:07, 2018-11-29 07:02:18, 201…
## $ ussd_created <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## $ has_phone <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
## policy_transaction_id customer_policy_id transaction_type
## Min. : 4544 Min. : 4725 Length:29069
## 1st Qu.:21867 1st Qu.:18438 Class :character
## Median :32118 Median :26939 Mode :character
## Mean :32848 Mean :27807
## 3rd Qu.:44060 3rd Qu.:37361
## Max. :58946 Max. :50433
##
## payment_method transaction_date transaction_amount
## Length:29069 Min. :2015-01-01 Min. : 0.000
## Class :character 1st Qu.:2018-02-28 1st Qu.: 2.000
## Mode :character Median :2018-04-30 Median : 5.000
## Mean :2018-01-29 Mean : 8.712
## 3rd Qu.:2018-06-13 3rd Qu.: 10.000
## Max. :2019-07-16 Max. :833.000
##
## receipt_number created_at
## Length:29069 Min. :2018-02-12 22:13:59
## Class :character 1st Qu.:2018-04-18 00:05:52
## Mode :character Median :2018-05-08 15:58:42
## Mean :2018-06-03 09:59:07
## 3rd Qu.:2018-06-26 23:51:00
## Max. :2019-07-18 08:11:53
##
## updated_at currency
## Min. :2018-03-20 18:19:01 GHS:29069
## 1st Qu.:2018-03-22 22:08:43
## Median :2018-07-28 07:06:18
## Mean :2018-08-04 19:36:14
## 3rd Qu.:2018-10-23 16:53:31
## Max. :2019-07-18 08:12:19
## NA's :20596
## Observations: 29,069
## Variables: 10
## $ policy_transaction_id <int> 40261, 4899, 12138, 4916, 32057, 32058, 61…
## $ customer_policy_id <int> 36340, 5096, 11277, 5113, 24534, 24536, 47…
## $ transaction_type <chr> "payment", "payment", "payment", "payment"…
## $ payment_method <chr> "cash", "mobile money", "mobile money", "m…
## $ transaction_date <date> 2018-05-25, 2018-02-13, 2018-03-15, 2018-…
## $ transaction_amount <dbl> 5, 2, 10, 5, 4, 1, 1, 1, 2, 2, 2, 10, 2, 5…
## $ receipt_number <chr> "GH0003958", "GH0006694", "GH0007208", "GH…
## $ created_at <dttm> 2018-06-04 21:01:11, 2018-02-14 00:11:06,…
## $ updated_at <dttm> 2018-10-23 16:54:01, 2018-03-22 22:05:03,…
## $ currency <fct> GHS, GHS, GHS, GHS, GHS, GHS, GHS, GHS, GH…
Missingness overview, sampling up to 1,000 random observations per dataset.
ds %>%
  iwalk(~ print(vis_miss(
    # Sample large tables so the plot stays readable
    if (nrow(.x) > 1000L) sample_n(.x, 1000L) else .x,
    cluster = TRUE,
    sort_miss = TRUE,
    warn_large_data = FALSE) + ggtitle(.y)))
ds$customer_policies %>%
filter(date_planted > today())
## # A tibble: 8 x 10
## customer_policy… date_issued created_at updated_at
## <int> <date> <dttm> <dttm>
## 1 50374 2019-06-29 2019-06-29 10:54:49 2019-06-29 11:35:37
## 2 38590 2018-06-13 2018-06-20 12:20:18 2019-03-20 07:05:28
## 3 21450 2016-05-19 2018-04-23 22:00:05 2019-03-22 11:50:10
## 4 22200 2016-06-21 2018-04-23 22:02:13 2019-03-22 14:04:24
## 5 22270 2016-07-01 2018-04-23 22:02:24 2019-03-22 14:20:06
## 6 22380 2016-05-05 2018-04-23 22:02:43 2019-06-29 10:53:10
## 7 22362 2016-05-05 2018-04-23 22:02:40 2019-03-28 11:32:55
## 8 21550 2016-05-17 2018-04-23 22:00:23 2019-03-30 12:48:34
## # … with 6 more variables: customer_id <int>, date_planted <date>,
## # date_priced <date>, status <fct>, crop <fct>, season <chr>
What does this mean? It is a good example where the validate package could be applied.
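For instance, a minimal sketch with the validate package (assuming it is installed; the rule name is ours):
library(validate)
# Rule: a planting date should not lie in the future
rules <- validator(planted_in_past = date_planted <= Sys.Date())
# Confront the policies table with the rule and summarise violations
confront(ds$customer_policies, rules) %>%
  summary()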
Let’s check whether season matches the other date columns.
ds$customers %>%
select(customer_id) %>%
sample_n(1) %>%
left_join(data %>%
mutate_if(is.POSIXt, as.Date) %>%
select_if(map_lgl(., is.Date) | names(.) %in% c(
"customer_id",
"policy_transaction_id",
"customer_policy_id")),
by = 'customer_id') %>%
gather("event", "date", -ends_with("_id")) %>%
arrange(date)
## # A tibble: 11 x 5
## customer_id customer_policy_… policy_transactio… event date
## <int> <int> <int> <chr> <date>
## 1 195136 39524 44694 date_planted 2018-06-07
## 2 195136 39524 44694 date_issued 2018-06-13
## 3 195136 39524 44694 transaction… 2018-06-13
## 4 195136 39524 44694 registratio… 2018-06-29
## 5 195136 39524 44694 client_crea… 2018-06-29
## 6 195136 39524 44694 policy_crea… 2018-06-29
## 7 195136 39524 44694 transaction… 2018-06-29
## 8 195136 39524 44694 date_priced 2018-07-26
## 9 195136 39524 44694 policy_upda… 2018-11-22
## 10 195136 39524 44694 client_upda… NA
## 11 195136 39524 44694 transaction… NA
data %>%
filter(!is.na(customer_id)) %>%
group_by(customer_id) %>%
summarize(records = n()) %>%
top_n(5, records)
## # A tibble: 8 x 2
## customer_id records
## <int> <int>
## 1 172600 11
## 2 172606 11
## 3 172610 12
## 4 172618 11
## 5 172620 11
## 6 174756 11
## 7 174759 11
## 8 188790 11
What is happening with one of the champions?
data %>%
filter(customer_id == 172600) %>%
mutate_if(is.POSIXt, as.Date) %>%
select_if(map_lgl(., is.Date) | names(.) %in% c(
"customer_id",
"policy_transaction_id",
"customer_policy_id"))
## # A tibble: 11 x 14
## customer_id registration_da… client_created_… client_updated_…
## <int> <date> <date> <date>
## 1 172600 2018-02-23 2018-02-24 NA
## 2 172600 2018-02-23 2018-02-24 NA
## 3 172600 2018-02-23 2018-02-24 NA
## 4 172600 2018-02-23 2018-02-24 NA
## 5 172600 2018-02-23 2018-02-24 NA
## 6 172600 2018-02-23 2018-02-24 NA
## 7 172600 2018-02-23 2018-02-24 NA
## 8 172600 2018-02-23 2018-02-24 NA
## 9 172600 2018-02-23 2018-02-24 NA
## 10 172600 2018-02-23 2018-02-24 NA
## 11 172600 2018-02-23 2018-02-24 NA
## # … with 10 more variables: customer_policy_id <int>, date_issued <date>,
## # policy_created_at <date>, policy_updated_at <date>,
## # date_planted <date>, date_priced <date>, policy_transaction_id <int>,
## # transaction_date <date>, transaction_created_at <date>,
## # transaction_updated_at <date>
… showing only the columns with differences.
data %>%
filter(customer_id == 172600) %>%
mutate_if(is.POSIXt, as.Date) %>%
select_if(~length(unique(.)) > 1L)
## # A tibble: 11 x 6
## policy_transact… transaction_date transaction_amo… receipt_number
## <int> <date> <dbl> <chr>
## 1 14018 2018-03-22 1 GH0009534
## 2 15791 2018-03-28 1 GH0011167
## 3 6765 2018-02-23 2 GH0006154
## 4 10068 2018-03-05 1 GH0007959
## 5 11000 2018-03-12 1 GH0007949
## 6 34249 2018-05-15 7 GH0023440
## 7 40775 2018-06-04 2 GH0016693
## 8 42951 2018-06-08 1 GH0329601
## 9 40335 2018-05-28 2 GH0023341
## 10 49176 2018-07-09 3 GH0321626
## 11 49177 2018-07-13 1 GH0328640
## # … with 2 more variables: transaction_created_at <date>,
## # transaction_updated_at <date>
Based on the provided documentation, we can work with transaction_date, so created_at and updated_at could be dropped.
So each customer could have only one policy, right?
data %>%
select(customer_id, customer_policy_id) %>%
distinct() %>%
group_by(customer_id) %>%
summarize(policies = n()) %>%
top_n(5, policies)
## # A tibble: 9 x 2
## customer_id policies
## <int> <int>
## 1 117 5
## 2 157 5
## 3 158 5
## 4 493 5
## 5 703 6
## 6 1022 5
## 7 1030 5
## 8 189089 5
## 9 194705 5
Wrong: there are up to six policies per client in our set.
Transactions?
data %>%
# filter(customer_id == 172600) %>%
select(customer_id, customer_policy_id, policy_transaction_id) %>%
distinct() %>%
group_by(customer_id, customer_policy_id) %>%
summarize(transactions = n()) %>%
ungroup() %>%
top_n(5, transactions) %>%
arrange(desc(transactions))
## # A tibble: 5 x 3
## customer_id customer_policy_id transactions
## <int> <int> <int>
## 1 172610 6907 12
## 2 172600 6894 11
## 3 172606 6900 11
## 4 172620 6917 11
## 5 174756 9357 11
Up to twelve payments from a client for a single policy.
Time range for one policy.
data %>%
filter(customer_id == 172600) %>%
select(registration_date, customer_policy_id, date_issued, policy_transaction_id, transaction_date, transaction_amount) %>%
arrange(date_issued, transaction_date)
## # A tibble: 11 x 6
## registration_da… customer_policy… date_issued policy_transact…
## <date> <int> <date> <int>
## 1 2018-02-23 6894 2018-02-23 6765
## 2 2018-02-23 6894 2018-02-23 10068
## 3 2018-02-23 6894 2018-02-23 11000
## 4 2018-02-23 6894 2018-02-23 14018
## 5 2018-02-23 6894 2018-02-23 15791
## 6 2018-02-23 6894 2018-02-23 34249
## 7 2018-02-23 6894 2018-02-23 40335
## 8 2018-02-23 6894 2018-02-23 40775
## 9 2018-02-23 6894 2018-02-23 42951
## 10 2018-02-23 6894 2018-02-23 49176
## 11 2018-02-23 6894 2018-02-23 49177
## # … with 2 more variables: transaction_date <date>,
## # transaction_amount <dbl>
Time range for multiple policies.
data %>%
filter(customer_id == 117) %>%
select(registration_date, customer_policy_id, date_issued, policy_transaction_id, transaction_date, transaction_amount) %>%
arrange(date_issued, transaction_date)
## # A tibble: 6 x 6
## registration_da… customer_policy… date_issued policy_transact…
## <date> <int> <date> <int>
## 1 2016-11-09 20936 2016-05-25 22958
## 2 2016-11-09 16330 2017-01-10 17953
## 3 2016-11-09 7345 2018-02-26 7245
## 4 2016-11-09 48830 2018-12-11 57391
## 5 2016-11-09 49539 2019-02-05 58110
## 6 2016-11-09 49539 2019-02-05 58724
## # … with 2 more variables: transaction_date <date>,
## # transaction_amount <dbl>
Customer 117 is a loyal one!
Are there any policies covering more than one season/year?
policy_years <- data %>%
mutate(transaction_year = year(transaction_date)) %>%
mutate(season_year = as.integer(str_extract(season, "^\\d{4}")),
season_type = str_remove(season, "^\\d{4} ")) %>%
select(customer_policy_id, ends_with("year"), starts_with("season")) %>%
distinct() %>%
glimpse()
## Observations: 24,310
## Variables: 5
## $ customer_policy_id <int> NA, 24710, 25637, 48000, 48383, 24712, 48005,…
## $ transaction_year <dbl> NA, 2018, 2018, 2018, 2018, 2018, 2018, 2018,…
## $ season_year <int> NA, 2018, 2018, 2019, 2019, 2018, 2019, 2019,…
## $ season <chr> NA, "2018 major", "2018 major", "2019 major",…
## $ season_type <chr> NA, "major", "major", "major", "major", "majo…
policy_years %>%
group_by(customer_policy_id) %>%
summarise(trans_years = length(unique(transaction_year))) %>%
arrange(desc(trans_years)) %>%
head()
## # A tibble: 6 x 2
## customer_policy_id trans_years
## <int> <int>
## 1 17871 2
## 2 17872 2
## 3 17873 2
## 4 17879 2
## 5 17880 2
## 6 17881 2
policy_years %>%
group_by(customer_policy_id) %>%
summarise(season_years = length(unique(season_year))) %>%
arrange(desc(season_years)) %>%
head()
## # A tibble: 6 x 2
## customer_policy_id season_years
## <int> <int>
## 1 4725 1
## 2 4726 1
## 3 4727 1
## 4 4728 1
## 5 4729 1
## 6 4730 1
policy_years %>%
group_by(customer_policy_id) %>%
summarise(seasons = length(unique(season))) %>%
arrange(desc(seasons)) %>%
head()
## # A tibble: 6 x 2
## customer_policy_id seasons
## <int> <int>
## 1 4725 1
## 2 4726 1
## 3 4727 1
## 4 4728 1
## 5 4729 1
## 6 4730 1
policy_years %>%
group_by(customer_policy_id) %>%
summarise(season_types = length(unique(season_type))) %>%
arrange(desc(season_types)) %>%
head()
## # A tibble: 6 x 2
## customer_policy_id season_types
## <int> <int>
## 1 4725 1
## 2 4726 1
## 3 4727 1
## 4 4728 1
## 5 4729 1
## 6 4730 1
OK, it holds: one policy per season :)
Let’s check the relations between the various date/time columns.
ds$community_payouts %>%
ggplot(aes(paid_date, created_at)) +
geom_point(alpha = .3) +
ggtitle("Community payouts")
ds$customer_policies %>%
ggplot(aes(date_issued, created_at)) +
geom_point(alpha = .3) +
ggtitle("Customer policies")
ds$customers %>%
ggplot(aes(registration_date, created_at)) +
geom_point(alpha = .3) +
ggtitle("Customers")
## Warning: Removed 3 rows containing missing values (geom_point).
ds$customers %>%
ggplot(aes(updated_at, created_at)) +
geom_point(alpha = .3) +
ggtitle("Customers updated")
## Warning: Removed 10143 rows containing missing values (geom_point).
ds$policy_transactions %>%
ggplot(aes(transaction_date, created_at)) +
geom_point(alpha = .3) +
ggtitle("Transactions")
All the created_at-like columns look purely technical, so we can remove them. Let’s update the dataset with the latest findings.
data <- data %>%
mutate(season_year = as.integer(str_extract(season, "^\\d{4}")),
season_type = str_remove(season, "^\\d{4} ")) %>%
select(
-country,
-iso_code,
-policy_created_at,
-client_created_at,
-transaction_created_at
)
data %>%
select(customer_policy_id, season_type, date_issued) %>%
distinct() %>%
ggplot(aes(date_issued, fill = season_type)) +
geom_density(alpha = .3) +
ggtitle("Do major and minor seasons not overlap across country?")
## Warning: Removed 1 rows containing non-finite values (stat_density).
In the whole sample the major and minor seasons overlap; probably they vary across crops.
data %>%
select(crop, customer_policy_id, season_type, date_issued) %>%
filter(!is.na(crop)) %>%
distinct() %>%
ggplot(aes(date_issued, fill = season_type)) +
geom_density(alpha = .3) +
facet_wrap(~crop, ncol = 1L, scales = "free_y") +
labs(x = NULL, y = NULL,
title = "Minor and major seasons for crops")
Maybe regions?
data %>%
select(region, customer_policy_id, season_type, date_issued) %>%
distinct() %>%
ggplot(aes(date_issued, fill = season_type)) +
geom_density(alpha = .3) +
facet_wrap(~region, ncol = 1L, scales = "free_y") +
labs(x = NULL, y = NULL,
title = "Can minor and major seasons overlap inside of a region?")
## Warning: Removed 5 rows containing non-finite values (stat_density).
In GH-TV people are watching wide-screen TV. Right, there were only 4 payments there. But even at the level of a region, major and minor seasons can overlap. Let’s take one region with both types of season and split it by crop.
data %>%
filter(region == "GH-BA",
!is.na(crop)) %>%
select(crop, customer_policy_id, season_type, date_issued) %>%
distinct() %>%
ggplot(aes(date_issued, fill = season_type)) +
geom_density(alpha = .3) +
facet_wrap(~crop, ncol = 1L, scales = "free_y") +
labs(title = "GH-BA by crop", x = NULL, y = NULL)
… still overlapping. Let’s go down to the district level!
data %>%
filter(region == "GH-BA",
!is.na(crop)) %>%
select(crop, district, customer_policy_id, season_type, date_issued) %>%
distinct() %>%
ggplot(aes(date_issued, fill = season_type)) +
geom_density(alpha = .3) +
facet_wrap(~district, ncol = 1L, scales = "free_y") +
labs(title = "GH-BA", x = NULL, y = NULL)
data %>%
filter(district %in% c("Tano North", "Sunyani West"),
!is.na(crop)) %>%
select(crop, district, customer_policy_id, season_type, date_issued) %>%
distinct() %>%
ggplot(aes(date_issued, fill = season_type)) +
geom_density(alpha = .3) +
facet_wrap(~district + crop, ncol = 1L, scales = "free_y") +
labs(title = "GH-BA districts by crop", x = NULL, y = NULL)
data %>%
filter(district == "Sunyani West") %>%
select(community_name, customer_policy_id, season_type, date_issued) %>%
distinct() %>%
ggplot(aes(date_issued, fill = season_type)) +
geom_density(alpha = .3) +
facet_wrap(~community_name, ncol = 1L, scales = "free_y") +
labs(title = "GH-BA, Sunyani West communities", x = NULL, y = NULL)
## Warning: Removed 2 rows containing non-finite values (stat_density).
## Warning: Groups with fewer than two data points have been dropped.
OK, at the level of communities the season types are stable. So to map seasons to the calendar properly we would need a community-level scheme. Drop it!
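(If such a scheme were ever needed, a rough sketch from the observed issue dates could look like this; the season_calendar name is ours.)
# Hypothetical community-level calendar: observed issue-date range per season type
season_calendar <- data %>%
  filter(!is.na(date_issued), !is.na(season_type)) %>%
  group_by(community_id, season_type) %>%
  summarise(first_issued = min(date_issued),
            last_issued = max(date_issued)) %>%
  ungroup()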
Spatial distribution of communities.
ds$communities %>%
filter(!is.na(latitude)) %>%
ggplot(aes(latitude, longitude)) +
geom_point() +
coord_map()
Garbage happens: probably some mistakes in the data. Applying some filtering…
ds$communities %>%
filter(!is.na(latitude)) %>%
filter(latitude < 9) %>%
ggplot(aes(latitude, longitude)) +
geom_point() +
coord_map() +
ggtitle("It doesn't look like Ghana",
"And didn't I messed axises up?")
OK, we’ve got a Ghana bounding box from OpenStreetMap.
ghana <- c(left = -3.2, top = 11.5, right = 1.4, bottom = 4.7)
ghana_back <- get_stamenmap(ghana, zoom = 8, maptype = "terrain-background")
# toner-lite or toner-background are also OK
ghana_back %>%
ggmap() +
geom_point(
aes(longitude, latitude),
data = ds$communities %>% filter(!is.na(longitude)),
alpha = .2, color = "purple") +
theme_void()
## Warning: Removed 20 rows containing missing values (geom_point).
Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
ghana_north_bbox <- c(left = -3.2, top = 11.5, right = 1.4, bottom = 6.8)
ghana_north <- get_stamenmap(ghana_north_bbox,
zoom = 9,
maptype = "toner-background")
# toner-lite or toner-background are also OK
ghana_north_small <- get_stamenmap(ghana_north_bbox,
zoom = 8,
maptype = "toner-background")
saveRDS(ghana_north, file.path("retentionapp", "data", "map_ghana_north.rds"))
data %>%
select(customer_id,
longitude,
latitude) %>%
filter(!is.na(latitude),
!is.na(longitude),
!is.na(customer_id)) %>%
distinct() %>%
{ghana_north %>%
ggmap() +
stat_density_2d(
aes(longitude, latitude, fill = ..level..), geom = "polygon", alpha = .3,
data = .) +
theme_void() +
guides(fill = FALSE) +
ggtitle("Density of customers' locations")
} %>% print
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
data %>%
select(customer_id,
longitude,
latitude,
season) %>%
na.omit() %>%
distinct() %>%
{ghana_north_small %>%
ggmap() +
stat_density_2d(
aes(longitude, latitude, fill = ..level..), geom = "polygon", alpha = .3,
data = .) +
theme_void() +
guides(fill = FALSE) +
ggtitle("Density of customers' locations by seasons") +
facet_wrap(~season)
} %>% print
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
data %>%
select(customer_id,
longitude,
latitude,
crop) %>%
na.omit() %>%
distinct() %>%
{ghana_north %>%
ggmap() +
# stat_density_2d(
# aes(longitude, latitude, fill = ..level..), geom = "polygon", alpha = .3,
# data = .,) +
geom_density_2d(aes(longitude, latitude, color = crop), data = .) +
theme_void() +
guides(fill = FALSE) +
ggtitle("Density of customers' locations by crops")
} %>% print
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
From the project description we have two slightly different definitions of retention:
“we think of a ‘retained’ customer as a customer who purchased drought insurance in multiple cropping seasons.”
“retention rate: percentage of prior season customers who purchase insurance again in a subsequent season.”
The first variant with multiple seasons is easy. The complication in the second one is that it is not straightforward to define a “subsequent” season, since both minor and major seasons are possible depending on community and crop. Additionally, we have only four years of data, including incomplete ones. Let’s stick to the first definition for now.
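Under the first definition a rough overall estimate is straightforward. A minimal sketch (it ignores the major/minor subtleties discussed above):
# Share of customers who bought policies in more than one season (definition 1)
data %>%
  filter(!is.na(customer_id), !is.na(season)) %>%
  distinct(customer_id, season) %>%
  group_by(customer_id) %>%
  summarise(seasons = n()) %>%
  summarise(customers = n(), retained_share = mean(seasons > 1))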
We assume a policy is issued around the beginning of a season.
data %>%
select(customer_policy_id, date_issued) %>%
na.omit() %>%
distinct() %>%
ggplot(aes(customer_policy_id, date_issued)) +
geom_point(alpha = .2) +
ggtitle("Policy ID can't be used as ordered substitution for date issued")
Cohort analysis, as Google Analytics calls it: by month of first policy.
data %>%
select(
customer_id,
customer_policy_id,
date_issued
) %>%
distinct() %>%
group_by(customer_id) %>%
mutate(
month_issued = ceiling_date(date_issued, 'month'),
queue = dense_rank(month_issued),
first_policy = min(month_issued)
) %>%
group_by(first_policy, queue) %>%
summarise(policies = n()) %>%
ungroup() %>%
spread(queue, policies)
## # A tibble: 40 x 7
## first_policy `1` `2` `3` `4` `5` `<NA>`
## <date> <int> <int> <int> <int> <int> <int>
## 1 2015-02-01 1 1 1 NA NA NA
## 2 2016-01-01 14 7 3 NA NA NA
## 3 2016-02-01 3 3 2 NA NA NA
## 4 2016-03-01 5 NA NA NA NA NA
## 5 2016-04-01 82 39 13 5 NA NA
## 6 2016-05-01 606 113 25 3 NA NA
## 7 2016-06-01 634 235 49 13 1 NA
## 8 2016-07-01 768 56 12 3 NA NA
## 9 2016-08-01 812 102 46 6 1 NA
## 10 2016-09-01 18 9 1 NA NA NA
## # … with 30 more rows
data %>%
select(
customer_id,
customer_policy_id,
date_issued
) %>%
distinct() %>%
group_by(customer_id) %>%
mutate(
lag_pol = lag(date_issued, order_by = date_issued)
) %>%
ungroup() %>%
filter(!is.na(lag_pol)) %>%
mutate(interlapse = difftime(date_issued, lag_pol, units = "days") %>%
as.integer()) %>%
group_by(customer_id) %>%
summarise(shortest_interlapse = min(interlapse, na.rm = TRUE)) %>%
filter(!is.na(shortest_interlapse)) %>%
ggplot(aes(shortest_interlapse)) + geom_density() +
ggtitle("What are shortest periods between policies for a client?")
Should we filter out specific policy statuses?
table(ds$customer_policies$status)
##
## pending active expired paid out
## 1632 210 19780 1579
## planted payout due payout requested triggered
## 156 678 58 6
## refunded dispute priced
## 45 49 2
Policy statuses vs transactions. There is only one transaction type, payment, so we don’t care about transaction_type.
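A quick way to double-check that claim against the raw table:
# Count rows per transaction type; we expect a single type, "payment"
ds$policy_transactions %>%
  count(transaction_type)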
ds$policy_transactions %>%
ggplot(aes(transaction_amount)) +
geom_density() +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing non-finite values (stat_density).
data %>%
select(customer_policy_id,
status,
transaction_amount) %>%
group_by(customer_policy_id, status) %>%
summarize(amount = sum(transaction_amount, na.rm = TRUE)) %>%
na.omit() %>%
ggplot(aes(status, amount)) +
geom_boxplot() +
# scale_y_log10() +
coord_flip(ylim = c(0, 100)) +
labs(y = "Total policy cost, GH₵",
title = "All policy statuses can have actual transactions")
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
This means we don’t care about policy statuses for now. But maybe we can drop policies with zero payments.
data %>%
select(customer_policy_id,
transaction_amount) %>%
group_by(customer_policy_id) %>%
summarise(zero_amount = sum(transaction_amount, na.rm = TRUE) == 0) %>%
group_by(zero_amount) %>%
summarise(policies = n())
## # A tibble: 2 x 2
## zero_amount policies
## <lgl> <int>
## 1 FALSE 24089
## 2 TRUE 107
0.4%: we don’t care. Let’s check a couple of case studies with very short interlapses.
data %>%
select(
customer_id,
customer_policy_id,
date_issued
) %>%
distinct() %>%
group_by(customer_id) %>%
mutate(
lag_pol = lag(date_issued, order_by = date_issued)
) %>%
ungroup() %>%
mutate(interlapse = difftime(date_issued, lag_pol, units = "days") %>%
as.integer()) %>%
group_by(customer_id) %>%
mutate(tinyinterlapse = any(interlapse < 1)) %>%
ungroup() %>%
filter(tinyinterlapse) %>%
select(-tinyinterlapse) %>%
arrange(customer_id, date_issued) %>%
head(10)
## # A tibble: 10 x 5
## customer_id customer_policy_id date_issued lag_pol interlapse
## <int> <int> <date> <date> <int>
## 1 164 18599 2018-04-06 NA NA
## 2 164 40712 2018-06-30 2018-04-06 85
## 3 164 48484 2018-11-20 2018-06-30 143
## 4 164 48492 2018-11-20 2018-11-20 0
## 5 252 21137 2016-04-28 NA NA
## 6 252 20961 2016-04-28 2016-04-28 0
## 7 300 20988 2016-05-11 NA NA
## 8 300 21187 2016-05-11 2016-05-11 0
## 9 300 16042 2016-12-28 2016-05-11 231
## 10 300 11956 2018-03-16 2016-12-28 443
Customer 164 got two policies on the same day.
data %>%
filter(customer_policy_id %in% c(48484, 48492)) %>%
select_if(~length(unique(.)) > 1)
## # A tibble: 2 x 5
## customer_policy… crop policy_transact… receipt_number
## <int> <fct> <int> <chr>
## 1 48484 Maize 57009 GH0034329
## 2 48492 Rice 57013 GH0034332
## # … with 1 more variable: transaction_updated_at <dttm>
One policy per crop: it makes sense, as one farmer can plant different crops. Let’s check a similar situation with a one-day delay.
data %>%
select(
customer_id,
customer_policy_id,
date_issued
) %>%
distinct() %>%
group_by(customer_id) %>%
mutate(
lag_pol = lag(date_issued, order_by = date_issued)
) %>%
ungroup() %>%
mutate(interlapse = difftime(date_issued, lag_pol, units = "days") %>%
as.integer()) %>%
group_by(customer_id) %>%
mutate(tinyinterlapse = any(interlapse == 1)) %>%
ungroup() %>%
filter(tinyinterlapse) %>%
select(-tinyinterlapse) %>%
arrange(customer_id, date_issued) %>%
head(10)
## # A tibble: 10 x 5
## customer_id customer_policy_id date_issued lag_pol interlapse
## <int> <int> <date> <date> <int>
## 1 171535 21083 2015-12-04 NA NA
## 2 171535 48953 2018-12-17 2015-12-04 1109
## 3 171535 48903 2018-12-18 2018-12-17 1
## 4 188474 40781 2018-05-15 NA NA
## 5 188474 40782 2018-05-16 2018-05-15 1
## 6 188819 33143 2018-05-28 NA NA
## 7 188819 37467 2018-05-29 2018-05-28 1
## 8 191273 34653 2018-05-30 NA NA
## 9 191273 35100 2018-05-31 2018-05-30 1
## 10 195478 40270 2018-06-18 NA NA
data %>%
filter(customer_policy_id %in% c(40781, 40782)) %>%
select_if(~length(unique(.)) > 1)
## # A tibble: 2 x 7
## customer_policy… date_issued policy_updated_at date_planted crop
## <int> <date> <dttm> <date> <fct>
## 1 40781 2018-05-15 2018-11-14 02:22:55 2018-06-14 Grou…
## 2 40782 2018-05-16 2018-11-14 02:22:55 2018-06-30 Maize
## # … with 2 more variables: policy_transaction_id <int>,
## # receipt_number <chr>
Are there any cases where we have close dates for different seasons?
data %>%
select(
customer_id,
customer_policy_id,
date_issued,
season,
crop
) %>%
distinct() %>%
group_by(customer_id) %>%
mutate(
lag_pol = lag(date_issued, order_by = date_issued),
lead_pol = lead(date_issued, order_by = date_issued),
) %>%
ungroup() %>%
mutate(interlapse = difftime(date_issued, lag_pol, units = "days") %>%
as.integer(),
interlapse_lead = difftime(lead_pol, date_issued, units = "days") %>%
as.integer()) %>%
group_by(customer_id) %>%
mutate(has_tinyinterlapse = any(interlapse <= 31)) %>%
ungroup() %>%
filter(has_tinyinterlapse) %>%
select(-has_tinyinterlapse) %>%
group_by(customer_id) %>%
filter((interlapse <= 31 | interlapse_lead <= 31) &
length(unique(season)) > 1 &
length(unique(crop)) == 1) %>%
ungroup() %T>%
{select(., customer_id) %>%
distinct() %>%
summarize(suspicious_policies = n()) %>%
print()} %>%
arrange(customer_id, date_issued) %>%
head(10)
## # A tibble: 1 x 1
## suspicious_policies
## <int>
## 1 6
## # A tibble: 10 x 9
## customer_id customer_policy… date_issued season crop lag_pol
## <int> <int> <date> <chr> <fct> <date>
## 1 203869 45507 2018-09-21 2018 … Maize NA
## 2 203869 47231 2018-09-21 2019 … Maize 2018-09-21
## 3 203988 45533 2018-09-27 2018 … Maize NA
## 4 203988 47848 2018-10-26 2019 … Maize 2018-09-27
## 5 204100 50255 2018-09-25 2019 … Maize NA
## 6 204100 45609 2018-09-25 2018 … Maize 2018-09-25
## 7 204846 46483 2018-10-11 2018 … Maize NA
## 8 204846 47399 2018-11-06 2019 … Maize 2018-10-11
## 9 205160 46973 2018-10-15 2018 … Maize NA
## 10 205160 50256 2018-10-15 2019 … Maize 2018-10-15
## # … with 3 more variables: lead_pol <date>, interlapse <int>,
## # interlapse_lead <int>
OK, in total we have only six clients who bought two policies for one crop with an interlapse of 31 days or less. We don’t care about such a small amount, but it would be better to clarify or clean them out.
Taking into account that a customer could buy two policies for two different crops in the same season, we can’t use a client’s unique policies to count unique purchases. A suitable approach is to count combinations of customer and season.
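A minimal sketch of the difference (using the combined data set assembled above):
# Unique purchases counted as distinct customer-season pairs, not policies
data %>%
  filter(!is.na(customer_id)) %>%
  summarise(policies = n_distinct(customer_policy_id),
            purchases = n_distinct(customer_id, season))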
We drop the distinction between major and minor seasons: there are not many minor-season policies, and taking them into account would complicate the approach.
table(data$season_year, data$season_type)
##
## major minor
## 2016 3333 0
## 2017 2710 0
## 2018 19253 1481
## 2019 2250 147
retention <- data %>%
select(
customer_id,
season_year) %>%
distinct() %>%
group_by(customer_id) %>%
mutate(
first_policy = min(season_year),
following = str_c("year_", season_year - first_policy)
) %>%
ungroup() %>%
filter(!is.na(first_policy),
!is.na(following)) %>%
group_by(first_policy, following) %>%
summarise(policies = n()) %>%
group_by(first_policy) %>%
mutate(prop = policies / max(policies, na.rm = TRUE)) %>%
ungroup() %>%
complete(first_policy, following, fill = list(policies = 0L, prop = 0))
retention %>%
select(-prop) %>%
spread(following, policies)
## # A tibble: 4 x 5
## first_policy year_0 year_1 year_2 year_3
## <dbl> <int> <int> <int> <int>
## 1 2016 2926 332 301 60
## 2 2017 2308 774 151 0
## 3 2018 14678 1312 0 0
## 4 2019 529 0 0 0
retention %>%
select(-policies) %>%
spread(following, prop)
## # A tibble: 4 x 5
## first_policy year_0 year_1 year_2 year_3
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016 1 0.113 0.103 0.0205
## 2 2017 1 0.335 0.0654 0
## 3 2018 1 0.0894 0 0
## 4 2019 1 0 0 0
retention %>%
# filter(following != "year_0") %>%
ggplot(aes(following, prop, fill = factor(first_policy, ordered = TRUE))) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_continuous(labels = scales::percent)
Now let’s have a look at the community payouts.
ds$community_payouts %>%
  sample_n(10)
## # A tibble: 10 x 10
## community_payou… community_id paid_date paid_by paid_to
## <int> <int> <date> <int> <int>
## 1 622 1730914 2019-04-17 1264 17631
## 2 311 1729400 2018-11-09 1262 13275
## 3 323 697344 2018-12-13 1262 17479
## 4 262 695511 2018-12-19 1262 11254
## 5 324 1729543 2018-12-13 1262 14055
## 6 21 695067 2018-11-07 1262 13254
## 7 446 696724 2018-11-26 1262 13243
## 8 565 1730653 2019-02-07 1264 16572
## 9 86 1729472 2018-11-15 1262 13353
## 10 362 697136 2018-12-18 1262 13805
## # … with 5 more variables: transaction_amount <dbl>,
## # confirmation_code <chr>, created_at <dttm>, status <fct>, season <chr>
What is paid_by? How many distinct values are there?
ds$community_payouts$paid_by %>% unique %>% length
## [1] 4
Is one of them a customer?
1262L %in% ds$customers$customer_id
## [1] TRUE
How many of them are not customers?
sum(!(unique(c(ds$community_payouts$paid_by, ds$community_payouts$paid_to)) %in%
ds$customers$customer_id))
## [1] 438
… and how many are in the customers table?
sum((unique(c(ds$community_payouts$paid_by, ds$community_payouts$paid_to)) %in%
ds$customers$customer_id))
## [1] 89
Even if paid_by/paid_to were customer_ids, we can’t use them for now.
We want to show the influence of various factors on the retention rate. The factors are: region and community, gender, literacy, farm size, phone and mobile money ownership, and previous payouts in the customer’s community.
Let’s build a set with count data for them.
payouts_years <- ds$community_payouts %>%
mutate(payout_season_year = as.integer(str_extract(season, "^\\d{4}"))) %>%
select(
community_id,
payout_season_year
) %>%
distinct() %>%
group_by(community_id) %>%
mutate(payout_first_year = min(payout_season_year)) %>%
ungroup() %>%
select(-payout_season_year)
retfactors <- data %>%
filter(!is.na(customer_id)) %>%
select(
customer_id,
community_id,
region,
gender,
literacy,
has_phone,
has_mobile_money,
season_year,
farm_size
) %>%
distinct() %>%
left_join(payouts_years, by = "community_id") %>%
group_by(customer_id) %>%
mutate(
first_policy = min(season_year),
retained_next_year = any(first_policy + 1 == season_year),
retained = max(season_year) - first_policy > 0
) %>%
ungroup() %>%
mutate(
big_farm = farm_size > 13,
literacy = forcats::fct_lump(literacy, prop = .05, other_level = "other"),
gender = forcats::fct_explicit_na(gender)
# , has_mobile_money = forcats::fct_explicit_na(has_mobile_money),
) %>%
group_by(
first_policy,
region,
community_id,
gender,
literacy,
big_farm
) %>%
summarize(
total_customers = n(),
retained_next_year = sum(retained_next_year, na.rm = TRUE),
retained = sum(retained, na.rm = TRUE)
) %>%
ungroup()
retfactors %>%
group_by(first_policy) %>%
summarize(
customers = sum(total_customers),
retained_next_year = sum(retained_next_year),
retained = sum(retained),
prop = retained_next_year / customers
)
## # A tibble: 4 x 5
## first_policy customers retained_next_year retained prop
## <int> <int> <int> <int> <dbl>
## 1 2016 3619 800 1223 0.221
## 2 2017 3233 1695 1703 0.524
## 3 2018 15990 2624 2624 0.164
## 4 2019 529 0 0 0
Houston, we’ve had a problem here: the numbers don’t match. Let’s simplify.
data %>%
filter(!is.na(customer_id)) %>%
select(
customer_id,
season_year
) %>%
distinct() %>%
group_by(customer_id) %>%
summarize(
first_policy = min(season_year),
retained_next_year = any(first_policy + 1 == season_year),
retained = max(season_year) - first_policy > 0
) %>%
group_by(
first_policy
) %>%
summarize(
total_customers = n(), # length(unique(customer_id)),
retained_next_year = sum(retained_next_year, na.rm = TRUE),
retained = sum(retained, na.rm = TRUE)
) %>%
ungroup()
## # A tibble: 4 x 4
## first_policy total_customers retained_next_year retained
## <int> <int> <int> <int>
## 1 2016 2926 332 530
## 2 2017 2308 774 778
## 3 2018 14678 1312 1312
## 4 2019 529 0 0
OK, the problem was duplicated customer records. So let’s first measure individual customers and add their properties afterwards.
customer_retention <- data %>%
filter(!is.na(customer_id)) %>%
select(
customer_id,
season_year
) %>%
distinct() %>%
group_by(customer_id) %>%
summarize(
first_policy = min(season_year),
retained_next_year = any(first_policy + 1 == season_year),
retained_after_1year = max(season_year) - first_policy > 1
)
retfactors <- data %>%
filter(!is.na(customer_id)) %>%
select(
customer_id,
community_id,
longitude,
latitude,
region,
gender,
literacy,
farm_size,
has_mobile_money,
has_phone
) %>%
distinct() %>%
left_join(payouts_years, by = "community_id") %>%
mutate(
farm_size = if_else(farm_size < 13, "12 acres or less",
if_else(farm_size > 13, "14 acres or more", "13 acres")),
literacy = forcats::fct_lump(literacy, prop = .05, other_level = "other"),
gender = forcats::fct_explicit_na(gender)
# , has_mobile_money = forcats::fct_explicit_na(has_mobile_money)
)
customer_retention_factors <-
customer_retention %>%
left_join(retfactors, by = "customer_id") %>%
mutate(
community_payouts_before = if_else(
is.na(payout_first_year),
FALSE,
if_else(payout_first_year <= first_policy, TRUE, FALSE)))
saveRDS(customer_retention_factors, file.path("retentionapp", "data", "customer_retention.rds"))
One more check.
customer_retention_factors %>%
group_by(first_policy) %>%
summarize(
customers = n(),
next_year = sum(retained_next_year),
year2plus = sum(retained_after_1year)
)
## # A tibble: 4 x 4
## first_policy customers next_year year2plus
## <int> <int> <int> <int>
## 1 2016 2926 332 313
## 2 2017 2308 774 151
## 3 2018 14678 1312 0
## 4 2019 529 0 0