欺诈分析

分析这四个渠道:Affle、Mobvista、Glispa、Batmobi

Affle

1.Affle没有任何点击行为:

# Preview the first rows of the first Affle table (the report's "no click" group).
Affle[[1]] %>% head()
##   click_time.x activity_kind             network_name
## 1   1527447336         event                    Affle
## 2   1527454021         event                    Affle
## 3   1527437658       session Adwords Display Installs
## 4   1527216072       session                    Affle
## 5   1527612572 reattribution Adwords Display Installs
## 6   1527361784       session                    Affle
##                                   adid installed_at install_time
## 1 adf14c41-bcac-4971-a0b0-c8b295ef9b10   1527447343   1527447336
## 2 3b086480-1c53-4d4e-bf7e-233c29394eef   1527454077   1527454021
## 3 749f42ec-41c0-4f3f-b990-48c4fafa2f8f   1527157096   1527157073
## 4 4ed9d343-ba83-4856-92f2-f71f3849421a   1527216138   1527216072
## 5 03302132-27d8-4776-9178-f530d6f246a2   1527154556   1527154551
## 6 2cb187f4-c2b3-4536-9d89-7c45cc1fcaad   1527361787   1527361784

这一部分数据的量有:

# Number of distinct device ids (adid) in the first Affle table.
Affle[[1]]$adid %>%
  unique() %>%
  length()
## [1] 2104

2.Affle下载与最后一次点击间隔为0:

# Preview Affle rows where install_time equals click_time1 (zero-second gap).
Affle[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  head()
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 00014760-951a-… 2018-05-25 09:48:15 2018-05-25 09:48:15 click   Affle   
## 2 000f17e6-db13-… 2018-05-25 11:17:32 2018-05-25 11:17:32 click   Affle   
## 3 000f17e6-db13-… 2018-05-25 11:17:32 2018-05-25 11:17:32 click   Affle   
## 4 000f17e6-db13-… 2018-05-25 11:17:32 2018-05-25 11:17:32 click   Solo    
## 5 000f17e6-db13-… 2018-05-25 11:17:32 2018-05-25 11:17:32 click   Solo    
## 6 0015e82d-b646-… 2018-05-27 02:58:27 2018-05-27 02:58:27 click   Affle   
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Affle device ids with a zero-second install/click gap.
Affle[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 4107

3.Affle下载渠道与最后一次点击渠道不一致

# Preview Affle rows whose click network (network1) is not Affle.
Affle[[2]] %>%
  filter(network1 != 'Affle') %>%
  head()
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 000f17e6-db13-… 2018-05-25 11:17:32 2018-05-25 11:17:32 click   Solo    
## 2 000f17e6-db13-… 2018-05-25 11:17:32 2018-05-25 11:17:32 click   Solo    
## 3 002f8ca5-0e89-… 2018-05-29 08:54:39 2018-05-29 08:54:38 click   Glispa  
## 4 00679b72-11bc-… 2018-05-27 10:48:31 2018-05-27 10:48:30 click   Glispa  
## 5 007be953-8422-… 2018-05-29 10:34:11 2018-05-29 10:33:58 click   Glispa  
## 6 00cb2fcf-cdda-… 2018-05-27 10:42:19 2018-05-27 10:42:17 click   Glispa  
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Affle device ids whose click network (network1) is not Affle.
Affle[[2]] %>%
  filter(network1 != 'Affle') %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 856

所以,可能有问题的占比为:

# Share of possibly fraudulent Affle installs:
#   (channel-mismatch + zero-gap + no-click distinct devices) /
#   (click-table devices + zero-gap devices)
# Fix: the numerator previously added the zero-gap count twice and left out the
# no-click count (Affle[[1]]); it now matches the Glispa/Batmobi formula.
# NOTE(review): the printed value below predates this fix; re-knit to refresh it.
# NOTE(review): the three groups can overlap (a zero-gap row may also have a
# foreign network1), and the denominator adds a subset of the click table to the
# click table itself. Confirm the intended base.
local({
  clicks <- Affle[[2]]
  zero_gap <- clicks %>% filter(install_time - click_time1 == 0)
  off_channel <- clicks %>% filter(network1 != 'Affle')
  flagged <- length(unique(off_channel$adid)) +
    length(unique(zero_gap$adid)) +
    length(unique(Affle[[1]]$adid))
  flagged / (length(unique(clicks$adid)) + length(unique(zero_gap$adid)))
})
## [1] 0.9980194

Mobvista

1.Mobvista没有任何点击行为:

# Preview the first rows of the first Mobvista table (the report's "no click" group).
Mobvista[[1]] %>% head()
##   click_time.x activity_kind         network_name
## 1   1528787069       session             Mobvista
## 2   1528431337       session             Mobvista
## 3   1527236182         event Adwords UAC Installs
## 4   1528675020       session             Mobvista
## 5   1528403614       session             Mobvista
## 6   1528620450       session             Mobvista
##                                   adid installed_at install_time
## 1 c7a9fbea-3bc0-4855-adf9-dcb32094ac82   1528787110   1528787069
## 2 02b4e120-1677-4130-8434-ced4086e3a23   1528431369   1528431337
## 3 5428b746-f11f-4732-87b4-d8f928f99e09   1528617610   1528617457
## 4 a8c8042e-0b88-49c8-b65e-4d2a447e0a39   1528821641   1528675020
## 5 45d160c3-2163-4083-9d10-a36dd25f5698   1528403718   1528403614
## 6 060ca671-d7a9-470c-80d6-2038271414f8   1528620490   1528620450

这一部分数据的量有:

# Number of distinct device ids (adid) in the first Mobvista table.
Mobvista[[1]]$adid %>%
  unique() %>%
  length()
## [1] 1262

2.Mobvista下载与最后一次点击间隔为0:

# Preview Mobvista rows where install_time equals click_time1 (zero-second gap).
Mobvista[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  head()
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 0000f255-af3b-… 2018-06-01 00:58:44 2018-06-01 00:58:44 click   Mobvista
## 2 00010b6d-41f9-… 2018-06-22 00:34:50 2018-06-22 00:34:50 click   Mobvista
## 3 00016e4d-4644-… 2018-06-23 00:08:05 2018-06-23 00:08:05 click   Mobvista
## 4 000248c4-3177-… 2018-06-22 00:34:04 2018-06-22 00:34:04 click   Mobvista
## 5 00036248-e38d-… 2018-06-21 01:18:36 2018-06-21 01:18:36 click   Mobvista
## 6 00039430-7788-… 2018-06-20 01:10:03 2018-06-20 01:10:03 click   Mobvista
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Mobvista device ids with a zero-second install/click gap.
Mobvista[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 58246

3.Mobvista下载渠道与最后一次点击渠道不一致

# Preview Mobvista rows whose click network (network1) is not Mobvista.
# Fix: the filter compared against 'Affle' (copy-paste from the Affle section),
# so it listed every non-Affle row instead of the channel mismatches.
# NOTE(review): the printed rows below came from the old filter; re-knit to refresh.
Mobvista[[2]] %>%
  filter(network1 != 'Mobvista') %>%
  head()
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 0000f255-af3b-… 2018-06-01 00:58:44 2018-06-01 00:58:44 click   Mobvista
## 2 00010b6d-41f9-… 2018-06-22 00:34:50 2018-06-22 00:34:50 click   Mobvista
## 3 00016e4d-4644-… 2018-06-23 00:08:05 2018-06-23 00:08:05 click   Mobvista
## 4 000248c4-3177-… 2018-06-22 00:34:04 2018-06-22 00:34:04 click   Mobvista
## 5 00036248-e38d-… 2018-06-21 01:18:36 2018-06-21 01:18:36 click   Mobvista
## 6 00039430-7788-… 2018-06-20 01:10:03 2018-06-20 01:10:03 click   Mobvista
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Mobvista device ids whose click network (network1) is not Mobvista.
Mobvista[[2]] %>%
  filter(network1 != 'Mobvista') %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 299

所以,可能有问题的占比为:

# Share of possibly fraudulent Mobvista installs:
#   (channel-mismatch + zero-gap + no-click distinct devices) /
#   (click-table devices + zero-gap devices)
# Fix: the numerator previously added the zero-gap count twice and left out the
# no-click count (Mobvista[[1]]); it now matches the Glispa/Batmobi formula.
# NOTE(review): the printed value below predates this fix; re-knit to refresh it.
# NOTE(review): the three groups can overlap, and the denominator adds a subset
# of the click table to the click table itself. Confirm the intended base.
local({
  clicks <- Mobvista[[2]]
  zero_gap <- clicks %>% filter(install_time - click_time1 == 0)
  off_channel <- clicks %>% filter(network1 != 'Mobvista')
  flagged <- length(unique(off_channel$adid)) +
    length(unique(zero_gap$adid)) +
    length(unique(Mobvista[[1]]$adid))
  flagged / (length(unique(clicks$adid)) + length(unique(zero_gap$adid)))
})
## [1] 0.9976424

Glispa

1.Glispa没有任何点击行为:

# Preview the first rows of the first Glispa table (the report's "no click" group).
Glispa[[1]] %>% head()
##   click_time.x activity_kind network_name
## 1   1530094926         event       Glispa
## 2   1528938534         event       Glispa
## 3   1529027316         event       Glispa
## 4   1529111285         event       Glispa
## 5   1528938534       session       Glispa
## 6   1528276402         event       Glispa
##                                   adid installed_at install_time
## 1 e7bf8d54-c60c-4713-9bc1-56f17e276ca0   1530095169   1530094926
## 2 b4a3a846-4dcd-48b9-8f25-2fed4ddc8985   1528938543   1528938534
## 3 381f643e-ff52-4707-93c8-f91ea00e1372   1529027366   1529027316
## 4 c700cf29-178d-4ba6-a415-f01f5c0dc514   1529112465   1529111285
## 5 b4a3a846-4dcd-48b9-8f25-2fed4ddc8985   1528938543   1528938534
## 6 625a6fc9-004f-4f4f-ba17-192134088874   1528277967   1528276402

这一部分数据的量有:

# Number of distinct device ids (adid) in the first Glispa table.
Glispa[[1]]$adid %>%
  unique() %>%
  length()
## [1] 7

2.Glispa下载与最后一次点击间隔为0:

# Preview Glispa rows where install_time equals click_time1 (zero-second gap).
Glispa[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  head()
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 000029d0-8b0e-… 2018-06-08 16:08:07 2018-06-08 16:08:07 click   Glispa  
## 2 00066fe7-34ae-… 2018-06-12 09:18:14 2018-06-12 09:18:14 click   Glispa  
## 3 000f4070-7174-… 2018-06-01 11:39:37 2018-06-01 11:39:37 click   Glispa  
## 4 0011b09d-aad5-… 2018-06-05 08:42:42 2018-06-05 08:42:42 click   Glispa  
## 5 001248d6-800e-… 2018-06-03 14:31:15 2018-06-03 14:31:15 click   Glispa  
## 6 0014620d-708a-… 2018-06-04 08:42:43 2018-06-04 08:42:43 click   Glispa  
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Glispa device ids with a zero-second install/click gap.
Glispa[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 32534

3.Glispa下载渠道与最后一次点击渠道不一致

# Preview Glispa rows whose click network (network1) is not Glispa.
# Fix: the filter compared against 'Affle' (copy-paste from the Affle section),
# so it listed every non-Affle row instead of the channel mismatches.
# NOTE(review): the printed rows below came from the old filter; re-knit to refresh.
Glispa[[2]] %>%
  filter(network1 != 'Glispa') %>%
  head()
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 000029d0-8b0e-… 2018-06-08 16:08:07 2018-06-08 16:08:07 click   Glispa  
## 2 00066fe7-34ae-… 2018-06-12 09:18:14 2018-06-12 09:18:14 click   Glispa  
## 3 000f4070-7174-… 2018-06-01 11:39:37 2018-06-01 11:39:37 click   Glispa  
## 4 0011b09d-aad5-… 2018-06-05 08:42:42 2018-06-05 08:42:42 click   Glispa  
## 5 001248d6-800e-… 2018-06-03 14:31:15 2018-06-03 14:31:15 click   Glispa  
## 6 0014620d-708a-… 2018-06-04 08:42:43 2018-06-04 08:42:43 click   Glispa  
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Glispa device ids whose click network (network1) is not Glispa.
Glispa[[2]] %>%
  filter(network1 != 'Glispa') %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 339

所以,可能有问题的占比为:

# Share of possibly fraudulent Glispa installs:
#   (channel-mismatch + zero-gap + no-click distinct devices) /
#   (click-table devices + zero-gap devices)
# NOTE(review): the three groups can overlap, and the denominator adds a subset
# of the click table to the click table itself. Confirm the intended base.
local({
  clicks <- Glispa[[2]]
  zero_gap <- clicks %>% filter(install_time - click_time1 == 0)
  off_channel <- clicks %>% filter(network1 != 'Glispa')
  flagged <- length(unique(off_channel$adid)) +
    length(unique(zero_gap$adid)) +
    length(unique(Glispa[[1]]$adid))
  flagged / (length(unique(clicks$adid)) + length(unique(zero_gap$adid)))
})
## [1] 0.5052865

Batmobi

1.Batmobi没有任何点击行为:

# Preview the first rows of the first Batmobi table (the report's "no click" group).
Batmobi[[1]] %>% head()
##   click_time.x activity_kind network_name
## 1   1529424882       session      Batmobi
## 2   1529253610         event      Batmobi
## 3   1529601558       session      Batmobi
## 4   1528794780         event      Batmobi
## 5   1529255115       session      Batmobi
## 6   1529774271         event      Batmobi
##                                   adid installed_at install_time
## 1 cbb7df24-6346-46dd-8790-b74b8937884b   1529424886   1529424882
## 2 fb6a20fc-0a5f-4eea-8301-8d6e93599b53   1529294092   1529253610
## 3 254b5d00-cff4-49c9-9b4a-367769aa8edd   1529601686   1529601558
## 4 51a21df2-ed22-4452-9817-13ba01d0a09e   1528795143   1528794780
## 5 bb2236f2-ec31-441d-a0cc-1e2cace7ba76   1529255144   1529255115
## 6 33171714-6db8-4279-8b98-148027f2c673   1529774378   1529774271

这一部分数据的量有:

# Number of distinct device ids (adid) in the first Batmobi table.
Batmobi[[1]]$adid %>%
  unique() %>%
  length()
## [1] 479

2.Batmobi下载与最后一次点击间隔为0:

# Preview Batmobi rows where install_time equals click_time1 (zero-second gap).
Batmobi[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  head()
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 00093036-cd9b-… 2018-06-16 00:39:45 2018-06-16 00:39:45 click   Batmobi 
## 2 000b2fc9-c221-… 2018-06-24 00:26:12 2018-06-24 00:26:12 click   Batmobi 
## 3 001169e6-9753-… 2018-06-20 01:02:23 2018-06-20 01:02:23 click   Batmobi 
## 4 001b6aad-a34c-… 2018-06-20 00:52:55 2018-06-20 00:52:55 click   Batmobi 
## 5 00229c86-abcb-… 2018-06-28 02:29:10 2018-06-28 02:29:10 click   Batmobi 
## 6 003526bf-f96e-… 2018-06-22 00:49:55 2018-06-22 00:49:55 click   Batmobi 
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Batmobi device ids with a zero-second install/click gap.
Batmobi[[2]] %>%
  filter(install_time - click_time1 == 0) %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 8935

3.Batmobi下载渠道与最后一次点击渠道不一致

# Preview Batmobi rows whose click network (network1) is not Batmobi.
# Fix: the filter compared against 'Affle' (copy-paste from the Affle section),
# so it listed every non-Affle row instead of the channel mismatches.
# NOTE(review): the printed rows below came from the old filter; re-knit to refresh.
Batmobi[[2]] %>%
  filter(network1 != 'Batmobi') %>%
  head()
## # A tibble: 6 x 17
##   adid            install_time        click_time1         action1 network1
##   <chr>           <dttm>              <dttm>              <chr>   <chr>   
## 1 00093036-cd9b-… 2018-06-16 00:39:45 2018-06-16 00:39:45 click   Batmobi 
## 2 000b2fc9-c221-… 2018-06-24 00:26:12 2018-06-24 00:26:12 click   Batmobi 
## 3 001169e6-9753-… 2018-06-20 01:02:23 2018-06-20 01:02:23 click   Batmobi 
## 4 001b6aad-a34c-… 2018-06-20 00:52:55 2018-06-20 00:52:55 click   Batmobi 
## 5 00229c86-abcb-… 2018-06-28 02:29:10 2018-06-28 02:29:10 click   Batmobi 
## 6 003526bf-f96e-… 2018-06-22 00:49:55 2018-06-22 00:49:55 click   Batmobi 
## # ... with 12 more variables: click_time2 <dttm>, action2 <chr>,
## #   network2 <chr>, click_time3 <dttm>, action3 <chr>, network3 <chr>,
## #   click_time4 <dttm>, action4 <chr>, network4 <chr>, click_time5 <dttm>,
## #   action5 <chr>, network5 <chr>

这一部分数据的量有:

# Number of distinct Batmobi device ids whose click network (network1) is not Batmobi.
Batmobi[[2]] %>%
  filter(network1 != 'Batmobi') %>%
  pull(adid) %>%
  unique() %>%
  length()
## [1] 1867

所以,可能有问题的占比为:

# Share of possibly fraudulent Batmobi installs:
#   (channel-mismatch + zero-gap + no-click distinct devices) /
#   (click-table devices + zero-gap devices)
# NOTE(review): the three groups can overlap, and the denominator adds a subset
# of the click table to the click table itself. Confirm the intended base.
local({
  clicks <- Batmobi[[2]]
  zero_gap <- clicks %>% filter(install_time - click_time1 == 0)
  off_channel <- clicks %>% filter(network1 != 'Batmobi')
  flagged <- length(unique(off_channel$adid)) +
    length(unique(zero_gap$adid)) +
    length(unique(Batmobi[[1]]$adid))
  flagged / (length(unique(clicks$adid)) + length(unique(zero_gap$adid)))
})
## [1] 0.6209269

总结

# Share of possibly fraudulent installs for one channel.
#
# tables:  the channel's list as used in this report; tables[[1]] is the
#          "no click" table and tables[[2]] the click-history table (both have
#          an `adid` column; tables[[2]] also has install_time, click_time1,
#          network1).
# channel: the network name that network1 is expected to equal.
#
# Returns (channel-mismatch + zero-gap + no-click distinct devices) divided by
# (click-table devices + zero-gap devices).
#
# Fix: the Mobvista and Affle rates used to add the zero-gap count twice and
# leave out the no-click count; all four channels now share this one formula.
# NOTE(review): the three groups can overlap, and the denominator adds a subset
# of the click table to the click table itself. Confirm the intended base.
suspicious_rate <- function(tables, channel) {
  clicks <- tables[[2]]
  zero_gap <- clicks %>% filter(install_time - click_time1 == 0)
  # `!!` forces `channel` to be the function argument, never a column of `clicks`.
  off_channel <- clicks %>% filter(network1 != !!channel)
  flagged <- length(unique(off_channel$adid)) +
    length(unique(zero_gap$adid)) +
    length(unique(tables[[1]]$adid))
  flagged / (length(unique(clicks$adid)) + length(unique(zero_gap$adid)))
}

# Names a/b/c/d are kept as in the original script. `c` only shadows base::c as
# a variable; calls to c() below still resolve to the function.
a <- suspicious_rate(Batmobi, 'Batmobi')
b <- suspicious_rate(Glispa, 'Glispa')
c <- suspicious_rate(Mobvista, 'Mobvista')
d <- suspicious_rate(Affle, 'Affle')

# One row per channel: channel name and its suspicious-install rate.
result <- data.frame(
  type = c('Batmobi', 'Glispa', 'Mobvista', 'Affle'),
  rate = c(a, b, c, d)
)

library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Bar chart of the per-channel rate in `result`, converted to an interactive plotly widget.
p <- ggplot(result, aes(x = type, y = rate)) +
  geom_bar(stat = 'identity')
ggplotly(p)