TalkingData AdTracking Fraud Detection Challenge

Natália Faraj Murad

This is a study on predicting click fraud with a dataset available on [Kaggle](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/data). The main goal is to measure the journey of a user's click across the TalkingData portfolio and flag IP addresses that produce many clicks but never end up installing apps. The main difficulties are the size of the dataset and the imbalanced classes, since there are far more fraudulent clicks than clicks that result in a download.

Prepare the Environment & Load the Data
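
The original does not show the package setup; below is a minimal sketch inferred from the functions used throughout the analysis.

library(data.table)  # fread/fwrite for fast file I/O
library(dplyr)       # pipes and data manipulation verbs
library(ggplot2)     # plots
library(lubridate)   # wday/hour/minute date helpers
library(ROSE)        # ovun.sample for undersampling
library(caret)       # train/trainControl/confusionMatrix
library(xgboost)     # XGBoost model and xgb.importance
library(lightgbm)    # LightGBM model and lgb.importance
library(pROC)        # roc/auc
library(knitr)       # kable tables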

Preprocessing data in chunks

The dataset is very large, so it is processed in smaller chunks to avoid exceeding the available memory. In this step, part of the class 0 data is already filtered out, since that class is much larger than class 1: a class 0 sample five times the size of class 1 is kept and written to a new, more balanced file.
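
The nlines() helper used below to count the lines of a file is not part of base R and its definition is not shown; a minimal sketch, assuming the R.utils package is available:

# Assumed helper (not in the original): count the lines of a file on disk
nlines <- function(path) as.integer(R.utils::countLines(path))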

# Define parameters
nlinesfile <- nlines("train.csv")
oldrows <- 1
chunksize <- 1000000


# Read the file in chunks & split classes in 2 different files

while(nlinesfile > oldrows){
  chunk <- read.csv2("train.csv",
                     nrows = chunksize, skip = oldrows,
                     stringsAsFactors = FALSE, sep = ",", header = F,
                     na.strings = "")
  colnames(chunk) <- c("ip", "app", "device", "os", "channel",
                       "click_time", "attributed_time",
                       "is_attributed")
  chunk %>%
    filter(is_attributed == 1) %>%
    # Write the class 1 file
    fwrite("datafilt1.csv", append = TRUE, sep = ",",
           col.names = F)
  chunk %>%
    filter(is_attributed == 0) %>%
    # Write the class 0 file
    fwrite("datafilt0.csv", append = TRUE, sep = ",",
           col.names = F)
  # Report progress, then update the row offset
  cat("Read: lines", oldrows, "to", oldrows + nrow(chunk), "\n")
  oldrows <- oldrows + nrow(chunk)
  rm(chunk)
}

# Classes are unbalanced

sizeClass1 <-  nlines("datafilt1.csv")
sizeClass0 <- nlines("datafilt0.csv")

# Sample class 0 at 5 times the size of class 1

# Read the class 0 file
class0 <- fread("datafilt0.csv", stringsAsFactors = F, sep = ",",
                na.strings = "")

# Create the sample (5 * 456846 = 2284230 rows)
class0sample <- class0 %>%
  sample_n(5 * sizeClass1)

fwrite(class0sample, "class0sample.csv", append = TRUE, sep = ",",
       col.names = F)
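
The file databalanced.csv read below is the combination of the class 1 file and the class 0 sample; this step is not shown in the original, so the following is a sketch, assuming both parts fit in memory:

# Assumed step (implied by the read of "databalanced.csv" below):
# combine the class 1 clicks with the class 0 sample and drop attributed_time
class1 <- fread("datafilt1.csv", sep = ",", header = F, na.strings = "")
databalanced <- rbind(class1, class0sample)
colnames(databalanced) <- c("ip", "app", "device", "os", "channel",
                            "click_time", "attributed_time", "is_attributed")
databalanced$attributed_time <- NULL
fwrite(databalanced, "databalanced.csv", sep = ",", col.names = TRUE)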

The dataset is composed of the following variables: ip: IP address of the click; app: app ID for marketing; device: device type ID of the user's mobile phone; os: OS version ID of the user's mobile phone; channel: channel ID of the mobile ad publisher; click_time: date and time of the click; attributed_time: if the app was downloaded, the time of the download; is_attributed (target): target feature indicating whether the app was downloaded.

data <- fread("databalanced.csv", stringsAsFactors = F, sep = ",", header =T, na.strings = "")
head(data)
##        ip app device os channel          click_time is_attributed
## 1: 204158  35      1 13      21 2017-11-06 15:41:07             1
## 2:  29692   9      1 22     215 2017-11-06 16:00:02             1
## 3:  64516  35      1 13      21 2017-11-06 16:00:02             1
## 4: 172429  35      1 46     274 2017-11-06 16:00:03             1
## 5: 199085  35      1 13     274 2017-11-06 16:00:04             1
## 6:  82917  19      0 24     210 2017-11-06 16:00:04             1
dim(data)
## [1] 2741076       7
str(data)
## Classes 'data.table' and 'data.frame':   2741076 obs. of  7 variables:
##  $ ip           : int  204158 29692 64516 172429 199085 82917 126647 57546 189682 24200 ...
##  $ app          : int  35 9 35 35 35 19 72 29 35 19 ...
##  $ device       : int  1 1 1 1 1 0 1 1 1 88 ...
##  $ os           : int  13 22 13 46 13 24 6 41 13 24 ...
##  $ channel      : int  21 215 21 274 274 210 101 213 21 213 ...
##  $ click_time   : POSIXct, format: "2017-11-06 15:41:07" "2017-11-06 16:00:02" ...
##  $ is_attributed: int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Format the Dates

Here the click timestamp is parsed as POSIXct and checked for missing values.

data$click_time <- as.POSIXct(data$click_time)
sum(is.na(data$click_time))
## [1] 0

Then extract the day, hour, and minute into new, separate variables.

# Splitting the click time and date (lubridate's wday, hour, and minute
# are vectorized, so sapply is not needed)
data$day <- wday(data$click_time)
data$hour <- hour(data$click_time)
data$min <- minute(data$click_time)

Exploration

Number of Unique Values in the Features

n_ip <- length(unique(data$ip))
n_app <- length(unique(data$app))
n_os <- length(unique(data$os))
n_ch <- length(unique(data$channel))
n_device <- length(unique(data$device))
n_days <- length(unique(data$day))

counts <- rbind(n_app, n_os, n_ch, n_device, n_days)

barplot(counts[,1], main = 'Unique Values per Feature',
        ylab = 'Number of Unique Values',
        col = c('#836FFF', '#1E90FF', '#FFD700', '#32CD32','#DC143C'))

message(c('The total number of ips is ', n_ip, '.\n',
          'The total number of apps is ', n_app, '.\n',
          'The total number of operating systems is ', n_os, '.\n',
          'The total number of channels is ', n_ch, '.\n',
          'The total number of devices is ', n_device, '.\n',
          'The total number of days is ', n_days, '.'))
## The total number of ips is 261478.
## The total number of apps is 393.
## The total number of operating systems is 247.
## The total number of channels is 179.
## The total number of devices is 1918.
## The total number of days is 4.
rm(counts, n_ip, n_app, n_os, n_ch, n_device, n_days)
invisible(gc())

Number of Clicks by IP

# Total number of clicks by IP
n_clicks_by_ip <- as.data.frame(table(data$ip))
summary(n_clicks_by_ip)
##       Var1             Freq         
##  1      :     1   Min.   :    1.00  
##  5      :     1   1st Qu.:    1.00  
##  6      :     1   Median :    1.00  
##  9      :     1   Mean   :   10.48  
##  10     :     1   3rd Qu.:    4.00  
##  19     :     1   Max.   :17644.00  
##  (Other):261472
barplot(n_clicks_by_ip$Freq~n_clicks_by_ip$Var1, xlab = "IP",
        ylab = 'Number of Clicks',
        main = "Number of Clicks by IP", ylim = c(0,18000))

x <- data %>%
  group_by(is_attributed) %>%
  count(ip)

y <- as.data.frame(subset(x, is_attributed==1))
z <- as.data.frame(subset(x, is_attributed==0))

cat("Max clicks by IP - Not Download: ", max(z$n), "\n")
## Max clicks by IP - Not Download:  15304
cat("Max clicks by IP - Download: ", max(y$n), "\n")
## Max clicks by IP - Download:  2340
par(mfrow = c(1,2))
barplot(z$n~z$ip, ylim=c(0,18000), xlab = "IP",
        ylab = 'Number of Clicks',
        main = "Number of Clicks by IP- Not Download")
barplot(y$n~y$ip, ylim=c(0,18000), xlab = "IP",
        ylab = 'Number of Clicks',
        main = "Number of Clicks by IP - Download")

rm(n_clicks_by_ip, x, y, z)
invisible(gc())

Percentage of Clicks by Hour

par(mfrow = c(1,1))

# Percent of clicks by hour
total_cliques <- data %>%
  group_by(is_attributed) %>%
  count()

# n() counts the clicks in each hour; the denominators are the class totals
clicksHdownload <- data %>%
  filter(is_attributed=='1') %>%
  group_by(hour) %>%
  mutate(percentclickDown = n()/456846)

clicksHNotdownload <- data %>%
  filter(is_attributed=='0') %>%
  group_by(hour) %>%
  mutate(percentclickNotDown = n()/2284230)

ggplot()+
  geom_step(aes(x = clicksHdownload$hour,
                y = clicksHdownload$percentclickDown, col = "Downloads"))+
  geom_step(aes(x = clicksHNotdownload$hour,
                y = clicksHNotdownload$percentclickNotDown, col = "Not Downloads"))+
  labs(x = "Hour", y = "Percentage of Clicks")+
  ggtitle("Percentage of Clicks by Hour") +
  theme(plot.title = element_text(hjust = 0.5))

rm(clicksHdownload, clicksHNotdownload, total_cliques)
invisible(gc())

Feature Engineering

Some features are created by counting clicks within groups of variables.

# Add some count vars
data <- data %>%
  add_count(ip, name = 'ip_ct') %>%
  add_count(ip, day, hour, name = 'ip_d_h') %>%
  add_count(ip, hour, channel, name = 'ip_h_ch') %>%
  add_count(ip, hour, os, name = 'ip_h_os') %>%
  add_count(ip, hour, app, name = 'ip_h_app') %>%
  add_count(ip, hour, device, name = 'ip_h_dev')

# How popular is the app or channel?
data <- data %>%
  add_count(app, channel, name = 'app_ch')

The next click by ip, os, and device is calculated by grouping on these variables and taking the time difference to the next click within the group. For the last click of each group, the value is filled with 0.

# Next click: seconds until the next click within a group;
# the last click of each group gets 0
next_click <- function(vector){
  n <- length(vector)
  nxt_click <- numeric(n)
  for(i in seq_len(n)){
    if(i < n){
      nxt_click[i] <- as.numeric(difftime(vector[i+1], vector[i],
                                          units = 'secs'))
    } else {
      nxt_click[i] <- 0
    }
  }
  return(nxt_click)
}
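
For a dataset of this size, a vectorized version is much faster than the element-wise loop; the sketch below (the name next_click_vec is ours, not the original's) uses dplyr::lead, which yields NA for the last click of each group, and coalesce() to fill that with 0:

# Vectorized alternative (hypothetical): seconds to the next click,
# 0 for the last click of a group
next_click_vec <- function(times){
  dplyr::coalesce(as.numeric(difftime(dplyr::lead(times), times,
                                      units = 'secs')), 0)
}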

#next click by ip, os, device
data <- data %>%
  group_by(ip, os, device) %>%
  arrange(click_time) %>%
  mutate(ip_os_dev_nxcl = next_click(click_time))
# Exclude variables that will not be used
data$click_time <- NULL
data$ip <- NULL

Split Train & Test Datasets

The training set will be the data from the first three days; the last day will be used as the test set.

trainset <- filter(data, day<5)
testset <- filter(data, day == 5)

# Check proportion of each class
prop.table(table(trainset$is_attributed))
## 
##         0         1 
## 0.8345039 0.1654961
barplot(prop.table(table(trainset$is_attributed)),
        col = c('#6495ED', '#FF69B4'))

# Check na values
sum(is.na(trainset))
## [1] 0

Balancing the Classes

The target classes are balanced by undersampling. Undersampling has some disadvantages, but given the size of the dataset it is a reasonable choice here. Different balancing proportions were tested (1:1, 1:2, 1:3, 1:4, and 1:5); the best model metrics were obtained with a proportion of 1 class 1 to 3 class 0.

balanced_sample = ovun.sample(is_attributed~., trainset, method = "under",
                              p = 0.25, seed = 15)$data
dim(balanced_sample)
## [1] 1291860      16
# Check proportion of each class
prop.table(table(balanced_sample$is_attributed))
## 
##         0         1 
## 0.7500666 0.2499334
barplot(prop.table(table(balanced_sample$is_attributed)),
        col = c('#6495ED', '#FF69B4'))

rm(trainset)
rm(data)
invisible(gc())

XGBoost

Searching for the best hyperparameter values through grid search.

# GridSearch in order to find best parameters

# Set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
  nrounds = 100,
  max_depth = c(30, 31, 35, 36, 40, 41),
  eta = c(0.1, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  gamma = 0.001,
  colsample_bytree = 0.8,
  #max_delta_step = c(12, 13, 15, 17, 19),
  subsample = 1,
  min_child_weight = c(0, 1, 3, 5)
)

# pack the training control parameters
xgb_trcontrol_1 = trainControl(
  method = "cv",
  number = 5,
  verboseIter = TRUE,
  returnData = FALSE,
  returnResamp = "all",              # save losses across all models
  classProbs = TRUE,                 # needed for AUC to be computed
  summaryFunction = twoClassSummary,
  allowParallel = TRUE
)

factoris_attributed <- as.factor(ifelse(balanced_sample$is_attributed==0, "No", "Download"))

# train the model for each parameter combination in the grid,
#   using CV to evaluate
xgb_train_1 = train(
  x = as.matrix(balanced_sample[, impfeat]),  # impfeat: character vector of
                                              # predictor names (its definition
                                              # is not shown here)
  y = factoris_attributed,
  trControl = xgb_trcontrol_1,
  tuneGrid = xgb_grid_1,
  metric = "ROC",  # score the grid on AUC, matching twoClassSummary
  method = "xgbTree",
  nthread = 50
)

### Result
#Aggregating results
#Selecting tuning parameters
#Fitting nrounds = 100, max_depth = 30, eta = 0.1, gamma = 0.001, colsample_bytree = 0.8, min_child_weight = 5, subsample = 1 on full training set

Training the model using the parameter values found in the previous step.
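
The call that produces model_xgboost is not shown in the original; the sketch below reconstructs it with the selected parameters, assuming impfeat holds the predictor column names and a binary logistic objective:

# Assumed training step (model_xgboost is used below, but its creation
# is not shown in the original)
model_xgboost <- xgboost(
  data = as.matrix(balanced_sample[, impfeat]),
  label = balanced_sample$is_attributed,
  nrounds = 100, max_depth = 30, eta = 0.1, gamma = 0.001,
  colsample_bytree = 0.8, min_child_weight = 5, subsample = 1,
  objective = "binary:logistic", eval_metric = "auc",
  nthread = 50, verbose = 0
)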

# Check importance of the features
imp.xgb <- xgb.importance(model = model_xgboost)
kable(imp.xgb)
| Feature        |      Gain |     Cover | Frequency |
|:---------------|----------:|----------:|----------:|
| app            | 0.4108019 | 0.1665830 | 0.0218443 |
| channel        | 0.1513979 | 0.0740529 | 0.0534865 |
| ip_ct          | 0.1400499 | 0.1522775 | 0.1629586 |
| app_ch         | 0.0867369 | 0.1153905 | 0.0572351 |
| ip_h_dev       | 0.0569163 | 0.0548843 | 0.0667904 |
| ip_os_dev_nxcl | 0.0445886 | 0.0997027 | 0.1759491 |
| os             | 0.0248271 | 0.1072724 | 0.0597624 |
| min            | 0.0208194 | 0.0396978 | 0.1528512 |
| hour           | 0.0183449 | 0.0612127 | 0.1000966 |
| device         | 0.0174709 | 0.0346641 | 0.0068602 |
| ip_d_h         | 0.0078682 | 0.0287201 | 0.0500069 |
| ip_h_os        | 0.0069001 | 0.0215787 | 0.0329585 |
| ip_h_ch        | 0.0056145 | 0.0167482 | 0.0128813 |
| day            | 0.0039706 | 0.0094317 | 0.0286539 |
| ip_h_app       | 0.0036928 | 0.0177833 | 0.0176651 |
# app, channel, ip_ct, ip_h_dev, ip_os_dev_nxcl, min, hour,
# os, app_ch, ip_d_h, ip_h_os, day, device, ip_h_app, ip_h_ch

When working with fraud detection, it is important to find a balance between sensitivity and specificity. The focus here is on sensitivity, because in this case it is important to know how many of all positive cases were identified. (Note that in the confusion matrices below, caret treats class 0 as the positive class.)
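
As an illustration (not part of the original analysis), a small helper can show how moving the cutoff trades sensitivity against specificity with respect to class 1:

# Illustrative helper (hypothetical): sensitivity and specificity of
# class 1 at different classification cutoffs
cutoff_tradeoff <- function(probs, labels, cutoffs = seq(0.1, 0.9, 0.1)){
  t(sapply(cutoffs, function(ct){
    predicted <- as.integer(probs > ct)
    c(cutoff = ct,
      sensitivity = sum(predicted == 1 & labels == 1) / sum(labels == 1),
      specificity = sum(predicted == 0 & labels == 0) / sum(labels == 0))
  }))
}
# e.g. cutoff_tradeoff(pred, testset$is_attributed), once pred exists below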

# Predictions
test_features <- testset %>% dplyr::select(-c("is_attributed"))
pred <- predict(model_xgboost, as.matrix(test_features))


# Classification
cutoff <- 0.5
predClass <- ifelse(pred > cutoff, 1, 0)

# Confusion Matrix 
confusionMatrix(table(pred = predClass, data = testset$is_attributed))
## Confusion Matrix and Statistics
## 
##     data
## pred      0      1
##    0 643658  19292
##    1  12474 114675
##                                           
##                Accuracy : 0.9598          
##                  95% CI : (0.9594, 0.9602)
##     No Information Rate : 0.8304          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8543          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9810          
##             Specificity : 0.8560          
##          Pos Pred Value : 0.9709          
##          Neg Pred Value : 0.9019          
##              Prevalence : 0.8304          
##          Detection Rate : 0.8147          
##    Detection Prevalence : 0.8391          
##       Balanced Accuracy : 0.9185          
##                                           
##        'Positive' Class : 0               
## 
# AUC

# Note: this AUC is computed from the hard class labels; the ROC curve
# below uses the continuous predictions instead
xgboostAuc <- auc(roc(as.integer(testset$is_attributed), as.integer(predClass)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#plot ROC
roc(testset$is_attributed, pred, plot = TRUE, col = "steelblue", lwd = 1, 
    levels=base::levels(as.factor(testset$is_attributed)), grid=TRUE) 
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = testset$is_attributed, predictor = pred,     levels = base::levels(as.factor(testset$is_attributed)),     plot = TRUE, col = "steelblue", lwd = 1, grid = TRUE)
## 
## Data: pred in 656132 controls (testset$is_attributed 0) < 133967 cases (testset$is_attributed 1).
## Area under the curve: 0.9697
rm(test_features)
invisible(gc())

LightGBM

Fitting a model using the LightGBM algorithm.

# Set categorical features
categorical_features = c("app", "device", "os", "channel", "day", "hour", "min")

train <- as.data.frame(balanced_sample)
# Note: apply() drops the factor class, so these columns end up as integer
# level codes (and train and test are encoded independently here)
train[categorical_features] <- apply(train[categorical_features], 2, as.factor)

test <- as.data.frame(testset)
test[categorical_features] <- apply(test[categorical_features], 2, as.factor)

# Prepare datasets
dtrain = lgb.Dataset(data = as.matrix(train[, colnames(train) != "is_attributed"]), 
                     label = train$is_attributed, categorical_feature = categorical_features)
dvalid = lgb.Dataset(data = as.matrix(test[, colnames(test) != "is_attributed"]), 
                     label = test$is_attributed, categorical_feature = categorical_features)

invisible(gc())

# Set parameters
params = list(objective = "binary", 
              metric = "auc", 
              learning_rate= 0.25, 
              num_leaves= 38,
              max_depth= 21,
              min_child_samples= 100,
              max_bin= 100, # RAM dependent as per LightGBM documentation
              subsample= 0.7,
              subsample_freq= 1,
              colsample_bytree= 0.7,
              min_child_weight= 0,
              min_split_gain= 0) #,
              #scale_pos_weight=99.76) # calculated for this dataset


# Train the model
model <- lgb.train(params, dtrain, valids = list(validation = dvalid), nthread = 45,
                   nrounds = 1000, verbose= 1, early_stopping_rounds = 50, eval_freq = 50)
## [LightGBM] [Info] Number of positive: 322879, number of negative: 968981
## [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027182 seconds.
## You can set `force_row_wise=true` to remove the overhead.
## And if memory is not enough, you can set `force_col_wise=true`.
## [LightGBM] [Info] Total Bins 1281
## [LightGBM] [Info] Number of data points in the train set: 1291860, number of used features: 15
## [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249933 -> initscore=-1.098967
## [LightGBM] [Info] Start training from score -1.098967
## [1] "[1]:  validation's auc:0.958215"
## [1] "[51]:  validation's auc:0.970668"
## [1] "[101]:  validation's auc:0.970629"

Checking the metrics for this model.

rm(dtrain, dvalid)
invisible(gc())

# AUC
cat("Best AUC: ", 
    max(unlist(model$record_evals[["validation"]][["auc"]][["eval"]])))
## Best AUC:  0.9707243
# get feature importance
implgb = lgb.importance(model, percentage = TRUE)
kable(implgb)
| Feature        |      Gain |     Cover | Frequency |
|:---------------|----------:|----------:|----------:|
| channel        | 0.4664055 | 0.2398605 | 0.2185519 |
| app            | 0.4207643 | 0.1703252 | 0.1197865 |
| ip_ct          | 0.0567847 | 0.0856857 | 0.0613947 |
| os             | 0.0200849 | 0.1520078 | 0.1544878 |
| hour           | 0.0077292 | 0.0669085 | 0.1074408 |
| min            | 0.0063728 | 0.1060428 | 0.1695028 |
| device         | 0.0048178 | 0.0948599 | 0.0360360 |
| app_ch         | 0.0046465 | 0.0115593 | 0.0236904 |
| ip_h_dev       | 0.0032183 | 0.0165658 | 0.0250250 |
| ip_os_dev_nxcl | 0.0031793 | 0.0123481 | 0.0246914 |
| ip_h_ch        | 0.0020276 | 0.0096597 | 0.0140140 |
| ip_h_os        | 0.0018975 | 0.0105659 | 0.0126793 |
| ip_d_h         | 0.0011819 | 0.0142438 | 0.0156823 |
| ip_h_app       | 0.0006370 | 0.0040595 | 0.0113447 |
| day            | 0.0002526 | 0.0053076 | 0.0056723 |
# The lightgbm predict argument for the iteration count is num_iteration
val_preds = predict(model,
                    data = as.matrix(testset[, colnames(testset) != "is_attributed"]),
                    num_iteration = model$best_iter)


# Classification
cutoff <- 0.5
predLGBM <- ifelse(val_preds > cutoff, 1, 0)

# Confusion Matrix 
confusionMatrix(table(pred = predLGBM, data = testset$is_attributed))
## Confusion Matrix and Statistics
## 
##     data
## pred      0      1
##    0 645328  19588
##    1  10804 114379
##                                          
##                Accuracy : 0.9615         
##                  95% CI : (0.9611, 0.962)
##     No Information Rate : 0.8304         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8597         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9835         
##             Specificity : 0.8538         
##          Pos Pred Value : 0.9705         
##          Neg Pred Value : 0.9137         
##              Prevalence : 0.8304         
##          Detection Rate : 0.8168         
##    Detection Prevalence : 0.8416         
##       Balanced Accuracy : 0.9187         
##                                          
##        'Positive' Class : 0              
## 
#plot ROC
roc(testset$is_attributed, val_preds, plot = TRUE, col = "steelblue",
    lwd = 1, levels=base::levels(as.factor(testset$is_attributed)),
    grid=TRUE)   
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = testset$is_attributed, predictor = val_preds,     levels = base::levels(as.factor(testset$is_attributed)),     plot = TRUE, col = "steelblue", lwd = 1, grid = TRUE)
## 
## Data: val_preds in 656132 controls (testset$is_attributed 0) < 133967 cases (testset$is_attributed 1).
## Area under the curve: 0.9707

Applying the Model to the Test Dataset

rm(balanced_sample, test, testset, train)
invisible(gc())


# Read the files
testdata <- fread("test.csv", stringsAsFactors = F, sep = ",",
                  na.strings = "")

# Creating features

testdata$click_time <- as.POSIXct(testdata$click_time)

# Splitting the click time and date

testdata <- testdata %>%
  mutate(day = wday(click_time),
         hour = hour(click_time),
         min = minute(click_time))


# Add some count vars
testdata <- testdata %>%
  add_count(ip, name = 'ip_ct') %>%
  add_count(ip, day, hour, name = 'ip_d_h') %>%
  add_count(ip, hour, channel, name = 'ip_h_ch') %>%
  add_count(ip, hour, os, name = 'ip_h_os') %>%
  add_count(ip, hour, app, name = 'ip_h_app') %>%
  add_count(ip, hour, device, name = 'ip_h_dev')

# How popular is the app or channel?
testdata <- testdata %>%
  add_count(app, channel, name = 'app_ch')

#next click by ip, os, device
testdata <- testdata %>%
  group_by(ip, os, device) %>%
  arrange(click_time) %>%
  mutate(ip_os_dev_nxcl = next_click(click_time))

# Exclude variables that will not be used
testdata$click_time <- NULL
testdata$ip <- NULL


# Predictions
# (the feature columns must be in the same order as in training;
# click_id is excluded)
val_preds = predict(model,
                    data = as.matrix(testdata[, colnames(testdata) != "click_id"]),
                    num_iteration = model$best_iter)

# Classification
cutoff <- 0.5
is_attributed <- ifelse(val_preds > cutoff, 1, 0)

predictions <- cbind(as.integer(testdata$click_id), is_attributed)
colnames(predictions) <- c('click_id', 'is_attributed')

fwrite(predictions, 'predictionsLGBM.csv', sep = ',',
       col.names = TRUE)