This is a study about predicting fraud in clicks with a dataset available on [Kaggle] (https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/data). The main goal is to measure the journey of a user’s click across their portfolio, and flag IP addresses who produce lots of clicks, but never end up installing apps. The difficulties here are the size of the data set and the unbalanced classes once you have too much more fraud clicks than clicks that resulted in a download.
The dataset is very big, so it will be processed in smaller chunks in order to not exceed the available memory in the computer. In this step, it will be already filtered some of the data of class 0, once it is too much bigger than class 1. It will be used a size of 5 times the size of class 1 to class 0. It will be written in a new balanced file.
# Define parameters
nlinesfile <- nlines("train.csv")
oldrows <- 1
chunksize <- 1000000
# Read the file in chunks & split classes in 2 different files
while(nlinesfile>oldrows){
chunk <- read.csv2("train.csv",
nrows=chunksize, skip = oldrows,
stringsAsFactors=FALSE, sep = ",", header =F,
na.strings = "")
colnames(chunk) <- c("ip", "app", "device", "os", "channel",
"click_time", "attributed_time",
"is_attributed")
chunk %>%
filter(is_attributed==1) %>%
# Write the file class 1
fwrite("datafilt1.csv", append = TRUE, sep = ",",
col.names = F)
chunk %>%
filter(is_attributed==0) %>%
# Write the file class 0
fwrite("datafilt0.csv", append = TRUE, sep = ",",
col.names = F)
# Update values
oldrows <- oldrows + nrow(chunk)
cat("Read: lines", oldrows, "to", oldrows + nrow(chunk), "\n")
rm(chunk)
}
# Classes are unbalanced
sizeClass1 <- nlines("datafilt1.csv")
sizeClass0 <- nlines("datafilt0.csv")
# Creating indexes to sample Class 0
indexes <- sample(2:sizeClass0, size = 5*sizeClass1, replace = FALSE)
# Read the files
class0 <- fread("datafilt0.csv", stringsAsFactors = F,sep = ",",
na.strings = "")
# Create a sample
class0sample <- class0 %>%
sample_n(2284230)
fwrite(class0sample, "class0sample.csv", append = TRUE, sep = ",",
col.names = F)
The dataset is composed by the following variables: ip: click IP address; app: application id referred by the ad; device: dispositive type ID; os: operational system version; channel: channel of the announcement editors; click_time: register and time of the click; attributed_time: if the app was downloaded, it shows the time of the download; is_attributed (Target): target feature indicating if the app was downloaded or not.
data <- fread("databalanced.csv", stringsAsFactors = F, sep = ",", header =T, na.strings = "")
head(data)
## ip app device os channel click_time is_attributed
## 1: 204158 35 1 13 21 2017-11-06 15:41:07 1
## 2: 29692 9 1 22 215 2017-11-06 16:00:02 1
## 3: 64516 35 1 13 21 2017-11-06 16:00:02 1
## 4: 172429 35 1 46 274 2017-11-06 16:00:03 1
## 5: 199085 35 1 13 274 2017-11-06 16:00:04 1
## 6: 82917 19 0 24 210 2017-11-06 16:00:04 1
dim(data)
## [1] 2741076 7
str(data)
## Classes 'data.table' and 'data.frame': 2741076 obs. of 7 variables:
## $ ip : int 204158 29692 64516 172429 199085 82917 126647 57546 189682 24200 ...
## $ app : int 35 9 35 35 35 19 72 29 35 19 ...
## $ device : int 1 1 1 1 1 0 1 1 1 88 ...
## $ os : int 13 22 13 46 13 24 6 41 13 24 ...
## $ channel : int 21 215 21 274 274 210 101 213 21 213 ...
## $ click_time : POSIXct, format: "2017-11-06 15:41:07" "2017-11-06 16:00:02" ...
## $ is_attributed: int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
Here we format the dates and time.
data$click_time <- as.POSIXct(data$click_time)
sum(is.na(data$click_time))
## [1] 0
Then extract the information in new separated variables.
# Splitting the click time and date
data$day <- as.numeric(sapply(data$click_time, wday))
data$hour <- as.numeric(sapply(data$click_time, hour))
data$min <- as.numeric(sapply(data$click_time, minute))
n_ip <- length(unique(data$ip))
n_app <- length(unique(data$app))
n_os <- length(unique(data$os))
n_ch <- length(unique(data$channel))
n_device <- length(unique(data$device))
n_days <- length(unique(data$day))
counts <- rbind(n_app, n_os, n_ch, n_device, n_days)
barplot(counts[,1], main = 'Exclusive Items', ylab = 'Number of Exclusive',
col = c('#836FFF', '#1E90FF', '#FFD700', '#32CD32','#DC143C'))
message(c('The total number of ips is ', n_ip, '.\n',
'The total number of apps is ', n_app, '.\n',
'The total number of operational systems is ', n_os, '.\n',
'The total number of channels is ', n_ch, '.\n',
'The total number of devices is ', n_device, '.\n',
'The total number of day is ', n_days, '.'))
## The total number of ips is 261478.
## The total number of apps is 393.
## The total number of operational systems is 247.
## The total number of channels is 179.
## The total number of devices is 1918.
## The total number of day is 4.
rm(counts, n_ip, n_app, n_os, n_ch, n_device, n_days)
invisible(gc())
# Total number of clicks by IP
n_clicks_by_ip <- as.data.frame(table(data$ip))
summary(as.data.frame(n_clicks_by_ip))
## Var1 Freq
## 1 : 1 Min. : 1.00
## 5 : 1 1st Qu.: 1.00
## 6 : 1 Median : 1.00
## 9 : 1 Mean : 10.48
## 10 : 1 3rd Qu.: 4.00
## 19 : 1 Max. :17644.00
## (Other):261472
#data1 <- merge(data1, n_clicks_by_ip, by.x = 'ip', by.y = 'Var1')
barplot(n_clicks_by_ip$Freq~n_clicks_by_ip$Var1, xlab = "IP",
ylab = 'Number of Clicks',
main = "Number of Clicks by IP", ylim = c(0,18000))
x <- data %>%
group_by(is_attributed) %>%
count(ip)
y <- as.data.frame(subset(x, is_attributed==1))
z <- as.data.frame(subset(x, is_attributed==0))
cat("Max clicks by IP - Not Download: ", max(z$n), "\n")
## Max clicks by IP - Not Download: 15304
cat("Max clicks by IP - Download: ", max(y$n), "\n")
## Max clicks by IP - Download: 2340
par(mfrow = c(1,2))
barplot(z$n~z$ip, ylim=c(0,18000), xlab = "IP",
ylab = 'Number of Clicks',
main = "Number of Clicks by IP- Not Download")
barplot(y$n~y$ip, ylim=c(0,18000), xlab = "IP",
ylab = 'Number of Clicks',
main = "Number of Clicks by IP - Download")
rm(n_clicks_by_ip, x, y)
invisible(gc())
par(mfrow = c(1,1))
# Percent of clicks by hour
total_cliques <- data %>%
group_by(is_attributed) %>%
count()
clicksHdownload <- data %>%
filter(is_attributed=='1') %>%
group_by(hour) %>%
mutate(percentclickDown = sum(hour)/456846)
clicksHNotdownload <- data %>%
filter(is_attributed=='0') %>%
group_by(hour) %>%
mutate(percentclickNotDown = sum(hour)/2284230)
ggplot()+
geom_step(aes(x = clicksHdownload$hour,
y = clicksHdownload$percentclickDown, col = "Downloads"))+
geom_step(aes(x = clicksHNotdownload$hour,
y = clicksHNotdownload$percentclickNotDown, col = "Not Downloads"))+
labs(x = "Hour", y = "Percentage of Clicks")+
ggtitle("Percentage of Clicks by Hour") +
theme(plot.title = element_text(hjust = 0.5))
rm(clicksHdownload, clicksHNotdownload, total_cliques)
invisible(gc())
Some features created aggregating groups by counting.
# Add some count vars
data <- data %>%
add_count(ip, name = 'ip_ct') %>%
add_count(ip, day, hour, name = 'ip_d_h') %>%
add_count(ip, hour, channel, name = 'ip_h_ch') %>%
add_count(ip, hour, os, name = 'ip_h_os') %>%
add_count(ip, hour, app, name = 'ip_h_app') %>%
add_count(ip, hour, device, name = 'ip_h_dev')
# How popular is the app or channel?
data <- data %>%
add_count(app, channel, name = 'app_ch')
The next click by ip, os, device calculated by grouping these variables and calculating the difference of time until next click. When the click is the last by group, this value was filled with 0.
#Next click
next_click <- function(vector){
nxt_click <- as.vector(0)
for(i in 1:length(vector)){
if(i < length(vector)){
nxt_click[i] <- as.numeric(difftime(vector[i+1], vector[i], units = 'secs'))
} else {
nxt_click[i] <- 0
}
return(nxt_click[i])
}
}
#next click by ip, os, device
data <- data %>%
group_by(ip, os, device) %>%
arrange(click_time) %>%
mutate(ip_os_dev_nxcl = next_click(click_time))
# Excluding variables that will be not used
data$click_time <- NULL
data$ip <- NULL
The trainset will be the data of the first 3 days and the last day will be used as test.
trainset <- filter(data, day<5)
testset <- filter(data, day == 5)
# Check proportion of each class
prop.table(table(trainset$is_attributed))
##
## 0 1
## 0.8345039 0.1654961
barplot(prop.table(table(trainset$is_attributed)),
col = c('#6495ED', '#FF69B4'))
# Check na values
sum(is.na(trainset))
## [1] 0
Balancing the target feature by undersampling. It has some disadvantages, but considering that the dataset is big, it will be used. It was tested different proportions of balancing (1:1, 1:2, 1:3, 1:4 and 1:5). Best model metrics was found using the proportion 1 class 1 : 3 class 0.
balanced_sample = ovun.sample(is_attributed~., trainset, method="under",
p=0.25, subset=options("subset")$subset,
seed = 15)$data
dim(balanced_sample)
## [1] 1291860 16
# Check proportion of each class
prop.table(table(balanced_sample$is_attributed))
##
## 0 1
## 0.7500666 0.2499334
barplot(prop.table(table(balanced_sample$is_attributed)),
col = c('#6495ED', '#FF69B4'))
rm(trainset)
rm(data)
invisible(gc())
Searching the best tuning for the hyperparameters through Grid Search.
# GridSearch in order to find best parameters
# Set up the cross-validated hyper-parameter search
xgb_grid_1 = expand.grid(
nrounds = 100,
max_depth = c(30, 31, 35, 36, 40, 41),
eta = c(0.1, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
gamma = 0.001,
colsample_bytree = 0.8,
#max_delta_step = c(12, 13, 15, 17, 19),
subsample = 1,
min_child_weight = c(0,1,3,5)
)
# pack the training control parameters
xgb_trcontrol_1 = trainControl(
method = "cv",
number = 5,
verboseIter = TRUE,
returnData = FALSE,
returnResamp = "all", # save losses across all models
classProbs = TRUE, # set to TRUE for AUC to be computed
summaryFunction = twoClassSummary,
allowParallel = TRUE
)
factoris_attributed <- as.factor(ifelse(balanced_sample$is_attributed==0, "No", "Download"))
# train the model for each parameter combination in the grid,
# using CV to evaluate
xgb_train_1 = train(
x = as.matrix(balanced_sample[, impfeat]),
y = factoris_attributed,
trControl = xgb_trcontrol_1,
tuneGrid = xgb_grid_1,
method = "xgbTree",
nthread = 50
)
### Result
#Aggregating results
#Selecting tuning parameters
#Fitting nrounds = 100, max_depth = 30, eta = 0.1, gamma = 0.001, colsample_bytree = 0.8, min_child_weight = 5, subsample = 1 on full training set
Training the model using the values of parameters found in the previous step.
# Check importance of the features
imp.xgb <- xgb.importance(model = model_xgboost)
kable(imp.xgb)
| Feature | Gain | Cover | Frequency |
|---|---|---|---|
| app | 0.4108019 | 0.1665830 | 0.0218443 |
| channel | 0.1513979 | 0.0740529 | 0.0534865 |
| ip_ct | 0.1400499 | 0.1522775 | 0.1629586 |
| app_ch | 0.0867369 | 0.1153905 | 0.0572351 |
| ip_h_dev | 0.0569163 | 0.0548843 | 0.0667904 |
| ip_os_dev_nxcl | 0.0445886 | 0.0997027 | 0.1759491 |
| os | 0.0248271 | 0.1072724 | 0.0597624 |
| min | 0.0208194 | 0.0396978 | 0.1528512 |
| hour | 0.0183449 | 0.0612127 | 0.1000966 |
| device | 0.0174709 | 0.0346641 | 0.0068602 |
| ip_d_h | 0.0078682 | 0.0287201 | 0.0500069 |
| ip_h_os | 0.0069001 | 0.0215787 | 0.0329585 |
| ip_h_ch | 0.0056145 | 0.0167482 | 0.0128813 |
| day | 0.0039706 | 0.0094317 | 0.0286539 |
| ip_h_app | 0.0036928 | 0.0177833 | 0.0176651 |
# app, channel, ip_ct, ip_h_dev, ip_os_dev_nxcl, min, hour,
# os, app_ch, ip_d_h, ip_h_os, day, device, ip_h_app, ip_h_ch
When working with fraud detection it is important to find a balance between Sensibility and Specificity values. It was focused on the sensibility because it is important, in this case, to know of all positive values, how much was identified.
# Predictions
test_features <- testset %>% dplyr::select(-c("is_attributed"))
pred <- predict(model_xgboost, as.matrix(test_features))
# Classification
cutoff <- 0.5
predClass <- ifelse(pred > cutoff, 1, 0)
# Confusion Matrix
confusionMatrix(table(pred = predClass, data = testset$is_attributed))
## Confusion Matrix and Statistics
##
## data
## pred 0 1
## 0 643658 19292
## 1 12474 114675
##
## Accuracy : 0.9598
## 95% CI : (0.9594, 0.9602)
## No Information Rate : 0.8304
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8543
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9810
## Specificity : 0.8560
## Pos Pred Value : 0.9709
## Neg Pred Value : 0.9019
## Prevalence : 0.8304
## Detection Rate : 0.8147
## Detection Prevalence : 0.8391
## Balanced Accuracy : 0.9185
##
## 'Positive' Class : 0
##
# AUC
xgboostAuc <- auc(roc(as.integer(testset$is_attributed), as.integer(predClass)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#plot ROC
roc(testset$is_attributed, pred, plot = TRUE, col = "steelblue", lwd = 1,
levels=base::levels(as.factor(testset$is_attributed)), grid=TRUE)
## Setting direction: controls < cases
##
## Call:
## roc.default(response = testset$is_attributed, predictor = pred, levels = base::levels(as.factor(testset$is_attributed)), plot = TRUE, col = "steelblue", lwd = 1, grid = TRUE)
##
## Data: pred in 656132 controls (testset$is_attributed 0) < 133967 cases (testset$is_attributed 1).
## Area under the curve: 0.9697
rm(test_features)
invisible(gc())
Adjusting a model using the LightGBM algorithm.
# Set categorical features
categorical_features = c("app", "device", "os", "channel", "day", "hour", "min")
train <- as.data.frame(balanced_sample)
train[categorical_features] <- apply(train[categorical_features],2, as.factor)
test <- as.data.frame(testset)
test[categorical_features] <- apply(test[categorical_features],2, as.factor)
# Prepare datasets
dtrain = lgb.Dataset(data = as.matrix(train[, colnames(train) != "is_attributed"]),
label = train$is_attributed, categorical_feature = categorical_features)
dvalid = lgb.Dataset(data = as.matrix(test[, colnames(test) != "is_attributed"]),
label = test$is_attributed, categorical_feature = categorical_features)
invisible(gc())
# Set parameters
params = list(objective = "binary",
metric = "auc",
learning_rate= 0.25,
num_leaves= 38,
max_depth= 21,
min_child_samples= 100,
max_bin= 100, # RAM dependent as per LightGBM documentation
subsample= 0.7,
subsample_freq= 1,
colsample_bytree= 0.7,
min_child_weight= 0,
min_split_gain= 0) #,
#scale_pos_weight=99.76) # calculated for this dataset
# Train the model
model <- lgb.train(params, dtrain, valids = list(validation = dvalid), nthread = 45,
nrounds = 1000, verbose= 1, early_stopping_rounds = 50, eval_freq = 50)
## [LightGBM] [Info] Number of positive: 322879, number of negative: 968981
## [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027182 seconds.
## You can set `force_row_wise=true` to remove the overhead.
## And if memory is not enough, you can set `force_col_wise=true`.
## [LightGBM] [Info] Total Bins 1281
## [LightGBM] [Info] Number of data points in the train set: 1291860, number of used features: 15
## [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249933 -> initscore=-1.098967
## [LightGBM] [Info] Start training from score -1.098967
## [1] "[1]: validation's auc:0.958215"
## [1] "[51]: validation's auc:0.970668"
## [1] "[101]: validation's auc:0.970629"
Checking the metrics for this model.
rm(dtrain, dvalid)
invisible(gc())
# AUC
cat("Best AUC: ",
max(unlist(model$record_evals[["validation"]][["auc"]][["eval"]])))
## Best AUC: 0.9707243
# get feature importance
implgb = lgb.importance(model, percentage = TRUE)
kable(implgb)
| Feature | Gain | Cover | Frequency |
|---|---|---|---|
| channel | 0.4664055 | 0.2398605 | 0.2185519 |
| app | 0.4207643 | 0.1703252 | 0.1197865 |
| ip_ct | 0.0567847 | 0.0856857 | 0.0613947 |
| os | 0.0200849 | 0.1520078 | 0.1544878 |
| hour | 0.0077292 | 0.0669085 | 0.1074408 |
| min | 0.0063728 | 0.1060428 | 0.1695028 |
| device | 0.0048178 | 0.0948599 | 0.0360360 |
| app_ch | 0.0046465 | 0.0115593 | 0.0236904 |
| ip_h_dev | 0.0032183 | 0.0165658 | 0.0250250 |
| ip_os_dev_nxcl | 0.0031793 | 0.0123481 | 0.0246914 |
| ip_h_ch | 0.0020276 | 0.0096597 | 0.0140140 |
| ip_h_os | 0.0018975 | 0.0105659 | 0.0126793 |
| ip_d_h | 0.0011819 | 0.0142438 | 0.0156823 |
| ip_h_app | 0.0006370 | 0.0040595 | 0.0113447 |
| day | 0.0002526 | 0.0053076 | 0.0056723 |
val_preds = predict(model, data = as.matrix(testset[, colnames(testset) != "is_attributed"]), n = model$best_iter)
# Classification
cutoff <- 0.5
predLGBM <- ifelse(val_preds > cutoff, 1, 0)
# Confusion Matrix
confusionMatrix(table(pred = predLGBM, data = testset$is_attributed))
## Confusion Matrix and Statistics
##
## data
## pred 0 1
## 0 645328 19588
## 1 10804 114379
##
## Accuracy : 0.9615
## 95% CI : (0.9611, 0.962)
## No Information Rate : 0.8304
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8597
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9835
## Specificity : 0.8538
## Pos Pred Value : 0.9705
## Neg Pred Value : 0.9137
## Prevalence : 0.8304
## Detection Rate : 0.8168
## Detection Prevalence : 0.8416
## Balanced Accuracy : 0.9187
##
## 'Positive' Class : 0
##
#plot ROC
roc(testset$is_attributed, val_preds, plot = TRUE, col = "steelblue",
lwd = 1, levels=base::levels(as.factor(testset$is_attributed)),
grid=TRUE)
## Setting direction: controls < cases
##
## Call:
## roc.default(response = testset$is_attributed, predictor = val_preds, levels = base::levels(as.factor(testset$is_attributed)), plot = TRUE, col = "steelblue", lwd = 1, grid = TRUE)
##
## Data: val_preds in 656132 controls (testset$is_attributed 0) < 133967 cases (testset$is_attributed 1).
## Area under the curve: 0.9707
rm(balanced_sample, test, testset, train)
invisible(gc())
# Read the files
testdata <- fread("test.csv", stringsAsFactors = F,sep = ",",
na.strings = "")
# Creating features
testdata$click_time <- as.POSIXct(testdata$click_time)
# Splitting the click time and date
testdata <- testdata %>%
mutate(day = wday(testdata$click_time)) %>%
mutate(hour = hour(testdata$click_time)) %>%
mutate(min = minute(testdata$click_time))
# Add some count vars
testdata <- testdata %>%
add_count(ip, name = 'ip_ct') %>%
add_count(ip, day, hour, name = 'ip_d_h') %>%
add_count(ip, hour, channel, name = 'ip_h_ch') %>%
add_count(ip, hour, os, name = 'ip_h_os') %>%
add_count(ip, hour, app, name = 'ip_h_app') %>%
add_count(ip, hour, device, name = 'ip_h_dev')
# How popular is the app or channel?
testdata <- testdata %>%
add_count(app, channel, name = 'app_ch')
#next click by ip, os, device
testdata <- testdata %>%
group_by(ip, os, device) %>%
arrange(click_time) %>%
mutate(ip_os_dev_nxcl = next_click(click_time))
# Excluding variables that will be not used
testdata$click_time <- NULL
testdata$ip <- NULL
# Predictions
val_preds = predict(model, data = as.matrix(testdata[, colnames(testdata) != "click_id"]), n = model$best_iter)
# Classification
cutoff <- 0.5
is_attributed <- ifelse(val_preds > cutoff, 1, 0)
predictions <- cbind(as.integer(testdata$click_id), is_attributed)
colnames(predictions) <- c('click_id', 'is_attributed')
fwrite(predictions, 'predictionsLGBM.csv', sep = ',',
col.names = TRUE)