According to an article in The New York Times:
"Data scientists, according to interviews and expert estimates, spend from 50 percent to 80 percent of their time mired in this more mundane labor of collecting and preparing unruly digital data, before it can be explored for useful nuggets."
# Clear workspace:
rm(list = ls())
# Load packages and data:
library(tidyverse)
library(stringi)
library(knitr)
read_csv("C:/Users/Admin/Documents/train.csv") -> trainData
read_csv("C:/Users/Admin/Documents/test.csv") -> testData
# Check data type for train and test:
data.frame(fromTrain = sapply(trainData %>% select(-label), class), fromTest = sapply(testData, class)) -> df_dataType
df_dataType %>% mutate(col_name = row.names(.)) -> df_dataType
kable(df_dataType %>% slice(1:6))
df_dataType %>%
filter(fromTrain != fromTest) %>%
kable()
trainData$FIELD_36 %>% unique()
testData$FIELD_36 %>% unique()
# Convert to logical:
testData %>%
mutate(FIELD_36 = case_when(FIELD_36 == "FALSE" ~ FALSE, FIELD_36 == "TRUE" ~ TRUE, TRUE ~ NA)) -> testData
# Convert all string columns to lower:
trainData %>% mutate_if(is.character, function(x) {str_to_lower(x) %>% stri_trans_general("Latin-ASCII")}) -> trainData
testData %>% mutate_if(is.character, function(x) {str_to_lower(x) %>% stri_trans_general("Latin-ASCII")}) -> testData
# Check intersection:
intersect_between <- function(col_name) {
  from_train <- trainData %>% pull(col_name) %>% unique()
  from_test <- testData %>% pull(col_name) %>% unique()
  # Equal counts of unique values do not guarantee equal level sets, so compare the sets directly:
  condition <- setequal(from_train, from_test)
  conclusion <- if (condition) "yes" else "no"
  return(data.frame(feature = col_name, in_train = conclusion))
}
# All categorical features:
trainData %>% select_if(is.character) %>% names() -> all_categoricals
# Use the function:
do.call("bind_rows", lapply(all_categoricals, intersect_between)) -> df_check_diff
df_check_diff %>% filter(in_train == "no") -> df_diff
df_diff %>% kable()
features_diff <- df_diff$feature
# An example:
sapply(trainData %>% select(all_of(features_diff)), n_distinct)
sapply(testData %>% select(all_of(features_diff)), n_distinct)
# Function harmonizes for train:
harmonize_for_train <- function(col) {
from_Train <- trainData[, col, drop = TRUE]
from_Test <- testData[, col, drop = TRUE]
dplyr::intersect(from_Train, from_Test) -> intersections
new_values_forTrain <- case_when(from_Train %in% intersections ~ from_Train, TRUE ~ NA_character_)
# new_values_forTest <- case_when(from_Test %in% intersections ~ from_Test, TRUE ~ NA_character_)
return(new_values_forTrain)
}
# Function harmonizes for test:
harmonize_for_test <- function(col) {
from_Train <- trainData[, col, drop = TRUE]
from_Test <- testData[, col, drop = TRUE]
dplyr::intersect(from_Train, from_Test) -> intersections
# new_values_forTrain <- case_when(from_Train %in% intersections ~ from_Train, TRUE ~ NA_character_)
new_values_forTest <- case_when(from_Test %in% intersections ~ from_Test, TRUE ~ NA_character_)
return(new_values_forTest)
}
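Since the two helpers above differ only in which vector they return, they could be collapsed into one function. Below is a minimal sketch, assuming the same global trainData/testData objects; the "side" argument is my own naming, not part of the original code:
# A combined variant (sketch): "side" selects which harmonized vector to return.
harmonize_between <- function(col, side = c("train", "test")) {
  side <- match.arg(side)
  from_Train <- trainData[, col, drop = TRUE]
  from_Test <- testData[, col, drop = TRUE]
  intersections <- dplyr::intersect(from_Train, from_Test)
  values <- if (side == "train") from_Train else from_Test
  case_when(values %in% intersections ~ values, TRUE ~ NA_character_)
}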
sapply(features_diff, harmonize_for_train) %>% data.frame() -> df_harmonized_train
sapply(features_diff, harmonize_for_test) %>% data.frame() -> df_harmonized_test
trainData %>% select(-all_of(features_diff)) %>% bind_cols(df_harmonized_train) -> trainData
testData %>% select(-all_of(features_diff)) %>% bind_cols(df_harmonized_test) -> testData
# Check:
sapply(trainData %>% select(all_of(features_diff)), n_distinct)
sapply(testData %>% select(all_of(features_diff)), n_distinct)
# Convert logical to integer:
trainData %>% mutate_if(is.logical, as.integer) -> trainData
testData %>% mutate_if(is.logical, as.integer) -> testData
# Combine train and test set:
totalData <- bind_rows(trainData, testData)
# Convert NaN to missing:
sum(is.nan(totalData$FIELD_54))
totalData %>%
mutate_if(is.double, function(x) {case_when(is.nan(x) ~ NA_real_, TRUE ~ x)}) %>%
mutate_if(is.integer, function(x) {case_when(is.nan(x) ~ NA_integer_, TRUE ~ x)}) -> totalData
# Check again:
sum(is.nan(totalData$FIELD_54))
# Convert "none" to missing for categoricals:
totalData %>% mutate_if(is.character, function(x) {case_when(x == "none" ~ NA_character_, TRUE ~ x)}) -> totalData
# Some special columns: character features that encode logical values as "true"/"false":
logical_condition <- sapply(totalData %>% select_if(is.character), function(x) {sum(x %in% c("true", "false"))}) > 0
all_columns <- names(totalData %>% select_if(is.character))
var_logical_character <- all_columns[logical_condition]
totalData %>%
select(var_logical_character) %>%
sample_n(20) %>%
kable()
totalData %>% mutate_at(var_logical_character, function(x) {case_when(x == "true" ~ 1L, x == "false" ~ 0L, TRUE ~ NA_integer_)}) -> totalData
# Check again:
totalData %>%
select(var_logical_character) %>%
sample_n(20) %>%
kable()
# Convert to dummies for categorical features:
library(caret)
dummies <- dummyVars("~ .", totalData %>% select_if(is.character))
predict(dummies, totalData) %>% as.data.frame() -> features_oneHot
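As an aside, caret's dummyVars also accepts fullRank = TRUE, which drops one dummy per factor and avoids perfectly collinear columns. Tree-based models do not need this, so the call above is fine; for linear models a variant might look like the following sketch:
# Full-rank encoding (sketch); mostly relevant when feeding linear models:
dummies_fr <- dummyVars("~ .", totalData %>% select_if(is.character), fullRank = TRUE)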
# Final data for modelling:
totalData %>%
select_if(is.numeric) %>%
bind_cols(features_oneHot) -> df_final
# Rename for columns:
all_names <- names(df_final)
names(df_final) <- all_names %>%
str_replace_all("\\[|\\]|\\,|\\<|\\'| ", "_")
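A quick sanity check (my own addition, not in the original workflow) confirms that none of the problematic characters survive the renaming:
# Should return 0 if every special character was replaced:
sum(str_detect(names(df_final), "\\[|\\]|\\,|\\<|\\'| "))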
# Save data:
write_csv(df_final, "C:/Users/Admin/Documents/df_final.csv")
R codes:
# Convert features to DMatrix form:
X_train <- df_final %>%
filter(!is.na(label)) %>%
select(-id, -label) %>%
as.matrix()
Y_train <- df_final %>%
filter(!is.na(label)) %>%
pull(label)
X_test <- df_final %>%
filter(is.na(label)) %>%
select(-id, -label) %>%
as.matrix()
#------------------------------------------
# Train XGBoost with default parameters
#------------------------------------------
library(xgboost)
# Convert to DMatrix form for train data:
dtrain <- xgb.DMatrix(data = X_train, label = Y_train)
# Train a default XGBoost:
xgb1 <- xgboost(data = dtrain,
objective = "binary:logistic",
verbose = 1,
nrounds = 250)
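The 250 rounds above are a fixed guess. As a sketch, xgb.cv with early stopping could pick nrounds from the data instead; the fold count and stopping patience below are assumptions, not tuned values:
# Cross-validated choice of nrounds (sketch); nfold = 5 and the 30-round patience are assumptions:
cv_res <- xgb.cv(data = dtrain,
objective = "binary:logistic",
nrounds = 1000,
nfold = 5,
metrics = "auc",
early_stopping_rounds = 30,
verbose = 0)
cv_res$best_iteration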
xgb1$evaluation_log %>%
data.frame() %>%
ggplot(aes(iter, train_error)) +
geom_line() +
labs(title = "Figure 1: XGBoost Training Process with missing data")
# Make prediction:
prediction <- predict(xgb1, X_test)
# DF of results:
data.frame(id = df_final %>% filter(is.na(label)) %>% pull(id), label = prediction) -> df_sub
# Save for submission:
write_csv(df_sub, "df_subR.csv")
Python codes:
# Load data:
import pandas as pd
df = pd.read_csv("C:/Users/Admin/Documents/df_final.csv")
# Split data:
df_train = df[df['label'].notnull()]
X = df_train.drop(labels=["id", "label"], axis=1)
Y = df_train["label"]
df_test = df[df['label'].isnull()]
df_test = df_test.drop(labels=["id", "label"], axis=1)
# =================
# Train XGboost
# =================
# Train XGBClassifier with cross-validation:
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=29)
xgb1 = XGBClassifier(random_state=29)
auc_scores1 = cross_val_score(xgb1, X, Y, cv=cv, scoring="roc_auc", n_jobs=-1)
auc_scores1.mean()
xgb1.fit(X, Y)
preds = xgb1.predict_proba(df_test)[:, 1]
# Re-select the test rows to recover the id column (it was dropped above):
df_test = df[df['label'].isnull()]
df_submission = df_test.assign(label=preds)
df_submission = df_submission[['id', 'label']]
df_submission.to_csv("C:/Users/Admin/Documents/df_subPython.csv", index=False)
Codes for Random Forest:
totalData %>%
mutate_if(is.character, function(x) {x %>% str_replace_all("\\[|\\]|\\,|\\<|\\'| ", "_")}) -> totalData
df_train <- totalData %>%
filter(!is.na(label)) %>%
select(-id)
df_test <- totalData %>%
filter(is.na(label)) %>%
select(-id, -label)
# Generates optimal binning for all variables/features:
library(scorecard)
bins_var <- woebin(df_train, y = "label", no_cores = 8, positive = "label|1")
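Before computing IVs, it can help to eyeball a binning. scorecard's woebin_plot draws the WoE bins for a chosen feature; FIELD_7 below is just an illustrative column name:
# Plot WoE bins for one feature; "FIELD_7" is an illustrative name:
woebin_plot(bins_var, x = "FIELD_7")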
# IV for variables/features:
do.call("rbind", bins_var) %>%
as.data.frame() %>%
filter(!duplicated(variable)) %>%
rename(iv_var = total_iv) %>%
arrange(iv_var) %>%
mutate(variable = factor(variable, levels = variable)) -> iv_values
# Keep features with IV >= 0 (a threshold of 0 keeps every feature; raise it to be more selective):
iv_values %>%
filter(iv_var >= 0) %>%
pull(variable) %>%
as.character() -> var_IV_10
# Apply the WoE transformation based on the binning and keep the selected features:
train_woe <- woebin_ply(df_train, bins_var) %>%
as.data.frame() %>%
select(all_of(c("label", paste0(var_IV_10, "_woe"))))
write_csv(train_woe, "C:/Users/Admin/Documents/train_woe.csv")
test_woe <- woebin_ply(df_test, bins_var) %>%
as.data.frame() %>%
select(all_of(paste0(var_IV_10, "_woe")))
# Relabel for target:
train_woe %>%
mutate(label = case_when(label == 1 ~ "Bad", TRUE ~ "Good")) %>%
mutate(label = as.factor(label)) -> df_forGBM
# Convert WoE columns to factors (the commented-out line would min-max scale them instead):
df_forGBM %>%
# mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))}) %>%
mutate_if(is.numeric, as.factor) -> df_forGBM_Scaled
test_woe %>%
# mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))}) %>%
mutate_if(is.numeric, as.factor) -> df_test_Scaled
# Train Random Forest:
library(ranger)
RF_default <- ranger(label ~ ., data = df_forGBM_Scaled, probability = TRUE, seed = 29)
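ranger stores an out-of-bag prediction error on the fitted object (the Brier score when probability = TRUE), which gives a quick sanity check before predicting:
# OOB prediction error; Brier score because probability = TRUE:
RF_default$prediction.error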
# Use the RF Classifier for predicting PD (Probability of Default):
pd_sub_RF <- predict(RF_default, df_test_Scaled, type = "response")
# Save results for submission:
pd_sub_RF$predictions %>% as.data.frame() %>% pull(Bad) -> pd_sub_RF
df_sub <- data.frame(id = 30000:49999, label = pd_sub_RF)
write_csv(df_sub, "submission_RandomForest_ScaledData.csv")
Codes for XGBoost:
df_final <- read_csv("C:/Users/Admin/Documents/train_woe.csv")
# Convert features to DMatrix form:
X_train <- df_final %>%
filter(!is.na(label)) %>%
select(-label) %>%
as.matrix()
# Pull labels from the re-loaded data (the Y_train from the earlier section would be stale):
Y_train <- df_final %>%
filter(!is.na(label)) %>%
pull(label)
X_test <- test_woe %>% as.matrix()
#------------------------------------------
# Train XGBoost with default parameters
#------------------------------------------
library(xgboost)
# Convert to DMatrix form for train data:
dtrain <- xgb.DMatrix(data = X_train, label = Y_train)
# Train a default XGBoost:
xgb2 <- xgboost(data = dtrain,
objective = "binary:logistic",
verbose = 1,
nrounds = 1000)
# Training process:
xgb2$evaluation_log %>%
data.frame() %>%
ggplot(aes(iter, train_error)) +
geom_line() +
labs(title = "Figure 2: XGBoost Training Process, WOE Transformation")
# Use for predicting:
pd_xgb_woe <- predict(xgb2, X_test)
# Save result for submission:
df_sub_xgb_woe <- data.frame(id = 30000:49999, label = pd_xgb_woe)
write_csv(df_sub_xgb_woe, "df_sub_xgb_woe.csv")