# load the required libraries
library("tidyverse") #for read_csv() among other functions
library("readxl") # used to read excel files
library("dplyr") # used for data munging 
library("FNN") # used for knn regression (knn.reg function)
library("caret") # used for various predictive models
library("class") # for using confusion matrix function
library("rpart.plot") # used to plot decision tree
library("rpart") # used for Regression tree
library("glmnet") # used for Lasso and Ridge regression
library("NeuralNetTools") # used to plot Neural Networks
library("PRROC") # top plot ROC curve
library("ROCR") # top plot lift curve
library("skimr") # to provide summary statistics about variables in data frames
library("e1071") # used for generalized k-nearest neighbour algorithm
library("mice") # used for Multivariate Imputation by Chained Equations
library("dummies") # used for one hot coding of categorical variables
library("ggplot2") # used for visualization & graphs

1. Classification

1.1 Data loading and transformation

HighNote is an online community, similar to Spotify (and LinkedIn and OkCupid), in which free users and premium users co-exist. The HighNote dataset has a variable called adopter, which is 1 if the user adopts the premium level and 0 if the user remains a free user. It has two categories of X variables about these users: delta1_* variables describing how their behaviors and engagement changed in the previous time period, and variables describing their levels in the current period.

# Load the HighNote data set
#highnote_data <- read_csv("HN_data_PostModule.csv")
highnote <- read_csv("HN_data_PostModule.csv", 
                     col_types = "cnfnnnnnnnnnnnfnnnnnnnnnnff") # original copy
highnote_data = highnote # data for imputation later

# Get a preview of the data.
glimpse(highnote_data)
## Rows: 107,213
## Columns: 27
## $ net_user                     <chr> "tinaj5920", "tinalabina", "tinamachine",…
## $ age                          <dbl> NA, NA, 22, 31, NA, 35, 27, NA, 16, 21, N…
## $ male                         <fct> 0, NA, 0, 0, NA, 0, 1, 1, 0, 0, NA, 0, 0,…
## $ friend_cnt                   <dbl> 20, 3, 8, 0, 1, 2, 2, 3, 1, 28, 1, 65, 1,…
## $ avg_friend_age               <dbl> 30.28571, 30.50000, 22.57143, NA, NA, 28.…
## $ avg_friend_male              <dbl> 0.7368421, 0.3333333, 0.4285714, NA, 1.00…
## $ friend_country_cnt           <dbl> 14, 1, 1, 0, 1, 2, 1, 2, 1, 7, 1, 9, 1, 4…
## $ subscriber_friend_cnt        <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
## $ songsListened                <dbl> 8414, 1943, 9687, 26863, 187, 0, 508, 371…
## $ lovedTracks                  <dbl> 348, 0, 194, 12, 0, 0, 0, 34, 6, 32, 3, 2…
## $ posts                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,…
## $ playlists                    <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,…
## $ shouts                       <dbl> 6, 3, 8, 2, 7, 0, 2, 1, 1, 1, 1, 81, 0, 2…
## $ tenure                       <dbl> 59, 34, 59, 55, 52, 35, 42, 25, 7, 25, 22…
## $ good_country                 <fct> 1, NA, 1, 0, NA, 0, 0, 1, 0, 0, NA, 0, 1,…
## $ delta1_friend_cnt            <dbl> 8, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,…
## $ delta1_avg_friend_age        <dbl> 0.6190476, 3.5000000, 0.0000000, NA, NA, …
## $ delta1_avg_friend_male       <dbl> -0.01315790, -0.16666667, 0.00000000, NA,…
## $ delta1_friend_country_cnt    <dbl> 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ delta1_subscriber_friend_cnt <dbl> -1, 0, 0, 0, 0, 0, -1, 0, 0, 1, 0, 0, 0, …
## $ delta1_songsListened         <dbl> 2918, 0, 0, 0, 0, 0, 0, 0, 1852, 842, 0, …
## $ delta1_lovedTracks           <dbl> 60, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ delta1_posts                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ delta1_playlists             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ delta1_shouts                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ delta1_good_country          <fct> 0, NA, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0, 0,…
## $ adopter                      <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
skim(highnote_data)
Data summary
Name highnote_data
Number of rows 107213
Number of columns 27
_______________________
Column type frequency:
character 1
factor 4
numeric 22
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
net_user 0 1 1 19 0 107205 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
male 0 1 FALSE 3 1: 42548, NA: 38950, 0: 25715
good_country 0 1 FALSE 3 0: 43025, NA: 39155, 1: 25033
delta1_good_country 0 1 FALSE 4 0: 67728, NA: 39393, 1: 57, -1: 35
adopter 0 1 FALSE 2 0: 100000, 1: 7213

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 50859 0.53 24.39 6.84 8 20.00 23.00 27.00 79.0 ▇▇▁▁▁
friend_cnt 1 1.00 12.23 48.19 0 1.00 3.00 10.00 5089.0 ▇▁▁▁▁
avg_friend_age 21626 0.80 24.61 5.73 8 20.99 23.33 27.00 79.0 ▆▇▁▁▁
avg_friend_male 16517 0.85 0.63 0.35 0 0.40 0.67 1.00 1.0 ▃▂▃▃▇
friend_country_cnt 1 1.00 2.79 4.98 0 1.00 1.00 3.00 136.0 ▇▁▁▁▁
subscriber_friend_cnt 1 1.00 0.33 2.12 0 0.00 0.00 0.00 309.0 ▇▁▁▁▁
songsListened 0 1.00 12863.89 25193.67 0 439.00 3432.00 14785.00 1000000.0 ▇▁▁▁▁
lovedTracks 0 1.00 77.76 284.19 0 0.00 9.00 58.00 44005.0 ▇▁▁▁▁
posts 0 1.00 3.77 93.96 0 0.00 0.00 0.00 15185.0 ▇▁▁▁▁
playlists 0 1.00 0.53 6.14 0 0.00 0.00 1.00 1943.0 ▇▁▁▁▁
shouts 1927 0.98 20.85 261.08 0 1.00 2.00 6.00 65872.0 ▇▁▁▁▁
tenure 32 1.00 39.55 19.29 0 24.00 38.00 54.00 111.0 ▅▇▆▂▁
delta1_friend_cnt 26 1.00 0.47 5.34 -486 0.00 0.00 0.00 521.0 ▁▁▇▁▁
delta1_avg_friend_age 22057 0.79 0.23 0.71 -50 0.00 0.14 0.38 19.5 ▁▁▁▇▁
delta1_avg_friend_male 16826 0.84 0.00 0.05 -1 0.00 0.00 0.00 1.0 ▁▁▇▁▁
delta1_friend_country_cnt 26 1.00 0.06 0.68 -52 0.00 0.00 0.00 30.0 ▁▁▁▇▁
delta1_subscriber_friend_cnt 26 1.00 -0.01 0.48 -23 0.00 0.00 0.00 43.0 ▁▇▁▁▁
delta1_songsListened 1001 0.99 587.81 2135.08 -93584 0.00 0.00 310.00 129096.0 ▁▁▇▁▁
delta1_lovedTracks 27 1.00 3.26 24.32 -1040 0.00 0.00 0.00 1338.0 ▁▁▇▁▁
delta1_posts 27 1.00 0.15 13.53 -6 0.00 0.00 0.00 3391.0 ▇▁▁▁▁
delta1_playlists 27 1.00 0.00 0.38 -8 0.00 0.00 0.00 116.0 ▇▁▁▁▁
delta1_shouts 1995 0.98 0.78 36.46 -953 0.00 0.00 0.00 10022.0 ▇▁▁▁▁
## categorical features
highnote_data %>%  keep(is.factor) %>%  summary()
##  male       good_country delta1_good_country adopter   
##  0 :25715   1 :25033     0 :67728            0:100000  
##  NA:38950   NA:39155     NA:39393            1:  7213  
##  1 :42548   0 :43025     1 :   57                      
##                          -1:   35
# let's fix the NAs in the categorical variables below
  # male
  # good_country
  # delta1_good_country
  # we will update all the "NA" values to "UNK" for the above categorical
  # variables in our data set
  # in order to change NA to UNK we have to convert the factor to character,
  # then replace NA with UNK, then convert back to factor
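
Since the same convert-replace-convert pattern is repeated three times below, it could also be wrapped in a small helper (a sketch; na_to_unk is a hypothetical name, and the code below keeps the explicit per-variable steps):

na_to_unk <- function(x) {
  x <- as.character(x)
  x[is.na(x) | x == "NA"] <- "UNK"  # covers true NAs as well as literal "NA" strings
  as.factor(x)
}
# e.g. highnote_data$male <- na_to_unk(highnote_data$male)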

## male
summary(select(highnote_data, male))
##  male      
##  0 :25715  
##  NA:38950  
##  1 :42548
# change "male" to character variable
highnote_data$male  = as.character(highnote_data$male)

# update "NA" to "UNK"
highnote_data$male = as.factor(ifelse(highnote_data$male=="NA", 'UNK', highnote_data$male))

# view male variable after converting NAs to UNKs
summary(select(highnote_data, male))
##   male      
##  0  :25715  
##  1  :42548  
##  UNK:38950
## good_country
summary(select(highnote_data, good_country))
##  good_country
##  1 :25033    
##  NA:39155    
##  0 :43025
# change "good_country" to character variable
highnote_data$good_country  = as.character(highnote_data$good_country)

# update "NA" to "UNK"
highnote_data$good_country = as.factor(ifelse(highnote_data$good_country=="NA", 'UNK', 
                                              highnote_data$good_country))

# view good_country variable after converting NAs to UNKs
summary(select(highnote_data, good_country))
##  good_country
##  0  :43025   
##  1  :25033   
##  UNK:39155
## delta1_good_country
summary(select(highnote_data, delta1_good_country))
##  delta1_good_country
##  0 :67728           
##  NA:39393           
##  1 :   57           
##  -1:   35
# change "delta1_good_country" to character variable
highnote_data$delta1_good_country  = as.character(highnote_data$delta1_good_country)

# update "NA" to "UNK"
highnote_data$delta1_good_country = as.factor(ifelse(highnote_data$delta1_good_country=="NA", 'UNK', 
                                                     highnote_data$delta1_good_country))

# view delta1_good_country variable after converting NAs to UNKs
summary(select(highnote_data, delta1_good_country))
##  delta1_good_country
##  -1 :   35          
##  0  :67728          
##  1  :   57          
##  UNK:39393
## Imputation Method : Using Mean and Mode
# Continuous features
# Get the summary statistics for the continuous features
highnote_data %>% keep(is.numeric) %>% skim()
Data summary
Name Piped data
Number of rows 107213
Number of columns 22
_______________________
Column type frequency:
numeric 22
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 50859 0.53 24.39 6.84 8 20.00 23.00 27.00 79.0 ▇▇▁▁▁
friend_cnt 1 1.00 12.23 48.19 0 1.00 3.00 10.00 5089.0 ▇▁▁▁▁
avg_friend_age 21626 0.80 24.61 5.73 8 20.99 23.33 27.00 79.0 ▆▇▁▁▁
avg_friend_male 16517 0.85 0.63 0.35 0 0.40 0.67 1.00 1.0 ▃▂▃▃▇
friend_country_cnt 1 1.00 2.79 4.98 0 1.00 1.00 3.00 136.0 ▇▁▁▁▁
subscriber_friend_cnt 1 1.00 0.33 2.12 0 0.00 0.00 0.00 309.0 ▇▁▁▁▁
songsListened 0 1.00 12863.89 25193.67 0 439.00 3432.00 14785.00 1000000.0 ▇▁▁▁▁
lovedTracks 0 1.00 77.76 284.19 0 0.00 9.00 58.00 44005.0 ▇▁▁▁▁
posts 0 1.00 3.77 93.96 0 0.00 0.00 0.00 15185.0 ▇▁▁▁▁
playlists 0 1.00 0.53 6.14 0 0.00 0.00 1.00 1943.0 ▇▁▁▁▁
shouts 1927 0.98 20.85 261.08 0 1.00 2.00 6.00 65872.0 ▇▁▁▁▁
tenure 32 1.00 39.55 19.29 0 24.00 38.00 54.00 111.0 ▅▇▆▂▁
delta1_friend_cnt 26 1.00 0.47 5.34 -486 0.00 0.00 0.00 521.0 ▁▁▇▁▁
delta1_avg_friend_age 22057 0.79 0.23 0.71 -50 0.00 0.14 0.38 19.5 ▁▁▁▇▁
delta1_avg_friend_male 16826 0.84 0.00 0.05 -1 0.00 0.00 0.00 1.0 ▁▁▇▁▁
delta1_friend_country_cnt 26 1.00 0.06 0.68 -52 0.00 0.00 0.00 30.0 ▁▁▁▇▁
delta1_subscriber_friend_cnt 26 1.00 -0.01 0.48 -23 0.00 0.00 0.00 43.0 ▁▇▁▁▁
delta1_songsListened 1001 0.99 587.81 2135.08 -93584 0.00 0.00 310.00 129096.0 ▁▁▇▁▁
delta1_lovedTracks 27 1.00 3.26 24.32 -1040 0.00 0.00 0.00 1338.0 ▁▁▇▁▁
delta1_posts 27 1.00 0.15 13.53 -6 0.00 0.00 0.00 3391.0 ▇▁▁▁▁
delta1_playlists 27 1.00 0.00 0.38 -8 0.00 0.00 0.00 116.0 ▇▁▁▁▁
delta1_shouts 1995 0.98 0.78 36.46 -953 0.00 0.00 0.00 10022.0 ▇▁▁▁▁
## Age
# There are a lot of missing values for age. We will use mean imputation 
# (by Gender) to resolve the missing values of age.
skim(highnote_data$age)
Data summary
Name highnote_data$age
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 50859 0.53 24.39 6.84 8 20 23 27 79 ▇▇▁▁▁
highnote_data <- highnote_data %>%
  group_by(male) %>%
  mutate(age = ifelse(is.na(age), mean(age, na.rm = TRUE), age)) %>%
  ungroup()

# after change
skim(highnote_data$age)
Data summary
Name highnote_data$age
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 24.53 4.99 8 22 24.85 25.28 79 ▃▇▁▁▁
## friend_cnt
# plot friend count against adoption status, colored by gender
ggplot(data = highnote_data) + 
  geom_point(mapping = aes(x = friend_cnt, y = adopter, color = male))

# There is 1 missing value for friend_cnt. We will use median imputation to resolve the NA.
# Note that we used the median this time instead of the mean:
# the mean wouldn't have given us a reasonable value, since we cannot have a
# fractional number of friends (see the quick check after the imputation below).
highnote_data <- highnote_data %>%
  mutate(friend_cnt = ifelse(is.na(friend_cnt), 
                                 median(friend_cnt, na.rm = TRUE), 
                                 friend_cnt))
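
The skew that motivates the median is visible in the summary statistics (values match the skim output above; shown as a quick check):

mean(highnote_data$friend_cnt)    # ~12.2, inflated by the long right tail (max 5089)
median(highnote_data$friend_cnt)  # 3, a plausible whole number of friends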

# after change
skim(highnote_data$friend_cnt)
Data summary
Name highnote_data$friend_cnt
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 12.23 48.19 0 1 3 10 5089 ▇▁▁▁▁
## avg_friend_age, use mean for missing values
highnote_data <- highnote_data %>%
  mutate(avg_friend_age = ifelse(is.na(avg_friend_age), 
                                 mean(avg_friend_age, na.rm = TRUE), 
                                 avg_friend_age))

# after change  
skim(highnote_data$avg_friend_age)
Data summary
Name highnote_data$avg_friend_…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 24.61 5.12 8 21.45 24.61 26 79 ▃▇▁▁▁
## avg_friend_male, use mean for missing values
highnote_data <- highnote_data %>%
  mutate(avg_friend_male = ifelse(is.na(avg_friend_male), 
                                  mean(avg_friend_male, na.rm = TRUE), 
                                  avg_friend_male))

# after change 
skim(highnote_data$avg_friend_male)
Data summary
Name highnote_data$avg_friend_…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.63 0.33 0 0.5 0.63 1 1 ▃▂▃▇▇
## friend_country_cnt - Number of different countries this user's friends are from, 
# use median for missing values
highnote_data <- highnote_data %>%
  mutate(friend_country_cnt = ifelse(is.na(friend_country_cnt), 
                                     median(friend_country_cnt, na.rm = TRUE), 
                                     friend_country_cnt))

# after change 
skim(highnote_data$friend_country_cnt)
Data summary
Name highnote_data$friend_coun…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 2.79 4.98 0 1 1 3 136 ▇▁▁▁▁
## subscriber_friend_cnt, use median for missing values
highnote_data <- highnote_data %>%
  mutate(subscriber_friend_cnt = ifelse(is.na(subscriber_friend_cnt), 
                                        median(subscriber_friend_cnt, na.rm = TRUE), 
                                        subscriber_friend_cnt))

# after change
skim(highnote_data$subscriber_friend_cnt)
Data summary
Name highnote_data$subscriber_…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.33 2.12 0 0 0 0 309 ▇▁▁▁▁
## shouts, use median for missing values
highnote_data <- highnote_data %>%
  mutate(shouts = ifelse(is.na(shouts), 
                         median(shouts, na.rm = TRUE), 
                         shouts)) 

# after change
skim(highnote_data$shouts)
Data summary
Name highnote_data$shouts
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 20.51 258.74 0 1 2 6 65872 ▇▁▁▁▁
## tenure, use mean for missing values. 
highnote_data <- highnote_data %>%
  mutate(tenure = ifelse(is.na(tenure), 
                                 mean(tenure, na.rm = TRUE), 
                                 tenure))
# after change
skim(highnote_data$tenure)
Data summary
Name highnote_data$tenure
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 39.55 19.28 0 24 38 54 111 ▅▇▆▂▁
## delta1_friend_cnt, use median for missing values
highnote_data <- highnote_data %>%
  mutate(delta1_friend_cnt = ifelse(is.na(delta1_friend_cnt), 
                                    median(delta1_friend_cnt, na.rm = TRUE), 
                                    delta1_friend_cnt))
# after change
skim(highnote_data$delta1_friend_cnt)
Data summary
Name highnote_data$delta1_frie…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.47 5.34 -486 0 0 0 521 ▁▁▇▁▁
## delta1_avg_friend_age, use mean for missing values
highnote_data <- highnote_data %>%
  mutate(delta1_avg_friend_age = ifelse(is.na(delta1_avg_friend_age), 
                                        mean(delta1_avg_friend_age, na.rm = TRUE), 
                                        delta1_avg_friend_age)) 
# after change
skim(highnote_data$delta1_avg_friend_age)
Data summary
Name highnote_data$delta1_avg_…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.23 0.63 -50 0 0.23 0.32 19.5 ▁▁▁▇▁
## delta1_avg_friend_male, use mean for missing values
highnote_data <- highnote_data %>%
  mutate(delta1_avg_friend_male = ifelse(is.na(delta1_avg_friend_male), 
                                         mean(delta1_avg_friend_male, na.rm = TRUE), 
                                         delta1_avg_friend_male))
# after change
skim(highnote_data$delta1_avg_friend_male)
Data summary
Name highnote_data$delta1_avg_…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0 0.05 -1 0 0 0 1 ▁▁▇▁▁
## delta1_friend_country_cnt, use median for missing values.
highnote_data <- highnote_data %>%
  mutate(delta1_friend_country_cnt = ifelse(is.na(delta1_friend_country_cnt), 
                                 median(delta1_friend_country_cnt, na.rm = TRUE), 
                                 delta1_friend_country_cnt))

# after change
skim(highnote_data$delta1_friend_country_cnt)
Data summary
Name highnote_data$delta1_frie…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.06 0.68 -52 0 0 0 30 ▁▁▁▇▁
## delta1_subscriber_friend_cnt, use median for missing values.
highnote_data <- highnote_data %>%
  mutate(delta1_subscriber_friend_cnt = ifelse(is.na(delta1_subscriber_friend_cnt), 
                                 median(delta1_subscriber_friend_cnt, na.rm = TRUE), 
                                 delta1_subscriber_friend_cnt))

# after change
skim(highnote_data$delta1_subscriber_friend_cnt)
Data summary
Name highnote_data$delta1_subs…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 -0.01 0.48 -23 0 0 0 43 ▁▇▁▁▁
## delta1_songsListened, use median for missing values
highnote_data <- highnote_data %>%
  mutate(delta1_songsListened = ifelse(is.na(delta1_songsListened), 
                                       median(delta1_songsListened, na.rm = TRUE), 
                                       delta1_songsListened))

# after change
skim(highnote_data$delta1_songsListened)
Data summary
Name highnote_data$delta1_song…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 582.32 2125.84 -93584 0 0 298 129096 ▁▁▇▁▁
## delta1_lovedTracks, use median for missing values. 
highnote_data <- highnote_data %>%
  mutate(delta1_lovedTracks = ifelse(is.na(delta1_lovedTracks), 
                                 median(delta1_lovedTracks, na.rm = TRUE), 
                                 delta1_lovedTracks))
# after change
skim(highnote_data$delta1_lovedTracks)
Data summary
Name highnote_data$delta1_love…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 3.26 24.32 -1040 0 0 0 1338 ▁▁▇▁▁
## delta1_posts, use median for missing values.
highnote_data <- highnote_data %>%
  mutate(delta1_posts = ifelse(is.na(delta1_posts), 
                                 median(delta1_posts, na.rm = TRUE), 
                                 delta1_posts))
# after change
skim(highnote_data$delta1_posts)
Data summary
Name highnote_data$delta1_post…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.15 13.53 -6 0 0 0 3391 ▇▁▁▁▁
## delta1_playlists, use median for missing values
highnote_data <- highnote_data %>%
  mutate(delta1_playlists = ifelse(is.na(delta1_playlists), 
                                 median(delta1_playlists, na.rm = TRUE), 
                                 delta1_playlists))
# after change
skim(highnote_data$delta1_playlists)
Data summary
Name highnote_data$delta1_play…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0 0.38 -8 0 0 0 116 ▇▁▁▁▁
## delta1_shouts, use median for missing values
highnote_data <- highnote_data %>%
  mutate(delta1_shouts = ifelse(is.na(delta1_shouts), 
                                median(delta1_shouts, na.rm = TRUE), 
                                delta1_shouts)) 
# after change
skim(highnote_data$delta1_shouts)
Data summary
Name highnote_data$delta1_shou…
Number of rows 107213
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.76 36.12 -953 0 0 0 10022 ▇▁▁▁▁
## create Y and X data frames
highnote_data_y = highnote_data %>% pull("adopter") %>% as.factor()

# exclude net_user and adopter from the training dataset
highnote_data_x = highnote_data %>% select(-c("net_user", "adopter"))

1.2 Create Training and Testing data sets

# 75% of the data is used for training and the rest for testing
smp_size <- floor(0.75 * nrow(highnote_data_x))

# randomly select row numbers for training data set
set.seed(12345)
train_ind <- sample(seq_len(nrow(highnote_data_x)), size = smp_size)

# create training and test sets for x
highnote_data_x_train <- highnote_data_x[train_ind, ]
highnote_data_x_test <- highnote_data_x[-train_ind, ]

# creating test and training sets for y
highnote_data_y_train <- highnote_data_y[train_ind]
highnote_data_y_test <- highnote_data_y[-train_ind]
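
As a quick sanity check of the split sizes (illustration; the counts follow from the 107,213 rows and the 75% split):

smp_size                           # 80409 training rows (75%, floored)
nrow(highnote_data_x) - smp_size   # 26804 testing rows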

# Create an empty data frame to store results from different models
clf_results <- data.frame(matrix(ncol = 5, nrow = 0))
names(clf_results) <- c("Model", "Accuracy", "Precision", "Recall", "F1")

# Create an empty data frame to store TP, TN, FP and FN values
cost_benefit_df <- data.frame(matrix(ncol = 5, nrow = 0))
names(cost_benefit_df) <- c("Model", "TP", "FN", "FP", "TN")

1.3 Variable grouping

# Generic_Variables   Past_Time_Variables           Current_Time_variables    Outcome_Variable
# --------------------------------------------------------------------------------------------
# net_user            delta1_friend_cnt             friend_cnt                adopter
# age                 delta1_avg_friend_age         avg_friend_age  
# male                delta1_avg_friend_male        avg_friend_male 
# tenure              delta1_friend_country_cnt     friend_country_cnt  
#                     delta1_subscriber_friend_cnt  subscriber_friend_cnt   
#                     delta1_songsListened          songsListened   
#                     delta1_lovedTracks            lovedTracks 
#                     delta1_posts                  posts   
#                     delta1_playlists              playlists   
#                     delta1_shouts                 shouts  
#                     delta1_good_country           good_country
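
Since the past-period predictors all share the delta1_ prefix, the grouping above can also be derived programmatically (a sketch; these vectors are illustrative and the later sections keep their explicit column lists):

# past-period predictors (11 variables)
past_vars    <- grep("^delta1_", names(highnote_data_x), value = TRUE)
# their current-period counterparts, obtained by stripping the prefix
current_vars <- sub("^delta1_", "", past_vars)
# what remains is generic: age, male, tenure
generic_vars <- setdiff(names(highnote_data_x), c(past_vars, current_vars))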

1.4 Fit a logistic regression

# x = highnote_data_x_train
# y = highnote_data_y_train
glm_fit <- train(highnote_data_x_train,
                 highnote_data_y_train, 
                 method = "glm",
                 family = "binomial",
                 preProc = c("center", "scale"))
# Predict on test data
glm_predict <- predict(glm_fit, newdata = highnote_data_x_test)

# Predict probability on test data
glm_predict_prob <- predict(glm_fit, newdata = highnote_data_x_test, type="prob")

Convert probability outcome into categorical outcome

y_pred_num <- ifelse(glm_predict_prob[,2] > 0.5, "1", "0")
# Print Confusion matrix, Accuracy, Sensitivity etc 
# Predicted = y_pred_num
# Actual = highnote_data_y_test
confusionMatrix(as.factor(y_pred_num), as.factor(highnote_data_y_test), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 24881  1803
##          1    77    43
##                                           
##                Accuracy : 0.9299          
##                  95% CI : (0.9267, 0.9329)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 0.7976          
##                                           
##                   Kappa : 0.0356          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.023294        
##             Specificity : 0.996915        
##          Pos Pred Value : 0.358333        
##          Neg Pred Value : 0.932431        
##              Prevalence : 0.068870        
##          Detection Rate : 0.001604        
##    Detection Prevalence : 0.004477        
##       Balanced Accuracy : 0.510104        
##                                           
##        'Positive' Class : 1               
## 

1.5 Do SMOTE to deal with class imbalance

SMOTE algorithm for unbalanced classification problems https://www.rdocumentation.org/packages/DMwR/versions/0.4.1/topics/SMOTE

# What is the class distribution?
# Check the proportions for the class between all 3 datasets.
round(prop.table(table(select(highnote_data, adopter), exclude = NULL)), 4) * 100
## 
##     0     1 
## 93.27  6.73
round(prop.table(table(highnote_data_y_train)), 4) * 100
## highnote_data_y_train
##     0     1 
## 93.33  6.67
round(prop.table(table(highnote_data_y_test)), 4) * 100
## highnote_data_y_test
##     0     1 
## 93.11  6.89
# We will use the SMOTE() function from the DMwR package to balance 
# the training data before we build our model.
# install.packages("abind")
library("abind")

# install packages for SMOTE using the links below. You need to install
# from the archive since DMwR has been removed from CRAN
# install.packages(c("xts","quantmod"))
# install.packages( "https://cran.r-project.org/src/contrib/Archive/DMwR/DMwR_0.4.1.tar.gz", 
# repos=NULL, type="source")
library("DMwR")
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
set.seed(1234)

# create the full training dataset with X and y variable
highnote_data_train <-  cbind(highnote_data_x_train, highnote_data_y_train)

# the arguments to SMOTE are
#   1) a formula with the class variable on the left-hand side,
#   2) the data frame,
#   3) perc.over - percent to oversample the minority class,
#   4) perc.under - percent of majority cases sampled per synthetic case
highnote_data_train_balanced <- SMOTE(highnote_data_y_train ~ ., 
                                      data.frame(highnote_data_train), 
                                      perc.over = 100, perc.under = 200)
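
The percentages translate into row counts as follows (illustration; the minority count of 5,367 is inferred from the 21,468-row balanced set reported by the model fits below):

n_minority  <- 5367                   # adopter = 1 rows in the training split
n_synthetic <- n_minority * 100/100   # perc.over = 100: one synthetic case per minority case
n_majority  <- n_synthetic * 200/100  # perc.under = 200: two majority cases kept per synthetic case
n_minority + n_synthetic + n_majority # 21468 rows, balanced 10734/10734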

# Check the proportions for the class between all 3 datasets.
round(prop.table(table(select(highnote_data, adopter), exclude = NULL)), 4) * 100
## 
##     0     1 
## 93.27  6.73
round(prop.table(table(highnote_data_train_balanced$highnote_data_y_train)), 4) * 100
## 
##  0  1 
## 50 50
round(prop.table(table(highnote_data_y_test)), 4) * 100
## highnote_data_y_test
##     0     1 
## 93.11  6.89
# remove the Y column from the newly balanced training set
highnote_data_x_train <- highnote_data_train_balanced %>% 
  select(-highnote_data_y_train)

# store the Y column
highnote_data_y_train <- highnote_data_train_balanced %>% 
  pull(highnote_data_y_train) %>% as.factor()

1.6 Fit logistic again

# x = highnote_data_x_train
# y = highnote_data_y_train

glm_fit <- train(highnote_data_x_train,
                 highnote_data_y_train, 
                 method = "glm",
                 family = "binomial",
                 preProc = c("center", "scale")
                 )
# Predict on test data
glm_predict <- predict(glm_fit, newdata = highnote_data_x_test)

# Predict probability on test data
glm_predict_prob <- predict(glm_fit, newdata = highnote_data_x_test, type="prob")

Convert probability outcome into categorical outcome

y_pred_num <- ifelse(glm_predict_prob[,2] > 0.5, "1", "0")
# Print Confusion matrix, Accuracy, Sensitivity etc 
# Predicted = y_pred_num
# Actual = highnote_data_y_test
confusionMatrix(as.factor(y_pred_num), as.factor(highnote_data_y_test), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19852   776
##          1  5106  1070
##                                           
##                Accuracy : 0.7806          
##                  95% CI : (0.7756, 0.7855)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1798          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.57963         
##             Specificity : 0.79542         
##          Pos Pred Value : 0.17325         
##          Neg Pred Value : 0.96238         
##              Prevalence : 0.06887         
##          Detection Rate : 0.03992         
##    Detection Prevalence : 0.23041         
##       Balanced Accuracy : 0.68752         
##                                           
##        'Positive' Class : 1               
## 


# Add results into clf_results dataframe
x1 <- confusionMatrix(as.factor(y_pred_num), as.factor(highnote_data_y_test), 
                      positive = "1")[["overall"]]
y1 <- confusionMatrix(as.factor(y_pred_num), as.factor(highnote_data_y_test),
                      positive = "1")[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "LR", 
                                             Accuracy = round (x1[["Accuracy"]],3), 
                                            Precision = round (y1[["Precision"]],3), 
                                            Recall = round (y1[["Recall"]],3), 
                                            F1 = round (y1[["F1"]],3))

# Print Accuracy and F1 score
cat("Accuarcy is ", round(x1[["Accuracy"]],3), "and F1 is ", round (y1[["F1"]],3))
## Accuarcy is  0.781 and F1 is  0.267
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a1 <- confusionMatrix(as.factor(y_pred_num), as.factor(highnote_data_y_test))

# be careful about accurately picking up the TP, FN, FP and TN
cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "LR", 
                                             TP = a1[["table"]][4], 
                                             FN = a1[["table"]][3], 
                                             FP = a1[["table"]][2], 
                                             TN = a1[["table"]][1])

1.7 SMOTE Results Summary

SMOTE has improved the performance of the model: sensitivity for the adopter class rose from 0.023 to 0.580 and balanced accuracy from 0.51 to 0.69, at the cost of some raw accuracy. Hence we will be using the balanced data set for model training and validation.

x = highnote_data_x_train
y = highnote_data_y_train

1.8 Fit logistic again with only previous time period (delta1) variables

highnote_data_x_train_delta1 = highnote_data_x_train %>% select(-c("friend_cnt",
                                                     "avg_friend_age",
                                                     "avg_friend_male",
                                                     "friend_country_cnt",
                                                     "subscriber_friend_cnt",
                                                     "songsListened",
                                                     "lovedTracks",
                                                     "posts",
                                                     "playlists",
                                                     "shouts",
                                                     "good_country"
                                                     ))

glm_fit_delta1 <- train(highnote_data_x_train_delta1,
                 highnote_data_y_train, 
                 method = "glm",
                 family = "binomial",
                 preProc = c("center", "scale")
                 )

# Predict on test data
glm_predict_delta1 <- predict(glm_fit_delta1, newdata = highnote_data_x_test)

# Predict probability on test data
glm_predict_prob_delta1 <- predict(glm_fit_delta1, newdata = highnote_data_x_test, 
                                   type="prob")

# Convert probability outcome into categorical outcome
y_pred_num_delta1 <- ifelse(glm_predict_prob_delta1[,2] > 0.5, "1", "0")

# Print Confusion matrix, Accuracy, Sensitivity etc 
# Predicted = y_pred_num
# Actual = highnote_data_y_test
confusionMatrix(as.factor(y_pred_num_delta1), as.factor(highnote_data_y_test), 
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19498   906
##          1  5460   940
##                                           
##                Accuracy : 0.7625          
##                  95% CI : (0.7574, 0.7676)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1356          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.50921         
##             Specificity : 0.78123         
##          Pos Pred Value : 0.14688         
##          Neg Pred Value : 0.95560         
##              Prevalence : 0.06887         
##          Detection Rate : 0.03507         
##    Detection Prevalence : 0.23877         
##       Balanced Accuracy : 0.64522         
##                                           
##        'Positive' Class : 1               
## 
# Add results into clf_results dataframe
x2 <- confusionMatrix(as.factor(y_pred_num_delta1), as.factor(highnote_data_y_test), 
                      positive = "1")[["overall"]]
y2 <- confusionMatrix(as.factor(y_pred_num_delta1), as.factor(highnote_data_y_test),
                      positive = "1")[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "LR Delta1", 
                                             Accuracy = round (x2[["Accuracy"]],3), 
                                            Precision = round (y2[["Precision"]],3), 
                                            Recall = round (y2[["Recall"]],3), 
                                            F1 = round (y2[["F1"]],3))

# Print Accuracy and F1 score
cat("Accuarcy is ", round(x2[["Accuracy"]],3), "and F1 is ", round (y2[["F1"]],3))
## Accuarcy is  0.762 and F1 is  0.228
# Add results into cost_benefit_df data frame for cost benefit analysis 
a2 <- confusionMatrix(as.factor(y_pred_num_delta1), as.factor(highnote_data_y_test))

# be careful about accurately picking up the TP, FN, FP and TN
cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "LR Delta1", 
                                             TP = a2[["table"]][4], 
                                             FN = a2[["table"]][3], 
                                             FP = a2[["table"]][2], 
                                             TN = a2[["table"]][1])

Cross validation

Cross validation is a technique that reuses the same training data: on each fold, a portion is held out for validation while the model is trained on the rest. Averaging performance over the folds reduces the chance of overfitting.

Hyperparameter tuning

We provide a grid of hyperparameter values to train the model over. This helps identify the best hyperparameter setting for a given model, such as a decision tree. The train function in the caret library automatically stores the best model and its hyperparameters. With 10-fold CV repeated 3 times, each candidate value in the grid is evaluated on 30 resamples.
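
For example, once a model has been tuned, the winning hyperparameters can be read straight off the caret object (a sketch using the dtree_fit object created in the next section):

dtree_fit$bestTune   # the maxdepth value selected by cross-validation
dtree_fit$results    # accuracy and kappa for every maxdepth in the grid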

1.9 Decision Tree Classification with Previous and Current Time Period

#x = highnote_data_x_train
#y = highnote_data_y_train

# Cross validation
cross_validation <- trainControl(## 10-fold CV
                                method = "repeatedcv",
                                number = 10,
                                ## repeated three times
                                repeats = 3)
# Hyperparameter tuning
# maxdepth = the maximum depth of the tree that will be created, i.e.
# the length of the longest path from the tree root to a leaf

# let's see which hyperparameters are available to tune for the
# decision tree via caret's "rpart2" method
modelLookup("rpart2")
Param_Grid <-  expand.grid(maxdepth = 2:10)

# fit the model
dtree_fit <- train(highnote_data_x_train,
                   highnote_data_y_train, 
                   method = "rpart2",
                   # split - criteria to split nodes
                   parms = list(split = "gini"),
                  tuneGrid = Param_Grid,
                   trControl = cross_validation,
                  # preProc -  perform listed pre-processing to predictor dataframe
                   preProc = c("center", "scale"))

# check the accuracy
dtree_fit
## CART 
## 
## 21468 samples
##    25 predictor
##     2 classes: '0', '1' 
## 
## Pre-processing: centered (22), scaled (22), ignore (3) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 19322, 19321, 19321, 19320, 19320, 19321, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  Accuracy   Kappa    
##    2        0.7408697  0.4817406
##    3        0.7417551  0.4835113
##    4        0.7435864  0.4871734
##    5        0.7435864  0.4871734
##    6        0.7435864  0.4871734
##    7        0.7435864  0.4871734
##    8        0.7435864  0.4871734
##    9        0.7435864  0.4871734
##   10        0.7435864  0.4871734
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 4.
# print the final model
dtree_fit$finalModel
## n= 21468 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 21468 10734 0 (0.5000000 0.5000000)  
##    2) delta1_songsListened< -0.3710361 8601  1873 0 (0.7822346 0.2177654) *
##    3) delta1_songsListened>=-0.3710361 12867  4006 1 (0.3113391 0.6886609)  
##      6) lovedTracks< -0.2293336 3359  1640 0 (0.5117595 0.4882405)  
##       12) lovedTracks< -0.271241 1106   361 0 (0.6735986 0.3264014) *
##       13) lovedTracks>=-0.271241 2253   974 1 (0.4323125 0.5676875) *
##      7) lovedTracks>=-0.2293336 9508  2287 1 (0.2405343 0.7594657) *
# Plot decision tree
prp(dtree_fit$finalModel, box.palette = "Reds", tweak = 1.2)

# Predict on test data
dtree_predict <- predict(dtree_fit, newdata = highnote_data_x_test)

# Predict test data probability
dtree_predict_prob <- predict(dtree_fit, newdata = highnote_data_x_test, type = "prob")

# Print Confusion matrix, Accuracy, Sensitivity etc 
confusionMatrix(dtree_predict,  highnote_data_y_test, positive="1" )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 17504   623
##          1  7454  1223
##                                           
##                Accuracy : 0.6987          
##                  95% CI : (0.6931, 0.7042)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1341          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.66251         
##             Specificity : 0.70134         
##          Pos Pred Value : 0.14095         
##          Neg Pred Value : 0.96563         
##              Prevalence : 0.06887         
##          Detection Rate : 0.04563         
##    Detection Prevalence : 0.32372         
##       Balanced Accuracy : 0.68193         
##                                           
##        'Positive' Class : 1               
## 
# Add results into clf_results dataframe
x3 <- confusionMatrix(dtree_predict,  highnote_data_y_test, positive="1")[["overall"]]
y3 <- confusionMatrix(dtree_predict,  highnote_data_y_test, positive="1" )[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "DT", 
                                             Accuracy = round (x3[["Accuracy"]],3), 
                                            Precision = round (y3[["Precision"]],3), 
                                            Recall = round (y3[["Recall"]],3), 
                                            F1 = round (y3[["F1"]],3))

# Print Accuracy and F1 score

cat("Accuarcy is ", round(x3[["Accuracy"]],3), "and F1 is ", round (y3[["F1"]],3))
## Accuarcy is  0.699 and F1 is  0.232
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a3 <- confusionMatrix(dtree_predict,  highnote_data_y_test)

cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "DT", 
                                             TP = a3[["table"]][4], 
                                             FN = a3[["table"]][3], 
                                             FP = a3[["table"]][2], 
                                             TN = a3[["table"]][1])

1.10 Decision Tree Classification with Previous Time Period Only

#x = highnote_data_x_train_delta1
#y = highnote_data_y_train

# Cross validation
cross_validation <- trainControl(## 10-fold CV
                                method = "repeatedcv",
                                number = 10,
                                ## repeated three times
                                repeats = 3)
# Hyperparameter tuning
# maxdepth = the maximum depth of the tree that will be created, i.e.
# the length of the longest path from the tree root to a leaf

# let's see which hyperparameters are available to tune for the
# decision tree via caret's "rpart2" method
modelLookup("rpart2")
Param_Grid <-  expand.grid(maxdepth = 2:10)

# fit the model
dtree_fit_delta1 <- train(highnote_data_x_train_delta1,
                   highnote_data_y_train, 
                   method = "rpart2",
                   # split - criteria to split nodes
                   parms = list(split = "gini"),
                  tuneGrid = Param_Grid,
                   trControl = cross_validation,
                  # preProc -  perform listed pre-processing to predictor dataframe
                   preProc = c("center", "scale"))

# check the accuracy
dtree_fit_delta1
## CART 
## 
## 21468 samples
##    14 predictor
##     2 classes: '0', '1' 
## 
## Pre-processing: centered (12), scaled (12), ignore (2) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 19322, 19321, 19322, 19321, 19321, 19321, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  Accuracy   Kappa    
##    2        0.7512430  0.5024852
##    3        0.7512430  0.5024852
##    4        0.7529817  0.5059642
##    5        0.7529817  0.5059642
##    6        0.7529817  0.5059642
##    7        0.7529817  0.5059642
##    8        0.7529817  0.5059642
##    9        0.7529817  0.5059642
##   10        0.7529817  0.5059642
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 4.
# print the final model
dtree_fit_delta1$finalModel
## n= 21468 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 21468 10734 0 (0.5000000 0.5000000)  
##    2) delta1_songsListened< -0.3710361 8601  1873 0 (0.7822346 0.2177654) *
##    3) delta1_songsListened>=-0.3710361 12867  4006 1 (0.3113391 0.6886609)  
##      6) delta1_lovedTracks< -0.2039879 5415  2401 1 (0.4433980 0.5566020)  
##       12) age< -0.018029 3098  1398 0 (0.5487411 0.4512589) *
##       13) age>=-0.018029 2317   701 1 (0.3025464 0.6974536) *
##      7) delta1_lovedTracks>=-0.2039879 7452  1605 1 (0.2153784 0.7846216) *
# Plot decision tree
prp(dtree_fit_delta1$finalModel, box.palette = "Reds", tweak = 1.2)

# Predict on test data
dtree_predict_delta1 <- predict(dtree_fit_delta1, newdata = highnote_data_x_test)

# Predict test data probability
dtree_predict_prob_delta1 <- predict(dtree_fit_delta1, newdata = highnote_data_x_test, 
                                     type = "prob")

# Print Confusion matrix, Accuracy, Sensitivity etc 
confusionMatrix(dtree_predict_delta1,  highnote_data_y_test, positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 19633   798
##          1  5325  1048
##                                           
##                Accuracy : 0.7716          
##                  95% CI : (0.7665, 0.7766)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1659          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.56771         
##             Specificity : 0.78664         
##          Pos Pred Value : 0.16444         
##          Neg Pred Value : 0.96094         
##              Prevalence : 0.06887         
##          Detection Rate : 0.03910         
##    Detection Prevalence : 0.23776         
##       Balanced Accuracy : 0.67718         
##                                           
##        'Positive' Class : 1               
## 
# Add results into clf_results dataframe
x4 <- confusionMatrix(dtree_predict_delta1,  highnote_data_y_test, positive="1")[["overall"]]
y4 <- confusionMatrix(dtree_predict_delta1,  highnote_data_y_test, positive="1" )[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "DT Delta1", 
                                             Accuracy = round (x4[["Accuracy"]],3), 
                                            Precision = round (y4[["Precision"]],3), 
                                            Recall = round (y4[["Recall"]],3), 
                                            F1 = round (y4[["F1"]],3))

# Print Accuracy and F1 score
cat("Accuarcy is ", round(x4[["Accuracy"]],3), "and F1 is ", round (y4[["F1"]],3))
## Accuarcy is  0.772 and F1 is  0.255
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a4 <- confusionMatrix(dtree_predict_delta1,  highnote_data_y_test)

cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "DT Delta1", 
                                             TP = a4[["table"]][4], 
                                             FN = a4[["table"]][3], 
                                             FP = a4[["table"]][2], 
                                             TN = a4[["table"]][1])

1.11 XGBoost classification with Previous and Current Time Period

#x = highnote_data_x_train
#y = highnote_data_y_train

# since XGBoost cannot handle categorical variables, we need to one-hot encode them
# training set
highnote_data_x_train_ohc = highnote_data_x_train
highnote_data_x_train_ohc <- data.frame(highnote_data_x_train_ohc) # convert from a tibble 
                                                                   # to a data frame
highnote_data_x_train_ohc <- dummy.data.frame(data=highnote_data_x_train_ohc, 
                                              sep="_")  # does one-hot encoding

# testing set
highnote_data_x_test_ohc = highnote_data_x_test
highnote_data_x_test_ohc <- data.frame(highnote_data_x_test_ohc) # convert 
                                            # from a tibble to a data frame
highnote_data_x_test_ohc <- dummy.data.frame(data=highnote_data_x_test_ohc, 
                                             sep="_")  #does one-hot encoding

# function to normalize data (0 to 1)
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}
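
A quick check that the helper behaves as intended (illustration only):

normalize(c(0, 5, 10))   # returns 0.0 0.5 1.0 -- the min maps to 0, the max to 1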

# normalize numeric data: rescale every continuous feature, in both the
# training and testing sets
num_cols <- c("age", "friend_cnt", "avg_friend_age", "avg_friend_male",
              "friend_country_cnt", "subscriber_friend_cnt", "songsListened",
              "lovedTracks", "posts", "playlists", "shouts", "tenure",
              "delta1_friend_cnt", "delta1_avg_friend_age",
              "delta1_avg_friend_male", "delta1_friend_country_cnt",
              "delta1_subscriber_friend_cnt", "delta1_songsListened",
              "delta1_lovedTracks", "delta1_posts", "delta1_playlists",
              "delta1_shouts")

# training set
highnote_data_x_train_ohc <- highnote_data_x_train_ohc %>%
  mutate(across(all_of(num_cols), normalize))

# testing set
highnote_data_x_test_ohc <- highnote_data_x_test_ohc %>%
  mutate(across(all_of(num_cols), normalize))

XG_clf_fit <- train(highnote_data_x_train_ohc, 
                    highnote_data_y_train,
                    method = "xgbTree",
                    preProc = c("center", "scale"))

# print the final model
XG_clf_fit$finalModel
## ##### xgb.Booster
## raw: 183.8 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, objective = "binary:logistic")
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "3", gamma = "0", colsample_bytree = "0.6", min_child_weight = "1", subsample = "1", objective = "binary:logistic", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
## # of features: 32 
## niter: 150
## nfeatures : 32 
## xNames : age male_0 male_1 male_UNK friend_cnt avg_friend_age avg_friend_male friend_country_cnt subscriber_friend_cnt songsListened lovedTracks posts playlists shouts tenure good_country_0 good_country_1 good_country_UNK delta1_friend_cnt delta1_avg_friend_age delta1_avg_friend_male delta1_friend_country_cnt delta1_subscriber_friend_cnt delta1_songsListened delta1_lovedTracks delta1_posts delta1_playlists delta1_shouts delta1_good_country_-1 delta1_good_country_0 delta1_good_country_1 delta1_good_country_UNK 
## problemType : Classification 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 45     150         3 0.3     0              0.6                1         1
## obsLevels : 0 1 
## param :
##  list()
# Predict on test data
XG_clf_predict <- predict(XG_clf_fit,highnote_data_x_test_ohc)

# Print Confusion matrix, Accuracy, Sensitivity etc 
confusionMatrix(XG_clf_predict,  highnote_data_y_test )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0   585    55
##          1 24373  1791
##                                           
##                Accuracy : 0.0886          
##                  95% CI : (0.0853, 0.0921)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -9e-04          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.02344         
##             Specificity : 0.97021         
##          Pos Pred Value : 0.91406         
##          Neg Pred Value : 0.06845         
##              Prevalence : 0.93113         
##          Detection Rate : 0.02183         
##    Detection Prevalence : 0.02388         
##       Balanced Accuracy : 0.49682         
##                                           
##        'Positive' Class : 0               
## 
# Add results into clf_results dataframe
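# note: without positive = "1", confusionMatrix treats the first factor
# level ("0") as the positive class, so the Precision/Recall/F1 stored
# below are computed for class "0"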
x5 <- confusionMatrix(XG_clf_predict,  highnote_data_y_test )[["overall"]]
y5 <- confusionMatrix(XG_clf_predict,  highnote_data_y_test )[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "XGB", 
                                             Accuracy = round (x5[["Accuracy"]],3), 
                                            Precision = round (y5[["Precision"]],3), 
                                            Recall = round (y5[["Recall"]],3), 
                                            F1 = round (y5[["F1"]],3))

# Print Accuracy and F1 score
cat("Accuarcy is ", round(x5[["Accuracy"]],3), "and F1 is ", round (y5[["F1"]],3))
## Accuarcy is  0.089 and F1 is  0.046
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a5 <- confusionMatrix(XG_clf_predict,  highnote_data_y_test)

cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "XGB", 
                                             TP = a5[["table"]][1], 
                                             FN = a5[["table"]][2], 
                                             FP = a5[["table"]][3], 
                                             TN = a5[["table"]][4])

1.12 XGBoost Classification with Previous Time Period Only

#x = highnote_data_x_train_ohc
#y = highnote_data_y_train

# keep only age, the male dummies, tenure, and the previous-period (delta1)
# variables by dropping every current-period level variable; the
# good_country_* columns were already one-hot encoded, hence they are
# dropped by their encoded names
highnote_data_x_train_ohc_delta1 = highnote_data_x_train_ohc %>% 
  select(-c("friend_cnt", "avg_friend_age", "avg_friend_male",
            "friend_country_cnt", "subscriber_friend_cnt", "songsListened",
            "lovedTracks", "posts", "playlists", "shouts",
            "good_country_0", "good_country_1", "good_country_UNK"))
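An equivalent, more compact way (a sketch) to express the same selection positively, making explicit that only age, the male dummies, tenure, and the delta1 variables survive:

# Same 19 columns selected positively (column order differs, which is harmless)
highnote_data_x_train_ohc_delta1 <- highnote_data_x_train_ohc %>%
  select(age, starts_with("male_"), tenure, starts_with("delta1_"))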

XG_clf_fit_delta1 <- train(highnote_data_x_train_ohc_delta1, 
                    highnote_data_y_train,
                    method = "xgbTree",
                    preProc = c("center", "scale"))

# print the final model
XG_clf_fit_delta1$finalModel
## ##### xgb.Booster
## raw: 178.1 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
##     gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
##     min_child_weight = param$min_child_weight, subsample = param$subsample), 
##     data = x, nrounds = param$nrounds, objective = "binary:logistic")
## params (as set within xgb.train):
##   eta = "0.4", max_depth = "3", gamma = "0", colsample_bytree = "0.6", min_child_weight = "1", subsample = "1", objective = "binary:logistic", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
## # of features: 19 
## niter: 150
## nfeatures : 19 
## xNames : age male_0 male_1 male_UNK tenure delta1_friend_cnt delta1_avg_friend_age delta1_avg_friend_male delta1_friend_country_cnt delta1_subscriber_friend_cnt delta1_songsListened delta1_lovedTracks delta1_posts delta1_playlists delta1_shouts delta1_good_country_-1 delta1_good_country_0 delta1_good_country_1 delta1_good_country_UNK 
## problemType : Classification 
## tuneValue :
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 99     150         3 0.4     0              0.6                1         1
## obsLevels : 0 1 
## param :
##  list()
# Predict on test data
XG_clf_predict_delta1 <- predict(XG_clf_fit_delta1,highnote_data_x_test_ohc)

# Print Confusion matrix, Accuracy, Sensitivity etc 
confusionMatrix(XG_clf_predict_delta1,  highnote_data_y_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0     8     5
##          1 24950  1841
##                                          
##                Accuracy : 0.069          
##                  95% CI : (0.066, 0.0721)
##     No Information Rate : 0.9311         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -3e-04         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.0003205      
##             Specificity : 0.9972914      
##          Pos Pred Value : 0.6153846      
##          Neg Pred Value : 0.0687171      
##              Prevalence : 0.9311297      
##          Detection Rate : 0.0002985      
##    Detection Prevalence : 0.0004850      
##       Balanced Accuracy : 0.4988060      
##                                          
##        'Positive' Class : 0              
## 
# Add results into clf_results dataframe
x6 <- confusionMatrix(XG_clf_predict_delta1,  highnote_data_y_test )[["overall"]]
y6 <- confusionMatrix(XG_clf_predict_delta1,  highnote_data_y_test )[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "XGB Delta1", 
                                             Accuracy = round (x6[["Accuracy"]],3), 
                                            Precision = round (y6[["Precision"]],3), 
                                            Recall = round (y6[["Recall"]],3), 
                                            F1 = round (y6[["F1"]],3))

# Print Accuracy and F1 score
cat("Accuarcy is ", round(x6[["Accuracy"]],3), "and F1 is ", round (y6[["F1"]],3))
## Accuarcy is  0.069 and F1 is  0.001
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a6 <- confusionMatrix(XG_clf_predict_delta1,  highnote_data_y_test)

cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "XGB Delta1", 
                                             TP = a6[["table"]][1], 
                                             FN = a6[["table"]][2], 
                                             FP = a6[["table"]][3], 
                                             TN = a6[["table"]][4])

1.13 KNN Classification with Previous and Current Time Period

# x = highnote_data_x_train_ohc
# y = highnote_data_y_train

# Cross validation 
cross_validation <- trainControl(## 10-fold CV
                                method = "repeatedcv",
                                number = 10,
                                ## repeated three times
                                repeats = 3)
# Hyperparameter tuning
# k = number of nearest neighbors
Param_Grid <- expand.grid(k = 1:10)

# for every k from 1 to 10, this runs 10-fold cross-validation, repeated 3 times

# fit the model to training data
knn_clf_fit <- train(highnote_data_x_train_ohc,
                     highnote_data_y_train,
                     method = "knn",
                     tuneGrid = Param_Grid,
                     trControl = cross_validation )


# you can inspect the resampling (fold) assignments
knn_clf_fit$resample %>% arrange(Resample)
# check the accuracy for different models
knn_clf_fit
## k-Nearest Neighbors 
## 
## 21468 samples
##    32 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 19321, 19321, 19321, 19322, 19321, 19321, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.7017421  0.4034836
##    2  0.6787471  0.3574935
##    3  0.6929073  0.3858133
##    4  0.6844602  0.3689196
##    5  0.6956399  0.3912787
##    6  0.6909973  0.3819933
##    7  0.6960439  0.3920864
##    8  0.6932490  0.3864969
##    9  0.6982492  0.3964972
##   10  0.6973638  0.3947270
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
# Plot accuracies for different k values
plot(knn_clf_fit)

# print the best model
print(knn_clf_fit$finalModel)
## 1-nearest neighbor model
## Training set outcome distribution:
## 
##     0     1 
## 10734 10734
# Predict on test data
knnPredict <- predict(knn_clf_fit, newdata = highnote_data_x_test_ohc) 

# Print Confusion matrix, Accuracy, Sensitivity, etc.
confusionMatrix(knnPredict, highnote_data_y_test, positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  7770   506
##          1 17188  1340
##                                           
##                Accuracy : 0.3399          
##                  95% CI : (0.3342, 0.3456)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0072          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.72589         
##             Specificity : 0.31132         
##          Pos Pred Value : 0.07232         
##          Neg Pred Value : 0.93886         
##              Prevalence : 0.06887         
##          Detection Rate : 0.04999         
##    Detection Prevalence : 0.69124         
##       Balanced Accuracy : 0.51861         
##                                           
##        'Positive' Class : 1               
## 
# Add results into clf_results dataframe
x7 <- confusionMatrix(knnPredict, highnote_data_y_test, positive="1")[["overall"]]
y7 <- confusionMatrix(knnPredict, highnote_data_y_test, positive="1")[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "KNN", 
                                             Accuracy = round (x7[["Accuracy"]],3), 
                                            Precision = round (y7[["Precision"]],3), 
                                            Recall = round (y7[["Recall"]],3), 
                                            F1 = round (y7[["F1"]],3))
# Print Accuracy and F1 score
cat("Accuarcy is ", round(x7[["Accuracy"]],3), "and F1 is ", round (y7[["F1"]],3))
## Accuarcy is  0.34 and F1 is  0.132
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a7 <- confusionMatrix(knnPredict, highnote_data_y_test)

# note: this block treats class "1" as positive, so the four cells are read
# in the reverse order of the XGB blocks above
cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "KNN", 
                                             TP = a7[["table"]][4], 
                                             FN = a7[["table"]][3], 
                                             FP = a7[["table"]][2], 
                                             TN = a7[["table"]][1])

1.14 KNN Classification with Previous Time Period Only

# x = highnote_data_x_train_ohc_delta1
# y = highnote_data_y_train

# Cross validation 
cross_validation <- trainControl(## 10-fold CV
                                method = "repeatedcv",
                                number = 10,
                                ## repeated three times
                                repeats = 3)
# Hyperparameter tuning
# k = number of nearest neighbors
Param_Grid <- expand.grid(k = 1:10)

# for every k from 1 to 10, this runs 10-fold cross-validation, repeated 3 times

# fit the model to training data
knn_clf_fit_delta1 <- train(highnote_data_x_train_ohc_delta1,
                     highnote_data_y_train,
                     method = "knn",
                     tuneGrid = Param_Grid,
                     trControl = cross_validation )


# you can inspect the resampling (fold) assignments
knn_clf_fit_delta1$resample %>% arrange(Resample)
# check the accuracy for different models
knn_clf_fit_delta1
## k-Nearest Neighbors 
## 
## 21468 samples
##    19 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 19321, 19321, 19321, 19321, 19322, 19321, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.6852993  0.3705989
##    2  0.6634370  0.3268743
##    3  0.6772257  0.3544517
##    4  0.6711538  0.3423081
##    5  0.6758279  0.3516564
##    6  0.6741507  0.3483015
##    7  0.6784357  0.3568721
##    8  0.6742284  0.3484576
##    9  0.6759210  0.3518429
##   10  0.6713404  0.3426822
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
# Plot accuracies for different k values
plot(knn_clf_fit_delta1)

# print the best model
print(knn_clf_fit_delta1$finalModel)
## 1-nearest neighbor model
## Training set outcome distribution:
## 
##     0     1 
## 10734 10734
# Predict on test data
knnPredict_delta1 <- predict(knn_clf_fit_delta1, newdata = highnote_data_x_test_ohc) 

# Print Confusion matrix, Accuracy, Sensitivity, etc.
confusionMatrix(knnPredict_delta1, highnote_data_y_test, positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  5418   386
##          1 19540  1460
##                                           
##                Accuracy : 0.2566          
##                  95% CI : (0.2514, 0.2619)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0014          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.79090         
##             Specificity : 0.21708         
##          Pos Pred Value : 0.06952         
##          Neg Pred Value : 0.93349         
##              Prevalence : 0.06887         
##          Detection Rate : 0.05447         
##    Detection Prevalence : 0.78347         
##       Balanced Accuracy : 0.50399         
##                                           
##        'Positive' Class : 1               
## 
# Add results into clf_results dataframe
x8 <- confusionMatrix(knnPredict_delta1, highnote_data_y_test, positive="1")[["overall"]]
y8 <- confusionMatrix(knnPredict_delta1, highnote_data_y_test, positive="1")[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "KNN Delta1", 
                                             Accuracy = round (x8[["Accuracy"]],3), 
                                            Precision = round (y8[["Precision"]],3), 
                                            Recall = round (y8[["Recall"]],3), 
                                            F1 = round (y8[["F1"]],3))
# Print Accuracy and F1 score
cat("Accuarcy is ", round(x8[["Accuracy"]],3), "and F1 is ", round (y8[["F1"]],3))
## Accuarcy is  0.257 and F1 is  0.128
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a8 <- confusionMatrix(knnPredict_delta1, highnote_data_y_test)

cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "KNN Delta1", 
                                             TP = a8[["table"]][4], 
                                             FN = a8[["table"]][3], 
                                             FP = a8[["table"]][2], 
                                             TN = a8[["table"]][1])

1.15 Neural Network Classification with Previous and Current Time Period

# x = highnote_data_x_train_ohc
# y = highnote_data_y_train

# Try different combinations of tuning parameters:
# decay (weight decay, which keeps the weights from growing too large)
# and size (the number of units in the hidden layer)
my.grid <- expand.grid(.decay = c(0.5, 0.1), .size = c(5, 7))

# stepmax (maximum training steps) and threshold (stop once the error
# improves by less than this amount) come from the neuralnet interface;
# method = "nnet" silently ignores arguments it does not recognize, so
# training below is governed by nnet's own maxit/abstol defaults
nn_clf_fit <- train(highnote_data_x_train_ohc,
                    highnote_data_y_train,
                    method = "nnet",
                    trace = F,
                    tuneGrid = my.grid,
                    linout = 0,
                    stepmax = 100,
                    threshold = 0.01)
print(nn_clf_fit)
## Neural Network 
## 
## 21468 samples
##    32 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 21468, 21468, 21468, 21468, 21468, 21468, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  Accuracy   Kappa    
##   0.1    5     0.7360404  0.4723747
##   0.1    7     0.7366608  0.4735868
##   0.5    5     0.7266161  0.4535832
##   0.5    7     0.7278869  0.4560774
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 7 and decay = 0.1.
# Plot Neural Network 
plotnet(nn_clf_fit$finalModel, y_names = "Adopter")

# Predict on test data
nn_clf_predict <- predict(nn_clf_fit, highnote_data_x_test_ohc)

# Print Confusion matrix, Accuracy, Sensitivity, etc.
confusionMatrix(nn_clf_predict,  highnote_data_y_test, positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 23871  1412
##          1  1087   434
##                                           
##                Accuracy : 0.9068          
##                  95% CI : (0.9032, 0.9102)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2086          
##                                           
##  Mcnemar's Test P-Value : 9.094e-11       
##                                           
##             Sensitivity : 0.23510         
##             Specificity : 0.95645         
##          Pos Pred Value : 0.28534         
##          Neg Pred Value : 0.94415         
##              Prevalence : 0.06887         
##          Detection Rate : 0.01619         
##    Detection Prevalence : 0.05675         
##       Balanced Accuracy : 0.59577         
##                                           
##        'Positive' Class : 1               
## 
# Add results into clf_results dataframe
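# note: positive = "1" is omitted in the two calls below, so the stored
# metrics treat class "0" as positive (unlike the matrix printed above)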
x9 <- confusionMatrix(nn_clf_predict,  highnote_data_y_test)[["overall"]]
y9 <- confusionMatrix(nn_clf_predict,  highnote_data_y_test)[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "NN", 
                                             Accuracy = round (x9[["Accuracy"]],3), 
                                            Precision = round (y9[["Precision"]],3), 
                                            Recall = round (y9[["Recall"]],3), 
                                            F1 = round (y9[["F1"]],3))

# Print Accuracy and F1 score
cat("Accuarcy is ", round(x9[["Accuracy"]],3), "and F1 is ", round (y9[["F1"]],3))
## Accuarcy is  0.907 and F1 is  0.95
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a9 <- confusionMatrix(nn_clf_predict,  highnote_data_y_test)

cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "NN", 
                                             TP = a9[["table"]][1], 
                                             FN = a9[["table"]][2], 
                                             FP = a9[["table"]][3], 
                                             TN = a9[["table"]][4])

1.16 Neural Network Classification with Previous Time Period Only

# x = highnote_data_x_train_ohc_delta1
# y = highnote_data_y_train

# Same tuning grid as in section 1.15: decay (weight decay) and
# size (hidden-layer units)
my.grid <- expand.grid(.decay = c(0.5, 0.1), .size = c(5, 7))

# stepmax and threshold are neuralnet-interface arguments that are ignored
# by method = "nnet"; see the note in section 1.15
nn_clf_fit_delta1 <- train(highnote_data_x_train_ohc_delta1,
                    highnote_data_y_train,
                    method = "nnet",
                    trace = F,
                    tuneGrid = my.grid,
                    linout = 0,
                    stepmax = 100,
                    threshold = 0.01)
print(nn_clf_fit_delta1)
## Neural Network 
## 
## 21468 samples
##    19 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 21468, 21468, 21468, 21468, 21468, 21468, ... 
## Resampling results across tuning parameters:
## 
##   decay  size  Accuracy   Kappa    
##   0.1    5     0.6737709  0.3472919
##   0.1    7     0.6765990  0.3529346
##   0.5    5     0.6604082  0.3205482
##   0.5    7     0.6608191  0.3213232
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 7 and decay = 0.1.
# Plot Neural Network 
plotnet(nn_clf_fit_delta1$finalModel, y_names = "Adopter")

# Predict on test data
nn_clf_predict_delta1 <- predict(nn_clf_fit_delta1, highnote_data_x_test_ohc)

# Print Confusion matrix, Accuracy, Sensitivity, etc.
confusionMatrix(nn_clf_predict_delta1,  highnote_data_y_test, positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 24854  1822
##          1   104    24
##                                          
##                Accuracy : 0.9281         
##                  95% CI : (0.925, 0.9312)
##     No Information Rate : 0.9311         
##     P-Value [Acc > NIR] : 0.9733         
##                                          
##                   Kappa : 0.0155         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.0130011      
##             Specificity : 0.9958330      
##          Pos Pred Value : 0.1875000      
##          Neg Pred Value : 0.9316989      
##              Prevalence : 0.0688703      
##          Detection Rate : 0.0008954      
##    Detection Prevalence : 0.0047754      
##       Balanced Accuracy : 0.5044170      
##                                          
##        'Positive' Class : 1              
## 
# Add results into clf_results dataframe
x10 <- confusionMatrix(nn_clf_predict_delta1,  highnote_data_y_test)[["overall"]]
y10 <- confusionMatrix(nn_clf_predict_delta1,  highnote_data_y_test)[["byClass"]]

clf_results[nrow(clf_results) + 1,] <-  list(Model = "NN Delta1", 
                                             Accuracy = round (x10[["Accuracy"]],3), 
                                            Precision = round (y10[["Precision"]],3), 
                                            Recall = round (y10[["Recall"]],3), 
                                            F1 = round (y10[["F1"]],3))

# Print Accuracy and F1 score
cat("Accuarcy is ", round(x10[["Accuracy"]],3), "and F1 is ", round (y10[["F1"]],3))
## Accuarcy is  0.928 and F1 is  0.963
# Add results into cost_benefit_df dataframe for cost benefit analysis 
a10 <- confusionMatrix(nn_clf_predict_delta1,  highnote_data_y_test)

cost_benefit_df[nrow(cost_benefit_df) + 1,] <-  list(Model = "NN Delta1", 
                                             TP = a10[["table"]][1], 
                                             FN = a10[["table"]][2], 
                                             FP = a10[["table"]][3], 
                                             TN = a10[["table"]][4])

2. Model Evaluation

2.1 Compare Accuracy for all Classification models

print(clf_results)
##         Model Accuracy Precision Recall    F1
## 1          LR    0.781     0.173  0.580 0.267
## 2   LR Delta1    0.762     0.147  0.509 0.228
## 3          DT    0.699     0.141  0.663 0.232
## 4   DT Delta1    0.772     0.164  0.568 0.255
## 5         XGB    0.089     0.914  0.023 0.046
## 6  XGB Delta1    0.069     0.615  0.000 0.001
## 7         KNN    0.340     0.072  0.726 0.132
## 8  KNN Delta1    0.257     0.070  0.791 0.128
## 9          NN    0.907     0.944  0.956 0.950
## 10  NN Delta1    0.928     0.932  0.996 0.963
# Plot accuracy for all the Classification Models

ggplot(clf_results %>% arrange(desc(Accuracy)) %>%
       mutate(Model=factor(Model, levels=Model) ), 
       aes(x = Model, y = Accuracy)) +
  geom_bar(stat = "identity" , width=0.3, fill="steelblue") + 
  coord_cartesian(ylim = c(0, 1)) +  # show the full range; accuracies span 0.069 to 0.928
  geom_hline(aes(yintercept = mean(Accuracy)),
             colour = "green",linetype="dashed") +
  ggtitle("Compare Accuracy for all Models") +
  theme(plot.title = element_text(color="black", size=10, hjust = 0.5))

print(cost_benefit_df)
##         Model    TP    FN    FP    TN
## 1          LR  1070   776  5106 19852
## 2   LR Delta1   940   906  5460 19498
## 3          DT  1223   623  7454 17504
## 4   DT Delta1  1048   798  5325 19633
## 5         XGB   585 24373    55  1791
## 6  XGB Delta1     8 24950     5  1841
## 7         KNN  1340   506 17188  7770
## 8  KNN Delta1  1460   386 19540  5418
## 9          NN 23871  1087  1412   434
## 10  NN Delta1 24854   104  1822    24
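The counts above can be turned into a single score per model by attaching payoffs to each cell. A sketch with made-up payoffs (every dollar value below is an assumption, and the TP/FN/FP/TN labels follow whichever positive-class convention each model block used):

# Sketch only: attach hypothetical payoffs to each cell and rank the models.
# All three dollar values are assumptions, not estimates from the data.
benefit_TP <- 20   # assumed value of correctly targeting a converter ($)
cost_FP    <- 5    # assumed cost of a wasted offer on a non-converter ($)
cost_FN    <- 10   # assumed opportunity cost of a missed converter ($)

cost_benefit_df %>%
  mutate(NetBenefit = TP * benefit_TP - FP * cost_FP - FN * cost_FN) %>%
  arrange(desc(NetBenefit))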

2.2 ROC and Lift curves for all models

ROC curve - A performance measure for a classification problem evaluated at various threshold settings. It shows how well the model can distinguish between the two classes.

Y axis - True Positive Rate (Sensitivity) = TP / (TP + FN)

X axis - False Positive Rate (1 - Specificity) = FP / (FP + TN)

AUC - Area under the ROC curve. The higher the AUC, the better the model is at predicting 0s as 0s and 1s as 1s.
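As a concrete check, one point on the ROC curve can be read straight off a confusion matrix. Using the NN Delta1 test matrix from section 1.16 with class 1 as positive:

# One (FPR, TPR) point from the NN Delta1 confusion matrix above
TP <- 24;  FN <- 1822   # actual 1s: predicted 1 / predicted 0
FP <- 104; TN <- 24854  # actual 0s: predicted 1 / predicted 0
c(TPR = TP / (TP + FN), FPR = FP / (FP + TN))
# TPR ~ 0.013 and FPR ~ 0.004, matching the Sensitivity and 1 - Specificity reported there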

Let's plot ROC curves for all the models. The more "up and to the left" a model's ROC curve sits, the better the model; equivalently, the higher the area under the curve, the better.

# Predict probabilities of each model to plot ROC curve

logistic_predict_prob <- predict(glm_fit, newdata = highnote_data_x_test, type="prob")
logistic_predict_delta1_prob <- predict(glm_fit_delta1, 
                                        newdata = highnote_data_x_test, type="prob")

dtree_predict_prob <- predict(dtree_fit, newdata = highnote_data_x_test, type = "prob")
dtree_predict_delta1_prob <- predict(dtree_fit_delta1, 
                                     newdata = highnote_data_x_test, type = "prob")

XG_boost_predict_prob <- predict(XG_clf_fit,highnote_data_x_test_ohc, type = "prob")
XG_boost_predict_delta1_prob <- predict(XG_clf_fit_delta1,
                                        highnote_data_x_test_ohc, type = "prob")

knn_predict_prob <- predict(knn_clf_fit, newdata = highnote_data_x_test_ohc, type = "prob")
knn_predict_delta1_prob <- predict(knn_clf_fit_delta1, 
                                   newdata = highnote_data_x_test_ohc, type = "prob") 

neural_network_predict_prob <- predict(nn_clf_fit, highnote_data_x_test_ohc, type = "prob")
neural_network_predict_delta1_prob <- predict(nn_clf_fit_delta1, 
                                              highnote_data_x_test_ohc, type = "prob")


# List of predicted probabilities (column 1 of each frame is P(class = "0"))
preds_list <- list(
  logistic_predict_prob[,1],
  logistic_predict_delta1_prob[,1],
  dtree_predict_prob[,1],
  dtree_predict_delta1_prob[,1],
  XG_boost_predict_prob[,1],
  XG_boost_predict_delta1_prob[,1],
  knn_predict_prob[,1],
  knn_predict_delta1_prob[,1],
  neural_network_predict_prob[,1],
  neural_network_predict_delta1_prob[,1]
  )

# List of actual values (same for all)
m <- length(preds_list)
actuals_list <- rep(list(highnote_data_y_test), m)

# Build ROCR prediction and ROC performance objects
pred <- prediction(preds_list, actuals_list)
rocs <- performance(pred, "tpr", "fpr")

# calculate AUC for all models
AUC_models <- performance(pred, "auc")
aucs <- round(unlist(AUC_models@y.values), 3)

# model labels, in the same order as preds_list
model_names <- c("Logistic", "Logistic Delta1",
                 "Decision Tree", "Decision Tree Delta1",
                 "XG Boost", "XG Boost Delta1",
                 "Knn", "Knn Delta1",
                 "Neural Net", "Neural Net Delta1")

# Plot the ROC curves
plot(rocs, col = as.list(1:m), main = "ROC Curves of different models")
legend(x = "bottomright", 
       legend = paste0(model_names, " - ", aucs), 
       fill = 1:m)

Lift curve - Lift measures the effectiveness of a predictive model as the ratio between the results obtained with the model and those obtained without it. The lift chart shows how much more likely we are to identify the correct outcome than by random guessing.
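As an illustration of what the curve encodes, cumulative lift in the top decile can be computed directly from one model's class-1 probabilities (a sketch using the NN Delta1 scores):

# Sketch: lift at the top decile for the NN Delta1 model
scores <- neural_network_predict_delta1_prob[, 2]         # P(class = "1")
actual <- as.numeric(as.character(highnote_data_y_test))  # factor -> 0/1
top    <- order(scores, decreasing = TRUE)[1:ceiling(0.1 * length(scores))]
mean(actual[top]) / mean(actual)   # > 1 means better than random targeting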

lifts <- performance(pred, "lift", "rpp")

# Plot the Lift curves (same model order and colors as the ROC plot)
plot(lifts, col = as.list(1:m), main = "Lift Curves of Different Models")
legend(x = "topright", legend = model_names, fill = 1:m)

3. Model Selection & Results

The performance of the Neural Network with previous time period variables (delta1) alone is as follows:

[1] AUC : 0.478. The higher the AUC, the better the model is at predicting 0s as 0s and 1s as 1s.

[2] Accuracy : 0.928. Accuracy = (True Positives + True Negatives) / (True Positives + True Negatives + False Positives + False Negatives)

[3] Precision : 0.932. Of all the predicted positives, the percentage that is truly positive. Precision = True Positives / (True Positives + False Positives) = True Positives / (Total Predicted Positives)

[4] Recall : 0.996. Recall, or True Positive Rate (TPR), is how many of the actual positives the model captures by labeling them positive. Recall = True Positives / (True Positives + False Negatives) = True Positives / (Total Actual Positives)

[5] F1 Score : 0.963. F1 is the harmonic mean of precision and recall. It takes both false positives and false negatives into account, and is therefore informative on an imbalanced data set. Its value lies between 0 and 1; higher is better.

F1 = 2 * (Precision * Recall) / (Precision + Recall)
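Plugging the reported precision and recall into this formula reproduces the table's F1:

# F1 check from the reported precision (0.932) and recall (0.996)
2 * (0.932 * 0.996) / (0.932 + 0.996)   # ~ 0.963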

The Neural Network with previous time period variables alone (delta1) performed well compared to the other models, and adding the current time period variables did not improve any of the measures above. Hence we will use the Neural Network model with the previous period (delta1) variables alone to score the final dataset.

4. Scoring the target list

4.1 We will use the Neural Network with Delta1 variables alone for scoring

# use the highnote_data data frame from section 1.1 for scoring
# pull the current free users (adopter == 0) as the scoring target
highnote_data_target <- filter(highnote_data, adopter == 0)

# drop "net_user" and "adopter" to prepare for scoring
highnote_data_target_x <- highnote_data_target %>% select(-c("net_user","adopter"))

# we need to one-hot encode the factor variables and normalize the
# continuous variables, since we will be scoring with the neural network
highnote_data_target_ohc = highnote_data_target_x
highnote_data_target_ohc <- data.frame(highnote_data_target_ohc) # convert from 
                                                                # a tibble to a data frame
highnote_data_target_ohc <- dummy.data.frame(data=highnote_data_target_ohc, 
                                             sep="_")  #does one-hot encoding

# min-max normalization of the numeric variables; num_cols is the same
# column vector defined for the train/test normalization in section 1
highnote_data_target_ohc <- highnote_data_target_ohc %>%
  mutate(across(all_of(num_cols), normalize))
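Before scoring, it is worth a quick sanity check that the scoring frame exposes every column the delta1 model was trained on (a sketch; setdiff should come back empty):

# should return character(0): every training column exists in the scoring frame
setdiff(colnames(highnote_data_x_train_ohc_delta1),
        colnames(highnote_data_target_ohc))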


# use the fitted neural network (delta1) model to score this new data
# we want to rank-order targets by probability of converting, so we ask
# for type = "prob" rather than the default class-label output
# (which would give 0/1 predictions)

# Predict on the scoring data
nn_predict_score <- predict(nn_clf_fit_delta1, newdata = highnote_data_target_ohc, 
                            type="prob")

# the prediction has 2 columns: probability of class 0 and, in the 2nd column,
# probability of class 1; extract the 2nd column and store it as a new
# column on the highnote_data_target data frame
highnote_data_target$ProbConversion <- nn_predict_score[,2]

# sort this in descending order 
highnote_data_target <- highnote_data_target %>% arrange(desc(ProbConversion))

# count how many of the top 1000 targets have ProbConversion > 0.5
successNum <-  highnote_data_target[1:1000,] %>% filter(ProbConversion>0.5) %>% tally()

# Print how many of the top 1000 clear the 50% threshold
cat("If we target the top 1000 people ", successNum[[1]], 
    "out of 1000 have a chance > 50% of converting","\n")
## If we target the top 1000 people  126 out of 1000 have a chance > 50% of converting
# write this sorted target list as a CSV file
write.csv(x=highnote_data_target, file = "Prospective_HighNote_Freemium_User_List.csv", 
          row.names = TRUE)
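If the downstream campaign tool only needs identifiers and scores, a slimmer export (a sketch; the file name here is hypothetical) would be:

# Sketch: export just the user id and conversion probability
write.csv(x = highnote_data_target %>% select(net_user, ProbConversion),
          file = "Prospective_HighNote_Freemium_User_List_slim.csv",
          row.names = FALSE)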

5. Conclusion

After deploying the neural network trained on the previous time period variables alone (delta1) to score the current free users, we obtained the following:

Of the top 1000 ranked free users, 126 have a predicted chance > 50% of converting to premium.

Suppose we target these 126 users with a two-month free trial; the following outcomes can be inferred.

In the best case, all 126 users convert to premium, a win-win for HighNote. More realistically, after targeting, some convert and some do not. This can be gauged from the model's false positives (Type I error): FPR = FP / (TN + FP) = 1822 / (24 + 1822) ≈ 98.7%.

For targeted users who do not convert, we lose the marketing dollars plus two months of premium HighNote service. Conversely, the model's false negatives, users we predicted would not convert but who might have converted had we made them the offer, number 104.

The recall of the model is 0.996, which gives us confidence that the model captures a high share of the true positives (users who actually turned premium).
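To make the trade-off concrete, here is a back-of-the-envelope expected-value sketch; every dollar figure below is a made-up assumption, not an estimate from the data:

# Hypothetical economics; none of these dollar values come from the data
value_per_conversion <- 100  # assumed net value of one premium conversion ($)
cost_per_offer       <- 10   # assumed trial + marketing cost per target ($)

n_targeted <- 126            # users who cleared the 50% cutoff above
p_convert  <- 0.5            # conservative bound implied by that cutoff

n_targeted * (p_convert * value_per_conversion - cost_per_offer)  # expected profit ($)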

It would be useful to know the following before deploying the model with a targeted offer:

  1. What is the lifetime value of a customer? How much value does a customer bring to HighNote if the offer converts them to premium?
  2. Would they have converted anyway, without the offer?
  3. Do we offer too much free content? Perhaps users are fine with ads because the jump to premium isn't significant enough.
  4. What is the historical conversion rate on similar campaigns?
  5. Are the premium features compelling enough to make a significant difference from the free service, beyond removing ads?