library("stringr")
library("dplyr")
library("tidyr")

#library("arm")
library("pROC")
library("car")
library("caret")

library("reshape2")
library("patchwork")

Note:

  1. No user uses multiple devices.
  2. Users stay in the same location.
  3. Every song is played to completion before the next song starts.
  4. There is no “remove from Playlist” record.
  5. Only users who canceled the service are considered churned; free users with no activity are not counted as churned.
  6. We only have data from 10/1/2018 to 12/1/2018 (checked with the timestamp range sketch after the data is loaded below).
#setwd("/Users/dpong/Data 621/Final_Project/Datasets")
setwd("~/Library/CloudStorage/OneDrive-CityUniversityofNewYork/621/final_churn_modeling")
df <- read.csv("sparkify-medium.csv", stringsAsFactors = FALSE, row.names=1)
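
As a quick sanity check on assumption 6 above, we can look at the range of the event timestamps. This is a minimal sketch; it assumes ts is an epoch time in milliseconds, consistent with the constants used later in the analysis.

# convert millisecond epoch timestamps to POSIXct and inspect the range
range(as.POSIXct(df$ts / 1000, origin = "1970-01-01", tz = "UTC"), na.rm = TRUE)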

The time of registration is incorrect for a few users (the registration timestamp is later than the user’s first log-in). Correct the time of registration using the “Submit Registration” events and the session ID.

regist_df <- filter(df, page == "Submit Registration")

for (i in c(1:nrow(regist_df))) {
  # find the logged-in record from the same session that is closest in time
  # to the "Submit Registration" event
  temp_df <- df %>% 
                filter(sessionId == regist_df$sessionId[i]) %>%
                filter(!is.na(userId)) %>% 
                mutate(delta = abs(ts - regist_df$ts[i])) %>% 
                arrange(delta)

  # overwrite that user's registration timestamp with the submission time
  df[!is.na(df$userId) & df$userId == temp_df$userId[1], "registration"] <- regist_df$ts[i]
}

Filter out the guest records (those without a userId).

df <- filter(df,!is.na(userId))

Simplify the userAgent field so that it represents the type of device the user is using.

df$userAgent[str_detect(df$userAgent,"Macintosh")] <- "Macintosh"
df$userAgent[str_detect(df$userAgent,"iPad")] <- "iPad"
df$userAgent[str_detect(df$userAgent,"iPhone")] <- "iPhone"
df$userAgent[str_detect(df$userAgent,"Windows")] <- "Windows"
df$userAgent[str_detect(df$userAgent,"Linux")] <- "Linux"

# keep only the state code(s) from the "City, ST" location string
df$location <- str_replace(str_extract(df$location, ", .+"), ", ", "")

Select a subset of the activities that may be significant predictors. Activities that seem insignificant, such as visiting the home page or the settings page, could be excluded (the filter below is left commented out).

# selected_pages <- c("NextSong","Roll Advert","Add Friend","Thumbs Up",
#                     "Add to Playlist", "Upgrade", "Submit Upgrade", "Error",
#                     "Thumbs Down","Cancel", "Cancellation Confirmation",
#                     "Downgrade", "Submit Downgrade","Submit Registration")
# df <- df[df$page %in% selected_pages,]

Convert some categorical variables into factors.

factor_columns <- c("page","auth","method","status","level","gender","userAgent","location") # location added as a factor

df[factor_columns] <- lapply(df[factor_columns], factor)

Remove some variables that are not used in our analysis.

df$home <- NULL
df$method <- NULL
df$status <- NULL
df$itemInSession <- NULL
#df$location <- NULL # keep location to test whether it is associated with churn
df$lastName <- NULL
df$firstName <- NULL
df$auth <- NULL

Create a new variable indicating whether the song is one that the user has never listened to before.

df <- arrange(df, ts)

# flag the first time a user plays a given artist/song combination
df$user_song <- paste0(df$userId, df$artist, df$song)
temp <- df %>% group_by(user_song) %>% mutate(count=row_number())
df$new_song <- temp$count
temp <- NULL
df$user_song <- NULL
df$new_song[df$new_song > 1] <- 0   # repeat plays are not new songs
df$new_song[is.na(df$song)] <- NA   # non-song events get NA
page_df <- df %>% group_by(userId) %>% 
  count(page) %>% 
  spread(page, n, fill = 0)

#Cancel column is identical to "Cancellation Confirmation" so it is removed
page_df$Cancel <- NULL

page_df[,2:ncol(page_df)] <- sapply(page_df[,2:ncol(page_df)], as.integer)
page_df$Total_Activities <- apply(page_df[,2:ncol(page_df)], 1, sum)

page_df
# per-user summary of song-playing activity and profile attributes
user_df <- df %>% filter(!is.na(song)) %>% 
  arrange(ts) %>% 
  group_by(userId) %>% 
  summarise(active_sessions=n_distinct(sessionId),
            new_songs_listened=sum(new_song),
            registration=first(registration),
            end_level=last(level),
            gender=first(gender),
            userAgent=first(userAgent),
            location=first(location))
user_df
#Euclid: Fix the bug by sorting the user Id first
df <- df %>% arrange(userId)

# observation window runs from 2018-10-01 to 2018-12-01 (epoch milliseconds);
# obs_df rows are assumed to line up with user_df after the sort above
obs_df <- data.frame(userId=unique(df$userId))
obs_df$start <- ifelse(user_df$registration > 1538352000000, user_df$registration, 1538352000000)
obs_df$end <- 1543622400000

# churned users' observation window ends at their cancellation timestamp
temp <- filter(df, page == "Cancellation Confirmation")
obs_df$end[obs_df$userId %in% temp$userId] <- temp$ts

# obs_df
prepared_df <- merge(obs_df, user_df, by=c("userId")) %>% 
                arrange(userId)
  
prepared_df <- merge(prepared_df, page_df, by=c("userId")) %>% 
                arrange(userId)

names(prepared_df) <- str_replace_all(names(prepared_df), " ", "_")

prepared_df
lookback_window <- 14
# corrected the mistake by Euclid. Replaced 100 with 1000
# 14-day window expressed in milliseconds: days * 24 h * 3600 s * 1000 ms
sub_obs_time_frame <- lookback_window * 24 * 3600 * 1000
df <- merge(df, prepared_df[c("userId","start","end")], by=c("userId"))
df
df_recent <- filter(df, ts >= end - sub_obs_time_frame)    # last 14 days of each user's window
df_oldest <- filter(df, ts <= start + sub_obs_time_frame)  # first 14 days of each user's window
temp <- df_recent %>% 
        group_by(userId) %>% 
        summarise(recent_total_act = n())

prepared_df <- merge(prepared_df, temp, by=c("userId"), all.x=TRUE)

temp <- df_recent %>%  filter(page == "NextSong") %>% 
                        group_by(userId) %>% 
                        summarise(recent_total_song = n())

prepared_df <- merge(prepared_df, temp, by=c("userId"), all.x=TRUE)

temp <- df_oldest %>% 
        group_by(userId) %>% 
        summarise(oldest_total_act = n())

prepared_df <- merge(prepared_df, temp, by=c("userId"), all.x=TRUE)

temp <- df_oldest %>%  
        filter(page == "NextSong") %>% 
        group_by(userId) %>% 
        summarise(oldest_total_song = n())

prepared_df <- merge(prepared_df, temp, by=c("userId"), all.x=TRUE)

temp <- NULL
#Euclid: fix the records with no activities
prepared_df[is.na(prepared_df)] <- 0
prepared_df

Calculation of the engineered features that will be used as predictors for identifying users who are about to churn.

train_df <- dplyr::select(prepared_df,userId,end_level,gender,userAgent, location) # add location
train_df$churn <- as.factor(prepared_df$Cancellation_Confirmation)
prepared_df$duration_in_hours <- (prepared_df$end - prepared_df$start)/3600/1000

train_df$tot_act_phour <- prepared_df$Total_Activities/prepared_df$duration_in_hours
train_df$songs_phour <- prepared_df$NextSong/prepared_df$duration_in_hours
train_df$tot_tu_phour <- prepared_df$Thumbs_Up/prepared_df$duration_in_hours
train_df$tot_td_phour <- prepared_df$Thumbs_Down/prepared_df$duration_in_hours
train_df$frds_added_phour <- prepared_df$Add_Friend/prepared_df$duration_in_hours
train_df$tot_add2PL_phour <- prepared_df$Add_to_Playlist/prepared_df$duration_in_hours
train_df$HP_visits_phour <- prepared_df$Home/prepared_df$duration_in_hours
#Euclid: Remove duplicated code
#prepared_df$HP_visits_phour <- prepared_df$Home/prepared_df$duration_in_hours
train_df$tot_ads_phour <- prepared_df$Roll_Advert/prepared_df$duration_in_hours
train_df$tot_errs_phour <- prepared_df$Error/prepared_df$duration_in_hours
#Euclid: keep upgrade and downgrade separated
#train_df$upgrade_downgrades_phour <- (prepared_df$Submit_Upgrade + prepared_df$Submit_Downgrade)/prepared_df$duration_in_hours
train_df$upgrades_phour <- prepared_df$Submit_Upgrade/prepared_df$duration_in_hours
train_df$downgrades_phour <- prepared_df$Submit_Downgrade/prepared_df$duration_in_hours

#train_df$upgrade_downgrades <- (prepared_df$Submit_Upgrade + prepared_df$Submit_Downgrade)

train_df$song_ratio <- prepared_df$NextSong / prepared_df$Total_Activities
train_df$new_songs_ratio <- prepared_df$new_songs_listened / prepared_df$NextSong

#prepared_df$pos_negative_ratio <- prepared_df$Thumbs_Up/(prepared_df$Thumbs_Down + 0.0001)
#Euclid: Change to
train_df$pos_negative_ratio <- (prepared_df$Thumbs_Up+1)/(prepared_df$Thumbs_Down+1)

#Euclid: calculate the difference between recent and oldest activities
# train_df$tot_recent_act_phour  <- prepared_df$recent_total_act / lookback_window / 24
# train_df$tot_oldest_act_phour  <- prepared_df$oldest_total_act / lookback_window / 24
# train_df$recent_songs_phour  <- prepared_df$recent_total_song / lookback_window / 24
# train_df$oldest_songs_phour  <- prepared_df$oldest_total_song / lookback_window / 24
train_df$diff_act_phour <- (prepared_df$recent_total_act-prepared_df$oldest_total_act) / lookback_window / 24
train_df$diff_song_phour <- (prepared_df$recent_total_song-prepared_df$oldest_total_song) / lookback_window / 24


train_df
# Calculation of user's average number of events per session
session_avg <- df %>% 
                group_by(userId, sessionId) %>%
                summarise(events = n(), .groups = 'drop') %>%
                group_by(userId) %>%
                summarise(avg_events_per_session = mean(events)) 


session_avg
# Calculation of user's average session duration

session_avg_length <- df  %>% 
                    group_by(userId, sessionId) %>%
                    arrange(ts, .by_group = TRUE) %>% 
                    # filter(userId==3) %>%
                    summarise( session_begin_ts = min(ts), 
                               session_end_ts = max(ts), 
                               .groups = 'drop') %>% 
                    group_by(userId) %>% 
                    summarise( avg_session_duration = mean(session_end_ts-session_begin_ts)) # in milliseconds

session_avg_length
# Calculations to obtain user's average number of songs played between home visits
window_home_songs  <-  df  %>% 
                       group_by(userId) %>%
                       arrange(ts, .by_group = TRUE) %>% 
                       mutate(home_visits = cumsum(case_when( page == 'Home' ~ 1, TRUE ~ 0))) %>%
                       # summarise(home_button = case_when( page == 'Home' ~ 1, TRUE ~ 0), .groups = 'drop') %>%
                       group_by(userId, home_visits) %>%
                       summarise(nsongs = sum(case_when(page == 'NextSong' ~ 1, TRUE ~ 0)), 
                                 .groups = 'drop')  %>%
                       group_by(userId) %>%
                       # filter(userId==4) %>%
                       summarise(avg_songs_btwn_home = mean(nsongs))

window_home_songs
# Calculations to obtain user's average number of songs played between ads played
window_ads_songs  <-  df  %>% 
                       group_by(userId) %>%
                       arrange(ts, .by_group = TRUE) %>% 
                       mutate(ads_played = cumsum(case_when( page == 'Roll Advert' ~ 1, TRUE ~ 0))) %>%
                       # summarise(home_button = case_when( page == 'Home' ~ 1, TRUE ~ 0), .groups = 'drop') %>%
                       group_by(userId, ads_played) %>%
                       summarise(nsongs = sum(case_when(page == 'NextSong' ~ 1, TRUE ~ 0)), 
                                 .groups = 'drop')  %>%
                       group_by(userId) %>%
                       # filter(userId==4) %>%
                       summarise(avg_songs_btwn_ads = mean(nsongs))

window_ads_songs
# analysis of ads playing by level

df  %>% 
                       group_by(userId) %>%
                       arrange(ts, .by_group = TRUE) %>% 
                       mutate(ads_played = cumsum(case_when( page == 'Roll Advert' ~ 1, TRUE ~ 0))) %>%
                       group_by(level, ads_played) %>%
                       summarise (sum=n() , .groups = 'drop' )

Incorporating all the newly defined business metrics into the modeling data frame (train_df).

#Keep the same data frame to save memory

train_df <- merge(train_df, session_avg, by=c("userId")) %>% 
                arrange(userId)
  
train_df <- merge(train_df, session_avg_length, by=c("userId")) %>% 
                arrange(userId)

train_df <- merge(train_df, window_home_songs, by=c("userId")) %>% 
                arrange(userId)
  
train_df <- merge(train_df, window_ads_songs, by=c("userId")) %>% 
                arrange(userId)
train_df
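
As a design note, the four merges above could be collapsed into a single step with base Reduce(). The sketch below is equivalent to the chain above if run in its place (all of the tables share the userId key):

# merge all per-user feature tables onto train_df in one pass
train_df <- Reduce(function(x, y) merge(x, y, by = "userId"),
                   list(train_df, session_avg, session_avg_length,
                        window_home_songs, window_ads_songs)) %>%
                arrange(userId)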

EDA

Distributions

By looking at these distribution plots under different constraints, we find the following (a quick tabulation sketch follows this list):

  • Paid subscribers are more likely to churn

  • The most frequent user activity is going to the next song

  • California has the most churned users

  • Males are more likely to churn than females
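
These observations can be backed up with a quick tabulation; a sketch using the train_df and df frames built above:

# churned users by subscription level, gender, and state, plus overall activity counts
train_df %>% filter(churn == 1) %>% count(end_level)
train_df %>% filter(churn == 1) %>% count(gender)
train_df %>% filter(churn == 1) %>% count(location, sort = TRUE) %>% head(5)
df %>% count(page, sort = TRUE) %>% head(3)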

Distribution of level among churned users

ggplot(train_df %>% filter(churn == 1), aes(x = end_level)) + 
    geom_bar() + 
    theme_classic() + 
    labs(x = "level of user",
         title = "distribution of level"
         )

Distribution of in-app activities

ggplot(df, aes(x = page)) + 
    geom_bar() + 
    theme_classic() + 
    theme(axis.text.x = element_text(angle = 90))+
    labs(x = "user activities",
         title = "distribution of user activities(original data)")

Number of churned users by state

ggplot(train_df %>% filter(churn == 1), aes(x=forcats::fct_infreq(location))) + 
    geom_bar()+
    theme_classic() + 
    theme(axis.text.x = element_text(angle = 90)) + 
    labs(x = "state", 
         y = "number of people churned", 
         title = 'churn by states'
         )

Churned users by gender

ggplot(train_df %>% filter(churn == 1), aes(x = gender))+
  geom_bar(na.rm = T) +
  theme_classic() + 
  labs(title = "churned users by gender")

Relations

Collinearity Check for Numeric Variables

correlation = cor(train_df %>% purrr::keep(is.numeric) %>% select(-c(userId)), use = 'pairwise.complete.obs')
corrplot::corrplot(correlation, 'ellipse', type = 'lower',  order = 'hclust')

We plugged all of the predictors (independent variables) into this correlation matrix to visualize whether any of them exhibit multicollinearity.

At first glance, total thumbs-up per hour (tot_tu_phour) is highly positively correlated with friends added per hour (frds_added_phour). Likewise, adds to playlist per hour (tot_add2PL_phour) is highly correlated with total thumbs-up per hour (tot_tu_phour). Adding to the mix, songs played per hour (songs_phour) and total activities per hour (tot_act_phour) are highly positively correlated with the three variables just mentioned. This makes sense, because total songs played and total activities should track the level of thumbs-up, playlist adds, and friends added. The obvious decision is to keep only one of these five variables; I’d opt for friends added per hour (frds_added_phour).

On the other hand, average session duration (avg_session_duration) is highly positively correlated with average events per session (avg_events_per_session). Let’s pick average session duration (avg_session_duration) as the final variable.

Third, total thumbs-down per hour (tot_td_phour) and total ads played per hour (tot_ads_phour) are highly positively correlated. Home page visits per hour (HP_visits_phour) is highly positively correlated with total thumbs-down per hour (tot_td_phour), and also with songs played per hour (songs_phour) and total activities per hour (tot_act_phour). Let’s pick total thumbs-down per hour (tot_td_phour) out of these five variables.

In addition, we spotted two pairs of highly negatively correlated variables: the ratio of new songs listened (new_songs_ratio) with total errors per hour (tot_errs_phour), and the ratio of new songs listened (new_songs_ratio) with total thumbs-up per hour (tot_tu_phour). Out of these three variables we will keep total errors per hour (tot_errs_phour).

To summarize, here is the list of variables we wanted to remove:

  • tot_add2PL_phour

  • tot_tu_phour

  • songs_phour

  • tot_act_phour

  • avg_events_per_session

  • tot_ads_phour

  • HP_visits_phour

  • new_songs_ratio

With that said, here is the list of variables that we eventually wanted to keep: frds_added_phour, tot_td_phour, tot_errs_phour, upgrades_phour, downgrades_phour, song_ratio, pos_negative_ratio, diff_act_phour, diff_song_phour, avg_session_duration, avg_songs_btwn_home, and avg_songs_btwn_ads, plus the categorical predictors end_level, gender, userAgent, and location.
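
A minimal sketch of the corresponding drop, using the variable names listed above (kept in a separate frame, since the models below are still fitted on the full set of predictors):

# reduced predictor set after removing the collinear variables
collinear_vars <- c("tot_add2PL_phour", "tot_tu_phour", "songs_phour",
                    "tot_act_phour", "avg_events_per_session",
                    "tot_ads_phour", "HP_visits_phour", "new_songs_ratio")
train_df_reduced <- train_df %>% select(-all_of(collinear_vars))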

Distribution of activity variables

temp <- train_df %>% select(-c(userId))
temp %>% 
  purrr::keep(is.numeric) %>% 
  cbind(churn = as.factor(train_df$churn)) %>% 
  tidyr::gather("key", "value", - churn, factor_key = T) %>% 
  ggplot(aes(value, color = churn)) +
    facet_wrap(~ key, scales = "free") +
    geom_density()

The churn variable is coded 0 and 1, for Not Churned and Churned respectively; splitting each predictor’s density by churn lets us visually compare the behavior of the two groups.

At first glance, total activities per hour (tot_act_phour), songs played per hour (songs_phour), total thumbs-up per hour (tot_tu_phour), total thumbs-down per hour (tot_td_phour), friends added per hour (frds_added_phour), adds to playlist per hour (tot_add2PL_phour), and home page visits per hour (HP_visits_phour) all exhibit the same pattern. It tells us that churned users tend to have more total activity, play more songs, interact more with the app by giving thumbs-up and thumbs-down, add more friends, add more songs to playlists, and visit the home page more often. It makes sense that, on aggregated average, these users were ramping up all of these activities in the period before they churned.

Total errors encountered per hour (tot_errs_phour) is especially illustrative of the difference in behavior between the churned and non-churned groups. Below an error rate of roughly 0.001875 per hour, the two groups look indistinguishable, but above that threshold there are overwhelmingly more churned users than non-churned ones. This makes sense to us as a team: users who run into errors frequently enough get fed up, which in turn results in them churning from the app.

Upgrades per hour (upgrades_phour) and downgrades per hour (downgrades_phour) show fluctuating patterns for non-churned users. Our interpretation is that both churned and non-churned users choose their rate plan according to their usage and budget, but non-churned users manage their subscriptions more actively. That is why we see more of this activity from non-churned users; managing a subscription is also a more advanced move that requires spending time understanding the app’s options.

Song ratio (song_ratio) measures how often the user skips to the next song relative to all of their activities. The distribution is centered near 0.8 for both churned and non-churned users, but far more non-churned users sit at that ratio than churned users: non-churned users have exercised the next-song action much more than churned users.

Difference in activities per hour (diff_act_phour) and difference in songs listened per hour (diff_song_phour) capture the change in the number of activities or songs played between each user’s most recent and oldest 14-day periods. A positive value means more recent activity and listening than in the initial period. When the difference is 0, meaning the user did not ramp up activity recently, there are naturally more non-churned users than churned users, which follows the pattern seen in song ratio. When the difference is positive, however, meaning the user did more and listened to more recently than in the initial period, there are noticeably more churned users than non-churned users. This makes sense to us.

Finally, we examined the average number of songs played between home page visits (avg_songs_btwn_home). The distribution for non-churned users peaks at a higher number of songs than the distribution for churned users, meaning churned users tend to listen to fewer songs between home page visits. That suggests they are less satisfied with the songs in their playlists and want to go back to the home page more frequently than their non-churned counterparts.
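
The visual impressions above can be cross-checked numerically by comparing the mean of each numeric feature between the churned and non-churned groups; a minimal sketch (assuming dplyr >= 1.0 for across()):

# per-group means of every numeric predictor
train_df %>% 
  select(-userId) %>% 
  group_by(churn) %>% 
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))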

Up sampling

# replicate the churned rows so that the two classes are roughly balanced
temp <- train_df %>% filter(churn == 1) %>% 
      slice(rep(1:n(), 
            round(nrow(filter(train_df, churn == 0))/
                    nrow(filter(train_df, churn == 1)),0)-1))
train_df2 <- bind_rows(train_df, temp)
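
Since caret is already loaded, the same balancing could alternatively be done with caret::upSample(), which resamples the minority class with replacement until the classes are the same size. This is a sketch only; the model below is fitted on train_df2 from the replication approach above.

# alternative up-sampling via caret (not used for the model below)
set.seed(123)
train_df_up <- upSample(x = train_df %>% select(-churn),
                        y = train_df$churn,
                        yname = "churn")
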
model_logi <- glm(churn~.-userId,family = binomial, train_df2)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_logi)
## 
## Call:
## glm(formula = churn ~ . - userId, family = binomial, data = train_df2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1375  -0.1250   0.0000   0.2171   1.8906  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.803e+02  6.936e+03  -0.026 0.979255    
## end_levelpaid           2.026e+00  6.687e-01   3.030 0.002448 ** 
## genderM                -1.801e-01  4.168e-01  -0.432 0.665617    
## userAgentiPhone         2.287e+01  2.546e+03   0.009 0.992831    
## userAgentLinux          2.033e+01  2.546e+03   0.008 0.993628    
## userAgentMacintosh      1.934e+01  2.546e+03   0.008 0.993938    
## userAgentWindows        2.020e+01  2.546e+03   0.008 0.993669    
## locationAL              1.218e+01  6.452e+03   0.002 0.998494    
## locationAR             -4.070e+00  1.254e+04   0.000 0.999741    
## locationAR-OK           6.571e-01  1.254e+04   0.000 0.999958    
## locationAZ              1.420e+01  6.452e+03   0.002 0.998243    
## locationCA              1.296e+01  6.452e+03   0.002 0.998397    
## locationCO              1.217e+01  6.452e+03   0.002 0.998494    
## locationCT              9.912e+00  6.452e+03   0.002 0.998774    
## locationDC-VA-MD-WV     1.542e+01  6.452e+03   0.002 0.998092    
## locationFL              1.461e+01  6.452e+03   0.002 0.998193    
## locationGA              1.555e+01  6.452e+03   0.002 0.998077    
## locationGA-AL           1.611e+01  6.452e+03   0.002 0.998008    
## locationHI             -2.355e+00  1.254e+04   0.000 0.999850    
## locationIA             -6.168e+00  9.703e+03  -0.001 0.999493    
## locationIA-IL-MO       -2.656e+00  1.254e+04   0.000 0.999831    
## locationID             -2.523e+00  1.254e+04   0.000 0.999839    
## locationIL             -3.432e+00  7.507e+03   0.000 0.999635    
## locationIL-IN-WI        1.474e+01  6.452e+03   0.002 0.998178    
## locationIL-MO          -6.051e+00  1.254e+04   0.000 0.999615    
## locationIN              2.510e+00  6.455e+03   0.000 0.999690    
## locationKS             -5.572e+00  8.578e+03  -0.001 0.999482    
## locationKY              1.607e+01  6.452e+03   0.002 0.998013    
## locationKY-IN           1.571e+01  6.452e+03   0.002 0.998058    
## locationLA              8.771e+00  6.452e+03   0.001 0.998915    
## locationMA-CT          -3.589e+00  1.254e+04   0.000 0.999772    
## locationMA-NH          -6.585e+00  1.343e+04   0.000 0.999609    
## locationMD              2.131e+00  6.602e+03   0.000 0.999742    
## locationMD-WV          -3.735e+00  9.164e+03   0.000 0.999675    
## locationME             -4.092e+00  1.254e+04   0.000 0.999740    
## locationMI              1.212e+01  6.452e+03   0.002 0.998501    
## locationMN             -4.866e+00  9.148e+03  -0.001 0.999576    
## locationMN-WI          -5.385e+00  7.243e+03  -0.001 0.999407    
## locationMO-IL           1.471e+01  6.452e+03   0.002 0.998181    
## locationMO-KS           1.345e+01  6.452e+03   0.002 0.998336    
## locationMS              1.137e+01  6.452e+03   0.002 0.998593    
## locationMT             -5.566e+00  9.003e+03  -0.001 0.999507    
## locationNC              9.151e+00  6.452e+03   0.001 0.998868    
## locationNC-SC           1.362e+01  6.452e+03   0.002 0.998315    
## locationND-MN          -3.743e-01  1.254e+04   0.000 0.999976    
## locationNE-IA          -4.982e+00  9.106e+03  -0.001 0.999563    
## locationNH              1.803e+01  6.452e+03   0.003 0.997770    
## locationNJ              1.577e+01  6.452e+03   0.002 0.998049    
## locationNM              1.744e+01  6.452e+03   0.003 0.997843    
## locationNV             -3.201e+00  8.885e+03   0.000 0.999713    
## locationNY              8.136e+00  6.452e+03   0.001 0.998994    
## locationNY-NJ-PA        1.438e+01  6.452e+03   0.002 0.998221    
## locationOH              1.309e+01  6.452e+03   0.002 0.998381    
## locationOH-KY-IN       -4.416e+00  7.906e+03  -0.001 0.999554    
## locationOK              1.653e+01  6.452e+03   0.003 0.997955    
## locationOR              1.429e+01  6.452e+03   0.002 0.998232    
## locationOR-WA           9.260e+00  6.452e+03   0.001 0.998855    
## locationPA              1.456e+01  6.452e+03   0.002 0.998199    
## locationPA-NJ           1.695e+01  6.452e+03   0.003 0.997904    
## locationPA-NJ-DE-MD     1.561e+01  6.452e+03   0.002 0.998069    
## locationRI-MA           1.420e+01  6.452e+03   0.002 0.998244    
## locationSC              7.360e+00  6.453e+03   0.001 0.999090    
## locationSC-NC          -7.073e+00  1.254e+04  -0.001 0.999550    
## locationSD             -4.537e+00  8.872e+03  -0.001 0.999592    
## locationTN              1.373e+01  6.452e+03   0.002 0.998302    
## locationTN-MS-AR       -5.298e+00  8.342e+03  -0.001 0.999493    
## locationTN-VA           2.884e+01  8.399e+03   0.003 0.997261    
## locationTX              1.189e+01  6.452e+03   0.002 0.998530    
## locationUT              1.243e+01  6.452e+03   0.002 0.998463    
## locationUT-ID          -3.669e+00  1.254e+04   0.000 0.999767    
## locationVA              1.423e-01  6.452e+03   0.000 0.999982    
## locationVA-NC           1.138e+01  6.452e+03   0.002 0.998593    
## locationVT             -5.373e+00  8.399e+03  -0.001 0.999490    
## locationWA              1.065e+01  6.452e+03   0.002 0.998682    
## locationWI             -4.922e+00  6.961e+03  -0.001 0.999436    
## locationWV             -4.844e+00  9.642e+03  -0.001 0.999599    
## tot_act_phour           1.949e+02  4.006e+01   4.864 1.15e-06 ***
## songs_phour            -1.892e+02  4.085e+01  -4.632 3.62e-06 ***
## tot_tu_phour           -1.985e+02  3.752e+01  -5.291 1.21e-07 ***
## tot_td_phour           -1.622e+02  6.906e+01  -2.348 0.018876 *  
## frds_added_phour       -3.386e+02  6.177e+01  -5.482 4.20e-08 ***
## tot_add2PL_phour       -3.518e+02  6.784e+01  -5.185 2.16e-07 ***
## HP_visits_phour        -1.571e+02  6.930e+01  -2.267 0.023406 *  
## tot_ads_phour          -1.398e+02  4.059e+01  -3.443 0.000575 ***
## tot_errs_phour         -2.475e+02  2.454e+02  -1.009 0.313054    
## upgrades_phour         -1.457e+01  3.719e+02  -0.039 0.968750    
## downgrades_phour        2.500e+02  6.246e+02   0.400 0.688981    
## song_ratio             -3.768e+00  7.096e+00  -0.531 0.595449    
## new_songs_ratio         1.448e+02  1.704e+01   8.499  < 2e-16 ***
## pos_negative_ratio      7.375e-02  1.017e-01   0.725 0.468264    
## diff_act_phour          4.064e+00  5.295e+00   0.768 0.442755    
## diff_song_phour        -3.458e+00  6.325e+00  -0.547 0.584557    
## avg_events_per_session  6.256e-02  4.716e-02   1.327 0.184626    
## avg_session_duration   -2.958e-07  2.290e-07  -1.292 0.196360    
## avg_songs_btwn_home     1.165e-01  4.573e-02   2.549 0.010812 *  
## avg_songs_btwn_ads      1.173e-03  9.194e-04   1.276 0.202002    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1029.82  on 744  degrees of freedom
## Residual deviance:  276.16  on 649  degrees of freedom
## AIC: 468.16
## 
## Number of Fisher Scoring iterations: 18

Performance evaluation using the up-sampled data

predicted_class <- ifelse(model_logi$fitted.values>0.5,1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
                                      train_df2$churn,
                                    mode = "everything",positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 320  16
##          1  29 380
##                                         
##                Accuracy : 0.9396        
##                  95% CI : (0.92, 0.9556)
##     No Information Rate : 0.5315        
##     P-Value [Acc > NIR] : < 2e-16       
##                                         
##                   Kappa : 0.8784        
##                                         
##  Mcnemar's Test P-Value : 0.07364       
##                                         
##             Sensitivity : 0.9596        
##             Specificity : 0.9169        
##          Pos Pred Value : 0.9291        
##          Neg Pred Value : 0.9524        
##               Precision : 0.9291        
##                  Recall : 0.9596        
##                      F1 : 0.9441        
##              Prevalence : 0.5315        
##          Detection Rate : 0.5101        
##    Detection Prevalence : 0.5490        
##       Balanced Accuracy : 0.9383        
##                                         
##        'Positive' Class : 1             
## 

Performance evaluation using the original (pre-upsampling) data

predicted_class <- ifelse(predict(model_logi,train_df,type="response")>0.5,1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
                                      train_df$churn,
                                    mode = "everything",positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 320   4
##          1  29  95
##                                           
##                Accuracy : 0.9263          
##                  95% CI : (0.8981, 0.9488)
##     No Information Rate : 0.779           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8038          
##                                           
##  Mcnemar's Test P-Value : 2.943e-05       
##                                           
##             Sensitivity : 0.9596          
##             Specificity : 0.9169          
##          Pos Pred Value : 0.7661          
##          Neg Pred Value : 0.9877          
##               Precision : 0.7661          
##                  Recall : 0.9596          
##                      F1 : 0.8520          
##              Prevalence : 0.2210          
##          Detection Rate : 0.2121          
##    Detection Prevalence : 0.2768          
##       Balanced Accuracy : 0.9383          
##                                           
##        'Positive' Class : 1               
##
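
Since pROC is loaded, a threshold-free view of the same model is the ROC curve and its AUC on the original (pre-upsampling) data; a minimal sketch:

# ROC / AUC for the logistic model on the original data
roc_obj <- roc(response = train_df$churn,
               predictor = predict(model_logi, train_df, type = "response"))
auc(roc_obj)
plot(roc_obj)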