# Packages used throughout this analysis
library(dplyr)    # data manipulation
library(tidyr)    # spread()
library(stringr)  # str_detect(), str_replace_all()
library(car)      # marginalModelPlots()
library(pROC)     # roc(), coords()
library(caret)    # confusionMatrix()

#setwd("/Users/dpong/Data 621/Final_Project/Datasets")
setwd("~/Library/CloudStorage/OneDrive-CityUniversityofNewYork/621/final_churn_modeling")
df <- read.csv("sparkify-medium.csv", stringsAsFactors = FALSE, row.names = 1)

Data Source

In this analysis, we will use the Sparkify data set created by Udacity, an educational organization. Sparkify is a fictional company providing a music streaming service.

The full data set is too large to process on a regular personal computer, so we will focus on a subset consisting of 543,705 user activity records from 448 users between 10/1/2018 and 12/1/2018. Each record describes a single user activity; the fields used in this analysis are inspected below.
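
The original field listing is not reproduced here, but the columns can be checked directly; the fields referenced later in the analysis are noted in the comment (a quick sketch):

# Inspect the raw columns. The fields used later in this analysis include
# ts, userId, sessionId, registration, page, auth, method, status, level,
# gender, userAgent, location, firstName, lastName, itemInSession,
# artist, song, and length.
str(df)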

Since the data is fictional, we found several defects that need to be kept in mind during our analysis:

  1. We only have data from 10/1/2018 to 12/1/2018. We may be interested in comparing users’ current activities with their activities during the period right after they registered; however, almost all users in the data set registered before 10/1/2018.
  2. The user agent is static for each user, which is implausible in real life, where people use different devices.
  3. The locations of the users are also static.
  4. By examining the time stamps of the activities and the lengths of the songs, we found that there are no partially listened songs.
  5. There are “Add to Playlist” activities in the records, but no “Remove from Playlist” activities. Together, these two activities could help the company gauge whether a user likes the service, but one of them is missing here.
  6. Only users who canceled the service are considered churned. Free users with no recent activity are not considered churned.

In this analysis, we will clean up and aggregate the user activities based on the user ID. We will then perform feature engineering to build the features / variables for our model.

Data Clean Up and Aggregation

The recorded time of registration is incorrect for a few users (their registration time is after their first login). We correct the time of registration using the “Submit Registration” page and the session ID.

# For each "Submit Registration" event, find the user whose activity in the same
# session is closest in time, and overwrite that user's registration time stamp.
regist_df <- filter(df, page == "Submit Registration")

for (i in c(1:nrow(regist_df))) {
  temp_df <- df %>% 
                filter(sessionId == regist_df$sessionId[i]) %>%
                filter(!is.na(userId)) %>% 
                mutate(delta = abs(ts - regist_df$ts[i])) %>% 
                arrange(delta)

  df[!is.na(df$userId) & df$userId == temp_df$userId[1], "registration"] <- regist_df$ts[i]
}

Filter out the guest records (the ones without a userId).

df <- filter(df,!is.na(userId))

Simplify the userAgent field so that it represents the type of device the user is using.

df$userAgent[str_detect(df$userAgent,"Macintosh")] <- "Macintosh"
df$userAgent[str_detect(df$userAgent,"iPad")] <- "iPad"
df$userAgent[str_detect(df$userAgent,"iPhone")] <- "iPhone"
df$userAgent[str_detect(df$userAgent,"Windows")] <- "Windows"
df$userAgent[str_detect(df$userAgent,"Linux")] <- "Linux"

Convert some categorical variables into factors.

factor_columns <- c("page","auth","method","status","level","gender","userAgent")

df[factor_columns] <- lapply(df[factor_columns], factor)

Remove some variables that are not used in our analysis, for example the method of the web request and the name of the user.

df$method <- NULL
df$status <- NULL
df$itemInSession <- NULL
df$location <- NULL
df$lastName <- NULL
df$firstName <- NULL
df$auth <- NULL

Create a new variable indicating whether the song is one that the user has never listened to before.

# Sort chronologically, then flag the first listen of each (user, artist, song)
# combination: new_song = 1 for a first listen, 0 otherwise, NA for non-song rows.
df <- arrange(df, ts)

df$user_song <- paste0(df$userId, df$artist, df$song)
temp <- df %>% group_by(user_song) %>% mutate(count=row_number())
df$new_song <- temp$count
temp <- NULL
df$user_song <- NULL
df$new_song[df$new_song > 1] <- 0
df$new_song[is.na(df$song)] <- NA

Aggregate the total number of records for each category of user activity (indicated by the page variable). For example, we would like to know the total number of songs listened to by a user and the total number of thumbs-ups given by a user.

page_df <- df %>% group_by(userId) %>% 
  count(page) %>% 
  spread(page, n, fill = 0)

#Cancel column is identical to "Cancellation Confirmation" so it is removed
page_df$Cancel <- NULL

page_df[,2:ncol(page_df)] <- sapply(page_df[,2:ncol(page_df)], as.integer)
page_df$Total_Activities <- apply(page_df[,2:ncol(page_df)], 1, sum)

page_df

Summarize additional user activities (for example, the total number of unique sessions) and user information (for example, the user’s account level at the time of the last activity).

user_df <- df %>% filter(!is.na(song)) %>% 
  arrange(ts) %>% 
  group_by(userId) %>% 
  summarise(active_sessions=n_distinct(sessionId),
            new_songs_listened=sum(new_song),
            registration=first(registration),
            end_level=last(level),
            gender=first(gender),
            userAgent=first(userAgent))
user_df

Find the beginning and ending time stamps of the observation period for each user. For users who registered before 10/1/2018, the observation starts at 10/1/2018 (1538352000000). Otherwise, the observation starts at the time of registration. For users who cancelled their service, the observation ends at the time stamp of their last activity. Otherwise, the observation ends at 12/1/2018 (1543622400000).
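
As a quick check on these constants, the time stamps are in milliseconds since the Unix epoch, so the two cutoffs can be verified directly:

# Time stamps are in milliseconds since the Unix epoch.
as.POSIXct(1538352000000 / 1000, origin = "1970-01-01", tz = "UTC")  # "2018-10-01 UTC"
as.POSIXct(1543622400000 / 1000, origin = "1970-01-01", tz = "UTC")  # "2018-12-01 UTC"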

# obs_df and user_df are both ordered by userId; the assignments below rely on
# their rows lining up by user.
df <- df %>% arrange(userId)

obs_df <- data.frame(userId=unique(df$userId))
obs_df$start <- ifelse(user_df$registration > 1538352000000, user_df$registration, 1538352000000)
obs_df$end <- 1543622400000
temp <- filter(df, page == "Cancellation Confirmation")
obs_df$end[obs_df$userId %in% temp$userId] <- temp$ts
#Count the ads rolled while the account level was paid / free (converted to per-hour rates later)

roll_ad_df <- df %>% filter(page == "Roll Advert") %>%
  group_by(userId, level) %>%
  count() %>%
  spread(level, n, fill = 0)
#percentage of the song listening time with account level = paid

paid_time_df <- df %>% filter(!is.na(length)) %>%
  group_by(userId, level) %>%
  summarise(length = sum(length)) %>%
  spread(level, length, fill = 0)

paid_time_df$percent_paid <- paid_time_df$paid / (paid_time_df$free + paid_time_df$paid)
paid_time_df$free <- NULL
paid_time_df$paid <- NULL

Merge the previously aggregated data into one data frame.

prepared_df <- merge(obs_df, user_df, by=c("userId")) %>% 
                arrange(userId)
  
prepared_df <- merge(prepared_df, page_df, by=c("userId")) %>% 
                arrange(userId)

prepared_df <- merge(prepared_df, roll_ad_df, by=c("userId"), all.x=TRUE)

prepared_df <- merge(prepared_df, paid_time_df, by=c("userId"), all.x=TRUE)


prepared_df[is.na(prepared_df)] <- 0

names(prepared_df) <- str_replace_all(names(prepared_df), " ", "_")

#df <- merge(df, prepared_df[c("userId","start","end")], by=c("userId"))

Feature Engineering

Calculate the features that will be used as predictors for identifying users who are likely to churn.

train_df <- dplyr::select(prepared_df,userId,end_level,gender,userAgent)
train_df$churn <- as.factor(prepared_df$Cancellation_Confirmation)
#Since the lengths of the observation period differ across users (new registrations and terminated users), we scale the activity counts by the total number of hours in each user's observation period.
prepared_df$duration_in_hours <- (prepared_df$end - prepared_df$start)/3600/1000

train_df$tot_act_phour <- prepared_df$Total_Activities/prepared_df$duration_in_hours
train_df$songs_phour <- prepared_df$NextSong/prepared_df$duration_in_hours
train_df$tot_tu_phour <- prepared_df$Thumbs_Up/prepared_df$duration_in_hours
train_df$tot_td_phour <- prepared_df$Thumbs_Down/prepared_df$duration_in_hours
train_df$frds_added_phour <- prepared_df$Add_Friend/prepared_df$duration_in_hours
train_df$tot_add2PL_phour <- prepared_df$Add_to_Playlist/prepared_df$duration_in_hours
train_df$HP_visits_phour <- prepared_df$Home/prepared_df$duration_in_hours
train_df$tot_errs_phour <- prepared_df$Error/prepared_df$duration_in_hours
train_df$upgrades_phour <- prepared_df$Submit_Upgrade/prepared_df$duration_in_hours
train_df$downgrades_phour <- prepared_df$Submit_Downgrade/prepared_df$duration_in_hours
train_df$setting_phour <- prepared_df$Settings/prepared_df$duration_in_hours
train_df$save_setting_phour <- prepared_df$Save_Settings/prepared_df$duration_in_hours
train_df$song_ratio <- prepared_df$NextSong / prepared_df$Total_Activities
train_df$new_songs_ratio <- prepared_df$new_songs_listened / prepared_df$NextSong
train_df$pos_negative_ratio <- (prepared_df$Thumbs_Up+1)/(prepared_df$Thumbs_Down+1)
train_df$ad_per_song <- prepared_df$Roll_Advert / prepared_df$NextSong

train_df$paid_ad_ph <- prepared_df$paid/prepared_df$duration_in_hours
train_df$free_ad_ph <- prepared_df$free/prepared_df$duration_in_hours

train_df$percent_paid <- prepared_df$percent_paid
# Calculation of user's average number of events per session
session_avg <- df %>% 
                group_by(userId, sessionId) %>%
                summarise(events = n(), .groups = 'drop') %>%
                group_by(userId) %>%
                summarise(avg_events_per_session = mean(events)) 
# Calculation of user's average session duration

session_avg_length <- df %>% 
                    group_by(userId, sessionId) %>%
                    arrange(ts, .by_group = TRUE) %>% 
                    summarise( session_begin_ts = min(ts), 
                               session_end_ts = max(ts), 
                               .groups = 'drop') %>% 
                    group_by(userId) %>% 
                    summarise( avg_session_duration = mean(session_end_ts-session_begin_ts))

#Convert from timestamp unit to hour
session_avg_length$avg_session_duration <- session_avg_length$avg_session_duration / 3600000

Incorporate the newly defined session metrics into the modeling data, since they are used as predictors below.
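
The join itself is not shown above; here is a minimal sketch of how the two session summaries could be attached, assuming userId is the key. The user ID itself is not a predictor (it does not appear in the model summaries below), so it is dropped afterwards.

# Sketch: attach the session-level summaries to the modeling data frame by userId.
train_df <- train_df %>%
  left_join(session_avg, by = "userId") %>%
  left_join(session_avg_length, by = "userId")

# The identifier is not used as a predictor, so drop it before modeling.
train_df$userId <- NULL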

The following are the features created for our analysis.

Relations

Collinearity Check for Numeric Variables

We plugged all the predictors (independent variables) into a correlation matrix to visualize whether any of them exhibit multicollinearity.
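
The correlation matrix itself is omitted here; below is a minimal sketch of how it can be computed and visualized, assuming the corrplot package is available (any correlation heat map would serve).

library(corrplot)

# Pairwise correlations of the numeric predictors in the modeling data frame.
num_vars <- train_df[, sapply(train_df, is.numeric)]
corrplot(cor(num_vars, use = "pairwise.complete.obs"),
         method = "color", type = "upper", tl.cex = 0.7)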

To reduce multicollinearity in our data and the complexity of our model, variables with a correlation above 0.8 are considered highly correlated, and some of them are excluded as described below:

tot_act_phour is highly correlated with songs_phour, tot_tu_phour, tot_td_phour, frds_added_phour, tot_add2PL_phour, HP_visits_phour, and setting_phour. As song listening is the main service here and we would like to understand the listening experience, we keep songs_phour and drop the other variables.

HP_visits_phour is highly correlated with free_ad_ph. Likewise, setting_phour is highly positively correlated with HP_visits_phour, but setting_phour was already dropped in the previous step. As we would like to know the users’ opinion about the ads, we keep free_ad_ph and drop HP_visits_phour.

As expected, avg_events_per_session and avg_session_duration are highly correlated: longer sessions have more events. We keep avg_session_duration and drop avg_events_per_session.

To summarize, here is the list of variables we want to remove:

  • tot_act_phour
  • tot_tu_phour
  • tot_td_phour
  • frds_added_phour
  • tot_add2PL_phour
  • HP_visits_phour
  • setting_phour
  • avg_events_per_session

train_df$tot_act_phour <- NULL
train_df$tot_tu_phour <- NULL
train_df$tot_td_phour <- NULL
train_df$frds_added_phour <- NULL
train_df$tot_add2PL_phour <- NULL
train_df$HP_visits_phour <- NULL
train_df$setting_phour <- NULL
train_df$avg_events_per_session <- NULL

We have the following correlations after the highly correlated ones are removed.

Distribution of categorical variables
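
The plots in this subsection were generated from train_df; here is a minimal sketch of one way to produce them, assuming the ggplot2 package is available, showing the churn proportion within each level of the categorical predictors.

library(ggplot2)

# Churn proportion by account level, gender, and device type.
train_df %>%
  dplyr::select(churn, end_level, gender, userAgent) %>%
  gather(variable, value, -churn) %>%
  ggplot(aes(x = value, fill = churn)) +
  geom_bar(position = "fill") +
  facet_wrap(~ variable, scales = "free_x") +
  labs(y = "proportion of users")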

From the plots, we find:

Users with account level = paid seem to have a higher churn rate. The user’s gender seems to have no effect on the likelihood of churning. iPhone users seem more likely to churn than users of other devices; perhaps the app does not run well on the phone.

Distribution of numeric variables

In the following plots, some variables are log-transformed for better visualization; they are not transformed in the data used for modeling.
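
Here is a minimal sketch of one of these plots, again assuming ggplot2, with songs_phour as an example of a log-scaled variable.

# Distribution of songs listened to per hour by churn status; the log scale is
# for display only, and the modeling data stay untransformed.
ggplot(train_df, aes(x = songs_phour, fill = churn)) +
  geom_density(alpha = 0.5) +
  scale_x_log10()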

For the user activities songs_phour, tot_errs_phour, upgrades_phour, downgrades_phour, and save_setting_phour, the higher the activity rate, the more likely the user is to churn. This may imply that the more a user uses the service, the more dissatisfied the user becomes.

pos_negative_ratio seems to have little effect on the churn rate. This may imply that the users’ dissatisfaction is largely unrelated to the quality of the songs.

The effect of advertisements is not obvious in the ad_per_song plot. However, free_ad_ph and paid_ad_ph do show that hearing more ads leads to a higher rate of churn. The users’ dissatisfaction may come from the advertisements.

For percent_paid, users who do not change their account level (the ones staying 100% of the time free or 100% of the time paid) seem to have a lower churn rate; they are content with the service they are using. Those who switched their account level may still feel unsatisfied with the service.

song_ratio, new_songs_ratio, and avg_session_duration do not show any obvious patterns related to churning.

The above findings are only impressions from the plots; the actual effects need to be confirmed by our model.

Lastly, the target variable in the training data set is observed to be imbalanced.

summary(train_df$churn)
##   0   1 
## 349  99

In order to build a less biased model, we use upsampling: the method adds duplicate records of the minority class so that its size is adjusted to be close to the size of the majority class.

temp <- train_df %>% filter(churn == 1) %>% 
      slice(rep(1:n(), 
            round(nrow(filter(train_df, churn == 0))/
                    nrow(filter(train_df, churn == 1)),0)-1))
train_df2 <- bind_rows(train_df, temp)

Finally, we can start building our model.

First, let’s build our preliminary model with all the defined features.

model_logi <- glm(churn~.,family = binomial, train_df2)

Checking the marginal model plots, we find that the variable new_songs_ratio does not fit well in our model.

marginalModelPlots(model_logi,~new_songs_ratio)

In order to fit the curvature, we will add a squared term of new_songs_ratio to our model.

model_logi <- glm(churn~.+I(new_songs_ratio^2),family = binomial, train_df2)
summary(model_logi)
## 
## Call:
## glm(formula = churn ~ . + I(new_songs_ratio^2), family = binomial, 
##     data = train_df2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0027  -0.4708   0.0000   0.2581   2.4211  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -6.290e+02  7.097e+02  -0.886  0.37541    
## end_levelpaid         9.017e-01  4.897e-01   1.841  0.06556 .  
## genderM               8.163e-02  2.642e-01   0.309  0.75736    
## userAgentiPhone       1.829e+01  7.050e+02   0.026  0.97931    
## userAgentLinux        1.609e+01  7.050e+02   0.023  0.98179    
## userAgentMacintosh    1.601e+01  7.050e+02   0.023  0.98188    
## userAgentWindows      1.597e+01  7.050e+02   0.023  0.98193    
## songs_phour           8.935e+00  1.054e+00   8.480  < 2e-16 ***
## tot_errs_phour       -1.990e+01  1.373e+02  -0.145  0.88472    
## upgrades_phour        1.094e+02  2.475e+02   0.442  0.65843    
## downgrades_phour     -2.055e+02  3.942e+02  -0.521  0.60220    
## save_setting_phour    3.036e+01  1.070e+02   0.284  0.77658    
## song_ratio           -4.434e+00  3.872e+00  -1.145  0.25216    
## new_songs_ratio       1.163e+03  1.640e+02   7.091 1.34e-12 ***
## pos_negative_ratio   -1.506e-01  5.470e-02  -2.753  0.00591 ** 
## ad_per_song           9.938e+00  5.395e+00   1.842  0.06544 .  
## paid_ad_ph            2.782e+02  9.471e+01   2.938  0.00331 ** 
## free_ad_ph            4.491e+01  1.376e+01   3.263  0.00110 ** 
## percent_paid          2.241e+00  8.144e-01   2.751  0.00594 ** 
## avg_session_duration  1.811e-03  7.150e-02   0.025  0.97979    
## I(new_songs_ratio^2) -5.495e+02  8.246e+01  -6.664 2.66e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1029.82  on 744  degrees of freedom
## Residual deviance:  428.78  on 724  degrees of freedom
## AIC: 470.78
## 
## Number of Fisher Scoring iterations: 15

Our full model contains some features that are not significant. We apply backward elimination based on the AIC score.

model_logi <- step(model_logi, trace=0)

The following is the result of our final model. Most of the coefficients are statistically significant.

summary(model_logi)
## 
## Call:
## glm(formula = churn ~ end_level + userAgent + songs_phour + new_songs_ratio + 
##     pos_negative_ratio + ad_per_song + paid_ad_ph + free_ad_ph + 
##     percent_paid + I(new_songs_ratio^2), family = binomial, data = train_df2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8177  -0.4831   0.0000   0.2596   2.4452  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -628.87451  692.48295  -0.908 0.363802    
## end_levelpaid           1.10537    0.38790   2.850 0.004377 ** 
## userAgentiPhone        18.36385  688.03763   0.027 0.978707    
## userAgentLinux         16.16706  688.03760   0.023 0.981254    
## userAgentMacintosh     16.07003  688.03745   0.023 0.981366    
## userAgentWindows       16.02738  688.03744   0.023 0.981415    
## songs_phour             8.93534    0.97286   9.185  < 2e-16 ***
## new_songs_ratio      1153.47860  156.71905   7.360 1.84e-13 ***
## pos_negative_ratio     -0.14576    0.05143  -2.834 0.004593 ** 
## ad_per_song            11.74476    5.15449   2.279 0.022694 *  
## paid_ad_ph            277.22956   90.61246   3.060 0.002217 ** 
## free_ad_ph             45.63953   12.18794   3.745 0.000181 ***
## percent_paid            1.96790    0.74120   2.655 0.007930 ** 
## I(new_songs_ratio^2) -543.96482   78.58793  -6.922 4.46e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1029.8  on 744  degrees of freedom
## Residual deviance:  430.5  on 731  degrees of freedom
## AIC: 458.5
## 
## Number of Fisher Scoring iterations: 15

The marginal model plots show no lack of fit for the model.

marginalModelPlots(model_logi,~songs_phour+new_songs_ratio+pos_negative_ratio+ad_per_song+
                     paid_ad_ph+free_ad_ph+percent_paid, layout =c(3,3))

The following deviance residual vs. linear predictor plot also supports the model: the predictions are accurate at the two ends, and the residuals around 0 show no systematic pattern.

residual_df <- mutate(train_df2, residuals=residuals(model_logi,type="deviance"), 
                      linpred=predict(model_logi,type = "link"))
gdf <- group_by(residual_df, cut(linpred, breaks=unique(quantile(linpred,(1:100)/101))))
diagdf <- summarise(gdf, residuals=mean(residuals), linpred=mean(linpred))
plot(residuals ~ linpred, diagdf, xlab="linear predictor",xlim=c(-20,20))

Now let’s check the performance of our model.

rocCurve <- roc(train_df2$churn, model_logi$fitted.values)
plot(rocCurve)

rocCurve$auc
## Area under the curve: 0.9445

We have an AUC of 0.9445, which indicates good discrimination. The optimal threshold for classifying the target variable churn is:

optimal_threshold <- coords(rocCurve, "best", ret = "threshold")
optimal_threshold

Performance evaluation using the up-sampled data

predicted_class <- ifelse(model_logi$fitted.values>optimal_threshold[1,1],1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
                                      train_df2$churn,
                                    mode = "everything",positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 331  44
##          1  18 352
##                                           
##                Accuracy : 0.9168          
##                  95% CI : (0.8946, 0.9356)
##     No Information Rate : 0.5315          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8336          
##                                           
##  Mcnemar's Test P-Value : 0.001498        
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 0.9484          
##          Pos Pred Value : 0.9514          
##          Neg Pred Value : 0.8827          
##               Precision : 0.9514          
##                  Recall : 0.8889          
##                      F1 : 0.9191          
##              Prevalence : 0.5315          
##          Detection Rate : 0.4725          
##    Detection Prevalence : 0.4966          
##       Balanced Accuracy : 0.9187          
##                                           
##        'Positive' Class : 1               
## 

Performance evaluation using the pre-up-sampled data

predicted_class <- ifelse(predict(model_logi,train_df,type="response")>optimal_threshold[1,1],1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
                                      train_df$churn,
                                    mode = "everything",positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 331  11
##          1  18  88
##                                           
##                Accuracy : 0.9353          
##                  95% CI : (0.9084, 0.9562)
##     No Information Rate : 0.779           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8166          
##                                           
##  Mcnemar's Test P-Value : 0.2652          
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 0.9484          
##          Pos Pred Value : 0.8302          
##          Neg Pred Value : 0.9678          
##               Precision : 0.8302          
##                  Recall : 0.8889          
##                      F1 : 0.8585          
##              Prevalence : 0.2210          
##          Detection Rate : 0.1964          
##    Detection Prevalence : 0.2366          
##       Balanced Accuracy : 0.9187          
##                                           
##        'Positive' Class : 1               
## 

The evaluations using the original (pre-upsampling) data and the upsampled data both have a Balanced Accuracy of over 91%.
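
As a quick check, balanced accuracy is simply the mean of sensitivity and specificity, which can be verified from the confusion matrix output above:

# Balanced Accuracy = (Sensitivity + Specificity) / 2
(0.8889 + 0.9484) / 2  # = 0.91865, matching the 0.9187 reported above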

Conclusion

Now let’s confirm the findings from our model output.

summary(model_logi)
## 
## Call:
## glm(formula = churn ~ end_level + userAgent + songs_phour + new_songs_ratio + 
##     pos_negative_ratio + ad_per_song + paid_ad_ph + free_ad_ph + 
##     percent_paid + I(new_songs_ratio^2), family = binomial, data = train_df2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8177  -0.4831   0.0000   0.2596   2.4452  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -628.87451  692.48295  -0.908 0.363802    
## end_levelpaid           1.10537    0.38790   2.850 0.004377 ** 
## userAgentiPhone        18.36385  688.03763   0.027 0.978707    
## userAgentLinux         16.16706  688.03760   0.023 0.981254    
## userAgentMacintosh     16.07003  688.03745   0.023 0.981366    
## userAgentWindows       16.02738  688.03744   0.023 0.981415    
## songs_phour             8.93534    0.97286   9.185  < 2e-16 ***
## new_songs_ratio      1153.47860  156.71905   7.360 1.84e-13 ***
## pos_negative_ratio     -0.14576    0.05143  -2.834 0.004593 ** 
## ad_per_song            11.74476    5.15449   2.279 0.022694 *  
## paid_ad_ph            277.22956   90.61246   3.060 0.002217 ** 
## free_ad_ph             45.63953   12.18794   3.745 0.000181 ***
## percent_paid            1.96790    0.74120   2.655 0.007930 ** 
## I(new_songs_ratio^2) -543.96482   78.58793  -6.922 4.46e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1029.8  on 744  degrees of freedom
## Residual deviance:  430.5  on 731  degrees of freedom
## AIC: 458.5
## 
## Number of Fisher Scoring iterations: 15
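
Because the estimates are on the log-odds scale, converting them to odds ratios can make the findings easier to read. Below is a small sketch; the very large userAgent coefficients (with their huge standard errors) should not be over-interpreted.

# Odds ratios for the final model's coefficients.
# For example, exp(1.105) is about 3.0 for end_levelpaid: users ending on the
# paid tier have roughly three times the odds of churning, other things equal.
round(exp(coef(model_logi)), 3)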

Though our model may help predict which users are likely to churn, so that the company can take actions such as offering discounts to encourage them to stay, it is better to improve the service itself to lower the chance of churn.

Based on our findings, we have the following recommendations: