#setwd("/Users/dpong/Data 621/Final_Project/Datasets")
setwd("~/Library/CloudStorage/OneDrive-CityUniversityofNewYork/621/final_churn_modeling")
df <- read.csv("sparkify-medium.csv", stringsAsFactors = FALSE, row.names=1)
In this analysis, we will use the Sparkify data set created by Udacity, an educational organization. Sparkify is a fictional company providing a music streaming service.
The full data set is too large to process on a regular personal computer, so we will focus on a subset consisting of 543,705 user activity records from 448 users, covering 10/1/2018 to 12/1/2018. Each record contains the following information:
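A quick way to inspect the available fields and their types in R (a minimal check, not part of the original preprocessing) is:
str(df)   # one row per logged event; lists each column and its type
dim(df)   # number of activity records and number of fields in the subset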
Since this data is fictional, there are several defects in the data that we found and need to be aware of when doing our analysis.
In this analysis, we will clean up and aggregate the user activities by user ID, and then perform feature engineering to build the features/variables for our model.
The registration times of a few users are incorrect (the recorded time of registration is after the user's first login). We correct the time of registration using the "Submit Registration" page event and the session ID.
# Locate each "Submit Registration" event and match it to a user through the session ID
regist_df <- filter(df, page == "Submit Registration")
for (i in seq_len(nrow(regist_df))) {
  # Activities in the same session, ordered by how close they are to the registration event
  temp_df <- df %>%
    filter(sessionId == regist_df$sessionId[i]) %>%
    filter(!is.na(userId)) %>%
    mutate(delta = abs(ts - regist_df$ts[i])) %>%
    arrange(delta)
  # The closest activity identifies the user; overwrite that user's registration timestamp
  df[!is.na(df$userId) & df$userId == temp_df$userId[1], "registration"] <- regist_df$ts[i]
}
Filter out the guest records (the ones without a userId).
df <- filter(df,!is.na(userId))
Simplify the userAgent field so that it represents the type of device the user is using.
df$userAgent[str_detect(df$userAgent,"Macintosh")] <- "Macintosh"
df$userAgent[str_detect(df$userAgent,"iPad")] <- "iPad"
df$userAgent[str_detect(df$userAgent,"iPhone")] <- "iPhone"
df$userAgent[str_detect(df$userAgent,"Windows")] <- "Windows"
df$userAgent[str_detect(df$userAgent,"Linux")] <- "Linux"
Convert some categorical variables into factors.
factor_columns <- c("page","auth","method","status","level","gender","userAgent")
df[factor_columns] <- lapply(df[factor_columns], factor)
Remove some variables that are not used in our analysis, for example the method of the web request and the name of the user.
df$method <- NULL
df$status <- NULL
df$itemInSession <- NULL
df$location <- NULL
df$lastName <- NULL
df$firstName <- NULL
df$auth <- NULL
Create a new variable indicating whether the song is one that the user has never listened to before.
df <- arrange(df, ts)                                  # sort all events chronologically
df$user_song <- paste0(df$userId, df$artist, df$song)  # key identifying a (user, song) pair
temp <- df %>% group_by(user_song) %>% mutate(count = row_number())
df$new_song <- temp$count
temp <- NULL
df$user_song <- NULL
df$new_song[df$new_song > 1] <- 0   # 1 only for the first time that user plays that song
df$new_song[is.na(df$song)] <- NA   # non-song events get NA
Aggregate the total number of records for each category of user activity (indicated by the page variable). For example, we would like to know the total number of songs listened to by a user and the total number of thumbs-ups given by a user.
page_df <- df %>% group_by(userId) %>%
count(page) %>%
spread(page, n, fill = 0)
#Cancel column is identical to "Cancellation Confirmation" so it is removed
page_df$Cancel <- NULL
page_df[,2:ncol(page_df)] <- sapply(page_df[,2:ncol(page_df)], as.integer)
page_df$Total_Activities <- apply(page_df[,2:ncol(page_df)], 1, sum)
page_df
Summarize additional user activities (for example, the total number of unique sessions) and user information (for example, the user's account level at the time of the last activity).
user_df <- df %>% filter(!is.na(song)) %>%   # song-play events only
  arrange(ts) %>%
  group_by(userId) %>%
  summarise(active_sessions = n_distinct(sessionId),
            new_songs_listened = sum(new_song),
            registration = first(registration),
            end_level = last(level),         # account level at the last song play
            gender = first(gender),
            userAgent = first(userAgent))
user_df
Find the beginning and ending time stamps of the observation period for each user. For users who registered before 10/1/2018, the observation starts at 10/1/2018 (1538352000000). Otherwise, the observation starts at the time of registration. For users who cancelled their service, the observation ends at the time stamp of their last activity. Otherwise, the observation ends at 12/1/2018.
df <- df %>% arrange(userId)
obs_df <- data.frame(userId = unique(df$userId))
# Observation starts at the later of the registration time and 10/1/2018
reg_ts <- user_df$registration[match(obs_df$userId, user_df$userId)]
obs_df$start <- ifelse(reg_ts > 1538352000000, reg_ts, 1538352000000)
# Default end of observation is 12/1/2018; cancelled users end at their cancellation event
obs_df$end <- 1543622400000
temp <- filter(df, page == "Cancellation Confirmation")
obs_df$end[match(temp$userId, obs_df$userId)] <- temp$ts
#Count the ads rolled while the user's account level was paid / free (converted to per-hour rates later)
roll_ad_df <- df %>% filter(page == "Roll Advert") %>%
group_by(userId, level) %>%
count() %>%
spread(level, n, fill = 0)
#Percentage of song-listening time spent with account level = paid
paid_time_df <- df %>% filter(!is.na(length)) %>%
group_by(userId, level) %>%
summarise(length = sum(length)) %>%
spread(level, length, fill = 0)
paid_time_df$percent_paid <- paid_time_df$paid / (paid_time_df$free + paid_time_df$paid)
paid_time_df$free <- NULL
paid_time_df$paid <- NULL
Merge the previously aggregated data into one data frame.
prepared_df <- merge(obs_df, user_df, by=c("userId")) %>%
arrange(userId)
prepared_df <- merge(prepared_df, page_df, by=c("userId")) %>%
arrange(userId)
prepared_df <- merge(prepared_df, roll_ad_df, by=c("userId"), all.x=TRUE)
prepared_df <- merge(prepared_df, paid_time_df, by=c("userId"), all.x=TRUE)
prepared_df[is.na(prepared_df)] <- 0
names(prepared_df) <- str_replace_all(names(prepared_df), " ", "_")
#df <- merge(df, prepared_df[c("userId","start","end")], by=c("userId"))
Calculate the defined features that can be used as predictors for identifying users who are likely to churn.
train_df <- dplyr::select(prepared_df,userId,end_level,gender,userAgent)
train_df$churn <- as.factor(prepared_df$Cancellation_Confirmation)
#Since the lengths of the observation periods differ across users (new registrations and terminated users), we scale the activity counts by the total number of hours in each user's observation period.
prepared_df$duration_in_hours <- (prepared_df$end - prepared_df$start)/3600/1000
train_df$tot_act_phour <- prepared_df$Total_Activities/prepared_df$duration_in_hours
train_df$songs_phour <- prepared_df$NextSong/prepared_df$duration_in_hours
train_df$tot_tu_phour <- prepared_df$Thumbs_Up/prepared_df$duration_in_hours
train_df$tot_td_phour <- prepared_df$Thumbs_Down/prepared_df$duration_in_hours
train_df$frds_added_phour <- prepared_df$Add_Friend/prepared_df$duration_in_hours
train_df$tot_add2PL_phour <- prepared_df$Add_to_Playlist/prepared_df$duration_in_hours
train_df$HP_visits_phour <- prepared_df$Home/prepared_df$duration_in_hours
train_df$tot_errs_phour <- prepared_df$Error/prepared_df$duration_in_hours
train_df$upgrades_phour <- prepared_df$Submit_Upgrade/prepared_df$duration_in_hours
train_df$downgrades_phour <- prepared_df$Submit_Downgrade/prepared_df$duration_in_hours
train_df$setting_phour <- prepared_df$Settings/prepared_df$duration_in_hours
train_df$save_setting_phour <- prepared_df$Save_Settings/prepared_df$duration_in_hours
train_df$song_ratio <- prepared_df$NextSong / prepared_df$Total_Activities
train_df$new_songs_ratio <- prepared_df$new_songs_listened / prepared_df$NextSong
train_df$pos_negative_ratio <- (prepared_df$Thumbs_Up+1)/(prepared_df$Thumbs_Down+1)
train_df$ad_per_song <- prepared_df$Roll_Advert / prepared_df$NextSong
train_df$paid_ad_ph <- prepared_df$paid/prepared_df$duration_in_hours
train_df$free_ad_ph <- prepared_df$free/prepared_df$duration_in_hours
train_df$percent_paid <- prepared_df$percent_paid
# Calculation of user's average number of events per session
session_avg <- df %>%
group_by(userId, sessionId) %>%
summarise(events = n(), .groups = 'drop') %>%
group_by(userId) %>%
summarise(avg_events_per_session = mean(events))
# Calculation of user's average session duration
session_avg_length <- df %>%
  group_by(userId, sessionId) %>%
  arrange(ts, .by_group = TRUE) %>%
  summarise(session_begin_ts = min(ts),
            session_end_ts = max(ts),
            .groups = 'drop') %>%
  group_by(userId) %>%
  summarise(avg_session_duration = mean(session_end_ts - session_begin_ts))
#Convert from milliseconds to hours
session_avg_length$avg_session_duration <- session_avg_length$avg_session_duration / 3600000
Incorporate all the newly defined business metrics into the main data frame (prepared_df); a sketch of this merge is shown below.
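A minimal sketch of this step, assuming prepared_df and train_df stay sorted and row-aligned by userId as in the earlier merges:
prepared_df <- merge(prepared_df, session_avg, by = "userId", all.x = TRUE)
prepared_df <- merge(prepared_df, session_avg_length, by = "userId", all.x = TRUE)
# Carry the two session metrics over to the modeling frame (rows assumed aligned by userId order)
train_df$avg_events_per_session <- prepared_df$avg_events_per_session
train_df$avg_session_duration <- prepared_df$avg_session_duration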
Since the lengths of the observation periods differ across users (new registrations and terminated users), we scale the activity counts by the total number of hours in each user's observation period.
The following are the features created for our analysis:
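A quick way to list the engineered columns (a minimal check):
str(train_df)   # one row per user: churn plus the per-hour rates and ratios defined above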
Collinearity Check for Numeric Variables
We plotted all the predictors (independent variables) in a correlation matrix to visualize whether any of them exhibit multicollinearity.
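A minimal sketch of such a check, assuming the corrplot package is available (the actual plotting code used for the report may differ):
library(corrplot)
num_vars <- train_df[sapply(train_df, is.numeric)]          # numeric columns only
num_vars$userId <- NULL                                     # drop the identifier; it is not a predictor
corr_mat <- cor(num_vars, use = "pairwise.complete.obs")    # pairwise correlation matrix
corrplot(corr_mat, method = "color", type = "upper", tl.cex = 0.7)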
To reduce multicollinearity in our data and reduce the complexity of our model, variables with a correlation above 0.8 are considered highly correlated, and some variables are excluded as described below:
tot_act_phour is highly correlated with songs_phour, tot_tu_phour, tot_td_phour, frds_added_phour, tot_add2PL_phour, HP_visits_phour, and setting_phour. As song listening is the main service here and we would like to understand the listening experience, we keep songs_phour and drop the other variables.
HP_visits_phour is highly correlated with free_ad_ph. Likewise, setting_phour is highly positively correlated with HP_visits_phour; setting_phour was already dropped in the previous step. As we would like to know the users' opinion about the ads, we keep free_ad_ph and drop HP_visits_phour.
As expected, avg_events_per_session and avg_session_duration are highly correlated: longer sessions have more events. We keep avg_session_duration and drop avg_events_per_session.
To summarize, here is the list of variables we wanted to remove:
train_df$tot_act_phour <- NULL
train_df$tot_tu_phour <- NULL
train_df$tot_td_phour <- NULL
train_df$frds_added_phour <- NULL
train_df$tot_add2PL_phour <- NULL
train_df$HP_visits_phour <- NULL
train_df$setting_phour <- NULL
train_df$diff_act_phour <- NULL
train_df$avg_events_per_session <- NULL
We have the following correlations after the highly correlated ones are removed.
Distribution of Categorical Variables
From the plots, we find:
Users with account level = paid seem to have a higher churn rate. The user's gender seems to have no effect on the likelihood of churning. iPhone users seem more likely to churn than users of other devices; perhaps the app does not run well on the phone.
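A minimal sketch of one such proportion plot, assuming ggplot2 is loaded (the report's actual figures may be formatted differently):
library(ggplot2)
# Share of churned vs. retained users within each account level
ggplot(train_df, aes(x = end_level, fill = churn)) +
  geom_bar(position = "fill") +
  labs(x = "account level at last activity", y = "proportion of users", fill = "churn")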
Distribution of Numeric Variables
In the following plots, some variables are log-transformed for better visualization; they are not transformed in the data used for modeling.
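For example, a density plot of songs_phour on a log scale, assuming ggplot2 is loaded (zero values are dropped by the log axis):
library(ggplot2)
ggplot(train_df, aes(x = songs_phour, fill = churn)) +
  geom_density(alpha = 0.5) +
  scale_x_log10() +
  labs(x = "songs per hour (log scale)", y = "density", fill = "churn")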
For the user activities songs_phour, tot_errs_phour, upgrades_phour, downgrades_phour, and save_setting_phour, the higher the number of activities, the more likely the user is to churn. This may imply that the more a user uses the service, the more dissatisfied the user becomes.
pos_negative_ratio seems to have little effect on the churn rate. This may imply that users' dissatisfaction is largely unrelated to the quality of the songs.
The effect of advertisement listening is not obvious in the ad_per_song plot. However, free_ad_ph and paid_ad_ph do show that a higher number of ads listened to is associated with a higher churn rate. The users' dissatisfaction may come from the advertisements.
For percent_paid, users who do not change their account level (those staying 100% of the time free or 100% of the time paid) seem to have a lower churn rate; they are content with the service they are using. Those who switched their account level may still be unsatisfied with the service.
song_ratio, new_songs_ratio, and avg_session_duration do not show any obvious patterns related to churning.
The above findings are just impressions from the plots; the actual effects need to be confirmed by our model analysis.
Lastly, the target variable in the training dataset is observed to be imbalanced.
summary(train_df$churn)
## 0 1
## 349 99
In order to build a less biased model, we will use upsampling. This method adds duplicate records of the minority class to the sample so that the size of the minority class is adjusted to be close to the size of the majority class.
# Duplicate each churn = 1 record (round(n_majority / n_minority) - 1) more times and append,
# so that the minority class roughly matches the majority class in size
temp <- train_df %>% filter(churn == 1) %>%
  slice(rep(1:n(),
            round(nrow(filter(train_df, churn == 0)) /
                  nrow(filter(train_df, churn == 1)), 0) - 1))
train_df2 <- bind_rows(train_df, temp)
Finally, we can start building our model.
First, let’s build our preliminary model with all the defined features.
model_logi <- glm(churn~.,family = binomial, train_df2)
Checking the marginal model plots, we find that the variable new_songs_ratio does not fit our model well.
marginalModelPlots(model_logi,~new_songs_ratio)
In order to fit the curvature, we add a squared term of new_songs_ratio to our model.
model_logi <- glm(churn~.+I(new_songs_ratio^2),family = binomial, train_df2)
summary(model_logi)
##
## Call:
## glm(formula = churn ~ . + I(new_songs_ratio^2), family = binomial,
## data = train_df2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0027 -0.4708 0.0000 0.2581 2.4211
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.290e+02 7.097e+02 -0.886 0.37541
## end_levelpaid 9.017e-01 4.897e-01 1.841 0.06556 .
## genderM 8.163e-02 2.642e-01 0.309 0.75736
## userAgentiPhone 1.829e+01 7.050e+02 0.026 0.97931
## userAgentLinux 1.609e+01 7.050e+02 0.023 0.98179
## userAgentMacintosh 1.601e+01 7.050e+02 0.023 0.98188
## userAgentWindows 1.597e+01 7.050e+02 0.023 0.98193
## songs_phour 8.935e+00 1.054e+00 8.480 < 2e-16 ***
## tot_errs_phour -1.990e+01 1.373e+02 -0.145 0.88472
## upgrades_phour 1.094e+02 2.475e+02 0.442 0.65843
## downgrades_phour -2.055e+02 3.942e+02 -0.521 0.60220
## save_setting_phour 3.036e+01 1.070e+02 0.284 0.77658
## song_ratio -4.434e+00 3.872e+00 -1.145 0.25216
## new_songs_ratio 1.163e+03 1.640e+02 7.091 1.34e-12 ***
## pos_negative_ratio -1.506e-01 5.470e-02 -2.753 0.00591 **
## ad_per_song 9.938e+00 5.395e+00 1.842 0.06544 .
## paid_ad_ph 2.782e+02 9.471e+01 2.938 0.00331 **
## free_ad_ph 4.491e+01 1.376e+01 3.263 0.00110 **
## percent_paid 2.241e+00 8.144e-01 2.751 0.00594 **
## avg_session_duration 1.811e-03 7.150e-02 0.025 0.97979
## I(new_songs_ratio^2) -5.495e+02 8.246e+01 -6.664 2.66e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1029.82 on 744 degrees of freedom
## Residual deviance: 428.78 on 724 degrees of freedom
## AIC: 470.78
##
## Number of Fisher Scoring iterations: 15
In our full model, some features are insignificant. We use backward elimination based on the AIC score.
model_logi <- step(model_logi, trace=0)
The following is the result of our final model. Most of the coefficients are statistically significant.
summary(model_logi)
##
## Call:
## glm(formula = churn ~ end_level + userAgent + songs_phour + new_songs_ratio +
## pos_negative_ratio + ad_per_song + paid_ad_ph + free_ad_ph +
## percent_paid + I(new_songs_ratio^2), family = binomial, data = train_df2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8177 -0.4831 0.0000 0.2596 2.4452
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -628.87451 692.48295 -0.908 0.363802
## end_levelpaid 1.10537 0.38790 2.850 0.004377 **
## userAgentiPhone 18.36385 688.03763 0.027 0.978707
## userAgentLinux 16.16706 688.03760 0.023 0.981254
## userAgentMacintosh 16.07003 688.03745 0.023 0.981366
## userAgentWindows 16.02738 688.03744 0.023 0.981415
## songs_phour 8.93534 0.97286 9.185 < 2e-16 ***
## new_songs_ratio 1153.47860 156.71905 7.360 1.84e-13 ***
## pos_negative_ratio -0.14576 0.05143 -2.834 0.004593 **
## ad_per_song 11.74476 5.15449 2.279 0.022694 *
## paid_ad_ph 277.22956 90.61246 3.060 0.002217 **
## free_ad_ph 45.63953 12.18794 3.745 0.000181 ***
## percent_paid 1.96790 0.74120 2.655 0.007930 **
## I(new_songs_ratio^2) -543.96482 78.58793 -6.922 4.46e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1029.8 on 744 degrees of freedom
## Residual deviance: 430.5 on 731 degrees of freedom
## AIC: 458.5
##
## Number of Fisher Scoring iterations: 15
The marginal model plots show no lack of fit for the model.
marginalModelPlots(model_logi,~songs_phour+new_songs_ratio+pos_negative_ratio+ad_per_song+
paid_ad_ph+free_ad_ph+percent_paid, layout =c(3,3))
The following deviance residual vs. linear predictor plot also supports the validity of our model: the model produces accurate predictions at the two ends, and the binned residuals around 0 appear independent and random.
# Bin observations by quantiles of the linear predictor and plot each bin's mean
# deviance residual against its mean linear predictor
residual_df <- mutate(train_df2, residuals = residuals(model_logi, type = "deviance"),
                      linpred = predict(model_logi, type = "link"))
gdf <- group_by(residual_df, cut(linpred, breaks = unique(quantile(linpred, (1:100)/101))))
diagdf <- summarise(gdf, residuals = mean(residuals), linpred = mean(linpred))
plot(residuals ~ linpred, diagdf, xlab = "linear predictor", xlim = c(-20, 20))
Now let's check the performance of our model.
rocCurve <- roc(train_df2$churn, model_logi$fitted.values)
plot(rocCurve)
rocCurve$auc
## Area under the curve: 0.9445
We have an AUC of 0.9445, which is a good result. The optimal threshold for classifying the target variable churn is:
optimal_threshold <- coords(rocCurve, "best", ret = "threshold")
optimal_threshold
Performance evaluation using the up-sampled data
predicted_class <- ifelse(model_logi$fitted.values>optimal_threshold[1,1],1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
train_df2$churn,
mode = "everything",positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 331 44
## 1 18 352
##
## Accuracy : 0.9168
## 95% CI : (0.8946, 0.9356)
## No Information Rate : 0.5315
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8336
##
## Mcnemar's Test P-Value : 0.001498
##
## Sensitivity : 0.8889
## Specificity : 0.9484
## Pos Pred Value : 0.9514
## Neg Pred Value : 0.8827
## Precision : 0.9514
## Recall : 0.8889
## F1 : 0.9191
## Prevalence : 0.5315
## Detection Rate : 0.4725
## Detection Prevalence : 0.4966
## Balanced Accuracy : 0.9187
##
## 'Positive' Class : 1
##
Performance evaluation using the original (pre-upsampling) data
predicted_class <- ifelse(predict(model_logi,train_df,type="response")>optimal_threshold[1,1],1,0)
confusion_matrix <- confusionMatrix(as.factor(predicted_class),
train_df$churn,
mode = "everything",positive = "1")
confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 331 11
## 1 18 88
##
## Accuracy : 0.9353
## 95% CI : (0.9084, 0.9562)
## No Information Rate : 0.779
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8166
##
## Mcnemar's Test P-Value : 0.2652
##
## Sensitivity : 0.8889
## Specificity : 0.9484
## Pos Pred Value : 0.8302
## Neg Pred Value : 0.9678
## Precision : 0.8302
## Recall : 0.8889
## F1 : 0.8585
## Prevalence : 0.2210
## Detection Rate : 0.1964
## Detection Prevalence : 0.2366
## Balanced Accuracy : 0.9187
##
## 'Positive' Class : 1
##
The evaluations using both the upsampled data and the original (pre-upsampling) data give a Balanced Accuracy of over 91%.
Now let's confirm the earlier findings using our model output.
summary(model_logi)
##
## Call:
## glm(formula = churn ~ end_level + userAgent + songs_phour + new_songs_ratio +
## pos_negative_ratio + ad_per_song + paid_ad_ph + free_ad_ph +
## percent_paid + I(new_songs_ratio^2), family = binomial, data = train_df2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8177 -0.4831 0.0000 0.2596 2.4452
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -628.87451 692.48295 -0.908 0.363802
## end_levelpaid 1.10537 0.38790 2.850 0.004377 **
## userAgentiPhone 18.36385 688.03763 0.027 0.978707
## userAgentLinux 16.16706 688.03760 0.023 0.981254
## userAgentMacintosh 16.07003 688.03745 0.023 0.981366
## userAgentWindows 16.02738 688.03744 0.023 0.981415
## songs_phour 8.93534 0.97286 9.185 < 2e-16 ***
## new_songs_ratio 1153.47860 156.71905 7.360 1.84e-13 ***
## pos_negative_ratio -0.14576 0.05143 -2.834 0.004593 **
## ad_per_song 11.74476 5.15449 2.279 0.022694 *
## paid_ad_ph 277.22956 90.61246 3.060 0.002217 **
## free_ad_ph 45.63953 12.18794 3.745 0.000181 ***
## percent_paid 1.96790 0.74120 2.655 0.007930 **
## I(new_songs_ratio^2) -543.96482 78.58793 -6.922 4.46e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1029.8 on 744 degrees of freedom
## Residual deviance: 430.5 on 731 degrees of freedom
## AIC: 458.5
##
## Number of Fisher Scoring iterations: 15
Though our model may help predict which users are likely to churn, so that the company can take actions such as offering discounts to persuade them to stay, it is better to improve the service itself to lower the chance of churn.
Based on our findings, we have the following recommendations: