# 1. Load Data
# Read the two raw CSV inputs: the title metadata and the user activity
# records. Both paths are absolute and specific to the author's machine.
netflix_titles <- read.csv(
  file = "C:/Users/anmol/OneDrive/Desktop/New folder (2)/netflix_titles (1).csv"
)
user_activity <- read.csv(
  file = "C:/Users/anmol/OneDrive/Desktop/New folder (2)/user_activity_dataset.csv"
)
# 2. Clean Titles Dataset
# Reduce the titles table to the identifier, the title type and the
# comma-separated `listed_in` field (renamed to `genre`), then split that
# field so each listed genre of a title gets its own row.
titles_clean <- netflix_titles %>%
  select(show_id, type, genre = listed_in) %>%
  separate_rows(genre, sep = ", ")
# 3. Merge Datasets
# Standardise the activity column names, parse the watch date, and attach the
# title type/genre by show_id. Activity rows whose show_id has several rows in
# titles_clean are repeated once per matching row by the join.
merged_data <- user_activity %>%
  rename(
    churn_status = subscription_status,
    device = device_type,
    watch_time = watch_time_minutes
  ) %>%
  mutate(watch_date = as.Date(watch_date)) %>%
  left_join(titles_clean, by = "show_id")
# 4. Feature Engineering
# Build the per-user churn label (1 = "Churned", 0 = anything else) from the
# churn_status recorded on each user's most recent activity row.
churn_target <- merged_data %>%
  group_by(user_id) %>%
  # arrange() always places NA last, so sorting on watch_date alone would let
  # slice_tail() pick a row with a missing date as the "latest" one. Sorting
  # undated rows first keeps them only as a fallback for users that have no
  # dated rows at all.
  arrange(!is.na(watch_date), watch_date) %>%
  slice_tail(n = 1) %>%
  # Drop the grouping so the result does not carry it into later steps.
  ungroup() %>%
  mutate(churn = ifelse(churn_status == "Churned", 1, 0)) %>%
  select(user_id, churn)
# Return the most common non-missing value of `x` as a string (first one wins
# on ties), or NA when `x` has no non-missing values. Without the guard,
# names(which.max(table(x))) is zero-length for an all-NA group, which is not
# a valid single summary value inside summarise().
most_frequent <- function(x) {
  if (all(is.na(x))) {
    return(NA_character_)
  }
  counts <- table(x)
  names(counts)[which.max(counts)]
}

# merged_data repeats an activity row once per genre of the watched title, so
# summing or counting its rows directly over-weights multi-genre titles.
# Dropping `genre` and de-duplicating restores one row per activity.
# NOTE(review): this also collapses activity records that are exact
# duplicates in every remaining column - confirm that is acceptable.
activity_level <- merged_data %>%
  select(-genre) %>%
  distinct()

# Genre breadth is the only feature that needs the per-genre rows. Missing
# genres (titles with no match in the titles table) are not counted.
genre_counts <- merged_data %>%
  group_by(user_id) %>%
  summarise(
    unique_genres = n_distinct(genre, na.rm = TRUE),
    .groups = "drop"
  )

# One row of engagement features per user.
user_features <- activity_level %>%
  group_by(user_id) %>%
  summarise(
    total_watch_time = sum(watch_time, na.rm = TRUE),
    total_activities = n(),
    avg_watch_time = mean(watch_time, na.rm = TRUE),
    unique_shows_watched = n_distinct(show_id),
    most_freq_device = most_frequent(device),
    most_freq_country = most_frequent(country),
    most_freq_type = most_frequent(type),
    .groups = "drop"
  ) %>%
  left_join(genre_counts, by = "user_id") %>%
  # Keep the original column order.
  relocate(unique_genres, .after = unique_shows_watched)
# Modelling table: per-user features joined to the churn label, with the
# categorical columns and the label stored as factors. Users without a churn
# label are removed after the conversion.
model_df <- user_features %>%
  left_join(churn_target, by = "user_id") %>%
  mutate(
    across(
      c(most_freq_device, most_freq_country, most_freq_type, churn),
      as.factor
    )
  ) %>%
  drop_na(churn)
# 5. Combined Boxplot Analysis
# Build one boxplot of model_df for the given aesthetic mapping and title.
# All four panels share the same geom and theme.
churn_boxplot <- function(mapping, title) {
  ggplot(data = model_df, mapping = mapping) +
    geom_boxplot() +
    labs(title = title) +
    theme_minimal()
}

p1 <- churn_boxplot(
  aes(x = churn, y = total_watch_time, fill = churn),
  "Total Watch Time"
)
p2 <- churn_boxplot(
  aes(x = churn, y = avg_watch_time, fill = churn),
  "Average Watch Time"
)
p3 <- churn_boxplot(
  aes(x = churn, y = unique_shows_watched, fill = churn),
  "Unique Shows Watched"
)
p4 <- churn_boxplot(
  aes(x = churn, y = unique_genres, fill = churn),
  "Unique Genres Watched"
)

# Lay the four panels out on a 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

# 6. Optional Logistic Regression
# Logistic regression of the churn factor on the four numeric engagement
# features, followed by the coefficient table.
churn_model <- glm(
  formula = churn ~ total_watch_time + avg_watch_time +
    unique_shows_watched + unique_genres,
  family = binomial,
  data = model_df
)
summary(churn_model)
##
## Call:
## glm(formula = churn ~ total_watch_time + avg_watch_time + unique_shows_watched +
## unique_genres, family = binomial, data = model_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.427e+00 4.080e-01 -3.497 0.000471 ***
## total_watch_time -2.713e-05 6.970e-05 -0.389 0.697106
## avg_watch_time 2.910e-03 2.871e-03 1.014 0.310657
## unique_shows_watched 1.938e-03 1.947e-02 0.100 0.920711
## unique_genres 1.160e-02 1.279e-02 0.907 0.364348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7092.7 on 5999 degrees of freedom
## Residual deviance: 7090.2 on 5995 degrees of freedom
## AIC: 7100.2
##
## Number of Fisher Scoring iterations: 4