1. Load Data

# Read the two raw CSV inputs. Both paths are absolute and specific to one
# machine, so they must be adjusted before running this anywhere else.
netflix_titles <- read.csv(
  file = "C:/Users/anmol/OneDrive/Desktop/New folder (2)/netflix_titles (1).csv"
)
user_activity <- read.csv(
  file = "C:/Users/anmol/OneDrive/Desktop/New folder (2)/user_activity_dataset.csv"
)

2. Clean Titles Dataset

# Reduce the title catalogue to one row per (show_id, genre) pair.
# `listed_in` holds a ", "-separated genre list; it is renamed to `genre` and
# split so every genre gets its own row. Only show_id, type and genre are kept,
# which also discards any "Unnamed..." columns.
titles_clean <- netflix_titles %>%
  select(show_id, type, genre = listed_in) %>%
  separate_rows(genre, sep = ", ")

3. Merge Datasets

# Attach title metadata to the activity log.
# titles_clean has one row per (show_id, genre), so this join repeats each
# activity row once for every genre of the watched show; unmatched show_ids
# keep a single row with NA type/genre.
merged_data <- left_join(
  user_activity %>%
    rename(
      watch_time   = watch_time_minutes,
      device       = device_type,
      churn_status = subscription_status
    ) %>%
    mutate(watch_date = as.Date(watch_date)),
  titles_clean,
  by = "show_id"
)

4. Feature Engineering

# Build the binary churn label per user from the user's most recent activity.
# Result: one row per user_id with `churn` = 1 when the latest churn_status is
# "Churned", 0 otherwise (NA when that status is missing).
churn_target <- merged_data %>%
  group_by(user_id) %>%
  # arrange() places NA last, so sort rows with a missing watch_date to the
  # front explicitly; otherwise slice_tail() would treat an undated row as the
  # "latest" activity.
  arrange(!is.na(watch_date), watch_date) %>%
  slice_tail(n = 1) %>%
  # Drop the grouping so downstream code receives a plain, ungrouped table.
  ungroup() %>%
  transmute(
    user_id,
    churn = ifelse(churn_status == "Churned", 1, 0)
  )

# Most common non-missing value of `x`, returned as a string.
# table() drops NA, so for an all-NA input which.max() yields a zero-length
# result that would abort summarise(); return NA in that case instead.
# Ties resolve to the first level in table() order, exactly like
# names(which.max(table(x))).
most_frequent_value <- function(x) {
  counts <- table(x)
  if (length(counts) == 0L) {
    return(NA_character_)
  }
  names(counts)[which.max(counts)]
}

# Per-user behavioural features.
#
# merged_data contains one row per (activity, genre) because titles_clean is
# genre-exploded. Aggregating watch time, activity counts or modes on it would
# weight every activity by the number of genres of its show. Activity-level
# features are therefore computed from the un-exploded activity log (joined to
# one `type` per show), and only the genre count uses merged_data.
user_features <- user_activity %>%
  left_join(distinct(titles_clean, show_id, type), by = "show_id") %>%
  group_by(user_id) %>%
  summarise(
    total_watch_time     = sum(watch_time_minutes, na.rm = TRUE),
    total_activities     = n(),
    avg_watch_time       = mean(watch_time_minutes, na.rm = TRUE),
    unique_shows_watched = n_distinct(show_id),
    most_freq_device     = most_frequent_value(device_type),
    most_freq_country    = most_frequent_value(country),
    most_freq_type       = most_frequent_value(type),
    .groups = "drop"
  ) %>%
  left_join(
    merged_data %>%
      group_by(user_id) %>%
      summarise(unique_genres = n_distinct(genre), .groups = "drop"),
    by = "user_id"
  ) %>%
  # Keep the original column order expected by downstream code.
  relocate(unique_genres, .after = unique_shows_watched)

# Modelling table: per-user features plus the churn label.
# The three "most frequent" columns and the label are converted to factors,
# and users without a churn label are removed.
model_df <- user_features %>%
  left_join(churn_target, by = "user_id") %>%
  mutate(
    across(
      c(most_freq_device, most_freq_country, most_freq_type, churn),
      as.factor
    )
  ) %>%
  filter(!is.na(churn))

5. Combined Boxplot Analysis

# Boxplot of one numeric feature of model_df split by churn class.
# `metric` is an unquoted column name; `title` is the plot title.
churn_boxplot <- function(metric, title) {
  ggplot(model_df, aes(x = churn, y = {{ metric }}, fill = churn)) +
    geom_boxplot() +
    labs(title = title) +
    theme_minimal()
}

p1 <- churn_boxplot(total_watch_time, "Total Watch Time")
p2 <- churn_boxplot(avg_watch_time, "Average Watch Time")
p3 <- churn_boxplot(unique_shows_watched, "Unique Shows Watched")
p4 <- churn_boxplot(unique_genres, "Unique Genres Watched")

# Show the four panels together in a 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

6. Optional Logistic Regression

# Logistic regression of churn on the four numeric engagement features.
# The formula is kept inline so the "Call:" shown by summary() is unchanged.
churn_model <- glm(
  formula = churn ~ total_watch_time + avg_watch_time +
    unique_shows_watched + unique_genres,
  family = binomial,
  data = model_df
)
summary(churn_model)
## 
## Call:
## glm(formula = churn ~ total_watch_time + avg_watch_time + unique_shows_watched + 
##     unique_genres, family = binomial, data = model_df)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.427e+00  4.080e-01  -3.497 0.000471 ***
## total_watch_time     -2.713e-05  6.970e-05  -0.389 0.697106    
## avg_watch_time        2.910e-03  2.871e-03   1.014 0.310657    
## unique_shows_watched  1.938e-03  1.947e-02   0.100 0.920711    
## unique_genres         1.160e-02  1.279e-02   0.907 0.364348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7092.7  on 5999  degrees of freedom
## Residual deviance: 7090.2  on 5995  degrees of freedom
## AIC: 7100.2
## 
## Number of Fisher Scoring iterations: 4