library(tidyverse)
library(scales)
library(corrplot)
# ---- Load data ----
df <- read_csv("Churn_Train.csv")
# churn order per the project notes: yes first, no second
df <- df %>%
mutate(
churn = factor(churn, levels = c("yes", "no")),
churn_bin = if_else(churn == "yes", 1, 0) # numeric helper for churn-rate tables/plots
)
# ---- Structure and shape ----
glimpse(df)
## Rows: 3,333
## Columns: 21
## $ state <chr> "NV", "HI", "DC", "HI", "OH", "MO", "NC"…
## $ account_length <dbl> 125, 108, 82, NA, 83, 89, 135, 28, 86, 6…
## $ area_code <chr> "area_code_510", "area_code_415", "area_…
## $ international_plan <chr> "no", "no", "no", "no", "no", "no", "no"…
## $ voice_mail_plan <chr> "no", "no", "no", "yes", "no", "no", "no…
## $ number_vmail_messages <dbl> 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, NA, 32…
## $ total_day_minutes <dbl> 2013.4, 291.6, 300.3, 110.3, 337.4, 178.…
## $ total_day_calls <dbl> 99, 99, 109, 71, 120, 81, 81, 87, 115, 1…
## $ total_day_charge <dbl> 28.66, 49.57, 51.05, 18.75, 57.36, 30.38…
## $ total_eve_minutes <dbl> 1107.6, 221.1, 181.0, 182.4, 227.4, NA, …
## $ total_eve_calls <dbl> 107, 93, 100, 108, 116, 74, 114, 92, 112…
## $ total_eve_charge <dbl> 14.93, 18.79, 15.39, 15.50, 19.33, 19.86…
## $ total_night_minutes <dbl> 243.3, 229.2, 270.1, 183.8, 153.9, 131.9…
## $ total_night_calls <dbl> 92, 110, 73, 88, 114, 120, 82, 112, 95, …
## $ total_night_charge <dbl> 10.95, 10.31, 12.15, 8.27, 6.93, 5.94, 9…
## $ total_intl_minutes <dbl> 10.9, 14.0, 11.7, 11.0, 15.8, 9.1, 10.3,…
## $ total_intl_calls <dbl> 7, 9, 4, 8, 7, 4, 6, 3, 7, 6, 7, NA, 4, …
## $ total_intl_charge <dbl> 2.94, 3.78, 3.16, 2.97, 4.27, 2.46, 2.78…
## $ number_customer_service_calls <dbl> 0, 2, 0, 2, 0, 1, 1, 3, 2, 4, 1, NA, 3, …
## $ churn <fct> no, yes, yes, no, yes, no, no, no, no, y…
## $ churn_bin <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0…
dim(df)
## [1] 3333 21
# ---- Class balance of the target ----
df %>%
count(churn) %>%
mutate(pct = n / sum(n))
# churn is imbalanced, roughly 14.5% yes vs 85.5% no any model needs to be judged on more than accuracy, since predicting all "no" already lands near 85%
ggplot(df, aes(churn, fill = churn)) +
geom_bar() +
geom_text(stat = "count", aes(label = comma(after_stat(count))), vjust = -0.4) +
scale_fill_manual(values = c(yes = "firebrick", no = "steelblue")) +
labs(title = "Class Balance of Churn", x = NULL, y = "Count") +
theme_minimal() +
theme(legend.position = "none")

# ---- Missing values per column ----
# This data is dirty: most columns carry NAs
missing_tbl <- df %>%
select(-churn_bin) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
arrange(desc(n_missing))
missing_tbl
# rows with at least one NA
sum(!complete.cases(df)) # 703 of 3333 rows are incomplete
## [1] 703
missing_tbl %>%
filter(n_missing > 0) %>%
mutate(variable = fct_reorder(variable, n_missing)) %>%
ggplot(aes(n_missing, variable)) +
geom_col(fill = "steelblue") +
labs(title = "Missing Values by Column", x = "Count of NA", y = NULL) +
theme_minimal()

# ---- Impossible / out-of-range values ----
# a few columns contain values that cannot occur in reality and point to corrupted data
tibble(
check = c("account_length < 0",
"number_vmail_messages < 0",
"total_day_minutes > 600",
"total_eve_minutes > 600"),
count = c(sum(df$account_length < 0, na.rm = TRUE),
sum(df$number_vmail_messages < 0, na.rm = TRUE),
sum(df$total_day_minutes > 600, na.rm = TRUE),
sum(df$total_eve_minutes > 600, na.rm = TRUE))
)
# account_length and number_vmail_messages have negative entries total_day_minutes and total_eve_minutes carry physically impossible values (a day only holds 1440 minutes, yet some rows exceed 2000)
# ---- Are minutes and charge consistent? ----
# In a clean telecom file, charge is just a flat rate times minutes, so the two should be almost perfectly correlated.
df %>%
summarise(
day = cor(total_day_minutes, total_day_charge, use = "complete.obs"),
eve = cor(total_eve_minutes, total_eve_charge, use = "complete.obs"),
night = cor(total_night_minutes, total_night_charge, use = "complete.obs"),
intl = cor(total_intl_minutes, total_intl_charge, use = "complete.obs")
) %>%
pivot_longer(everything(), names_to = "daypart", values_to = "cor_min_charge")
# night and intl come back at ~1.00 (clean), but day and eve collapse to ~0.09 and ~0.16 the corruption sits in the day/eve MINUTES columns, not the charges the charge columns stay trustworthy, so they are the better signal to model on
# the busted relationship, drawn out for the day period
# ---- Day minutes vs charge, corrupted points flagged ----
df %>%
mutate(corrupt = if_else(total_day_minutes > 600, "corrupted", "clean")) %>%
ggplot(aes(total_day_minutes, total_day_charge, color = corrupt)) +
geom_point(alpha = 0.3) +
scale_color_manual(values = c(clean = "steelblue", corrupted = "firebrick")) +
labs(title = "Day Minutes vs Day Charge (corrupted points flagged)",
x = "Total day minutes", y = "Total day charge", color = NULL) +
theme_minimal()
## Warning: Removed 200 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ---- Distributions of a few clean numeric drivers ----
df %>%
select(account_length, total_day_charge, total_eve_charge,
total_night_charge, total_intl_charge, number_customer_service_calls) %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
filter(!is.na(value)) %>%
ggplot(aes(value)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
facet_wrap(~ variable, scales = "free") +
labs(title = "Distributions of Selected Numeric Variables", x = NULL, y = "Count") +
theme_minimal()

# the charge columns are roughly normal, customer service calls is right-skewed and discrete
# ---- Churn rate by the two plan flags ----
plan_rates <- bind_rows(
df %>% group_by(plan = international_plan) %>%
summarise(churn_rate = mean(churn_bin), n = n()) %>%
mutate(variable = "international_plan"),
df %>% group_by(plan = voice_mail_plan) %>%
summarise(churn_rate = mean(churn_bin), n = n()) %>%
mutate(variable = "voice_mail_plan")
)
plan_rates
plan_rates %>%
ggplot(aes(plan, churn_rate, fill = plan)) +
geom_col() +
geom_text(aes(label = percent(churn_rate, accuracy = 0.1)), vjust = -0.4) +
facet_wrap(~ variable) +
scale_y_continuous(labels = percent) +
scale_fill_manual(values = c(no = "steelblue", yes = "firebrick")) +
labs(title = "Churn Rate by Plan Type", x = NULL, y = "Churn rate") +
theme_minimal() +
theme(legend.position = "none")

# international_plan = yes churns at ~42% vs ~12% without it, the single strongest flag
# voice_mail_plan = yes churns LESS (~9% vs ~17%), so it looks protective
# ---- Churn rate by number of customer service calls ----
df %>%
filter(!is.na(number_customer_service_calls)) %>%
group_by(number_customer_service_calls) %>%
summarise(churn_rate = mean(churn_bin), n = n()) %>%
ggplot(aes(factor(number_customer_service_calls), churn_rate)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = percent(churn_rate, accuracy = 1)), vjust = -0.4, size = 3) +
scale_y_continuous(labels = percent) +
labs(title = "Churn Rate by Number of Customer Service Calls",
x = "Customer service calls", y = "Churn rate") +
theme_minimal()

# clear threshold effect: churn sits near 10-13% through 3 calls, then jumps past 45% at 4 calls and keeps climbing. a flag for "4 or more calls" should be useful
# ---- Do churn customers spend more on day usage? ----
df %>%
select(churn, total_day_charge, total_eve_charge,
total_night_charge, total_intl_charge) %>%
pivot_longer(-churn, names_to = "variable", values_to = "charge") %>%
filter(!is.na(charge)) %>%
ggplot(aes(churn, charge, fill = churn)) +
geom_boxplot(alpha = 0.8) +
facet_wrap(~ variable, scales = "free_y") +
scale_fill_manual(values = c(yes = "firebrick", no = "steelblue")) +
labs(title = "Charge Columns by Churn Status", x = NULL, y = "Charge") +
theme_minimal() +
theme(legend.position = "none")

# day charge separates the groups the most, churners run noticeably higher day charges eve/night/intl charges barely move between the groups
# ---- Correlation among numeric variables (incl. churn as 0/1) ----
# pairwise complete obs so the NAs don't drop the whole matrix
num <- df %>%
select(where(is.numeric), -churn_bin) %>%
mutate(churn = df$churn_bin)
cmat <- cor(num, use = "pairwise.complete.obs")
corrplot(cmat, method = "color", type = "upper",
tl.col = "black",
tl.cex = 0.7,
tl.srt = 45,
number.cex = 0.5,
title = "Correlation Matrix of Numeric Variables",
mar = c(0, 0, 2, 0))

# each charge pairs with its own minutes column (perfectly for night/intl, weakly for day/eve because of the corruption). against churn the strongest linear singals are total_day_charge and number_customer_service_calls
# ---- Linear correlation with churn, ranked ----
as.data.frame(cmat) %>%
rownames_to_column("variable") %>%
select(variable, churn) %>%
filter(variable != "churn") %>%
arrange(desc(abs(churn)))