library(tidyverse)
library(scales)
library(corrplot)

# ---- Load data ----
df <- read_csv("Churn_Train.csv")

# churn order per the project notes: yes first, no second
df <- df %>%
  mutate(
    churn    = factor(churn, levels = c("yes", "no")),
    churn_bin = if_else(churn == "yes", 1, 0)   # numeric helper for churn-rate tables/plots
  )

# ---- Structure and shape ----
glimpse(df)                 
## Rows: 3,333
## Columns: 21
## $ state                         <chr> "NV", "HI", "DC", "HI", "OH", "MO", "NC"…
## $ account_length                <dbl> 125, 108, 82, NA, 83, 89, 135, 28, 86, 6…
## $ area_code                     <chr> "area_code_510", "area_code_415", "area_…
## $ international_plan            <chr> "no", "no", "no", "no", "no", "no", "no"…
## $ voice_mail_plan               <chr> "no", "no", "no", "yes", "no", "no", "no…
## $ number_vmail_messages         <dbl> 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, NA, 32…
## $ total_day_minutes             <dbl> 2013.4, 291.6, 300.3, 110.3, 337.4, 178.…
## $ total_day_calls               <dbl> 99, 99, 109, 71, 120, 81, 81, 87, 115, 1…
## $ total_day_charge              <dbl> 28.66, 49.57, 51.05, 18.75, 57.36, 30.38…
## $ total_eve_minutes             <dbl> 1107.6, 221.1, 181.0, 182.4, 227.4, NA, …
## $ total_eve_calls               <dbl> 107, 93, 100, 108, 116, 74, 114, 92, 112…
## $ total_eve_charge              <dbl> 14.93, 18.79, 15.39, 15.50, 19.33, 19.86…
## $ total_night_minutes           <dbl> 243.3, 229.2, 270.1, 183.8, 153.9, 131.9…
## $ total_night_calls             <dbl> 92, 110, 73, 88, 114, 120, 82, 112, 95, …
## $ total_night_charge            <dbl> 10.95, 10.31, 12.15, 8.27, 6.93, 5.94, 9…
## $ total_intl_minutes            <dbl> 10.9, 14.0, 11.7, 11.0, 15.8, 9.1, 10.3,…
## $ total_intl_calls              <dbl> 7, 9, 4, 8, 7, 4, 6, 3, 7, 6, 7, NA, 4, …
## $ total_intl_charge             <dbl> 2.94, 3.78, 3.16, 2.97, 4.27, 2.46, 2.78…
## $ number_customer_service_calls <dbl> 0, 2, 0, 2, 0, 1, 1, 3, 2, 4, 1, NA, 3, …
## $ churn                         <fct> no, yes, yes, no, yes, no, no, no, no, y…
## $ churn_bin                     <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0…
dim(df)                     
## [1] 3333   21
# ---- Class balance of the target ----
df %>%
  count(churn) %>%
  mutate(pct = n / sum(n))
# churn is imbalanced, roughly 14.5% yes vs 85.5% no any model needs to be judged on more than accuracy, since predicting all "no" already lands near 85%

ggplot(df, aes(churn, fill = churn)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = comma(after_stat(count))), vjust = -0.4) +
  scale_fill_manual(values = c(yes = "firebrick", no = "steelblue")) +
  labs(title = "Class Balance of Churn", x = NULL, y = "Count") +
  theme_minimal() +
  theme(legend.position = "none")

# ---- Missing values per column ----
# This data is dirty: most columns carry NAs
missing_tbl <- df %>%
  select(-churn_bin) %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  arrange(desc(n_missing))

missing_tbl
# rows with at least one NA
sum(!complete.cases(df))        # 703 of 3333 rows are incomplete
## [1] 703
missing_tbl %>%
  filter(n_missing > 0) %>%
  mutate(variable = fct_reorder(variable, n_missing)) %>%
  ggplot(aes(n_missing, variable)) +
  geom_col(fill = "steelblue") +
  labs(title = "Missing Values by Column", x = "Count of NA", y = NULL) +
  theme_minimal()

# ---- Impossible / out-of-range values ----
# a few columns contain values that cannot occur in reality and point to corrupted data
tibble(
  check = c("account_length < 0",
            "number_vmail_messages < 0",
            "total_day_minutes > 600",
            "total_eve_minutes > 600"),
  count = c(sum(df$account_length < 0,        na.rm = TRUE),
            sum(df$number_vmail_messages < 0, na.rm = TRUE),
            sum(df$total_day_minutes > 600,   na.rm = TRUE),
            sum(df$total_eve_minutes > 600,   na.rm = TRUE))
)
# account_length and number_vmail_messages have negative entries total_day_minutes and total_eve_minutes carry physically impossible values (a day only holds 1440 minutes, yet some rows exceed 2000)
# ---- Are minutes and charge consistent? ----
# In a clean telecom file, charge is just a flat rate times minutes, so the two should be almost perfectly correlated. 
df %>%
  summarise(
    day   = cor(total_day_minutes,   total_day_charge,   use = "complete.obs"),
    eve   = cor(total_eve_minutes,   total_eve_charge,   use = "complete.obs"),
    night = cor(total_night_minutes, total_night_charge, use = "complete.obs"),
    intl  = cor(total_intl_minutes,  total_intl_charge,  use = "complete.obs")
  ) %>%
  pivot_longer(everything(), names_to = "daypart", values_to = "cor_min_charge")
# night and intl come back at ~1.00 (clean), but day and eve collapse to ~0.09 and ~0.16 the corruption sits in the day/eve MINUTES columns, not the charges the charge columns stay trustworthy, so they are the better signal to model on

# the busted relationship, drawn out for the day period
# ---- Day minutes vs charge, corrupted points flagged ----
df %>%
  mutate(corrupt = if_else(total_day_minutes > 600, "corrupted", "clean")) %>%
  ggplot(aes(total_day_minutes, total_day_charge, color = corrupt)) +
  geom_point(alpha = 0.3) +
  scale_color_manual(values = c(clean = "steelblue", corrupted = "firebrick")) +
  labs(title = "Day Minutes vs Day Charge (corrupted points flagged)",
       x = "Total day minutes", y = "Total day charge", color = NULL) +
  theme_minimal()
## Warning: Removed 200 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ---- Distributions of a few clean numeric drivers ----
df %>%
  select(account_length, total_day_charge, total_eve_charge,
         total_night_charge, total_intl_charge, number_customer_service_calls) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  filter(!is.na(value)) %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Distributions of Selected Numeric Variables", x = NULL, y = "Count") +
  theme_minimal()

# the charge columns are roughly normal, customer service calls is right-skewed and discrete
# ---- Churn rate by the two plan flags ----
plan_rates <- bind_rows(
  df %>% group_by(plan = international_plan) %>%
    summarise(churn_rate = mean(churn_bin), n = n()) %>%
    mutate(variable = "international_plan"),
  df %>% group_by(plan = voice_mail_plan) %>%
    summarise(churn_rate = mean(churn_bin), n = n()) %>%
    mutate(variable = "voice_mail_plan")
)

plan_rates
plan_rates %>%
  ggplot(aes(plan, churn_rate, fill = plan)) +
  geom_col() +
  geom_text(aes(label = percent(churn_rate, accuracy = 0.1)), vjust = -0.4) +
  facet_wrap(~ variable) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = c(no = "steelblue", yes = "firebrick")) +
  labs(title = "Churn Rate by Plan Type", x = NULL, y = "Churn rate") +
  theme_minimal() +
  theme(legend.position = "none")

# international_plan = yes churns at ~42% vs ~12% without it, the single strongest flag
# voice_mail_plan = yes churns LESS (~9% vs ~17%), so it looks protective
# ---- Churn rate by number of customer service calls ----
df %>%
  filter(!is.na(number_customer_service_calls)) %>%
  group_by(number_customer_service_calls) %>%
  summarise(churn_rate = mean(churn_bin), n = n()) %>%
  ggplot(aes(factor(number_customer_service_calls), churn_rate)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = percent(churn_rate, accuracy = 1)), vjust = -0.4, size = 3) +
  scale_y_continuous(labels = percent) +
  labs(title = "Churn Rate by Number of Customer Service Calls",
       x = "Customer service calls", y = "Churn rate") +
  theme_minimal()

# clear threshold effect: churn sits near 10-13% through 3 calls, then jumps past 45% at 4 calls and keeps climbing. a flag for "4 or more calls" should be useful
# ---- Do churn customers spend more on day usage? ----
df %>%
  select(churn, total_day_charge, total_eve_charge,
         total_night_charge, total_intl_charge) %>%
  pivot_longer(-churn, names_to = "variable", values_to = "charge") %>%
  filter(!is.na(charge)) %>%
  ggplot(aes(churn, charge, fill = churn)) +
  geom_boxplot(alpha = 0.8) +
  facet_wrap(~ variable, scales = "free_y") +
  scale_fill_manual(values = c(yes = "firebrick", no = "steelblue")) +
  labs(title = "Charge Columns by Churn Status", x = NULL, y = "Charge") +
  theme_minimal() +
  theme(legend.position = "none")

# day charge separates the groups the most, churners run noticeably higher day charges eve/night/intl charges barely move between the groups
# ---- Correlation among numeric variables (incl. churn as 0/1) ----
# pairwise complete obs so the NAs don't drop the whole matrix
num <- df %>%
  select(where(is.numeric), -churn_bin) %>%
  mutate(churn = df$churn_bin)

cmat <- cor(num, use = "pairwise.complete.obs")

corrplot(cmat, method = "color", type = "upper",
         tl.col = "black",
         tl.cex = 0.7,
         tl.srt = 45,
         number.cex = 0.5,
         title = "Correlation Matrix of Numeric Variables",
         mar = c(0, 0, 2, 0))

# each charge pairs with its own minutes column (perfectly for night/intl, weakly for day/eve because of the corruption). against churn the strongest linear singals are total_day_charge and number_customer_service_calls

# ---- Linear correlation with churn, ranked ----
as.data.frame(cmat) %>%
  rownames_to_column("variable") %>%
  select(variable, churn) %>%
  filter(variable != "churn") %>%
  arrange(desc(abs(churn)))