# Load required libraries
library(tidyverse) # for data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor) # for cleaning column names
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr) # for quick data summary
library(DataExplorer) # optional, great for quick EDA
# Load dataset
churn_data <- read_csv("~/Desktop/DA Portfolio/telco-customer-churn-prediction/data/raw/raw_Telco-Customer-Churn.csv")
## Rows: 7043 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): customerID, gender, Partner, Dependents, PhoneService, MultipleLin...
## dbl (4): SeniorCitizen, tenure, MonthlyCharges, TotalCharges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean column names
churn_data <- clean_names(churn_data)
# View first few rows
head(churn_data)
## # A tibble: 6 × 21
## customer_id gender senior_citizen partner dependents tenure phone_service
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 7590-VHVEG Female 0 Yes No 1 No
## 2 5575-GNVDE Male 0 No No 34 Yes
## 3 3668-QPYBK Male 0 No No 2 Yes
## 4 7795-CFOCW Male 0 No No 45 No
## 5 9237-HQITU Female 0 No No 2 Yes
## 6 9305-CDSKC Female 0 No No 8 Yes
## # ℹ 14 more variables: multiple_lines <chr>, internet_service <chr>,
## # online_security <chr>, online_backup <chr>, device_protection <chr>,
## # tech_support <chr>, streaming_tv <chr>, streaming_movies <chr>,
## # contract <chr>, paperless_billing <chr>, payment_method <chr>,
## # monthly_charges <dbl>, total_charges <dbl>, churn <chr>
# Check structure of data
str(churn_data)
## spc_tbl_ [7,043 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ customer_id : chr [1:7043] "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr [1:7043] "Female" "Male" "Male" "Male" ...
## $ senior_citizen : num [1:7043] 0 0 0 0 0 0 0 0 0 0 ...
## $ partner : chr [1:7043] "Yes" "No" "No" "No" ...
## $ dependents : chr [1:7043] "No" "No" "No" "No" ...
## $ tenure : num [1:7043] 1 34 2 45 2 8 22 10 28 62 ...
## $ phone_service : chr [1:7043] "No" "Yes" "Yes" "No" ...
## $ multiple_lines : chr [1:7043] "No phone service" "No" "No" "No phone service" ...
## $ internet_service : chr [1:7043] "DSL" "DSL" "DSL" "DSL" ...
## $ online_security : chr [1:7043] "No" "Yes" "Yes" "Yes" ...
## $ online_backup : chr [1:7043] "Yes" "No" "Yes" "No" ...
## $ device_protection: chr [1:7043] "No" "Yes" "No" "Yes" ...
## $ tech_support : chr [1:7043] "No" "No" "No" "Yes" ...
## $ streaming_tv : chr [1:7043] "No" "No" "No" "No" ...
## $ streaming_movies : chr [1:7043] "No" "No" "No" "No" ...
## $ contract : chr [1:7043] "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ paperless_billing: chr [1:7043] "Yes" "No" "Yes" "No" ...
## $ payment_method : chr [1:7043] "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ monthly_charges : num [1:7043] 29.9 57 53.9 42.3 70.7 ...
## $ total_charges : num [1:7043] 29.9 1889.5 108.2 1840.8 151.7 ...
## $ churn : chr [1:7043] "No" "No" "Yes" "No" ...
## - attr(*, "spec")=
## .. cols(
## .. customerID = col_character(),
## .. gender = col_character(),
## .. SeniorCitizen = col_double(),
## .. Partner = col_character(),
## .. Dependents = col_character(),
## .. tenure = col_double(),
## .. PhoneService = col_character(),
## .. MultipleLines = col_character(),
## .. InternetService = col_character(),
## .. OnlineSecurity = col_character(),
## .. OnlineBackup = col_character(),
## .. DeviceProtection = col_character(),
## .. TechSupport = col_character(),
## .. StreamingTV = col_character(),
## .. StreamingMovies = col_character(),
## .. Contract = col_character(),
## .. PaperlessBilling = col_character(),
## .. PaymentMethod = col_character(),
## .. MonthlyCharges = col_double(),
## .. TotalCharges = col_double(),
## .. Churn = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Quick data summary
skim(churn_data)
| Name | churn_data |
| Number of rows | 7043 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 17 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| customer_id | 0 | 1 | 10 | 10 | 0 | 7043 | 0 |
| gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| partner | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| dependents | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| phone_service | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| multiple_lines | 0 | 1 | 2 | 16 | 0 | 3 | 0 |
| internet_service | 0 | 1 | 2 | 11 | 0 | 3 | 0 |
| online_security | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| online_backup | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| device_protection | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| tech_support | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| streaming_tv | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| streaming_movies | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| contract | 0 | 1 | 8 | 14 | 0 | 3 | 0 |
| paperless_billing | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| payment_method | 0 | 1 | 12 | 25 | 0 | 4 | 0 |
| churn | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| senior_citizen | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| tenure | 0 | 1 | 32.37 | 24.56 | 0.00 | 9.00 | 29.00 | 55.00 | 72.00 | ▇▃▃▃▆ |
| monthly_charges | 0 | 1 | 64.76 | 30.09 | 18.25 | 35.50 | 70.35 | 89.85 | 118.75 | ▇▅▆▇▅ |
| total_charges | 11 | 1 | 2283.30 | 2266.77 | 18.80 | 401.45 | 1397.47 | 3794.74 | 8684.80 | ▇▂▂▂▁ |
# Check missing values
colSums(is.na(churn_data))
## customer_id gender senior_citizen partner
## 0 0 0 0
## dependents tenure phone_service multiple_lines
## 0 0 0 0
## internet_service online_security online_backup device_protection
## 0 0 0 0
## tech_support streaming_tv streaming_movies contract
## 0 0 0 0
## paperless_billing payment_method monthly_charges total_charges
## 0 0 0 11
## churn
## 0
# Convert total_charges to numeric (some spaces cause NA)
churn_data <- churn_data %>%
mutate(total_charges = as.numeric(trimws(total_charges)))
# Check again
summary(churn_data$total_charges)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.8 401.4 1397.5 2283.3 3794.7 8684.8 11
# How many NAs?
sum(is.na(churn_data$total_charges))
## [1] 11
# Remove rows with missing total_charges
churn_data <- churn_data %>%
filter(!is.na(total_charges))
# Churn rate overall
churn_data %>%
count(churn) %>%
mutate(percent = n / sum(n) * 100)
## # A tibble: 2 × 3
## churn n percent
## <chr> <int> <dbl>
## 1 No 5163 73.4
## 2 Yes 1869 26.6
# Churn rate by contract type
churn_data %>%
group_by(contract) %>%
summarise(
churn_rate = mean(churn == "Yes") * 100,
count = n()
)
## # A tibble: 3 × 3
## contract churn_rate count
## <chr> <dbl> <int>
## 1 Month-to-month 42.7 3875
## 2 One year 11.3 1472
## 3 Two year 2.85 1685
# Bar plot: Churn rate by contract type
churn_data %>%
group_by(contract) %>%
summarise(churn_rate = mean(churn == "Yes") * 100) %>%
ggplot(aes(x = contract, y = churn_rate, fill = contract)) +
geom_col() +
labs(title = "Churn Rate by Contract Type", y = "Churn Rate (%)", x = "Contract Type") +
theme_minimal()
# Create tenure group
churn_data <- churn_data %>%
mutate(tenure_group = case_when(
tenure <= 12 ~ "0-1 year",
tenure <= 24 ~ "1-2 years",
tenure <= 48 ~ "2-4 years",
tenure <= 60 ~ "4-5 years",
tenure > 60 ~ "5+ years"
))
# Churn by tenure group
churn_data %>%
group_by(tenure_group) %>%
summarise(
churn_rate = mean(churn == "Yes") * 100,
count = n()
)
## # A tibble: 5 × 3
## tenure_group churn_rate count
## <chr> <dbl> <int>
## 1 0-1 year 47.7 2175
## 2 1-2 years 28.7 1024
## 3 2-4 years 20.4 1594
## 4 4-5 years 14.4 832
## 5 5+ years 6.61 1407
# Bar plot: Churn rate by tenure group
churn_data %>%
group_by(tenure_group) %>%
summarise(churn_rate = mean(churn == "Yes") * 100) %>%
ggplot(aes(x = tenure_group, y = churn_rate, fill = tenure_group)) +
geom_col() +
labs(title = "Churn Rate by Tenure Group", y = "Churn Rate (%)", x = "Tenure Group") +
theme_minimal()
# Quick EDA report (DataExplorer)
plot_bar(churn_data, by = "churn")
## 1 columns ignored with more than 50 categories.
## customer_id: 7032 categories
# Churn by payment method
churn_data %>%
group_by(payment_method) %>%
summarise(
churn_rate = mean(churn == "Yes") * 100,
count = n()
) %>%
arrange(desc(churn_rate))
## # A tibble: 4 × 3
## payment_method churn_rate count
## <chr> <dbl> <int>
## 1 Electronic check 45.3 2365
## 2 Mailed check 19.2 1604
## 3 Bank transfer (automatic) 16.7 1542
## 4 Credit card (automatic) 15.3 1521
# Churn by internet service
churn_data %>%
group_by(internet_service) %>%
summarise(
churn_rate = mean(churn == "Yes") * 100,
count = n()
) %>%
arrange(desc(churn_rate))
## # A tibble: 3 × 3
## internet_service churn_rate count
## <chr> <dbl> <int>
## 1 Fiber optic 41.9 3096
## 2 DSL 19.0 2416
## 3 No 7.43 1520
# Boxplot: Monthly charges vs. churn
ggplot(churn_data, aes(x = churn, y = monthly_charges, fill = churn)) +
geom_boxplot() +
labs(title = "Monthly Charges vs. Churn", x = "Churn", y = "Monthly Charges") +
theme_minimal()
# Convert categorical variables to factors
churn_data <- churn_data %>%
mutate(
churn = factor(churn),
gender = factor(gender),
partner = factor(partner),
dependents = factor(dependents),
phone_service = factor(phone_service),
multiple_lines = factor(multiple_lines),
internet_service = factor(internet_service),
online_security = factor(online_security),
online_backup = factor(online_backup),
device_protection = factor(device_protection),
tech_support = factor(tech_support),
streaming_tv = factor(streaming_tv),
streaming_movies = factor(streaming_movies),
contract = factor(contract),
paperless_billing = factor(paperless_billing),
payment_method = factor(payment_method),
tenure_group = factor(tenure_group)
)
We will use 70% train / 30% test split.
library(caTools)
# Set random seed for reproducibility
set.seed(123)
# Split data
split <- sample.split(churn_data$churn, SplitRatio = 0.7)
train_data <- subset(churn_data, split == TRUE)
test_data <- subset(churn_data, split == FALSE)
# Check sizes
nrow(train_data)
## [1] 4922
nrow(test_data)
## [1] 2110
# Build logistic regression model
churn_model <- glm(churn ~ contract + tenure + monthly_charges + payment_method + internet_service + paperless_billing + online_security + tech_support + tenure_group,
data = train_data,
family = binomial)
# Model summary
summary(churn_model)
##
## Call:
## glm(formula = churn ~ contract + tenure + monthly_charges + payment_method +
## internet_service + paperless_billing + online_security +
## tech_support + tenure_group, family = binomial, data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8572 -0.6669 -0.2786 0.6735 3.2204
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.680902 0.222782 -3.056 0.002240 **
## contractOne year -0.718740 0.127592 -5.633 1.77e-08 ***
## contractTwo year -1.743919 0.231905 -7.520 5.48e-14 ***
## tenure -0.053200 0.009066 -5.868 4.41e-09 ***
## monthly_charges 0.010767 0.003962 2.718 0.006569 **
## payment_methodCredit card (automatic) -0.049399 0.136062 -0.363 0.716557
## payment_methodElectronic check 0.330577 0.112137 2.948 0.003199 **
## payment_methodMailed check -0.065573 0.136118 -0.482 0.629995
## internet_serviceFiber optic 0.533029 0.170086 3.134 0.001725 **
## internet_serviceNo -0.927831 0.186763 -4.968 6.77e-07 ***
## paperless_billingYes 0.436720 0.089336 4.889 1.02e-06 ***
## online_securityNo internet service NA NA NA NA
## online_securityYes -0.529635 0.102649 -5.160 2.47e-07 ***
## tech_supportNo internet service NA NA NA NA
## tech_supportYes -0.356172 0.104719 -3.401 0.000671 ***
## tenure_group1-2 years -0.305155 0.164593 -1.854 0.063739 .
## tenure_group2-4 years 0.360018 0.284011 1.268 0.204934
## tenure_group4-5 years 1.168785 0.459498 2.544 0.010971 *
## tenure_group5+ years 1.573792 0.568618 2.768 0.005645 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5699.5 on 4921 degrees of freedom
## Residual deviance: 4054.5 on 4905 degrees of freedom
## AIC: 4088.5
##
## Number of Fisher Scoring iterations: 6
# Predict probabilities
test_prob <- predict(churn_model, newdata = test_data, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Convert probabilities to Yes/No (threshold 0.5)
test_pred <- ifelse(test_prob > 0.5, "Yes", "No") %>% factor(levels = c("No", "Yes"))
# Confusion matrix
table(Predicted = test_pred, Actual = test_data$churn)
## Actual
## Predicted No Yes
## No 1377 271
## Yes 172 290
# Calculate accuracy
mean(test_pred == test_data$churn)
## [1] 0.7900474
The company seeks to reduce customer churn, which negatively impacts revenue and profitability. This analysis identifies key drivers of churn and predicts which customers are at risk, enabling more targeted retention efforts.
## Cleaned data exported as 'cleaned_churn_data_for_tableau.csv' in your working directory.