# Install the necessary packages if you haven't already
# install.packages("xgboost")
# install.packages("caret") # For data splitting and training
# install.packages("dplyr") # For data manipulation
# install.packages("ggplot2") # For visualization
# Load the libraries
library(xgboost)
library(caret)
library(dplyr)
library(ggplot2)xgboost: For implementing the gradient boosting algorithm. caret: For data manipulation, model training, and evaluation. dplyr: For data manipulation. ggplot2: For data visualization.
# Load the dataset
telco_data <- read.csv("https://raw.githubusercontent.com/DataGuy-Kariuki/Customer.Churn-Project/refs/heads/main/Telco-Customer-Churn.csv")
# View the first few rows and summary statistics
head(telco_data)
str(telco_data)
summary(telco_data)
telco_data <- telco_data[, -1] # Remove the first column (likely an ID column)The Telco customer churn dataset is read directly from a URL into a data frame. Displays the first few rows of the dataset. Provides a structure overview, including variable types and dimensions. Gives summary statistics for each variable, helping understand the data’s characteristics Removes the first column, which is often an ID or index column not needed for analysis.
# Handling missing values
telco_data$TotalCharges <- as.numeric(telco_data$TotalCharges)
telco_data$TotalCharges[is.na(telco_data$TotalCharges)] <- median(telco_data$TotalCharges, na.rm = TRUE)
sum(is.na(telco_data$TotalCharges)) # Confirm no missing valuesA brief overview of the key variables and their types. This includes customer demographics, account information, service information, and charges. Uses median imputation to handle missing values in the Total Charges column, ensuring that the data is ready for modeling.
# EDA Using ggplot2
# Histogram of tenure
ggplot(telco_data, aes(x= tenure)) +
geom_histogram(fill = "skyblue", bins = 30) +
labs(title = "Distribution of Tenure", x = "Tenure", y = "Count")# Histogram for MonthlyCharges
ggplot(telco_data, aes(x= MonthlyCharges)) +
geom_histogram(fill = "salmon", bins = 30) +
labs(title = "Distribution of Monthly Charges", x = "Monthly Charges", y= "count")# Histogram of the TotalCharges
ggplot(telco_data, aes(x= TotalCharges)) +
geom_histogram(fill = "lightgreen", bins = 30) +
labs(title = 'Distribution of Total Charges', x ="Total charges", y ="count")# Categorical Variables vs. Target (Churn)
# Churn by gender
ggplot(telco_data, aes(x = gender, fill = factor(Churn))) +
geom_bar(position = "fill") +
labs(title = "Churn Rate by Gender", x = "Gender", y = "Proportion") +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c("No" = "skyblue", "Yes" = "salmon"), name = "Churn")# Churn by Contract Type
ggplot(telco_data, aes(x = Contract, fill = factor(Churn))) +
geom_bar(position = "fill") +
labs(title = "Churn by Contract Type", x = "Contract Type", y = "Proportion")# Convert relevant columns to factors
categorical_vars <- c("gender", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "Churn")
telco_data[categorical_vars] <- lapply(telco_data[categorical_vars], as.factor)
# Convert Churn to numeric (0 and 1)
telco_data$Churn <- ifelse(telco_data$Churn == "Yes", 1, 0)
# Normalize/Scale numerical features
telco_data$tenure <- scale(telco_data$tenure)
telco_data$MonthlyCharges <- scale(telco_data$MonthlyCharges)
telco_data$TotalCharges <- scale(telco_data$TotalCharges)# Data Splitting
# Set a seed for reproducibility
set.seed(123)
# Split data into 70% training and 30% for test and validation
trainIndex <- createDataPartition(telco_data$Churn, p = 0.7, list = FALSE)
train_data <- telco_data[trainIndex, ]
temp_data <- telco_data[-trainIndex, ]
# Split remaining data into test (15%) and validation (15%)
testIndex <- createDataPartition(temp_data$Churn, p = 0.5, list = FALSE)
test_data <- temp_data[testIndex, ]
validation_data <- temp_data[-testIndex, ]# Model Selection and Training with Gradient Boosting
# Prepare Data for xgboost
train_matrix <- model.matrix(Churn ~ . - 1, data = train_data)
train_label <- train_data$Churn
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
# Prepare test data for early stopping and evaluation
test_matrix <- model.matrix(Churn ~ . - 1, data = test_data)
test_label <- test_data$Churn
dtest <- xgb.DMatrix(data = test_matrix, label = test_label)
# Define model parameters
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
eta = 0.1,
max_depth = 6,
subsample = 0.8,
colsample_bytree = 0.8
)
# Train the model
set.seed(123)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 100,
watchlist = list(train = dtrain, eval = dtest),
early_stopping_rounds = 10,
print_every_n = 10
)# Model Evaluation
# Predictions on the test set
predictions <- predict(xgb_model, dtest)
# Convert probabilities to binary outcome using a threshold (0.5)
predicted_labels <- ifelse(predictions > 0.5, 1, 0)
# Confusion Matrix
confusion_matrix <- confusionMatrix(factor(predicted_labels), factor(test_label))
print(confusion_matrix)# Assuming you have your predicted classes and actual labels
predicted_classes <- ifelse(predictions > 0.5, 1, 0)
# Create a confusion matrix
confusion_matrix <- table(Actual = test_label, Predicted = predicted_classes)
# Convert the confusion matrix to a data frame
confusion_df <- as.data.frame(confusion_matrix)
# Load necessary libraries
library(pROC)## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Assuming you have your predictions and test labels defined
roc_curve <- roc(test_label, predictions)## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ggplot(confusion_df, aes(x = Predicted, y = Actual)) +
geom_tile(aes(fill = Freq), color = "white") +
scale_fill_gradient(low = "white", high = "blue") +
geom_text(aes(label = Freq), vjust = 1) +
labs(title = "Confusion Matrix", x = "Predicted", y = "Actual") +
theme_minimal()# Calculate accuracy for various thresholds
thresholds <- seq(0, 1, by = 0.05)
accuracy_values <- sapply(thresholds, function(thresh) {
predicted_classes <- ifelse(predictions > thresh, 1, 0)
sum(predicted_classes == test_label) / length(test_label)
})
# Create a data frame for ggplot
accuracy_df <- data.frame(Threshold = thresholds, Accuracy = accuracy_values)
# Plot accuracy vs threshold
ggplot(accuracy_df, aes(x = Threshold, y = Accuracy)) +
geom_line(color = "blue") +
labs(title = "Accuracy vs. Threshold", x = "Threshold", y = "Accuracy") +
theme_minimal()