#Análisis y preparación de los datos
library(ggplot2)
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Adjuntando el paquete: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
churn_data= read.csv("555173.csv")
churn_data_limpio <- na.omit(churn_data)
summary(churn_data_limpio)
## customerID gender SeniorCitizen Partner
## Length:7032 Length:7032 Min. :0.0000 Length:7032
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :0.0000 Mode :character
## Mean :0.1624
## 3rd Qu.:0.0000
## Max. :1.0000
## Dependents tenure PhoneService MultipleLines
## Length:7032 Min. : 1.00 Length:7032 Length:7032
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :29.00 Mode :character Mode :character
## Mean :32.42
## 3rd Qu.:55.00
## Max. :72.00
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## Length:7032 Length:7032 Length:7032 Length:7032
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## TechSupport StreamingTV StreamingMovies Contract
## Length:7032 Length:7032 Length:7032 Length:7032
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## Length:7032 Length:7032 Min. : 18.25 Min. : 18.8
## Class :character Class :character 1st Qu.: 35.59 1st Qu.: 401.4
## Mode :character Mode :character Median : 70.35 Median :1397.5
## Mean : 64.80 Mean :2283.3
## 3rd Qu.: 89.86 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
## Churn
## Length:7032
## Class :character
## Mode :character
##
##
##
str(churn_data_limpio)
## 'data.frame': 7032 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
## - attr(*, "na.action")= 'omit' Named int [1:11] 489 754 937 1083 1341 3332 3827 4381 5219 6671 ...
## ..- attr(*, "names")= chr [1:11] "489" "754" "937" "1083" ...
churn_data_limpio$SeniorCitizen <- replace(churn_data_limpio$SeniorCitizen, churn_data_limpio$SeniorCitizen == 0, "No")
churn_data_limpio$SeniorCitizen <- replace(churn_data_limpio$SeniorCitizen, churn_data_limpio$SeniorCitizen == 1, "Yes")
variables_cualitativas <- c("gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "Churn")
variables_cuantitativas<- c("tenure", "MonthlyCharges", "TotalCharges")
churn_data_limpio[variables_cualitativas] <- lapply(churn_data_limpio[variables_cualitativas], factor)
str(churn_data_limpio[variables_cualitativas])
## 'data.frame': 7032 obs. of 17 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
## $ SeniorCitizen : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
clean_data <- subset(churn_data_limpio, select = -customerID)
str(clean_data)
## 'data.frame': 7032 obs. of 20 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
## $ SeniorCitizen : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
#Diagramas e histogramas
for (var in variables_cualitativas) {
if (var %in% names(clean_data)) {
print(
ggplot(clean_data, aes_string(x = var)) +
geom_bar(fill = "steelblue") +
labs(title = paste("Frecuencia de", var), x = var, y = "Frecuencia") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
}
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
for (var in variables_cuantitativas) {
if (var %in% names(clean_data)) {
print(
ggplot(clean_data, aes_string(x = var)) +
geom_histogram(binwidth = 5, fill = "darkorange", color = "black") +
labs(title = paste("Histograma de", var), x = var, y = "Frecuencia") +
theme_minimal()
)
}
}
#Diagramas e histrogramas en función del Churn
for (var in variables_cualitativas) {
if (var %in% names(churn_data_limpio) && var != "Churn") {
print(
ggplot(churn_data_limpio, aes_string(x = var, fill = "Churn")) +
geom_bar(position = "dodge") +
labs(title = paste("Frecuencia de", var, "según Churn"), x = var, y = "Frecuencia") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
}
}
for (var in variables_cuantitativas) {
if (var %in% names(churn_data_limpio)) {
print(
ggplot(churn_data_limpio, aes_string(x = var, fill = "Churn")) +
geom_histogram(position = "identity", alpha = 0.6, bins = 30) +
labs(title = paste("Histograma de", var, "según Churn"), x = var, y = "Frecuencia") +
theme_minimal()
)
}
}
#Boxplots
for (var in variables_cualitativas) {
if (var %in% names(churn_data_limpio) && var != "MonthlyCharges" && var != "Churn") {
print(
ggplot(churn_data_limpio, aes_string(x = var, y = "MonthlyCharges")) +
geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.shape = 1) +
labs(
title = paste("Distribución de MonthlyCharges según", var),
x = var,
y = "MonthlyCharges"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
}
}
library(ggpubr)
var_par <- combn(variables_cuantitativas, 2, simplify = FALSE)
# Crear gráfico de dispersión para cada par
for (par in var_par) {
x <- par[1]
y <- par[2]
if (x %in% names(churn_data_limpio) && y %in% names(churn_data_limpio)) {
print(
ggplot(churn_data_limpio, aes_string(x = x, y = y)) +
geom_point(alpha = 0.6, color = "steelblue") +
stat_cor(method = "pearson", label.x.npc = "left", label.y.npc = "top") +
labs(title = paste("Dispersión entre", x, "y", y),
x = x, y = y) +
theme_minimal()
)
}
}
churn_train1 <- clean_data[1:5000, ]
churn_test1 <- clean_data[5001:nrow(clean_data),]
#Diseño del modelo de regresión
lmodel <- lm(MonthlyCharges ~ . -TotalCharges -Churn, data= churn_train1)
summary(lmodel)
##
## Call:
## lm(formula = MonthlyCharges ~ . - TotalCharges - Churn, data = churn_train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1927 -0.5992 0.0026 0.6025 4.8593
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.493e+01 6.918e-02 360.357 <2e-16
## genderMale 7.461e-03 2.886e-02 0.259 0.796
## SeniorCitizenYes 1.183e-03 4.200e-02 0.028 0.978
## PartnerYes -1.705e-02 3.518e-02 -0.485 0.628
## DependentsYes 3.445e-02 3.710e-02 0.928 0.353
## tenure 4.699e-04 1.002e-03 0.469 0.639
## PhoneServiceYes 2.007e+01 5.755e-02 348.781 <2e-16
## MultipleLinesNo phone service NA NA NA NA
## MultipleLinesYes 5.005e+00 3.460e-02 144.665 <2e-16
## InternetServiceFiber optic 2.498e+01 4.087e-02 611.186 <2e-16
## InternetServiceNo -2.505e+01 5.693e-02 -439.966 <2e-16
## OnlineSecurityNo internet service NA NA NA NA
## OnlineSecurityYes 4.995e+00 3.811e-02 131.085 <2e-16
## OnlineBackupNo internet service NA NA NA NA
## OnlineBackupYes 5.010e+00 3.576e-02 140.096 <2e-16
## DeviceProtectionNo internet service NA NA NA NA
## DeviceProtectionYes 5.024e+00 3.700e-02 135.786 <2e-16
## TechSupportNo internet service NA NA NA NA
## TechSupportYes 5.025e+00 3.899e-02 128.865 <2e-16
## StreamingTVNo internet service NA NA NA NA
## StreamingTVYes 9.944e+00 3.805e-02 261.318 <2e-16
## StreamingMoviesNo internet service NA NA NA NA
## StreamingMoviesYes 9.986e+00 3.798e-02 262.944 <2e-16
## ContractOne year 7.533e-03 4.540e-02 0.166 0.868
## ContractTwo year -1.692e-04 5.533e-02 -0.003 0.998
## PaperlessBillingYes -1.526e-03 3.233e-02 -0.047 0.962
## PaymentMethodCredit card (automatic) -6.455e-02 4.352e-02 -1.483 0.138
## PaymentMethodElectronic check -1.711e-02 4.284e-02 -0.399 0.690
## PaymentMethodMailed check -1.150e-02 4.676e-02 -0.246 0.806
##
## (Intercept) ***
## genderMale
## SeniorCitizenYes
## PartnerYes
## DependentsYes
## tenure
## PhoneServiceYes ***
## MultipleLinesNo phone service
## MultipleLinesYes ***
## InternetServiceFiber optic ***
## InternetServiceNo ***
## OnlineSecurityNo internet service
## OnlineSecurityYes ***
## OnlineBackupNo internet service
## OnlineBackupYes ***
## DeviceProtectionNo internet service
## DeviceProtectionYes ***
## TechSupportNo internet service
## TechSupportYes ***
## StreamingTVNo internet service
## StreamingTVYes ***
## StreamingMoviesNo internet service
## StreamingMoviesYes ***
## ContractOne year
## ContractTwo year
## PaperlessBillingYes
## PaymentMethodCredit card (automatic)
## PaymentMethodElectronic check
## PaymentMethodMailed check
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.018 on 4978 degrees of freedom
## Multiple R-squared: 0.9989, Adjusted R-squared: 0.9989
## F-statistic: 2.081e+05 on 21 and 4978 DF, p-value: < 2.2e-16
predicciones <- predict(lmodel, newdata = churn_test1)
rmse <- sqrt(mean((churn_test1$MonthlyCharges - predicciones)^2))
print(rmse)
## [1] 1.044388
##Ejercicio 3 - Modelo de Clasificación
clean_data2 <- subset(clean_data, select = -TotalCharges )
churn_train2 <- clean_data2[1:5000, ]
churn_test2 <- clean_data2[5001:nrow (clean_data2), ]
clean_data2$Churn <- as.factor(clean_data2$Churn)
clean_data2[sapply(clean_data2, is.character)] <- lapply(clean_data2[sapply(clean_data2, is.character)], as.factor)
set.seed(123)
gmodel1 <- randomForest(Churn ~ ., data = churn_train2, importance = TRUE)
prob_preds <- predict(gmodel1, newdata = churn_test2, type = "prob")
prob_yes <- prob_preds[, "Yes"]
churn_pred <- ifelse(prob_yes > 0.5, "Yes", "No")
churn_pred <- factor(churn_pred, levels = c("No", "Yes"))
mc <- table(Predicted = churn_pred, Actual = churn_test2$Churn)
TN <- mc["No", "No"]
FP <- mc["Yes", "No"]
FN <- mc["No", "Yes"]
TP <- mc["Yes", "Yes"]
accuracy <- (TP + TN) / (TP + TN + FP + FN)
fpr <- FP / (FP + TN)
fnr <- FN / (FN + TP)
tpr <- TP / (TP + FN)
cat("Accuracy:", round(accuracy, 4), "\n")
## Accuracy: 0.7918
cat("FPR:", round(fpr, 4), "\n")
## FPR: 0.1034
cat("FNR:", round(fnr, 4), "\n")
## FNR: 0.4891
cat("TPR:", round(tpr, 4), "\n")
## TPR: 0.5109
gmodel2 <- randomForest(Churn ~ Contract + tenure + InternetService, data = churn_train2, importance = TRUE)
prob_preds2 <- predict(gmodel2, newdata = churn_test2, type = "prob")
prob_yes2 <- prob_preds2[, "Yes"]
churn_pred2 <- ifelse(prob_yes2 > 0.5, "Yes", "No")
churn_pred2 <- factor(churn_pred2, levels = c("No", "Yes"))
mc2 <- table(Predicted = churn_pred2, Actual = churn_test2$Churn)
TN2 <- mc2["No", "No"]
FP2 <- mc2["Yes", "No"]
FN2 <- mc2["No", "Yes"]
TP2 <- mc2["Yes", "Yes"]
accuracy2 <- (TP2 + TN2) / (TP2 + TN2 + FP2 + FN2)
fpr2 <- FP2 / (FP2 + TN2)
fnr2 <- FN2 / (FN2 + TP2)
tpr2 <- TP2 / (TP2 + FN2)
cat("Accuracy:", round(accuracy2, 4), "\n")
## Accuracy: 0.7904
cat("FPR:", round(fpr2, 4), "\n")
## FPR: 0.0595
cat("FNR:", round(fnr2, 4), "\n")
## FNR: 0.6123
cat("TPR:", round(tpr2, 4), "\n")
## TPR: 0.3877
gmodel3 <- randomForest(Churn ~ ., data = churn_train2, importance = TRUE)
clean_data2$Churn <- as.factor(clean_data2$Churn)
probabilidades <- predict(gmodel3, newdata = churn_test2, type = "prob")
churn_pred <- ifelse(probabilidades[, "Yes"] > 0.5, "Yes", "No")
churn_pred <- factor(churn_pred, levels = c("No", "Yes"))
mc2 <- table(Predicted = churn_pred, Actual = churn_test2$Churn)
print(mc2)
## Actual
## Predicted No Yes
## No 1332 268
## Yes 148 284
#Escenario 1
CT <- 200
AR <- 0.4
R <- 500
umbrales <- seq(0,1, by = 0.1)
real <- churn_test2$Churn
beneficios <- numeric(length(umbrales))
for (i in seq_along(umbrales)) {
umbral <- umbrales[i]
pred_churn <- ifelse(probabilidades[, "Yes"] > umbral, "Yes", "No")
pred_churn <- factor(pred_churn, levels = c("No", "Yes"))
mc <- table(Predicted = pred_churn, Actual = real)
TP <- ifelse(!is.na(mc["Yes", "Yes"]), mc["Yes", "Yes"], 0)
FP <- ifelse(!is.na(mc["Yes", "No"]), mc["Yes", "No"], 0)
FN <- ifelse(!is.na(mc["No", "Yes"]), mc["No", "Yes"], 0)
resultado_con <- -FP * AR * CT - TP * AR * CT - TP * (1 - AR) * R - FN * R
resultado_sin <- -(FN + TP) * R
beneficio <- resultado_con - resultado_sin
beneficios[i] <- beneficio
}
resultados <- data.frame(Umbral = umbrales, Beneficio = beneficios)
optimo <- resultados[which.max(resultados$Beneficio), ]
print(resultados)
## Umbral Beneficio
## 1 0.0 -44200
## 2 0.1 240
## 3 0.2 14120
## 4 0.3 20800
## 5 0.4 22280
## 6 0.5 22240
## 7 0.6 20240
## 8 0.7 17880
## 9 0.8 11000
## 10 0.9 4760
## 11 1.0 0
cat("\n✅ Umbral óptimo:", optimo$Umbral, "\n💶 Beneficio máximo:", round(optimo$Beneficio, 2), "€\n")
##
## ✅ Umbral óptimo: 0.4
## 💶 Beneficio máximo: 22280 €
#Escenario 2
CT <- 400
AR <- 0.8
R <- 500
umbrales <- seq(0,1, by = 0.1)
real <- churn_test2$Churn
beneficios <- numeric(length(umbrales))
for (i in seq_along(umbrales)) {
umbral <- umbrales[i]
pred_churn <- ifelse(probabilidades[, "Yes"] > umbral, "Yes", "No")
pred_churn <- factor(pred_churn, levels = c("No", "Yes"))
mc <- table(Predicted = pred_churn, Actual = real)
TP <- ifelse(!is.na(mc["Yes", "Yes"]), mc["Yes", "Yes"], 0)
FP <- ifelse(!is.na(mc["Yes", "No"]), mc["Yes", "No"], 0)
FN <- ifelse(!is.na(mc["No", "Yes"]), mc["No", "Yes"], 0)
resultado_con <- -FP * AR * CT - TP * AR * CT - TP * (1 - AR) * R - FN * R
resultado_sin <- -(FN + TP) * R
beneficio <- resultado_con - resultado_sin
beneficios[i] <- beneficio
}
resultados <- data.frame(Umbral = umbrales, Beneficio = beneficios)
optimo <- resultados[which.max(resultados$Beneficio), ]
print(resultados)
## Umbral Beneficio
## 1 0.0 -397200
## 2 0.1 -199040
## 3 0.2 -122320
## 4 0.3 -76000
## 5 0.4 -48080
## 6 0.5 -24640
## 7 0.6 -11040
## 8 0.7 -1680
## 9 0.8 400
## 10 0.9 1040
## 11 1.0 0
cat("\n✅ Umbral óptimo:", optimo$Umbral, "\n💶 Beneficio máximo:", round(optimo$Beneficio, 2), "€\n")
##
## ✅ Umbral óptimo: 0.9
## 💶 Beneficio máximo: 1040 €