library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.1
## Loading required package: lattice
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.2
library(party)
## Warning: package 'party' was built under R version 4.3.1
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.1
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.1
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.2
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
df <- read.csv("C:\\Users\\memil\\OneDrive\\Desktop\\aaTecDeMonterrey\\5to Semestre\\Mineria\\WA_Fn-UseC_-Telco-Customer-Churn2.csv")
df[] <- lapply(df, as.factor)
df$SeniorCitizen <- as.integer(df$SeniorCitizen)
df$tenure <- as.integer(df$tenure)
df$MonthlyCharges <- as.integer(df$MonthlyCharges)
df$TotalCharges <- as.integer(df$TotalCharges)
df$customerID <- NULL
summary(df)
## gender SeniorCitizen Partner Dependents tenure
## Female:3488 Min. :1.000 No :3641 No :4933 Min. : 1.00
## Male :3555 1st Qu.:1.000 Yes:3402 Yes:2110 1st Qu.:10.00
## Median :1.000 Median :30.00
## Mean :1.162 Mean :33.37
## 3rd Qu.:1.000 3rd Qu.:56.00
## Max. :2.000 Max. :73.00
##
## PhoneService MultipleLines InternetService
## No : 682 No :3390 DSL :2421
## Yes:6361 No phone service: 682 Fiber optic:3096
## Yes :2971 No :1526
##
##
##
##
## OnlineSecurity OnlineBackup
## No :3498 No :3088
## No internet service:1526 No internet service:1526
## Yes :2019 Yes :2429
##
##
##
##
## DeviceProtection TechSupport
## No :3095 No :3473
## No internet service:1526 No internet service:1526
## Yes :2422 Yes :2044
##
##
##
##
## StreamingTV StreamingMovies Contract
## No :2810 No :2785 Month-to-month:3875
## No internet service:1526 No internet service:1526 One year :1473
## Yes :2707 Yes :2732 Two year :1695
##
##
##
##
## PaperlessBilling PaymentMethod MonthlyCharges
## No :2872 Bank transfer (automatic):1544 Min. : 1.0
## Yes:4171 Credit card (automatic) :1522 1st Qu.: 200.0
## Electronic check :2365 Median : 723.0
## Mailed check :1612 Mean : 684.5
## 3rd Qu.:1091.0
## Max. :1585.0
##
## TotalCharges Churn
## Min. : 1 No churn :5174
## 1st Qu.:1381 Yes churn:1869
## Median :3066
## Mean :3102
## 3rd Qu.:4783
## Max. :6530
## NA's :11
##### SETS de ENTRENAMIENTO, VALIDACIÓN Y PRUEBA #########
# Establecer la semilla para reproducibilidad
set.seed(123)
# Paso 1: Dividir el conjunto de datos en entrenamiento (50%) y temporal (50%)
trainIndex1 <- createDataPartition(df$Churn, p = 0.5, list = FALSE, times = 1)
train <- df[trainIndex1, ]
temp <- df[-trainIndex1, ]
# Paso 2: Dividir el conjunto temporal en validación (50% de temp) y prueba (50% de temp)
trainIndex2 <- createDataPartition(temp$Churn, p = 0.5, list = FALSE, times = 1)
validation <- temp[trainIndex2, ]
test <- temp[-trainIndex2, ]
##### ARBOLES ######################################################
# Construir el árbol de decisión
tree <- rpart(Churn ~ ., data = train, method = "class", control = rpart.control(cp = 0.0))
rpart.plot(tree)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

rpart.plot(tree,digits = 4,fallen.leaves = TRUE,type = 3,extra = 101)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Visualizar la curva de complejidad de costo
plotcp(tree)

# Elegir un valor de cp basado en la gráfica y podar el árbol
pruned_tree <- prune(tree, cp = 0.005)
# Visualizar el árbol podado
rpart.plot(pruned_tree)

rpart.plot(pruned_tree,digits = 4,fallen.leaves = TRUE,type = 3,extra = 101)

# Realizar predicciones en el conjunto de prueba
predictions <- predict(tree, newdata = test, type = "class")
# Convertir las predicciones a un factor si aún no lo son
predictions <- as.factor(predictions)
# Calcular la matriz de confusión
conf_matrix <- confusionMatrix(predictions, test$Churn,positive = "Yes churn")
# Imprimir la matriz de confusión
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No churn Yes churn
## No churn 1121 234
## Yes churn 172 233
##
## Accuracy : 0.7693
## 95% CI : (0.7489, 0.7888)
## No Information Rate : 0.7347
## P-Value [Acc > NIR] : 0.0004659
##
## Kappa : 0.3821
##
## Mcnemar's Test P-Value : 0.0024668
##
## Sensitivity : 0.4989
## Specificity : 0.8670
## Pos Pred Value : 0.5753
## Neg Pred Value : 0.8273
## Prevalence : 0.2653
## Detection Rate : 0.1324
## Detection Prevalence : 0.2301
## Balanced Accuracy : 0.6830
##
## 'Positive' Class : Yes churn
##
# Obtener las puntuaciones de probabilidad
prediccion_prob <- predict(tree, test, type = "prob")
# Si el resultado es una matriz, selecciona la columna que corresponde a la clase 'Yes' o '1'
prediccion_prob_yes <- prediccion_prob[, "Yes churn"]
# Generar la curva ROC
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.1
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_obj <- roc(test$Churn, prediccion_prob_yes)
## Setting levels: control = No churn, case = Yes churn
## Setting direction: controls < cases
# Dibujar la curva ROC
plot.roc(roc_obj, main="Curva ROC", col="blue")

# Calcular el AUC
auc(roc_obj)
## Area under the curve: 0.8118
######## Clasificación Bayesiana ##############################
modelo_nb <- naiveBayes(Churn ~ ., data = train)
# Hacer predicciones
predicciones_nb <- predict(modelo_nb, test)
# tablas de probabilidad condicional:
modelo_nb$tables
## $gender
## gender
## Y Female Male
## No churn 0.4901430 0.5098570
## Yes churn 0.5080214 0.4919786
##
## $SeniorCitizen
## SeniorCitizen
## Y [,1] [,2]
## No churn 1.124855 0.3306184
## Yes churn 1.258824 0.4382229
##
## $Partner
## Partner
## Y No Yes
## No churn 0.4642443 0.5357557
## Yes churn 0.6331551 0.3668449
##
## $Dependents
## Dependents
## Y No Yes
## No churn 0.6544260 0.3455740
## Yes churn 0.8224599 0.1775401
##
## $tenure
## tenure
## Y [,1] [,2]
## No churn 38.51411 24.21677
## Yes churn 18.19251 18.97568
##
## $PhoneService
## PhoneService
## Y No Yes
## No churn 0.10243525 0.89756475
## Yes churn 0.08877005 0.91122995
##
## $MultipleLines
## MultipleLines
## Y No No phone service Yes
## No churn 0.50212601 0.10243525 0.39543873
## Yes churn 0.47165775 0.08877005 0.43957219
##
## $InternetService
## InternetService
## Y DSL Fiber optic No
## No churn 0.38036336 0.34325474 0.27638191
## Yes churn 0.26417112 0.67807487 0.05775401
##
## $OnlineSecurity
## OnlineSecurity
## Y No No internet service Yes
## No churn 0.39041361 0.27638191 0.33320448
## Yes churn 0.78502674 0.05775401 0.15721925
##
## $OnlineBackup
## OnlineBackup
## Y No No internet service Yes
## No churn 0.37301894 0.27638191 0.35059915
## Yes churn 0.68449198 0.05775401 0.25775401
##
## $DeviceProtection
## DeviceProtection
## Y No No internet service Yes
## No churn 0.35833011 0.27638191 0.36528798
## Yes churn 0.65668449 0.05775401 0.28556150
##
## $TechSupport
## TechSupport
## Y No No internet service Yes
## No churn 0.37688442 0.27638191 0.34673367
## Yes churn 0.78609626 0.05775401 0.15614973
##
## $StreamingTV
## StreamingTV
## Y No No internet service Yes
## No churn 0.36064940 0.27638191 0.36296869
## Yes churn 0.51443850 0.05775401 0.42780749
##
## $StreamingMovies
## StreamingMovies
## Y No No internet service Yes
## No churn 0.34402783 0.27638191 0.37959026
## Yes churn 0.51764706 0.05775401 0.42459893
##
## $Contract
## Contract
## Y Month-to-month One year Two year
## No churn 0.42211055 0.25357557 0.32431388
## Yes churn 0.90053476 0.07486631 0.02459893
##
## $PaperlessBilling
## PaperlessBilling
## Y No Yes
## No churn 0.4773869 0.5226131
## Yes churn 0.2491979 0.7508021
##
## $PaymentMethod
## PaymentMethod
## Y Bank transfer (automatic) Credit card (automatic) Electronic check
## No churn 0.2647855 0.2392733 0.2431388
## Yes churn 0.1390374 0.1144385 0.5828877
## PaymentMethod
## Y Mailed check
## No churn 0.2528025
## Yes churn 0.1636364
##
## $MonthlyCharges
## MonthlyCharges
## Y [,1] [,2]
## No churn 627.2822 489.2133
## Yes churn 814.9369 401.4345
##
## $TotalCharges
## TotalCharges
## Y [,1] [,2]
## No churn 3365.313 1908.349
## Yes churn 2287.313 1839.565
barplot(modelo_nb$tables$OnlineSecurity)
# Agregar una leyenda
legend("topright", inset = .05, title = "CHURN",
c("Yes churn", "No churn"), fill = c("black", "lightgray"))

barplot(modelo_nb$tables$PaymentMethod)

barplot(modelo_nb$tables$OnlineBackup)

############# Calcular la matriz de confusión
conf_matrix_nb <- confusionMatrix(predicciones_nb, test$Churn,positive = "Yes churn")
# Imprimir la matriz de confusión
print(conf_matrix_nb)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No churn Yes churn
## No churn 903 100
## Yes churn 390 367
##
## Accuracy : 0.7216
## 95% CI : (0.7, 0.7424)
## No Information Rate : 0.7347
## P-Value [Acc > NIR] : 0.8973
##
## Kappa : 0.4041
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7859
## Specificity : 0.6984
## Pos Pred Value : 0.4848
## Neg Pred Value : 0.9003
## Prevalence : 0.2653
## Detection Rate : 0.2085
## Detection Prevalence : 0.4301
## Balanced Accuracy : 0.7421
##
## 'Positive' Class : Yes churn
##
################### Curva ROC ############################
# Obtener las puntuaciones de probabilidad
prediccion_prob_nb <- predict(modelo_nb, test, type = "raw")
# Si el resultado es una matriz, selecciona la columna que corresponde a la clase 'Yes' o '1'
prediccion_prob_yes_nb <- prediccion_prob_nb[, "Yes churn"]
# Generar la curva ROC
library(pROC)
roc_obj_nb <- roc(test$Churn, prediccion_prob_yes_nb)
## Setting levels: control = No churn, case = Yes churn
## Setting direction: controls < cases
# Dibujar la curva ROC
plot.roc(roc_obj_nb, main="Curva ROC", col="blue")

# Calcular el AUC
auc(roc_obj_nb)
## Area under the curve: 0.8167
########################### Actividad #############################################
#1. Hacer una tabla de comparación del redimiento(exactitud, sensibilidad y especificidad)
#entre los dos modelos. Establecer semejanzad y diferencias.
#CART:
#Exactitud: 76.93%
#Sensibilidad: 49.89%
#Especificidad: 86.70%
#Bayes:
#Exactitud: 72.16%
#Sensibilidad: 78.59%
#Especificidad: 69.84%
#Ambos modelos tienen una exactitud similar. Sin embargo, algo que
#destaca del modelo Bayesiano es que es mejor en detectar las instancias
#positivas que CART, ya que Bayes tiene una sensibilidad de 78.59% mientras
#que la de CART es de 49.89%. Aún así, el modelo CART fue mejor para clasificar
#correctamente los valores negativos, ya que si especificidad es de 86.70%,
#mientras que la especificidad de Bayes es de 69.84%. Habiendo dicho esto,
#creemos que el mejor modelo en esta situación fue el Bayesiano, ya que clasifica
#de mejor manera los casos positivos y eso es lo que estamos buscando del modelo.
#2. Hacer la interpretación sobre el Churn o abandono del cliente a la compañia
#utilizando el árbol de decisiones óptimo.
#3. Hacer una interpretación similar, pero con los resultados que aporta el algoritno naiveBayes
#Comentarios la facilidad en la interpretación.