library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.1
## Loading required package: lattice
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.2
library(party)
## Warning: package 'party' was built under R version 4.3.1
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.1
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.1
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.1
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.2
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
df <- read.csv("C:\\Users\\memil\\OneDrive\\Desktop\\aaTecDeMonterrey\\5to Semestre\\Mineria\\WA_Fn-UseC_-Telco-Customer-Churn2.csv")



df[] <- lapply(df, as.factor)

df$SeniorCitizen <- as.integer(df$SeniorCitizen)

df$tenure <- as.integer(df$tenure)
df$MonthlyCharges <- as.integer(df$MonthlyCharges)
df$TotalCharges <- as.integer(df$TotalCharges)
df$customerID <- NULL
summary(df)
##     gender     SeniorCitizen   Partner    Dependents     tenure     
##  Female:3488   Min.   :1.000   No :3641   No :4933   Min.   : 1.00  
##  Male  :3555   1st Qu.:1.000   Yes:3402   Yes:2110   1st Qu.:10.00  
##                Median :1.000                         Median :30.00  
##                Mean   :1.162                         Mean   :33.37  
##                3rd Qu.:1.000                         3rd Qu.:56.00  
##                Max.   :2.000                         Max.   :73.00  
##                                                                     
##  PhoneService          MultipleLines     InternetService
##  No : 682     No              :3390   DSL        :2421  
##  Yes:6361     No phone service: 682   Fiber optic:3096  
##               Yes             :2971   No         :1526  
##                                                         
##                                                         
##                                                         
##                                                         
##              OnlineSecurity              OnlineBackup 
##  No                 :3498   No                 :3088  
##  No internet service:1526   No internet service:1526  
##  Yes                :2019   Yes                :2429  
##                                                       
##                                                       
##                                                       
##                                                       
##             DeviceProtection              TechSupport  
##  No                 :3095    No                 :3473  
##  No internet service:1526    No internet service:1526  
##  Yes                :2422    Yes                :2044  
##                                                        
##                                                        
##                                                        
##                                                        
##               StreamingTV              StreamingMovies           Contract   
##  No                 :2810   No                 :2785   Month-to-month:3875  
##  No internet service:1526   No internet service:1526   One year      :1473  
##  Yes                :2707   Yes                :2732   Two year      :1695  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PaperlessBilling                   PaymentMethod  MonthlyCharges  
##  No :2872         Bank transfer (automatic):1544   Min.   :   1.0  
##  Yes:4171         Credit card (automatic)  :1522   1st Qu.: 200.0  
##                   Electronic check         :2365   Median : 723.0  
##                   Mailed check             :1612   Mean   : 684.5  
##                                                    3rd Qu.:1091.0  
##                                                    Max.   :1585.0  
##                                                                    
##   TotalCharges        Churn     
##  Min.   :   1   No churn :5174  
##  1st Qu.:1381   Yes churn:1869  
##  Median :3066                   
##  Mean   :3102                   
##  3rd Qu.:4783                   
##  Max.   :6530                   
##  NA's   :11
##### SETS de  ENTRENAMIENTO, VALIDACIÓN Y PRUEBA   #########


# Establecer la semilla para reproducibilidad
set.seed(123)

# Paso 1: Dividir el conjunto de datos en entrenamiento (50%) y temporal (50%)
trainIndex1 <- createDataPartition(df$Churn, p = 0.5, list = FALSE, times = 1)
train <- df[trainIndex1, ]
temp <- df[-trainIndex1, ]

# Paso 2: Dividir el conjunto temporal en validación (50% de temp) y prueba (50% de temp)
trainIndex2 <- createDataPartition(temp$Churn, p = 0.5, list = FALSE, times = 1)
validation <- temp[trainIndex2, ]
test <- temp[-trainIndex2, ]




#####   ARBOLES   ######################################################

# Construir el árbol de decisión
tree <- rpart(Churn ~ ., data = train, method = "class", control = rpart.control(cp = 0.0))
rpart.plot(tree)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

rpart.plot(tree,digits = 4,fallen.leaves = TRUE,type = 3,extra = 101)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Visualizar la curva de complejidad de costo
plotcp(tree)

# Elegir un valor de cp basado en la gráfica y podar el árbol
pruned_tree <- prune(tree, cp = 0.005)

# Visualizar el árbol podado
rpart.plot(pruned_tree)

rpart.plot(pruned_tree,digits = 4,fallen.leaves = TRUE,type = 3,extra = 101)

# Realizar predicciones en el conjunto de prueba
predictions <- predict(tree, newdata = test, type = "class")

# Convertir las predicciones a un factor si aún no lo son
predictions <- as.factor(predictions)

# Calcular la matriz de confusión
conf_matrix <- confusionMatrix(predictions, test$Churn,positive = "Yes churn")

# Imprimir la matriz de confusión
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  No churn Yes churn
##   No churn      1121       234
##   Yes churn      172       233
##                                           
##                Accuracy : 0.7693          
##                  95% CI : (0.7489, 0.7888)
##     No Information Rate : 0.7347          
##     P-Value [Acc > NIR] : 0.0004659       
##                                           
##                   Kappa : 0.3821          
##                                           
##  Mcnemar's Test P-Value : 0.0024668       
##                                           
##             Sensitivity : 0.4989          
##             Specificity : 0.8670          
##          Pos Pred Value : 0.5753          
##          Neg Pred Value : 0.8273          
##              Prevalence : 0.2653          
##          Detection Rate : 0.1324          
##    Detection Prevalence : 0.2301          
##       Balanced Accuracy : 0.6830          
##                                           
##        'Positive' Class : Yes churn       
## 
# Obtener las puntuaciones de probabilidad
prediccion_prob <- predict(tree, test, type = "prob")

# Si el resultado es una matriz, selecciona la columna que corresponde a la clase 'Yes' o '1'
prediccion_prob_yes <- prediccion_prob[, "Yes churn"]

# Generar la curva ROC
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.1
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc_obj <- roc(test$Churn, prediccion_prob_yes)
## Setting levels: control = No churn, case = Yes churn
## Setting direction: controls < cases
# Dibujar la curva ROC
plot.roc(roc_obj, main="Curva ROC", col="blue")

# Calcular el AUC
auc(roc_obj)
## Area under the curve: 0.8118
########  Clasificación Bayesiana   ##############################


modelo_nb <- naiveBayes(Churn ~ ., data = train)


# Hacer predicciones
predicciones_nb <- predict(modelo_nb, test)



# tablas de probabilidad condicional:
modelo_nb$tables
## $gender
##            gender
## Y              Female      Male
##   No churn  0.4901430 0.5098570
##   Yes churn 0.5080214 0.4919786
## 
## $SeniorCitizen
##            SeniorCitizen
## Y               [,1]      [,2]
##   No churn  1.124855 0.3306184
##   Yes churn 1.258824 0.4382229
## 
## $Partner
##            Partner
## Y                  No       Yes
##   No churn  0.4642443 0.5357557
##   Yes churn 0.6331551 0.3668449
## 
## $Dependents
##            Dependents
## Y                  No       Yes
##   No churn  0.6544260 0.3455740
##   Yes churn 0.8224599 0.1775401
## 
## $tenure
##            tenure
## Y               [,1]     [,2]
##   No churn  38.51411 24.21677
##   Yes churn 18.19251 18.97568
## 
## $PhoneService
##            PhoneService
## Y                   No        Yes
##   No churn  0.10243525 0.89756475
##   Yes churn 0.08877005 0.91122995
## 
## $MultipleLines
##            MultipleLines
## Y                   No No phone service        Yes
##   No churn  0.50212601       0.10243525 0.39543873
##   Yes churn 0.47165775       0.08877005 0.43957219
## 
## $InternetService
##            InternetService
## Y                  DSL Fiber optic         No
##   No churn  0.38036336  0.34325474 0.27638191
##   Yes churn 0.26417112  0.67807487 0.05775401
## 
## $OnlineSecurity
##            OnlineSecurity
## Y                   No No internet service        Yes
##   No churn  0.39041361          0.27638191 0.33320448
##   Yes churn 0.78502674          0.05775401 0.15721925
## 
## $OnlineBackup
##            OnlineBackup
## Y                   No No internet service        Yes
##   No churn  0.37301894          0.27638191 0.35059915
##   Yes churn 0.68449198          0.05775401 0.25775401
## 
## $DeviceProtection
##            DeviceProtection
## Y                   No No internet service        Yes
##   No churn  0.35833011          0.27638191 0.36528798
##   Yes churn 0.65668449          0.05775401 0.28556150
## 
## $TechSupport
##            TechSupport
## Y                   No No internet service        Yes
##   No churn  0.37688442          0.27638191 0.34673367
##   Yes churn 0.78609626          0.05775401 0.15614973
## 
## $StreamingTV
##            StreamingTV
## Y                   No No internet service        Yes
##   No churn  0.36064940          0.27638191 0.36296869
##   Yes churn 0.51443850          0.05775401 0.42780749
## 
## $StreamingMovies
##            StreamingMovies
## Y                   No No internet service        Yes
##   No churn  0.34402783          0.27638191 0.37959026
##   Yes churn 0.51764706          0.05775401 0.42459893
## 
## $Contract
##            Contract
## Y           Month-to-month   One year   Two year
##   No churn      0.42211055 0.25357557 0.32431388
##   Yes churn     0.90053476 0.07486631 0.02459893
## 
## $PaperlessBilling
##            PaperlessBilling
## Y                  No       Yes
##   No churn  0.4773869 0.5226131
##   Yes churn 0.2491979 0.7508021
## 
## $PaymentMethod
##            PaymentMethod
## Y           Bank transfer (automatic) Credit card (automatic) Electronic check
##   No churn                  0.2647855               0.2392733        0.2431388
##   Yes churn                 0.1390374               0.1144385        0.5828877
##            PaymentMethod
## Y           Mailed check
##   No churn     0.2528025
##   Yes churn    0.1636364
## 
## $MonthlyCharges
##            MonthlyCharges
## Y               [,1]     [,2]
##   No churn  627.2822 489.2133
##   Yes churn 814.9369 401.4345
## 
## $TotalCharges
##            TotalCharges
## Y               [,1]     [,2]
##   No churn  3365.313 1908.349
##   Yes churn 2287.313 1839.565
barplot(modelo_nb$tables$OnlineSecurity)
# Agregar una leyenda
legend("topright", inset = .05, title = "CHURN", 
       c("Yes churn", "No churn"), fill = c("black", "lightgray"))

barplot(modelo_nb$tables$PaymentMethod)

barplot(modelo_nb$tables$OnlineBackup)

#############    Calcular la matriz de confusión
conf_matrix_nb <- confusionMatrix(predicciones_nb, test$Churn,positive = "Yes churn")

# Imprimir la matriz de confusión
print(conf_matrix_nb)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  No churn Yes churn
##   No churn       903       100
##   Yes churn      390       367
##                                        
##                Accuracy : 0.7216       
##                  95% CI : (0.7, 0.7424)
##     No Information Rate : 0.7347       
##     P-Value [Acc > NIR] : 0.8973       
##                                        
##                   Kappa : 0.4041       
##                                        
##  Mcnemar's Test P-Value : <2e-16       
##                                        
##             Sensitivity : 0.7859       
##             Specificity : 0.6984       
##          Pos Pred Value : 0.4848       
##          Neg Pred Value : 0.9003       
##              Prevalence : 0.2653       
##          Detection Rate : 0.2085       
##    Detection Prevalence : 0.4301       
##       Balanced Accuracy : 0.7421       
##                                        
##        'Positive' Class : Yes churn    
## 
###################  Curva ROC  ############################

# Obtener las puntuaciones de probabilidad
prediccion_prob_nb <- predict(modelo_nb, test, type = "raw")

# Si el resultado es una matriz, selecciona la columna que corresponde a la clase 'Yes' o '1'
prediccion_prob_yes_nb <- prediccion_prob_nb[, "Yes churn"]

# Generar la curva ROC
library(pROC)
roc_obj_nb <- roc(test$Churn, prediccion_prob_yes_nb)
## Setting levels: control = No churn, case = Yes churn
## Setting direction: controls < cases
# Dibujar la curva ROC
plot.roc(roc_obj_nb, main="Curva ROC", col="blue")

# Calcular el AUC
auc(roc_obj_nb)
## Area under the curve: 0.8167
###########################   Actividad  #############################################


#1. Hacer una tabla de comparación del redimiento(exactitud, sensibilidad y especificidad)
#entre los dos modelos. Establecer semejanzad y diferencias.

#CART:
#Exactitud: 76.93%
#Sensibilidad: 49.89%
#Especificidad: 86.70%

#Bayes:
#Exactitud: 72.16%
#Sensibilidad: 78.59%
#Especificidad: 69.84%

#Ambos modelos tienen una exactitud similar. Sin embargo, algo que 
#destaca del modelo Bayesiano es que es mejor en detectar las instancias
#positivas que CART, ya que Bayes tiene una sensibilidad de 78.59% mientras
#que la de CART es de 49.89%. Aún así, el modelo CART fue mejor para clasificar
#correctamente los valores negativos, ya que si especificidad es de 86.70%, 
#mientras que la especificidad de Bayes es de 69.84%. Habiendo dicho esto, 
#creemos que el mejor modelo en esta situación fue el Bayesiano, ya que clasifica
#de mejor manera los casos positivos y eso es lo que estamos buscando del modelo.


#2. Hacer la interpretación sobre el Churn o abandono del cliente a la compañia
#utilizando el árbol de decisiones óptimo.



#3. Hacer una interpretación similar, pero con los resultados que aporta el algoritno naiveBayes 
#Comentarios la facilidad en la interpretación.