# Part 1: load the churn dataset and check its dimensions ----

# Read the CSV once and keep the result. A bare read.csv() call (without
# assignment) only parses the file a second time and dumps every row to the
# console.
datos <- read.csv("churnactividad.csv")
nrow(datos)
## [1] 7043
ncol(datos)
## [1] 22
dim(datos)
## [1] 7043   22

# Part 2: variable structure and data dictionary ----

# Compact overview of the data frame: each column's type and first values.
str(datos)
## 'data.frame':    7043 obs. of  22 variables:
##  $ No.             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : chr  "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr  "Yes" "No" "No" "No" ...
##  $ Dependents      : chr  "No" "No" "No" "No" ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr  "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr  "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr  "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr  "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr  "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr  "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr  "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr  "No" "No" "No" "No" ...
##  $ StreamingMovies : chr  "No" "No" "No" "No" ...
##  $ Contract        : chr  "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr  "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr  "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  39.6 2339.3 97 1596.6 181.7 ...
##  $ Churn           : chr  "No" "No" "Yes" "No" ...
# Class of every column, returned as a list named after the columns.
lapply(datos, class)
## $No.
## [1] "integer"
## 
## $customerID
## [1] "character"
## 
## $gender
## [1] "character"
## 
## $SeniorCitizen
## [1] "integer"
## 
## $Partner
## [1] "character"
## 
## $Dependents
## [1] "character"
## 
## $tenure
## [1] "integer"
## 
## $PhoneService
## [1] "character"
## 
## $MultipleLines
## [1] "character"
## 
## $InternetService
## [1] "character"
## 
## $OnlineSecurity
## [1] "character"
## 
## $OnlineBackup
## [1] "character"
## 
## $DeviceProtection
## [1] "character"
## 
## $TechSupport
## [1] "character"
## 
## $StreamingTV
## [1] "character"
## 
## $StreamingMovies
## [1] "character"
## 
## $Contract
## [1] "character"
## 
## $PaperlessBilling
## [1] "character"
## 
## $PaymentMethod
## [1] "character"
## 
## $MonthlyCharges
## [1] "numeric"
## 
## $TotalCharges
## [1] "numeric"
## 
## $Churn
## [1] "character"
# Data dictionary: one row per column of `datos`, labelled "Numérica" when
# is.numeric() is TRUE for the column (integer or double) and "Categórica"
# otherwise. vapply() pins the result to exactly one logical per column,
# whereas sapply() chooses its return type from the input.
diccionario <- data.frame(
  Variable = names(datos),
  Tipo = ifelse(
    vapply(datos, is.numeric, logical(1)),
    "Numérica",
    "Categórica"
  )
)
# Written without row names so the file holds only the two columns above.
write.csv(diccionario, "Diccionario.csv", row.names = FALSE)

# Part 3: data cleaning and imputation ----

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the character columns, then discard identifier columns; any_of()
# silently ignores names that are not present in the selection.
datos_ch <- select(datos, where(is.character))
datos_char <- select(datos_ch, -any_of(c("No.", "customerID")))

# Distinct values of each remaining text column.
lapply(datos_char, function(columna) unique(columna))
## $gender
## [1] "Female" "Male"  
## 
## $Partner
## [1] "Yes" "No" 
## 
## $Dependents
## [1] "No"  "Yes"
## 
## $PhoneService
## [1] "No"  "Yes" ""   
## 
## $MultipleLines
## [1] "No phone service" "No"               ""                 "Yes"             
## 
## $InternetService
## [1] "DSL"         ""            "Fiber optic" "No"         
## 
## $OnlineSecurity
## [1] "No"                  "Yes"                 "No internet service"
## 
## $OnlineBackup
## [1] "Yes"                 "No"                  "No internet service"
## 
## $DeviceProtection
## [1] "No"                  "Yes"                 "No internet service"
## 
## $TechSupport
## [1] "No"                  "Yes"                 "No internet service"
## 
## $StreamingTV
## [1] "No"                  "Yes"                 "No internet service"
## 
## $StreamingMovies
## [1] "No"                  "Yes"                 "No internet service"
## 
## $Contract
## [1] "Month-to-month" "One year"       "Two year"      
## 
## $PaperlessBilling
## [1] "Yes" "No" 
## 
## $PaymentMethod
## [1] "Electronic check"          "Mailed check"             
## [3] "Bank transfer (automatic)" "Credit card (automatic)"  
## 
## $Churn
## [1] "No"  "Yes"
# Number of NA cells per column. Empty strings ("") are not NA, so blank
# text fields are not counted here.
lapply(datos, function(columna) sum(is.na(columna)))
## $No.
## [1] 0
## 
## $customerID
## [1] 0
## 
## $gender
## [1] 0
## 
## $SeniorCitizen
## [1] 0
## 
## $Partner
## [1] 0
## 
## $Dependents
## [1] 0
## 
## $tenure
## [1] 20
## 
## $PhoneService
## [1] 0
## 
## $MultipleLines
## [1] 0
## 
## $InternetService
## [1] 0
## 
## $OnlineSecurity
## [1] 0
## 
## $OnlineBackup
## [1] 0
## 
## $DeviceProtection
## [1] 0
## 
## $TechSupport
## [1] 0
## 
## $StreamingTV
## [1] 0
## 
## $StreamingMovies
## [1] 0
## 
## $Contract
## [1] 0
## 
## $PaperlessBilling
## [1] 0
## 
## $PaymentMethod
## [1] 0
## 
## $MonthlyCharges
## [1] 10
## 
## $TotalCharges
## [1] 11
## 
## $Churn
## [1] 0
# Median imputation for tenure: count the gaps, compute the median of the
# observed values, then fill the gaps with it.
impT <- sum(is.na(datos$tenure))
medT <- median(datos$tenure, na.rm = TRUE)
datos$tenure <- replace(datos$tenure, is.na(datos$tenure), medT)

cat("Mediana tenure =", medT, " - imputados =", impT, "\n")
## Mediana tenure = 29  - imputados = 20

# Mode imputation ----

# Replace blank ("") and NA entries of a character vector with its most
# frequent non-missing value.
#
# x: character vector to fill.
# Returns a list with `valores` (the filled vector), `moda` (the value used)
# and `imputados` (how many entries were replaced).
# Stops with an error when `x` has no observed value to take the mode from.
imputar_moda <- function(x) {
  faltante <- is.na(x) | x == ""
  if (all(faltante)) {
    stop("no hay valores observados para calcular la moda", call. = FALSE)
  }
  # which.max() returns the first maximum, so ties go to the first level in
  # table()'s ordering.
  moda <- names(which.max(table(x[!faltante])))
  x[faltante] <- moda
  list(valores = x, moda = moda, imputados = sum(faltante))
}

# Apply the same rule to the three service columns that contain blanks.
cols_moda <- c("PhoneService", "MultipleLines", "InternetService")
res_moda <- lapply(datos[cols_moda], imputar_moda)
datos[cols_moda] <- lapply(res_moda, `[[`, "valores")

modaPS <- res_moda$PhoneService$moda
impPS <- res_moda$PhoneService$imputados

modaML <- res_moda$MultipleLines$moda
impML <- res_moda$MultipleLines$imputados

modaIS <- res_moda$InternetService$moda
impIS <- res_moda$InternetService$imputados

modaPS; modaML; modaIS
## [1] "Yes"
## [1] "No"
## [1] "Fiber optic"
impPS;  impML;  impIS
## [1] 120
## [1] 90
## [1] 60

# Median check ----

# Re-inspect tenure after imputation: its median, how many values were
# imputed, and how many NAs remain. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
MedianT <- median(datos$tenure, na.rm = TRUE)
MedianT
## [1] 29
impT
## [1] 20
sum(is.na(datos$tenure))
## [1] 0

# Mean imputation ----

# MonthlyCharges: count the NA entries, then fill them with the mean of the
# observed values.
impMC <- sum(is.na(datos$MonthlyCharges))
meanMC <- mean(datos$MonthlyCharges, na.rm = TRUE)
datos$MonthlyCharges <- replace(
  datos$MonthlyCharges,
  is.na(datos$MonthlyCharges),
  meanMC
)

meanMC
## [1] 64.7617
impMC
## [1] 10
sum(is.na(datos$MonthlyCharges))
## [1] 0

# TotalCharges: rows with NA are dropped instead of imputed; keep the count
# of removed rows for reporting.
elimTC <- sum(is.na(datos$TotalCharges))
datos <- datos[!is.na(datos$TotalCharges), , drop = FALSE]

elimTC
## [1] 11
sum(is.na(datos$TotalCharges))
## [1] 0

# Plots ----

# Recode SeniorCitizen (compared against 1) as a "Yes"/"No" factor so the
# bar chart shows labels instead of numbers.
sc <- factor(ifelse(datos$SeniorCitizen == 1, "Yes", "No"))

# par() returns the previous values of the settings it changes; keep them so
# the graphics state can be restored after the base plots are drawn.
par_previo <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))

# Frequency bar charts in a 2 x 2 grid.
barplot(
  table(sc),
  main = "SeniorCitizen", ylab = "Frecuencia", col = "lightblue"
)
barplot(
  table(datos$StreamingTV),
  main = "StreamingTV", ylab = "Frecuencia", col = "lightgreen"
)
barplot(
  table(datos$StreamingMovies),
  main = "StreamingMovies", ylab = "Frecuencia", col = "khaki"
)
barplot(
  table(datos$Contract),
  main = "Contract", ylab = "Frecuencia", col = "salmon"
)

# Histograms of the three numeric variables, side by side.
par(mfrow = c(1, 3))
hist(datos$MonthlyCharges, main = "MonthlyCharges", xlab = "Monto")
hist(datos$tenure, main = "Tenure", xlab = "Meses")
hist(datos$TotalCharges, main = "TotalCharges", xlab = "Monto")

# Box plots of the same variables on a fresh 1 x 3 page.
par(mfrow = c(1, 3))
boxplot(datos$tenure, main = "Boxplot Tenure")
boxplot(datos$TotalCharges, main = "Boxplot TotalCharges")
boxplot(datos$MonthlyCharges, main = "Boxplot MonthlyCharges")

# Restore the layout and margins that were active before this section.
par(par_previo)

library(ggplot2)

# Scatter plot of TotalCharges against MonthlyCharges with a fitted
# linear-model trend line on top of the points.
ggplot(
  data = datos,
  mapping = aes(x = MonthlyCharges, y = TotalCharges)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Dispersión MonthlyCharges vs TotalCharges")
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between the two charge columns, using complete pairs only.
cor(
  x = datos$MonthlyCharges,
  y = datos$TotalCharges,
  use = "complete.obs"
)
## [1] 0.574391