#dondeseempieza #parte1
# Load the churn dataset a single time and keep it in `datos`. A bare
# read.csv() call at top level would parse the file again and print every row
# to the console, so the result is only assigned here.
datos <- read.csv("churnactividad.csv")
# Number of rows (observations).
nrow(datos)
## [1] 7043
# Number of columns (variables).
ncol(datos)
## [1] 22
# Rows and columns together.
dim(datos)
## [1] 7043 22
#parte2
# Compact structure overview of `datos`: one line per column showing its type
# and first values.
utils::str(object = datos)
## 'data.frame': 7043 obs. of 22 variables:
## $ No. : int 1 2 3 4 5 6 7 8 9 10 ...
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 39.6 2339.3 97 1596.6 181.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
# Report the class of every column of `datos` as a named list.
lapply(X = datos, FUN = function(columna) class(columna))
## $No.
## [1] "integer"
##
## $customerID
## [1] "character"
##
## $gender
## [1] "character"
##
## $SeniorCitizen
## [1] "integer"
##
## $Partner
## [1] "character"
##
## $Dependents
## [1] "character"
##
## $tenure
## [1] "integer"
##
## $PhoneService
## [1] "character"
##
## $MultipleLines
## [1] "character"
##
## $InternetService
## [1] "character"
##
## $OnlineSecurity
## [1] "character"
##
## $OnlineBackup
## [1] "character"
##
## $DeviceProtection
## [1] "character"
##
## $TechSupport
## [1] "character"
##
## $StreamingTV
## [1] "character"
##
## $StreamingMovies
## [1] "character"
##
## $Contract
## [1] "character"
##
## $PaperlessBilling
## [1] "character"
##
## $PaymentMethod
## [1] "character"
##
## $MonthlyCharges
## [1] "numeric"
##
## $TotalCharges
## [1] "numeric"
##
## $Churn
## [1] "character"
# Build a simple data dictionary with one row per column of `datos`: the
# column name and a type label ("Numérica" when is.numeric() is TRUE for the
# column, "Categórica" otherwise), then save it as Diccionario.csv.
# vapply() guarantees exactly one logical per column, whereas sapply() can
# silently change its return type (e.g. for a data frame with no columns).
# NOTE(review): integer-coded columns such as `No.` and `SeniorCitizen` are
# labelled "Numérica" by this rule - confirm that is the intended label.
diccionario <- data.frame(
  Variable = names(datos),
  Tipo = ifelse(
    vapply(datos, is.numeric, logical(1)),
    "Numérica",
    "Categórica"
  )
)
write.csv(diccionario, "Diccionario.csv", row.names = FALSE)
#parte3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the character columns of `datos`.
datos_ch <- select(datos, where(is.character))
# Drop identifier columns when they are present; any_of() ignores names that
# do not exist in the data frame.
datos_char <- select(datos_ch, -any_of(c("No.", "customerID")))
# List the distinct values found in each remaining character column.
lapply(datos_char, function(columna) unique(columna))
## $gender
## [1] "Female" "Male"
##
## $Partner
## [1] "Yes" "No"
##
## $Dependents
## [1] "No" "Yes"
##
## $PhoneService
## [1] "No" "Yes" ""
##
## $MultipleLines
## [1] "No phone service" "No" "" "Yes"
##
## $InternetService
## [1] "DSL" "" "Fiber optic" "No"
##
## $OnlineSecurity
## [1] "No" "Yes" "No internet service"
##
## $OnlineBackup
## [1] "Yes" "No" "No internet service"
##
## $DeviceProtection
## [1] "No" "Yes" "No internet service"
##
## $TechSupport
## [1] "No" "Yes" "No internet service"
##
## $StreamingTV
## [1] "No" "Yes" "No internet service"
##
## $StreamingMovies
## [1] "No" "Yes" "No internet service"
##
## $Contract
## [1] "Month-to-month" "One year" "Two year"
##
## $PaperlessBilling
## [1] "Yes" "No"
##
## $PaymentMethod
## [1] "Electronic check" "Mailed check"
## [3] "Bank transfer (automatic)" "Credit card (automatic)"
##
## $Churn
## [1] "No" "Yes"
# Count the NA entries in every column of `datos` (empty strings are not NA
# and are therefore not counted here).
lapply(datos, function(columna) length(which(is.na(columna))))
## $No.
## [1] 0
##
## $customerID
## [1] 0
##
## $gender
## [1] 0
##
## $SeniorCitizen
## [1] 0
##
## $Partner
## [1] 0
##
## $Dependents
## [1] 0
##
## $tenure
## [1] 20
##
## $PhoneService
## [1] 0
##
## $MultipleLines
## [1] 0
##
## $InternetService
## [1] 0
##
## $OnlineSecurity
## [1] 0
##
## $OnlineBackup
## [1] 0
##
## $DeviceProtection
## [1] 0
##
## $TechSupport
## [1] 0
##
## $StreamingTV
## [1] 0
##
## $StreamingMovies
## [1] 0
##
## $Contract
## [1] 0
##
## $PaperlessBilling
## [1] 0
##
## $PaymentMethod
## [1] 0
##
## $MonthlyCharges
## [1] 10
##
## $TotalCharges
## [1] 11
##
## $Churn
## [1] 0
# Impute the missing tenure values with the median of the observed ones.
faltantesT <- is.na(datos$tenure)             # positions of the NAs
impT <- sum(faltantesT)                       # how many NAs there were
medT <- median(datos$tenure, na.rm = TRUE)    # median of tenure
datos$tenure <- replace(datos$tenure, faltantesT, medT)  # imputation
cat("Mediana tenure =", medT, " - imputados =", impT, "\n")
## Mediana tenure = 29 - imputados = 20
# Mode imputation ----
# PhoneService, MultipleLines and InternetService contain empty strings, so an
# entry is treated as missing when it is either NA or "". Each column is
# filled with its own most frequent valid value.
# NOTE(review): the three columns are imputed independently, which could give
# combinations such as PhoneService "No" with MultipleLines "No" - confirm
# this is acceptable for the analysis.

# TRUE for every entry of `x` that is NA or an empty string.
es_faltante <- function(x) {
  is.na(x) | x == ""
}

# Most frequent value among the non-missing entries of `x`; fails with a clear
# message when there is no valid value to take the mode from.
moda_valida <- function(x) {
  validos <- x[!es_faltante(x)]
  if (length(validos) == 0L) {
    stop("cannot compute the mode: no non-missing values", call. = FALSE)
  }
  names(which.max(table(validos)))
}

faltaPS <- es_faltante(datos$PhoneService)
modaPS <- moda_valida(datos$PhoneService)
impPS <- sum(faltaPS)
datos$PhoneService[faltaPS] <- modaPS

faltaML <- es_faltante(datos$MultipleLines)
modaML <- moda_valida(datos$MultipleLines)
impML <- sum(faltaML)
datos$MultipleLines[faltaML] <- modaML

faltaIS <- es_faltante(datos$InternetService)
modaIS <- moda_valida(datos$InternetService)
impIS <- sum(faltaIS)
datos$InternetService[faltaIS] <- modaIS

# Mode used for each column.
modaPS; modaML; modaIS
## [1] "Yes"
## [1] "No"
## [1] "Fiber optic"
# Number of entries imputed in each column.
impPS; impML; impIS
## [1] 120
## [1] 90
## [1] 60
# Median check ----
# Median of tenure as it currently stands in `datos`. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
MedianT <- median(datos$tenure, na.rm = TRUE)
MedianT
## [1] 29
# Number of tenure values that were missing before imputation.
impT
## [1] 20
# Confirm that no missing tenure values remain.
sum(is.na(datos$tenure))
## [1] 0
# Mean imputation ----
# Replace the missing MonthlyCharges values with the mean of the observed ones.
faltantesMC <- is.na(datos$MonthlyCharges)
impMC <- sum(faltantesMC)
meanMC <- mean(datos$MonthlyCharges, na.rm = TRUE)
datos$MonthlyCharges <- replace(datos$MonthlyCharges, faltantesMC, meanMC)
# Mean used for the imputation.
meanMC
## [1] 64.7617
# Number of values imputed.
impMC
## [1] 10
# Confirm that no missing MonthlyCharges values remain.
sum(is.na(datos$MonthlyCharges))
## [1] 0
# Drop the rows whose TotalCharges is missing instead of imputing them.
sin_total <- is.na(datos$TotalCharges)
elimTC <- sum(sin_total)
datos <- datos[!sin_total, ]
# Number of rows removed.
elimTC
## [1] 11
# Confirm that no missing TotalCharges values remain.
sum(is.na(datos$TotalCharges))
## [1] 0
# Plots ----
# Recode the 0/1 SeniorCitizen flag as a "Yes"/"No" factor for plotting.
sc <- factor(ifelse(datos$SeniorCitizen == 1, "Yes", "No"))

# par() returns the previous values of the settings it changes; keep them so
# the layout and margins can be restored after the base plots are drawn
# instead of leaking into any later plotting code.
par_previo <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))

# 2 x 2 grid of frequency bar charts for categorical variables.
barplot(table(sc), main = "SeniorCitizen", ylab = "Frecuencia",
        col = "lightblue")
barplot(table(datos$StreamingTV), main = "StreamingTV", ylab = "Frecuencia",
        col = "lightgreen")
barplot(table(datos$StreamingMovies), main = "StreamingMovies",
        ylab = "Frecuencia", col = "khaki")
barplot(table(datos$Contract), main = "Contract", ylab = "Frecuencia",
        col = "salmon")

# 1 x 3 row of histograms for the numeric variables.
par(mfrow = c(1, 3))
hist(datos$MonthlyCharges, main = "MonthlyCharges", xlab = "Monto")
hist(datos$tenure, main = "Tenure", xlab = "Meses")
hist(datos$TotalCharges, main = "TotalCharges", xlab = "Monto")

# 1 x 3 row of boxplots for the same variables.
par(mfrow = c(1, 3))
boxplot(datos$tenure, main = "Boxplot Tenure")
boxplot(datos$TotalCharges, main = "Boxplot TotalCharges")
boxplot(datos$MonthlyCharges, main = "Boxplot MonthlyCharges")

# Restore the layout and margins that were active before this section.
par(par_previo)
library(ggplot2)
# Scatter plot of TotalCharges against MonthlyCharges with a fitted
# linear-model ("lm") trend line on top of the points.
ggplot(data = datos, mapping = aes(x = MonthlyCharges, y = TotalCharges)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Dispersión MonthlyCharges vs TotalCharges")
## `geom_smooth()` using formula = 'y ~ x'
# Correlation between the two variables, using only complete observations.
with(datos, cor(MonthlyCharges, TotalCharges, use = "complete.obs"))
## [1] 0.574391