# ANÁLISIS EXPLORATORIO DE DATOS
# Load the customer data set from GitHub (semicolon-separated CSV).
# stringsAsFactors = TRUE is set explicitly: R >= 4.0 changed the default to
# FALSE, which would turn the text columns into character vectors instead of
# the factors reported by the str() and summary() output in this script.
telecom <- read.csv(
  "https://raw.githubusercontent.com/VictorGuevaraP/Mineria-de-datos/master/Telecomunicaciones.csv",
  sep = ";",
  stringsAsFactors = TRUE
)
# Preview the first six rows (six is the default for head())
head(telecom, n = 6L)
## IdCliente Género Edad Llamadas Tiempo.enero Tiempo.febrero Monto
## 1 P50417214 Femenino 26 4 27.0 26.1 89.7
## 2 P50417215 Masculino 33 2 30.1 20.5 88.8
## 3 P50417216 Masculino 21 8 26.0 34.4 85.4
## 4 P50417217 Femenino 23 8 34.1 36.1 89.0
## 5 P50417218 Masculino 34 1 30.1 28.9 77.1
## 6 P50417219 Femenino 29 6 30.7 20.9 97.8
## Espera Opinión Empresa
## 1 0.8 Excelente Entell
## 2 0.4 Muy Bueno Entell
## 3 3.5 Bueno Entell
## 4 4.7 Pésimo Entell
## 5 2.2 Bueno Entell
## 6 5.1 Pésimo Entell
# Inspect the structure of the data frame: one line per column
str(object = telecom)
## 'data.frame': 120 obs. of 10 variables:
## $ IdCliente : Factor w/ 120 levels "A80117234","A80117235",..: 71 72 73 74 75 76 77 78 79 80 ...
## $ Género : Factor w/ 2 levels "Femenino","Masculino": 1 2 2 1 2 1 2 1 1 1 ...
## $ Edad : int 26 33 21 23 34 29 21 40 25 38 ...
## $ Llamadas : int 4 2 8 8 1 6 5 5 6 5 ...
## $ Tiempo.enero : num 27 30.1 26 34.1 30.1 30.7 26.5 28.3 29.7 32.5 ...
## $ Tiempo.febrero: num 26.1 20.5 34.4 36.1 28.9 20.9 32 29.8 31.1 27.6 ...
## $ Monto : num 89.7 88.8 85.4 89 77.1 97.8 84 84.2 91.7 74.1 ...
## $ Espera : num 0.8 0.4 3.5 4.7 2.2 5.1 2.8 5.8 4.2 0.8 ...
## $ Opinión : Factor w/ 5 levels "Bueno","Excelente",..: 2 3 1 4 1 4 5 3 2 5 ...
## $ Empresa : Factor w/ 4 levels "Bitele","Claros",..: 3 3 3 3 3 3 3 3 3 3 ...
# Per-column summary of the data
summary(object = telecom)
## IdCliente Género Edad Llamadas
## A80117234: 1 Femenino :54 Min. :20.00 Min. : 0.000
## A80117235: 1 Masculino:66 1st Qu.:26.00 1st Qu.: 3.000
## A80117236: 1 Median :31.00 Median : 5.000
## A80117237: 1 Mean :30.34 Mean : 5.017
## A80117238: 1 3rd Qu.:35.00 3rd Qu.: 7.000
## A80117239: 1 Max. :40.00 Max. :13.000
## (Other) :114
## Tiempo.enero Tiempo.febrero Monto Espera
## Min. :17.50 Min. :17.50 Min. : 74.10 Min. : 0.200
## 1st Qu.:31.70 1st Qu.:31.95 1st Qu.: 84.17 1st Qu.: 1.700
## Median :37.85 Median :37.60 Median : 90.70 Median : 3.650
## Mean :38.24 Mean :37.48 Mean : 92.68 Mean : 6.148
## 3rd Qu.:43.62 3rd Qu.:42.25 3rd Qu.: 99.53 3rd Qu.: 7.000
## Max. :62.20 Max. :59.60 Max. :119.10 Max. :36.000
##
## Opinión Empresa
## Bueno :23 Bitele:30
## Excelente:19 Claros:30
## Muy Bueno:19 Entell:20
## Pésimo :44 Movist:40
## Regular :15
##
##
#Resumen de la data
library(mlr)
## Warning: package 'mlr' was built under R version 3.5.3
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 3.5.3
# Column-wise summary table produced by mlr
mlr::summarizeColumns(telecom)
## name type na mean disp median mad min
## 1 IdCliente factor 0 NA 0.9916667 NA NA 1.0
## 2 Género factor 0 NA 0.4500000 NA NA 54.0
## 3 Edad integer 0 30.341667 5.7357728 31.00 5.93040 20.0
## 4 Llamadas integer 0 5.016667 2.4218311 5.00 2.96520 0.0
## 5 Tiempo.enero numeric 0 38.235833 8.7875083 37.85 9.04386 17.5
## 6 Tiempo.febrero numeric 0 37.483333 8.3435713 37.60 8.00604 17.5
## 7 Monto numeric 0 92.678333 10.2873846 90.70 10.30407 74.1
## 8 Espera numeric 0 6.148333 6.8776582 3.65 2.96520 0.2
## 9 Opinión factor 0 NA 0.6333333 NA NA 15.0
## 10 Empresa factor 0 NA 0.6666667 NA NA 20.0
## max nlevs
## 1 1.0 120
## 2 66.0 2
## 3 40.0 0
## 4 13.0 0
## 5 62.2 0
## 6 59.6 0
## 7 119.1 0
## 8 36.0 0
## 9 44.0 5
## 10 40.0 4
# Revisa gráficamente el porcentaje de nulos
library(VIM)
## Warning: package 'VIM' was built under R version 3.5.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 3.5.3
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.5.3
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Graphical check of missing values in the data.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
aggr(telecom, numbers = TRUE, plot = TRUE)

## Realizamos tabla de frecuencia con formato SAS o SPSS
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.3
library(gdata)
## Warning: package 'gdata' was built under R version 3.5.2
## gdata: Unable to locate valid perl interpreter
## gdata:
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata:
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
##
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
##
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:data.table':
##
## first, last
## The following object is masked from 'package:mlr':
##
## resample
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
# One-way frequency table of gender, SAS-style layout
gmodels::CrossTable(x = telecom$Género, format = "SAS")
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 120
##
##
## | Femenino | Masculino |
## |-----------|-----------|
## | 54 | 66 |
## | 0.450 | 0.550 |
## |-----------|-----------|
##
##
##
##
# One-way frequency table of gender, SPSS-style layout
gmodels::CrossTable(x = telecom$Género, format = "SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Row Percent |
## |-------------------------|
##
## Total Observations in Table: 120
##
## | Femenino | Masculino |
## |-----------|-----------|
## | 54 | 66 |
## | 45.000% | 55.000% |
## |-----------|-----------|
##
##
# Summary charts for a categorical variable: number of customers per gender.
# frec holds the counts and feeds both the bar chart and the pie chart.
frec <- table(telecom$Género)
barplot(
  frec,
  main = "Distribución del género de los clientes",
  xlab = "Género",
  ylab = "Cantidad de Clientes"
)

pie(
  frec,
  main = "Distribución del género de los clientes",
  xlab = "Género"
)

# Slice labels: category name plus its share of customers, in percent.
# The counts are tabulated once and the shares are rounded to one decimal so
# the labels stay short for any data.
gender_counts <- table(telecom$Género)
lbls1 <- paste0(
  names(gender_counts), "\n",
  round(prop.table(gender_counts) * 100, 1), "%"
)
library(plotrix)
## Warning: package 'plotrix' was built under R version 3.5.3
# The chart shows gender, so the title names gender (it previously said age)
pie3D(gender_counts, labels = lbls1, explode = 0.15,
      main = "Distribución del género de los clientes")

# Dot plot of the number of customers per gender.
# dotchart() expects a plain vector or matrix; passing the table object
# directly triggers the warning "'x' is neither a vector nor a matrix", so the
# counts are converted to a named numeric vector first (the names supply the
# row labels). The title now names gender, which is what the chart shows.
gender_counts <- table(telecom$Género)
dotchart(
  setNames(as.numeric(gender_counts), names(gender_counts)),
  cex = 0.7,
  main = "Distribución del género de los clientes",
  xlab = "# de Clientes"
)

# Contingency table: opinion (rows) by gender (columns)
tablacruzada <- table(telecom$Opinión, telecom$Género)
print(tablacruzada)
##
## Femenino Masculino
## Bueno 9 14
## Excelente 9 10
## Muy Bueno 8 11
## Pésimo 21 23
## Regular 7 8
library(gmodels)
library(gdata)
# Two-way table of opinion by gender, SPSS-style layout
gmodels::CrossTable(x = telecom$Opinión, y = telecom$Género, format = "SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 120
##
## | telecom$Género
## telecom$Opinión | Femenino | Masculino | Row Total |
## ----------------|-----------|-----------|-----------|
## Bueno | 9 | 14 | 23 |
## | 0.176 | 0.144 | |
## | 39.130% | 60.870% | 19.167% |
## | 16.667% | 21.212% | |
## | 7.500% | 11.667% | |
## ----------------|-----------|-----------|-----------|
## Excelente | 9 | 10 | 19 |
## | 0.024 | 0.019 | |
## | 47.368% | 52.632% | 15.833% |
## | 16.667% | 15.152% | |
## | 7.500% | 8.333% | |
## ----------------|-----------|-----------|-----------|
## Muy Bueno | 8 | 11 | 19 |
## | 0.035 | 0.029 | |
## | 42.105% | 57.895% | 15.833% |
## | 14.815% | 16.667% | |
## | 6.667% | 9.167% | |
## ----------------|-----------|-----------|-----------|
## Pésimo | 21 | 23 | 44 |
## | 0.073 | 0.060 | |
## | 47.727% | 52.273% | 36.667% |
## | 38.889% | 34.848% | |
## | 17.500% | 19.167% | |
## ----------------|-----------|-----------|-----------|
## Regular | 7 | 8 | 15 |
## | 0.009 | 0.008 | |
## | 46.667% | 53.333% | 12.500% |
## | 12.963% | 12.121% | |
## | 5.833% | 6.667% | |
## ----------------|-----------|-----------|-----------|
## Column Total | 54 | 66 | 120 |
## | 45.000% | 55.000% | |
## ----------------|-----------|-----------|-----------|
##
##
#Representación de variables cuantitativas
library(agricolae)
## Warning: package 'agricolae' was built under R version 3.5.3
# Frequency table of age, using the classes of a Sturges-rule histogram
print(table.freq(hist(telecom$Edad, breaks = "Sturges")))

## Lower Upper Main Frequency Percentage CF CPF
## 1 20 22 21 15 12.5 15 12.5
## 2 22 24 23 9 7.5 24 20.0
## 3 24 26 25 9 7.5 33 27.5
## 4 26 28 27 17 14.2 50 41.7
## 5 28 30 29 9 7.5 59 49.2
## 6 30 32 31 12 10.0 71 59.2
## 7 32 34 33 17 14.2 88 73.3
## 8 34 36 35 13 10.8 101 84.2
## 9 36 38 37 9 7.5 110 91.7
## 10 38 40 39 10 8.3 120 100.0
# Frequency table of January time, using the classes of a Scott-rule histogram
print(table.freq(hist(telecom$Tiempo.enero, breaks = "Scott")))

## Lower Upper Main Frequency Percentage CF CPF
## 1 15 20 17.5 2 1.7 2 1.7
## 2 20 25 22.5 4 3.3 6 5.0
## 3 25 30 27.5 15 12.5 21 17.5
## 4 30 35 32.5 24 20.0 45 37.5
## 5 35 40 37.5 28 23.3 73 60.8
## 6 40 45 42.5 22 18.3 95 79.2
## 7 45 50 47.5 13 10.8 108 90.0
## 8 50 55 52.5 8 6.7 116 96.7
## 9 55 60 57.5 3 2.5 119 99.2
## 10 60 65 62.5 1 0.8 120 100.0
# Frequency table of the amount, built with graph.freq() without drawing a plot
print(table.freq(graph.freq(telecom$Monto, plot = FALSE)))
## Lower Upper Main Frequency Percentage CF CPF
## 1 74.0 79.8 76.9 10 8.3 10 8.3
## 2 79.8 85.6 82.7 25 20.8 35 29.2
## 3 85.6 91.4 88.5 27 22.5 62 51.7
## 4 91.4 97.2 94.3 19 15.8 81 67.5
## 5 97.2 103.0 100.1 18 15.0 99 82.5
## 6 103.0 108.8 105.9 10 8.3 109 90.8
## 7 108.8 114.6 111.7 9 7.5 118 98.3
## 8 114.6 120.4 117.5 2 1.7 120 100.0
# Frequency table of February time, using the classes of an "FD"-rule histogram
print(table.freq(hist(telecom$Tiempo.febrero, breaks = "FD")))

## Lower Upper Main Frequency Percentage CF CPF
## 1 15 20 17.5 2 1.7 2 1.7
## 2 20 25 22.5 9 7.5 11 9.2
## 3 25 30 27.5 10 8.3 21 17.5
## 4 30 35 32.5 21 17.5 42 35.0
## 5 35 40 37.5 35 29.2 77 64.2
## 6 40 45 42.5 24 20.0 101 84.2
## 7 45 50 47.5 11 9.2 112 93.3
## 8 50 55 52.5 6 5.0 118 98.3
## 9 55 60 57.5 2 1.7 120 100.0
# Histogram and frequency polygon: start with a plain colored histogram
hist(x = telecom$Tiempo.enero, col = 2)

# Histogram of January time with its frequency polygon drawn on top.
# The x axis shows Tiempo.enero (it was mislabeled "Edad"), and the stray
# trailing comma that passed an empty argument to hist() is removed.
histograma <- hist(
  telecom$Tiempo.enero,
  breaks = "Sturges",
  xlab = "Tiempo (enero)",
  ylab = "Número de clientes"
)
polygon.freq(histograma, frequency = 1, col = "red")

# Frequency polygon on its own: the histogram is drawn with border = FALSE and
# the polygon is traced from that same histogram object, so this step does not
# depend on an object created by an earlier plot.
histograma1 <- hist(telecom$Tiempo.enero, border = FALSE)
polygon.freq(histograma1, frequency = 1, col = "red")

# Side-by-side histograms of January time by gender, on a common y scale.
# par() returns the previous settings; they are restored afterwards so the
# 1 x 2 layout does not leak into the plots that follow.
old_par <- par(mfrow = c(1, 2))
hist(telecom$Tiempo.enero[telecom$Género == "Masculino"], ylim = c(0, 20))
hist(telecom$Tiempo.enero[telecom$Género == "Femenino"], ylim = c(0, 20))
par(old_par)

# Comparative strip chart: January time split by gender
stripchart(x = telecom$Tiempo.enero ~ telecom$Género)

# Box plot of January time
boxplot(x = telecom$Tiempo.enero, col = 3)
