Análisis Exploratorio de Datos

Author

CEC

Análisis exploratorio de Datos

library(tidyverse)
library(openxlsx)
library(Hmisc)
library(DataExplorer)
library(psych)

pacman::p_load(tidyverse,openxlsx, Hmisc, DataExplorer, psych, flextable)

Importación de datos

datos<- read.xlsx("datos_taller.xlsx")

summary(datos)
    salario           horas             ci             KWW       
 Min.   : 115.0   Min.   :20.00   Min.   : 50.0   Min.   :12.00  
 1st Qu.: 669.0   1st Qu.:40.00   1st Qu.: 92.0   1st Qu.:31.00  
 Median : 905.0   Median :40.00   Median :102.0   Median :37.00  
 Mean   : 957.9   Mean   :43.93   Mean   :101.3   Mean   :35.74  
 3rd Qu.:1160.0   3rd Qu.:48.00   3rd Qu.:112.0   3rd Qu.:41.00  
 Max.   :3078.0   Max.   :80.00   Max.   :145.0   Max.   :56.00  
      educ           exper           antig             edad      
 Min.   : 9.00   Min.   : 1.00   Min.   : 0.000   Min.   :28.00  
 1st Qu.:12.00   1st Qu.: 8.00   1st Qu.: 3.000   1st Qu.:30.00  
 Median :12.00   Median :11.00   Median : 7.000   Median :33.00  
 Mean   :13.47   Mean   :11.56   Mean   : 7.234   Mean   :33.08  
 3rd Qu.:16.00   3rd Qu.:15.00   3rd Qu.:11.000   3rd Qu.:36.00  
 Max.   :18.00   Max.   :23.00   Max.   :22.000   Max.   :38.00  
    casado             urbano             meduc              peduc          
 Length:935         Length:935         Length:935         Length:935        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
tam<- length(datos$salario)
varia<- length(datos)

Descripción de los datos

El Análisis Exploratorio de Datos (EDA), es un proceso que permite conocer a priori sobre el conjunto de datos. [1]

Conjunto de datos de 935 encuestados que consta de 12 variables XX cualitativas xx cuantitativas.

boxplot(datos$salario)

fivenum(datos$salario)
[1]  115  669  905 1160 3078
p_70_80_90<- quantile(datos$salario, probs = c(0.70, 0.80, 0.90))

p_70_80_90
   70%    80%    90% 
1100.0 1250.0 1443.2 
boxplot(salario ~ urbano, data=datos)

desvst<- sd(datos$salario)

Herramientas para describir datos

Librería HMISC

#| label: Hmisc

des1<-describe(datos)

class(des1)
[1] "psych"      "describe"   "data.frame"
des1$horas$counts
NULL

Herramientas Tidyverse

Función pipe %>% y select

datos_cuanti<- datos %>% 
  select(1:8)

Crear una función con los estadísticos básicos con sapply

des2<- function(x, na.omit=F){
        if (na.omit)
          x<- x[!is.na(x)]
        media<- mean(x)
        n=length(x)
        ds<- sd(x)
        med<-median(x)
        kurt<- sum(x-media)^4/ds^4/n-3
        return(c(m=media, dv.st=ds, mediana=med, kurtosis=kurt))
  
}


sapply(datos_cuanti, des2)
          salario     horas        ci       KWW      educ     exper     antig
m        957.9455 43.929412 101.28235 35.744385 13.468449 11.563636  7.234225
dv.st    404.3608  7.224256  15.05264  7.638788  2.196654  4.374586  5.075206
mediana  905.0000 40.000000 102.00000 37.000000 12.000000 11.000000  7.000000
kurtosis  -3.0000 -3.000000  -3.00000 -3.000000 -3.000000 -3.000000 -3.000000
              edad
m        33.080214
dv.st     3.107803
mediana  33.000000
kurtosis -3.000000

Librería Data Explorer

plot_str(datos)

introduce(datos)
  rows columns discrete_columns continuous_columns all_missing_columns
1  935      12                4                  8                   0
  total_missing_values complete_rows total_observations memory_usage
1                    0           935              11220        93768
plot_intro(datos)

plot_bar(datos, with = "edad")

plot_histogram(datos)

Librería Psych

library(flextable) # Libería para presentar tablas
library(dplyr)
desc3<- psych::describe(datos)


desc3 %>% 
  flextable::flextable() %>% 
  flextable::autofit()

vars

n

mean

sd

median

trimmed

mad

min

max

range

skew

kurtosis

se

1

935

957.945455

404.3608225

905

919.688919

369.1674

115

3,078

2,963

1.19733547

2.68448191

13.22401295

2

935

43.929412

7.2242559

40

42.798398

0.0000

20

80

60

1.59105743

4.14257253

0.23625843

3

935

101.282353

15.0526364

102

101.867824

14.8260

50

145

95

-0.33987862

-0.02932957

0.49227385

4

935

35.744385

7.6387878

37

35.986649

7.4130

12

56

44

-0.29199229

-0.32737219

0.24981507

5

935

13.468449

2.1966539

12

13.327103

1.4826

9

18

9

0.54691730

-0.74218638

0.07183826

6

935

11.563636

4.3745864

11

11.523364

4.4478

1

23

22

0.07755143

-0.57239635

0.14306427

7

935

7.234225

5.0752058

7

6.890521

5.9304

0

22

22

0.43114539

-0.80544642

0.16597698

8

935

33.080214

3.1078033

33

33.036048

4.4478

28

38

10

0.11835518

-1.26051864

0.10163603

9

935

1.893048

0.3092174

2

1.990654

0.0000

1

2

1

-2.53949288

4.45379093

0.01011249

10

935

1.717647

0.4503851

2

1.771696

0.0000

1

2

1

-0.96545955

-1.06902777

0.01472917

11

935

1.895187

0.8995171

2

1.750334

1.4826

1

4

3

0.99145990

0.37660098

0.02941735

12

935

1.827807

0.9636193

2

1.660881

1.4826

1

4

3

1.07311554

0.18818342

0.03151372

Descripción de datos a través de gráficos con GGPLOT2

Gráficos Univariantes

ggplot(data = datos, aes(x= salario, y= horas))+
  geom_point()

ggplot(data=datos, aes(x= salario))+
  geom_histogram()

Gráficos Bivariantes

ggplot(data = datos, aes(x= urbano, y= salario, fill= casado))+
  geom_boxplot()

Gráficos con Facetas

ggplot(data = datos, aes(x= urbano, y= salario, fill= casado))+
  geom_boxplot()+
  facet_grid(meduc~peduc)

Bibliografía

#| label: import


base<- "http://archive.ics.uci.edu/ml/machine-learning-databases/00616/Tetuan%20City%20power%20consumption.csv"
datos_ce<- read.csv(base, sep=",")
summary(datos_ce)
   DateTime          Temperature        Humidity       Wind.Speed   
 Length:52416       Min.   : 3.247   Min.   :11.34   Min.   :0.050  
 Class :character   1st Qu.:14.410   1st Qu.:58.31   1st Qu.:0.078  
 Mode  :character   Median :18.780   Median :69.86   Median :0.086  
                    Mean   :18.810   Mean   :68.26   Mean   :1.959  
                    3rd Qu.:22.890   3rd Qu.:81.40   3rd Qu.:4.915  
                    Max.   :40.010   Max.   :94.80   Max.   :6.483  
 general.diffuse.flows diffuse.flows     Zone.1.Power.Consumption
 Min.   :   0.004      Min.   :  0.011   Min.   :13896           
 1st Qu.:   0.062      1st Qu.:  0.122   1st Qu.:26311           
 Median :   5.035      Median :  4.456   Median :32266           
 Mean   : 182.697      Mean   : 75.028   Mean   :32345           
 3rd Qu.: 319.600      3rd Qu.:101.000   3rd Qu.:37309           
 Max.   :1163.000      Max.   :936.000   Max.   :52204           
 Zone.2..Power.Consumption Zone.3..Power.Consumption
 Min.   : 8560             Min.   : 5935            
 1st Qu.:16981             1st Qu.:13129            
 Median :20823             Median :16415            
 Mean   :21043             Mean   :17835            
 3rd Qu.:24714             3rd Qu.:21624            
 Max.   :37409             Max.   :47598            

#Cargar liber�as extra

#| label: librerias
pacman::p_load(tidyverse, lubridate)

Transformaci�n de datos

datos_ce1<- datos_ce

datos_ce$DateTime_trans <- mdy_hm(datos_ce$DateTime)

datos_ce <- datos_ce %>% mutate(dia = day(DateTime_trans),
                            mes = month(DateTime_trans),
                            hora= hour(DateTime_trans),
                           )

Descripci�n del conjunto de datos

Resumen medidas de descripción

library(corrplot)
res_decr <- summary(datos_ce)
res_decr
   DateTime          Temperature        Humidity       Wind.Speed   
 Length:52416       Min.   : 3.247   Min.   :11.34   Min.   :0.050  
 Class :character   1st Qu.:14.410   1st Qu.:58.31   1st Qu.:0.078  
 Mode  :character   Median :18.780   Median :69.86   Median :0.086  
                    Mean   :18.810   Mean   :68.26   Mean   :1.959  
                    3rd Qu.:22.890   3rd Qu.:81.40   3rd Qu.:4.915  
                    Max.   :40.010   Max.   :94.80   Max.   :6.483  
 general.diffuse.flows diffuse.flows     Zone.1.Power.Consumption
 Min.   :   0.004      Min.   :  0.011   Min.   :13896           
 1st Qu.:   0.062      1st Qu.:  0.122   1st Qu.:26311           
 Median :   5.035      Median :  4.456   Median :32266           
 Mean   : 182.697      Mean   : 75.028   Mean   :32345           
 3rd Qu.: 319.600      3rd Qu.:101.000   3rd Qu.:37309           
 Max.   :1163.000      Max.   :936.000   Max.   :52204           
 Zone.2..Power.Consumption Zone.3..Power.Consumption
 Min.   : 8560             Min.   : 5935            
 1st Qu.:16981             1st Qu.:13129            
 Median :20823             Median :16415            
 Mean   :21043             Mean   :17835            
 3rd Qu.:24714             3rd Qu.:21624            
 Max.   :37409             Max.   :47598            
 DateTime_trans                     dia             mes              hora      
 Min.   :2017-01-01 00:00:00   Min.   : 1.00   Min.   : 1.000   Min.   : 0.00  
 1st Qu.:2017-04-01 23:57:30   1st Qu.: 8.00   1st Qu.: 4.000   1st Qu.: 5.75  
 Median :2017-07-01 23:55:00   Median :16.00   Median : 7.000   Median :11.50  
 Mean   :2017-07-01 23:55:00   Mean   :15.68   Mean   : 6.511   Mean   :11.50  
 3rd Qu.:2017-09-30 23:52:30   3rd Qu.:23.00   3rd Qu.: 9.250   3rd Qu.:17.25  
 Max.   :2017-12-30 23:50:00   Max.   :31.00   Max.   :12.000   Max.   :23.00  
datos_cuanti<-select(datos_ce,-DateTime,-DateTime_trans)
matcor <- cor(datos_cuanti)
corrplot(matcor, method = c("number"))

Gráficos

Por el índice de correlación cercano a 1, se eligieron las siguientes variables para evaluar: Zone.1.Power.Consumption Zone..2.Power.Consumption Zone.1.Power.Consumption Hora

#library(janitor)


datos_ce <- datos_ce %>% 
  mutate(horas2=as.factor(hora))

ggplot(data = datos_ce, aes(x= horas2, y= Zone.1.Power.Consumption), title="Consumo energ�tico en Zona 1") + geom_boxplot()

ggplot(data = datos_ce, aes(x= horas2, y= Zone.2..Power.Consumption), title="Consumo energ�tico en Zona 2") +
  geom_boxplot()

ggplot(data = datos_ce, aes(x= horas2, y= Zone.3..Power.Consumption),title="Consumo energ�tico en Zona 3") +
  geom_boxplot()

dat_prom_hora <- datos_2 %>% group_by(hora,name) %>% summarise(Media=mean(value))

datos_2 <- pivot_longer(datos_ce, cols = c(“Zone.1.Power.Consumption”, “Zone.2..Power.Consumption”, “Zone.3..Power.Consumption” ))

ggplot(datos_2,aes(x=hora,y=value,fill=name))+ geom_col(position=“dodge”)

ggplot(dat_prom_hora,aes(x=hora,y=Media,fill=name))+ geom_col(position=“dodge”)

```

References

[1]
Buzai, Gustavo D ; Baxendale, Claudia A: Análisis exploratorio de datos espaciales. In: Geografı́a y Sistemas de Información Geográfica, N° 1,(2009), Universidad Nacional de Luján. Departamento de Ciencias Sociales. (2009)