library(tidyverse)
library(openxlsx)
library(Hmisc)
library(DataExplorer)
library(psych)
pacman::p_load(tidyverse,openxlsx, Hmisc, DataExplorer, psych, flextable)Análisis Exploratorio de Datos
Análisis exploratorio de Datos
Importación de datos
datos<- read.xlsx("datos_taller.xlsx")
summary(datos) salario horas ci KWW
Min. : 115.0 Min. :20.00 Min. : 50.0 Min. :12.00
1st Qu.: 669.0 1st Qu.:40.00 1st Qu.: 92.0 1st Qu.:31.00
Median : 905.0 Median :40.00 Median :102.0 Median :37.00
Mean : 957.9 Mean :43.93 Mean :101.3 Mean :35.74
3rd Qu.:1160.0 3rd Qu.:48.00 3rd Qu.:112.0 3rd Qu.:41.00
Max. :3078.0 Max. :80.00 Max. :145.0 Max. :56.00
educ exper antig edad
Min. : 9.00 Min. : 1.00 Min. : 0.000 Min. :28.00
1st Qu.:12.00 1st Qu.: 8.00 1st Qu.: 3.000 1st Qu.:30.00
Median :12.00 Median :11.00 Median : 7.000 Median :33.00
Mean :13.47 Mean :11.56 Mean : 7.234 Mean :33.08
3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:11.000 3rd Qu.:36.00
Max. :18.00 Max. :23.00 Max. :22.000 Max. :38.00
casado urbano meduc peduc
Length:935 Length:935 Length:935 Length:935
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
tam<- length(datos$salario)
varia<- length(datos)Descripción de los datos
El Análisis Exploratorio de Datos (EDA) es un proceso que permite conocer a priori las características del conjunto de datos. [1]
Conjunto de datos de 935 encuestados que consta de 12 variables: 4 cualitativas y 8 cuantitativas.
boxplot(datos$salario)fivenum(datos$salario)[1] 115 669 905 1160 3078
p_70_80_90<- quantile(datos$salario, probs = c(0.70, 0.80, 0.90))
p_70_80_90 70% 80% 90%
1100.0 1250.0 1443.2
boxplot(salario ~ urbano, data=datos)desvst<- sd(datos$salario)Herramientas para describir datos
Librería HMISC
#| label: Hmisc
# psych is attached after Hmisc, so a bare describe() resolves to
# psych::describe(). Qualify the call so this section really produces the
# Hmisc summary (the one whose per-variable entries carry `counts`).
des1 <- Hmisc::describe(datos)
class(des1)[1] "psych" "describe" "data.frame"
des1$horas$countsNULL
Herramientas Tidyverse
Función pipe %>% y select
datos_cuanti<- datos %>%
select(1:8)Crear una función con los estadísticos básicos con sapply
# Basic descriptive statistics for a numeric vector.
#
# Args:
#   x: numeric vector.
#   na.omit: if TRUE, NA values are dropped from x before computing.
#
# Returns a named numeric vector with elements m (mean), dv.st (sample
# standard deviation), mediana (median) and kurtosis (sum of fourth-power
# deviations from the mean, divided by n * sd^4, minus 3).
des2 <- function(x, na.omit = FALSE) {
  if (na.omit) {
    x <- x[!is.na(x)]
  }
  n <- length(x)
  media <- mean(x)
  ds <- sd(x)
  med <- median(x)
  # Each deviation must be raised to the fourth power before summing.
  # sum(x - media)^4 raises the (approximately zero) sum of deviations
  # instead, which made every kurtosis come out as -3.
  kurt <- sum((x - media)^4) / (n * ds^4) - 3
  c(m = media, dv.st = ds, mediana = med, kurtosis = kurt)
}
sapply(datos_cuanti, des2) salario horas ci KWW educ exper antig
m 957.9455 43.929412 101.28235 35.744385 13.468449 11.563636 7.234225
dv.st 404.3608 7.224256 15.05264 7.638788 2.196654 4.374586 5.075206
mediana 905.0000 40.000000 102.00000 37.000000 12.000000 11.000000 7.000000
kurtosis -3.0000 -3.000000 -3.00000 -3.000000 -3.000000 -3.000000 -3.000000
edad
m 33.080214
dv.st 3.107803
mediana 33.000000
kurtosis -3.000000
Librería Data Explorer
plot_str(datos)
introduce(datos) rows columns discrete_columns continuous_columns all_missing_columns
1 935 12 4 8 0
total_missing_values complete_rows total_observations memory_usage
1 0 935 11220 93768
plot_intro(datos)plot_bar(datos, with = "edad")plot_histogram(datos)Librería Psych
library(flextable) # Libería para presentar tablas
library(dplyr)
desc3<- psych::describe(datos)
desc3 %>%
flextable::flextable() %>%
flextable::autofit()vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 935 | 957.945455 | 404.3608225 | 905 | 919.688919 | 369.1674 | 115 | 3,078 | 2,963 | 1.19733547 | 2.68448191 | 13.22401295 |
2 | 935 | 43.929412 | 7.2242559 | 40 | 42.798398 | 0.0000 | 20 | 80 | 60 | 1.59105743 | 4.14257253 | 0.23625843 |
3 | 935 | 101.282353 | 15.0526364 | 102 | 101.867824 | 14.8260 | 50 | 145 | 95 | -0.33987862 | -0.02932957 | 0.49227385 |
4 | 935 | 35.744385 | 7.6387878 | 37 | 35.986649 | 7.4130 | 12 | 56 | 44 | -0.29199229 | -0.32737219 | 0.24981507 |
5 | 935 | 13.468449 | 2.1966539 | 12 | 13.327103 | 1.4826 | 9 | 18 | 9 | 0.54691730 | -0.74218638 | 0.07183826 |
6 | 935 | 11.563636 | 4.3745864 | 11 | 11.523364 | 4.4478 | 1 | 23 | 22 | 0.07755143 | -0.57239635 | 0.14306427 |
7 | 935 | 7.234225 | 5.0752058 | 7 | 6.890521 | 5.9304 | 0 | 22 | 22 | 0.43114539 | -0.80544642 | 0.16597698 |
8 | 935 | 33.080214 | 3.1078033 | 33 | 33.036048 | 4.4478 | 28 | 38 | 10 | 0.11835518 | -1.26051864 | 0.10163603 |
9 | 935 | 1.893048 | 0.3092174 | 2 | 1.990654 | 0.0000 | 1 | 2 | 1 | -2.53949288 | 4.45379093 | 0.01011249 |
10 | 935 | 1.717647 | 0.4503851 | 2 | 1.771696 | 0.0000 | 1 | 2 | 1 | -0.96545955 | -1.06902777 | 0.01472917 |
11 | 935 | 1.895187 | 0.8995171 | 2 | 1.750334 | 1.4826 | 1 | 4 | 3 | 0.99145990 | 0.37660098 | 0.02941735 |
12 | 935 | 1.827807 | 0.9636193 | 2 | 1.660881 | 1.4826 | 1 | 4 | 3 | 1.07311554 | 0.18818342 | 0.03151372 |
Descripción de datos a través de gráficos con GGPLOT2
Gráficos Univariantes
ggplot(data = datos, aes(x= salario, y= horas))+
geom_point()ggplot(data=datos, aes(x= salario))+
geom_histogram()Gráficos Bivariantes
ggplot(data = datos, aes(x= urbano, y= salario, fill= casado))+
geom_boxplot()Gráficos con Facetas
ggplot(data = datos, aes(x= urbano, y= salario, fill= casado))+
geom_boxplot()+
facet_grid(meduc~peduc)Bibliografía
#| label: import
base<- "http://archive.ics.uci.edu/ml/machine-learning-databases/00616/Tetuan%20City%20power%20consumption.csv"
datos_ce<- read.csv(base, sep=",")
summary(datos_ce) DateTime Temperature Humidity Wind.Speed
Length:52416 Min. : 3.247 Min. :11.34 Min. :0.050
Class :character 1st Qu.:14.410 1st Qu.:58.31 1st Qu.:0.078
Mode :character Median :18.780 Median :69.86 Median :0.086
Mean :18.810 Mean :68.26 Mean :1.959
3rd Qu.:22.890 3rd Qu.:81.40 3rd Qu.:4.915
Max. :40.010 Max. :94.80 Max. :6.483
general.diffuse.flows diffuse.flows Zone.1.Power.Consumption
Min. : 0.004 Min. : 0.011 Min. :13896
1st Qu.: 0.062 1st Qu.: 0.122 1st Qu.:26311
Median : 5.035 Median : 4.456 Median :32266
Mean : 182.697 Mean : 75.028 Mean :32345
3rd Qu.: 319.600 3rd Qu.:101.000 3rd Qu.:37309
Max. :1163.000 Max. :936.000 Max. :52204
Zone.2..Power.Consumption Zone.3..Power.Consumption
Min. : 8560 Min. : 5935
1st Qu.:16981 1st Qu.:13129
Median :20823 Median :16415
Mean :21043 Mean :17835
3rd Qu.:24714 3rd Qu.:21624
Max. :37409 Max. :47598
#Cargar librerías extra
#| label: librerias
pacman::p_load(tidyverse, lubridate)
Transformación de datos
# Keep a copy of the data as imported, before derived columns are added.
datos_ce1 <- datos_ce
# DateTime is read as character; mdy_hm() parses it as
# month/day/year hour:minute.
datos_ce$DateTime_trans <- mdy_hm(datos_ce$DateTime)
# Split the parsed timestamp into day of month, month and hour of day.
datos_ce <- datos_ce %>%
  mutate(
    dia = day(DateTime_trans),
    mes = month(DateTime_trans),
    hora = hour(DateTime_trans)
  )
Descripción del conjunto de datos
Resumen medidas de descripción
library(corrplot)
res_decr <- summary(datos_ce)
res_decr DateTime Temperature Humidity Wind.Speed
Length:52416 Min. : 3.247 Min. :11.34 Min. :0.050
Class :character 1st Qu.:14.410 1st Qu.:58.31 1st Qu.:0.078
Mode :character Median :18.780 Median :69.86 Median :0.086
Mean :18.810 Mean :68.26 Mean :1.959
3rd Qu.:22.890 3rd Qu.:81.40 3rd Qu.:4.915
Max. :40.010 Max. :94.80 Max. :6.483
general.diffuse.flows diffuse.flows Zone.1.Power.Consumption
Min. : 0.004 Min. : 0.011 Min. :13896
1st Qu.: 0.062 1st Qu.: 0.122 1st Qu.:26311
Median : 5.035 Median : 4.456 Median :32266
Mean : 182.697 Mean : 75.028 Mean :32345
3rd Qu.: 319.600 3rd Qu.:101.000 3rd Qu.:37309
Max. :1163.000 Max. :936.000 Max. :52204
Zone.2..Power.Consumption Zone.3..Power.Consumption
Min. : 8560 Min. : 5935
1st Qu.:16981 1st Qu.:13129
Median :20823 Median :16415
Mean :21043 Mean :17835
3rd Qu.:24714 3rd Qu.:21624
Max. :37409 Max. :47598
DateTime_trans dia mes hora
Min. :2017-01-01 00:00:00 Min. : 1.00 Min. : 1.000 Min. : 0.00
1st Qu.:2017-04-01 23:57:30 1st Qu.: 8.00 1st Qu.: 4.000 1st Qu.: 5.75
Median :2017-07-01 23:55:00 Median :16.00 Median : 7.000 Median :11.50
Mean :2017-07-01 23:55:00 Mean :15.68 Mean : 6.511 Mean :11.50
3rd Qu.:2017-09-30 23:52:30 3rd Qu.:23.00 3rd Qu.: 9.250 3rd Qu.:17.25
Max. :2017-12-30 23:50:00 Max. :31.00 Max. :12.000 Max. :23.00
datos_cuanti<-select(datos_ce,-DateTime,-DateTime_trans)
matcor <- cor(datos_cuanti)
corrplot(matcor, method = c("number"))Gráficos
Por presentar un índice de correlación cercano a 1, se eligieron las siguientes variables para evaluar: Zone.1.Power.Consumption, Zone.2..Power.Consumption, Zone.3..Power.Consumption y hora.
#library(janitor)
# Hour of day as a factor, so geom_boxplot() draws one box per hour.
datos_ce <- datos_ce %>%
  mutate(horas2 = as.factor(hora))

# Hourly distribution of power consumption, one plot per zone.
# The title is attached with labs(): passed as an extra argument to ggplot()
# it is not used and never appears on the plot.
ggplot(data = datos_ce, aes(x = horas2, y = Zone.1.Power.Consumption)) +
  geom_boxplot() +
  labs(title = "Consumo energético en Zona 1")

ggplot(data = datos_ce, aes(x = horas2, y = Zone.2..Power.Consumption)) +
  geom_boxplot() +
  labs(title = "Consumo energético en Zona 2")

ggplot(data = datos_ce, aes(x = horas2, y = Zone.3..Power.Consumption)) +
  geom_boxplot() +
  labs(title = "Consumo energético en Zona 3")

# Long format: the three zone columns are stacked into pivot_longer()'s
# default `name` / `value` pair. This has to run before the hourly means
# below, which read datos_2.
datos_2 <- pivot_longer(
  datos_ce,
  cols = c(
    "Zone.1.Power.Consumption",
    "Zone.2..Power.Consumption",
    "Zone.3..Power.Consumption"
  )
)

# Mean consumption per hour and zone; .groups = "drop" returns an ungrouped
# table instead of one still grouped by hora.
dat_prom_hora <- datos_2 %>%
  group_by(hora, name) %>%
  summarise(Media = mean(value), .groups = "drop")

# Dodged columns by hour and zone: raw values first, then hourly means.
ggplot(datos_2, aes(x = hora, y = value, fill = name)) +
  geom_col(position = "dodge")
ggplot(dat_prom_hora, aes(x = hora, y = Media, fill = name)) +
  geom_col(position = "dodge")
```