BIG DATA PRIMER ENSAYO

LECTURA DE DATOS EN DIVERSOS FORMATOS

¿Qué necesitamos para cargar datos de diversas fuentes? En principio, algunas librerías de R.

# One-time setup: uncomment and run only if the required packages are missing.
#install.packages("tidyverse")

FORMATO SAV (SPSS)

# haven provides read_sav(), the reader used for SPSS (.sav) files.
library(haven)

# Load the survey file into `eph`; the relative path is resolved against
# the current working directory.
eph <- read_sav(file = "REG02_EPHC_T4_2023.SAV")

Verificar algunas de las variables

# List every column name available in the loaded data set.
colnames(eph)
##   [1] "UPM"          "NVIVI"        "NHOGA"        "TRIMESTRE"    "RONDA"       
##   [6] "ANIO"         "AREA"         "ESTGEO"       "L02"          "P02"         
##  [11] "P03"          "P06"          "P09"          "A01"          "A01A"        
##  [16] "A02"          "A03"          "A04"          "A04B"         "A04A"        
##  [21] "A05"          "A06"          "A06E"         "A07"          "A08"         
##  [26] "A09"          "A09E"         "A10"          "A11A"         "A11M"        
##  [31] "A11S"         "A12"          "A13REC"       "A14REC"       "A15"         
##  [36] "A16"          "A17A"         "A17M"         "A17S"         "A18"         
##  [41] "B01REC"       "B02REC"       "B03LU"        "B03MA"        "B03MI"       
##  [46] "B03JU"        "B03VI"        "B03SA"        "B03DO"        "B04"         
##  [51] "B05"          "B06"          "B07A"         "B07M"         "B07S"        
##  [56] "B08"          "B09A"         "B09M"         "B09S"         "B10"         
##  [61] "B11"          "B12"          "B13"          "B14"          "B15"         
##  [66] "B16G"         "B16U"         "B16D"         "B16T"         "B17"         
##  [71] "B18AG"        "B18AU"        "B18BG"        "B18BU"        "B19"         
##  [76] "B20G"         "B20U"         "B20D"         "B20T"         "B21"         
##  [81] "B22"          "B23"          "B24"          "B25"          "B26"         
##  [86] "B271"         "B272"         "B28"          "B29"          "B30"         
##  [91] "B31"          "C01REC"       "C02REC"       "C03"          "C04"         
##  [96] "C05"          "C06"          "C07"          "C08"          "C09"         
## [101] "C101"         "C102"         "C11G"         "C11U"         "C11D"        
## [106] "C11T"         "C12"          "C13AG"        "C13AU"        "C13BG"       
## [111] "C13BU"        "C14"          "C14A"         "C14B"         "C14C"        
## [116] "C15"          "C16REC"       "C17REC"       "C18"          "C18A"        
## [121] "C18B"         "C19"          "D01"          "D02"          "D03"         
## [126] "D04"          "D05"          "CATE_PEA"     "TAMA_PEA"     "OCUP_PEA"    
## [131] "RAMA_PEA"     "HORAB"        "HORABC"       "HORABCO"      "PEAD"        
## [136] "PEAD_1"       "PEAA"         "FEX.2022"     "añoest"       "Informalidad"
## [141] "E01AIMDE"     "E01BIMDE"     "E01CIMDE"     "E01DDE"       "E01EDE"      
## [146] "E01FDE"       "E01GDE"       "E01HDE"       "E01IDE"       "E01JDE"      
## [151] "E01KDE"       "E01LDE"       "E01MDE"       "E01KJDE"

Sexo (P06)

# Frequency count of the raw (still numeric-coded) sex variable.
table(eph[["P06"]])
## 
##    1    6 
## 8287 8488

Etiquetamos las categorías de la variable sexo (P06)

# Turn the numeric sex codes into a labelled factor: 1 -> "Hombres",
# 6 -> "Mujeres". Any value not listed in `levels` would become NA.
eph$P06 <- factor(
  eph$P06,
  levels = c(1, 6),
  labels = c("Hombres", "Mujeres")
)

# Re-tabulate to confirm the counts now appear under the new labels.
table(eph$P06)
## 
## Hombres Mujeres 
##    8287    8488

Cantidad de población por sexo (suma de FEX.2022)

# Sum FEX.2022 within each level of P06 to obtain one total per sex.
tabla1 <- aggregate(FEX.2022 ~ P06, data = eph, FUN = sum)
tabla1
##       P06 FEX.2022
## 1 Hombres  2900086
## 2 Mujeres  3002988

¿Los salarios de las mujeres son, en promedio, más bajos o más altos que los de los hombres?

La variable de ingreso se llama E01AIMDE (R distingue mayúsculas de minúsculas).

# Mean of E01AIMDE (coerced to plain numeric) for each level of P06.
# The formula method of aggregate() drops rows with NA by default
# (na.action = na.omit).
# NOTE(review): this is an unweighted mean -- FEX.2022 is not used here.
# Confirm whether a weighted mean is intended for population estimates.
tabla2 <- aggregate(as.numeric(E01AIMDE) ~ P06, data = eph, FUN = mean)
tabla2
##       P06 as.numeric(E01AIMDE)
## 1 Hombres            1815106.2
## 2 Mujeres             845322.7

Explorar otras variables como: Área de residencia (AREA)

Edad (P02)

Categoría de ocupación (PEAA)

FORMATO XLSX (EXCEL)

FORMATO CSV (SEPARADO POR COMAS)

FORMATO TXT (TEXTO PLANO)