library(haven)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(purrr)
# Print the current working directory (informational only).
getwd()
## [1] "/Users/valeriacigarroa/Documents/ITESM/Datos Masivos"
# Prerequisite: the five .sav files must be placed together in one folder.
# Collect the full path of every .sav file found in that folder.
files <- list.files(
  "/Users/valeriacigarroa/Documents/ITESM/Modelacion de Sistemas/R Files",
  pattern = "\\.sav$",
  full.names = TRUE
)
# Extract the survey period and per-variable metadata from one .sav file.
#
# Args:
#   file: Path to a .sav file. The year is taken from the four digits that
#     follow four letters in the base name, and the month from the two
#     digits right before ".sav" (e.g. "GIMX202403.sav" -> "2024", "03").
#
# Returns:
#   A data frame (built with bind_rows()) with one row per variable and the
#   columns `año`, `mes`, `nombre_variable`, `etiqueta_variable` and
#   `pregunta`. If anything fails, the error is reported with message() and
#   NULL is returned so a caller mapping over many files can continue.
extract_metadata <- function(file) {
  # Return the first element of attribute `which`, or NA when it is absent.
  # `exact = TRUE` is essential: attr() partial-matches by default, so asking
  # for "label" on a variable that only carries "labels" (value labels) would
  # silently return a value code instead of NA.
  attr_or_na <- function(x, which) {
    value <- attr(x, which, exact = TRUE)
    if (length(value) == 0) NA else value[[1]]
  }

  tryCatch({
    data <- read_sav(file)

    # Year and month are encoded in the file name.
    file_name <- basename(file)
    year <- str_extract(file_name, "(?<=[A-Za-z]{4})\\d{4}")
    month <- str_extract(file_name, "\\d{2}(?=\\.sav$)")

    # One metadata record per variable in the file.
    vars_metadata <- map(names(data), function(var_name) {
      var <- data[[var_name]]
      list(
        año = year,
        mes = month,
        nombre_variable = var_name,
        etiqueta_variable = attr_or_na(var, "label"),
        pregunta = attr_or_na(var, "question")
      )
    })
    bind_rows(vars_metadata)
  }, error = function(e) {
    message("Error en: ", file)
    message("Detalle: ", conditionMessage(e))
    NULL
  })
}
# Run the extractor on every file and stack the per-file results row-wise.
metadata_total <- files %>%
  map(extract_metadata) %>%
  bind_rows()
# Preview the first rows of the combined metadata.
head(metadata_total)
## # A tibble: 6 × 5
## año mes nombre_variable etiqueta_variable pregunta
## <chr> <chr> <chr> <chr> <lgl>
## 1 2024 03 folio Folio de identificación NA
## 2 2024 03 unidad Unidad primaria de muestreo NA
## 3 2024 03 region Region de residencia NA
## 4 2024 03 edad Edad de la persona entrevistada NA
## 5 2024 03 sexo Sexo de la persona entrevistada NA
## 6 2024 03 P01a P01. ¿Cuál cree usted que es actualmente… NA
Ya tengo mi csv con la información
# Save the combined metadata. row.names = FALSE avoids an extra unnamed
# row-number column, and UTF-8 keeps accented characters in the labels
# intact; these are the same options the .sav -> .csv conversion step of
# this script passes to write.csv().
write.csv(
  metadata_total,
  file = "metadata_total.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
## 2. Convertir los .sav a .csv
# install.packages(c("haven", "sjlabelled"))
# Cargar las librerías necesarias
library(haven)
library(sjlabelled)
##
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
##
## as_factor
## The following object is masked from 'package:dplyr':
##
## as_label
## The following object is masked from 'package:ggplot2':
##
## as_label
## The following objects are masked from 'package:haven':
##
## as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
# Show the current working directory for reference. getwd() only reports the
# directory; it does not change it, so the conversion below must not depend
# on it.
getwd()
## [1] "/Users/valeriacigarroa/Documents/ITESM/Datos Masivos"
# Folder that holds the .sav files. Searching "." found zero files because
# the working directory is a different folder, so nothing was converted.
sav_dir <- "/Users/valeriacigarroa/Documents/ITESM/Modelacion de Sistemas/R Files"
# List every .sav file (base names only) in that folder.
sav_files <- list.files(
  path = sav_dir,
  pattern = "\\.sav$",
  full.names = FALSE
)
cat("Archivos .sav encontrados:", length(sav_files), "\n")
if (length(sav_files) == 0) {
  # Make an empty run visible instead of reporting a successful conversion.
  warning("No se encontraron archivos .sav en: ", sav_dir, call. = FALSE)
}
cat("Iniciando la conversión...\n\n")
# Read each file, apply its value labels and export it as CSV.
for (file_name in sav_files) {
  # Output name: same base name with the "_con_etiquetas.csv" suffix.
  csv_file_name <- sub("\\.sav$", "_con_etiquetas.csv", file_name)
  # A. Read the .sav file from the data folder.
  data_sav <- read_sav(file.path(sav_dir, file_name))
  # B. Key step: replace coded values with their value labels
  # (e.g. 1 -> "Sí"). The sjlabelled version is called explicitly because
  # as_label() is also exported by dplyr and ggplot2.
  # NOTE(review): sjlabelled documents as_label() as returning factors, not
  # character vectors; confirm if the column type matters downstream.
  data_labeled <- data.frame(sjlabelled::as_label(data_sav))
  # C. Write the CSV next to the source file, which is where the later
  # read.csv() calls look for it. row.names = FALSE omits R's row numbers;
  # UTF-8 keeps accents and "ñ" intact.
  write.csv(
    data_labeled,
    file = file.path(sav_dir, csv_file_name),
    row.names = FALSE,
    fileEncoding = "UTF-8"
  )
  # Progress message.
  cat(paste("✅ Convertido:", file_name, "a", csv_file_name, "\n"))
}
if (length(sav_files) > 0) {
  cat("\n¡Conversión completada y etiquetas aplicadas para todos los archivos!\n")
}
##
## ¡Conversión completada y etiquetas aplicadas para todos los archivos!
### 3. Choose the variables to use
## Load the labelled CSV exports, one data frame per survey month
mar <- read.csv(
  file = "/Users/valeriacigarroa/Documents/ITESM/Modelacion de Sistemas/R Files/GIMX202403_con_etiquetas.csv"
)
abr <- read.csv(
  file = "/Users/valeriacigarroa/Documents/ITESM/Modelacion de Sistemas/R Files/GIMX202404_con_etiquetas.csv"
)
may <- read.csv(
  file = "/Users/valeriacigarroa/Documents/ITESM/Modelacion de Sistemas/R Files/GIMX202405_con_etiquetas.csv"
)
jun <- read.csv(
  file = "/Users/valeriacigarroa/Documents/ITESM/Modelacion de Sistemas/R Files/GIMX202406_con_etiquetas.csv"
)
sep <- read.csv(
  file = "/Users/valeriacigarroa/Documents/ITESM/Modelacion de Sistemas/R Files/GIMX202409_con_etiquetas.csv"
)
# List the objects currently defined in the workspace
ls()
## [1] "abr" "extract_metadata" "file_name" "files"
## [5] "jun" "mar" "may" "metadata_total"
## [9] "sav_files" "sep"
############# Clean up variable names
# Gather the monthly data frames into one named list
dfs <- list(abr = abr, jun = jun, mar = mar, may = may, sep = sep)
# Column codes renamed identically in every data frame:
# original code (name) -> readable name (value)
nombres_comunes <- c(
  D01 = "smartphone",
  D02a = "tv", D02b = "radio", D02c = "periodico", D02d = "internet",
  D03a = "Facebook", D03b = "Twitter", D03c = "YouTube",
  D03d = "TikTok", D03e = "Instagram",
  D04 = "estudios",
  D06 = "estado_civil"
)
# Apply the shared renames; columns without an entry keep their name
dfs <- lapply(dfs, function(df) {
  pos <- match(names(df), names(nombres_comunes))
  found <- !is.na(pos)
  names(df)[found] <- unname(nombres_comunes[pos[found]])
  df
})
# Month-specific renames: the same concept sits under a different column
# code in each survey, so every month gets its own code -> name mapping
cambios_especificos <- list(
  mar = c(
    D11 = "origen_etnico", P24 = "benef_prog_soc", D14 = "ingreso",
    P11 = "aprob_amlo", P64 = "pid", P50 = "apoyo_demo",
    P59 = "INE_imparcial"
  ),
  abr = c(
    D11 = "origen_etnico", P24 = "benef_prog_soc", D14 = "ingreso",
    P11 = "aprob_amlo", P58 = "pid", P43 = "apoyo_demo",
    P52 = "INE_imparcial"
  ),
  may = c(
    D10 = "origen_etnico", P06 = "benef_prog_soc", D13 = "ingreso",
    P05 = "aprob_amlo", P17 = "pid", P08 = "apoyo_demo",
    P13 = "INE_imparcial"
  ),
  jun = c(
    D07 = "origen_etnico", D12 = "benef_prog_soc", D16 = "ingreso",
    Q4 = "aprob_amlo", P0 = "pid", Q5 = "apoyo_demo",
    P9 = "INE_imparcial"
  ),
  sep = c(
    D12 = "origen_etnico", D08 = "benef_prog_soc", D16 = "ingreso",
    P11 = "aprob_amlo", P61 = "pid", P52 = "apoyo_demo",
    P57 = "INE_imparcial"
  )
)
# Apply each month's mapping to the matching data frame in `dfs`;
# columns without an entry keep their name
dfs[names(cambios_especificos)] <- Map(
  function(df, cambios) {
    pos <- match(names(df), names(cambios))
    found <- !is.na(pos)
    names(df)[found] <- unname(cambios[pos[found]])
    df
  },
  dfs[names(cambios_especificos)],
  cambios_especificos
)
# Copy the renamed data frames back into the per-month variables
abr <- dfs[["abr"]]
jun <- dfs[["jun"]]
mar <- dfs[["mar"]]
may <- dfs[["may"]]
sep <- dfs[["sep"]]
# Print the resulting column names of every data frame
lapply(dfs, function(df) names(df))
## $abr
## [1] "folio" "unidad" "region" "edad"
## [5] "sexo" "P01a" "P01b" "P02"
## [9] "P03" "P04a" "P04b" "P05a"
## [13] "P05b" "P06a" "P06b" "P07a"
## [17] "P07b" "P07c" "P07d" "P08"
## [21] "P09" "P10a" "P10b" "aprob_amlo"
## [25] "R11" "P11a" "P11b" "P12a"
## [29] "P12b" "P12c" "P12d" "P12e"
## [33] "P13" "P14" "P15a" "P15b"
## [37] "P15c" "P15d" "P15e" "P15f"
## [41] "P15g" "P15h" "P15i" "P15j"
## [45] "P16" "P17a" "P17b" "P18a"
## [49] "P18b" "P18c" "P18d" "P18e"
## [53] "P18f" "P18g" "P18h" "P18i"
## [57] "P19a" "P19b" "P19c" "P19d"
## [61] "P19e" "P19f" "P19g" "P19h"
## [65] "P19i" "P19j" "P19k" "P20a"
## [69] "P20b" "P20c" "P20d" "P21"
## [73] "P22" "P23" "benef_prog_soc" "P25"
## [77] "P26" "P27" "P28" "P29"
## [81] "P30" "P31" "P32" "P33a"
## [85] "P33b" "P34" "P35" "P36"
## [89] "P37" "P38" "P39a" "P39b"
## [93] "P40" "P41" "P42" "apoyo_demo"
## [97] "P44" "P45" "P46" "P47"
## [101] "P48" "P49" "P50" "P51"
## [105] "INE_imparcial" "P53" "P54" "P55"
## [109] "P56" "P57a" "P57b" "P57c"
## [113] "P57d" "P57e" "P57f" "P57g"
## [117] "pid" "R58" "P59" "P60a"
## [121] "P60b" "P60c" "P61" "P62"
## [125] "P63a" "P63b" "P63c" "P63d"
## [129] "P63e" "P64" "P65" "R65"
## [133] "P66" "P67" "P68" "R68"
## [137] "P69" "P70" "P71" "R71"
## [141] "P72a" "P72b" "P72c" "P73"
## [145] "R73" "P74" "P75" "P76"
## [149] "R76" "P77" "P78" "P79"
## [153] "P80" "P81" "P82a" "P82b"
## [157] "P82c" "P83" "P84a" "P84b"
## [161] "P85a" "P85b" "P85c" "P85d"
## [165] "P85e" "P85f" "P85g" "P85h"
## [169] "P86a" "P86b" "P86c" "P86d"
## [173] "P86e" "P86f" "P86g" "P86h"
## [177] "P86i" "P87a" "P87b" "P87c"
## [181] "P87d" "P87e" "P87f" "P87g"
## [185] "P87h" "P87i" "P88" "P89"
## [189] "P90" "P91" "P92" "P93"
## [193] "smartphone" "tv" "radio" "periodico"
## [197] "internet" "Facebook" "Twitter" "YouTube"
## [201] "TikTok" "Instagram" "estudios" "D04a"
## [205] "D05" "D05a" "estado_civil" "D06a"
## [209] "D07" "D08" "D09" "D10"
## [213] "origen_etnico" "D12" "D12a" "D13"
## [217] "ingreso" "D15" "rechazos" "ausentes"
## [221] "fecha" "inicio" "termino" "duracion"
## [225] "pond" "fexpm"
##
## $jun
## [1] "folio" "circuns" "region" "entidad"
## [5] "seccion" "rechazos" "hora" "sexo"
## [9] "años" "edad" "tipo" "V1"
## [13] "V1a" "V1b" "V1c" "V2"
## [17] "V3" "pid" "P1" "P2a"
## [21] "P2b" "P2c" "P3" "P4a"
## [25] "P4b" "P5" "P6" "P7"
## [29] "P8" "INE_imparcial" "Q1" "Q2"
## [33] "Q3" "aprob_amlo" "R4" "apoyo_demo"
## [37] "Q6" "Q7" "Q8" "Q9"
## [41] "smartphone" "tv" "radio" "periodico"
## [45] "internet" "Facebook" "Twitter" "YouTube"
## [49] "TikTok" "Instagram" "estudios" "D05"
## [53] "estado_civil" "D06a" "origen_etnico" "D08"
## [57] "D08a" "D09" "D10" "D11"
## [61] "benef_prog_soc" "D13" "D14" "D15"
## [65] "ingreso" "fexpm"
##
## $mar
## [1] "folio" "unidad" "region" "edad"
## [5] "sexo" "P01a" "P01b" "P02"
## [9] "P03" "P04a" "P04b" "P05a"
## [13] "P05b" "P06a" "P06b" "P07a"
## [17] "P07b" "P07c" "P07d" "P08"
## [21] "P09" "P10a" "P10b" "aprob_amlo"
## [25] "R11" "P11a" "P11b" "P12a"
## [29] "P12b" "P12c" "P12d" "P12e"
## [33] "P13" "P14" "P15a" "P15b"
## [37] "P15c" "P15d" "P15e" "P15f"
## [41] "P15g" "P15h" "P15i" "P15j"
## [45] "P16" "P17a" "P17b" "P18a"
## [49] "P18b" "P18c" "P18d" "P18e"
## [53] "P18f" "P18g" "P18h" "P18i"
## [57] "P19a" "P19b" "P19c" "P19d"
## [61] "P19e" "P19f" "P19g" "P19h"
## [65] "P19i" "P19j" "P19k" "P20a"
## [69] "P20b" "P20c" "P20d" "P21"
## [73] "P22" "P23" "benef_prog_soc" "P25"
## [77] "P26" "P27" "P28" "P29"
## [81] "P30" "P31" "P32" "P33"
## [85] "P34" "P35" "P36" "P37"
## [89] "P38" "P39" "P40a" "P40b"
## [93] "P41" "P42" "P43" "P44"
## [97] "P45" "P46a" "P46b" "P47"
## [101] "P48" "P49" "apoyo_demo" "P51"
## [105] "P52" "P53" "P54" "P55"
## [109] "P56" "P57" "P58" "INE_imparcial"
## [113] "P60" "P61" "P62" "P63a"
## [117] "P63b" "P63c" "P63d" "P63e"
## [121] "P63f" "P63g" "pid" "R64"
## [125] "P65" "P66a" "P66b" "P66c"
## [129] "P67" "P68" "P69a" "P69b"
## [133] "P69c" "P69d" "P69e" "P70"
## [137] "P71" "R71" "P72" "P73"
## [141] "P74" "R74" "P75" "P76"
## [145] "P77a" "P77b" "P77c" "P78"
## [149] "R78" "P79" "P80" "P81"
## [153] "R81" "P82" "P83" "P84"
## [157] "P85a" "P85b" "P86a" "P86b"
## [161] "P86c" "P86d" "P86e" "P86f"
## [165] "P86g" "P86h" "P87a" "P87b"
## [169] "P87c" "P87d" "P87e" "P87f"
## [173] "P87g" "P87h" "P87i" "P88a"
## [177] "P88b" "P88c" "P88d" "P88e"
## [181] "P88f" "P88g" "P88h" "P88i"
## [185] "P89" "P90" "P91" "P92"
## [189] "P93" "P94" "smartphone" "tv"
## [193] "radio" "periodico" "internet" "Facebook"
## [197] "Twitter" "YouTube" "TikTok" "Instagram"
## [201] "estudios" "D04a" "D05" "estado_civil"
## [205] "D06a" "D07" "D08" "D09"
## [209] "D10" "origen_etnico" "D12" "D12a"
## [213] "D13" "ingreso" "D15" "rechazos"
## [217] "ausentes" "fecha" "inicio" "termino"
## [221] "duracion" "pond" "fexpm"
##
## $may
## [1] "folio" "unidad" "region" "edad"
## [5] "sexo" "P01" "P02" "P03"
## [9] "P04" "aprob_amlo" "R05" "benef_prog_soc"
## [13] "R06" "P07" "apoyo_demo" "P09"
## [17] "P10" "P11" "P12" "INE_imparcial"
## [21] "P14" "P15" "P16a" "P16b"
## [25] "P16c" "P16d" "P16e" "P16f"
## [29] "P16g" "pid" "R17" "P18"
## [33] "P19a" "P19b" "P19c" "P20"
## [37] "P21" "P22" "R22" "P23"
## [41] "P24" "R24" "P25" "P26"
## [45] "P27" "R27" "P28a" "P28b"
## [49] "P28c" "P29" "R29" "P30"
## [53] "P31" "P32a" "P32b" "P33"
## [57] "P34" "P35" "P36" "P37"
## [61] "P38" "smartphone" "tv" "radio"
## [65] "periodico" "internet" "Facebook" "Twitter"
## [69] "YouTube" "TikTok" "Instagram" "estudios"
## [73] "D05" "D05a" "estado_civil" "D06a"
## [77] "D07" "D08" "D09" "origen_etnico"
## [81] "D11" "D11a" "D12" "ingreso"
## [85] "rechazos" "ausentes" "fecha" "inicio"
## [89] "termino" "duracion" "pond" "fexpm"
## [93] "filter_."
##
## $sep
## [1] "folio" "circun" "region" "sexo"
## [5] "edad" "años" "P01a" "P01b"
## [9] "P02" "P03" "P04a" "P04b"
## [13] "P05a" "P05b" "P06a" "P06b"
## [17] "P07a" "P07b" "P07c" "P07d"
## [21] "P08" "P09" "P10a" "P10b"
## [25] "aprob_amlo" "P12a" "P12b" "P12c"
## [29] "P12d" "P12e" "P13" "P14a"
## [33] "P14b" "P14c" "P14d" "P14e"
## [37] "P14f" "P14g" "P14h" "P14i"
## [41] "P14j" "P15a" "P15b" "P15c"
## [45] "P15d" "P15e" "P15f" "P15g"
## [49] "P15h" "P15i" "P16" "P17"
## [53] "P18" "P19a" "P19b" "P19c"
## [57] "P19d" "P19e" "P19f" "P20"
## [61] "P21a" "P21b" "P22" "P23"
## [65] "P24" "P25a" "P25b" "P25c"
## [69] "P25d" "P25e" "P26" "P27"
## [73] "P28" "P29" "P30" "P31"
## [77] "P32" "P33" "P34" "P35"
## [81] "P36" "P37" "P38" "P39"
## [85] "P40" "P41a" "P41b" "P41c"
## [89] "P41d" "P41e" "P41f" "P41g"
## [93] "P42a" "P42b" "P43a" "P43b"
## [97] "P44" "P45" "P46" "P47"
## [101] "P48a" "P48b" "P49" "P50"
## [105] "P51" "apoyo_demo" "P53" "P54"
## [109] "P55" "P56" "INE_imparcial" "P58"
## [113] "P59" "P60a" "P60b" "P60c"
## [117] "P60d" "P60e" "P60f" "pid"
## [121] "R61" "P62a" "R62a" "P62b"
## [125] "R62b" "P63a" "P63b" "P64"
## [129] "smartphone" "tv" "radio" "periodico"
## [133] "internet" "Facebook" "Twitter" "YouTube"
## [137] "TikTok" "Instagram" "estudios" "D05"
## [141] "estado_civil" "D06a" "D07" "benef_prog_soc"
## [145] "D09" "D10" "D11a" "D11b"
## [149] "origen_etnico" "D13a" "D13b" "D14"
## [153] "D15a" "D15b" "ingreso" "D17"
## [157] "fecha" "inicio" "final" "duracion"
## [161] "entidad" "municipio" "seccion" "rechazos"
## [165] "ausentes" "pond" "fexpm"
#### Select variables
# List of the (already renamed) monthly data frames
dfs <- list(abr = abr, jun = jun, mar = mar, may = may, sep = sep)
# Variables to keep
vars_seleccionadas <- c(
  "smartphone",
  "tv",
  "radio",
  "periodico",
  "internet",
  "Facebook",
  "Twitter",
  "YouTube",
  "TikTok",
  "Instagram",
  "estudios",
  "estado_civil",
  "sexo",
  "region",
  "edad",
  "origen_etnico",
  "benef_prog_soc",
  "ingreso",
  "aprob_amlo",
  "pid",
  "apoyo_demo",
  "INE_imparcial"
)
# Build one reduced data frame per month (`<month>_sel`) and export it as CSV
for (n in names(dfs)) {
  # Keep only the requested columns that exist in this data frame ...
  disponibles <- intersect(vars_seleccionadas, names(dfs[[n]]))
  # ... and report the ones that are missing instead of dropping them
  # silently.
  faltantes <- setdiff(vars_seleccionadas, disponibles)
  if (length(faltantes) > 0) {
    warning(
      "Variables ausentes en '", n, "': ",
      paste(faltantes, collapse = ", "),
      call. = FALSE
    )
  }
  # drop = FALSE keeps the result a data frame even if only one column
  # matches; without it the selection would collapse to a plain vector.
  df_filtrado <- dfs[[n]][, disponibles, drop = FALSE]
  # Expose the reduced data frame in the current environment as <n>_sel
  assign(paste0(n, "_sel"), df_filtrado)
  # Save it as CSV without a row-number column
  write.csv(df_filtrado, paste0(n, "_sel.csv"), row.names = FALSE)
}
# Confirm: column names of exactly the objects created above (looking them
# up by name avoids picking up unrelated objects that happen to end in
# "_sel")
lapply(paste0(names(dfs), "_sel"), function(x) names(get(x)))
## [[1]]
## [1] "smartphone" "tv" "radio" "periodico"
## [5] "internet" "Facebook" "Twitter" "YouTube"
## [9] "TikTok" "Instagram" "estudios" "estado_civil"
## [13] "sexo" "region" "edad" "origen_etnico"
## [17] "benef_prog_soc" "ingreso" "aprob_amlo" "pid"
## [21] "apoyo_demo" "INE_imparcial"
##
## [[2]]
## [1] "smartphone" "tv" "radio" "periodico"
## [5] "internet" "Facebook" "Twitter" "YouTube"
## [9] "TikTok" "Instagram" "estudios" "estado_civil"
## [13] "sexo" "region" "edad" "origen_etnico"
## [17] "benef_prog_soc" "ingreso" "aprob_amlo" "pid"
## [21] "apoyo_demo" "INE_imparcial"
##
## [[3]]
## [1] "smartphone" "tv" "radio" "periodico"
## [5] "internet" "Facebook" "Twitter" "YouTube"
## [9] "TikTok" "Instagram" "estudios" "estado_civil"
## [13] "sexo" "region" "edad" "origen_etnico"
## [17] "benef_prog_soc" "ingreso" "aprob_amlo" "pid"
## [21] "apoyo_demo" "INE_imparcial"
##
## [[4]]
## [1] "smartphone" "tv" "radio" "periodico"
## [5] "internet" "Facebook" "Twitter" "YouTube"
## [9] "TikTok" "Instagram" "estudios" "estado_civil"
## [13] "sexo" "region" "edad" "origen_etnico"
## [17] "benef_prog_soc" "ingreso" "aprob_amlo" "pid"
## [21] "apoyo_demo" "INE_imparcial"
##
## [[5]]
## [1] "smartphone" "tv" "radio" "periodico"
## [5] "internet" "Facebook" "Twitter" "YouTube"
## [9] "TikTok" "Instagram" "estudios" "estado_civil"
## [13] "sexo" "region" "edad" "origen_etnico"
## [17] "benef_prog_soc" "ingreso" "aprob_amlo" "pid"
## [21] "apoyo_demo" "INE_imparcial"
# Age-group counts for March and September. Note in the output below that
# the youngest bracket is labelled differently in the two surveys
# ("17 a 24 años" vs "18 a 24 años").
table(mar[["edad"]])
##
## 17 a 24 años 25 a 39 años 40 a 54 años 55 y más años
## 97 471 305 197
table(sep[["edad"]])
##
## 18 a 24 años 25 a 39 años 40 a 54 años 55 y más años
## 94 420 381 175
### usar prompt 1
### usar prompt 2 (quitar lo de columnas diferentes, eso ya se arregló)
# Counts for the INE impartiality question in March and September. Note in
# the output below that the March labels carry a numeric prefix
# ("1. Sí la garantiza") while the September labels do not.
table(mar[["INE_imparcial"]])
##
## 1. Sí la garantiza 2. No la garantiza 8. No sabe
## 487 490 93
table(sep[["INE_imparcial"]])
##
## No la garantiza No sabe Sí la garantiza
## 409 77 584
# 2025 INE
#que si 439
#que no 461
#ns/nc 100
#¿seraaaa? Igual siii