#En este trabajo autónomo, vamos a recibir un conjunto de datos provenientes de una encuesta de salarios a gerentes que, por su libertad para contestar, requiere serrevisada y corregida antes de analizarla.
#Se instala librería necesaria para la visualización de dataset
install.packages("shiny")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(shiny)
install.packages("DT")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(DT)
##
## Adjuntando el paquete: 'DT'
## The following objects are masked from 'package:shiny':
##
## dataTableOutput, renderDataTable
install.packages("dplyr")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
install.packages("tidyr")
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(tidyr)
Se importa la base a ser analizada: Taller_1_._Anexo_A
Anexo_A=read.csv("C:/Users/Usuario/Downloads/Taller_1_-_Anexo_A.csv")
Exploración inicial
#View(Anexo_A)
names(Anexo_A) #Nombre de las Variables
## [1] "Timestamp"
## [2] "How.old.are.you."
## [3] "Industry"
## [4] "Functional.area.of.job"
## [5] "Job.title"
## [6] "Job.title...additional.context"
## [7] "Annual.salary..gross."
## [8] "Additional.monetary.compensation"
## [9] "Currency"
## [10] "Currency...other"
## [11] "Income...additional.context"
## [12] "Country"
## [13] "State"
## [14] "City"
## [15] "Remote.or.on.site."
## [16] "Years.of.experience..overall"
## [17] "Years.of.experience.in.field"
## [18] "Highest.level.of.education.completed"
## [19] "Gender"
## [20] "Race"
summary(Anexo_A) # Resumen de datos
## Timestamp How.old.are.you. Industry
## Length:17099 Length:17099 Length:17099
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Functional.area.of.job Job.title Job.title...additional.context
## Length:17099 Length:17099 Length:17099
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Annual.salary..gross. Additional.monetary.compensation Currency
## Min. : 0 Min. : 0 Length:17099
## 1st Qu.: 60128 1st Qu.: 0 Class :character
## Median : 85000 Median : 2000 Mode :character
## Mean : 116039 Mean : 13457
## 3rd Qu.: 120000 3rd Qu.: 10000
## Max. :125000000 Max. :20000000
## NA's :3998
## Currency...other Income...additional.context Country
## Length:17099 Length:17099 Length:17099
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## State City Remote.or.on.site.
## Length:17099 Length:17099 Length:17099
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Years.of.experience..overall Years.of.experience.in.field
## Length:17099 Length:17099
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Highest.level.of.education.completed Gender Race
## Length:17099 Length:17099 Length:17099
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
nrow(Anexo_A) #Número de Observaciones
## [1] 17099
ncol(Anexo_A) #Número de Variables
## [1] 20
#str(Anexo_A)
#class(Anexo_A)
#sapply(Anexo_A, class)
#datatable(Anexo_A) #Visualización de la data: Anexo_A
#table(sapply(Anexo_A, class)) #Tipo de Variables
Se detectan 17099 observaciones y 20 variables.
Los datos se muestran en Inglés
De las 20 variables, 18 son de tipo “character” (Alfanumérico), y 2 son de tipo “integer” (numérico entero)
#> Variables Categórica Nominales:14 #Race, Gender, Highest.level.of.education.completed, Remote.or.on.site., City, State, Country, Income…additional.context, Currency…other, Currency, Job.title…additional.context, Job.title, Functional.area.of.job, Industry
#> Variables Numérica Continua:3 #Timestamp, Additional.monetary.compensation, Annual.salary..gross.
#> Variables Numérica Ratios:3 #How.old.are.you., Years.of.experience..overall, Years.of.experience.in.field
Se verifica la validez, precisión, completitud, consistencia y uniformidad de los datos con una tabla resumen
resumen <- data.frame(
Variable = names(Anexo_A),
tipo = sapply(Anexo_A, class),
Total_valores = sapply(Anexo_A, function(x) length(x)),
Valores_únicos = sapply(Anexo_A, function(x) length(unique(x))),
Valores_blancos = sapply(Anexo_A, function(x) sum(nchar(trimws(x)) == 0)),
Valores_vacios = sapply(Anexo_A, function(x) sum(x == "")),
Valores_NA = sapply(Anexo_A, function(x) sum(is.na(x))),
Aporta_info = c("Sí", "Sí", "Sí", "Sí", "Sí",
"No", "Sí", "Sí", "Sí", "No",
"No", "Sí", "Sí", "Sí", "Sí",
"Sí", "Sí", "Sí", "Sí", "Sí")
)
#resumen
#str(resumen)
#View(resumen)
datatable(resumen) #Visualización de la data: resumen
Elimiar filas duplicadas
Anexo_A <- Anexo_A %>% distinct()
Añadir columnas necesarias: ID, Fecha y Hora
Anexo_A$ID <- 1:nrow(Anexo_A)
Anexo_A <- Anexo_A %>%
mutate(
# Convertir fecha_hora a fecha y hora
Fecha = format(as.Date(strptime(Timestamp, format = "%d/%m/%Y %H:%M:%S")), format = "%d-%m-%Y"),
Hora = format(strptime(Timestamp, format = "%d/%m/%Y %H:%M:%S"), format = "%H:%M:%S")
)
Eliminar columnas innecesarias
Anexo_A <- Anexo_A[, !names(Anexo_A) %in% c("Timestamp","Job.title...additional.context","Currency...other","Income...additional.context")]
Renombrar las columnas
names(Anexo_A) <- c( "Edad", "Industria", "Área de Trabajo", "Trabajo", "Salario anual","Pago adicional", "Divisa","País","Estado","Ciudad","Trabajo Remoto", "Experiencia laboral", "Experiencia en el cargo", "Nivel académico", "Género", "Raza", "ID", "Fecha", "Hora")
Rellenar valores NA
# Rellenar los NA numéricos con la media
Anexo_A <- Anexo_A %>%
mutate(across(where(is.numeric), ~replace(., is.na(.), mean(., na.rm = TRUE))))
# Rellenar los NA en la columna fecha
Anexo_A$Fecha[is.na(Anexo_A$Fecha)] <- as.Date("2024-12-31")
# Rellenar los NA en la columna hora
Anexo_A$Hora[is.na(Anexo_A$Hora)] <- "00:00:00"
Rellenar valores Vacios de texto
Anexo_A <- Anexo_A %>%
mutate(across(where(is.character), ~replace(., . == "" | is.na(.), "No Name")))
Ordenar columnas
Anexo_A <- Anexo_A %>%
select(ID, Fecha, Hora, everything())
Verificar Resumen
resumen <- data.frame(
Variable = names(Anexo_A),
tipo = sapply(Anexo_A, class),
Total_valores = sapply(Anexo_A, function(x) length(x)),
Valores_únicos = sapply(Anexo_A, function(x) length(unique(x))),
Valores_blancos = sapply(Anexo_A, function(x) sum(nchar(trimws(x)) == 0)),
Valores_vacios = sapply(Anexo_A, function(x) sum(x == "")),
Valores_NA = sapply(Anexo_A, function(x) sum(is.na(x)))
)
datatable(resumen)
#View(resumen)
se exporta csv del documento Anexo_A luego de la limpieza
write.csv(Anexo_A, file = "Anexo_A_limpio.csv", row.names = FALSE)