library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.3, built: 2024-11-07)
## ## Copyright (C) 2005-2025 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(questionr)
library(ggplot2)
library(dplyr)
library(ggcorrplot)
https://archive.ics.uci.edu/dataset/2/adult
Estos datos fueron extraídos de la base de datos del Census Bureau del gobierno de los Estados Unidos por Ronny Kohavi y Barry Becker, el objetivo es determinar si el salario de un individuo sobrepasa los $50K dólares anuales.
df = read.csv('adult.csv', sep = ',')
head(df)
## age workclass fnlwgt education education.num marital.status
## 1 39 State-gov 77516 Bachelors 13 Never-married
## 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 38 Private 215646 HS-grad 9 Divorced
## 4 53 Private 234721 11th 7 Married-civ-spouse
## 5 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital.gain capital.loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hours.per.week native.country income
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
Se observa el las primeras filas del DataFrame, con la descripción del tipo de variable respectiva en cada columna. El Dataframe cuenta con 6 variables númericas, 2 variables categóricas dicotómicas (incluyendo variable objetivo) y 7 categóricas nominales
sapply(df, class)
## age workclass fnlwgt education education.num
## "integer" "character" "integer" "character" "integer"
## marital.status occupation relationship race sex
## "character" "character" "character" "character" "character"
## capital.gain capital.loss hours.per.week native.country income
## "integer" "integer" "integer" "character" "character"
Los datos faltantes están recategorizados como el signo de interrogación, por tanto, si queremos visualizarlos usando la función missmap de la librería Amelia es necesario convertirlos a NA
df[df == ' ?'] <- NA
Posteriormente hacemos revisión de datos faltantes en las variables.
sapply(df, function(x) sum(is.na(x)))
## age workclass fnlwgt education education.num
## 0 1836 0 0 0
## marital.status occupation relationship race sex
## 0 1843 0 0 0
## capital.gain capital.loss hours.per.week native.country income
## 0 0 0 583 0
missmap(df, col = c("red", "blue"), legend = TRUE)
Observamos que hay NA’s en las variables Workclasss, Occupation y
Native-Country
Realizamos el respectivo summary del Dataframe.
summary(df)
## age workclass fnlwgt education
## Min. :17.00 Length:32561 Min. : 12285 Length:32561
## 1st Qu.:28.00 Class :character 1st Qu.: 117827 Class :character
## Median :37.00 Mode :character Median : 178356 Mode :character
## Mean :38.58 Mean : 189778
## 3rd Qu.:48.00 3rd Qu.: 237051
## Max. :90.00 Max. :1484705
## education.num marital.status occupation relationship
## Min. : 1.00 Length:32561 Length:32561 Length:32561
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :10.00 Mode :character Mode :character Mode :character
## Mean :10.08
## 3rd Qu.:12.00
## Max. :16.00
## race sex capital.gain capital.loss
## Length:32561 Length:32561 Min. : 0 Min. : 0.0
## Class :character Class :character 1st Qu.: 0 1st Qu.: 0.0
## Mode :character Mode :character Median : 0 Median : 0.0
## Mean : 1078 Mean : 87.3
## 3rd Qu.: 0 3rd Qu.: 0.0
## Max. :99999 Max. :4356.0
## hours.per.week native.country income
## Min. : 1.00 Length:32561 Length:32561
## 1st Qu.:40.00 Class :character Class :character
## Median :40.00 Mode :character Mode :character
## Mean :40.44
## 3rd Qu.:45.00
## Max. :99.00
table(df$workclass)
##
## Federal-gov Local-gov Never-worked Private
## 960 2093 7 22696
## Self-emp-inc Self-emp-not-inc State-gov Without-pay
## 1116 2541 1298 14
table(df$marital)
##
## Divorced Married-AF-spouse Married-civ-spouse
## 4443 23 14976
## Married-spouse-absent Never-married Separated
## 418 10683 1025
## Widowed
## 993
table(df$education)
##
## 10th 11th 12th 1st-4th 5th-6th
## 933 1175 433 168 333
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 646 514 1067 1382 5355
## Doctorate HS-grad Masters Preschool Prof-school
## 413 10501 1723 51 576
## Some-college
## 7291
table(df$relationship)
##
## Husband Not-in-family Other-relative Own-child Unmarried
## 13193 8305 981 5068 3446
## Wife
## 1568
table(df$race)
##
## Amer-Indian-Eskimo Asian-Pac-Islander Black Other
## 311 1039 3124 271
## White
## 27816
A continuación, se genera un doble gráfico para cada variable binaria, permitiendo visualizarlo en forma de Piechart y tabla de frecuencia.
Se observa una ligera mayoría masculina.
table_default <- questionr::freq(df$sex, cum = TRUE, sort = "dec", total = TRUE)
knitr::kable(table_default)
n | % | val% | %cum | val%cum | |
---|---|---|---|---|---|
Male | 21790 | 66.9 | 66.9 | 66.9 | 66.9 |
Female | 10771 | 33.1 | 33.1 | 100.0 | 100.0 |
Total | 32561 | 100.0 | 100.0 | 100.0 | 100.0 |
mayoría percibe ingresos ≤ 50K; menor proporción con ingresos > 50K.
A continuación graficaremos las distribuciones de las variables categóricas por medio de diagramas de barras.
Tabla_workclass <- df %>%
filter(!is.na(workclass)) %>%
dplyr::group_by(workclass) %>%
dplyr::summarise(Total = n()) %>%
dplyr::mutate(Porcentaje = round(Total/sum(Total)*100, 1)) %>%
dplyr::arrange(workclass)
Tabla_workclass
## # A tibble: 8 × 3
## workclass Total Porcentaje
## <chr> <int> <dbl>
## 1 " Federal-gov" 960 3.1
## 2 " Local-gov" 2093 6.8
## 3 " Never-worked" 7 0
## 4 " Private" 22696 73.9
## 5 " Self-emp-inc" 1116 3.6
## 6 " Self-emp-not-inc" 2541 8.3
## 7 " State-gov" 1298 4.2
## 8 " Without-pay" 14 0
G1 <- ggplot(Tabla_workclass, aes(x=workclass, y=Total) ) +
geom_bar(width = 0.7, stat="identity", fill="grey") +
coord_cartesian(ylim = c(0, 25000)) +
labs(x="Tipo de trabajo", y="Frecuencias \n (Porcentajes)") +
geom_text(aes(label=paste0(Total, " (", Porcentaje, "%)")),
vjust=-0.9, color="black", size=4) +
theme_bw(base_size = 16) +
theme(
axis.text.x = element_text(angle = 30, vjust = 0.5, hjust = 0.5, size = 12)
) +
facet_wrap(~"Distribución de Tipo de trabajo")
G1
predominio del sector privado, seguido por Self-employed y Government
Tabla_education <- df %>%
filter(!is.na(education)) %>%
dplyr::group_by(education) %>%
dplyr::summarise(Total = n()) %>%
dplyr::mutate(Porcentaje = round(Total/sum(Total)*100, 1)) %>%
dplyr::arrange(education)
Tabla_education
## # A tibble: 16 × 3
## education Total Porcentaje
## <chr> <int> <dbl>
## 1 " 10th" 933 2.9
## 2 " 11th" 1175 3.6
## 3 " 12th" 433 1.3
## 4 " 1st-4th" 168 0.5
## 5 " 5th-6th" 333 1
## 6 " 7th-8th" 646 2
## 7 " 9th" 514 1.6
## 8 " Assoc-acdm" 1067 3.3
## 9 " Assoc-voc" 1382 4.2
## 10 " Bachelors" 5355 16.4
## 11 " Doctorate" 413 1.3
## 12 " HS-grad" 10501 32.3
## 13 " Masters" 1723 5.3
## 14 " Preschool" 51 0.2
## 15 " Prof-school" 576 1.8
## 16 " Some-college" 7291 22.4
G2 <- ggplot(Tabla_education, aes(x=education, y=Total) ) +
geom_bar(width = 0.7, stat="identity", fill="cadetblue3") +
coord_cartesian(ylim = c(0, 25000)) +
labs(x="Nivel de educación", y="Frecuencias \n (Porcentajes)") +
geom_text(aes(label=paste0(Total, " (", Porcentaje, "%)")),
vjust=-0.9, color="black", size=3) +
theme_bw(base_size = 16) +
theme(
axis.text.x = element_text(angle = 30, vjust = 0.5, hjust = 0.5, size = 12)
) +
facet_wrap(~"Distribución de Nivel de educación")
G2
Mayoría con educación secundaria o equivalente; minoría con títulos de posgrado.
Tabla_marital <- df %>%
filter(!is.na(`marital.status`)) %>%
dplyr::group_by(`marital.status`) %>%
dplyr::summarise(Total = n()) %>%
dplyr::mutate(Porcentaje = round(Total/sum(Total)*100, 1)) %>%
dplyr::arrange(`marital.status`)
Tabla_marital
## # A tibble: 7 × 3
## marital.status Total Porcentaje
## <chr> <int> <dbl>
## 1 " Divorced" 4443 13.6
## 2 " Married-AF-spouse" 23 0.1
## 3 " Married-civ-spouse" 14976 46
## 4 " Married-spouse-absent" 418 1.3
## 5 " Never-married" 10683 32.8
## 6 " Separated" 1025 3.1
## 7 " Widowed" 993 3
G3 <- ggplot(Tabla_marital, aes(x=`marital.status`, y=Total) ) +
geom_bar(width = 0.7, stat="identity", fill="lemonchiffon1") +
coord_cartesian(ylim = c(0, 25000)) +
labs(x="Estado civil", y="Frecuencias \n (Porcentajes)") +
geom_text(aes(label=paste0(Total, " (", Porcentaje, "%)")),
vjust=-0.9, color="black", size=3) +
theme_bw(base_size = 16) +
theme(
axis.text.x = element_text(angle = 30, vjust = 0.5, hjust = 0.5, size = 12)
) +
facet_wrap(~"Distribución de estado civil")
G3
Alto porcentaje de casados, seguido por solteros y divorciados.
Tabla_occupation <- df %>%
filter(!is.na(occupation)) %>%
dplyr::group_by(occupation) %>%
dplyr::summarise(Total = n()) %>%
dplyr::mutate(Porcentaje = round(Total/sum(Total)*100, 1)) %>%
dplyr::arrange(occupation)
Tabla_occupation
## # A tibble: 14 × 3
## occupation Total Porcentaje
## <chr> <int> <dbl>
## 1 " Adm-clerical" 3770 12.3
## 2 " Armed-Forces" 9 0
## 3 " Craft-repair" 4099 13.3
## 4 " Exec-managerial" 4066 13.2
## 5 " Farming-fishing" 994 3.2
## 6 " Handlers-cleaners" 1370 4.5
## 7 " Machine-op-inspct" 2002 6.5
## 8 " Other-service" 3295 10.7
## 9 " Priv-house-serv" 149 0.5
## 10 " Prof-specialty" 4140 13.5
## 11 " Protective-serv" 649 2.1
## 12 " Sales" 3650 11.9
## 13 " Tech-support" 928 3
## 14 " Transport-moving" 1597 5.2
G4 <- ggplot(Tabla_occupation, aes(x=occupation, y=Total) ) +
geom_bar(width = 0.7, stat="identity", fill="darkolivegreen2") +
coord_cartesian(ylim = c(0, 25000)) +
labs(x="Ocupación", y="Frecuencias \n (Porcentajes)") +
geom_text(aes(label=paste0(Total, " (", Porcentaje, "%)")),
vjust=-0.9, color="black", size=4) +
theme_bw(base_size = 16) +
theme(
axis.text.x = element_text(angle = 30, vjust = 0.5, hjust = 0.5, size = 12)
) +
facet_wrap(~"Distribución de la ocupación")
G4
Alta diversidad de profesiones; destacan craft-repair, exec-managerial y prof-specialty.
Tabla_relationship <- df %>%
filter(!is.na(relationship)) %>%
dplyr::group_by(relationship) %>%
dplyr::summarise(Total = n()) %>%
dplyr::mutate(Porcentaje = round(Total/sum(Total)*100, 1)) %>%
dplyr::arrange(relationship)
Tabla_relationship
## # A tibble: 6 × 3
## relationship Total Porcentaje
## <chr> <int> <dbl>
## 1 " Husband" 13193 40.5
## 2 " Not-in-family" 8305 25.5
## 3 " Other-relative" 981 3
## 4 " Own-child" 5068 15.6
## 5 " Unmarried" 3446 10.6
## 6 " Wife" 1568 4.8
G5 <- ggplot(Tabla_relationship, aes(x=relationship, y=Total) ) +
geom_bar(width = 0.7, stat="identity", fill="coral") +
coord_cartesian(ylim = c(0, 25000)) +
labs(x="Relación", y="Frecuencias \n (Porcentajes)") +
geom_text(aes(label=paste0(Total, " (", Porcentaje, "%)")),
vjust=-0.9, color="black", size=3) +
theme_bw(base_size = 16) +
theme(
axis.text.x = element_text(angle = 30, vjust = 0.5, hjust = 0.5, size = 12)
) +
facet_wrap(~"Distribución de relación del individuo")
G5
A continuación realizaremos las gráficas correspondientes a las variables númericas, graficaremos boxplots con el fin de identificar outliers
df %>%
ggplot(aes(x = "", y = age)) +
geom_boxplot(color = "black", fill = "lightsalmon", alpha = 0.5) +
theme(legend.position = "none", plot.title = element_text(size = 11)) +
ggtitle("Distribución de las Edades") +
coord_flip()
Distribución con mayor concentración entre 25 y 45 años; ligera cola hacia edades avanzadas.
df %>%
ggplot(aes(x = "", y = capital.gain)) +
geom_boxplot(color = "black", fill = "lightsalmon", alpha = 0.5) +
theme(legend.position = "none", plot.title = element_text(size = 11)) +
ggtitle("Distribución de Capital gain") +
coord_flip()
Distribución con mayor concentración entre 25 y 45 años; ligera cola hacia edades avanzadas.
df %>%
ggplot(aes(x = "", y = capital.loss)) +
geom_boxplot(color = "black", fill = "lightpink", alpha = 0.5) +
theme(legend.position = "none", plot.title = element_text(size = 11)) +
ggtitle("Distribución de Capital loss") +
coord_flip()
Similar a Capital-gain, la mayoría sin pérdidas; pocos casos con valores significativos.
df %>%
ggplot(aes(x = "", y = hours.per.week)) +
geom_boxplot(color = "black", fill = "palegreen", alpha = 0.5) +
theme(legend.position = "none", plot.title = element_text(size = 11)) +
ggtitle("Distribución de las horas de trabajo semanales") +
coord_flip()
Pico marcado en 40 horas, lo que sugiere predominio de empleo a tiempo completo.
Tabla_sex_occ <- df %>%
filter(!is.na(sex), !is.na(occupation)) %>%
group_by(sex, occupation) %>%
summarise(Total = n(), .groups = "drop") %>%
mutate(Porcentaje = round(Total / sum(Total) * 100, 1))
ggplot(Tabla_sex_occ, aes(x = occupation, y = Total, fill = sex)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = Total),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 3) +
labs(x = "Ocupación", y = "Frecuencia", fill = "Sexo") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
En el sector privado predominan hombres. En trabajos del gobierno y sin clasificación clara hay mayor equilibrio de género.
ggplot(
df %>% filter(!is.na(workclass), !is.na(hours.per.week)),
aes(x = workclass, y = hours.per.week, fill = workclass)
) +
geom_boxplot() +
labs(title = "Diagrama de horas de trabajo semanal de acuerdo a la ocupación",
x = "Trabajo", y = "Horas por semana") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Sector privado con mayor concentración en 40 horas/semana, autónomos tienden a trabajar más de 40 horas, mientras que otros sectores presentan jornadas parciales (< 40 horas).
Tabla_income_edu <- df %>%
filter(!is.na(income), !is.na(education)) %>%
group_by(income, education) %>%
summarise(Total = n(), .groups = "drop") %>%
mutate(Porcentaje = round(Total / sum(Total) * 100, 1))
ggplot(Tabla_income_edu, aes(x = education, y = Total, fill = income)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = Total),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 3) +
labs(x = "Educación", y = "Frecuencia", fill = "Income") +
scale_fill_manual(values = c("orange", "seagreen2")) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Ingresos > 50K son más frecuentes en personas con educación universitaria o superior.
Tabla_income_sex <- df %>%
filter(!is.na(sex), !is.na(income)) %>%
group_by(sex, income) %>%
summarise(Total = n(), .groups = "drop") %>%
mutate(Porcentaje = round(Total / sum(Total) * 100, 1))
ggplot(Tabla_income_sex, aes(x = income, y = Total, fill = sex)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = Total),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 3) +
labs(x = "Income", y = "Frecuencia", fill = "Income") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Hombres con mayor proporción de ingresos > 50K. Mujeres presentan más registros en el rango ≤ 50K, lo que sugiere posible brecha salarial o de acceso a empleos mejor remunerados.
Tabla_income_workclass <- df %>%
filter(!is.na(workclass), !is.na(income)) %>%
group_by(workclass, income) %>%
summarise(Total = n(), .groups = "drop") %>%
mutate(Porcentaje = round(Total / sum(Total) * 100, 1))
ggplot(Tabla_income_workclass, aes(x = workclass, y = Total, fill = income)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = Total),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 3) +
labs(x = "Ocupación", y = "Frecuencia", fill = "Income") +
scale_fill_manual(values = c("steelblue3", "tan2")) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Sector privado concentra la mayor cantidad de personas en ambos niveles de ingresos, pero también es donde hay mayor desigualdad. Autónomos y gobierno tienen porcentajes más equilibrados de > 50K.