Descripción
- Elaborar un archivo markdown que identifique el análisi de datos mediante un modeo de regesión logística que permita realizar predicciones de los ingresos de personas.
library(tidyverse) # varias
## ── Attaching packages ──────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) # select filter mutate ...
library(ggplot2) # Gráficas
library(fdth) # Para tablas de distribución y frecuencias
##
## Attaching package: 'fdth'
## The following objects are masked from 'package:stats':
##
## sd, var
library(knitr) # Para ver tablas mas amigables en formato html markdown
library(caret) # Para particionar datos
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(scales) # Para escalar datos
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(readr)
Estructura y resumen de los datos
- Datos[-1]. Excepto la columna x que no interesa
str(datos)
## tibble [48,842 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ x : num [1:48842] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:48842] 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : chr [1:48842] "Private" "Private" "Local-gov" "Private" ...
## $ education : chr [1:48842] "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
## $ educational.num: num [1:48842] 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status : chr [1:48842] "Never-married" "Married-civ-spouse" "Married-civ-spouse" "Married-civ-spouse" ...
## $ race : chr [1:48842] "Black" "White" "White" "Black" ...
## $ gender : chr [1:48842] "Male" "Male" "Male" "Male" ...
## $ hours.per.week : num [1:48842] 40 50 40 40 30 30 40 32 40 10 ...
## $ income : chr [1:48842] "<=50K" "<=50K" ">50K" ">50K" ...
## - attr(*, "spec")=
## .. cols(
## .. x = col_double(),
## .. age = col_double(),
## .. workclass = col_character(),
## .. education = col_character(),
## .. educational.num = col_double(),
## .. marital.status = col_character(),
## .. race = col_character(),
## .. gender = col_character(),
## .. hours.per.week = col_double(),
## .. income = col_character()
## .. )
kable(summary(datos[-1]))
|
Min. :17.00 |
Length:48842 |
Length:48842 |
Min. : 1.00 |
Length:48842 |
Length:48842 |
Length:48842 |
Min. : 1.00 |
Length:48842 |
|
1st Qu.:28.00 |
Class :character |
Class :character |
1st Qu.: 9.00 |
Class :character |
Class :character |
Class :character |
1st Qu.:40.00 |
Class :character |
|
Median :37.00 |
Mode :character |
Mode :character |
Median :10.00 |
Mode :character |
Mode :character |
Mode :character |
Median :40.00 |
Mode :character |
|
Mean :38.64 |
NA |
NA |
Mean :10.08 |
NA |
NA |
NA |
Mean :40.42 |
NA |
|
3rd Qu.:48.00 |
NA |
NA |
3rd Qu.:12.00 |
NA |
NA |
NA |
3rd Qu.:45.00 |
NA |
|
Max. :90.00 |
NA |
NA |
Max. :16.00 |
NA |
NA |
NA |
Max. :99.00 |
NA |
Se utiliza select_if() para seleccionar ciertas variables del conjunto de datos select_if() es un función de la librería dplyr Se analizan dos variables numéricas:
- hours.per.week
- age education.num es un valor numérico del factor education por lo que se analiza en las variables tipo factor mas adelante
numericas <-select_if(datos, is.numeric)
kable(summary(numericas[-1]))
|
Min. :17.00 |
Min. : 1.00 |
Min. : 1.00 |
|
1st Qu.:28.00 |
1st Qu.: 9.00 |
1st Qu.:40.00 |
|
Median :37.00 |
Median :10.00 |
Median :40.00 |
|
Mean :38.64 |
Mean :10.08 |
Mean :40.42 |
|
3rd Qu.:48.00 |
3rd Qu.:12.00 |
3rd Qu.:45.00 |
|
Max. :90.00 |
Max. :16.00 |
Max. :99.00 |
distribucion <- fdt(numericas$hours.per.week,breaks="Sturges")
kable(distribucion)
| [0.99,6.8) |
410 |
0.0083944 |
0.8394415 |
410 |
0.8394415 |
| [6.8,13) |
982 |
0.0201056 |
2.0105647 |
1392 |
2.8500061 |
| [13,18) |
1180 |
0.0241595 |
2.4159535 |
2572 |
5.2659596 |
| [18,24) |
2383 |
0.0487900 |
4.8789976 |
4955 |
10.1449572 |
| [24,30) |
2896 |
0.0592932 |
5.9293231 |
7851 |
16.0742803 |
| [30,36) |
2481 |
0.0507964 |
5.0796446 |
10332 |
21.1539249 |
| [36,42) |
24217 |
0.4958233 |
49.5823267 |
34549 |
70.7362516 |
| [42,48) |
3803 |
0.0778633 |
7.7863314 |
38352 |
78.5225830 |
| [48,53) |
5319 |
0.1089022 |
10.8902174 |
43671 |
89.4128005 |
| [53,59) |
1318 |
0.0269850 |
2.6984972 |
44989 |
92.1112977 |
| [59,65) |
2596 |
0.0531510 |
5.3150977 |
47585 |
97.4263953 |
| [65,71) |
483 |
0.0098890 |
0.9889030 |
48068 |
98.4152983 |
| [71,77) |
223 |
0.0045657 |
0.4565743 |
48291 |
98.8718726 |
| [77,83) |
237 |
0.0048524 |
0.4852381 |
48528 |
99.3571107 |
| [83,88) |
98 |
0.0020065 |
0.2006470 |
48626 |
99.5577577 |
| [88,94) |
52 |
0.0010647 |
0.1064657 |
48678 |
99.6642234 |
| [94,1e+02) |
164 |
0.0033578 |
0.3357766 |
48842 |
100.0000000 |
|
| start |
0.990000 |
| end |
99.990000 |
| h |
5.823529 |
| right |
0.000000 |
|
escalados <- datos[-1] %>%
mutate(age.scale = rescale(age),educational.num.scale = rescale(educational.num), hours.per.week.scale = rescale(hours.per.week) )
head(escalados, 10)
## # A tibble: 10 x 12
## age workclass education educational.num marital.status race gender
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 25 Private 11th 7 Never-married Black Male
## 2 38 Private HS-grad 9 Married-civ-s… White Male
## 3 28 Local-gov Assoc-ac… 12 Married-civ-s… White Male
## 4 44 Private Some-col… 10 Married-civ-s… Black Male
## 5 18 ? Some-col… 10 Never-married White Female
## 6 34 Private 10th 6 Never-married White Male
## 7 29 ? HS-grad 9 Never-married Black Male
## 8 63 Self-emp… Prof-sch… 15 Married-civ-s… White Male
## 9 24 Private Some-col… 10 Never-married White Female
## 10 55 Private 7th-8th 4 Married-civ-s… White Male
## # … with 5 more variables: hours.per.week <dbl>, income <chr>, age.scale <dbl>,
## # educational.num.scale <dbl>, hours.per.week.scale <dbl>
tail(escalados, 10)
## # A tibble: 10 x 12
## age workclass education educational.num marital.status race gender
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 32 Private 10th 6 Married-civ-s… Amer… Male
## 2 43 Private Assoc-voc 11 Married-civ-s… White Male
## 3 32 Private Masters 14 Never-married Asia… Male
## 4 53 Private Masters 14 Married-civ-s… White Male
## 5 22 Private Some-col… 10 Never-married White Male
## 6 27 Private Assoc-ac… 12 Married-civ-s… White Female
## 7 40 Private HS-grad 9 Married-civ-s… White Male
## 8 58 Private HS-grad 9 Widowed White Female
## 9 22 Private HS-grad 9 Never-married White Male
## 10 52 Self-emp… HS-grad 9 Married-civ-s… White Female
## # … with 5 more variables: hours.per.week <dbl>, income <chr>, age.scale <dbl>,
## # educational.num.scale <dbl>, hours.per.week.scale <dbl>
recategorizados <- escalados %>%
mutate(education = factor(ifelse(education == "Preschool" | education == "10th" | education == "11th" | education == "12th" | education == "1st-4th" | education == "5th-6th" | education == "7th-8th" | education == "9th", "Dropout", ifelse(education == "HS-grad", "HighGrad", ifelse(education == "Some-college" | education == "Assoc-acdm" | education == "Assoc-voc", "Community",ifelse(education == "Bachelors", "Bachelors",
ifelse(education == "Masters" | education == "Prof-school", "Master", "PhD")))))))
kable(head(recategorizados))
| 25 |
Private |
Dropout |
7 |
Never-married |
Black |
Male |
40 |
<=50K |
0.1095890 |
0.4000000 |
0.3979592 |
| 38 |
Private |
HighGrad |
9 |
Married-civ-spouse |
White |
Male |
50 |
<=50K |
0.2876712 |
0.5333333 |
0.5000000 |
| 28 |
Local-gov |
Community |
12 |
Married-civ-spouse |
White |
Male |
40 |
>50K |
0.1506849 |
0.7333333 |
0.3979592 |
| 44 |
Private |
Community |
10 |
Married-civ-spouse |
Black |
Male |
40 |
>50K |
0.3698630 |
0.6000000 |
0.3979592 |
| 18 |
? |
Community |
10 |
Never-married |
White |
Female |
30 |
<=50K |
0.0136986 |
0.6000000 |
0.2959184 |
| 34 |
Private |
Dropout |
6 |
Never-married |
White |
Male |
30 |
<=50K |
0.2328767 |
0.3333333 |
0.2959184 |
recategorizados %>%
group_by(education) %>%
summarize(promedio_educacion = mean(educational.num),
cuantos = n()) %>%
arrange(promedio_educacion)
## # A tibble: 6 x 3
## education promedio_educacion cuantos
## <fct> <dbl> <int>
## 1 Dropout 5.61 6408
## 2 HighGrad 9 15784
## 3 Community 10.4 14540
## 4 Bachelors 13 8025
## 5 Master 14.2 3491
## 6 PhD 16 594
temporal <- recategorizados %>%
mutate(marital.status=factor(ifelse(marital.status=="Never-married" | marital.status=="Married-spouse-absent","Not_married",ifelse(marital.status == "Married-civ-spouse" | marital.status=="Married-AF-spouse","Married",ifelse(marital.status=="Divorced" | marital.status=="Separated","Separated","Widow")))))
recategorizados <- temporal
kable(head(recategorizados))
| 25 |
Private |
Dropout |
7 |
Not_married |
Black |
Male |
40 |
<=50K |
0.1095890 |
0.4000000 |
0.3979592 |
| 38 |
Private |
HighGrad |
9 |
Married |
White |
Male |
50 |
<=50K |
0.2876712 |
0.5333333 |
0.5000000 |
| 28 |
Local-gov |
Community |
12 |
Married |
White |
Male |
40 |
>50K |
0.1506849 |
0.7333333 |
0.3979592 |
| 44 |
Private |
Community |
10 |
Married |
Black |
Male |
40 |
>50K |
0.3698630 |
0.6000000 |
0.3979592 |
| 18 |
? |
Community |
10 |
Not_married |
White |
Female |
30 |
<=50K |
0.0136986 |
0.6000000 |
0.2959184 |
| 34 |
Private |
Dropout |
6 |
Not_married |
White |
Male |
30 |
<=50K |
0.2328767 |
0.3333333 |
0.2959184 |
table(recategorizados$marital.status)
##
## Married Not_married Separated Widow
## 22416 16745 8163 1518
ggplot(recategorizados, aes(x = gender, fill = income)) +
geom_bar(position = "fill") +
theme_classic()

ggplot(recategorizados, aes(x = race, fill = income)) +
geom_bar(position = "fill") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))

ggplot(recategorizados, aes(x = gender, y = hours.per.week)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
size = 3,
color = "steelblue") +
theme_classic()

recategorizados <- recategorizados %>%
mutate(income10 = recode(income,"<=50K" = 0,">50K" = 1))
head(recategorizados[,c(9,13)])
## # A tibble: 6 x 2
## income income10
## <chr> <dbl>
## 1 <=50K 0
## 2 <=50K 0
## 3 >50K 1
## 4 >50K 1
## 5 <=50K 0
## 6 <=50K 0
names(recategorizados)
## [1] "age" "workclass" "education"
## [4] "educational.num" "marital.status" "race"
## [7] "gender" "hours.per.week" "income"
## [10] "age.scale" "educational.num.scale" "hours.per.week.scale"
## [13] "income10"
head(recategorizados)
## # A tibble: 6 x 13
## age workclass education educational.num marital.status race gender
## <dbl> <chr> <fct> <dbl> <fct> <chr> <chr>
## 1 25 Private Dropout 7 Not_married Black Male
## 2 38 Private HighGrad 9 Married White Male
## 3 28 Local-gov Community 12 Married White Male
## 4 44 Private Community 10 Married Black Male
## 5 18 ? Community 10 Not_married White Female
## 6 34 Private Dropout 6 Not_married White Male
## # … with 6 more variables: hours.per.week <dbl>, income <chr>, age.scale <dbl>,
## # educational.num.scale <dbl>, hours.per.week.scale <dbl>, income10 <dbl>
write.csv(recategorizados, file="adultos_clean.csv")
dir()
## [1] "adultos_clean.csv"
## [2] "Regresión logística para predecir ingresos de personas.Rmd"
## [3] "Regresión-logística-para-predecir-ingresos-de-personas.html"
## [4] "Regresión-logística-para-predecir-ingresos-de-personas.Rmd"
nrow(recategorizados)
## [1] 48842
set.seed(2020)
entrena <- createDataPartition(recategorizados$income, p=0.7, list = FALSE)
datos.Entrena <- recategorizados[entrena,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Entrena)
## [1] 34190
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Validacion)
## [1] 14652
formula = income10 ~ age.scale + workclass + education + marital.status + race + gender + hours.per.week.scale
modelo <- glm(formula, data = datos.Entrena, family = 'binomial')
summary(modelo)
##
## Call:
## glm(formula = formula, family = "binomial", data = datos.Entrena)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7337 -0.5768 -0.2588 -0.0654 3.3492
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.419e+00 2.228e-01 -10.858 < 2e-16 ***
## age.scale 2.224e+00 1.053e-01 21.121 < 2e-16 ***
## workclassFederal-gov 1.421e+00 1.237e-01 11.485 < 2e-16 ***
## workclassLocal-gov 6.942e-01 1.100e-01 6.312 2.76e-10 ***
## workclassNever-worked -8.124e+00 1.042e+02 -0.078 0.9379
## workclassPrivate 8.124e-01 9.598e-02 8.464 < 2e-16 ***
## workclassSelf-emp-inc 1.218e+00 1.186e-01 10.270 < 2e-16 ***
## workclassSelf-emp-not-inc 1.878e-01 1.071e-01 1.753 0.0797 .
## workclassState-gov 5.339e-01 1.223e-01 4.367 1.26e-05 ***
## workclassWithout-pay -3.965e-01 8.276e-01 -0.479 0.6318
## educationCommunity -9.930e-01 4.428e-02 -22.426 < 2e-16 ***
## educationDropout -2.782e+00 7.802e-02 -35.657 < 2e-16 ***
## educationHighGrad -1.611e+00 4.523e-02 -35.610 < 2e-16 ***
## educationMaster 6.250e-01 6.110e-02 10.230 < 2e-16 ***
## educationPhD 1.077e+00 1.379e-01 7.814 5.55e-15 ***
## marital.statusNot_married -2.491e+00 5.355e-02 -46.511 < 2e-16 ***
## marital.statusSeparated -2.102e+00 5.650e-02 -37.214 < 2e-16 ***
## marital.statusWidow -2.163e+00 1.287e-01 -16.809 < 2e-16 ***
## raceAsian-Pac-Islander -2.461e-02 2.074e-01 -0.119 0.9055
## raceBlack 4.784e-04 1.968e-01 0.002 0.9981
## raceOther -9.881e-02 2.817e-01 -0.351 0.7258
## raceWhite 2.155e-01 1.876e-01 1.148 0.2509
## genderMale 9.432e-02 4.455e-02 2.117 0.0342 *
## hours.per.week.scale 3.136e+00 1.398e-01 22.430 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37626 on 34189 degrees of freedom
## Residual deviance: 25011 on 34166 degrees of freedom
## AIC: 25059
##
## Number of Fisher Scoring iterations: 11
comparar <- data.frame(datos.Entrena$income10, as.vector(modelo$fitted.values) )
comparar <- comparar %>%
mutate(income10ajustados = if_else (modelo$fitted.values > 0.5, 1, 0))
colnames(comparar) <- c("income10", "ajuste", 'income10ajustados')
head(comparar)
## income10 ajuste income10ajustados
## 1 0 0.005003194 0
## 2 0 0.331904483 0
## 3 1 0.304736019 0
## 4 1 0.393130183 0
## 5 0 0.008761445 0
## 6 0 0.008068021 0
matriz_confusion <- table(comparar$income10, comparar$income10ajustados, dnn = c("income10", "income10ajustados para predicciones"))
matriz_confusion
## income10ajustados para predicciones
## income10 0 1
## 0 24107 1902
## 1 4016 4165