library(tidyverse) # varias
library(dplyr) # select filter mutate ...
library(ggplot2) # Gráficas
library(fdth) # Para tablas de distribución y frecuencias
library(knitr) # Para ver tablas mas amigables en formato html markdown
library(caret) # Pra particionar datos
library(reshape) # Para renombrar columnas en caso de necesitarse
library(scales) # Para escalar datos
setwd("~/Mis clases ITD/Semestre Enero Junio 2020/Analisis Inteligente de Datos/Mis Proyectos")
datos <- read.csv("datos/adultos.csv")
kable(head(datos, 10))
| x | age | workclass | education | educational.num | marital.status | race | gender | hours.per.week | income |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 25 | Private | 11th | 7 | Never-married | Black | Male | 40 | <=50K |
| 2 | 38 | Private | HS-grad | 9 | Married-civ-spouse | White | Male | 50 | <=50K |
| 3 | 28 | Local-gov | Assoc-acdm | 12 | Married-civ-spouse | White | Male | 40 | >50K |
| 4 | 44 | Private | Some-college | 10 | Married-civ-spouse | Black | Male | 40 | >50K |
| 5 | 18 | ? | Some-college | 10 | Never-married | White | Female | 30 | <=50K |
| 6 | 34 | Private | 10th | 6 | Never-married | White | Male | 30 | <=50K |
| 7 | 29 | ? | HS-grad | 9 | Never-married | Black | Male | 40 | <=50K |
| 8 | 63 | Self-emp-not-inc | Prof-school | 15 | Married-civ-spouse | White | Male | 32 | >50K |
| 9 | 24 | Private | Some-college | 10 | Never-married | White | Female | 40 | <=50K |
| 10 | 55 | Private | 7th-8th | 4 | Married-civ-spouse | White | Male | 10 | <=50K |
kable(tail(datos,10))
| x | age | workclass | education | educational.num | marital.status | race | gender | hours.per.week | income | |
|---|---|---|---|---|---|---|---|---|---|---|
| 48833 | 48833 | 32 | Private | 10th | 6 | Married-civ-spouse | Amer-Indian-Eskimo | Male | 40 | <=50K |
| 48834 | 48834 | 43 | Private | Assoc-voc | 11 | Married-civ-spouse | White | Male | 45 | <=50K |
| 48835 | 48835 | 32 | Private | Masters | 14 | Never-married | Asian-Pac-Islander | Male | 11 | <=50K |
| 48836 | 48836 | 53 | Private | Masters | 14 | Married-civ-spouse | White | Male | 40 | >50K |
| 48837 | 48837 | 22 | Private | Some-college | 10 | Never-married | White | Male | 40 | <=50K |
| 48838 | 48838 | 27 | Private | Assoc-acdm | 12 | Married-civ-spouse | White | Female | 38 | <=50K |
| 48839 | 48839 | 40 | Private | HS-grad | 9 | Married-civ-spouse | White | Male | 40 | >50K |
| 48840 | 48840 | 58 | Private | HS-grad | 9 | Widowed | White | Female | 40 | <=50K |
| 48841 | 48841 | 22 | Private | HS-grad | 9 | Never-married | White | Male | 20 | <=50K |
| 48842 | 48842 | 52 | Self-emp-inc | HS-grad | 9 | Married-civ-spouse | White | Female | 40 | >50K |
str(datos)
## 'data.frame': 48842 obs. of 10 variables:
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ educational.num: int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status : Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ hours.per.week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
kable(summary(datos[-1]))
| age | workclass | education | educational.num | marital.status | race | gender | hours.per.week | income | |
|---|---|---|---|---|---|---|---|---|---|
| Min. :17.00 | Private :33906 | HS-grad :15784 | Min. : 1.00 | Divorced : 6633 | Amer-Indian-Eskimo: 470 | Female:16192 | Min. : 1.00 | <=50K:37155 | |
| 1st Qu.:28.00 | Self-emp-not-inc: 3862 | Some-college:10878 | 1st Qu.: 9.00 | Married-AF-spouse : 37 | Asian-Pac-Islander: 1519 | Male :32650 | 1st Qu.:40.00 | >50K :11687 | |
| Median :37.00 | Local-gov : 3136 | Bachelors : 8025 | Median :10.00 | Married-civ-spouse :22379 | Black : 4685 | NA | Median :40.00 | NA | |
| Mean :38.64 | ? : 2799 | Masters : 2657 | Mean :10.08 | Married-spouse-absent: 628 | Other : 406 | NA | Mean :40.42 | NA | |
| 3rd Qu.:48.00 | State-gov : 1981 | Assoc-voc : 2061 | 3rd Qu.:12.00 | Never-married :16117 | White :41762 | NA | 3rd Qu.:45.00 | NA | |
| Max. :90.00 | Self-emp-inc : 1695 | 11th : 1812 | Max. :16.00 | Separated : 1530 | NA | NA | Max. :99.00 | NA | |
| NA | (Other) : 1463 | (Other) : 7625 | NA | Widowed : 1518 | NA | NA | NA | NA |
De los pasos 1 al 4 se pueden integrar en fases de ciencia de los datos como de carga, limpieza y exploración de los datos.
A partir del paso 5 se construye un modelo de regresión lógistica para predicciones.
paso 1: Identificar variables numéricas
paso 2: Identificar variables factor
paso 3: Ingeniería de datos
paso 4: Estadísticos descriptivos
paso 5: Conjunto de datos de entrenamiento y de validación Train/test set
paso 6: Modelo de regresión logística
paso 7: Evaluar el modelo
paso 8: Predicciones con datos de entrenamiento
numericas <-select_if(datos, is.numeric)
kable(summary(numericas[-1]))
| age | educational.num | hours.per.week | |
|---|---|---|---|
| Min. :17.00 | Min. : 1.00 | Min. : 1.00 | |
| 1st Qu.:28.00 | 1st Qu.: 9.00 | 1st Qu.:40.00 | |
| Median :37.00 | Median :10.00 | Median :40.00 | |
| Mean :38.64 | Mean :10.08 | Mean :40.42 | |
| 3rd Qu.:48.00 | 3rd Qu.:12.00 | 3rd Qu.:45.00 | |
| Max. :90.00 | Max. :16.00 | Max. :99.00 |
ggplot(numericas, aes(x = hours.per.week)) +
geom_density(alpha = .2, fill = "#FF6666")
distribucion <- fdt(numericas$hours.per.week,breaks="Sturges")
kable(distribucion)
|
|
barplot(height = distribucion$table$f, names.arg = distribucion$table$`Class limits`)
ggplot(numericas, aes(x = age)) +
geom_density(alpha = .2, fill = "#FF6666")
distribucion <- fdt(numericas$age, breaks="Sturges")
kable(distribucion)
|
|
barplot(height = distribucion$table$f, names.arg = distribucion$table$`Class limits`)
escalados <- datos[-1] %>%
mutate(age.scale = rescale(age),educational.num.scale = rescale(educational.num), hours.per.week.scale = rescale(hours.per.week) )
head(escalados, 10)
## age workclass education educational.num marital.status race
## 1 25 Private 11th 7 Never-married Black
## 2 38 Private HS-grad 9 Married-civ-spouse White
## 3 28 Local-gov Assoc-acdm 12 Married-civ-spouse White
## 4 44 Private Some-college 10 Married-civ-spouse Black
## 5 18 ? Some-college 10 Never-married White
## 6 34 Private 10th 6 Never-married White
## 7 29 ? HS-grad 9 Never-married Black
## 8 63 Self-emp-not-inc Prof-school 15 Married-civ-spouse White
## 9 24 Private Some-college 10 Never-married White
## 10 55 Private 7th-8th 4 Married-civ-spouse White
## gender hours.per.week income age.scale educational.num.scale
## 1 Male 40 <=50K 0.10958904 0.4000000
## 2 Male 50 <=50K 0.28767123 0.5333333
## 3 Male 40 >50K 0.15068493 0.7333333
## 4 Male 40 >50K 0.36986301 0.6000000
## 5 Female 30 <=50K 0.01369863 0.6000000
## 6 Male 30 <=50K 0.23287671 0.3333333
## 7 Male 40 <=50K 0.16438356 0.5333333
## 8 Male 32 >50K 0.63013699 0.9333333
## 9 Female 40 <=50K 0.09589041 0.6000000
## 10 Male 10 <=50K 0.52054795 0.2000000
## hours.per.week.scale
## 1 0.39795918
## 2 0.50000000
## 3 0.39795918
## 4 0.39795918
## 5 0.29591837
## 6 0.29591837
## 7 0.39795918
## 8 0.31632653
## 9 0.39795918
## 10 0.09183673
tail(escalados, 10)
## age workclass education educational.num marital.status
## 48833 32 Private 10th 6 Married-civ-spouse
## 48834 43 Private Assoc-voc 11 Married-civ-spouse
## 48835 32 Private Masters 14 Never-married
## 48836 53 Private Masters 14 Married-civ-spouse
## 48837 22 Private Some-college 10 Never-married
## 48838 27 Private Assoc-acdm 12 Married-civ-spouse
## 48839 40 Private HS-grad 9 Married-civ-spouse
## 48840 58 Private HS-grad 9 Widowed
## 48841 22 Private HS-grad 9 Never-married
## 48842 52 Self-emp-inc HS-grad 9 Married-civ-spouse
## race gender hours.per.week income age.scale
## 48833 Amer-Indian-Eskimo Male 40 <=50K 0.20547945
## 48834 White Male 45 <=50K 0.35616438
## 48835 Asian-Pac-Islander Male 11 <=50K 0.20547945
## 48836 White Male 40 >50K 0.49315068
## 48837 White Male 40 <=50K 0.06849315
## 48838 White Female 38 <=50K 0.13698630
## 48839 White Male 40 >50K 0.31506849
## 48840 White Female 40 <=50K 0.56164384
## 48841 White Male 20 <=50K 0.06849315
## 48842 White Female 40 >50K 0.47945205
## educational.num.scale hours.per.week.scale
## 48833 0.3333333 0.3979592
## 48834 0.6666667 0.4489796
## 48835 0.8666667 0.1020408
## 48836 0.8666667 0.3979592
## 48837 0.6000000 0.3979592
## 48838 0.7333333 0.3775510
## 48839 0.5333333 0.3979592
## 48840 0.5333333 0.3979592
## 48841 0.5333333 0.1938776
## 48842 0.5333333 0.3979592
factores <- data.frame(select_if(datos, is.factor))
summary(factores)
## workclass education marital.status
## Private :33906 HS-grad :15784 Divorced : 6633
## Self-emp-not-inc: 3862 Some-college:10878 Married-AF-spouse : 37
## Local-gov : 3136 Bachelors : 8025 Married-civ-spouse :22379
## ? : 2799 Masters : 2657 Married-spouse-absent: 628
## State-gov : 1981 Assoc-voc : 2061 Never-married :16117
## Self-emp-inc : 1695 11th : 1812 Separated : 1530
## (Other) : 1463 (Other) : 7625 Widowed : 1518
## race gender income
## Amer-Indian-Eskimo: 470 Female:16192 <=50K:37155
## Asian-Pac-Islander: 1519 Male :32650 >50K :11687
## Black : 4685
## Other : 406
## White :41762
##
##
kable(summary(factores) )
| workclass | education | marital.status | race | gender i | ncome | |
|---|---|---|---|---|---|---|
| Private :33906 | HS-grad :15784 | Divorced : 6633 | Amer-Indian-Eskimo: 470 | Female:16192 | <=50K:37155 | |
| Self-emp-not-inc: 3862 | Some-college:10878 | Married-AF-spouse : 37 | Asian-Pac-Islander: 1519 | Male :32650 | >50K :11687 | |
| Local-gov : 3136 | Bachelors : 8025 | Married-civ-spouse :22379 | Black : 4685 | NA | NA | |
| ? : 2799 | Masters : 2657 | Married-spouse-absent: 628 | Other : 406 | NA | NA | |
| State-gov : 1981 | Assoc-voc : 2061 | Never-married :16117 | White :41762 | NA | NA | |
| Self-emp-inc : 1695 | 11th : 1812 | Separated : 1530 | NA | NA | NA | |
| (Other) : 1463 | (Other) : 7625 | Widowed : 1518 | NA | NA | NA |
str(factores)
## 'data.frame': 48842 obs. of 6 variables:
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
distribucion <- fdt_cat(factores)
kable(distribucion$workclass)
|
barplot(height = distribucion$workclass$table$f, names.arg = distribucion$workclass$table$Category)
kable(distribucion$education)
|
barplot(height = distribucion$education$table$f, names.arg = distribucion$education$table$Category)
kable(distribucion$marital.status)
|
barplot(height = distribucion$marital.status$table$f, names.arg = distribucion$marital.status$table$Category)
kable(distribucion$race)
|
barplot(height = distribucion$race$table$f, names.arg = distribucion$race$table$Category)
kable(distribucion$gender)
|
barplot(height = distribucion$gender$table$f, names.arg = distribucion$gender$table$Category)
kable(distribucion$income)
|
barplot(height = distribucion$income$table$f, names.arg = distribucion$income$table$Category)
recategorizados <- escalados %>%
mutate(education = factor(ifelse(education == "Preschool" | education == "10th" | education == "11th" | education == "12th" | education == "1st-4th" | education == "5th-6th" | education == "7th-8th" | education == "9th", "Dropout", ifelse(education == "HS-grad", "HighGrad", ifelse(education == "Some-college" | education == "Assoc-acdm" | education == "Assoc-voc", "Community",ifelse(education == "Bachelors", "Bachelors",
ifelse(education == "Masters" | education == "Prof-school", "Master", "PhD")))))))
kable(head(recategorizados))
| age w | orkclass e | ducation | educational.num m | arital.status r | ace g | ender | hours.per.week i | ncome | age.scale | educational.num.scale | hours.per.week.scale |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 25 | Private | Dropout | 7 | Never-married | Black | Male | 40 | <=50K | 0.1095890 | 0.4000000 | 0.3979592 |
| 38 | Private | HighGrad | 9 | Married-civ-spouse | White | Male | 50 | <=50K | 0.2876712 | 0.5333333 | 0.5000000 |
| 28 | Local-gov | Community | 12 | Married-civ-spouse | White | Male | 40 | >50K | 0.1506849 | 0.7333333 | 0.3979592 |
| 44 | Private | Community | 10 | Married-civ-spouse | Black | Male | 40 | >50K | 0.3698630 | 0.6000000 | 0.3979592 |
| 18 | ? | Community | 10 | Never-married | White | Female | 30 | <=50K | 0.0136986 | 0.6000000 | 0.2959184 |
| 34 | Private | Dropout | 6 | Never-married | White | Male | 30 | <=50K | 0.2328767 | 0.3333333 | 0.2959184 |
recategorizados %>%
group_by(education) %>%
summarize(promedio_educacion = mean(educational.num),
cuantos = n()) %>%
arrange(promedio_educacion)
## # A tibble: 6 x 3
## education promedio_educacion cuantos
## <fct> <dbl> <int>
## 1 Dropout 5.61 6408
## 2 HighGrad 9 15784
## 3 Community 10.4 14540
## 4 Bachelors 13 8025
## 5 Master 14.2 3491
## 6 PhD 16 594
temporal <- recategorizados %>%
mutate(marital.status=factor(ifelse(marital.status=="Never-married" | marital.status=="Married-spouse-absent","Not_married",ifelse(marital.status == "Married-civ-spouse" | marital.status=="Married-AF-spouse","Married",ifelse(marital.status=="Divorced" | marital.status=="Separated","Separated","Widow")))))
recategorizados <- temporal
kable(head(recategorizados))
| age w | orkclass e | ducation | educational.num m | arital.status r | ace g | ender | hours.per.week i | ncome | age.scale | educational.num.scale | hours.per.week.scale |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 25 | Private | Dropout | 7 | Not_married | Black | Male | 40 | <=50K | 0.1095890 | 0.4000000 | 0.3979592 |
| 38 | Private | HighGrad | 9 | Married | White | Male | 50 | <=50K | 0.2876712 | 0.5333333 | 0.5000000 |
| 28 | Local-gov | Community | 12 | Married | White | Male | 40 | >50K | 0.1506849 | 0.7333333 | 0.3979592 |
| 44 | Private | Community | 10 | Married | Black | Male | 40 | >50K | 0.3698630 | 0.6000000 | 0.3979592 |
| 18 | ? | Community | 10 | Not_married | White | Female | 30 | <=50K | 0.0136986 | 0.6000000 | 0.2959184 |
| 34 | Private | Dropout | 6 | Not_married | White | Male | 30 | <=50K | 0.2328767 | 0.3333333 | 0.2959184 |
table(recategorizados$marital.status)
##
## Married Not_married Separated Widow
## 22416 16745 8163 1518
ggplot(recategorizados, aes(x = gender, fill = income)) +
geom_bar(position = "fill") +
theme_classic()
ggplot(recategorizados, aes(x = race, fill = income)) +
geom_bar(position = "fill") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
head(recategorizados)
## age workclass education educational.num marital.status race gender
## 1 25 Private Dropout 7 Not_married Black Male
## 2 38 Private HighGrad 9 Married White Male
## 3 28 Local-gov Community 12 Married White Male
## 4 44 Private Community 10 Married Black Male
## 5 18 ? Community 10 Not_married White Female
## 6 34 Private Dropout 6 Not_married White Male
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 40 <=50K 0.10958904 0.4000000 0.3979592
## 2 50 <=50K 0.28767123 0.5333333 0.5000000
## 3 40 >50K 0.15068493 0.7333333 0.3979592
## 4 40 >50K 0.36986301 0.6000000 0.3979592
## 5 30 <=50K 0.01369863 0.6000000 0.2959184
## 6 30 <=50K 0.23287671 0.3333333 0.2959184
ggplot(recategorizados, aes(x = gender, y = hours.per.week)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
size = 3,
color = "steelblue") +
theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.
ggplot(recategorizados, aes(x = hours.per.week)) +
geom_density(aes(color = education), alpha = 0.5) +
theme_classic()
ggplot(recategorizados, aes(x = age, y = hours.per.week)) +
geom_point(aes(color = income),
size = 0.5) +
stat_smooth(method = 'lm',
formula = y~poly(x, 2),
se = TRUE,
aes(color = income)) +
theme_classic()
ggplot(recategorizados, aes(x = education, y = hours.per.week)) +
geom_point(aes(color = income),
size = 0.5) +
stat_smooth(method = 'lm',
formula = y~poly(x, 2),
se = TRUE,
aes(color = income)) +
theme_classic()
recategorizados <- recategorizados %>%
mutate(income10 = recode(income,"<=50K" = 0,">50K" = 1))
head(recategorizados[,c(9,13)])
## income income10
## 1 <=50K 0
## 2 <=50K 0
## 3 >50K 1
## 4 >50K 1
## 5 <=50K 0
## 6 <=50K 0
names(recategorizados)
## [1] "age" "workclass" "education"
## [4] "educational.num" "marital.status" "race"
## [7] "gender" "hours.per.week" "income"
## [10] "age.scale" "educational.num.scale" "hours.per.week.scale"
## [13] "income10"
head(recategorizados)
## age workclass education educational.num marital.status race gender
## 1 25 Private Dropout 7 Not_married Black Male
## 2 38 Private HighGrad 9 Married White Male
## 3 28 Local-gov Community 12 Married White Male
## 4 44 Private Community 10 Married Black Male
## 5 18 ? Community 10 Not_married White Female
## 6 34 Private Dropout 6 Not_married White Male
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 40 <=50K 0.10958904 0.4000000 0.3979592
## 2 50 <=50K 0.28767123 0.5333333 0.5000000
## 3 40 >50K 0.15068493 0.7333333 0.3979592
## 4 40 >50K 0.36986301 0.6000000 0.3979592
## 5 30 <=50K 0.01369863 0.6000000 0.2959184
## 6 30 <=50K 0.23287671 0.3333333 0.2959184
## income10
## 1 0
## 2 0
## 3 1
## 4 1
## 5 0
## 6 0
write.csv(recategorizados, file="adultos_clean.csv")
dir()
## [1] "adultos_clean.csv"
## [2] "Analisis-de-regresion-logistica-de-Datos-adultos.html"
## [3] "Analisis-de-regresion-logistica-de-Datos-adultos.Rmd"
## [4] "Analisis-de-regresion-logistica-de-Datos-adultos_files"
## [5] "Analisis de regresion logistica de Datos adultos.Rmd"
## [6] "matriz confusion.jpg"
## [7] "probabilidad de regresion logistica.jpg"
## [8] "ProbandoMarkdownShinyMDS.Rmd"
## [9] "rsconnect"
nrow(recategorizados)
## [1] 48842
set.seed(2020)
entrena <- createDataPartition(recategorizados$income, p=0.7, list = FALSE)
datos.Entrena <- recategorizados[entrena,]
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Entrena)
## [1] 34190
kable(head(datos.Entrena))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 25 | Private | Dropout | 7 | Not_married | Black | Male | 40 | <=50K | 0.1095890 | 0.4000000 | 0.3979592 | 0 |
| 2 | 38 | Private | HighGrad | 9 | Married | White | Male | 50 | <=50K | 0.2876712 | 0.5333333 | 0.5000000 | 0 |
| 3 | 28 | Local-gov | Community | 12 | Married | White | Male | 40 | >50K | 0.1506849 | 0.7333333 | 0.3979592 | 1 |
| 4 | 44 | Private | Community | 10 | Married | Black | Male | 40 | >50K | 0.3698630 | 0.6000000 | 0.3979592 | 1 |
| 5 | 18 | ? | Community | 10 | Not_married | White | Female | 30 | <=50K | 0.0136986 | 0.6000000 | 0.2959184 | 0 |
| 7 | 29 | ? | HighGrad | 9 | Not_married | Black | Male | 40 | <=50K | 0.1643836 | 0.5333333 | 0.3979592 | 0 |
kable(tail(datos.Entrena))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48834 | 43 | Private | Community | 11 | Married | White | Male | 45 | <=50K | 0.3561644 | 0.6666667 | 0.4489796 | 0 |
| 48835 | 32 | Private | Master | 14 | Not_married | Asian-Pac-Islander | Male | 11 | <=50K | 0.2054795 | 0.8666667 | 0.1020408 | 0 |
| 48838 | 27 | Private | Community | 12 | Married | White | Female | 38 | <=50K | 0.1369863 | 0.7333333 | 0.3775510 | 0 |
| 48840 | 58 | Private | HighGrad | 9 | Widow | White | Female | 40 | <=50K | 0.5616438 | 0.5333333 | 0.3979592 | 0 |
| 48841 | 22 | Private | HighGrad | 9 | Not_married | White | Male | 20 | <=50K | 0.0684932 | 0.5333333 | 0.1938776 | 0 |
| 48842 | 52 | Self-emp-inc | HighGrad | 9 | Married | White | Female | 40 | >50K | 0.4794521 | 0.5333333 | 0.3979592 | 1 |
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Validacion)
## [1] 14652
kable(head(datos.Validacion))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 34 | Private | Dropout | 6 | Not_married | White | Male | 30 | <=50K | 0.2328767 | 0.3333333 | 0.2959184 | 0 |
| 15 | 48 | Private | HighGrad | 9 | Married | White | Male | 48 | >50K | 0.4246575 | 0.5333333 | 0.4795918 | 1 |
| 17 | 20 | State-gov | Community | 10 | Not_married | White | Male | 25 | <=50K | 0.0410959 | 0.6000000 | 0.2448980 | 0 |
| 36 | 65 | ? | HighGrad | 9 | Married | White | Male | 40 | <=50K | 0.6575342 | 0.5333333 | 0.3979592 | 0 |
| 41 | 65 | Private | Master | 14 | Married | White | Male | 50 | >50K | 0.6575342 | 0.8666667 | 0.5000000 | 1 |
| 49 | 52 | Private | Dropout | 7 | Separated | Black | Female | 18 | <=50K | 0.4794521 | 0.4000000 | 0.1734694 | 0 |
kable(tail(datos.Validacion))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48826 | 31 | Private | Master | 14 | Separated | Other | Female | 30 | <=50K | 0.1917808 | 0.8666667 | 0.2959184 | 0 |
| 48830 | 65 | Self-emp-not-inc | Master | 15 | Not_married | White | Male | 60 | <=50K | 0.6575342 | 0.9333333 | 0.6020408 | 0 |
| 48832 | 43 | Self-emp-not-inc | Community | 10 | Married | White | Male | 50 | <=50K | 0.3561644 | 0.6000000 | 0.5000000 | 0 |
| 48836 | 53 | Private | Master | 14 | Married | White | Male | 40 | >50K | 0.4931507 | 0.8666667 | 0.3979592 | 1 |
| 48837 | 22 | Private | Community | 10 | Not_married | White | Male | 40 | <=50K | 0.0684932 | 0.6000000 | 0.3979592 | 0 |
| 48839 | 40 | Private | HighGrad | 9 | Married | White | Male | 40 | >50K | 0.3150685 | 0.5333333 | 0.3979592 | 1 |
Con la regresión logística, dado un conjunto particular de valores de las variables independientes elegidas, se estima la probabilidad de los ingresos de una persona ‘<=50’ o ‘>50’
formula = income10 ~ age.scale + workclass + education + marital.status + race + gender + hours.per.week.scale
modelo <- glm(formula, data = datos.Entrena, family = 'binomial')
summary(modelo)
##
## Call:
## glm(formula = formula, family = "binomial", data = datos.Entrena)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7337 -0.5768 -0.2588 -0.0654 3.3492
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.419e+00 2.228e-01 -10.858 < 2e-16 ***
## age.scale 2.224e+00 1.053e-01 21.121 < 2e-16 ***
## workclassFederal-gov 1.421e+00 1.237e-01 11.485 < 2e-16 ***
## workclassLocal-gov 6.942e-01 1.100e-01 6.312 2.76e-10 ***
## workclassNever-worked -8.124e+00 1.042e+02 -0.078 0.9379
## workclassPrivate 8.124e-01 9.598e-02 8.464 < 2e-16 ***
## workclassSelf-emp-inc 1.218e+00 1.186e-01 10.270 < 2e-16 ***
## workclassSelf-emp-not-inc 1.878e-01 1.071e-01 1.753 0.0797 .
## workclassState-gov 5.339e-01 1.223e-01 4.367 1.26e-05 ***
## workclassWithout-pay -3.965e-01 8.276e-01 -0.479 0.6318
## educationCommunity -9.930e-01 4.428e-02 -22.426 < 2e-16 ***
## educationDropout -2.782e+00 7.802e-02 -35.657 < 2e-16 ***
## educationHighGrad -1.611e+00 4.523e-02 -35.610 < 2e-16 ***
## educationMaster 6.250e-01 6.110e-02 10.230 < 2e-16 ***
## educationPhD 1.077e+00 1.379e-01 7.814 5.55e-15 ***
## marital.statusNot_married -2.491e+00 5.355e-02 -46.511 < 2e-16 ***
## marital.statusSeparated -2.102e+00 5.650e-02 -37.214 < 2e-16 ***
## marital.statusWidow -2.163e+00 1.287e-01 -16.809 < 2e-16 ***
## raceAsian-Pac-Islander -2.461e-02 2.074e-01 -0.119 0.9055
## raceBlack 4.784e-04 1.968e-01 0.002 0.9981
## raceOther -9.881e-02 2.817e-01 -0.351 0.7258
## raceWhite 2.155e-01 1.876e-01 1.148 0.2509
## genderMale 9.432e-02 4.455e-02 2.117 0.0342 *
## hours.per.week.scale 3.136e+00 1.398e-01 22.430 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37626 on 34189 degrees of freedom
## Residual deviance: 25011 on 34166 degrees of freedom
## AIC: 25059
##
## Number of Fisher Scoring iterations: 11
comparar <- data.frame(datos.Entrena$income10, as.vector(modelo$fitted.values) )
comparar <- comparar %>%
mutate(income10ajustados = if_else (modelo$fitted.values > 0.5, 1, 0))
colnames(comparar) <- c("income10", "ajuste", 'income10ajustados')
head(comparar)
## income10 ajuste income10ajustados
## 1 0 0.005003194 0
## 2 0 0.331904483 0
## 3 1 0.304736019 0
## 4 1 0.393130183 0
## 5 0 0.008761445 0
## 6 0 0.008068021 0
tail(comparar)
## income10 ajuste income10ajustados
## 34185 0 0.47760555 0
## 34186 0 0.06751504 0
## 34187 0 0.28999271 0
## 34188 0 0.06490387 0
## 34189 0 0.00958675 0
## 34190 1 0.43003777 0
matriz_confusion <- table(comparar$income10, comparar$income10ajustados, dnn = c("income10", "income10ajustados para predicciones"))
matriz_confusion
## income10ajustados para predicciones
## income10 0 1
## 0 24107 1902
## 1 4016 4165
n = nrow(datos.Entrena)
exactidud <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n
exactidud
## [1] 0.8269085
predicciones <- predict(modelo, datos.Validacion, se.fit = TRUE)
head(predicciones$fit)
## 6 15 17 36 41 49
## -5.1235447 -0.4589452 -4.1994476 -1.0094459 2.3586616 -4.8803438
tail(predicciones$fit)
## 48826 48830 48832 48836 48837 48839
## -1.8282163 -0.4364728 -0.5542026 1.6730416 -3.3800099 -0.9586976
predicciones_prob <- exp(predicciones$fit) / (1 + exp(predicciones$fit))
head(predicciones_prob)
## 6 15 17 36 41 49
## 0.005919626 0.387236079 0.014782074 0.267088310 0.913620236 0.007537162
tail(predicciones_prob)
## 48826 48830 48832 48836 48837 48839
## 0.13845090 0.39258175 0.36488993 0.84198092 0.03292608 0.27713903
las.predicciones <- cbind(datos.Validacion, predicciones_prob)
las.predicciones <- las.predicciones %>%
mutate(income10.prediccion = if_else(predicciones_prob > 0.5, 1, 0))
head(las.predicciones)
## age workclass education educational.num marital.status race gender
## 1 34 Private Dropout 6 Not_married White Male
## 2 48 Private HighGrad 9 Married White Male
## 3 20 State-gov Community 10 Not_married White Male
## 4 65 ? HighGrad 9 Married White Male
## 5 65 Private Master 14 Married White Male
## 6 52 Private Dropout 7 Separated Black Female
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 30 <=50K 0.23287671 0.3333333 0.2959184
## 2 48 >50K 0.42465753 0.5333333 0.4795918
## 3 25 <=50K 0.04109589 0.6000000 0.2448980
## 4 40 <=50K 0.65753425 0.5333333 0.3979592
## 5 50 >50K 0.65753425 0.8666667 0.5000000
## 6 18 <=50K 0.47945205 0.4000000 0.1734694
## income10 predicciones_prob income10.prediccion
## 1 0 0.005919626 0
## 2 1 0.387236079 0
## 3 0 0.014782074 0
## 4 0 0.267088310 0
## 5 1 0.913620236 1
## 6 0 0.007537162 0
tail(las.predicciones)
## age workclass education educational.num marital.status race
## 14647 31 Private Master 14 Separated Other
## 14648 65 Self-emp-not-inc Master 15 Not_married White
## 14649 43 Self-emp-not-inc Community 10 Married White
## 14650 53 Private Master 14 Married White
## 14651 22 Private Community 10 Not_married White
## 14652 40 Private HighGrad 9 Married White
## gender hours.per.week income age.scale educational.num.scale
## 14647 Female 30 <=50K 0.19178082 0.8666667
## 14648 Male 60 <=50K 0.65753425 0.9333333
## 14649 Male 50 <=50K 0.35616438 0.6000000
## 14650 Male 40 >50K 0.49315068 0.8666667
## 14651 Male 40 <=50K 0.06849315 0.6000000
## 14652 Male 40 >50K 0.31506849 0.5333333
## hours.per.week.scale income10 predicciones_prob income10.prediccion
## 14647 0.2959184 0 0.13845090 0
## 14648 0.6020408 0 0.39258175 0
## 14649 0.5000000 0 0.36488993 0
## 14650 0.3979592 1 0.84198092 1
## 14651 0.3979592 0 0.03292608 0
## 14652 0.3979592 1 0.27713903 0
matriz_confusion <- table(las.predicciones$income10, las.predicciones$income10.prediccion, dnn = c("income10", "predicciones"))
matriz_confusion
## predicciones
## income10 0 1
## 0 10357 789
## 1 1762 1744
n = nrow(datos.Validacion)
exactidud <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n
exactidud
## [1] 0.8258941
# filter (datos.Validacion, age == 53 & workclass == 'Local-gov' & education == 'HighGrad' & marital.status == 'Married' & race == 'White' & gender == 'Male' & hours.per.week == 40)
edad <- 53; horas <- 50
a.predecir <- data.frame(rbind(c(edad, 'Local-gov', 'HighGrad', 'Married', 'White' , 'Male', horas)))
colnames(a.predecir) <- c('age.scale', 'workclass', 'education', 'marital.status', 'race', 'gender', 'hours.per.week.scale')
edad; horas
## [1] 53
## [1] 50
edad.escalada <- rescale(c(edad, min(datos$age), max(datos$age)))
edad.escalada <- edad.escalada[1]
# Escalando las horas por semana
horas.escalada<- rescale(c(horas, min(datos$hours.per.week), max(datos$hours.per.week)))
horas.escalada <- horas.escalada[1]
a.predecir <- a.predecir %>%
mutate(age.scale = edad.escalada,
hours.per.week.scale = horas.escalada)
a.predecir
## age.scale workclass education marital.status race gender
## 1 0.4931507 Local-gov HighGrad Married White Male
## hours.per.week.scale
## 1 0.5
prediccion <- predict(modelo, a.predecir, se.fit = TRUE)
prediccion
## $fit
## 1
## -0.3608282
##
## $se.fit
## [1] 0.06749201
##
## $residual.scale
## [1] 1
prediccion_prob <- exp(prediccion$fit) / (1 + exp(prediccion$fit))
prediccion_prob
## 1
## 0.4107591
# Predecir si será 0 o 1 conforme a la probabilidad de predicción
if_else(prediccion_prob > 0.5, 1, 0)
## [1] 0
cat("La probabilidad y la predicción de que una persona con esas características gane >50K es: ", if_else(prediccion_prob > 0.5, 1, 0))
## La probabilidad y la predicción de que una persona con esas características gane >50K es: 0
edad <- c(40,50,60,70,45,60,65,75,35,53)
clase.empleo <- c('Federal-gov', 'State-gov','Never-worked','Self-emp-inc', 'Federal-gov','Private', 'Private', 'Federal-gov', 'State-gov', 'Local-gov')
nivel.educacion <- c('HighGrad', 'HighGrad', 'HighGrad', 'HighGrad', 'Bachelors', 'Bachelors', 'Community', 'Community', 'Master', 'PhD')
edo.civil <- c('Married', 'Married', 'Separated', 'Widow', 'Not_married', 'Married', 'Separated', 'Widow', 'Married', 'Not_married')
raza <- c('White', 'Asian-Pac-Islander', 'Black', 'Other', 'White', 'White', 'Amer-Indian-Eskimo', 'Black', 'White', 'White')
genero <- c('Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male')
horas <- c(50,45,55,58,60,65,70,44,61,53)
a.predecir <- data.frame(rbind(cbind(edad, clase.empleo, nivel.educacion, edo.civil,raza, genero, horas)))
colnames(a.predecir) <- c('age.scale', 'workclass', 'education', 'marital.status', 'race', 'gender', 'hours.per.week.scale')
a.predecir
## age.scale workclass education marital.status race gender
## 1 40 Federal-gov HighGrad Married White Female
## 2 50 State-gov HighGrad Married Asian-Pac-Islander Male
## 3 60 Never-worked HighGrad Separated Black Female
## 4 70 Self-emp-inc HighGrad Widow Other Male
## 5 45 Federal-gov Bachelors Not_married White Female
## 6 60 Private Bachelors Married White Male
## 7 65 Private Community Separated Amer-Indian-Eskimo Female
## 8 75 Federal-gov Community Widow Black Male
## 9 35 State-gov Master Married White Female
## 10 53 Local-gov PhD Not_married White Male
## hours.per.week.scale
## 1 50
## 2 45
## 3 55
## 4 58
## 5 60
## 6 65
## 7 70
## 8 44
## 9 61
## 10 53
edad; horas
## [1] 40 50 60 70 45 60 65 75 35 53
## [1] 50 45 55 58 60 65 70 44 61 53
edad.escalada <- rescale(c(edad, min(datos$age), max(datos$age)))
edad.escalada <- edad.escalada[1:10]
# Escalando las horas por semana
horas.escalada<- rescale(c(horas, min(datos$hours.per.week), max(datos$hours.per.week)))
horas.escalada <- horas.escalada[1:10]
a.predecir <- a.predecir %>%
mutate(age.scale = edad.escalada,
hours.per.week.scale = horas.escalada)
a.predecir
## age.scale workclass education marital.status race gender
## 1 0.3150685 Federal-gov HighGrad Married White Female
## 2 0.4520548 State-gov HighGrad Married Asian-Pac-Islander Male
## 3 0.5890411 Never-worked HighGrad Separated Black Female
## 4 0.7260274 Self-emp-inc HighGrad Widow Other Male
## 5 0.3835616 Federal-gov Bachelors Not_married White Female
## 6 0.5890411 Private Bachelors Married White Male
## 7 0.6575342 Private Community Separated Amer-Indian-Eskimo Female
## 8 0.7945205 Federal-gov Community Widow Black Male
## 9 0.2465753 State-gov Master Married White Female
## 10 0.4931507 Local-gov PhD Not_married White Male
## hours.per.week.scale
## 1 0.5000000
## 2 0.4489796
## 3 0.5510204
## 4 0.5816327
## 5 0.6020408
## 6 0.6530612
## 7 0.7040816
## 8 0.4387755
## 9 0.6122449
## 10 0.5306122
prediccion <- predict(modelo, a.predecir, se.fit = TRUE)
prediccion
## $fit
## 1 2 3 4 5 6
## -0.12439975 -1.01254072 -11.21715667 -1.54029814 -0.53195390 2.06137511
## 7 8 9 10
## -1.03148774 -0.91627294 1.42400774 -0.06745948
##
## $se.fit
## 1 2 3 4 5 6
## 0.09665512 0.12280131 104.21710499 0.25776451 0.10447450 0.05880670
## 7 8 9 10
## 0.20303050 0.16157401 0.10392565 0.15241508
##
## $residual.scale
## [1] 1
prediccion_prob <- exp(prediccion$fit) / (1 + exp(prediccion$fit))
prediccion_prob
## 1 2 3 4 5 6
## 4.689401e-01 2.664829e-01 1.344141e-05 1.764919e-01 3.700613e-01 8.870920e-01
## 7 8 9 10
## 2.627958e-01 2.857179e-01 8.059659e-01 4.831415e-01
# Predecir si será 0 o 1 conforme a la probabilidad de predicción
las.predicciones <- if_else(prediccion_prob > 0.5, 1, 0)
cat("Son las predicciones para las personas con esas características ")
## Son las predicciones para las personas con esas características
las.predicciones
## [1] 0 0 0 0 0 1 0 0 1 0
print("Las predicciones en la columna final")
## [1] "Las predicciones en la columna final"
cbind(a.predecir ,las.predicciones)
## age.scale workclass education marital.status race gender
## 1 0.3150685 Federal-gov HighGrad Married White Female
## 2 0.4520548 State-gov HighGrad Married Asian-Pac-Islander Male
## 3 0.5890411 Never-worked HighGrad Separated Black Female
## 4 0.7260274 Self-emp-inc HighGrad Widow Other Male
## 5 0.3835616 Federal-gov Bachelors Not_married White Female
## 6 0.5890411 Private Bachelors Married White Male
## 7 0.6575342 Private Community Separated Amer-Indian-Eskimo Female
## 8 0.7945205 Federal-gov Community Widow Black Male
## 9 0.2465753 State-gov Master Married White Female
## 10 0.4931507 Local-gov PhD Not_married White Male
## hours.per.week.scale las.predicciones
## 1 0.5000000 0
## 2 0.4489796 0
## 3 0.5510204 0
## 4 0.5816327 0
## 5 0.6020408 0
## 6 0.6530612 1
## 7 0.7040816 0
## 8 0.4387755 0
## 9 0.6122449 1
## 10 0.5306122 0