library(tidyverse) # varias
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) # select filter mutate ...
library(ggplot2) # Gráficas
library(fdth) # Para tablas de distribución y frecuencias
## Warning: package 'fdth' was built under R version 3.6.3
##
## Attaching package: 'fdth'
## The following objects are masked from 'package:stats':
##
## sd, var
library(knitr) # Para ver tablas mas amigables en formato html markdown
## Warning: package 'knitr' was built under R version 3.6.3
library(caret) # Pra particionar datos
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(reshape) # Para renombrar columnas en caso de necesitarse
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
library(scales) # Para escalar datos
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
ruta="C:/Users/AlanA/Desktop/8vo/Analisis Datos/datos"
setwd(ruta)
datos <- read.csv("adultos.csv")
kable(head(datos, 10))
| x | age | workclass | education | educational.num | marital.status | race | gender | hours.per.week | income |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 25 | Private | 11th | 7 | Never-married | Black | Male | 40 | <=50K |
| 2 | 38 | Private | HS-grad | 9 | Married-civ-spouse | White | Male | 50 | <=50K |
| 3 | 28 | Local-gov | Assoc-acdm | 12 | Married-civ-spouse | White | Male | 40 | >50K |
| 4 | 44 | Private | Some-college | 10 | Married-civ-spouse | Black | Male | 40 | >50K |
| 5 | 18 | ? | Some-college | 10 | Never-married | White | Female | 30 | <=50K |
| 6 | 34 | Private | 10th | 6 | Never-married | White | Male | 30 | <=50K |
| 7 | 29 | ? | HS-grad | 9 | Never-married | Black | Male | 40 | <=50K |
| 8 | 63 | Self-emp-not-inc | Prof-school | 15 | Married-civ-spouse | White | Male | 32 | >50K |
| 9 | 24 | Private | Some-college | 10 | Never-married | White | Female | 40 | <=50K |
| 10 | 55 | Private | 7th-8th | 4 | Married-civ-spouse | White | Male | 10 | <=50K |
| ### L | os últ | imos diez registros |
kable(tail(datos,10))
| x | age | workclass | education | educational.num | marital.status | race | gender | hours.per.week | income | |
|---|---|---|---|---|---|---|---|---|---|---|
| 48833 | 48833 | 32 | Private | 10th | 6 | Married-civ-spouse | Amer-Indian-Eskimo | Male | 40 | <=50K |
| 48834 | 48834 | 43 | Private | Assoc-voc | 11 | Married-civ-spouse | White | Male | 45 | <=50K |
| 48835 | 48835 | 32 | Private | Masters | 14 | Never-married | Asian-Pac-Islander | Male | 11 | <=50K |
| 48836 | 48836 | 53 | Private | Masters | 14 | Married-civ-spouse | White | Male | 40 | >50K |
| 48837 | 48837 | 22 | Private | Some-college | 10 | Never-married | White | Male | 40 | <=50K |
| 48838 | 48838 | 27 | Private | Assoc-acdm | 12 | Married-civ-spouse | White | Female | 38 | <=50K |
| 48839 | 48839 | 40 | Private | HS-grad | 9 | Married-civ-spouse | White | Male | 40 | >50K |
| 48840 | 48840 | 58 | Private | HS-grad | 9 | Widowed | White | Female | 40 | <=50K |
| 48841 | 48841 | 22 | Private | HS-grad | 9 | Never-married | White | Male | 20 | <=50K |
| 48842 | 48842 | 52 | Self-emp-inc | HS-grad | 9 | Married-civ-spouse | White | Female | 40 | >50K |
| * La est | ructrua | de los | datos | |||||||
| * Resume | n de los | datos | : | |||||||
| * x Vari | able de | consec | utivo de los da | tos | ||||||
| * age la | edad de | la pe | rsona | |||||||
| * workcl | ass es u | n tipo | o clase de tra | bajo de la pers | ona, privado, gobi | erno, por su cuenta, | ||||
| * educat | ion indi | ca el | nivel educativo | de la persona | ||||||
| * educat | ional es | el va | lor numérico de | education | ||||||
| * marita | l es su | estado | civil | |||||||
| * race e | s el tip | o de r | aza de persona | |||||||
| * gender | es el g | énero | de la persona | |||||||
| * hours. | per.week | son l | as horas que tr | baja por semana | ||||||
| * income | son los | ingre | sos |
str(datos)
## 'data.frame': 48842 obs. of 10 variables:
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ educational.num: int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status : Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ hours.per.week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
kable(summary(datos[-1]))
| age | workclass | education | educational.num | marital.status | race | gender | hours.per.week | income | |
|---|---|---|---|---|---|---|---|---|---|
| Min. :17.00 | Private :33906 | HS-grad :15784 | Min. : 1.00 | Divorced : 6633 | Amer-Indian-Eskimo: 470 | Female:16192 | Min. : 1.00 | <=50K:37155 | |
| 1st Qu.:28.00 | Self-emp-not-inc: 3862 | Some-college:10878 | 1st Qu.: 9.00 | Married-AF-spouse : 37 | Asian-Pac-Islander: 1519 | Male :32650 | 1st Qu.:40.00 | >50K :11687 | |
| Median :37.00 | Local-gov : 3136 | Bachelors : 8025 | Median :10.00 | Married-civ-spouse :22379 | Black : 4685 | NA | Median :40.00 | NA | |
| Mean :38.64 | ? : 2799 | Masters : 2657 | Mean :10.08 | Married-spouse-absent: 628 | Other : 406 | NA | Mean :40.42 | NA | |
| 3rd Qu.:48.00 | State-gov : 1981 | Assoc-voc : 2061 | 3rd Qu.:12.00 | Never-married :16117 | White :41762 | NA | 3rd Qu.:45.00 | NA | |
| Max. :90.00 | Self-emp-inc : 1695 | 11th : 1812 | Max. :16.00 | Separated : 1530 | NA | NA | Max. :99.00 | NA | |
| NA | (Other) : 1463 | (Other) : 7625 | NA | Widowed : 1518 | NA | NA | NA | NA | |
| ### P | roceso para anal | izar los datos | |||||||
| * De | los pasos 1 al 4 | se pueden integrar en fa | ses de ciencia de los | datos como de car | ga, limpieza y exploración de l | os datos. |
A partir del paso 5 se construye un modelo de regresión lógistica para predicciones.
paso 1: Identificar variables numéricas
paso 2: Identificar variables factor
paso 3: Ingeniería de datos
paso 4: Estadísticos descriptivos
paso 5: Conjunto de datos de entrenamiento y de validación Train/test set
paso 6: Modelo de regresión logística
paso 7: Evaluar el modelo
paso 8: Predicciones con datos de entrenamiento
numericas <-select_if(datos, is.numeric)
kable(summary(numericas[-1]))
| age | educational.num | hours.per.week | |
|---|---|---|---|
| Min. :17.00 | Min. : 1.00 | Min. : 1.00 | |
| 1st Qu.:28.00 | 1st Qu.: 9.00 | 1st Qu.:40.00 | |
| Median :37.00 | Median :10.00 | Median :40.00 | |
| Mean :38.64 | Mean :10.08 | Mean :40.42 | |
| 3rd Qu.:48.00 | 3rd Qu.:12.00 | 3rd Qu.:45.00 | |
| Max. :90.00 | Max. :16.00 | Max. :99.00 | |
| ### A | nálisis de la va | riable hours.per.w | eek |
| * Se | visualiza un his | tograma | |
| * Se | determina la tab | la de distribucion | de frecuencia con la función fdt() de la variable hours.per.week |
| * La | mayoría de la ge | nte trabaja entre | 36 y 42 horas por semana |
ggplot(numericas, aes(x = hours.per.week)) +
geom_density(alpha = .2, fill = "#FF6666")
distribucion <- fdt(numericas$hours.per.week,breaks="Sturges")
kable(distribucion)
|
|
barplot(height = distribucion$table$f, names.arg = distribucion$table$`Class limits`)
### Análisis de la variable age * Se visualiza un histograma * Se determina la tabla de distribucion de frecuencia con la función fdt() de la variable age * La mayoría tiene entre 30 y 34 años y de igual forma una gran población en edades de 43 y 47 años de edad
ggplot(numericas, aes(x = age)) +
geom_density(alpha = .2, fill = "#FF6666")
distribucion <- fdt(numericas$age, breaks="Sturges")
kable(distribucion)
|
|
barplot(height = distribucion$table$f, names.arg = distribucion$table$`Class limits`)
### Estandarizar los valores numéricos * Escalar los valores numéricos de datos con la función rescale() de la librería scales * La finalidad de estandarizar los datos numéricos escalando o centrando los mismos es para mejorar el rendimiento en los análisis teniendo en cuenta que se establecen valores numéricos bajo los mismos criterios * Se dejan los datos escalados en un nuevo conjunto de datos excepto la variable x [-1] que no interesa * Se visualzan los primeros diez registros * Se visualizan los últimos diez registros * Observar que las variables numéricas age y hours.per.week están centradas y escaladas. * generan tres nuevas variables por medio de mutate() con datos numéricos escalados * Se retomará en el análisis más adelante el conjunto de datos datos.Escalados generado en esta apartado
escalados <- datos[-1] %>%
mutate(age.scale = rescale(age),educational.num.scale = rescale(educational.num), hours.per.week.scale = rescale(hours.per.week) )
head(escalados, 10)
## age workclass education educational.num marital.status race
## 1 25 Private 11th 7 Never-married Black
## 2 38 Private HS-grad 9 Married-civ-spouse White
## 3 28 Local-gov Assoc-acdm 12 Married-civ-spouse White
## 4 44 Private Some-college 10 Married-civ-spouse Black
## 5 18 ? Some-college 10 Never-married White
## 6 34 Private 10th 6 Never-married White
## 7 29 ? HS-grad 9 Never-married Black
## 8 63 Self-emp-not-inc Prof-school 15 Married-civ-spouse White
## 9 24 Private Some-college 10 Never-married White
## 10 55 Private 7th-8th 4 Married-civ-spouse White
## gender hours.per.week income age.scale educational.num.scale
## 1 Male 40 <=50K 0.10958904 0.4000000
## 2 Male 50 <=50K 0.28767123 0.5333333
## 3 Male 40 >50K 0.15068493 0.7333333
## 4 Male 40 >50K 0.36986301 0.6000000
## 5 Female 30 <=50K 0.01369863 0.6000000
## 6 Male 30 <=50K 0.23287671 0.3333333
## 7 Male 40 <=50K 0.16438356 0.5333333
## 8 Male 32 >50K 0.63013699 0.9333333
## 9 Female 40 <=50K 0.09589041 0.6000000
## 10 Male 10 <=50K 0.52054795 0.2000000
## hours.per.week.scale
## 1 0.39795918
## 2 0.50000000
## 3 0.39795918
## 4 0.39795918
## 5 0.29591837
## 6 0.29591837
## 7 0.39795918
## 8 0.31632653
## 9 0.39795918
## 10 0.09183673
tail(escalados, 10)
## age workclass education educational.num marital.status
## 48833 32 Private 10th 6 Married-civ-spouse
## 48834 43 Private Assoc-voc 11 Married-civ-spouse
## 48835 32 Private Masters 14 Never-married
## 48836 53 Private Masters 14 Married-civ-spouse
## 48837 22 Private Some-college 10 Never-married
## 48838 27 Private Assoc-acdm 12 Married-civ-spouse
## 48839 40 Private HS-grad 9 Married-civ-spouse
## 48840 58 Private HS-grad 9 Widowed
## 48841 22 Private HS-grad 9 Never-married
## 48842 52 Self-emp-inc HS-grad 9 Married-civ-spouse
## race gender hours.per.week income age.scale
## 48833 Amer-Indian-Eskimo Male 40 <=50K 0.20547945
## 48834 White Male 45 <=50K 0.35616438
## 48835 Asian-Pac-Islander Male 11 <=50K 0.20547945
## 48836 White Male 40 >50K 0.49315068
## 48837 White Male 40 <=50K 0.06849315
## 48838 White Female 38 <=50K 0.13698630
## 48839 White Male 40 >50K 0.31506849
## 48840 White Female 40 <=50K 0.56164384
## 48841 White Male 20 <=50K 0.06849315
## 48842 White Female 40 >50K 0.47945205
## educational.num.scale hours.per.week.scale
## 48833 0.3333333 0.3979592
## 48834 0.6666667 0.4489796
## 48835 0.8666667 0.1020408
## 48836 0.8666667 0.3979592
## 48837 0.6000000 0.3979592
## 48838 0.7333333 0.3775510
## 48839 0.5333333 0.3979592
## 48840 0.5333333 0.3979592
## 48841 0.5333333 0.1938776
## 48842 0.5333333 0.3979592
factores <- data.frame(select_if(datos, is.factor))
summary(factores)
## workclass education marital.status
## Private :33906 HS-grad :15784 Divorced : 6633
## Self-emp-not-inc: 3862 Some-college:10878 Married-AF-spouse : 37
## Local-gov : 3136 Bachelors : 8025 Married-civ-spouse :22379
## ? : 2799 Masters : 2657 Married-spouse-absent: 628
## State-gov : 1981 Assoc-voc : 2061 Never-married :16117
## Self-emp-inc : 1695 11th : 1812 Separated : 1530
## (Other) : 1463 (Other) : 7625 Widowed : 1518
## race gender income
## Amer-Indian-Eskimo: 470 Female:16192 <=50K:37155
## Asian-Pac-Islander: 1519 Male :32650 >50K :11687
## Black : 4685
## Other : 406
## White :41762
##
##
kable(summary(factores) )
| workclass | education | marital.status | race | gender i | ncome | |
|---|---|---|---|---|---|---|
| Private :33906 | HS-grad :15784 | Divorced : 6633 | Amer-Indian-Eskimo: 470 | Female:16192 | <=50K:37155 | |
| Self-emp-not-inc: 3862 | Some-college:10878 | Married-AF-spouse : 37 | Asian-Pac-Islander: 1519 | Male :32650 | >50K :11687 | |
| Local-gov : 3136 | Bachelors : 8025 | Married-civ-spouse :22379 | Black : 4685 | NA | NA | |
| ? : 2799 | Masters : 2657 | Married-spouse-absent: 628 | Other : 406 | NA | NA | |
| State-gov : 1981 | Assoc-voc : 2061 | Never-married :16117 | White :41762 | NA | NA | |
| Self-emp-inc : 1695 | 11th : 1812 | Separated : 1530 | NA | NA | NA | |
| (Other) : 1463 | (Other) : 7625 | Widowed : 1518 | NA | NA | NA |
str(factores)
## 'data.frame': 48842 obs. of 6 variables:
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
distribucion <- fdt_cat(factores)
kable(distribucion$workclass)
|
barplot(height = distribucion$workclass$table$f, names.arg = distribucion$workclass$table$Category)
### Variable education * Tabla de distribución * Se observa que hay gran frecuencia de personas con High School: HS-grad de 15784 representando el 32.32% * Se observa que hay 83 casos de personas con nivel educativo de Preschool representando tan solo el 0.17%
kable(distribucion$education)
|
barplot(height = distribucion$education$table$f, names.arg = distribucion$education$table$Category)
### Variable marital.status * Tabla de distribución * Se observa que hay gran frecuencia de personas en estado civil de Married-civ-spouse (esposa o esposo casado): de 22379 representando el 45.82% * Se observa que hay 37 casos de personas en estado civil de Married-AF-spouse (esposa o esposo casado de ámbito militar-Armed Forces) representando tan solo el 0.08%
kable(distribucion$marital.status)
|
barplot(height = distribucion$marital.status$table$f, names.arg = distribucion$marital.status$table$Category)
### Variable race * Tabla de distribución * Se observa que hay gran frecuencia de personas de raza blanca con un valor de White = 41762 representando el 85.50% * Se observa que hay 406 casos de personas de otro tipo de raza diferente a White, Black, Asian-Pac-Islander, * Amer-Indian-Eskimo representando tan solo el 0.08%
kable(distribucion$race)
|
barplot(height = distribucion$race$table$f, names.arg = distribucion$race$table$Category)
### Variable gender * Tabla de distribución * Se observa que hay dos tipos de género ‘Male’ y ‘Female’ * El género ‘Male’ masculino existen 32650 casos representando el 66.84% * Mientras que del género ‘Female’ femenino hay 16192 casos representando el 33.15%
kable(distribucion$gender)
|
barplot(height = distribucion$gender$table$f, names.arg = distribucion$gender$table$Category)
### Variable income * Tabla de distribución * Se observa que hay dos tipos de income (Sueldos y salarios o lo que ganan en dinero) ‘<=50 mil’ y ‘>50 mil’ * De la categoría ‘<=50’ hay 37155 y representan el 76% * De la categoría ‘>50’ hay 11687 casos y representan el 23.92%
barplot(height = distribucion$income$table$f, names.arg = distribucion$income$table$Category)
### paso 3: Ingeniería de datos. * Este proceso tiene la finalidad principal de realizar algunos ajsutes en las variables * Estos ajustes van desde categorizar o agrupar en nuevas etiquetas algunas variables hasta modificar algunos valores
recategorizados <- escalados %>%
mutate(education = factor(ifelse(education == "Preschool" | education == "10th" | education == "11th" | education == "12th" | education == "1st-4th" | education == "5th-6th" | education == "7th-8th" | education == "9th", "Dropout", ifelse(education == "HS-grad", "HighGrad", ifelse(education == "Some-college" | education == "Assoc-acdm" | education == "Assoc-voc", "Community",ifelse(education == "Bachelors", "Bachelors",
ifelse(education == "Masters" | education == "Prof-school", "Master", "PhD")))))))
kable(head(recategorizados))
| age w | orkclass e | ducation | educational.num m | arital.status r | ace g | ender | hours.per.week i | ncome | age.scale | educational.num.scale | hours.per.week.scale |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 25 | Private | Dropout | 7 | Never-married | Black | Male | 40 | <=50K | 0.1095890 | 0.4000000 | 0.3979592 |
| 38 | Private | HighGrad | 9 | Married-civ-spouse | White | Male | 50 | <=50K | 0.2876712 | 0.5333333 | 0.5000000 |
| 28 | Local-gov | Community | 12 | Married-civ-spouse | White | Male | 40 | >50K | 0.1506849 | 0.7333333 | 0.3979592 |
| 44 | Private | Community | 10 | Married-civ-spouse | Black | Male | 40 | >50K | 0.3698630 | 0.6000000 | 0.3979592 |
| 18 | ? | Community | 10 | Never-married | White | Female | 30 | <=50K | 0.0136986 | 0.6000000 | 0.2959184 |
| 34 | Private | Dropout | 6 | Never-married | White | Male | 30 | <=50K | 0.2328767 | 0.3333333 | 0.2959184 |
| ### Ag | rupar estadí | sticamente l | a variable nivel d | e educación | |||||||
| * Se a | grupa por la | variable ed | ucation ya recateg | orizada | |||||||
| * Se e | ncuentra la | media de la | variable education | al.num que previament | e había | sido esca | lada | ||||
| * Se d | eterminar cu | antos de cad | a grupo |
recategorizados %>%
group_by(education) %>%
summarize(promedio_educacion = mean(educational.num),
cuantos = n()) %>%
arrange(promedio_educacion)
## # A tibble: 6 x 3
## education promedio_educacion cuantos
## <fct> <dbl> <int>
## 1 Dropout 5.61 6408
## 2 HighGrad 9 15784
## 3 Community 10.4 14540
## 4 Bachelors 13 8025
## 5 Master 14.2 3491
## 6 PhD 16 594
recategorizados <- recategorizados %>%
mutate(marital.status = factor(ifelse(marital.status == "Never-married" | marital.status == "Married-spouse-absent", "Not_married", ifelse(marital.status == "Married-AF-spouse" | marital.status == "Married-civ-spouse", "Married", ifelse(marital.status == "Separated" | marital.status == "Divorced", "Separated", "Widow")))))
kable(head(recategorizados))
| age w | orkclass e | ducation | educational.num m | arital.status r | ace g | ender | hours.per.week i | ncome | age.scale | educational.num.scale | hours.per.week.scale |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 25 | Private | Dropout | 7 | Not_married | Black | Male | 40 | <=50K | 0.1095890 | 0.4000000 | 0.3979592 |
| 38 | Private | HighGrad | 9 | Married | White | Male | 50 | <=50K | 0.2876712 | 0.5333333 | 0.5000000 |
| 28 | Local-gov | Community | 12 | Married | White | Male | 40 | >50K | 0.1506849 | 0.7333333 | 0.3979592 |
| 44 | Private | Community | 10 | Married | Black | Male | 40 | >50K | 0.3698630 | 0.6000000 | 0.3979592 |
| 18 | ? | Community | 10 | Not_married | White | Female | 30 | <=50K | 0.0136986 | 0.6000000 | 0.2959184 |
| 34 | Private | Dropout | 6 | Not_married | White | Male | 30 | <=50K | 0.2328767 | 0.3333333 | 0.2959184 |
| ### Fr | ecuencia de | la variable | marital.status | ||||||||
| * Se o | bservan las | frecuencias | del estado civil ( | marital.status.R) | recateg | orizadas |
table(recategorizados$marital.status)
##
## Married Not_married Separated Widow
## 22416 16745 8163 1518
ggplot(recategorizados, aes(x = gender, fill = income)) +
geom_bar(position = "fill") +
theme_classic()
### Origen o raza de persona y su ngreso (race, income) * Se observa que las personas de raza blanca ‘White’ al igual que las personas de raza Asiática ganan mas que las otras razas
ggplot(recategorizados, aes(x = race, fill = income)) +
geom_bar(position = "fill") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
## Horas trabajadas por semana y el género de persona (hours.per.week, gender) * Se observa con datos escalados, que los hombres trabajan mas horas a la semana que las mujeres
head(recategorizados)
## age workclass education educational.num marital.status race gender
## 1 25 Private Dropout 7 Not_married Black Male
## 2 38 Private HighGrad 9 Married White Male
## 3 28 Local-gov Community 12 Married White Male
## 4 44 Private Community 10 Married Black Male
## 5 18 ? Community 10 Not_married White Female
## 6 34 Private Dropout 6 Not_married White Male
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 40 <=50K 0.10958904 0.4000000 0.3979592
## 2 50 <=50K 0.28767123 0.5333333 0.5000000
## 3 40 >50K 0.15068493 0.7333333 0.3979592
## 4 40 >50K 0.36986301 0.6000000 0.3979592
## 5 30 <=50K 0.01369863 0.6000000 0.2959184
## 6 30 <=50K 0.23287671 0.3333333 0.2959184
ggplot(recategorizados, aes(x = gender, y = hours.per.week)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
size = 3,
color = "steelblue") +
theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.
### Horas trabajadas por semana y el nivel educativo (hours.per.week, education) * Con datos escalados y recategorizados, se observ que los de nivel HighSchool
ggplot(recategorizados, aes(x = hours.per.week)) +
geom_density(aes(color = education), alpha = 0.5) +
theme_classic()
### No linealidad de los datos * Representar datos en gráficas de dispersión y agruando para encontrar la no linealidad de los mismos * horas por semana (hours.per.week) y la edad (age) agrupados por el sueldo (income) * Se observa que con respecto a la varible horas por semana (hours.per.week) y la edad (age) agrupados por el * sueldo (income) no una una linealidad en los datos * Se interpreta que los que trabajan mas ganan mas
ggplot(recategorizados, aes(x = age, y = hours.per.week)) +
geom_point(aes(color = income),
size = 0.5) +
stat_smooth(method = 'lm',
formula = y~poly(x, 2),
se = TRUE,
aes(color = income)) +
theme_classic()
### horas por semana (hours.per.week) y el nivel de educación (education) agrupados por el sueldo (income) * Se observa que con respecto a la varible horas por semana (hours.per.week) y la edad (age) agrupados por el * sueldo (income) no una una linealidad en los datos * Se interpreta que los que trabajan mas ganan mas
ggplot(recategorizados, aes(x = education, y = hours.per.week)) +
geom_point(aes(color = income),
size = 0.5) +
stat_smooth(method = 'lm',
formula = y~poly(x, 2),
se = TRUE,
aes(color = income)) +
theme_classic()
### Recodificar la variable income a income10 * 1 para income ‘>50’ * 0 para income ‘<=50’ * Se crea una nueva variable llamada income10 * Se muestran los primeros registros de las columnas 9 y 10 del conjunto de datos
recategorizados <- recategorizados %>%
mutate(income10 = recode(income,"<=50K" = 0,">50K" = 1))
head(recategorizados[,c(9,13)])
## income income10
## 1 <=50K 0
## 2 <=50K 0
## 3 >50K 1
## 4 >50K 1
## 5 <=50K 0
## 6 <=50K 0
names(recategorizados)
## [1] "age" "workclass" "education"
## [4] "educational.num" "marital.status" "race"
## [7] "gender" "hours.per.week" "income"
## [10] "age.scale" "educational.num.scale" "hours.per.week.scale"
## [13] "income10"
head(recategorizados)
## age workclass education educational.num marital.status race gender
## 1 25 Private Dropout 7 Not_married Black Male
## 2 38 Private HighGrad 9 Married White Male
## 3 28 Local-gov Community 12 Married White Male
## 4 44 Private Community 10 Married Black Male
## 5 18 ? Community 10 Not_married White Female
## 6 34 Private Dropout 6 Not_married White Male
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 40 <=50K 0.10958904 0.4000000 0.3979592
## 2 50 <=50K 0.28767123 0.5333333 0.5000000
## 3 40 >50K 0.15068493 0.7333333 0.3979592
## 4 40 >50K 0.36986301 0.6000000 0.3979592
## 5 30 <=50K 0.01369863 0.6000000 0.2959184
## 6 30 <=50K 0.23287671 0.3333333 0.2959184
## income10
## 1 0
## 2 0
## 3 1
## 4 1
## 5 0
## 6 0
write.csv(recategorizados, file="adultos_clean.csv")
dir()
## [1] "adultos_clean.csv"
## [2] "Practica-10.html"
## [3] "Practica-11.html"
## [4] "Practica-11.Rmd"
## [5] "Practica-11_files"
## [6] "Practica-12.html"
## [7] "Practica-13.html"
## [8] "Practica-3-ventas-hora.html"
## [9] "Practica-4-rutas-millas.html"
## [10] "Practica-4-Ventas-pizza.html"
## [11] "Practica-5-Contaminantes.html"
## [12] "Practica-6-regresion-lineal-de-autos.html"
## [13] "Practica-6.html"
## [14] "Practica-7.html"
## [15] "Practica-8.html"
## [16] "Practica 10.Rmd"
## [17] "Practica 11.Rmd"
## [18] "Practica 12.Rmd"
## [19] "Practica 13.Rmd"
## [20] "Practica 3 ventas hora.Rmd"
## [21] "Practica 4 rutas millas.Rmd"
## [22] "Practica 4 Ventas pizza.Rmd"
## [23] "Practica 5 Contaminantes.Rmd"
## [24] "Practica 6 regresion lineal de autos.Rmd"
## [25] "Practica 6.Rmd"
## [26] "Practica 7.Rmd"
## [27] "Practica 8.Rmd"
## [28] "Practica1-Promedios-alumnos.html"
## [29] "Practica1-Promedios.html"
## [30] "Practica1 Promedios alumnos.Rmd"
## [31] "Practica2-Ventas.html"
## [32] "Practica2 Ventas.Rmd"
## [33] "Practica9.html"
## [34] "Practica9.Rmd"
## [35] "rsconnect"
nrow(recategorizados)
## [1] 48842
set.seed(2020)
entrena <- createDataPartition(recategorizados$income, p=0.7, list = FALSE)
datos.Entrena <- recategorizados[entrena,]
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Entrena)
## [1] 34190
kable(head(datos.Entrena))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 25 | Private | Dropout | 7 | Not_married | Black | Male | 40 | <=50K | 0.1095890 | 0.4000000 | 0.3979592 | 0 |
| 2 | 38 | Private | HighGrad | 9 | Married | White | Male | 50 | <=50K | 0.2876712 | 0.5333333 | 0.5000000 | 0 |
| 3 | 28 | Local-gov | Community | 12 | Married | White | Male | 40 | >50K | 0.1506849 | 0.7333333 | 0.3979592 | 1 |
| 4 | 44 | Private | Community | 10 | Married | Black | Male | 40 | >50K | 0.3698630 | 0.6000000 | 0.3979592 | 1 |
| 5 | 18 | ? | Community | 10 | Not_married | White | Female | 30 | <=50K | 0.0136986 | 0.6000000 | 0.2959184 | 0 |
| 7 | 29 | ? | HighGrad | 9 | Not_married | Black | Male | 40 | <=50K | 0.1643836 | 0.5333333 | 0.3979592 | 0 |
kable(tail(datos.Entrena))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48834 | 43 | Private | Community | 11 | Married | White | Male | 45 | <=50K | 0.3561644 | 0.6666667 | 0.4489796 | 0 |
| 48835 | 32 | Private | Master | 14 | Not_married | Asian-Pac-Islander | Male | 11 | <=50K | 0.2054795 | 0.8666667 | 0.1020408 | 0 |
| 48838 | 27 | Private | Community | 12 | Married | White | Female | 38 | <=50K | 0.1369863 | 0.7333333 | 0.3775510 | 0 |
| 48840 | 58 | Private | HighGrad | 9 | Widow | White | Female | 40 | <=50K | 0.5616438 | 0.5333333 | 0.3979592 | 0 |
| 48841 | 22 | Private | HighGrad | 9 | Not_married | White | Male | 20 | <=50K | 0.0684932 | 0.5333333 | 0.1938776 | 0 |
| 48842 | 52 | Self-emp-inc | HighGrad | 9 | Married | White | Female | 40 | >50K | 0.4794521 | 0.5333333 | 0.3979592 | 1 |
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Validacion)
## [1] 14652
kable(head(datos.Validacion))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 34 | Private | Dropout | 6 | Not_married | White | Male | 30 | <=50K | 0.2328767 | 0.3333333 | 0.2959184 | 0 |
| 15 | 48 | Private | HighGrad | 9 | Married | White | Male | 48 | >50K | 0.4246575 | 0.5333333 | 0.4795918 | 1 |
| 17 | 20 | State-gov | Community | 10 | Not_married | White | Male | 25 | <=50K | 0.0410959 | 0.6000000 | 0.2448980 | 0 |
| 36 | 65 | ? | HighGrad | 9 | Married | White | Male | 40 | <=50K | 0.6575342 | 0.5333333 | 0.3979592 | 0 |
| 41 | 65 | Private | Master | 14 | Married | White | Male | 50 | >50K | 0.6575342 | 0.8666667 | 0.5000000 | 1 |
| 49 | 52 | Private | Dropout | 7 | Separated | Black | Female | 18 | <=50K | 0.4794521 | 0.4000000 | 0.1734694 | 0 |
kable(tail(datos.Validacion))
| ag | e wor | kclass edu | cation ed | ucational.num mar | ital.status rac | e gen | der ho | urs.per.week inc | ome ag | e.scale ed | ucational.num.scale ho | urs.per.week.scale in | come10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 48826 | 31 | Private | Master | 14 | Separated | Other | Female | 30 | <=50K | 0.1917808 | 0.8666667 | 0.2959184 | 0 |
| 48830 | 65 | Self-emp-not-inc | Master | 15 | Not_married | White | Male | 60 | <=50K | 0.6575342 | 0.9333333 | 0.6020408 | 0 |
| 48832 | 43 | Self-emp-not-inc | Community | 10 | Married | White | Male | 50 | <=50K | 0.3561644 | 0.6000000 | 0.5000000 | 0 |
| 48836 | 53 | Private | Master | 14 | Married | White | Male | 40 | >50K | 0.4931507 | 0.8666667 | 0.3979592 | 1 |
| 48837 | 22 | Private | Community | 10 | Not_married | White | Male | 40 | <=50K | 0.0684932 | 0.6000000 | 0.3979592 | 0 |
| 48839 | 40 | Private | HighGrad | 9 | Married | White | Male | 40 | >50K | 0.3150685 | 0.5333333 | 0.3979592 | 1 |
| ### paso | 6: Re | gresión logística | |||||||||||
| * Con la | regre | sión logística, dad | o un conjunt | o particular de va | lores de las vari | ables in | dependien | tes elegidas, se | estima la | probabilida | d de los ingresos de una | persona ‘<=50’ o ‘>50’ |
formula = income10 ~ age.scale + workclass + education + marital.status + race + gender + hours.per.week.scale
modelo <- glm(formula, data = datos.Entrena, family = 'binomial')
summary(modelo)
##
## Call:
## glm(formula = formula, family = "binomial", data = datos.Entrena)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7337 -0.5768 -0.2588 -0.0654 3.3492
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.419e+00 2.228e-01 -10.858 < 2e-16 ***
## age.scale 2.224e+00 1.053e-01 21.121 < 2e-16 ***
## workclassFederal-gov 1.421e+00 1.237e-01 11.485 < 2e-16 ***
## workclassLocal-gov 6.942e-01 1.100e-01 6.312 2.76e-10 ***
## workclassNever-worked -8.124e+00 1.042e+02 -0.078 0.9379
## workclassPrivate 8.124e-01 9.598e-02 8.464 < 2e-16 ***
## workclassSelf-emp-inc 1.218e+00 1.186e-01 10.270 < 2e-16 ***
## workclassSelf-emp-not-inc 1.878e-01 1.071e-01 1.753 0.0797 .
## workclassState-gov 5.339e-01 1.223e-01 4.367 1.26e-05 ***
## workclassWithout-pay -3.965e-01 8.276e-01 -0.479 0.6318
## educationCommunity -9.930e-01 4.428e-02 -22.426 < 2e-16 ***
## educationDropout -2.782e+00 7.802e-02 -35.657 < 2e-16 ***
## educationHighGrad -1.611e+00 4.523e-02 -35.610 < 2e-16 ***
## educationMaster 6.250e-01 6.110e-02 10.230 < 2e-16 ***
## educationPhD 1.077e+00 1.379e-01 7.814 5.55e-15 ***
## marital.statusNot_married -2.491e+00 5.355e-02 -46.511 < 2e-16 ***
## marital.statusSeparated -2.102e+00 5.650e-02 -37.214 < 2e-16 ***
## marital.statusWidow -2.163e+00 1.287e-01 -16.809 < 2e-16 ***
## raceAsian-Pac-Islander -2.461e-02 2.074e-01 -0.119 0.9055
## raceBlack 4.784e-04 1.968e-01 0.002 0.9981
## raceOther -9.881e-02 2.817e-01 -0.351 0.7258
## raceWhite 2.155e-01 1.876e-01 1.148 0.2509
## genderMale 9.432e-02 4.455e-02 2.117 0.0342 *
## hours.per.week.scale 3.136e+00 1.398e-01 22.430 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37626 on 34189 degrees of freedom
## Residual deviance: 25011 on 34166 degrees of freedom
## AIC: 25059
##
## Number of Fisher Scoring iterations: 11
¿Qué significan los coeficientes? Por cada unidad en el valor de edad, representa una probabilidad de exp(2.224e+00) en relación a tener ingresos >50K ó 1. exp(prediccionesfit)/(1exp(prediccionesfit)) ¿Qué significan los valores de significancia Pr(>|z|) ? La variable age tiene un valor de significancia muy bueno para el modelo de age.scale < 2e-16 age.scale Con respecto a la variable workclass, los empleos de gobierno federal, estatal, local, los privados, autoempleo tienen un valor significativo muy bueno para el modelo workclassFederal-gov < 2e-16 workclassLocal-gov 2.76e-10 workclassPrivate < 2e-16 workclassSelf-emp-inc < 2e-16 workclassState-gov 1.26e-05 El nivel de educación también refleja un nivel de significancia muy imporatnte para el modelo educationCommunity < 2e-16 educationDropout< 2e-16 educationHighGrad < 2e-16 educationMaster < 2e-16 educationPhD 5.55e-15 El estado civil de soltero, diverciado, y viudo reflejan de igual forma un importante nivel de significancia en el modelo marital.statusNot_married < 2e-16 marital.statusSeparated < 2e-16 marital.statusWidow < 2e-16 La variable race origen o raza étnica no representa un valor significativo para el modelo El género masculino aparece con aceptable nivel de significancia 0.0342 La variable horas de trabajo por semana también aparece con un nivel de signifancia de < 2e-16 hours.per.week.scale < 2e-16 ** paso 7: Evaluar el modelo Para evaluar el rendimiento del modelo, se crea la matriz de confusión Una matriz de confusión es una herramienta que permite la visualización del desempeño de un algoritmo que se emplea en aprendizaje supervisado. Cada columna de la matriz representa el número de predicciones de cada clase, mientras que cada fila representa a las instancias en la clase real. Uno de los beneficios de las matrices de confusión es que facilitan ver si el sistema está confundiendo las diferentes clases o resultados. Matriz de Confusión
Caption for the picture.
comparar <- data.frame(datos.Entrena$income10, as.vector(modelo$fitted.values) )
comparar <- comparar %>%
mutate(income10ajustados = if_else (modelo$fitted.values > 0.5, 1, 0))
colnames(comparar) <- c("income10", "ajuste", 'income10ajustados')
head(comparar)
## income10 ajuste income10ajustados
## 1 0 0.005003194 0
## 2 0 0.331904483 0
## 3 1 0.304736019 0
## 4 1 0.393130183 0
## 5 0 0.008761445 0
## 6 0 0.008068021 0
tail(comparar)
## income10 ajuste income10ajustados
## 34185 0 0.47760555 0
## 34186 0 0.06751504 0
## 34187 0 0.28999271 0
## 34188 0 0.06490387 0
## 34189 0 0.00958675 0
## 34190 1 0.43003777 0
matriz_confusion <- table(comparar$income10, comparar$income10ajustados, dnn = c("income10", "income10ajustados para predicciones"))
matriz_confusion
## income10ajustados para predicciones
## income10 0 1
## 0 24107 1902
## 1 4016 4165
n = nrow(datos.Entrena)
exactidud <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n
exactidud
## [1] 0.8269085
predicciones <- predict(modelo, datos.Validacion, se.fit = TRUE)
head(predicciones$fit)
## 6 15 17 36 41 49
## -5.1235447 -0.4589452 -4.1994476 -1.0094459 2.3586616 -4.8803438
tail(predicciones$fit)
## 48826 48830 48832 48836 48837 48839
## -1.8282163 -0.4364728 -0.5542026 1.6730416 -3.3800099 -0.9586976
predicciones_prob <- exp(predicciones$fit) / (1 + exp(predicciones$fit))
head(predicciones_prob)
## 6 15 17 36 41 49
## 0.005919626 0.387236079 0.014782074 0.267088310 0.913620236 0.007537162
tail(predicciones_prob)
## 48826 48830 48832 48836 48837 48839
## 0.13845090 0.39258175 0.36488993 0.84198092 0.03292608 0.27713903
las.predicciones <- cbind(datos.Validacion, predicciones_prob)
las.predicciones <- las.predicciones %>%
mutate(income10.prediccion = if_else(predicciones_prob > 0.5, 1, 0))
head(las.predicciones)
## age workclass education educational.num marital.status race gender
## 1 34 Private Dropout 6 Not_married White Male
## 2 48 Private HighGrad 9 Married White Male
## 3 20 State-gov Community 10 Not_married White Male
## 4 65 ? HighGrad 9 Married White Male
## 5 65 Private Master 14 Married White Male
## 6 52 Private Dropout 7 Separated Black Female
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 30 <=50K 0.23287671 0.3333333 0.2959184
## 2 48 >50K 0.42465753 0.5333333 0.4795918
## 3 25 <=50K 0.04109589 0.6000000 0.2448980
## 4 40 <=50K 0.65753425 0.5333333 0.3979592
## 5 50 >50K 0.65753425 0.8666667 0.5000000
## 6 18 <=50K 0.47945205 0.4000000 0.1734694
## income10 predicciones_prob income10.prediccion
## 1 0 0.005919626 0
## 2 1 0.387236079 0
## 3 0 0.014782074 0
## 4 0 0.267088310 0
## 5 1 0.913620236 1
## 6 0 0.007537162 0
tail(las.predicciones)
## age workclass education educational.num marital.status race
## 14647 31 Private Master 14 Separated Other
## 14648 65 Self-emp-not-inc Master 15 Not_married White
## 14649 43 Self-emp-not-inc Community 10 Married White
## 14650 53 Private Master 14 Married White
## 14651 22 Private Community 10 Not_married White
## 14652 40 Private HighGrad 9 Married White
## gender hours.per.week income age.scale educational.num.scale
## 14647 Female 30 <=50K 0.19178082 0.8666667
## 14648 Male 60 <=50K 0.65753425 0.9333333
## 14649 Male 50 <=50K 0.35616438 0.6000000
## 14650 Male 40 >50K 0.49315068 0.8666667
## 14651 Male 40 <=50K 0.06849315 0.6000000
## 14652 Male 40 >50K 0.31506849 0.5333333
## hours.per.week.scale income10 predicciones_prob income10.prediccion
## 14647 0.2959184 0 0.13845090 0
## 14648 0.6020408 0 0.39258175 0
## 14649 0.5000000 0 0.36488993 0
## 14650 0.3979592 1 0.84198092 1
## 14651 0.3979592 0 0.03292608 0
## 14652 0.3979592 1 0.27713903 0
matriz_confusion <- table(las.predicciones$income10, las.predicciones$income10.prediccion, dnn = c("income10", "predicciones"))
matriz_confusion
## predicciones
## income10 0 1
## 0 10357 789
## 1 1762 1744
n = nrow(datos.Validacion)
exactidud <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n
exactidud
## [1] 0.8258941
Predecir con un nuevo registro ¿Cuál será la predicción de una persona con las siguientes características? age=53, workclass=Local-gov, education=HighGrad, marital.status=Married, race=White, gender=Male, hours.per.week=40, Con los valores escalados de age.scale = 1.04711018 y hours.per.week.scale = -0.03408661 Primero, identificar el registro a predecir conforme a los atributos del modelo formula = income10 ~ age.scale + workclass + education + marital.status + race + gender + hours.per.week.scale Crear un nuevo data.frame con un registro Dar nombres de las columnas igual que el modelo
# filter (datos.Validacion, age == 53 & workclass == 'Local-gov' & education == 'HighGrad' & marital.status == 'Married' & race == 'White' & gender == 'Male' & hours.per.week == 40)
edad <- 53; horas <- 50
a.predecir <- data.frame(rbind(c(edad, 'Local-gov', 'HighGrad', 'Married', 'White' , 'Male', horas)))
colnames(a.predecir) <- c('age.scale', 'workclass', 'education', 'marital.status', 'race', 'gender', 'hours.per.week.scale')
Segundo, escalar los valores numéricos Escalar significa centrar conforme con los valores mínimos y máximo de datos originales Escalar el valor numérico de la edad igualando y con todos los valores de la edad de los datos originale Escalar el valor de horas por semana ‘hours.per.week.scale’ de acuerdo a la columna hours.per.week de todos los datos originales Modificar mutate() las columnas de age y hours.per.week Se toman para este caso el primero registro [1] de los valores escalados: edad.escalada[1] y horas.escalada[1]
edad; horas
## [1] 53
## [1] 50
edad.escalada <- rescale(c(edad, min(datos$age), max(datos$age)))
edad.escalada <- edad.escalada[1]
# Escalando las horas por semana
horas.escalada<- rescale(c(horas, min(datos$hours.per.week), max(datos$hours.per.week)))
horas.escalada <- horas.escalada[1]
a.predecir <- a.predecir %>%
mutate(age.scale = edad.escalada,
hours.per.week.scale = horas.escalada)
a.predecir
## age.scale workclass education marital.status race gender
## 1 0.4931507 Local-gov HighGrad Married White Male
## hours.per.week.scale
## 1 0.5
Tercero, realizar la predicción con el nuevo registro Realizar las predicción Establecer la probabilidad de predicción Determinar si es 0 a 1 la predicción
prediccion <- predict(modelo, a.predecir, se.fit = TRUE)
prediccion
## $fit
## 1
## -0.3608282
##
## $se.fit
## [1] 0.06749201
##
## $residual.scale
## [1] 1
prediccion_prob <- exp(prediccion$fit) / (1 + exp(prediccion$fit))
prediccion_prob
## 1
## 0.4107591
# Predecir si será 0 o 1 conforme a la probabilidad de predicción
if_else(prediccion_prob > 0.5, 1, 0)
## [1] 0
cat("La probabilidad y la predicción de que una persona con esas características gane >50K es: ", if_else(prediccion_prob > 0.5, 1, 0))
## La probabilidad y la predicción de que una persona con esas características gane >50K es: 0
Nuevas predicciones Realizar predicciones con 10 nuevos registros y con características específicas de las personas con la finalidad de determinar si van a ganar más de 50 mil dólares Predecir con 10 nuevos registros Primero, identificar los registros a predecir conforme a los atributos del modelo 10 nuevas observaciones
edad <- c(40,50,60,70,45,60,65,75,35,53)
clase.empleo <- c('Federal-gov', 'State-gov','Never-worked','Self-emp-inc', 'Federal-gov','Private', 'Private', 'Federal-gov', 'State-gov', 'Local-gov')
nivel.educacion <- c('HighGrad', 'HighGrad', 'HighGrad', 'HighGrad', 'Bachelors', 'Bachelors', 'Community', 'Community', 'Master', 'PhD')
edo.civil <- c('Married', 'Married', 'Separated', 'Widow', 'Not_married', 'Married', 'Separated', 'Widow', 'Married', 'Not_married')
raza <- c('White', 'Asian-Pac-Islander', 'Black', 'Other', 'White', 'White', 'Amer-Indian-Eskimo', 'Black', 'White', 'White')
genero <- c('Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male')
horas <- c(50,45,55,58,60,65,70,44,61,53)
a.predecir <- data.frame(rbind(cbind(edad, clase.empleo, nivel.educacion, edo.civil,raza, genero, horas)))
colnames(a.predecir) <- c('age.scale', 'workclass', 'education', 'marital.status', 'race', 'gender', 'hours.per.week.scale')
a.predecir
## age.scale workclass education marital.status race gender
## 1 40 Federal-gov HighGrad Married White Female
## 2 50 State-gov HighGrad Married Asian-Pac-Islander Male
## 3 60 Never-worked HighGrad Separated Black Female
## 4 70 Self-emp-inc HighGrad Widow Other Male
## 5 45 Federal-gov Bachelors Not_married White Female
## 6 60 Private Bachelors Married White Male
## 7 65 Private Community Separated Amer-Indian-Eskimo Female
## 8 75 Federal-gov Community Widow Black Male
## 9 35 State-gov Master Married White Female
## 10 53 Local-gov PhD Not_married White Male
## hours.per.week.scale
## 1 50
## 2 45
## 3 55
## 4 58
## 5 60
## 6 65
## 7 70
## 8 44
## 9 61
## 10 53
Segundo: Escalar la edad y las horas trabajadas Se escalan la edad y las horas trabajadas Los primeros 10 valores escalados son los que interesan porque son 10 nuevos registros Se escalan de igual forma con los valores originales de datos tanto en age como en hours.per.week
edad; horas
## [1] 40 50 60 70 45 60 65 75 35 53
## [1] 50 45 55 58 60 65 70 44 61 53
edad.escalada <- rescale(c(edad, min(datos$age), max(datos$age)))
edad.escalada <- edad.escalada[1:10]
# Escalando las horas por semana
horas.escalada<- rescale(c(horas, min(datos$hours.per.week), max(datos$hours.per.week)))
horas.escalada <- horas.escalada[1:10]
a.predecir <- a.predecir %>%
mutate(age.scale = edad.escalada,
hours.per.week.scale = horas.escalada)
a.predecir
## age.scale workclass education marital.status race gender
## 1 0.3150685 Federal-gov HighGrad Married White Female
## 2 0.4520548 State-gov HighGrad Married Asian-Pac-Islander Male
## 3 0.5890411 Never-worked HighGrad Separated Black Female
## 4 0.7260274 Self-emp-inc HighGrad Widow Other Male
## 5 0.3835616 Federal-gov Bachelors Not_married White Female
## 6 0.5890411 Private Bachelors Married White Male
## 7 0.6575342 Private Community Separated Amer-Indian-Eskimo Female
## 8 0.7945205 Federal-gov Community Widow Black Male
## 9 0.2465753 State-gov Master Married White Female
## 10 0.4931507 Local-gov PhD Not_married White Male
## hours.per.week.scale
## 1 0.5000000
## 2 0.4489796
## 3 0.5510204
## 4 0.5816327
## 5 0.6020408
## 6 0.6530612
## 7 0.7040816
## 8 0.4387755
## 9 0.6122449
## 10 0.5306122
Tercero. Realizar la predicción con los nuevos registros Realizar la predicción Establecer la probabilidad de predicción Determinar si es 0 a 1 la predicción
prediccion <- predict(modelo, a.predecir, se.fit = TRUE)
prediccion
## $fit
## 1 2 3 4 5 6
## -0.12439975 -1.01254072 -11.21715667 -1.54029814 -0.53195390 2.06137511
## 7 8 9 10
## -1.03148774 -0.91627294 1.42400774 -0.06745948
##
## $se.fit
## 1 2 3 4 5 6
## 0.09665512 0.12280131 104.21710499 0.25776451 0.10447450 0.05880670
## 7 8 9 10
## 0.20303050 0.16157401 0.10392565 0.15241508
##
## $residual.scale
## [1] 1
prediccion_prob <- exp(prediccion$fit) / (1 + exp(prediccion$fit))
prediccion_prob
## 1 2 3 4 5 6
## 4.689401e-01 2.664829e-01 1.344141e-05 1.764919e-01 3.700613e-01 8.870920e-01
## 7 8 9 10
## 2.627958e-01 2.857179e-01 8.059659e-01 4.831415e-01
# Predecir si será 0 o 1 conforme a la probabilidad de predicción
las.predicciones <- if_else(prediccion_prob > 0.5, 1, 0)
cat("Son las predicciones para las personas con esas características ")
## Son las predicciones para las personas con esas características
las.predicciones
## [1] 0 0 0 0 0 1 0 0 1 0
print("Las predicciones en la columna final")
## [1] "Las predicciones en la columna final"
cbind(a.predecir ,las.predicciones)
## age.scale workclass education marital.status race gender
## 1 0.3150685 Federal-gov HighGrad Married White Female
## 2 0.4520548 State-gov HighGrad Married Asian-Pac-Islander Male
## 3 0.5890411 Never-worked HighGrad Separated Black Female
## 4 0.7260274 Self-emp-inc HighGrad Widow Other Male
## 5 0.3835616 Federal-gov Bachelors Not_married White Female
## 6 0.5890411 Private Bachelors Married White Male
## 7 0.6575342 Private Community Separated Amer-Indian-Eskimo Female
## 8 0.7945205 Federal-gov Community Widow Black Male
## 9 0.2465753 State-gov Master Married White Female
## 10 0.4931507 Local-gov PhD Not_married White Male
## hours.per.week.scale las.predicciones
## 1 0.5000000 0
## 2 0.4489796 0
## 3 0.5510204 0
## 4 0.5816327 0
## 5 0.6020408 0
## 6 0.6530612 1
## 7 0.7040816 0
## 8 0.4387755 0
## 9 0.6122449 1
## 10 0.5306122 0