library(tidyverse) # varias
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ----------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts -------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) # select filter mutate ...
library(ggplot2) # Gráficas
library(fdth) # Para tablas de distribución y frecuencias
##
## Attaching package: 'fdth'
## The following objects are masked from 'package:stats':
##
## sd, var
library(knitr) # Para ver tablas mas amigables en formato html markdown
library(caret) # Pra particionar datos
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(reshape) # Para renombrar columnas en caso de necesitarse
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
library(scales) # Para escalar datos
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Read the adults CSV into the tibble `datos` with readr's read_csv().
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file location exists.
datos <- read_csv(
  file = "C:/Users/crep_/Documents/8vo semestre/Analisis de datos inteligentes/datos/adultos.csv"
)
## Parsed with column specification:
## cols(
## x = col_double(),
## age = col_double(),
## workclass = col_character(),
## education = col_character(),
## `educational-num` = col_double(),
## `marital-status` = col_character(),
## race = col_character(),
## gender = col_character(),
## `hours-per-week` = col_double(),
## income = col_character()
## )
# Preview the first six rows of the loaded data (head()'s default count).
head(datos, n = 6)
## # A tibble: 6 x 10
## x age workclass education `educational-nu~ `marital-status` race gender
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 1 25 Private 11th 7 Never-married Black Male
## 2 2 38 Private HS-grad 9 Married-civ-spo~ White Male
## 3 3 28 Local-gov Assoc-ac~ 12 Married-civ-spo~ White Male
## 4 4 44 Private Some-col~ 10 Married-civ-spo~ Black Male
## 5 5 18 ? Some-col~ 10 Never-married White Female
## 6 6 34 Private 10th 6 Never-married White Male
## # ... with 2 more variables: `hours-per-week` <dbl>, income <chr>
# The first ten records, rendered as a markdown table
datos %>%
  head(n = 10) %>%
  kable()
| x | age | workclass | education | educational-num | marital-status | race | gender | hours-per-week | income |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 25 | Private | 11th | 7 | Never-married | Black | Male | 40 | <=50K |
| 2 | 38 | Private | HS-grad | 9 | Married-civ-spouse | White | Male | 50 | <=50K |
| 3 | 28 | Local-gov | Assoc-acdm | 12 | Married-civ-spouse | White | Male | 40 | >50K |
| 4 | 44 | Private | Some-college | 10 | Married-civ-spouse | Black | Male | 40 | >50K |
| 5 | 18 | ? | Some-college | 10 | Never-married | White | Female | 30 | <=50K |
| 6 | 34 | Private | 10th | 6 | Never-married | White | Male | 30 | <=50K |
| 7 | 29 | ? | HS-grad | 9 | Never-married | Black | Male | 40 | <=50K |
| 8 | 63 | Self-emp-not-inc | Prof-school | 15 | Married-civ-spouse | White | Male | 32 | >50K |
| 9 | 24 | Private | Some-college | 10 | Never-married | White | Female | 40 | <=50K |
| 10 | 55 | Private | 7th-8th | 4 | Married-civ-spouse | White | Male | 10 | <=50K |
## The last ten records, rendered as a markdown table
datos %>%
  tail(n = 10) %>%
  kable()
| x | age | workclass | education | educational-num | marital-status | race | gender | hours-per-week | income |
|---|---|---|---|---|---|---|---|---|---|
| 48833 | 32 | Private | 10th | 6 | Married-civ-spouse | Amer-Indian-Eskimo | Male | 40 | <=50K |
| 48834 | 43 | Private | Assoc-voc | 11 | Married-civ-spouse | White | Male | 45 | <=50K |
| 48835 | 32 | Private | Masters | 14 | Never-married | Asian-Pac-Islander | Male | 11 | <=50K |
| 48836 | 53 | Private | Masters | 14 | Married-civ-spouse | White | Male | 40 | >50K |
| 48837 | 22 | Private | Some-college | 10 | Never-married | White | Male | 40 | <=50K |
| 48838 | 27 | Private | Assoc-acdm | 12 | Married-civ-spouse | White | Female | 38 | <=50K |
| 48839 | 40 | Private | HS-grad | 9 | Married-civ-spouse | White | Male | 40 | >50K |
| 48840 | 58 | Private | HS-grad | 9 | Widowed | White | Female | 40 | <=50K |
| 48841 | 22 | Private | HS-grad | 9 | Never-married | White | Male | 20 | <=50K |
| 48842 | 52 | Self-emp-inc | HS-grad | 9 | Married-civ-spouse | White | Female | 40 | >50K |

* La estructura de los datos
* Resumen de los datos:
    - `x`: variable de consecutivo de los datos
    - `age`: la edad de la persona
    - `workclass`: es un tipo o clase de trabajo de la persona (privado, gobierno, por su cuenta, etc.)
    - `education`: indica el nivel educativo de la persona
    - `educational-num`: es el valor numérico de `education`
    - `marital-status`: es su estado civil
    - `race`: es el tipo de raza de la persona
    - `gender`: es el género de la persona
    - `hours-per-week`: son las horas que trabaja por semana
    - `income`: son los ingresos

El resumen se calcula sobre `datos[-1]`, es decir, todas las columnas excepto la columna `x`, que no interesa.
# Show the structure of `datos`: column names, types and leading values.
datos %>% str()
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 48842 obs. of 10 variables:
## $ x : num 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : chr "Private" "Private" "Local-gov" "Private" ...
## $ education : chr "11th" "HS-grad" "Assoc-acdm" "Some-college" ...
## $ educational-num: num 7 9 12 10 10 6 9 15 10 4 ...
## $ marital-status : chr "Never-married" "Married-civ-spouse" "Married-civ-spouse" "Married-civ-spouse" ...
## $ race : chr "Black" "White" "White" "Black" ...
## $ gender : chr "Male" "Male" "Male" "Male" ...
## $ hours-per-week : num 40 50 40 40 30 30 40 32 40 10 ...
## $ income : chr "<=50K" "<=50K" ">50K" ">50K" ...
## - attr(*, "spec")=
## .. cols(
## .. x = col_double(),
## .. age = col_double(),
## .. workclass = col_character(),
## .. education = col_character(),
## .. `educational-num` = col_double(),
## .. `marital-status` = col_character(),
## .. race = col_character(),
## .. gender = col_character(),
## .. `hours-per-week` = col_double(),
## .. income = col_character()
## .. )
# Summarise every column except the first one (the running index `x`)
# and render the result as a markdown table.
datos[-1] %>%
  summary() %>%
  kable()
| age | workclass | education | educational-num | marital-status | race | gender | hours-per-week | income | |
|---|---|---|---|---|---|---|---|---|---|
| Min. :17.00 | Length:48842 | Length:48842 | Min. : 1.00 | Length:48842 | Length:48842 | Length:48842 | Min. : 1.00 | Length:48842 | |
| 1st Qu.:28.00 | Class :character | Class :character | 1st Qu.: 9.00 | Class :character | Class :character | Class :character | 1st Qu.:40.00 | Class :character | |
| Median :37.00 | Mode :character | Mode :character | Median :10.00 | Mode :character | Mode :character | Mode :character | Median :40.00 | Mode :character | |
| Mean :38.64 | NA | NA | Mean :10.08 | NA | NA | NA | Mean :40.42 | NA | |
| 3rd Qu.:48.00 | NA | NA | 3rd Qu.:12.00 | NA | NA | NA | 3rd Qu.:45.00 | NA | |
| Max. :90.00 | NA | NA | Max. :16.00 | NA | NA | NA | Max. :99.00 | NA |
Los pasos 1 al 4 se pueden integrar en las fases de ciencia de datos de carga, limpieza y exploración de los datos.
A partir del paso 5 se construye un modelo de regresión logística para hacer predicciones.
paso 1: Identificar variables numéricas
paso 2: Identificar variables factor
paso 3: Ingeniería de datos
paso 4: Estadísticos descriptivos
paso 5: Conjunto de datos de entrenamiento y de validación Train/test set
paso 6: Modelo de regresión logística
paso 7: Evaluar el modelo
paso 8: Predicciones con datos de entrenamiento
Se utiliza `select_if()` para seleccionar ciertas variables del conjunto de datos; `select_if()` es una función de la librería dplyr. Se analizan dos variables numéricas:

- `hours-per-week`
- `age`

`educational-num` es el valor numérico del factor `education`, por lo que se analiza junto con las variables de tipo factor más adelante.
# Keep only the numeric columns of `datos`.
numericas <- datos %>% select_if(is.numeric)

# Summarise the numeric columns, dropping the first one by position
# (the running index `x` in the loaded data), as a markdown table.
numericas[-1] %>%
  summary() %>%
  kable()
| age | educational-num | hours-per-week | |
|---|---|---|---|
| Min. :17.00 | Min. : 1.00 | Min. : 1.00 | |
| 1st Qu.:28.00 | 1st Qu.: 9.00 | 1st Qu.:40.00 | |
| Median :37.00 | Median :10.00 | Median :40.00 | |
| Mean :38.64 | Mean :10.08 | Mean :40.42 | |
| 3rd Qu.:48.00 | 3rd Qu.:12.00 | 3rd Qu.:45.00 | |
| Max. :90.00 | Max. :16.00 | Max. :99.00 |