R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#VERGARA HERNANDEZ JESUS ALEJANDRO
#NUMERO DE CONTROL: 16040461
#PRACTICA 11
library(tidyverse) # varias
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 3.0.0 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr) # select filter mutate ...
library(ggplot2) # Gráficas
library(fdth) # Para tablas de distribución y frecuencias
##
## Attaching package: 'fdth'
## The following objects are masked from 'package:stats':
##
## sd, var
library(knitr) # Para ver tablas mas amigables en formato html markdown
library(caret) # Pra particionar datos
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(reshape) # Para renombrar columnas en caso de necesitarse
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(readr)
datos <- read.csv("C:/Users/esemi/OneDrive/Documentos/RSTUDIO/adultos.csv")
head(datos)
## x age workclass education educational.num marital.status race gender
## 1 1 25 Private 11th 7 Never-married Black Male
## 2 2 38 Private HS-grad 9 Married-civ-spouse White Male
## 3 3 28 Local-gov Assoc-acdm 12 Married-civ-spouse White Male
## 4 4 44 Private Some-college 10 Married-civ-spouse Black Male
## 5 5 18 ? Some-college 10 Never-married White Female
## 6 6 34 Private 10th 6 Never-married White Male
## hours.per.week income
## 1 40 <=50K
## 2 50 <=50K
## 3 40 >50K
## 4 40 >50K
## 5 30 <=50K
## 6 30 <=50K
kable(head(datos, 10))
1 |
25 |
Private |
11th |
7 |
Never-married |
Black |
Male |
40 |
<=50K |
2 |
38 |
Private |
HS-grad |
9 |
Married-civ-spouse |
White |
Male |
50 |
<=50K |
3 |
28 |
Local-gov |
Assoc-acdm |
12 |
Married-civ-spouse |
White |
Male |
40 |
>50K |
4 |
44 |
Private |
Some-college |
10 |
Married-civ-spouse |
Black |
Male |
40 |
>50K |
5 |
18 |
? |
Some-college |
10 |
Never-married |
White |
Female |
30 |
<=50K |
6 |
34 |
Private |
10th |
6 |
Never-married |
White |
Male |
30 |
<=50K |
7 |
29 |
? |
HS-grad |
9 |
Never-married |
Black |
Male |
40 |
<=50K |
8 |
63 |
Self-emp-not-inc |
Prof-school |
15 |
Married-civ-spouse |
White |
Male |
32 |
>50K |
9 |
24 |
Private |
Some-college |
10 |
Never-married |
White |
Female |
40 |
<=50K |
10 |
55 |
Private |
7th-8th |
4 |
Married-civ-spouse |
White |
Male |
10 |
<=50K |
kable(tail(datos,10))
48833 |
48833 |
32 |
Private |
10th |
6 |
Married-civ-spouse |
Amer-Indian-Eskimo |
Male |
40 |
<=50K |
48834 |
48834 |
43 |
Private |
Assoc-voc |
11 |
Married-civ-spouse |
White |
Male |
45 |
<=50K |
48835 |
48835 |
32 |
Private |
Masters |
14 |
Never-married |
Asian-Pac-Islander |
Male |
11 |
<=50K |
48836 |
48836 |
53 |
Private |
Masters |
14 |
Married-civ-spouse |
White |
Male |
40 |
>50K |
48837 |
48837 |
22 |
Private |
Some-college |
10 |
Never-married |
White |
Male |
40 |
<=50K |
48838 |
48838 |
27 |
Private |
Assoc-acdm |
12 |
Married-civ-spouse |
White |
Female |
38 |
<=50K |
48839 |
48839 |
40 |
Private |
HS-grad |
9 |
Married-civ-spouse |
White |
Male |
40 |
>50K |
48840 |
48840 |
58 |
Private |
HS-grad |
9 |
Widowed |
White |
Female |
40 |
<=50K |
48841 |
48841 |
22 |
Private |
HS-grad |
9 |
Never-married |
White |
Male |
20 |
<=50K |
48842 |
48842 |
52 |
Self-emp-inc |
HS-grad |
9 |
Married-civ-spouse |
White |
Female |
40 |
>50K |
str(datos)
## 'data.frame': 48842 obs. of 10 variables:
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 25 38 28 44 18 34 29 63 24 55 ...
## $ workclass : Factor w/ 9 levels "?","Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ educational.num: int 7 9 12 10 10 6 9 15 10 4 ...
## $ marital.status : Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ hours.per.week : int 40 50 40 40 30 30 40 32 40 10 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...
kable(summary(datos[-1]))
|
Min. :17.00 |
Private :33906 |
HS-grad :15784 |
Min. : 1.00 |
Divorced : 6633 |
Amer-Indian-Eskimo: 470 |
Female:16192 |
Min. : 1.00 |
<=50K:37155 |
|
1st Qu.:28.00 |
Self-emp-not-inc: 3862 |
Some-college:10878 |
1st Qu.: 9.00 |
Married-AF-spouse : 37 |
Asian-Pac-Islander: 1519 |
Male :32650 |
1st Qu.:40.00 |
>50K :11687 |
|
Median :37.00 |
Local-gov : 3136 |
Bachelors : 8025 |
Median :10.00 |
Married-civ-spouse :22379 |
Black : 4685 |
NA |
Median :40.00 |
NA |
|
Mean :38.64 |
? : 2799 |
Masters : 2657 |
Mean :10.08 |
Married-spouse-absent: 628 |
Other : 406 |
NA |
Mean :40.42 |
NA |
|
3rd Qu.:48.00 |
State-gov : 1981 |
Assoc-voc : 2061 |
3rd Qu.:12.00 |
Never-married :16117 |
White :41762 |
NA |
3rd Qu.:45.00 |
NA |
|
Max. :90.00 |
Self-emp-inc : 1695 |
11th : 1812 |
Max. :16.00 |
Separated : 1530 |
NA |
NA |
Max. :99.00 |
NA |
|
NA |
(Other) : 1463 |
(Other) : 7625 |
NA |
Widowed : 1518 |
NA |
NA |
NA |
NA |
numericas <-select_if(datos, is.numeric)
kable(summary(numericas[-1]))
|
Min. :17.00 |
Min. : 1.00 |
Min. : 1.00 |
|
1st Qu.:28.00 |
1st Qu.: 9.00 |
1st Qu.:40.00 |
|
Median :37.00 |
Median :10.00 |
Median :40.00 |
|
Mean :38.64 |
Mean :10.08 |
Mean :40.42 |
|
3rd Qu.:48.00 |
3rd Qu.:12.00 |
3rd Qu.:45.00 |
|
Max. :90.00 |
Max. :16.00 |
Max. :99.00 |
ggplot(numericas, aes(x=hours.per.week)) +
geom_density(alpha = .2, fill = "#FF6666")

distribucion <- fdt(numericas$hours.per.week,breaks="Sturges")
kable(distribucion)
[0.99,6.8) |
410 |
0.0083944 |
0.8394415 |
410 |
0.8394415 |
[6.8,13) |
982 |
0.0201056 |
2.0105647 |
1392 |
2.8500061 |
[13,18) |
1180 |
0.0241595 |
2.4159535 |
2572 |
5.2659596 |
[18,24) |
2383 |
0.0487900 |
4.8789976 |
4955 |
10.1449572 |
[24,30) |
2896 |
0.0592932 |
5.9293231 |
7851 |
16.0742803 |
[30,36) |
2481 |
0.0507964 |
5.0796446 |
10332 |
21.1539249 |
[36,42) |
24217 |
0.4958233 |
49.5823267 |
34549 |
70.7362516 |
[42,48) |
3803 |
0.0778633 |
7.7863314 |
38352 |
78.5225830 |
[48,53) |
5319 |
0.1089022 |
10.8902174 |
43671 |
89.4128005 |
[53,59) |
1318 |
0.0269850 |
2.6984972 |
44989 |
92.1112977 |
[59,65) |
2596 |
0.0531510 |
5.3150977 |
47585 |
97.4263953 |
[65,71) |
483 |
0.0098890 |
0.9889030 |
48068 |
98.4152983 |
[71,77) |
223 |
0.0045657 |
0.4565743 |
48291 |
98.8718726 |
[77,83) |
237 |
0.0048524 |
0.4852381 |
48528 |
99.3571107 |
[83,88) |
98 |
0.0020065 |
0.2006470 |
48626 |
99.5577577 |
[88,94) |
52 |
0.0010647 |
0.1064657 |
48678 |
99.6642234 |
[94,1e+02) |
164 |
0.0033578 |
0.3357766 |
48842 |
100.0000000 |
|
start |
0.990000 |
end |
99.990000 |
h |
5.823529 |
right |
0.000000 |
|
barplot(height = distribucion$table$f, names.arg = distribucion$table$`Class limits`)

ggplot(numericas, aes(x = age)) +
geom_density(alpha = .2, fill = "#FF6666")

barplot(height = distribucion$table$f, names.arg = distribucion$table$`Class limits`)

escalados <- datos[-1] %>%
mutate(age.scale = rescale(age),educational.num.scale = rescale(educational.num), hours.per.week.scale = rescale(hours.per.week) )
head(escalados, 10)
## age workclass education educational.num marital.status race
## 1 25 Private 11th 7 Never-married Black
## 2 38 Private HS-grad 9 Married-civ-spouse White
## 3 28 Local-gov Assoc-acdm 12 Married-civ-spouse White
## 4 44 Private Some-college 10 Married-civ-spouse Black
## 5 18 ? Some-college 10 Never-married White
## 6 34 Private 10th 6 Never-married White
## 7 29 ? HS-grad 9 Never-married Black
## 8 63 Self-emp-not-inc Prof-school 15 Married-civ-spouse White
## 9 24 Private Some-college 10 Never-married White
## 10 55 Private 7th-8th 4 Married-civ-spouse White
## gender hours.per.week income age.scale educational.num.scale
## 1 Male 40 <=50K 0.10958904 0.4000000
## 2 Male 50 <=50K 0.28767123 0.5333333
## 3 Male 40 >50K 0.15068493 0.7333333
## 4 Male 40 >50K 0.36986301 0.6000000
## 5 Female 30 <=50K 0.01369863 0.6000000
## 6 Male 30 <=50K 0.23287671 0.3333333
## 7 Male 40 <=50K 0.16438356 0.5333333
## 8 Male 32 >50K 0.63013699 0.9333333
## 9 Female 40 <=50K 0.09589041 0.6000000
## 10 Male 10 <=50K 0.52054795 0.2000000
## hours.per.week.scale
## 1 0.39795918
## 2 0.50000000
## 3 0.39795918
## 4 0.39795918
## 5 0.29591837
## 6 0.29591837
## 7 0.39795918
## 8 0.31632653
## 9 0.39795918
## 10 0.09183673
tail(escalados, 10)
## age workclass education educational.num marital.status
## 48833 32 Private 10th 6 Married-civ-spouse
## 48834 43 Private Assoc-voc 11 Married-civ-spouse
## 48835 32 Private Masters 14 Never-married
## 48836 53 Private Masters 14 Married-civ-spouse
## 48837 22 Private Some-college 10 Never-married
## 48838 27 Private Assoc-acdm 12 Married-civ-spouse
## 48839 40 Private HS-grad 9 Married-civ-spouse
## 48840 58 Private HS-grad 9 Widowed
## 48841 22 Private HS-grad 9 Never-married
## 48842 52 Self-emp-inc HS-grad 9 Married-civ-spouse
## race gender hours.per.week income age.scale
## 48833 Amer-Indian-Eskimo Male 40 <=50K 0.20547945
## 48834 White Male 45 <=50K 0.35616438
## 48835 Asian-Pac-Islander Male 11 <=50K 0.20547945
## 48836 White Male 40 >50K 0.49315068
## 48837 White Male 40 <=50K 0.06849315
## 48838 White Female 38 <=50K 0.13698630
## 48839 White Male 40 >50K 0.31506849
## 48840 White Female 40 <=50K 0.56164384
## 48841 White Male 20 <=50K 0.06849315
## 48842 White Female 40 >50K 0.47945205
## educational.num.scale hours.per.week.scale
## 48833 0.3333333 0.3979592
## 48834 0.6666667 0.4489796
## 48835 0.8666667 0.1020408
## 48836 0.8666667 0.3979592
## 48837 0.6000000 0.3979592
## 48838 0.7333333 0.3775510
## 48839 0.5333333 0.3979592
## 48840 0.5333333 0.3979592
## 48841 0.5333333 0.1938776
## 48842 0.5333333 0.3979592
recategorizados <- escalados %>%
mutate(education = factor(ifelse(education == "Preschool" | education == "10th" | education == "11th" | education == "12th" | education == "1st-4th" | education == "5th-6th" | education == "7th-8th" | education == "9th", "Dropout", ifelse(education == "HS-grad", "HighGrad", ifelse(education == "Some-college" | education == "Assoc-acdm" | education == "Assoc-voc", "Community",ifelse(education == "Bachelors", "Bachelors",
ifelse(education == "Masters" | education == "Prof-school", "Master", "PhD")))))))
kable(head(recategorizados))
25 |
Private |
Dropout |
7 |
Never-married |
Black |
Male |
40 |
<=50K |
0.1095890 |
0.4000000 |
0.3979592 |
38 |
Private |
HighGrad |
9 |
Married-civ-spouse |
White |
Male |
50 |
<=50K |
0.2876712 |
0.5333333 |
0.5000000 |
28 |
Local-gov |
Community |
12 |
Married-civ-spouse |
White |
Male |
40 |
>50K |
0.1506849 |
0.7333333 |
0.3979592 |
44 |
Private |
Community |
10 |
Married-civ-spouse |
Black |
Male |
40 |
>50K |
0.3698630 |
0.6000000 |
0.3979592 |
18 |
? |
Community |
10 |
Never-married |
White |
Female |
30 |
<=50K |
0.0136986 |
0.6000000 |
0.2959184 |
34 |
Private |
Dropout |
6 |
Never-married |
White |
Male |
30 |
<=50K |
0.2328767 |
0.3333333 |
0.2959184 |
recategorizados %>%
group_by(education) %>%
summarize(promedio_educacion = mean(educational.num),
cuantos = n()) %>%
arrange(promedio_educacion)
## # A tibble: 6 x 3
## education promedio_educacion cuantos
## <fct> <dbl> <int>
## 1 Dropout 5.61 6408
## 2 HighGrad 9 15784
## 3 Community 10.4 14540
## 4 Bachelors 13 8025
## 5 Master 14.2 3491
## 6 PhD 16 594
temporal <- recategorizados %>%
mutate(marital.status=factor(ifelse(marital.status=="Never-married" | marital.status=="Married-spouse-absent","Not_married",ifelse(marital.status == "Married-civ-spouse" | marital.status=="Married-AF-spouse","Married",ifelse(marital.status=="Divorced" | marital.status=="Separated","Separated","Widow")))))
recategorizados <- temporal
kable(head(recategorizados))
25 |
Private |
Dropout |
7 |
Not_married |
Black |
Male |
40 |
<=50K |
0.1095890 |
0.4000000 |
0.3979592 |
38 |
Private |
HighGrad |
9 |
Married |
White |
Male |
50 |
<=50K |
0.2876712 |
0.5333333 |
0.5000000 |
28 |
Local-gov |
Community |
12 |
Married |
White |
Male |
40 |
>50K |
0.1506849 |
0.7333333 |
0.3979592 |
44 |
Private |
Community |
10 |
Married |
Black |
Male |
40 |
>50K |
0.3698630 |
0.6000000 |
0.3979592 |
18 |
? |
Community |
10 |
Not_married |
White |
Female |
30 |
<=50K |
0.0136986 |
0.6000000 |
0.2959184 |
34 |
Private |
Dropout |
6 |
Not_married |
White |
Male |
30 |
<=50K |
0.2328767 |
0.3333333 |
0.2959184 |
table(recategorizados$marital.status)
##
## Married Not_married Separated Widow
## 22416 16745 8163 1518
ggplot(recategorizados, aes(x = gender, fill = income)) +
geom_bar(position = "fill") +
theme_classic()

ggplot(recategorizados, aes(x = race, fill = income)) +
geom_bar(position = "fill") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))

head(recategorizados)
## age workclass education educational.num marital.status race gender
## 1 25 Private Dropout 7 Not_married Black Male
## 2 38 Private HighGrad 9 Married White Male
## 3 28 Local-gov Community 12 Married White Male
## 4 44 Private Community 10 Married Black Male
## 5 18 ? Community 10 Not_married White Female
## 6 34 Private Dropout 6 Not_married White Male
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 40 <=50K 0.10958904 0.4000000 0.3979592
## 2 50 <=50K 0.28767123 0.5333333 0.5000000
## 3 40 >50K 0.15068493 0.7333333 0.3979592
## 4 40 >50K 0.36986301 0.6000000 0.3979592
## 5 30 <=50K 0.01369863 0.6000000 0.2959184
## 6 30 <=50K 0.23287671 0.3333333 0.2959184
ggplot(recategorizados, aes(x = gender, y = hours.per.week)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
size = 3,
color = "steelblue") +
theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.

ggplot(recategorizados, aes(x = hours.per.week)) +
geom_density(aes(color = education), alpha = 0.5) +
theme_classic()

ggplot(recategorizados, aes(x = age, y = hours.per.week)) +
geom_point(aes(color = income),
size = 0.5) +
stat_smooth(method = 'lm',
formula = y~poly(x, 2),
se = TRUE,
aes(color = income)) +
theme_classic()

ggplot(recategorizados, aes(x = education, y = hours.per.week)) +
geom_point(aes(color = income),
size = 0.5) +
stat_smooth(method = 'lm',
formula = y~poly(x, 2),
se = TRUE,
aes(color = income)) +
theme_classic()

recategorizados <- recategorizados %>%
mutate(income10 = recode(income,"<=50K" = 0,">50K" = 1))
head(recategorizados[,c(9,13)])
## income income10
## 1 <=50K 0
## 2 <=50K 0
## 3 >50K 1
## 4 >50K 1
## 5 <=50K 0
## 6 <=50K 0
names(recategorizados)
## [1] "age" "workclass" "education"
## [4] "educational.num" "marital.status" "race"
## [7] "gender" "hours.per.week" "income"
## [10] "age.scale" "educational.num.scale" "hours.per.week.scale"
## [13] "income10"
head(recategorizados)
## age workclass education educational.num marital.status race gender
## 1 25 Private Dropout 7 Not_married Black Male
## 2 38 Private HighGrad 9 Married White Male
## 3 28 Local-gov Community 12 Married White Male
## 4 44 Private Community 10 Married Black Male
## 5 18 ? Community 10 Not_married White Female
## 6 34 Private Dropout 6 Not_married White Male
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 40 <=50K 0.10958904 0.4000000 0.3979592
## 2 50 <=50K 0.28767123 0.5333333 0.5000000
## 3 40 >50K 0.15068493 0.7333333 0.3979592
## 4 40 >50K 0.36986301 0.6000000 0.3979592
## 5 30 <=50K 0.01369863 0.6000000 0.2959184
## 6 30 <=50K 0.23287671 0.3333333 0.2959184
## income10
## 1 0
## 2 0
## 3 1
## 4 1
## 5 0
## 6 0
write.csv(recategorizados, file="adultos_clean.csv")
dir()
## [1] "adultos_clean.csv" "ALEJANDRO CUENTAS Y PASSWORD.url"
## [3] "Arduino" "ArduinoData"
## [5] "Bandicam" "Base de daots Unidad 4"
## [7] "BCoGfWQCcAA_939.jpg" "Bloc de notas de Jesus.url"
## [9] "BzEouGVIAAEEjrH.jpg" "Camtasia Studio"
## [11] "Cliente_TELNET_TFTP.pptx" "cN7W7JM.jpg"
## [13] "CUBOS ACUATICOS.docx" "DESARROLLO BB.pptx"
## [15] "descarga.jpg" "desktop.ini"
## [17] "DOCKERFILE PASOS.docx" "Empresa.accdb"
## [19] "enlace.txt" "Entonces.docx"
## [21] "examen metodos.txt" "examenes"
## [23] "FEMJOY22-682x1024.jpg" "Formulario2.html"
## [25] "hoja rayada.docx" "justin bb.txt"
## [27] "los 5 poemas del romanticismo.docx" "MAnual de seguridad.docx"
## [29] "Manual_colegioMilitar(1).docx" "Manual_colegioMilitar.docx"
## [31] "MEGA" "Metdo baisteain.xlsx"
## [33] "mierda.webp" "Modelo 2.mwb"
## [35] "Modelo 2.mwb.bak" "Modelo1.mwb"
## [37] "Modelo1.mwb.bak" "My Cheat Tables"
## [39] "NetBeansProjects" "NMP [Autoguardado].pptx"
## [41] "NMP.pptx" "PCSX2"
## [43] "Plantillas personalizadas de Office" "practica"
## [45] "practica11.Rmd" "practica9.html"
## [47] "practica9.Rmd" "Presentación1.pptx"
## [49] "PROBA" "prueba.html"
## [51] "R" "rsconnect"
## [53] "RSTUDIO" "Solar Winds.docx"
## [55] "UNIDAD 4 TOPICOS" "Virtual Machines"
## [57] "WindowsPowerShell"
nrow(recategorizados)
## [1] 48842
set.seed(2020)
entrena <- createDataPartition(recategorizados$income, p=0.7, list = FALSE)
datos.Entrena <- recategorizados[entrena,]
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Entrena)
## [1] 34190
kable(head(datos.Entrena))
1 |
25 |
Private |
Dropout |
7 |
Not_married |
Black |
Male |
40 |
<=50K |
0.1095890 |
0.4000000 |
0.3979592 |
0 |
2 |
38 |
Private |
HighGrad |
9 |
Married |
White |
Male |
50 |
<=50K |
0.2876712 |
0.5333333 |
0.5000000 |
0 |
3 |
28 |
Local-gov |
Community |
12 |
Married |
White |
Male |
40 |
>50K |
0.1506849 |
0.7333333 |
0.3979592 |
1 |
4 |
44 |
Private |
Community |
10 |
Married |
Black |
Male |
40 |
>50K |
0.3698630 |
0.6000000 |
0.3979592 |
1 |
5 |
18 |
? |
Community |
10 |
Not_married |
White |
Female |
30 |
<=50K |
0.0136986 |
0.6000000 |
0.2959184 |
0 |
7 |
29 |
? |
HighGrad |
9 |
Not_married |
Black |
Male |
40 |
<=50K |
0.1643836 |
0.5333333 |
0.3979592 |
0 |
kable(tail(datos.Entrena))
48834 |
43 |
Private |
Community |
11 |
Married |
White |
Male |
45 |
<=50K |
0.3561644 |
0.6666667 |
0.4489796 |
0 |
48835 |
32 |
Private |
Master |
14 |
Not_married |
Asian-Pac-Islander |
Male |
11 |
<=50K |
0.2054795 |
0.8666667 |
0.1020408 |
0 |
48838 |
27 |
Private |
Community |
12 |
Married |
White |
Female |
38 |
<=50K |
0.1369863 |
0.7333333 |
0.3775510 |
0 |
48840 |
58 |
Private |
HighGrad |
9 |
Widow |
White |
Female |
40 |
<=50K |
0.5616438 |
0.5333333 |
0.3979592 |
0 |
48841 |
22 |
Private |
HighGrad |
9 |
Not_married |
White |
Male |
20 |
<=50K |
0.0684932 |
0.5333333 |
0.1938776 |
0 |
48842 |
52 |
Self-emp-inc |
HighGrad |
9 |
Married |
White |
Female |
40 |
>50K |
0.4794521 |
0.5333333 |
0.3979592 |
1 |
datos.Validacion <- recategorizados[-entrena,]
nrow(datos.Validacion)
## [1] 14652
kable(head(datos.Validacion))
6 |
34 |
Private |
Dropout |
6 |
Not_married |
White |
Male |
30 |
<=50K |
0.2328767 |
0.3333333 |
0.2959184 |
0 |
15 |
48 |
Private |
HighGrad |
9 |
Married |
White |
Male |
48 |
>50K |
0.4246575 |
0.5333333 |
0.4795918 |
1 |
17 |
20 |
State-gov |
Community |
10 |
Not_married |
White |
Male |
25 |
<=50K |
0.0410959 |
0.6000000 |
0.2448980 |
0 |
36 |
65 |
? |
HighGrad |
9 |
Married |
White |
Male |
40 |
<=50K |
0.6575342 |
0.5333333 |
0.3979592 |
0 |
41 |
65 |
Private |
Master |
14 |
Married |
White |
Male |
50 |
>50K |
0.6575342 |
0.8666667 |
0.5000000 |
1 |
49 |
52 |
Private |
Dropout |
7 |
Separated |
Black |
Female |
18 |
<=50K |
0.4794521 |
0.4000000 |
0.1734694 |
0 |
kable(tail(datos.Validacion))
48826 |
31 |
Private |
Master |
14 |
Separated |
Other |
Female |
30 |
<=50K |
0.1917808 |
0.8666667 |
0.2959184 |
0 |
48830 |
65 |
Self-emp-not-inc |
Master |
15 |
Not_married |
White |
Male |
60 |
<=50K |
0.6575342 |
0.9333333 |
0.6020408 |
0 |
48832 |
43 |
Self-emp-not-inc |
Community |
10 |
Married |
White |
Male |
50 |
<=50K |
0.3561644 |
0.6000000 |
0.5000000 |
0 |
48836 |
53 |
Private |
Master |
14 |
Married |
White |
Male |
40 |
>50K |
0.4931507 |
0.8666667 |
0.3979592 |
1 |
48837 |
22 |
Private |
Community |
10 |
Not_married |
White |
Male |
40 |
<=50K |
0.0684932 |
0.6000000 |
0.3979592 |
0 |
48839 |
40 |
Private |
HighGrad |
9 |
Married |
White |
Male |
40 |
>50K |
0.3150685 |
0.5333333 |
0.3979592 |
1 |
formula = income10 ~ age.scale + workclass + education + marital.status + race + gender + hours.per.week.scale
modelo <- glm(formula, data = datos.Entrena, family = 'binomial')
summary(modelo)
##
## Call:
## glm(formula = formula, family = "binomial", data = datos.Entrena)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7337 -0.5768 -0.2588 -0.0654 3.3492
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.419e+00 2.228e-01 -10.858 < 2e-16 ***
## age.scale 2.224e+00 1.053e-01 21.121 < 2e-16 ***
## workclassFederal-gov 1.421e+00 1.237e-01 11.485 < 2e-16 ***
## workclassLocal-gov 6.942e-01 1.100e-01 6.312 2.76e-10 ***
## workclassNever-worked -8.124e+00 1.042e+02 -0.078 0.9379
## workclassPrivate 8.124e-01 9.598e-02 8.464 < 2e-16 ***
## workclassSelf-emp-inc 1.218e+00 1.186e-01 10.270 < 2e-16 ***
## workclassSelf-emp-not-inc 1.878e-01 1.071e-01 1.753 0.0797 .
## workclassState-gov 5.339e-01 1.223e-01 4.367 1.26e-05 ***
## workclassWithout-pay -3.965e-01 8.276e-01 -0.479 0.6318
## educationCommunity -9.930e-01 4.428e-02 -22.426 < 2e-16 ***
## educationDropout -2.782e+00 7.802e-02 -35.657 < 2e-16 ***
## educationHighGrad -1.611e+00 4.523e-02 -35.610 < 2e-16 ***
## educationMaster 6.250e-01 6.110e-02 10.230 < 2e-16 ***
## educationPhD 1.077e+00 1.379e-01 7.814 5.55e-15 ***
## marital.statusNot_married -2.491e+00 5.355e-02 -46.511 < 2e-16 ***
## marital.statusSeparated -2.102e+00 5.650e-02 -37.214 < 2e-16 ***
## marital.statusWidow -2.163e+00 1.287e-01 -16.809 < 2e-16 ***
## raceAsian-Pac-Islander -2.461e-02 2.074e-01 -0.119 0.9055
## raceBlack 4.784e-04 1.968e-01 0.002 0.9981
## raceOther -9.881e-02 2.817e-01 -0.351 0.7258
## raceWhite 2.155e-01 1.876e-01 1.148 0.2509
## genderMale 9.432e-02 4.455e-02 2.117 0.0342 *
## hours.per.week.scale 3.136e+00 1.398e-01 22.430 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 37626 on 34189 degrees of freedom
## Residual deviance: 25011 on 34166 degrees of freedom
## AIC: 25059
##
## Number of Fisher Scoring iterations: 11
comparar <- data.frame(datos.Entrena$income10, as.vector(modelo$fitted.values) )
comparar <- comparar %>%
mutate(income10ajustados = if_else (modelo$fitted.values > 0.5, 1, 0))
colnames(comparar) <- c("income10", "ajuste", 'income10ajustados')
head(comparar)
## income10 ajuste income10ajustados
## 1 0 0.005003194 0
## 2 0 0.331904483 0
## 3 1 0.304736019 0
## 4 1 0.393130183 0
## 5 0 0.008761445 0
## 6 0 0.008068021 0
tail(comparar)
## income10 ajuste income10ajustados
## 34185 0 0.47760555 0
## 34186 0 0.06751504 0
## 34187 0 0.28999271 0
## 34188 0 0.06490387 0
## 34189 0 0.00958675 0
## 34190 1 0.43003777 0
matriz_confusion <- table(comparar$income10, comparar$income10ajustados, dnn = c("income10", "income10ajustados para predicciones"))
matriz_confusion
## income10ajustados para predicciones
## income10 0 1
## 0 24107 1902
## 1 4016 4165
n = nrow(datos.Entrena)
exactidud <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n
exactidud
## [1] 0.8269085
predicciones <- predict(modelo, datos.Validacion, se.fit = TRUE)
head(predicciones$fit)
## 6 15 17 36 41 49
## -5.1235447 -0.4589452 -4.1994476 -1.0094459 2.3586616 -4.8803438
tail(predicciones$fit)
## 48826 48830 48832 48836 48837 48839
## -1.8282163 -0.4364728 -0.5542026 1.6730416 -3.3800099 -0.9586976
predicciones_prob <- exp(predicciones$fit) / (1 + exp(predicciones$fit))
head(predicciones_prob)
## 6 15 17 36 41 49
## 0.005919626 0.387236079 0.014782074 0.267088310 0.913620236 0.007537162
tail(predicciones_prob)
## 48826 48830 48832 48836 48837 48839
## 0.13845090 0.39258175 0.36488993 0.84198092 0.03292608 0.27713903
las.predicciones <- cbind(datos.Validacion, predicciones_prob)
las.predicciones <- las.predicciones %>%
mutate(income10.prediccion = if_else(predicciones_prob > 0.5, 1, 0))
head(las.predicciones)
## age workclass education educational.num marital.status race gender
## 1 34 Private Dropout 6 Not_married White Male
## 2 48 Private HighGrad 9 Married White Male
## 3 20 State-gov Community 10 Not_married White Male
## 4 65 ? HighGrad 9 Married White Male
## 5 65 Private Master 14 Married White Male
## 6 52 Private Dropout 7 Separated Black Female
## hours.per.week income age.scale educational.num.scale hours.per.week.scale
## 1 30 <=50K 0.23287671 0.3333333 0.2959184
## 2 48 >50K 0.42465753 0.5333333 0.4795918
## 3 25 <=50K 0.04109589 0.6000000 0.2448980
## 4 40 <=50K 0.65753425 0.5333333 0.3979592
## 5 50 >50K 0.65753425 0.8666667 0.5000000
## 6 18 <=50K 0.47945205 0.4000000 0.1734694
## income10 predicciones_prob income10.prediccion
## 1 0 0.005919626 0
## 2 1 0.387236079 0
## 3 0 0.014782074 0
## 4 0 0.267088310 0
## 5 1 0.913620236 1
## 6 0 0.007537162 0
tail(las.predicciones)
## age workclass education educational.num marital.status race
## 14647 31 Private Master 14 Separated Other
## 14648 65 Self-emp-not-inc Master 15 Not_married White
## 14649 43 Self-emp-not-inc Community 10 Married White
## 14650 53 Private Master 14 Married White
## 14651 22 Private Community 10 Not_married White
## 14652 40 Private HighGrad 9 Married White
## gender hours.per.week income age.scale educational.num.scale
## 14647 Female 30 <=50K 0.19178082 0.8666667
## 14648 Male 60 <=50K 0.65753425 0.9333333
## 14649 Male 50 <=50K 0.35616438 0.6000000
## 14650 Male 40 >50K 0.49315068 0.8666667
## 14651 Male 40 <=50K 0.06849315 0.6000000
## 14652 Male 40 >50K 0.31506849 0.5333333
## hours.per.week.scale income10 predicciones_prob income10.prediccion
## 14647 0.2959184 0 0.13845090 0
## 14648 0.6020408 0 0.39258175 0
## 14649 0.5000000 0 0.36488993 0
## 14650 0.3979592 1 0.84198092 1
## 14651 0.3979592 0 0.03292608 0
## 14652 0.3979592 1 0.27713903 0
matriz_confusion <- table(las.predicciones$income10, las.predicciones$income10.prediccion, dnn = c("income10", "predicciones"))
matriz_confusion
## predicciones
## income10 0 1
## 0 10357 789
## 1 1762 1744
n = nrow(datos.Validacion)
exactidud <- (matriz_confusion[1,1] + matriz_confusion[2,2]) / n
exactidud
## [1] 0.8258941
edad <- 53; horas <- 50
a.predecir <- data.frame(rbind(c(edad, 'Local-gov', 'HighGrad', 'Married', 'White' , 'Male', horas)))
colnames(a.predecir) <- c('age.scale', 'workclass', 'education', 'marital.status', 'race', 'gender', 'hours.per.week.scale')
edad; horas
## [1] 53
## [1] 50
edad.escalada <- rescale(c(edad, min(datos$age), max(datos$age)))
edad.escalada <- edad.escalada[1]
# Escalando las horas por semana
horas.escalada<- rescale(c(horas, min(datos$hours.per.week), max(datos$hours.per.week)))
horas.escalada <- horas.escalada[1]
a.predecir <- a.predecir %>%
mutate(age.scale = edad.escalada,
hours.per.week.scale = horas.escalada)
a.predecir
## age.scale workclass education marital.status race gender
## 1 0.4931507 Local-gov HighGrad Married White Male
## hours.per.week.scale
## 1 0.5
prediccion <- predict(modelo, a.predecir, se.fit = TRUE)
prediccion
## $fit
## 1
## -0.3608282
##
## $se.fit
## [1] 0.06749201
##
## $residual.scale
## [1] 1
prediccion_prob <- exp(prediccion$fit) / (1 + exp(prediccion$fit))
prediccion_prob
## 1
## 0.4107591
if_else(prediccion_prob > 0.5, 1, 0)
## [1] 0
cat("La probabilidad y la predicción de que una persona con esas características gane >50K es: ", if_else(prediccion_prob > 0.5, 1, 0))
## La probabilidad y la predicción de que una persona con esas características gane >50K es: 0
edad <- c(35,55,65,71,42,60,65,75,35,53)
clase.empleo <- c('HighGrad', 'State-gov','Never-worked','Self-emp-inc', 'Federal-gov','Private', 'Private', 'Federal-gov', 'State-gov', 'Local-gov')
nivel.educacion <- c('HighGrad', 'HighGrad', 'Bachelors', 'HighGrad', 'Bachelors', 'Bachelors', 'Community', 'Community', 'Master', 'PhD')
edo.civil <- c('Married', 'Separated', 'Separated', 'Widow', 'Not_married', 'Married', 'Separated', 'Widow', 'Married', 'Not_married')
raza <- c('White', 'Asian-Pac-Islander', 'Black', 'Other', 'White', 'White', 'Amer-Indian-Eskimo', 'Black', 'White', 'White')
genero <- c('Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male')
horas <- c(45,55,50,52,50,34,40,44,60,53)
a.predecir <- data.frame(rbind(cbind(edad, clase.empleo, nivel.educacion, edo.civil,raza, genero, horas)))
colnames(a.predecir) <- c('age.scale', 'workclass', 'education', 'marital.status', 'race', 'gender', 'hours.per.week.scale')
a.predecir
## age.scale workclass education marital.status race gender
## 1 35 HighGrad HighGrad Married White Male
## 2 55 State-gov HighGrad Separated Asian-Pac-Islander Male
## 3 65 Never-worked Bachelors Separated Black Female
## 4 71 Self-emp-inc HighGrad Widow Other Male
## 5 42 Federal-gov Bachelors Not_married White Female
## 6 60 Private Bachelors Married White Male
## 7 65 Private Community Separated Amer-Indian-Eskimo Female
## 8 75 Federal-gov Community Widow Black Male
## 9 35 State-gov Master Married White Female
## 10 53 Local-gov PhD Not_married White Male
## hours.per.week.scale
## 1 45
## 2 55
## 3 50
## 4 52
## 5 50
## 6 34
## 7 40
## 8 44
## 9 60
## 10 53