require(data.table)
## Loading required package: data.table
## Location of the raw data file (UCI machine-learning repository, "adult"
## data set).
theUrl <- "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
## Drop results left over from a previous run. Only names that actually exist
## are handed to rm(), so running this in a fresh session no longer raises
## "object '...' not found" warnings.
rm(list = intersect(
  c("df_census", "df_census.select", "dt_census", "dt_select"),
  ls()
))
## Decode column labels and Load as data.table
## The raw file carries no header row. Reading it with header = TRUE turned
## the first record into (mangled) column names -- "X39", "State.gov",
## "X77516", ... -- and silently dropped that record from the data. The file
## is therefore read headerless and the labels are assigned explicitly.
df_census <- read.table(file = theUrl, header = FALSE, sep = ",")
## setnames() with a single vector renames every column by position, so the
## vector lists all 15 columns in file order (it errors on a length mismatch).
## NOTE(review): the fifth column is labelled "doctorate", but the data set
## documentation appears to describe it as a numeric education level
## ("education-num") -- the existing label is kept so downstream code keeps
## working; confirm before renaming.
setnames(
  df_census,
  c(
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "doctorate",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "socio-status"
  )
)
## Expose the row names as an explicit `id` column placed in front of the
## existing columns (check.names = FALSE keeps the hyphenated labels intact).
id <- row.names(df_census)
df_census <- data.frame(id = id, df_census, check.names = FALSE)
## Plot the age distribution of the full data set.
hist(df_census$age)
## Keep the records with age <= 65 (rows with a missing age are dropped),
## then convert the selection to a data.table keyed on `id`.
df_census.select <- df_census[which(df_census$age <= 65), ]
dt_census <- data.table(df_census.select)
setkeyv(dt_census, "id")
## Load a result-set data.table holding a subset of interesting columns, filtered on `capital-gain` > 0 so that only records with a capital gain are examined.
## Result set: rows of dt_census with a positive capital gain, projected onto
## the columns of interest.
dt_select <- dt_census[
  `capital-gain` > 0,
  list(
    id,
    age,
    workclass,
    occupation,
    sex,
    education,
    `hours-per-week`,
    `socio-status`
  )
]
## Print the structure of the selection.
str(dt_select)
## Classes 'data.table' and 'data.frame': 2511 obs. of 8 variables:
## $ id : Factor w/ 32560 levels "1","10","100",..: 41 54 92 95 114 128 129 131 137 140 ...
## $ age : int 28 20 54 34 44 29 18 43 50 50 ...
## $ workclass : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 7 5 5 5 5 8 5 3 ...
## $ occupation : Factor w/ 15 levels " ?"," Adm-clerical",..: 13 9 4 11 5 4 13 11 8 11 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 1 2 2 1 2 2 1 ...
## $ education : Factor w/ 16 levels " 10th"," 11th",..: 12 16 12 10 10 2 12 11 12 13 ...
## $ hours-per-week: int 40 25 52 36 60 50 30 64 40 40 ...
## $ socio-status : Factor w/ 2 levels " <=50K"," >50K": 2 1 2 2 2 1 1 2 2 2 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "id"