Library packages
library(dplyr)
library(ggplot2)
library(GGally)
library(ROCR)
Data
data_adult <-
read.csv(
"https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/data_adult.csv"
)
write.csv(data_adult, 'data_adult.csv', row.names = FALSE)
glimpse(data_adult)
Step 1: Check continuous variables
continuous <-
select_if(data_adult, is.numeric) #select only the numerical columns
summary(continuous)
##1. Plot the distribution
### Histogram with kernel density curve
ggplot(continuous, aes(x = hours.per.week)) + geom_density(alpha = .2, fill = "#FF6666")
top_one_percent <-
quantile(data_adult$hours.per.week, .99) #Compute the value of the 99 percent of the working time
top_one_percent
data_adult_drop <-
data_adult %>% filter(hours.per.week < top_one_percent)
dim(data_adult_drop)
##2. Standardize the continuous variables
data_adult_rescale <-
data_adult_drop %>% mutate_if(is.numeric, funs(as.numeric(scale(.))))
head(data_adult_rescale)
Step 2: Check factor variables
## Select categorical column
factor <- data.frame(select_if(data_adult_rescale, is.factor))
ncol(factor)
## Create graph for each column
graph <-
lapply(names(factor), function(x)
ggplot(factor, aes(get(x))) + geom_bar() + theme(axis.text.x = element_text(angle = 90)))
### Print the graph
graph
Step 3: Feature engineering
recast_data <-
data_adult_rescale %>% select(-X) %>% mutate(education = factor(
ifelse(
education == "Preschool" |
education == "10th" |
education == "11th" |
education == "12th" |
education == "1st-4th" |
education == "5th-6th" |
education == "7th-8th" |
education == "9th",
"dropout",
ifelse(
education == "HS-grad",
"HighGrad",
ifelse(
education == "Some-college" |
education == "Assoc-acdm" | education == "Assoc-voc",
"Community",
ifelse(
education == "Bachelors",
"Bachelors",
ifelse(
education == "Masters" |
education == "Prof-school",
"Master",
"PhD"
)
)
)
)
)
))
recast_data %>% group_by(education) %>% summarize(average_educ_year = mean(educational.num),
count = n()) %>% arrange(average_educ_year)
## Change level marry
recast_data <- recast_data %>%
mutate(marital.status = factor(
ifelse(
marital.status == "Never-married" |
marital.status == "Married-spouse-absent",
"Not_married",
ifelse(
marital.status == "Married-AF-spouse" |
marital.status == "Married-civ-spouse",
"Married",
ifelse(
marital.status == "Separated" |
marital.status == "Divorced",
"Separated",
"Widow"
)
)
)
))
table(recast_data$marital.status)
Step 4: Summary statistic
## Plot gender income
ggplot(recast_data, aes(x = gender, fill = income)) + geom_bar(position = "fill") + theme_classic()
## Plot origin income
ggplot(recast_data, aes(x = race, fill = income)) + geom_bar(position = "fill") + theme_classic() + theme(axis.text.x = element_text(angle = 90))
## box plot gender working time
ggplot(recast_data, aes(x = gender, y = hours.per.week)) + geom_boxplot() + stat_summary(
fun.y = mean,
geom = "point",
size = 3,
color = "steelblue"
) + theme_classic()
## Plot distribution working time by education
ggplot(recast_data, aes(x = hours.per.week)) + geom_density(aes(color = education), alpha = 0.5) + theme_classic()
anova <- aov(hours.per.week ~ education, recast_data)
summary(anova)
ggplot(recast_data, aes(x = age, y = hours.per.week)) + geom_point(aes(color = income), size = 0.5) + stat_smooth(
method = 'lm',
formula = y ~ poly(x, 2),
se = TRUE,
aes(color = income)
) + theme_classic()
## Convert data to numeric
corr <- data.frame(lapply(recast_data, as.integer)) #Plot the graphggcorr(corr, method <- c("pairwise", "spearman"), nbreaks = 6, hjust = 0.8, label = TRUE, label_size = 3, color = "grey50")
Step 5: Train/test set
set.seed(1234)
create_train_test <-
function(data, size = 0.8, train = TRUE) {
n_row = nrow(data)
total_row = size * n_row
train_sample <- 1:total_row
if (train == TRUE) {
return (data[train_sample, ])
}
else {
return (data[-train_sample, ])
}
}
data_train <- create_train_test(recast_data, 0.8, train = TRUE)
data_test <- create_train_test(recast_data, 0.8, train = FALSE)
dim(data_train)
dim(data_test)
Step 6: Build the model
formula <- income ~ .
logit <- glm(formula, data = data_train, family = 'binomial')
summary(logit)
lapply(logit, class)[1:3]
logit$aic
step 8: Improve the model
formula_2 <- income ~ age:hours.per.week + gender:hours.per.week + .
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$income, predict_2 > 0.5)
precision_2 <- precision(table_mat_2)
recall_2 <- recall(table_mat_2)
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
LS0tCnRpdGxlOiAiR2VuZXJhbGl6ZWQgTGluZWFyIE1vZGVsIChHTE0pIGluIFIiCm91dHB1dDoKICBodG1sX25vdGVib29rOgogICAgdG9jOiB5ZXMKICBodG1sX2RvY3VtZW50OgogICAgZGZfcHJpbnQ6IHBhZ2VkCiAgICB0b2M6IHllcwotLS0KIyBMaWJyYXJ5IHBhY2thZ2VzCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkoR0dhbGx5KQpsaWJyYXJ5KFJPQ1IpCmBgYAoKIyBEYXRhCmBgYHtyfQpkYXRhX2FkdWx0IDwtCglyZWFkLmNzdigKCQkiaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL3Rob21hc3Blcm5ldC9kYXRhX2Nzdl9yL21hc3Rlci9kYXRhL2RhdGFfYWR1bHQuY3N2IgoJKQp3cml0ZS5jc3YoZGF0YV9hZHVsdCwgJ2RhdGFfYWR1bHQuY3N2Jywgcm93Lm5hbWVzID0gRkFMU0UpCmdsaW1wc2UoZGF0YV9hZHVsdCkKYGBgCgojIFN0ZXAgMTogQ2hlY2sgY29udGludW91cyB2YXJpYWJsZXMKYGBge3J9CmNvbnRpbnVvdXMgPC0KCXNlbGVjdF9pZihkYXRhX2FkdWx0LCBpcy5udW1lcmljKSAjc2VsZWN0IG9ubHkgdGhlIG51bWVyaWNhbCBjb2x1bW5zCnN1bW1hcnkoY29udGludW91cykKIyMxLiBQbG90IHRoZSBkaXN0cmlidXRpb24KIyMjIEhpc3RvZ3JhbSB3aXRoIGtlcm5lbCBkZW5zaXR5IGN1cnZlCmdncGxvdChjb250aW51b3VzLCBhZXMoeCA9IGhvdXJzLnBlci53ZWVrKSkgKyBnZW9tX2RlbnNpdHkoYWxwaGEgPSAuMiwgZmlsbCA9ICIjRkY2NjY2IikKdG9wX29uZV9wZXJjZW50IDwtCglxdWFudGlsZShkYXRhX2FkdWx0JGhvdXJzLnBlci53ZWVrLCAuOTkpICNDb21wdXRlIHRoZSB2YWx1ZSBvZiB0aGUgOTkgcGVyY2VudCBvZiB0aGUgd29ya2luZyB0aW1lCnRvcF9vbmVfcGVyY2VudApkYXRhX2FkdWx0X2Ryb3AgPC0KCWRhdGFfYWR1bHQgJT4lIGZpbHRlcihob3Vycy5wZXIud2VlayA8IHRvcF9vbmVfcGVyY2VudCkKZGltKGRhdGFfYWR1bHRfZHJvcCkKIyMyLiBTdGFuZGFyZGl6ZSB0aGUgY29udGludW91cyB2YXJpYWJsZXMKZGF0YV9hZHVsdF9yZXNjYWxlIDwtCglkYXRhX2FkdWx0X2Ryb3AgJT4lIG11dGF0ZV9pZihpcy5udW1lcmljLCBmdW5zKGFzLm51bWVyaWMoc2NhbGUoLikpKSkKaGVhZChkYXRhX2FkdWx0X3Jlc2NhbGUpCmBgYAoKIyBTdGVwIDI6IENoZWNrIGZhY3RvciB2YXJpYWJsZXMKYGBge3J9CiMjIFNlbGVjdCBjYXRlZ29yaWNhbCBjb2x1bW4KZmFjdG9yIDwtIGRhdGEuZnJhbWUoc2VsZWN0X2lmKGRhdGFfYWR1bHRfcmVzY2FsZSwgaXMuZmFjdG9yKSkKbmNvbChmYWN0b3IpCiMjIENyZWF0ZSBncmFwaCBmb3IgZWFjaCBjb2x1bW4KZ3JhcGggPC0KCWxhcHBseShuYW1lcyhmYWN0b3IpLCBmdW5jdGlvbih4KQoJCWdncGxvdChmYWN0b3IsIGFlcyhnZXQoeCkpKSArIGdlb21fYmFyKCkgKyB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDkwKSkpCiMjIyBQcmludCB0aGUgZ3JhcGgKZ3JhcGgKYGBgCgojIFN0ZXAgMzogRmVhdHVyZSBlbmdpbmVlcmluZwpgYGB7cn0KcmVjYXN0X2RhdGEgPC0KCWRhdGFfYWR1bHRfcmVzY2FsZSAlPiUgc2VsZWN0KC1YKSAlPiUgbXV0YXRlKGVkdWNhdGlvbiA9IGZhY3RvcigKCQlpZmVsc2UoCgkJCWVkdWNhdGlvbiA9PSAiUHJlc2Nob29sIiB8CgkJCQllZHVjYXRpb24gPT0gIjEwdGgiIHwKCQkJCWVkdWNhdGlvbiA9PSAiMTF0aCIgfAoJCQkJZWR1Y2F0aW9uID09ICIxMnRoIiB8CgkJCQllZHVjYXRpb24gPT0gIjFzdC00dGgiIHwKCQkJCWVkdWNhdGlvbiA9PSAiNXRoLTZ0aCIgfAoJCQkJZWR1Y2F0aW9uID09ICI3dGgtOHRoIiB8CgkJCQllZHVjYXRpb24gPT0gIjl0aCIsCgkJCSJkcm9wb3V0IiwKCQkJaWZlbHNlKAoJCQkJZWR1Y2F0aW9uID09ICJIUy1ncmFkIiwKCQkJCSJIaWdoR3JhZCIsCgkJCQlpZmVsc2UoCgkJCQkJZWR1Y2F0aW9uID09ICJTb21lLWNvbGxlZ2UiIHwKCQkJCQkJZWR1Y2F0aW9uID09ICJBc3NvYy1hY2RtIiB8IGVkdWNhdGlvbiA9PSAiQXNzb2Mtdm9jIiwKCQkJCQkiQ29tbXVuaXR5IiwKCQkJCQlpZmVsc2UoCgkJCQkJCWVkdWNhdGlvbiA9PSAiQmFjaGVsb3JzIiwKCQkJCQkJIkJhY2hlbG9ycyIsCgkJCQkJCWlmZWxzZSgKCQkJCQkJCWVkdWNhdGlvbiA9PSAiTWFzdGVycyIgfAoJCQkJCQkJCWVkdWNhdGlvbiA9PSAiUHJvZi1zY2hvb2wiLAoJCQkJCQkJIk1hc3RlciIsCgkJCQkJCQkiUGhEIgoJCQkJCQkpCgkJCQkJKQoJCQkJKQoJCQkpCgkJKQoJKSkKcmVjYXN0X2RhdGEgJT4lIGdyb3VwX2J5KGVkdWNhdGlvbikgJT4lIHN1bW1hcml6ZShhdmVyYWdlX2VkdWNfeWVhciA9IG1lYW4oZWR1Y2F0aW9uYWwubnVtKSwKCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCWNvdW50ID0gbigpKSAlPiUgYXJyYW5nZShhdmVyYWdlX2VkdWNfeWVhcikKIyMgQ2hhbmdlIGxldmVsIG1hcnJ5CnJlY2FzdF9kYXRhIDwtIHJlY2FzdF9kYXRhICU+JQoJbXV0YXRlKG1hcml0YWwuc3RhdHVzID0gZmFjdG9yKAoJCWlmZWxzZSgKCQkJbWFyaXRhbC5zdGF0dXMgPT0gIk5ldmVyLW1hcnJpZWQiIHwKCQkJCW1hcml0YWwuc3RhdHVzID09ICJNYXJyaWVkLXNwb3VzZS1hYnNlbnQiLAoJCQkiTm90X21hcnJpZWQiLAoJCQlpZmVsc2UoCgkJCQltYXJpdGFsLnN0YXR1cyA9PSAiTWFycmllZC1BRi1zcG91c2UiIHwKCQkJCQltYXJpdGFsLnN0YXR1cyA9PSAiTWFycmllZC1jaXYtc3BvdXNlIiwKCQkJCSJNYXJyaWVkIiwKCQkJCWlmZWxzZSgKCQkJCQltYXJpdGFsLnN0YXR1cyA9PSAiU2VwYXJhdGVkIiB8CgkJCQkJCW1hcml0YWwuc3RhdHVzID09ICJEaXZvcmNlZCIsCgkJCQkJIlNlcGFyYXRlZCIsCgkJCQkJIldpZG93IgoJCQkJKQoJCQkpCgkJKQoJKSkKdGFibGUocmVjYXN0X2RhdGEkbWFyaXRhbC5zdGF0dXMpCmBgYAoKIyBTdGVwIDQ6IFN1bW1hcnkgc3RhdGlzdGljCmBgYHtyfQojIyBQbG90IGdlbmRlciBpbmNvbWUKZ2dwbG90KHJlY2FzdF9kYXRhLCBhZXMoeCA9IGdlbmRlciwgZmlsbCA9IGluY29tZSkpICsgZ2VvbV9iYXIocG9zaXRpb24gPSAiZmlsbCIpICsgdGhlbWVfY2xhc3NpYygpCiMjIFBsb3Qgb3JpZ2luIGluY29tZQpnZ3Bsb3QocmVjYXN0X2RhdGEsIGFlcyh4ID0gcmFjZSwgZmlsbCA9IGluY29tZSkpICsgZ2VvbV9iYXIocG9zaXRpb24gPSAiZmlsbCIpICsgdGhlbWVfY2xhc3NpYygpICsgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA5MCkpCiMjIGJveCBwbG90IGdlbmRlciB3b3JraW5nIHRpbWUKZ2dwbG90KHJlY2FzdF9kYXRhLCBhZXMoeCA9IGdlbmRlciwgeSA9IGhvdXJzLnBlci53ZWVrKSkgKyBnZW9tX2JveHBsb3QoKSArIHN0YXRfc3VtbWFyeSgKCWZ1bi55ID0gbWVhbiwKCWdlb20gPSAicG9pbnQiLAoJc2l6ZSA9IDMsCgljb2xvciA9ICJzdGVlbGJsdWUiCikgKyB0aGVtZV9jbGFzc2ljKCkKIyMgUGxvdCBkaXN0cmlidXRpb24gd29ya2luZyB0aW1lIGJ5IGVkdWNhdGlvbgpnZ3Bsb3QocmVjYXN0X2RhdGEsIGFlcyh4ID0gaG91cnMucGVyLndlZWspKSArIGdlb21fZGVuc2l0eShhZXMoY29sb3IgPSBlZHVjYXRpb24pLCBhbHBoYSA9IDAuNSkgKyB0aGVtZV9jbGFzc2ljKCkKYW5vdmEgPC0gYW92KGhvdXJzLnBlci53ZWVrIH4gZWR1Y2F0aW9uLCByZWNhc3RfZGF0YSkKc3VtbWFyeShhbm92YSkKZ2dwbG90KHJlY2FzdF9kYXRhLCBhZXMoeCA9IGFnZSwgeSA9IGhvdXJzLnBlci53ZWVrKSkgKyBnZW9tX3BvaW50KGFlcyhjb2xvciA9IGluY29tZSksIHNpemUgPSAwLjUpICsgc3RhdF9zbW9vdGgoCgltZXRob2QgPSAnbG0nLAoJZm9ybXVsYSA9IHkgfiBwb2x5KHgsIDIpLAoJc2UgPSBUUlVFLAoJYWVzKGNvbG9yID0gaW5jb21lKQopICsgdGhlbWVfY2xhc3NpYygpCiMjIENvbnZlcnQgZGF0YSB0byBudW1lcmljCmNvcnIgPC0gZGF0YS5mcmFtZShsYXBwbHkocmVjYXN0X2RhdGEsIGFzLmludGVnZXIpKSAjUGxvdCB0aGUgZ3JhcGhnZ2NvcnIoY29yciwgbWV0aG9kIDwtIGMoInBhaXJ3aXNlIiwgInNwZWFybWFuIiksIG5icmVha3MgPSA2LCBoanVzdCA9IDAuOCwgbGFiZWwgPSBUUlVFLCBsYWJlbF9zaXplID0gMywgY29sb3IgPSAiZ3JleTUwIikKYGBgCgojIFN0ZXAgNTogVHJhaW4vdGVzdCBzZXQKYGBge3J9CnNldC5zZWVkKDEyMzQpCmNyZWF0ZV90cmFpbl90ZXN0IDwtCglmdW5jdGlvbihkYXRhLCBzaXplID0gMC44LCB0cmFpbiA9IFRSVUUpIHsKCQluX3JvdyA9IG5yb3coZGF0YSkKCQl0b3RhbF9yb3cgPSBzaXplICogbl9yb3cKCQl0cmFpbl9zYW1wbGUgPC0gMTp0b3RhbF9yb3cKCQlpZiAodHJhaW4gPT0gVFJVRSkgewoJCQlyZXR1cm4gKGRhdGFbdHJhaW5fc2FtcGxlLCBdKQoJCX0KCQllbHNlIHsKCQkJcmV0dXJuIChkYXRhWy10cmFpbl9zYW1wbGUsIF0pCgkJfQoJfQpkYXRhX3RyYWluIDwtIGNyZWF0ZV90cmFpbl90ZXN0KHJlY2FzdF9kYXRhLCAwLjgsIHRyYWluID0gVFJVRSkKZGF0YV90ZXN0IDwtIGNyZWF0ZV90cmFpbl90ZXN0KHJlY2FzdF9kYXRhLCAwLjgsIHRyYWluID0gRkFMU0UpCmRpbShkYXRhX3RyYWluKQpkaW0oZGF0YV90ZXN0KQpgYGAKCiMgU3RlcCA2OiBCdWlsZCB0aGUgbW9kZWwKYGBge3J9CmZvcm11bGEgPC0gaW5jb21lIH4gLgpsb2dpdCA8LSBnbG0oZm9ybXVsYSwgZGF0YSA9IGRhdGFfdHJhaW4sIGZhbWlseSA9ICdiaW5vbWlhbCcpCnN1bW1hcnkobG9naXQpCmxhcHBseShsb2dpdCwgY2xhc3MpWzE6M10KbG9naXQkYWljCmBgYAoKIyBTdGVwIDc6IEFzc2VzcyB0aGUgcGVyZm9ybWFuY2Ugb2YgdGhlIG1vZGVsCmBgYHtyfQpwcmVkaWN0IDwtIHByZWRpY3QobG9naXQsIGRhdGFfdGVzdCwgdHlwZSA9ICdyZXNwb25zZScpCiMjIGNvbmZ1c2lvbiBtYXRyaXgKdGFibGVfbWF0IDwtIHRhYmxlKGRhdGFfdGVzdCRpbmNvbWUsIHByZWRpY3QgPiAwLjUpCnRhYmxlX21hdAphY2N1cmFjeV9UZXN0IDwtIHN1bShkaWFnKHRhYmxlX21hdCkpIC8gc3VtKHRhYmxlX21hdCkKYWNjdXJhY3lfVGVzdApwcmVjaXNpb24gPC0gZnVuY3Rpb24obWF0cml4KSB7Cgl0cCA8LSBtYXRyaXhbMiwgMl0gI3RydWUgcG9zaXRpdmUKCWZwIDwtIG1hdHJpeFsxLCAyXSAjZmFsc2UgcG9zaXRpdmUKCXJldHVybiAodHAgLyAodHAgKyBmcCkpCn0KcmVjYWxsIDwtIGZ1bmN0aW9uKG1hdHJpeCkgewoJdHAgPC0gbWF0cml4WzIsIDJdICN0cnVlIHBvc2l0aXZlCglmbiA8LSBtYXRyaXhbMiwgMV0gI2ZhbHNlIHBvc2l0aXZlCglyZXR1cm4gKHRwIC8gKHRwICsgZm4pKQp9CnByZWMgPC0gcHJlY2lzaW9uKHRhYmxlX21hdCkKcHJlYwpyZWMgPC0gcmVjYWxsKHRhYmxlX21hdCkKcmVjCmYxIDwtIDIgKiAoKHByZWMgKiByZWMpIC8gKHByZWMgKyByZWMpKQpmMQpST0NScHJlZCA8LSBwcmVkaWN0aW9uKHByZWRpY3QsIGRhdGFfdGVzdCRpbmNvbWUpClJPQ1JwZXJmIDwtIHBlcmZvcm1hbmNlKFJPQ1JwcmVkLCAndHByJywgJ2ZwcicpCnBsb3QoUk9DUnBlcmYsIGNvbG9yaXplID0gVFJVRSwgdGV4dC5hZGogPSBjKC0wLjIsIDEuNykpCmBgYAoKIyBzdGVwIDg6IEltcHJvdmUgdGhlIG1vZGVsCmBgYHtyfQpmb3JtdWxhXzIgPC0gaW5jb21lIH4gYWdlOmhvdXJzLnBlci53ZWVrICsgZ2VuZGVyOmhvdXJzLnBlci53ZWVrICsgLgpsb2dpdF8yIDwtIGdsbShmb3JtdWxhXzIsIGRhdGEgPSBkYXRhX3RyYWluLCBmYW1pbHkgPSAnYmlub21pYWwnKQpwcmVkaWN0XzIgPC0gcHJlZGljdChsb2dpdF8yLCBkYXRhX3Rlc3QsIHR5cGUgPSAncmVzcG9uc2UnKQp0YWJsZV9tYXRfMiA8LSB0YWJsZShkYXRhX3Rlc3QkaW5jb21lLCBwcmVkaWN0XzIgPiAwLjUpCnByZWNpc2lvbl8yIDwtIHByZWNpc2lvbih0YWJsZV9tYXRfMikKcmVjYWxsXzIgPC0gcmVjYWxsKHRhYmxlX21hdF8yKQpmMV8yIDwtIDIgKiAoKHByZWNpc2lvbl8yICogcmVjYWxsXzIpIC8gKHByZWNpc2lvbl8yICsgcmVjYWxsXzIpKQpmMV8yCmBgYAoK