Library packages

library(dplyr)
library(ggplot2)
library(GGally)
library(ROCR)

Data

data_adult <-
    read.csv(
        "https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/data_adult.csv"
    )
write.csv(data_adult, 'data_adult.csv', row.names = FALSE)
glimpse(data_adult)

Step 1: Check continuous variables

continuous <-
    select_if(data_adult, is.numeric) #select only the numerical columns
summary(continuous)
##1. Plot the distribution
### Histogram with kernel density curve
ggplot(continuous, aes(x = hours.per.week)) + geom_density(alpha = .2, fill = "#FF6666")
top_one_percent <-
    quantile(data_adult$hours.per.week, .99) #Compute the value of the 99 percent of the working time
top_one_percent
data_adult_drop <-
    data_adult %>% filter(hours.per.week < top_one_percent)
dim(data_adult_drop)
##2. Standardize the continuous variables
data_adult_rescale <-
    data_adult_drop %>% mutate_if(is.numeric, funs(as.numeric(scale(.))))
head(data_adult_rescale)

Step 2: Check factor variables

## Select categorical column
factor <- data.frame(select_if(data_adult_rescale, is.factor))
ncol(factor)
## Create graph for each column
graph <-
    lapply(names(factor), function(x)
        ggplot(factor, aes(get(x))) + geom_bar() + theme(axis.text.x = element_text(angle = 90)))
### Print the graph
graph

Step 3: Feature engineering

recast_data <-
    data_adult_rescale %>% select(-X) %>% mutate(education = factor(
        ifelse(
            education == "Preschool" |
                education == "10th" |
                education == "11th" |
                education == "12th" |
                education == "1st-4th" |
                education == "5th-6th" |
                education == "7th-8th" |
                education == "9th",
            "dropout",
            ifelse(
                education == "HS-grad",
                "HighGrad",
                ifelse(
                    education == "Some-college" |
                        education == "Assoc-acdm" | education == "Assoc-voc",
                    "Community",
                    ifelse(
                        education == "Bachelors",
                        "Bachelors",
                        ifelse(
                            education == "Masters" |
                                education == "Prof-school",
                            "Master",
                            "PhD"
                        )
                    )
                )
            )
        )
    ))
recast_data %>% group_by(education) %>% summarize(average_educ_year = mean(educational.num),
                                                                                                    count = n()) %>% arrange(average_educ_year)
## Change level marry
recast_data <- recast_data %>%
    mutate(marital.status = factor(
        ifelse(
            marital.status == "Never-married" |
                marital.status == "Married-spouse-absent",
            "Not_married",
            ifelse(
                marital.status == "Married-AF-spouse" |
                    marital.status == "Married-civ-spouse",
                "Married",
                ifelse(
                    marital.status == "Separated" |
                        marital.status == "Divorced",
                    "Separated",
                    "Widow"
                )
            )
        )
    ))
table(recast_data$marital.status)

Step 4: Summary statistic

## Plot gender income
ggplot(recast_data, aes(x = gender, fill = income)) + geom_bar(position = "fill") + theme_classic()
## Plot origin income
ggplot(recast_data, aes(x = race, fill = income)) + geom_bar(position = "fill") + theme_classic() + theme(axis.text.x = element_text(angle = 90))
## box plot gender working time
ggplot(recast_data, aes(x = gender, y = hours.per.week)) + geom_boxplot() + stat_summary(
    fun.y = mean,
    geom = "point",
    size = 3,
    color = "steelblue"
) + theme_classic()
## Plot distribution working time by education
ggplot(recast_data, aes(x = hours.per.week)) + geom_density(aes(color = education), alpha = 0.5) + theme_classic()
anova <- aov(hours.per.week ~ education, recast_data)
summary(anova)
ggplot(recast_data, aes(x = age, y = hours.per.week)) + geom_point(aes(color = income), size = 0.5) + stat_smooth(
    method = 'lm',
    formula = y ~ poly(x, 2),
    se = TRUE,
    aes(color = income)
) + theme_classic()
## Convert data to numeric
corr <- data.frame(lapply(recast_data, as.integer)) #Plot the graphggcorr(corr, method <- c("pairwise", "spearman"), nbreaks = 6, hjust = 0.8, label = TRUE, label_size = 3, color = "grey50")

Step 5: Train/test set

set.seed(1234)
create_train_test <-
    function(data, size = 0.8, train = TRUE) {
        n_row = nrow(data)
        total_row = size * n_row
        train_sample <- 1:total_row
        if (train == TRUE) {
            return (data[train_sample, ])
        }
        else {
            return (data[-train_sample, ])
        }
    }
data_train <- create_train_test(recast_data, 0.8, train = TRUE)
data_test <- create_train_test(recast_data, 0.8, train = FALSE)
dim(data_train)
dim(data_test)

Step 6: Build the model

formula <- income ~ .
logit <- glm(formula, data = data_train, family = 'binomial')
summary(logit)
lapply(logit, class)[1:3]
logit$aic

Step 7: Assess the performance of the model

predict <- predict(logit, data_test, type = 'response')
## confusion matrix
table_mat <- table(data_test$income, predict > 0.5)
table_mat
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
precision <- function(matrix) {
    tp <- matrix[2, 2] #true positive
    fp <- matrix[1, 2] #false positive
    return (tp / (tp + fp))
}
recall <- function(matrix) {
    tp <- matrix[2, 2] #true positive
    fn <- matrix[2, 1] #false positive
    return (tp / (tp + fn))
}
prec <- precision(table_mat)
prec
rec <- recall(table_mat)
rec
f1 <- 2 * ((prec * rec) / (prec + rec))
f1
ROCRpred <- prediction(predict, data_test$income)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))

step 8: Improve the model

formula_2 <- income ~ age:hours.per.week + gender:hours.per.week + .
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$income, predict_2 > 0.5)
precision_2 <- precision(table_mat_2)
recall_2 <- recall(table_mat_2)
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
LS0tCnRpdGxlOiAiR2VuZXJhbGl6ZWQgTGluZWFyIE1vZGVsIChHTE0pIGluIFIiCm91dHB1dDoKICBodG1sX25vdGVib29rOgogICAgdG9jOiB5ZXMKICBodG1sX2RvY3VtZW50OgogICAgZGZfcHJpbnQ6IHBhZ2VkCiAgICB0b2M6IHllcwotLS0KIyBMaWJyYXJ5IHBhY2thZ2VzCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkoR0dhbGx5KQpsaWJyYXJ5KFJPQ1IpCmBgYAoKIyBEYXRhCmBgYHtyfQpkYXRhX2FkdWx0IDwtCglyZWFkLmNzdigKCQkiaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL3Rob21hc3Blcm5ldC9kYXRhX2Nzdl9yL21hc3Rlci9kYXRhL2RhdGFfYWR1bHQuY3N2IgoJKQp3cml0ZS5jc3YoZGF0YV9hZHVsdCwgJ2RhdGFfYWR1bHQuY3N2Jywgcm93Lm5hbWVzID0gRkFMU0UpCmdsaW1wc2UoZGF0YV9hZHVsdCkKYGBgCgojIFN0ZXAgMTogQ2hlY2sgY29udGludW91cyB2YXJpYWJsZXMKYGBge3J9CmNvbnRpbnVvdXMgPC0KCXNlbGVjdF9pZihkYXRhX2FkdWx0LCBpcy5udW1lcmljKSAjc2VsZWN0IG9ubHkgdGhlIG51bWVyaWNhbCBjb2x1bW5zCnN1bW1hcnkoY29udGludW91cykKIyMxLiBQbG90IHRoZSBkaXN0cmlidXRpb24KIyMjIEhpc3RvZ3JhbSB3aXRoIGtlcm5lbCBkZW5zaXR5IGN1cnZlCmdncGxvdChjb250aW51b3VzLCBhZXMoeCA9IGhvdXJzLnBlci53ZWVrKSkgKyBnZW9tX2RlbnNpdHkoYWxwaGEgPSAuMiwgZmlsbCA9ICIjRkY2NjY2IikKdG9wX29uZV9wZXJjZW50IDwtCglxdWFudGlsZShkYXRhX2FkdWx0JGhvdXJzLnBlci53ZWVrLCAuOTkpICNDb21wdXRlIHRoZSB2YWx1ZSBvZiB0aGUgOTkgcGVyY2VudCBvZiB0aGUgd29ya2luZyB0aW1lCnRvcF9vbmVfcGVyY2VudApkYXRhX2FkdWx0X2Ryb3AgPC0KCWRhdGFfYWR1bHQgJT4lIGZpbHRlcihob3Vycy5wZXIud2VlayA8IHRvcF9vbmVfcGVyY2VudCkKZGltKGRhdGFfYWR1bHRfZHJvcCkKIyMyLiBTdGFuZGFyZGl6ZSB0aGUgY29udGludW91cyB2YXJpYWJsZXMKZGF0YV9hZHVsdF9yZXNjYWxlIDwtCglkYXRhX2FkdWx0X2Ryb3AgJT4lIG11dGF0ZV9pZihpcy5udW1lcmljLCBmdW5zKGFzLm51bWVyaWMoc2NhbGUoLikpKSkKaGVhZChkYXRhX2FkdWx0X3Jlc2NhbGUpCmBgYAoKIyBTdGVwIDI6IENoZWNrIGZhY3RvciB2YXJpYWJsZXMKYGBge3J9CiMjIFNlbGVjdCBjYXRlZ29yaWNhbCBjb2x1bW4KZmFjdG9yIDwtIGRhdGEuZnJhbWUoc2VsZWN0X2lmKGRhdGFfYWR1bHRfcmVzY2FsZSwgaXMuZmFjdG9yKSkKbmNvbChmYWN0b3IpCiMjIENyZWF0ZSBncmFwaCBmb3IgZWFjaCBjb2x1bW4KZ3JhcGggPC0KCWxhcHBseShuYW1lcyhmYWN0b3IpLCBmdW5jdGlvbih4KQoJCWdncGxvdChmYWN0b3IsIGFlcyhnZXQoeCkpKSArIGdlb21fYmFyKCkgKyB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDkwKSkpCiMjIyBQcmludCB0aGUgZ3JhcGgKZ3JhcGgKYGBgCgojIFN0ZXAgMzogRmVhdHVyZSBlbmdpbmVlcmluZwpgYGB7cn0KcmVjYXN0X2RhdGEgPC0KCWRhdGFfYWR1bHRfcmVzY2FsZSAlPiUgc2VsZWN0KC1YKSAlPiUgbXV0YXRlKGVkdWNhdGlvbiA9IGZhY3RvcigKCQlpZmVsc2UoCgkJCWVkdWNhdGlvbiA9PSAiUHJlc2Nob29sIiB8CgkJCQllZHVjYXRpb24gPT0gIjEwdGgiIHwKCQkJCWVkdWNhdGlvbiA9PSAiMTF0aCIgfAoJCQkJZWR1Y2F0aW9uID09ICIxMnRoIiB8CgkJCQllZHVjYXRpb24gPT0gIjFzdC00dGgiIHwKCQkJCWVkdWNhdGlvbiA9PSAiNXRoLTZ0aCIgfAoJCQkJZWR1Y2F0aW9uID09ICI3dGgtOHRoIiB8CgkJCQllZHVjYXRpb24gPT0gIjl0aCIsCgkJCSJkcm9wb3V0IiwKCQkJaWZlbHNlKAoJCQkJZWR1Y2F0aW9uID09ICJIUy1ncmFkIiwKCQkJCSJIaWdoR3JhZCIsCgkJCQlpZmVsc2UoCgkJCQkJZWR1Y2F0aW9uID09ICJTb21lLWNvbGxlZ2UiIHwKCQkJCQkJZWR1Y2F0aW9uID09ICJBc3NvYy1hY2RtIiB8IGVkdWNhdGlvbiA9PSAiQXNzb2Mtdm9jIiwKCQkJCQkiQ29tbXVuaXR5IiwKCQkJCQlpZmVsc2UoCgkJCQkJCWVkdWNhdGlvbiA9PSAiQmFjaGVsb3JzIiwKCQkJCQkJIkJhY2hlbG9ycyIsCgkJCQkJCWlmZWxzZSgKCQkJCQkJCWVkdWNhdGlvbiA9PSAiTWFzdGVycyIgfAoJCQkJCQkJCWVkdWNhdGlvbiA9PSAiUHJvZi1zY2hvb2wiLAoJCQkJCQkJIk1hc3RlciIsCgkJCQkJCQkiUGhEIgoJCQkJCQkpCgkJCQkJKQoJCQkJKQoJCQkpCgkJKQoJKSkKcmVjYXN0X2RhdGEgJT4lIGdyb3VwX2J5KGVkdWNhdGlvbikgJT4lIHN1bW1hcml6ZShhdmVyYWdlX2VkdWNfeWVhciA9IG1lYW4oZWR1Y2F0aW9uYWwubnVtKSwKCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCWNvdW50ID0gbigpKSAlPiUgYXJyYW5nZShhdmVyYWdlX2VkdWNfeWVhcikKIyMgQ2hhbmdlIGxldmVsIG1hcnJ5CnJlY2FzdF9kYXRhIDwtIHJlY2FzdF9kYXRhICU+JQoJbXV0YXRlKG1hcml0YWwuc3RhdHVzID0gZmFjdG9yKAoJCWlmZWxzZSgKCQkJbWFyaXRhbC5zdGF0dXMgPT0gIk5ldmVyLW1hcnJpZWQiIHwKCQkJCW1hcml0YWwuc3RhdHVzID09ICJNYXJyaWVkLXNwb3VzZS1hYnNlbnQiLAoJCQkiTm90X21hcnJpZWQiLAoJCQlpZmVsc2UoCgkJCQltYXJpdGFsLnN0YXR1cyA9PSAiTWFycmllZC1BRi1zcG91c2UiIHwKCQkJCQltYXJpdGFsLnN0YXR1cyA9PSAiTWFycmllZC1jaXYtc3BvdXNlIiwKCQkJCSJNYXJyaWVkIiwKCQkJCWlmZWxzZSgKCQkJCQltYXJpdGFsLnN0YXR1cyA9PSAiU2VwYXJhdGVkIiB8CgkJCQkJCW1hcml0YWwuc3RhdHVzID09ICJEaXZvcmNlZCIsCgkJCQkJIlNlcGFyYXRlZCIsCgkJCQkJIldpZG93IgoJCQkJKQoJCQkpCgkJKQoJKSkKdGFibGUocmVjYXN0X2RhdGEkbWFyaXRhbC5zdGF0dXMpCmBgYAoKIyBTdGVwIDQ6IFN1bW1hcnkgc3RhdGlzdGljCmBgYHtyfQojIyBQbG90IGdlbmRlciBpbmNvbWUKZ2dwbG90KHJlY2FzdF9kYXRhLCBhZXMoeCA9IGdlbmRlciwgZmlsbCA9IGluY29tZSkpICsgZ2VvbV9iYXIocG9zaXRpb24gPSAiZmlsbCIpICsgdGhlbWVfY2xhc3NpYygpCiMjIFBsb3Qgb3JpZ2luIGluY29tZQpnZ3Bsb3QocmVjYXN0X2RhdGEsIGFlcyh4ID0gcmFjZSwgZmlsbCA9IGluY29tZSkpICsgZ2VvbV9iYXIocG9zaXRpb24gPSAiZmlsbCIpICsgdGhlbWVfY2xhc3NpYygpICsgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA5MCkpCiMjIGJveCBwbG90IGdlbmRlciB3b3JraW5nIHRpbWUKZ2dwbG90KHJlY2FzdF9kYXRhLCBhZXMoeCA9IGdlbmRlciwgeSA9IGhvdXJzLnBlci53ZWVrKSkgKyBnZW9tX2JveHBsb3QoKSArIHN0YXRfc3VtbWFyeSgKCWZ1bi55ID0gbWVhbiwKCWdlb20gPSAicG9pbnQiLAoJc2l6ZSA9IDMsCgljb2xvciA9ICJzdGVlbGJsdWUiCikgKyB0aGVtZV9jbGFzc2ljKCkKIyMgUGxvdCBkaXN0cmlidXRpb24gd29ya2luZyB0aW1lIGJ5IGVkdWNhdGlvbgpnZ3Bsb3QocmVjYXN0X2RhdGEsIGFlcyh4ID0gaG91cnMucGVyLndlZWspKSArIGdlb21fZGVuc2l0eShhZXMoY29sb3IgPSBlZHVjYXRpb24pLCBhbHBoYSA9IDAuNSkgKyB0aGVtZV9jbGFzc2ljKCkKYW5vdmEgPC0gYW92KGhvdXJzLnBlci53ZWVrIH4gZWR1Y2F0aW9uLCByZWNhc3RfZGF0YSkKc3VtbWFyeShhbm92YSkKZ2dwbG90KHJlY2FzdF9kYXRhLCBhZXMoeCA9IGFnZSwgeSA9IGhvdXJzLnBlci53ZWVrKSkgKyBnZW9tX3BvaW50KGFlcyhjb2xvciA9IGluY29tZSksIHNpemUgPSAwLjUpICsgc3RhdF9zbW9vdGgoCgltZXRob2QgPSAnbG0nLAoJZm9ybXVsYSA9IHkgfiBwb2x5KHgsIDIpLAoJc2UgPSBUUlVFLAoJYWVzKGNvbG9yID0gaW5jb21lKQopICsgdGhlbWVfY2xhc3NpYygpCiMjIENvbnZlcnQgZGF0YSB0byBudW1lcmljCmNvcnIgPC0gZGF0YS5mcmFtZShsYXBwbHkocmVjYXN0X2RhdGEsIGFzLmludGVnZXIpKSAjUGxvdCB0aGUgZ3JhcGhnZ2NvcnIoY29yciwgbWV0aG9kIDwtIGMoInBhaXJ3aXNlIiwgInNwZWFybWFuIiksIG5icmVha3MgPSA2LCBoanVzdCA9IDAuOCwgbGFiZWwgPSBUUlVFLCBsYWJlbF9zaXplID0gMywgY29sb3IgPSAiZ3JleTUwIikKYGBgCgojIFN0ZXAgNTogVHJhaW4vdGVzdCBzZXQKYGBge3J9CnNldC5zZWVkKDEyMzQpCmNyZWF0ZV90cmFpbl90ZXN0IDwtCglmdW5jdGlvbihkYXRhLCBzaXplID0gMC44LCB0cmFpbiA9IFRSVUUpIHsKCQluX3JvdyA9IG5yb3coZGF0YSkKCQl0b3RhbF9yb3cgPSBzaXplICogbl9yb3cKCQl0cmFpbl9zYW1wbGUgPC0gMTp0b3RhbF9yb3cKCQlpZiAodHJhaW4gPT0gVFJVRSkgewoJCQlyZXR1cm4gKGRhdGFbdHJhaW5fc2FtcGxlLCBdKQoJCX0KCQllbHNlIHsKCQkJcmV0dXJuIChkYXRhWy10cmFpbl9zYW1wbGUsIF0pCgkJfQoJfQpkYXRhX3RyYWluIDwtIGNyZWF0ZV90cmFpbl90ZXN0KHJlY2FzdF9kYXRhLCAwLjgsIHRyYWluID0gVFJVRSkKZGF0YV90ZXN0IDwtIGNyZWF0ZV90cmFpbl90ZXN0KHJlY2FzdF9kYXRhLCAwLjgsIHRyYWluID0gRkFMU0UpCmRpbShkYXRhX3RyYWluKQpkaW0oZGF0YV90ZXN0KQpgYGAKCiMgU3RlcCA2OiBCdWlsZCB0aGUgbW9kZWwKYGBge3J9CmZvcm11bGEgPC0gaW5jb21lIH4gLgpsb2dpdCA8LSBnbG0oZm9ybXVsYSwgZGF0YSA9IGRhdGFfdHJhaW4sIGZhbWlseSA9ICdiaW5vbWlhbCcpCnN1bW1hcnkobG9naXQpCmxhcHBseShsb2dpdCwgY2xhc3MpWzE6M10KbG9naXQkYWljCmBgYAoKIyBTdGVwIDc6IEFzc2VzcyB0aGUgcGVyZm9ybWFuY2Ugb2YgdGhlIG1vZGVsCmBgYHtyfQpwcmVkaWN0IDwtIHByZWRpY3QobG9naXQsIGRhdGFfdGVzdCwgdHlwZSA9ICdyZXNwb25zZScpCiMjIGNvbmZ1c2lvbiBtYXRyaXgKdGFibGVfbWF0IDwtIHRhYmxlKGRhdGFfdGVzdCRpbmNvbWUsIHByZWRpY3QgPiAwLjUpCnRhYmxlX21hdAphY2N1cmFjeV9UZXN0IDwtIHN1bShkaWFnKHRhYmxlX21hdCkpIC8gc3VtKHRhYmxlX21hdCkKYWNjdXJhY3lfVGVzdApwcmVjaXNpb24gPC0gZnVuY3Rpb24obWF0cml4KSB7Cgl0cCA8LSBtYXRyaXhbMiwgMl0gI3RydWUgcG9zaXRpdmUKCWZwIDwtIG1hdHJpeFsxLCAyXSAjZmFsc2UgcG9zaXRpdmUKCXJldHVybiAodHAgLyAodHAgKyBmcCkpCn0KcmVjYWxsIDwtIGZ1bmN0aW9uKG1hdHJpeCkgewoJdHAgPC0gbWF0cml4WzIsIDJdICN0cnVlIHBvc2l0aXZlCglmbiA8LSBtYXRyaXhbMiwgMV0gI2ZhbHNlIHBvc2l0aXZlCglyZXR1cm4gKHRwIC8gKHRwICsgZm4pKQp9CnByZWMgPC0gcHJlY2lzaW9uKHRhYmxlX21hdCkKcHJlYwpyZWMgPC0gcmVjYWxsKHRhYmxlX21hdCkKcmVjCmYxIDwtIDIgKiAoKHByZWMgKiByZWMpIC8gKHByZWMgKyByZWMpKQpmMQpST0NScHJlZCA8LSBwcmVkaWN0aW9uKHByZWRpY3QsIGRhdGFfdGVzdCRpbmNvbWUpClJPQ1JwZXJmIDwtIHBlcmZvcm1hbmNlKFJPQ1JwcmVkLCAndHByJywgJ2ZwcicpCnBsb3QoUk9DUnBlcmYsIGNvbG9yaXplID0gVFJVRSwgdGV4dC5hZGogPSBjKC0wLjIsIDEuNykpCmBgYAoKIyBzdGVwIDg6IEltcHJvdmUgdGhlIG1vZGVsCmBgYHtyfQpmb3JtdWxhXzIgPC0gaW5jb21lIH4gYWdlOmhvdXJzLnBlci53ZWVrICsgZ2VuZGVyOmhvdXJzLnBlci53ZWVrICsgLgpsb2dpdF8yIDwtIGdsbShmb3JtdWxhXzIsIGRhdGEgPSBkYXRhX3RyYWluLCBmYW1pbHkgPSAnYmlub21pYWwnKQpwcmVkaWN0XzIgPC0gcHJlZGljdChsb2dpdF8yLCBkYXRhX3Rlc3QsIHR5cGUgPSAncmVzcG9uc2UnKQp0YWJsZV9tYXRfMiA8LSB0YWJsZShkYXRhX3Rlc3QkaW5jb21lLCBwcmVkaWN0XzIgPiAwLjUpCnByZWNpc2lvbl8yIDwtIHByZWNpc2lvbih0YWJsZV9tYXRfMikKcmVjYWxsXzIgPC0gcmVjYWxsKHRhYmxlX21hdF8yKQpmMV8yIDwtIDIgKiAoKHByZWNpc2lvbl8yICogcmVjYWxsXzIpIC8gKHByZWNpc2lvbl8yICsgcmVjYWxsXzIpKQpmMV8yCmBgYAoK