Data Exploration

First we use the glimpse function to get a better understanding of our data. The adult data set has 32,561 rows and 15 columns: six integer columns and nine character columns, the last of which (salary) is the target we want to predict.

# Load the adult data set from adult.csv in the working directory.
df <- read.csv(file = "adult.csv")

# Compact overview: one line per column with its type and first values.
glimpse(x = df)
## Rows: 32,561
## Columns: 15
## $ age            <int> 39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 37, 30, 23, 32,…
## $ workclass      <chr> " State-gov", " Self-emp-not-inc", " Private", " Privat…
## $ fnlwgt         <int> 77516, 83311, 215646, 234721, 338409, 284582, 160187, 2…
## $ education      <chr> " Bachelors", " Bachelors", " HS-grad", " 11th", " Bach…
## $ education.num  <int> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13, 10, 13, 13, 12, 11,…
## $ marital_status <chr> " Never-married", " Married-civ-spouse", " Divorced", "…
## $ occupation     <chr> " Adm-clerical", " Exec-managerial", " Handlers-cleaner…
## $ relationship   <chr> " Not-in-family", " Husband", " Not-in-family", " Husba…
## $ race           <chr> " White", " White", " White", " Black", " Black", " Whi…
## $ sex            <chr> " Male", " Male", " Male", " Male", " Female", " Female…
## $ capital_gain   <int> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5178, 0, 0, 0, 0, 0, …
## $ capital_loss   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hours_per_week <int> 40, 13, 40, 40, 40, 40, 16, 45, 50, 40, 80, 40, 30, 50,…
## $ native_country <chr> " United-States", " United-States", " United-States", "…
## $ salary         <chr> " <=50K", " <=50K", " <=50K", " <=50K", " <=50K", " <=5…
# Plot which cells of df are missing (NA), column by column.
# NOTE(review): the character columns were read with a leading space (see the
# glimpse output), so a missing-value marker stored as text would presumably
# not be counted as NA here -- TODO confirm against the raw file.
vis_miss(df)

Data Split

# Fix the RNG state first so the 75/25 partition below is reproducible.
set.seed(123)

# Three quarters of the rows (rounded down) go to the training set.
smp_size <- floor(nrow(df) * 0.75)
train_ind <- sample.int(nrow(df), size = smp_size)

# Training rows are the sampled indices; every remaining row is held out.
df_train <- df[train_ind, ]
df_test <- df[-train_ind, ]

# Peek at the first six held-out rows.
head(df_test, n = 6)
##    age         workclass fnlwgt   education education.num      marital_status
## 1   39         State-gov  77516   Bachelors            13       Never-married
## 6   37           Private 284582     Masters            14  Married-civ-spouse
## 8   52  Self-emp-not-inc 209642     HS-grad             9  Married-civ-spouse
## 12  30         State-gov 141297   Bachelors            13  Married-civ-spouse
## 14  32           Private 205019  Assoc-acdm            12       Never-married
## 25  59           Private 109015     HS-grad             9            Divorced
##          occupation   relationship                race     sex capital_gain
## 1      Adm-clerical  Not-in-family               White    Male         2174
## 6   Exec-managerial           Wife               White  Female            0
## 8   Exec-managerial        Husband               White    Male            0
## 12   Prof-specialty        Husband  Asian-Pac-Islander    Male            0
## 14            Sales  Not-in-family               Black    Male            0
## 25     Tech-support      Unmarried               White  Female            0
##    capital_loss hours_per_week native_country salary
## 1             0             40  United-States  <=50K
## 6             0             40  United-States  <=50K
## 8             0             45  United-States   >50K
## 12            0             40          India   >50K
## 14            0             50  United-States  <=50K
## 25            0             40  United-States  <=50K
# First classification tree: three character predictors only, depth capped
# at 5 and a small complexity penalty.
d_tree1 <- rpart(
  salary ~ education + occupation + native_country,
  data = df_train,
  cp = 0.001,
  maxdepth = 5
)

# Draw the fitted tree.
prp(d_tree1)

# Predict a class label for every held-out row and store it beside the
# observed salary.
pred <- predict(d_tree1, newdata = df_test, type = "class")
df_test[["salary_pred"]] <- pred

# Confusion matrix: observed salary in rows, predicted salary in columns.
table(df_test[["salary"]], df_test[["salary_pred"]])
##         
##           <=50K  >50K
##    <=50K   5880   244
##    >50K    1526   491
# Re-print of the first tree's confusion matrix (observed salary in rows,
# predicted salary in columns).
table(df_test[["salary"]], df_test[["salary_pred"]])
##         
##           <=50K  >50K
##    <=50K   5880   244
##    >50K    1526   491
# Second classification tree: every other column of df_train is a candidate
# predictor; same complexity penalty, no explicit depth cap.
d_tree2 <- rpart(
  salary ~ .,
  data = df_train,
  cp = 0.001
)

# Draw the fitted tree.
prp(d_tree2)

# Overwrite the stored predictions with those of the second tree.
pred <- predict(d_tree2, newdata = df_test, type = "class")
df_test[["salary_pred"]] <- pred

# Confusion matrix: observed salary in rows, predicted salary in columns.
table(df_test[["salary"]], df_test[["salary_pred"]])
##         
##           <=50K  >50K
##    <=50K   5793   331
##    >50K     823  1194
# Re-print of the second tree's confusion matrix (observed salary in rows,
# predicted salary in columns).
table(df_test[["salary"]], df_test[["salary_pred"]])
##         
##           <=50K  >50K
##    <=50K   5793   331
##    >50K     823  1194
# Turn every character column of the training set into a factor.
df_train <- mutate(df_train, across(where(is.character), as.factor))

# Build the test-set factors from the training set's levels instead of from
# the test rows alone. Converting the two frames independently can give the
# same column different level sets, which a model's predict method may reject.
# A test value that never occurs in training becomes NA.
df_test <- mutate(
  df_test,
  across(
    where(is.character),
    function(x) factor(x, levels = levels(df_train[[cur_column()]]))
  )
)

# Random forest classifier of salary on all other columns, package defaults.
rfm <- randomForest(salary ~ ., data = df_train)

# Print the fit summary (OOB error and training confusion matrix).
rfm
## 
## Call:
##  randomForest(formula = salary ~ ., data = df_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13.5%
## Confusion matrix:
##         <=50K  >50K class.error
##  <=50K  17407  1189  0.06393848
##  >50K    2108  3716  0.36195055
# Remove the earlier prediction column so it is not part of the new data
# handed to the forest.
df_test <- df_test[, names(df_test) != "salary_pred"]

# Prepending one training row and dropping it again leaves the same test rows
# but expands each factor's levels to include the training levels.
df_test <- rbind(df_train[1, ], df_test)[-1, ]

# Predict with the random forest and store the labels beside the truth.
pred <- predict(rfm, newdata = df_test)
df_test[["salary_pred"]] <- pred

# Confusion matrix: observed salary in rows, predicted salary in columns.
table(df_test[["salary"]], df_test[["salary_pred"]])
##         
##           <=50K  >50K
##    <=50K   5685   439
##    >50K     687  1330
# Show the random forest summary again for comparison with the test results.
print(rfm)
## 
## Call:
##  randomForest(formula = salary ~ ., data = df_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13.5%
## Confusion matrix:
##         <=50K  >50K class.error
##  <=50K  17407  1189  0.06393848
##  >50K    2108  3716  0.36195055

Support Vector Model

# Linear-kernel C-classification SVM of salary on all other columns.
# The formula stays the first argument: it selects the formula method.
svm_m <- svm(
  formula = salary ~ .,
  data = df_train,
  type = "C-classification",
  kernel = "linear"
)

# Print the fitted model's parameters and support-vector count.
print(svm_m)
## 
## Call:
## svm(formula = salary ~ ., data = df_train, type = "C-classification", 
##     kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  8320
# Remove the random forest's prediction column before scoring with the SVM.
df_test <- df_test[, names(df_test) != "salary_pred"]

# Prepending one training row and dropping it again leaves the same test rows
# but expands each factor's levels to include the training levels.
df_test <- rbind(df_train[1, ], df_test)[-1, ]

# Predict with the linear SVM and store the labels beside the truth.
salary_pred <- predict(svm_m, newdata = df_test)
df_test[["salary_pred"]] <- salary_pred

# Confusion matrix: observed salary in rows, predicted salary in columns.
table(df_test[["salary"]], df_test[["salary_pred"]])
##         
##           <=50K  >50K
##    <=50K   5714   410
##    >50K     824  1193
# Refit an SVM with package defaults on just two numeric predictors so the
# fitted classifier can be plotted over them.
model <- svm(salary ~ education.num + age, data = df_train)

# Keep only the response and the two predictors used by the model.
df_plot <- select(df_train, salary, education.num, age)

plot(model, data = df_plot)