First, we use the `glimpse` function to get a better understanding of our data. The first sales-record file has shape (1000, 14) and the second has shape (1000000, 14). Both data sets have the same number of columns, and their column names and data types are identical.
# Read adult.csv into a data frame and print a compact per-column overview.
df <- read.csv(file = "adult.csv")
df %>% glimpse()
## Rows: 32,561
## Columns: 15
## $ age <int> 39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 37, 30, 23, 32,…
## $ workclass <chr> " State-gov", " Self-emp-not-inc", " Private", " Privat…
## $ fnlwgt <int> 77516, 83311, 215646, 234721, 338409, 284582, 160187, 2…
## $ education <chr> " Bachelors", " Bachelors", " HS-grad", " 11th", " Bach…
## $ education.num <int> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13, 10, 13, 13, 12, 11,…
## $ marital_status <chr> " Never-married", " Married-civ-spouse", " Divorced", "…
## $ occupation <chr> " Adm-clerical", " Exec-managerial", " Handlers-cleaner…
## $ relationship <chr> " Not-in-family", " Husband", " Not-in-family", " Husba…
## $ race <chr> " White", " White", " White", " Black", " Black", " Whi…
## $ sex <chr> " Male", " Male", " Male", " Male", " Female", " Female…
## $ capital_gain <int> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5178, 0, 0, 0, 0, 0, …
## $ capital_loss <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ hours_per_week <int> 40, 13, 40, 40, 40, 40, 16, 45, 50, 40, 80, 40, 30, 50,…
## $ native_country <chr> " United-States", " United-States", " United-States", "…
## $ salary <chr> " <=50K", " <=50K", " <=50K", " <=50K", " <=50K", " <=5…
# Visualise missing values in df.
vis_miss(df)

# Use 75% of the rows for training; the remaining rows are held out for testing.
smp_size <- floor(0.75 * nrow(df))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(df)), size = smp_size)
df_train <- df[train_ind, ]
# Negative indexing keeps exactly the rows that were not sampled for training.
df_test <- df[-train_ind, ]
head(df_test)
## age workclass fnlwgt education education.num marital_status
## 1 39 State-gov 77516 Bachelors 13 Never-married
## 6 37 Private 284582 Masters 14 Married-civ-spouse
## 8 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse
## 12 30 State-gov 141297 Bachelors 13 Married-civ-spouse
## 14 32 Private 205019 Assoc-acdm 12 Never-married
## 25 59 Private 109015 HS-grad 9 Divorced
## occupation relationship race sex capital_gain
## 1 Adm-clerical Not-in-family White Male 2174
## 6 Exec-managerial Wife White Female 0
## 8 Exec-managerial Husband White Male 0
## 12 Prof-specialty Husband Asian-Pac-Islander Male 0
## 14 Sales Not-in-family Black Male 0
## 25 Tech-support Unmarried White Female 0
## capital_loss hours_per_week native_country salary
## 1 0 40 United-States <=50K
## 6 0 40 United-States <=50K
## 8 0 45 United-States >50K
## 12 0 40 India >50K
## 14 0 50 United-States <=50K
## 25 0 40 United-States <=50K
# Classification tree (at most 5 levels deep) on three categorical predictors.
# method = "class" states the classification intent explicitly instead of
# leaving it to be inferred from the type of the response.
d_tree1 <- rpart(
  salary ~ education + occupation + native_country,
  data = df_train,
  method = "class",
  cp = 0.001,
  maxdepth = 5
)
prp(d_tree1)

# Predict the salary class of every held-out row, keep the predictions next to
# the true labels, and cross-tabulate actual (rows) vs predicted (columns).
pred <- predict(d_tree1, df_test, type = "class")
df_test$salary_pred <- pred
table(df_test$salary, df_test$salary_pred)
##
## <=50K >50K
## <=50K 5880 244
## >50K 1526 491
# Show the actual (rows) vs predicted (columns) salary table once more.
print(table(df_test[["salary"]], df_test[["salary_pred"]]))
##
## <=50K >50K
## <=50K 5880 244
## >50K 1526 491
# Classification tree using every other column of df_train as a predictor.
# method = "class" states the classification intent explicitly instead of
# leaving it to be inferred from the type of the response.
d_tree2 <- rpart(
  salary ~ .,
  data = df_train,
  method = "class",
  cp = 0.001
)
prp(d_tree2)

# Predict on the held-out rows; assigning salary_pred replaces any predictions
# stored in that column by an earlier model.
pred <- predict(d_tree2, df_test, type = "class")
df_test$salary_pred <- pred
table(df_test$salary, df_test$salary_pred)
##
## <=50K >50K
## <=50K 5793 331
## >50K 823 1194
# Show the actual (rows) vs predicted (columns) salary table once more.
print(table(df_test[["salary"]], df_test[["salary_pred"]]))
##
## <=50K >50K
## <=50K 5793 331
## >50K 823 1194
# Turn every character column into a factor in both partitions, then fit a
# random forest that predicts salary from all other columns of df_train.
df_train <- df_train %>% mutate(across(where(is.character), as.factor))
df_test <- df_test %>% mutate(across(where(is.character), as.factor))
rfm <- randomForest(salary ~ ., data = df_train)
rfm
##
## Call:
## randomForest(formula = salary ~ ., data = df_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 13.5%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 17407 1189 0.06393848
## >50K 2108 3716 0.36195055
# Re-encode every factor column of `test` with the levels of the matching
# column in `train`, so both frames share identical level sets and ordering.
# Stops with an explicit message if `test` holds a value never seen in `train`,
# because a model fitted on `train` cannot score such a row.
align_factor_levels <- function(test, train) {
  factor_cols <- names(train)[vapply(train, is.factor, logical(1))]
  for (col in intersect(factor_cols, names(test))) {
    seen <- levels(train[[col]])
    unseen <- setdiff(as.character(test[[col]]), c(seen, NA))
    if (length(unseen) > 0) {
      stop(
        "Column '", col, "' has values absent from the training data: ",
        paste(unseen, collapse = ", "),
        call. = FALSE
      )
    }
    test[[col]] <- factor(test[[col]], levels = seen)
  }
  test
}

# Drop predictions left over from the previous model so they are not passed to
# predict() as if they were a feature.
df_test[["salary_pred"]] <- NULL
# The two partitions were converted to factors independently, so their level
# sets can differ; align them explicitly before predicting.
df_test <- align_factor_levels(df_test, df_train)
pred <- predict(rfm, df_test)
df_test$salary_pred <- pred
table(df_test$salary, df_test$salary_pred)
##
## <=50K >50K
## <=50K 5685 439
## >50K 687 1330
# Print the fitted random forest summary again.
print(rfm)
##
## Call:
## randomForest(formula = salary ~ ., data = df_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 13.5%
## Confusion matrix:
## <=50K >50K class.error
## <=50K 17407 1189 0.06393848
## >50K 2108 3716 0.36195055
# Linear-kernel C-classification SVM predicting salary from all other columns
# of df_train.
svm_m <- svm(
  formula = salary ~ .,
  data = df_train,
  type = "C-classification",
  kernel = "linear"
)
# Print the fitted model summary.
svm_m
##
## Call:
## svm(formula = salary ~ ., data = df_train, type = "C-classification",
## kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 8320
# Re-encode every factor column of `test` with the levels of the matching
# column in `train`, so both frames share identical level sets and ordering.
# Stops with an explicit message if `test` holds a value never seen in `train`,
# because a model fitted on `train` cannot score such a row.
align_factor_levels <- function(test, train) {
  factor_cols <- names(train)[vapply(train, is.factor, logical(1))]
  for (col in intersect(factor_cols, names(test))) {
    seen <- levels(train[[col]])
    unseen <- setdiff(as.character(test[[col]]), c(seen, NA))
    if (length(unseen) > 0) {
      stop(
        "Column '", col, "' has values absent from the training data: ",
        paste(unseen, collapse = ", "),
        call. = FALSE
      )
    }
    test[[col]] <- factor(test[[col]], levels = seen)
  }
  test
}

# Drop predictions left over from the previous model so they are not passed to
# predict() as if they were a feature.
df_test[["salary_pred"]] <- NULL
# Make sure the test factors use the training levels before predicting; this is
# a no-op when the levels already match.
df_test <- align_factor_levels(df_test, df_train)
salary_pred <- predict(svm_m, df_test)
df_test$salary_pred <- salary_pred
table(df_test$salary, df_test$salary_pred)
##
## <=50K >50K
## <=50K 5714 410
## >50K 824 1193
# Fit a second SVM on just two numeric predictors and plot the fitted model
# against the training rows restricted to those columns plus the response.
model <- svm(salary ~ education.num + age, data = df_train)
df_plot <- select(df_train, salary, education.num, age)
plot(model, data = df_plot)