library(readr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(mlr)
## Loading required package: ParamHelpers
##
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
##
## train
library(data.table)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ROSE)
## Loaded ROSE 0.0-3
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
##
## slice
train <- fread("~/Dropbox/rmachinelearn/census/train.csv", na.strings = c(""," ","?","NA",NA))
test <- fread("~/Dropbox/rmachinelearn/census/test.csv", na.strings = c(""," ","?","NA",NA))
dim(train)
## [1] 199523 41
str(train)
## Classes 'data.table' and 'data.frame': 199523 obs. of 41 variables:
## $ age : int 73 58 18 9 10 48 42 28 47 34 ...
## $ class_of_worker : chr "Not in universe" "Self-employed-not incorporated" "Not in universe" "Not in universe" ...
## $ industry_code : int 0 4 0 0 0 40 34 4 43 4 ...
## $ occupation_code : int 0 34 0 0 0 10 3 40 26 37 ...
## $ education : chr "High school graduate" "Some college but no degree" "10th grade" "Children" ...
## $ wage_per_hour : int 0 0 0 0 0 1200 0 0 876 0 ...
## $ enrolled_in_edu_inst_lastwk : chr "Not in universe" "Not in universe" "High school" "Not in universe" ...
## $ marital_status : chr "Widowed" "Divorced" "Never married" "Never married" ...
## $ major_industry_code : chr "Not in universe or children" "Construction" "Not in universe or children" "Not in universe or children" ...
## $ major_occupation_code : chr "Not in universe" "Precision production craft & repair" "Not in universe" "Not in universe" ...
## $ race : chr "White" "White" "Asian or Pacific Islander" "White" ...
## $ hispanic_origin : chr "All other" "All other" "All other" "All other" ...
## $ sex : chr "Female" "Male" "Female" "Female" ...
## $ member_of_labor_union : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ reason_for_unemployment : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ full_parttime_employment_stat : chr "Not in labor force" "Children or Armed Forces" "Not in labor force" "Children or Armed Forces" ...
## $ capital_gains : int 0 0 0 0 0 0 5178 0 0 0 ...
## $ capital_losses : int 0 0 0 0 0 0 0 0 0 0 ...
## $ dividend_from_Stocks : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tax_filer_status : chr "Nonfiler" "Head of household" "Nonfiler" "Nonfiler" ...
## $ region_of_previous_residence : chr "Not in universe" "South" "Not in universe" "Not in universe" ...
## $ state_of_previous_residence : chr "Not in universe" "Arkansas" "Not in universe" "Not in universe" ...
## $ d_household_family_stat : chr "Other Rel 18+ ever marr not in subfamily" "Householder" "Child 18+ never marr Not in a subfamily" "Child <18 never marr not in subfamily" ...
## $ d_household_summary : chr "Other relative of householder" "Householder" "Child 18 or older" "Child under 18 never married" ...
## $ migration_msa : chr NA "MSA to MSA" NA "Nonmover" ...
## $ migration_reg : chr NA "Same county" NA "Nonmover" ...
## $ migration_within_reg : chr NA "Same county" NA "Nonmover" ...
## $ live_1_year_ago : chr "Not in universe under 1 year old" "No" "Not in universe under 1 year old" "Yes" ...
## $ migration_sunbelt : chr NA "Yes" NA "Not in universe" ...
## $ num_person_Worked_employer : int 0 1 0 0 0 1 6 4 5 6 ...
## $ family_members_under_18 : chr "Not in universe" "Not in universe" "Not in universe" "Both parents present" ...
## $ country_father : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ country_mother : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ country_self : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ citizenship : chr "Native- Born in the United States" "Native- Born in the United States" "Foreign born- Not a citizen of U S" "Native- Born in the United States" ...
## $ business_or_self_employed : int 0 0 0 0 0 2 0 0 0 0 ...
## $ fill_questionnaire_veteran_admin: chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ veterans_benefits : int 2 2 2 0 0 2 2 2 2 2 ...
## $ weeks_worked_in_year : int 0 52 0 0 0 52 52 30 52 52 ...
## $ year : int 95 94 95 94 94 95 94 95 95 94 ...
## $ income_level : chr "-50000" "-50000" "-50000" "-50000" ...
## - attr(*, ".internal.selfref")=<externalptr>
The train set has 199,523 rows and 41 columns.
dim(test)
## [1] 99762 41
str(test)
## Classes 'data.table' and 'data.frame': 99762 obs. of 41 variables:
## $ age : int 38 44 2 35 49 13 1 61 38 7 ...
## $ class_of_worker : chr "Private" "Self-employed-not incorporated" "Not in universe" "Private" ...
## $ industry_code : int 6 37 0 29 4 0 0 0 45 0 ...
## $ occupation_code : int 36 12 0 3 34 0 0 0 12 0 ...
## $ education : chr "1st 2nd 3rd or 4th grade" "Associates degree-occup /vocational" "Children" "High school graduate" ...
## $ wage_per_hour : int 0 0 0 0 0 0 0 0 0 0 ...
## $ enrolled_in_edu_inst_lastwk : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ marital_status : chr "Married-civilian spouse present" "Married-civilian spouse present" "Never married" "Divorced" ...
## $ major_industry_code : chr "Manufacturing-durable goods" "Business and repair services" "Not in universe or children" "Transportation" ...
## $ major_occupation_code : chr "Machine operators assmblrs & inspctrs" "Professional specialty" "Not in universe" "Executive admin and managerial" ...
## $ race : chr "White" "White" "White" "White" ...
## $ hispanic_origin : chr "Mexican (Mexicano)" "All other" "Mexican-American" "All other" ...
## $ sex : chr "Female" "Female" "Male" "Female" ...
## $ member_of_labor_union : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ reason_for_unemployment : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ full_parttime_employment_stat : chr "Full-time schedules" "PT for econ reasons usually PT" "Children or Armed Forces" "Children or Armed Forces" ...
## $ capital_gains : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_losses : int 0 0 0 0 0 0 0 0 0 0 ...
## $ dividend_from_Stocks : int 0 2500 0 0 0 0 0 0 0 0 ...
## $ tax_filer_status : chr "Joint one under 65 & one 65+" "Joint both under 65" "Nonfiler" "Head of household" ...
## $ region_of_previous_residence : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ state_of_previous_residence : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ d_household_family_stat : chr "Spouse of householder" "Spouse of householder" "Child <18 never marr not in subfamily" "Householder" ...
## $ d_household_summary : chr "Spouse of householder" "Spouse of householder" "Child under 18 never married" "Householder" ...
## $ migration_msa : chr NA NA NA "Nonmover" ...
## $ migration_reg : chr NA NA NA "Nonmover" ...
## $ migration_within_reg : chr NA NA NA "Nonmover" ...
## $ live_1_year_ago : chr "Not in universe under 1 year old" "Not in universe under 1 year old" "Not in universe under 1 year old" "Yes" ...
## $ migration_sunbelt : chr NA NA NA "Not in universe" ...
## $ num_person_Worked_employer : int 4 1 0 5 4 0 0 0 1 0 ...
## $ family_members_under_18 : chr "Not in universe" "Not in universe" "Both parents present" "Not in universe" ...
## $ country_father : chr "Mexico" "United-States" "United-States" "United-States" ...
## $ country_mother : chr "Mexico" "United-States" "United-States" "United-States" ...
## $ country_self : chr "Mexico" "United-States" "United-States" "United-States" ...
## $ citizenship : chr "Foreign born- Not a citizen of U S" "Native- Born in the United States" "Native- Born in the United States" "Native- Born in the United States" ...
## $ business_or_self_employed : int 0 0 0 2 0 0 0 0 0 0 ...
## $ fill_questionnaire_veteran_admin: chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ veterans_benefits : int 2 2 0 2 2 0 0 2 2 0 ...
## $ weeks_worked_in_year : int 12 26 0 52 50 0 0 0 52 0 ...
## $ year : int 95 95 95 94 95 94 94 95 94 94 ...
## $ income_level : chr "-50000" "-50000" "-50000" "-50000" ...
## - attr(*, ".internal.selfref")=<externalptr>
The test set has 99,762 rows and 41 columns.
unique(train$income_level)
## [1] "-50000" "+50000"
unique(test$income_level)
## [1] "-50000" "50000+."
# Recode the target variable as 0/1; ifelse(test, yes, no)
train[, income_level := ifelse(income_level == "-50000",0,1)]
test[, income_level := ifelse(income_level == "-50000",0,1)]
round(prop.table(table(train$income_level))*100)
##
## 0 1
## 94 6
The result shows that the class distribution in the training set is highly imbalanced: 94% of the records have income below 50,000 and only 6% are above it (the wealthy are, after all, a minority). This is a classic imbalanced dataset, and such a large gap between the classes makes accuracy a misleading measure of model quality. For example, with 98 positive and 2 negative examples, a model that always predicts the positive class trivially reaches 98% accuracy, which is clearly not a useful model. The central challenge here is therefore how to learn a model that predicts income level accurately from such imbalanced data.
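A tiny sketch with toy vectors (not the census data) makes the point concrete: the always-positive model scores 98% accuracy yet never detects a single negative case.
# Toy illustration of why accuracy misleads on imbalanced classes.
y     <- factor(c(rep("pos", 98), rep("neg", 2)), levels = c("pos", "neg"))
y_hat <- factor(rep("pos", 100), levels = c("pos", "neg"))   # always predict "pos"
mean(y_hat == y)                                    # accuracy: 0.98
sum(y_hat == "neg" & y == "neg") / sum(y == "neg")  # recall on the rare class: 0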
First, the dataset description page tells us that the variables fall into two types, nominal and continuous, i.e. categorical and numeric. The first preprocessing step is to separate these two kinds of columns, which data.table lets us do quickly and simply.
# The dataset description page already tells us which features are nominal and which are continuous
factcols <- c(2:5,7,8:16,20:29,31:38,40,41)
numcols <- setdiff(1:40,factcols)
# How lapply works with .SD in data.table:
# DT[, .SD, .SDcols = x:y]  # .SDcols selects the columns from x to y as a subset, and .SD then refers to that subset
train[,(factcols) := lapply(.SD, factor), .SDcols = factcols][,(numcols) := lapply(.SD, as.numeric), .SDcols = numcols]
test[,(factcols) := lapply(.SD, factor), .SDcols = factcols][,(numcols) := lapply(.SD, as.numeric), .SDcols = numcols]
# Split the categorical and numeric variables out of the training and test sets
cat_train <- train[,factcols, with=FALSE]
cat_test <- test[,factcols,with=FALSE]
num_train <- train[,numcols,with=FALSE]
num_test <- test[,numcols,with=FALSE]
rm(train,test)
Staring at the raw tables gives little intuition; as the saying goes, a picture is worth a thousand words, and plots are the simplest, most direct way to see the data. Below we use two powerful plotting packages, ggplot2 and plotly.
# geom_histogram() draws a histogram
# geom_density() overlays a density curve
# aes() maps the plotted variable to the x axis and the density to the y axis
plot_dist <- function(a){
  p <- ggplot(data = num_train, aes(x = a, y = ..density..)) +
    geom_histogram(fill = "green", color = "white", alpha = 0.5, bins = 100) +
    geom_density()
  ggplotly(p)
}
plot_dist(num_train$age)
## Warning: We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
plot_dist(num_train$wage_per_hour)
## Warning: We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
Both distributions match common sense: ages range from 0 to 90, and the older the age, the smaller its share of the population.
Next we look at how the numeric and the categorical variables in the training set relate to income_level, starting with the numeric ones. num_train contains wage_per_hour, capital_gains, capital_losses, dividend_from_Stocks and a few others; we picked the four indicators most strongly related to income. The plots show that most people with income_level 1 (above 50,000) are between 25 and 65 years old, with hourly wages roughly between 1,000 and 4,000 dollars. This further supports the assumption that people under 20 earn less than 50,000.
#income_level is categorical, so it was split off into cat_train
# := adds a column to a data.table by reference
num_train[,income_level := cat_train$income_level]
ggplot(data = num_train, aes(x = age, y = wage_per_hour)) +
  geom_point(aes(colour = income_level)) +
  scale_y_continuous("wage per hour", breaks = seq(0, 10000, 1000))
Stock dividends also have a sizeable effect on income: people whose income is above 50,000 mostly have stock dividends above 30,000 dollars.
ggplot(data=num_train,aes(x = age,y=dividend_from_Stocks))+geom_point(aes(colour=income_level))+
scale_y_continuous("dividend from stocks", breaks = seq(0,10000,5000))
ggplot(data = num_train, aes(x = weeks_worked_in_year, y = wage_per_hour)) +
geom_point(aes(colour = income_level)) +
scale_y_continuous("wage per hour", breaks = seq(0,10000,1000))
The categorical variables can be visualised as well; for this kind of data a dodged bar chart conveys more information than a plain bar chart. The dodged bars reveal several interesting patterns: bachelor's degree holders account for the largest number of people earning over 50,000, PhD holders are split roughly evenly between earning above and below 50,000, and far more whites earn over 50,000 than any other race.
plot_dodgedbar <- function(a){
ggplot(cat_train, aes(x = a, fill = income_level)) +
geom_bar(position = 'dodge', color = 'black') +
scale_fill_brewer(palette = 'Pastel1') +
theme(axis.text.x = element_text(angle = 60, hjust = 1, size = 10))
}
plot_dodgedbar(cat_train$class_of_worker)
plot_dodgedbar(cat_train$education)
plot_dodgedbar(cat_train$enrolled_in_edu_inst_lastwk)
plot_dodgedbar(cat_train$marital_status)
plot_dodgedbar(cat_train$major_industry_code)
plot_dodgedbar(cat_train$major_occupation_code)
prop.table(table(cat_train$class_of_worker, cat_train$income_level),1)
##
## 0 1
## Federal government 0.795897436 0.204102564
## Local government 0.891187050 0.108812950
## Never worked 0.995444191 0.004555809
## Not in universe 0.990982094 0.009017906
## Private 0.898345088 0.101654912
## Self-employed-incorporated 0.652679939 0.347320061
## Self-employed-not incorporated 0.870929544 0.129070456
## State government 0.885261415 0.114738585
## Without pay 0.993939394 0.006060606
prop.table(table(cat_train$marital_status, cat_train$income_level),1)
##
## 0 1
## Divorced 0.91612903 0.08387097
## Married-A F spouse present 0.97744361 0.02255639
## Married-civilian spouse present 0.88601553 0.11398447
## Married-spouse absent 0.93675889 0.06324111
## Never married 0.98708447 0.01291553
## Separated 0.95433526 0.04566474
## Widowed 0.96846029 0.03153971
Data cleaning is an important step in any analysis. First, check whether the training and test sets contain missing values.
table(is.na(num_train))
##
## FALSE
## 1596184
table(is.na(num_test))
##
## FALSE
## 698334
num_train[, income_level := NULL]
The FALSE counts are 1596184 = 199523 × 8 and 698334 = 99762 × 7, so there is not a single missing value in either numeric subset — good news!
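As a quick check of that arithmetic (num_train still carried the appended income_level column when is.na() was called above, hence eight columns; num_test has seven):
# The FALSE counts equal rows × columns, i.e. no NA anywhere in the numeric data.
199523 * 8   # 1596184
99762 * 7    # 698334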
correlatedvars <- findCorrelation(x = cor(num_train), cutoff = 0.7)
correlatedvars
## [1] 7
num_train <-num_train[, -correlatedvars, with = FALSE]
num_test <- num_test[, -correlatedvars, with = FALSE]
The screening shows that weeks_worked_in_year is highly correlated with the other variables. That is easy to understand: the more weeks a person works in a year, the higher the corresponding wages and returns. We therefore drop this highly correlated variable, leaving six numeric variables in num_train.
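A small sanity check can be run right after the removal (a sketch): six numeric columns should remain, and no remaining pair should exceed the 0.7 cutoff.
# Sketch: verify the result of the findCorrelation-based pruning.
ncol(num_train)              # expected: 6
cmat <- abs(cor(num_train))
max(cmat[upper.tri(cmat)])   # expected to be below 0.7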
missingvaluesprop_train <- sapply(cat_train, function(x){sum(is.na(x))/length(x)})*100
missingvaluesprop_train
## class_of_worker industry_code
## 0.0000000 0.0000000
## occupation_code education
## 0.0000000 0.0000000
## enrolled_in_edu_inst_lastwk marital_status
## 0.0000000 0.0000000
## major_industry_code major_occupation_code
## 0.0000000 0.0000000
## race hispanic_origin
## 0.0000000 0.4380447
## sex member_of_labor_union
## 0.0000000 0.0000000
## reason_for_unemployment full_parttime_employment_stat
## 0.0000000 0.0000000
## tax_filer_status region_of_previous_residence
## 0.0000000 0.0000000
## state_of_previous_residence d_household_family_stat
## 0.3548463 0.0000000
## d_household_summary migration_msa
## 0.0000000 49.9671717
## migration_reg migration_within_reg
## 49.9671717 49.9671717
## live_1_year_ago migration_sunbelt
## 0.0000000 49.9671717
## family_members_under_18 country_father
## 0.0000000 3.3645244
## country_mother country_self
## 3.0668144 1.7005558
## citizenship business_or_self_employed
## 0.0000000 0.0000000
## fill_questionnaire_veteran_admin veterans_benefits
## 0.0000000 0.0000000
## year income_level
## 0.0000000 0.0000000
missingvaluesprop_test <- sapply(cat_test, function(x){sum(is.na(x))/length(x)})*100
missingvaluesprop_test
## class_of_worker industry_code
## 0.0000000 0.0000000
## occupation_code education
## 0.0000000 0.0000000
## enrolled_in_edu_inst_lastwk marital_status
## 0.0000000 0.0000000
## major_industry_code major_occupation_code
## 0.0000000 0.0000000
## race hispanic_origin
## 0.0000000 0.4059662
## sex member_of_labor_union
## 0.0000000 0.0000000
## reason_for_unemployment full_parttime_employment_stat
## 0.0000000 0.0000000
## tax_filer_status region_of_previous_residence
## 0.0000000 0.0000000
## state_of_previous_residence d_household_family_stat
## 0.3307873 0.0000000
## d_household_summary migration_msa
## 0.0000000 50.0651551
## migration_reg migration_within_reg
## 50.0651551 50.0651551
## live_1_year_ago migration_sunbelt
## 0.0000000 50.0651551
## family_members_under_18 country_father
## 0.0000000 3.4371805
## country_mother country_self
## 3.0793288 1.7682083
## citizenship business_or_self_employed
## 0.0000000 0.0000000
## fill_questionnaire_veteran_admin veterans_benefits
## 0.0000000 0.0000000
## year income_level
## 0.0000000 0.0000000
Most columns look fine, but a few are missing more than 50% of their values (likely a consequence of how hard the data are to collect, especially in a census). We keep only the columns whose missing rate is below 5% and drop the heavily missing ones.
cat_train <- subset(cat_train, select = missingvaluesprop_train < 5)
cat_test <- subset(cat_test, select = missingvaluesprop_test < 5)
Set NA values to "Unavailable" in the training data.
#convert to characters
cat_train <- cat_train[,names(cat_train) := lapply(.SD, as.character),.SDcols = names(cat_train)]
for (i in seq_along(cat_train)){
set(cat_train, i=which(is.na(cat_train[[i]])), j=i, value="Unavailable")
}
str(cat_train)
## Classes 'data.table' and 'data.frame': 199523 obs. of 30 variables:
## $ class_of_worker : chr "Not in universe" "Self-employed-not incorporated" "Not in universe" "Not in universe" ...
## $ industry_code : chr "0" "4" "0" "0" ...
## $ occupation_code : chr "0" "34" "0" "0" ...
## $ education : chr "High school graduate" "Some college but no degree" "10th grade" "Children" ...
## $ enrolled_in_edu_inst_lastwk : chr "Not in universe" "Not in universe" "High school" "Not in universe" ...
## $ marital_status : chr "Widowed" "Divorced" "Never married" "Never married" ...
## $ major_industry_code : chr "Not in universe or children" "Construction" "Not in universe or children" "Not in universe or children" ...
## $ major_occupation_code : chr "Not in universe" "Precision production craft & repair" "Not in universe" "Not in universe" ...
## $ race : chr "White" "White" "Asian or Pacific Islander" "White" ...
## $ hispanic_origin : chr "All other" "All other" "All other" "All other" ...
## $ sex : chr "Female" "Male" "Female" "Female" ...
## $ member_of_labor_union : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ reason_for_unemployment : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ full_parttime_employment_stat : chr "Not in labor force" "Children or Armed Forces" "Not in labor force" "Children or Armed Forces" ...
## $ tax_filer_status : chr "Nonfiler" "Head of household" "Nonfiler" "Nonfiler" ...
## $ region_of_previous_residence : chr "Not in universe" "South" "Not in universe" "Not in universe" ...
## $ state_of_previous_residence : chr "Not in universe" "Arkansas" "Not in universe" "Not in universe" ...
## $ d_household_family_stat : chr "Other Rel 18+ ever marr not in subfamily" "Householder" "Child 18+ never marr Not in a subfamily" "Child <18 never marr not in subfamily" ...
## $ d_household_summary : chr "Other relative of householder" "Householder" "Child 18 or older" "Child under 18 never married" ...
## $ live_1_year_ago : chr "Not in universe under 1 year old" "No" "Not in universe under 1 year old" "Yes" ...
## $ family_members_under_18 : chr "Not in universe" "Not in universe" "Not in universe" "Both parents present" ...
## $ country_father : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ country_mother : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ country_self : chr "United-States" "United-States" "Vietnam" "United-States" ...
## $ citizenship : chr "Native- Born in the United States" "Native- Born in the United States" "Foreign born- Not a citizen of U S" "Native- Born in the United States" ...
## $ business_or_self_employed : chr "0" "0" "0" "0" ...
## $ fill_questionnaire_veteran_admin: chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ veterans_benefits : chr "2" "2" "2" "0" ...
## $ year : chr "95" "94" "95" "94" ...
## $ income_level : chr "0" "0" "0" "0" ...
## - attr(*, ".internal.selfref")=<externalptr>
#convert back to factors
cat_train <- cat_train[, names(cat_train) := lapply(.SD,factor), .SDcols = names(cat_train)]
str(cat_train)
## Classes 'data.table' and 'data.frame': 199523 obs. of 30 variables:
## $ class_of_worker : Factor w/ 9 levels "Federal government",..: 4 7 4 4 4 5 5 5 2 5 ...
## $ industry_code : Factor w/ 52 levels "0","1","10","11",..: 1 35 1 1 1 36 29 35 39 35 ...
## $ occupation_code : Factor w/ 47 levels "0","1","10","11",..: 1 29 1 1 1 3 24 36 20 32 ...
## $ education : Factor w/ 17 levels "10th grade","11th grade",..: 13 17 1 11 11 17 10 13 17 17 ...
## $ enrolled_in_edu_inst_lastwk : Factor w/ 3 levels "College or university",..: 3 3 2 3 3 3 3 3 3 3 ...
## $ marital_status : Factor w/ 7 levels "Divorced","Married-A F spouse present",..: 7 1 5 5 5 3 3 5 3 3 ...
## $ major_industry_code : Factor w/ 24 levels "Agriculture",..: 15 5 15 15 15 7 8 5 6 5 ...
## $ major_occupation_code : Factor w/ 15 levels "Adm support including clerical",..: 7 9 7 7 7 11 3 5 1 6 ...
## $ race : Factor w/ 5 levels "Amer Indian Aleut or Eskimo",..: 5 5 2 5 5 1 5 5 5 5 ...
## $ hispanic_origin : Factor w/ 10 levels "All other","Central or South American",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 2 1 1 2 ...
## $ member_of_labor_union : Factor w/ 3 levels "No","Not in universe",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ reason_for_unemployment : Factor w/ 6 levels "Job leaver","Job loser - on layoff",..: 4 4 4 4 4 4 4 2 4 4 ...
## $ full_parttime_employment_stat : Factor w/ 8 levels "Children or Armed Forces",..: 3 1 3 1 1 2 1 7 2 1 ...
## $ tax_filer_status : Factor w/ 6 levels "Head of household",..: 5 1 5 5 5 3 3 6 3 3 ...
## $ region_of_previous_residence : Factor w/ 6 levels "Abroad","Midwest",..: 4 5 4 4 4 4 4 4 4 4 ...
## $ state_of_previous_residence : Factor w/ 51 levels "Abroad","Alabama",..: 36 5 36 36 36 36 36 36 36 36 ...
## $ d_household_family_stat : Factor w/ 38 levels "Child 18+ ever marr Not in a subfamily",..: 25 21 5 6 6 37 21 36 37 21 ...
## $ d_household_summary : Factor w/ 8 levels "Child 18 or older",..: 7 5 1 3 3 8 5 6 8 5 ...
## $ live_1_year_ago : Factor w/ 3 levels "No","Not in universe under 1 year old",..: 2 1 2 3 3 2 3 2 2 3 ...
## $ family_members_under_18 : Factor w/ 5 levels "Both parents present",..: 5 5 5 1 1 5 5 5 5 5 ...
## $ country_father : Factor w/ 43 levels "Cambodia","Canada",..: 41 41 42 41 41 31 41 41 41 41 ...
## $ country_mother : Factor w/ 43 levels "Cambodia","Canada",..: 41 41 42 41 41 41 41 41 41 41 ...
## $ country_self : Factor w/ 43 levels "Cambodia","Canada",..: 41 41 42 41 41 41 41 41 41 41 ...
## $ citizenship : Factor w/ 5 levels "Foreign born- Not a citizen of U S",..: 5 5 1 5 5 5 5 5 5 5 ...
## $ business_or_self_employed : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 3 1 1 1 1 ...
## $ fill_questionnaire_veteran_admin: Factor w/ 3 levels "No","Not in universe",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ veterans_benefits : Factor w/ 3 levels "0","1","2": 3 3 3 1 1 3 3 3 3 3 ...
## $ year : Factor w/ 2 levels "94","95": 2 1 2 1 1 2 1 2 2 1 ...
## $ income_level : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
Set NA values to "Unavailable" in the test data.
cat_test <- cat_test[, (names(cat_test)) := lapply(.SD, as.character), .SDcols = names(cat_test)]
for (i in seq_along(cat_test)){
set(cat_test, i=which(is.na(cat_test[[i]])), j=i, value="Unavailable")
}
str(cat_test)
## Classes 'data.table' and 'data.frame': 99762 obs. of 30 variables:
## $ class_of_worker : chr "Private" "Self-employed-not incorporated" "Not in universe" "Private" ...
## $ industry_code : chr "6" "37" "0" "29" ...
## $ occupation_code : chr "36" "12" "0" "3" ...
## $ education : chr "1st 2nd 3rd or 4th grade" "Associates degree-occup /vocational" "Children" "High school graduate" ...
## $ enrolled_in_edu_inst_lastwk : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ marital_status : chr "Married-civilian spouse present" "Married-civilian spouse present" "Never married" "Divorced" ...
## $ major_industry_code : chr "Manufacturing-durable goods" "Business and repair services" "Not in universe or children" "Transportation" ...
## $ major_occupation_code : chr "Machine operators assmblrs & inspctrs" "Professional specialty" "Not in universe" "Executive admin and managerial" ...
## $ race : chr "White" "White" "White" "White" ...
## $ hispanic_origin : chr "Mexican (Mexicano)" "All other" "Mexican-American" "All other" ...
## $ sex : chr "Female" "Female" "Male" "Female" ...
## $ member_of_labor_union : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ reason_for_unemployment : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ full_parttime_employment_stat : chr "Full-time schedules" "PT for econ reasons usually PT" "Children or Armed Forces" "Children or Armed Forces" ...
## $ tax_filer_status : chr "Joint one under 65 & one 65+" "Joint both under 65" "Nonfiler" "Head of household" ...
## $ region_of_previous_residence : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ state_of_previous_residence : chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ d_household_family_stat : chr "Spouse of householder" "Spouse of householder" "Child <18 never marr not in subfamily" "Householder" ...
## $ d_household_summary : chr "Spouse of householder" "Spouse of householder" "Child under 18 never married" "Householder" ...
## $ live_1_year_ago : chr "Not in universe under 1 year old" "Not in universe under 1 year old" "Not in universe under 1 year old" "Yes" ...
## $ family_members_under_18 : chr "Not in universe" "Not in universe" "Both parents present" "Not in universe" ...
## $ country_father : chr "Mexico" "United-States" "United-States" "United-States" ...
## $ country_mother : chr "Mexico" "United-States" "United-States" "United-States" ...
## $ country_self : chr "Mexico" "United-States" "United-States" "United-States" ...
## $ citizenship : chr "Foreign born- Not a citizen of U S" "Native- Born in the United States" "Native- Born in the United States" "Native- Born in the United States" ...
## $ business_or_self_employed : chr "0" "0" "0" "2" ...
## $ fill_questionnaire_veteran_admin: chr "Not in universe" "Not in universe" "Not in universe" "Not in universe" ...
## $ veterans_benefits : chr "2" "2" "0" "2" ...
## $ year : chr "95" "95" "95" "94" ...
## $ income_level : chr "0" "0" "0" "0" ...
## - attr(*, ".internal.selfref")=<externalptr>
#convert back to factors
cat_test <- cat_test[, (names(cat_test)) := lapply(.SD, factor), .SDcols = names(cat_test)]
str(cat_test)
## Classes 'data.table' and 'data.frame': 99762 obs. of 30 variables:
## $ class_of_worker : Factor w/ 9 levels "Federal government",..: 5 7 4 5 5 4 4 4 5 4 ...
## $ industry_code : Factor w/ 52 levels "0","1","10","11",..: 49 32 1 23 35 1 1 1 41 1 ...
## $ occupation_code : Factor w/ 47 levels "0","1","10","11",..: 31 5 1 24 29 1 1 1 5 1 ...
## $ education : Factor w/ 17 levels "10th grade","11th grade",..: 4 9 11 13 13 11 11 13 15 11 ...
## $ enrolled_in_edu_inst_lastwk : Factor w/ 3 levels "College or university",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ marital_status : Factor w/ 7 levels "Divorced","Married-A F spouse present",..: 3 3 5 1 1 5 5 3 3 5 ...
## $ major_industry_code : Factor w/ 24 levels "Agriculture",..: 11 3 15 22 5 15 15 15 16 15 ...
## $ major_occupation_code : Factor w/ 15 levels "Adm support including clerical",..: 6 11 7 3 9 7 7 7 11 7 ...
## $ race : Factor w/ 5 levels "Amer Indian Aleut or Eskimo",..: 5 5 5 5 5 5 5 5 3 5 ...
## $ hispanic_origin : Factor w/ 10 levels "All other","Central or South American",..: 7 1 6 1 1 1 6 1 1 1 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 1 2 1 ...
## $ member_of_labor_union : Factor w/ 3 levels "No","Not in universe",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ reason_for_unemployment : Factor w/ 6 levels "Job leaver","Job loser - on layoff",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ full_parttime_employment_stat : Factor w/ 8 levels "Children or Armed Forces",..: 2 5 1 1 2 1 1 3 1 1 ...
## $ tax_filer_status : Factor w/ 6 levels "Head of household",..: 4 3 5 1 6 5 5 3 3 5 ...
## $ region_of_previous_residence : Factor w/ 6 levels "Abroad","Midwest",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ state_of_previous_residence : Factor w/ 51 levels "Abroad","Alabama",..: 36 36 36 36 36 36 36 36 36 36 ...
## $ d_household_family_stat : Factor w/ 37 levels "Child 18+ ever marr Not in a subfamily",..: 36 36 6 20 35 6 6 36 20 6 ...
## $ d_household_summary : Factor w/ 8 levels "Child 18 or older",..: 8 8 3 5 6 3 3 8 5 3 ...
## $ live_1_year_ago : Factor w/ 3 levels "No","Not in universe under 1 year old",..: 2 2 2 3 2 3 3 2 3 3 ...
## $ family_members_under_18 : Factor w/ 5 levels "Both parents present",..: 5 5 1 5 5 1 1 5 5 1 ...
## $ country_father : Factor w/ 43 levels "Cambodia","Canada",..: 26 41 41 41 41 11 26 41 41 41 ...
## $ country_mother : Factor w/ 43 levels "Cambodia","Canada",..: 26 41 41 41 41 41 41 41 41 41 ...
## $ country_self : Factor w/ 43 levels "Cambodia","Canada",..: 26 41 41 41 41 41 41 41 41 41 ...
## $ citizenship : Factor w/ 5 levels "Foreign born- Not a citizen of U S",..: 1 5 5 5 5 5 5 5 5 5 ...
## $ business_or_self_employed : Factor w/ 3 levels "0","1","2": 1 1 1 3 1 1 1 1 1 1 ...
## $ fill_questionnaire_veteran_admin: Factor w/ 3 levels "No","Not in universe",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ veterans_benefits : Factor w/ 3 levels "0","1","2": 3 3 1 3 3 1 1 3 3 1 ...
## $ year : Factor w/ 2 levels "94","95": 2 2 2 1 2 1 1 2 1 1 ...
## $ income_level : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
The earlier analysis showed that some categorical variables have levels that occur very rarely, and such levels contribute little to the analysis. In the next step we collapse, within each categorical variable, every level whose frequency is below 5% into an "Other" level. After handling the categorical data, the numeric variables also have values spread too thinly across their ranges, so they are binned into coarser levels in the steps that follow.
# Collapse every level with frequency below 5% into "Other", in both cat_train and cat_test
#train
for(i in names(cat_train)){
p <- 5/100
ld <- names(which(prop.table(table(cat_train[[i]])) < p))
levels(cat_train[[i]])[levels(cat_train[[i]]) %in% ld] <- "Other"
}
#test
for(i in names(cat_test)){
p <- 5/100
ld <- names(which(prop.table(table(cat_test[[i]])) < p))
levels(cat_test[[i]])[levels(cat_test[[i]]) %in% ld] <- "Other"
}
#"nlevs"参数:返回每列下维度的个数,测试集和训练集是否匹配
summarizeColumns(cat_train)[, 'nlevs']
## [1] 3 3 2 5 2 5 3 8 3 2 2 3 2 4 4 2 2 6 5 3 4 3 2 2 3 3 2 3 2 2
summarizeColumns(cat_test)[, 'nlevs']
## [1] 3 3 2 5 2 5 3 8 3 2 2 3 2 4 4 2 2 6 5 3 4 3 2 2 3 3 2 3 2 2
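Counting levels confirms the dimensions line up, but a stricter check would compare the level labels themselves column by column; a small base-R sketch (any mismatched columns would be listed):
# Sketch: flag columns whose factor level sets differ between train and test.
mismatch <- sapply(names(cat_train), function(nm)
  !identical(levels(cat_train[[nm]]), levels(cat_test[[nm]])))
names(cat_train)[mismatch]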
num_train[, .N, age][order(age)]
## age N
## 1: 0 2839
## 2: 1 3138
## 3: 2 3236
## 4: 3 3279
## 5: 4 3318
## 6: 5 3332
## 7: 6 3171
## 8: 7 3218
## 9: 8 3187
## 10: 9 3162
## 11: 10 3134
## 12: 11 3128
## 13: 12 3060
## 14: 13 3152
## 15: 14 3068
## 16: 15 2926
## 17: 16 2882
## 18: 17 2762
## 19: 18 2484
## 20: 19 2419
## 21: 20 2390
## 22: 21 2386
## 23: 22 2573
## 24: 23 2789
## 25: 24 2783
## 26: 25 2783
## 27: 26 2714
## 28: 27 2758
## 29: 28 3013
## 30: 29 3050
## 31: 30 3203
## 32: 31 3351
## 33: 32 3188
## 34: 33 3340
## 35: 34 3489
## 36: 35 3450
## 37: 36 3353
## 38: 37 3278
## 39: 38 3277
## 40: 39 3144
## 41: 40 3114
## 42: 41 3134
## 43: 42 2995
## 44: 43 2889
## 45: 44 2786
## 46: 45 2847
## 47: 46 2816
## 48: 47 2795
## 49: 48 2410
## 50: 49 2142
## 51: 50 2214
## 52: 51 2215
## 53: 52 2115
## 54: 53 1900
## 55: 54 1745
## 56: 55 1730
## 57: 56 1710
## 58: 57 1622
## 59: 58 1600
## 60: 59 1580
## 61: 60 1560
## 62: 61 1497
## 63: 62 1531
## 64: 63 1501
## 65: 64 1579
## 66: 65 1550
## 67: 66 1443
## 68: 67 1496
## 69: 68 1436
## 70: 69 1412
## 71: 70 1410
## 72: 71 1418
## 73: 72 1315
## 74: 73 1354
## 75: 74 1227
## 76: 75 1065
## 77: 76 1050
## 78: 77 979
## 79: 78 876
## 80: 79 811
## 81: 80 799
## 82: 81 720
## 83: 82 615
## 84: 83 561
## 85: 84 519
## 86: 85 423
## 87: 86 348
## 88: 87 301
## 89: 88 241
## 90: 89 195
## 91: 90 725
## age N
num_train[, .N, wage_per_hour][order(-N)]
## wage_per_hour N
## 1: 0 188219
## 2: 500 734
## 3: 600 546
## 4: 700 534
## 5: 800 507
## ---
## 1236: 724 1
## 1237: 1376 1
## 1238: 3156 1
## 1239: 2188 1
## 1240: 1092 1
# Bin age into three groups, "young", "adult" and "old", using cut points 0, 30, 60 and 90
num_train[, age := cut(age, breaks = c(0,30,60,90), labels = c("young","adult","old"),
include.lowest = TRUE)]
num_test[, age := cut(age, breaks = c(0,30,60,90), labels = c("young","adult","old"),
include.lowest = TRUE)]
# Recode wage_per_hour, capital_gains, capital_losses and dividend_from_Stocks into two levels: Zero and MoreThanZero
num_train[,wage_per_hour := ifelse(wage_per_hour == 0,"Zero","MoreThanZero")][,wage_per_hour := as.factor(wage_per_hour)]
num_train[,capital_gains := ifelse(capital_gains == 0,"Zero","MoreThanZero")][,capital_gains := as.factor(capital_gains)]
num_train[,capital_losses := ifelse(capital_losses == 0,"Zero","MoreThanZero")][,capital_losses := as.factor(capital_losses)]
num_train[,dividend_from_Stocks := ifelse(dividend_from_Stocks == 0,"Zero","MoreThanZero")][,dividend_from_Stocks := as.factor(dividend_from_Stocks)]
num_test[,wage_per_hour := ifelse(wage_per_hour == 0,"Zero","MoreThanZero")][,wage_per_hour := as.factor(wage_per_hour)]
num_test[,capital_gains := ifelse(capital_gains == 0,"Zero","MoreThanZero")][,capital_gains := as.factor(capital_gains)]
num_test[,capital_losses := ifelse(capital_losses == 0,"Zero","MoreThanZero")][,capital_losses := as.factor(capital_losses)]
num_test[,dividend_from_Stocks := ifelse(dividend_from_Stocks == 0,"Zero","MoreThanZero")][,dividend_from_Stocks := as.factor(dividend_from_Stocks)]
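As an alternative to the eight near-identical statements above (not in addition to them), the same recoding could be written as one loop over the column names; a behaviour-equivalent sketch:
# Sketch: recode the four columns to Zero/MoreThanZero in both data.tables at once.
bin_cols <- c("wage_per_hour", "capital_gains", "capital_losses", "dividend_from_Stocks")
for (dt in list(num_train, num_test)) {
  dt[, (bin_cols) := lapply(.SD, function(x)
        factor(ifelse(x == 0, "Zero", "MoreThanZero"))), .SDcols = bin_cols]
}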
train <- cbind(num_train, cat_train)
test <- cbind(num_test, cat_test)
train.task <- makeClassifTask(data = train, target = "income_level")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class
## data.table, hence it will be converted.
test.task<- makeClassifTask(data = test, target = "income_level")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class
## data.table, hence it will be converted.
#remove constant variables with no variance
train.task <- removeConstantFeatures(train.task)
test.task <- removeConstantFeatures(test.task)
To deal with an imbalanced dataset, the usual techniques are oversampling, undersampling, and SMOTE (Synthetic Minority Oversampling TEchnique), a representative oversampling-style method. Oversampling: add positive examples until the positive and negative counts are close, then train. Undersampling: remove negative examples until the counts are close, then train. SMOTE: generate extra positive examples by interpolating between existing minority samples that lie close to each other, inserting new samples to balance the classes.
Undersampling usually costs far less training time than oversampling, because it discards many negative examples and so the training set becomes much smaller than the original; its possible downside is that it may throw away information. Oversampling adds many positive examples, so the training set grows beyond the original, training takes longer, and overfitting becomes more likely. For more on the SMOTE sampling method, Chawla's paper gives a detailed account.
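Before applying mlr's implementations below, the core SMOTE idea can be sketched in a few lines (hypothetical numeric features, not mlr's actual code): a synthetic minority example is placed at a random point on the segment between a minority sample and one of its nearest minority-class neighbours.
x     <- c(age = 40, wage_per_hour = 1200)   # hypothetical minority sample
x_nn  <- c(age = 44, wage_per_hour = 1500)   # hypothetical nearest neighbour
gap   <- runif(1)                            # random weight in [0, 1]
x_new <- x + gap * (x_nn - x)                # synthetic sample on the segment
x_new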
train_under <- undersample(train.task, rate = 0.1)
table(getTaskTargets(train_under))
##
## 0 1
## 18714 12382
train_over <- oversample(train.task, rate = 15)
table(getTaskTargets(train_over))
##
## 0 1
## 187141 185730
train.smote <- smote(train.task,rate = 10,nn = 3)
table(getTaskTargets(train.smote))
##
## 0 1
## 187141 123820
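The class counts above follow directly from the chosen rates — the original training task has 187,141 negatives and 12,382 positives — as a quick arithmetic check shows:
# Arithmetic check of the resampled class sizes reported above.
round(187141 * 0.1)   # undersample keeps ~10% of the negatives: 18714
12382 * 15            # oversample replicates the positives 15x: 185730
12382 * 10            # smote grows the positives 10x: 123820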
With the feature engineering and cleaning done, we move on to modelling. Before building a model, quickly choosing a suitable learning algorithm is crucial, and mlr offers a very convenient way to do it: listLearners() shows every available algorithm together with its data requirements, and it accepts conditions so we can filter for the applicable ones.
listLearners("classif","twoclass")[c("class","package")]
## Warning in listLearners.character("classif", "twoclass"): The following learners could not be constructed, probably because their packages are not installed:
## classif.ada,classif.bartMachine,classif.bdk,classif.blackboost,classif.bst,classif.cforest,classif.clusterSVM,classif.ctree,classif.cvglmnet,classif.dbnDNN,classif.dcSVM,classif.earth,classif.evtree,classif.extraTrees,classif.fnn,classif.gamboost,classif.gaterSVM,classif.gbm,classif.geoDA,classif.glmboost,classif.glmnet,classif.h2o.deeplearning,classif.h2o.gbm,classif.h2o.glm,classif.h2o.randomForest,classif.hdrda,classif.kknn,classif.LiblineaRL1L2SVC,classif.LiblineaRL1LogReg,classif.LiblineaRL2L1SVC,classif.LiblineaRL2LogReg,classif.LiblineaRL2SVC,classif.LiblineaRMultiClassSVC,classif.linDA,classif.lqa,classif.mda,classif.nnTrain,classif.nodeHarvest,classif.pamr,classif.penalized.fusedlasso,classif.penalized.lasso,classif.penalized.ridge,classif.plr,classif.quaDA,classif.randomForestSRC,classif.ranger,classif.rFerns,classif.rknn,classif.rotationForest,classif.RRF,classif.rrlda,classif.saeDNN,classif.sda,classif.sparseLDA,classif.xyf,cluster.cmeans,cluster.dbscan,cluster.kmeans,multilabel.cforest,multilabel.randomForestSRC,multilabel.rFerns,regr.bartMachine,regr.bcart,regr.bdk,regr.bgp,regr.bgpllm,regr.blackboost,regr.blm,regr.brnn,regr.bst,regr.btgp,regr.btgpllm,regr.btlm,regr.cforest,regr.crs,regr.ctree,regr.cubist,regr.cvglmnet,regr.earth,regr.elmNN,regr.evtree,regr.extraTrees,regr.fnn,regr.frbs,regr.gamboost,regr.gbm,regr.glmboost,regr.glmnet,regr.GPfit,regr.h2o.deeplearning,regr.h2o.gbm,regr.h2o.glm,regr.h2o.randomForest,regr.kknn,regr.km,regr.laGP,regr.LiblineaRL2L1SVR,regr.LiblineaRL2L2SVR,regr.mars,regr.mob,regr.nodeHarvest,regr.pcr,regr.penalized.fusedlasso,regr.penalized.lasso,regr.penalized.ridge,regr.plsr,regr.randomForestSRC,regr.ranger,regr.rknn,regr.RRF,regr.rsm,regr.slim,regr.xyf,surv.cforest,surv.CoxBoost,surv.cv.CoxBoost,surv.cvglmnet,surv.gamboost,surv.gbm,surv.glmboost,surv.glmnet,surv.penalized.fusedlasso,surv.penalized.lasso,surv.penalized.ridge,surv.randomForestSRC,surv.ranger
## Check ?learners to see which packages you need or install mlr with all suggestions.
## class package
## 1 classif.binomial stats
## 2 classif.boosting adabag,rpart
## 3 classif.C50 C50
## 4 classif.featureless mlr
## 5 classif.gausspr kernlab
## 6 classif.IBk RWeka
## ... (29 rows, 2 cols)
#naive Bayes
naive_learner <- makeLearner("classif.naiveBayes",predict.type = "response")
naive_learner$par.vals <- list(laplace = 1)
#10fold CV - stratified
folds <- makeResampleDesc("CV",iters=10,stratify = TRUE)
#cross validation function
fun_cv <- function(a){
crv_val <- resample(naive_learner,a,folds,measures = list(acc,tpr,tnr,fpr,fp,fn))
crv_val$aggr
}
#Let's check whether the raw training data or the SMOTEd training data works better
# The cross-validation results below show that different sampling strategies
# for the imbalanced data lead to markedly different results
fun_cv(train.task)
## [Resample] cross-validation iter 1:
## acc.test.mean=0.733,tpr.test.mean=0.722,tnr.test.mean=0.892,fpr.test.mean=0.108,fp.test.mean= 134,fn.test.mean=5.2e+03
## [Resample] cross-validation iter 2:
## acc.test.mean=0.733,tpr.test.mean=0.722,tnr.test.mean=0.89,fpr.test.mean=0.11,fp.test.mean= 136,fn.test.mean=5.2e+03
## [Resample] cross-validation iter 3:
## acc.test.mean=0.728,tpr.test.mean=0.718,tnr.test.mean=0.877,fpr.test.mean=0.123,fp.test.mean= 153,fn.test.mean=5.28e+03
## [Resample] cross-validation iter 4:
## acc.test.mean=0.727,tpr.test.mean=0.715,tnr.test.mean=0.904,fpr.test.mean=0.0961,fp.test.mean= 119,fn.test.mean=5.33e+03
## [Resample] cross-validation iter 5:
## acc.test.mean=0.737,tpr.test.mean=0.726,tnr.test.mean=0.905,fpr.test.mean=0.0945,fp.test.mean= 117,fn.test.mean=5.12e+03
## [Resample] cross-validation iter 6:
## acc.test.mean=0.725,tpr.test.mean=0.714,tnr.test.mean=0.895,fpr.test.mean=0.105,fp.test.mean= 130,fn.test.mean=5.35e+03
## [Resample] cross-validation iter 7:
## acc.test.mean=0.741,tpr.test.mean=0.73,tnr.test.mean=0.905,fpr.test.mean=0.0945,fp.test.mean= 117,fn.test.mean=5.06e+03
## [Resample] cross-validation iter 8:
## acc.test.mean=0.737,tpr.test.mean=0.727,tnr.test.mean= 0.9,fpr.test.mean= 0.1,fp.test.mean= 124,fn.test.mean=5.12e+03
## [Resample] cross-validation iter 9:
## acc.test.mean=0.732,tpr.test.mean=0.722,tnr.test.mean=0.886,fpr.test.mean=0.114,fp.test.mean= 141,fn.test.mean=5.2e+03
## [Resample] cross-validation iter 10:
## acc.test.mean=0.73,tpr.test.mean=0.718,tnr.test.mean=0.905,fpr.test.mean=0.0945,fp.test.mean= 117,fn.test.mean=5.28e+03
## [Resample] Aggr. Result: acc.test.mean=0.732,tpr.test.mean=0.721,tnr.test.mean=0.896,fpr.test.mean=0.104,fp.test.mean= 129,fn.test.mean=5.21e+03
## acc.test.mean tpr.test.mean tnr.test.mean fpr.test.mean fp.test.mean
## 0.7322715 0.7214400 0.8959799 0.1040201 128.8000000
## fn.test.mean
## 5213.0000000
fun_cv(train.smote)
## [Resample] cross-validation iter 1:
## acc.test.mean=0.873,tpr.test.mean=0.824,tnr.test.mean=0.945,fpr.test.mean=0.0546,fp.test.mean= 676,fn.test.mean=3.28e+03
## [Resample] cross-validation iter 2:
## acc.test.mean=0.873,tpr.test.mean=0.825,tnr.test.mean=0.945,fpr.test.mean=0.0549,fp.test.mean= 680,fn.test.mean=3.27e+03
## [Resample] cross-validation iter 3:
## acc.test.mean=0.872,tpr.test.mean=0.824,tnr.test.mean=0.945,fpr.test.mean=0.0554,fp.test.mean= 686,fn.test.mean=3.3e+03
## [Resample] cross-validation iter 4:
## acc.test.mean=0.872,tpr.test.mean=0.825,tnr.test.mean=0.942,fpr.test.mean=0.058,fp.test.mean= 718,fn.test.mean=3.27e+03
## [Resample] cross-validation iter 5:
## acc.test.mean=0.874,tpr.test.mean=0.826,tnr.test.mean=0.947,fpr.test.mean=0.0531,fp.test.mean= 658,fn.test.mean=3.25e+03
## [Resample] cross-validation iter 6:
## acc.test.mean=0.874,tpr.test.mean=0.827,tnr.test.mean=0.945,fpr.test.mean=0.0553,fp.test.mean= 685,fn.test.mean=3.24e+03
## [Resample] cross-validation iter 7:
## acc.test.mean=0.871,tpr.test.mean=0.825,tnr.test.mean=0.941,fpr.test.mean=0.0591,fp.test.mean= 732,fn.test.mean=3.27e+03
## [Resample] cross-validation iter 8:
## acc.test.mean=0.872,tpr.test.mean=0.824,tnr.test.mean=0.944,fpr.test.mean=0.0562,fp.test.mean= 696,fn.test.mean=3.29e+03
## [Resample] cross-validation iter 9:
## acc.test.mean=0.874,tpr.test.mean=0.826,tnr.test.mean=0.946,fpr.test.mean=0.0544,fp.test.mean= 673,fn.test.mean=3.25e+03
## [Resample] cross-validation iter 10:
## acc.test.mean=0.873,tpr.test.mean=0.826,tnr.test.mean=0.945,fpr.test.mean=0.0548,fp.test.mean= 679,fn.test.mean=3.26e+03
## [Resample] Aggr. Result: acc.test.mean=0.873,tpr.test.mean=0.825,tnr.test.mean=0.944,fpr.test.mean=0.0556,fp.test.mean= 688,fn.test.mean=3.27e+03
## acc.test.mean tpr.test.mean tnr.test.mean fpr.test.mean fp.test.mean
## 8.727268e-01 8.252975e-01 9.444112e-01 5.558876e-02 6.883000e+02
## fn.test.mean
## 3.269400e+03
#train and predict
nB_model <- mlr::train(naive_learner, train.smote)
nB_predict <- predict(nB_model, test.task)
#evaluate
nB_prediction <- nB_predict$data$response
dCM <- confusionMatrix(test$income_level,nB_prediction)
dCM
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 77317 16259
## 1 1074 5112
##
## Accuracy : 0.8263
## 95% CI : (0.8239, 0.8286)
## No Information Rate : 0.7858
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3041
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9863
## Specificity : 0.2392
## Pos Pred Value : 0.8262
## Neg Pred Value : 0.8264
## Prevalence : 0.7858
## Detection Rate : 0.7750
## Detection Prevalence : 0.9380
## Balanced Accuracy : 0.6128
##
## 'Positive' Class : 0
##
Precision (Pos Pred Value): 0.826; recall (sensitivity): 0.986; specificity (true negative rate): 0.239.
#calculate F measure
precision <- dCM$byClass['Pos Pred Value']
recall <- dCM$byClass['Sensitivity']
specificity <- dCM$byClass['Specificity']
# F measure (F1 score)
f_measure <- 2*((precision*recall)/(precision+recall))
f_measure
## Pos Pred Value
## 0.8992074
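The same headline numbers can be recomputed by hand from the four cells of the confusion matrix printed above, as a cross-check (the positive class is "0"):
# Cross-check of caret's summary using the confusion-matrix cells.
tp <- 77317; fp <- 16259; fn <- 1074; tn <- 5112
tp / (tp + fn)   # sensitivity / recall  ~ 0.986
tp / (tp + fp)   # pos pred value / precision ~ 0.826
tn / (tn + fp)   # specificity ~ 0.239
2 * (tp / (tp + fp)) * (tp / (tp + fn)) / (tp / (tp + fp) + tp / (tp + fn))  # F1 ~ 0.899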
Based on these results, the model achieves a precision of 0.826 and a recall of 0.986, but a specificity of only 0.239. In other words, it predicts the positive class ("0", income below 50,000) well, but its accuracy on the much rarer class is low. This is not yet satisfactory; in a follow-up post we will explore whether other models do better.