Loading the libraries

ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg)) 
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}

packages <- c("dplyr","tidyr","ggplot2","ROCR","corrplot","sparklyr","caret","missForest",
              "doParallel","R.utils","glmnet","coefplot","knitr","GGally")

ipak(packages)
##      dplyr      tidyr    ggplot2       ROCR   corrplot   sparklyr      caret 
##       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE 
## missForest doParallel    R.utils     glmnet   coefplot      knitr     GGally 
##       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE       TRUE

Downloading and unziping the data

temp <- tempfile()
if(!file.exists("./census-income.data.gz")){
  
    fileUrl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.data.gz"  
    
    download.file(fileUrl, destfile = "./census-income.data.gz", mode="wb")
}

if(!file.exists("census-income.data")){
    gunzip("census-income.data.gz","census-income.data",remove=F)
}
rm(fileUrl)
temp <- tempfile()
if(!file.exists("./census-income.test.gz")){
  
    fileUrl2 <- "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.test.gz"  
    
    download.file(fileUrl2, destfile = "./census-income.test.gz", mode="wb")
}

if(!file.exists("census-income.test")){
    gunzip("census-income.test.gz","census-income.test",remove=F)
}
rm(fileUrl2)

Loading the databases

df <- read.table("census-income.data", sep = ','
                 ,stringsAsFactors = TRUE ,na.strings = c(" ?", " Not in universe"))
df_test <- read.table("census-income.test", sep = ','
                      ,stringsAsFactors = TRUE,na.strings = c(" ?", " Not in universe"))

Adding the column names

colns <- c("Age","Class_of_Worker","Industry_code","occupation_code",
           "education","wage_per_hour","enrolled_in_edu_inst_last wk",
           "marital_status","major_industry_code","major_occupation_code","race",
           "hispanic_Origin","sex","member_of_a_labor_union","reason_for_unemployment",
           "full_part_time_employment_stat","capital_gains","capital_losses",
           "divdends_from_stocks","tax_filer_status","region_of_previous_residence",
           "state_of_previous_residence","detailed_household_and_family_stat",
           "detailed_household_summary_in_household","instance_weight",
           "migration_code-change_msa","migration_code-change_reg",
           "migration_code-move_within_reg","live_in_house_1_year_ago",
           "migration_prev_res_sunbelt","num_persons_worked_for_employer",
           "family_members_under 18","country_birth_father","country_birth_mother",
           "country_birth_self","citizenship","ownbusiness_or_self_employed",
           "fillinc_questionnaire_for_veterans_admin","veteran_benefits" ,
           "Weeks_worked_in_year", "Year", "Salary")
           
colnames(df) <- colns
colnames(df_test) <- colns
head(df,3)

Data Structure

kable(str(df))
## 'data.frame':    199523 obs. of  42 variables:
##  $ Age                                     : int  73 58 18 9 10 48 42 28 47 34 ...
##  $ Class_of_Worker                         : Factor w/ 8 levels " Federal government",..: NA 6 NA NA NA 4 4 4 2 4 ...
##  $ Industry_code                           : int  0 4 0 0 0 40 34 4 43 4 ...
##  $ occupation_code                         : int  0 34 0 0 0 10 3 40 26 37 ...
##  $ education                               : Factor w/ 17 levels " 10th grade",..: 13 17 1 11 11 17 10 13 17 17 ...
##  $ wage_per_hour                           : int  0 0 0 0 0 1200 0 0 876 0 ...
##  $ enrolled_in_edu_inst_last wk            : Factor w/ 2 levels " College or university",..: NA NA 2 NA NA NA NA NA NA NA ...
##  $ marital_status                          : Factor w/ 7 levels " Divorced"," Married-A F spouse present",..: 7 1 5 5 5 3 3 5 3 3 ...
##  $ major_industry_code                     : Factor w/ 24 levels " Agriculture",..: 15 5 15 15 15 7 8 5 6 5 ...
##  $ major_occupation_code                   : Factor w/ 14 levels " Adm support including clerical",..: NA 8 NA NA NA 10 3 5 1 6 ...
##  $ race                                    : Factor w/ 5 levels " Amer Indian Aleut or Eskimo",..: 5 5 2 5 5 1 5 5 5 5 ...
##  $ hispanic_Origin                         : Factor w/ 10 levels " All other"," Central or South American",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex                                     : Factor w/ 2 levels " Female"," Male": 1 2 1 1 1 1 2 1 1 2 ...
##  $ member_of_a_labor_union                 : Factor w/ 2 levels " No"," Yes": NA NA NA NA NA 1 NA NA 1 NA ...
##  $ reason_for_unemployment                 : Factor w/ 5 levels " Job leaver",..: NA NA NA NA NA NA NA 2 NA NA ...
##  $ full_part_time_employment_stat          : Factor w/ 8 levels " Children or Armed Forces",..: 3 1 3 1 1 2 1 7 2 1 ...
##  $ capital_gains                           : int  0 0 0 0 0 0 5178 0 0 0 ...
##  $ capital_losses                          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divdends_from_stocks                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tax_filer_status                        : Factor w/ 6 levels " Head of household",..: 5 1 5 5 5 3 3 6 3 3 ...
##  $ region_of_previous_residence            : Factor w/ 5 levels " Abroad"," Midwest",..: NA 4 NA NA NA NA NA NA NA NA ...
##  $ state_of_previous_residence             : Factor w/ 49 levels " Abroad"," Alabama",..: NA 5 NA NA NA NA NA NA NA NA ...
##  $ detailed_household_and_family_stat      : Factor w/ 38 levels " Child <18 ever marr not in subfamily",..: 30 21 8 3 3 37 21 36 37 21 ...
##  $ detailed_household_summary_in_household : Factor w/ 8 levels " Child 18 or older",..: 7 5 1 3 3 8 5 6 8 5 ...
##  $ instance_weight                         : num  1700 1054 992 1758 1069 ...
##  $ migration_code-change_msa               : Factor w/ 8 levels " Abroad to MSA",..: NA 3 NA 5 5 NA 5 NA NA 5 ...
##  $ migration_code-change_reg               : Factor w/ 7 levels " Abroad"," Different county same state",..: NA 7 NA 6 6 NA 6 NA NA 6 ...
##  $ migration_code-move_within_reg          : Factor w/ 8 levels " Abroad"," Different county same state",..: NA 8 NA 7 7 NA 7 NA NA 7 ...
##  $ live_in_house_1_year_ago                : Factor w/ 3 levels " No"," Not in universe under 1 year old",..: 2 1 2 3 3 2 3 2 2 3 ...
##  $ migration_prev_res_sunbelt              : Factor w/ 2 levels " No"," Yes": NA 2 NA NA NA NA NA NA NA NA ...
##  $ num_persons_worked_for_employer         : int  0 1 0 0 0 1 6 4 5 6 ...
##  $ family_members_under 18                 : Factor w/ 4 levels " Both parents present",..: NA NA NA 1 1 NA NA NA NA NA ...
##  $ country_birth_father                    : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 31 40 40 40 40 ...
##  $ country_birth_mother                    : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 40 40 40 40 40 ...
##  $ country_birth_self                      : Factor w/ 42 levels " Cambodia"," Canada",..: 40 40 41 40 40 40 40 40 40 40 ...
##  $ citizenship                             : Factor w/ 5 levels " Foreign born- Not a citizen of U S ",..: 5 5 1 5 5 5 5 5 5 5 ...
##  $ ownbusiness_or_self_employed            : int  0 0 0 0 0 2 0 0 0 0 ...
##  $ fillinc_questionnaire_for_veterans_admin: Factor w/ 2 levels " No"," Yes": NA NA NA NA NA NA NA NA NA NA ...
##  $ veteran_benefits                        : int  2 2 2 0 0 2 2 2 2 2 ...
##  $ Weeks_worked_in_year                    : int  0 52 0 0 0 52 52 30 52 52 ...
##  $ Year                                    : int  95 94 95 94 94 95 94 95 95 94 ...
##  $ Salary                                  : Factor w/ 2 levels " - 50000."," 50000+.": 1 1 1 1 1 1 1 1 1 1 ...

|| || || ||

Data Summary

summary(df)
##       Age                               Class_of_Worker   Industry_code  
##  Min.   : 0.00    Private                       : 72028   Min.   : 0.00  
##  1st Qu.:15.00    Self-employed-not incorporated:  8445   1st Qu.: 0.00  
##  Median :33.00    Local government              :  7784   Median : 0.00  
##  Mean   :34.49    State government              :  4227   Mean   :15.35  
##  3rd Qu.:50.00    Self-employed-incorporated    :  3265   3rd Qu.:33.00  
##  Max.   :90.00   (Other)                        :  3529   Max.   :51.00  
##                  NA's                           :100245                  
##  occupation_code                       education     wage_per_hour    
##  Min.   : 0.00    High school graduate      :48407   Min.   :   0.00  
##  1st Qu.: 0.00    Children                  :47422   1st Qu.:   0.00  
##  Median : 0.00    Some college but no degree:27820   Median :   0.00  
##  Mean   :11.31    Bachelors degree(BA AB BS):19865   Mean   :  55.43  
##  3rd Qu.:26.00    7th and 8th grade         : 8007   3rd Qu.:   0.00  
##  Max.   :46.00    10th grade                : 7557   Max.   :9999.00  
##                  (Other)                    :40445                    
##          enrolled_in_edu_inst_last wk                          marital_status 
##   College or university:  5688         Divorced                       :12710  
##   High school          :  6892         Married-A F spouse present     :  665  
##  NA's                  :186943         Married-civilian spouse present:84222  
##                                        Married-spouse absent          : 1518  
##                                        Never married                  :86485  
##                                        Separated                      : 3460  
##                                        Widowed                        :10463  
##                          major_industry_code
##   Not in universe or children      :100684  
##   Retail trade                     : 17070  
##   Manufacturing-durable goods      :  9015  
##   Education                        :  8283  
##   Manufacturing-nondurable goods   :  6897  
##   Finance insurance and real estate:  6145  
##  (Other)                           : 51429  
##                      major_occupation_code
##   Adm support including clerical: 14837   
##   Professional specialty        : 13940   
##   Executive admin and managerial: 12495   
##   Other service                 : 12099   
##   Sales                         : 11783   
##  (Other)                        : 33685   
##  NA's                           :100684   
##                            race                          hispanic_Origin  
##   Amer Indian Aleut or Eskimo:  2251    All other                :171907  
##   Asian or Pacific Islander  :  5835    Mexican-American         :  8079  
##   Black                      : 20415    Mexican (Mexicano)       :  7234  
##   Other                      :  3657    Central or South American:  3895  
##   White                      :167365    Puerto Rican             :  3313  
##                                         Other Spanish            :  2485  
##                                        (Other)                   :  2610  
##       sex         member_of_a_labor_union           reason_for_unemployment
##   Female:103984    No : 16034              Job leaver           :   598    
##   Male  : 95539    Yes:  3030              Job loser - on layoff:   976    
##                   NA's:180459              New entrant          :   439    
##                                            Other job loser      :  2038    
##                                            Re-entrant           :  2019    
##                                           NA's                  :193453    
##                                                                            
##                      full_part_time_employment_stat capital_gains    
##   Children or Armed Forces          :123769         Min.   :    0.0  
##   Full-time schedules               : 40736         1st Qu.:    0.0  
##   Not in labor force                : 26808         Median :    0.0  
##   PT for non-econ reasons usually FT:  3322         Mean   :  434.7  
##   Unemployed full-time              :  2311         3rd Qu.:    0.0  
##   PT for econ reasons usually PT    :  1209         Max.   :99999.0  
##  (Other)                            :  1368                          
##  capital_losses    divdends_from_stocks                      tax_filer_status
##  Min.   :   0.00   Min.   :    0.0       Head of household           : 7426  
##  1st Qu.:   0.00   1st Qu.:    0.0       Joint both 65+              : 8332  
##  Median :   0.00   Median :    0.0       Joint both under 65         :67383  
##  Mean   :  37.31   Mean   :  197.5       Joint one under 65 & one 65+: 3867  
##  3rd Qu.:   0.00   3rd Qu.:    0.0       Nonfiler                    :75094  
##  Max.   :4608.00   Max.   :99999.0       Single                      :37421  
##                                                                              
##  region_of_previous_residence  state_of_previous_residence
##   Abroad   :   530             California    :  1714      
##   Midwest  :  3575             Utah          :  1063      
##   Northeast:  2705             Florida       :   849      
##   South    :  4889             North Carolina:   812      
##   West     :  4074             Abroad        :   671      
##  NA's      :183750            (Other)        :  9956      
##                               NA's           :184458      
##                         detailed_household_and_family_stat
##   Householder                            :53248           
##   Child <18 never marr not in subfamily  :50326           
##   Spouse of householder                  :41695           
##   Nonfamily householder                  :22213           
##   Child 18+ never marr Not in a subfamily:12030           
##   Secondary individual                   : 6122           
##  (Other)                                 :13889           
##            detailed_household_summary_in_household instance_weight   
##   Householder                  :75475              Min.   :   37.87  
##   Child under 18 never married :50426              1st Qu.: 1061.62  
##   Spouse of householder        :41709              Median : 1618.31  
##   Child 18 or older            :14430              Mean   : 1740.38  
##   Other relative of householder: 9703              3rd Qu.: 2188.61  
##   Nonrelative of householder   : 7601              Max.   :18656.30  
##  (Other)                       :  179                                
##      migration_code-change_msa                  migration_code-change_reg
##   Nonmover        : 82538       Nonmover                     : 82538     
##   MSA to MSA      : 10601       Same county                  :  9812     
##   NonMSA to nonMSA:  2811       Different county same state  :  2797     
##   MSA to nonMSA   :   790       Different region             :  1178     
##   NonMSA to MSA   :   615       Different state same division:   991     
##  (Other)          :   956      (Other)                       :   995     
##  NA's             :101212      NA's                          :101212     
##               migration_code-move_within_reg
##   Nonmover                   : 82538        
##   Same county                :  9812        
##   Different county same state:  2797        
##   Different state in South   :   973        
##   Different state in West    :   679        
##  (Other)                     :  1512        
##  NA's                        :101212        
##                       live_in_house_1_year_ago migration_prev_res_sunbelt
##   No                              : 15773       No :  9987               
##   Not in universe under 1 year old:101212       Yes:  5786               
##   Yes                             : 82538      NA's:183750               
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##  num_persons_worked_for_employer            family_members_under 18
##  Min.   :0.000                    Both parents present  : 38983    
##  1st Qu.:0.000                    Father only present   :  1883    
##  Median :1.000                    Mother only present   : 12772    
##  Mean   :1.956                    Neither parent present:  1653    
##  3rd Qu.:4.000                   NA's                   :144232    
##  Max.   :6.000                                                     
##                                                                    
##      country_birth_father     country_birth_mother      country_birth_self
##   United-States:159163     United-States:160479     United-States:176989  
##   Mexico       : 10008     Mexico       :  9781     Mexico       :  5767  
##   Puerto-Rico  :  2680     Puerto-Rico  :  2473     Puerto-Rico  :  1400  
##   Italy        :  2212     Italy        :  1844     Germany      :   851  
##   Canada       :  1380     Canada       :  1451     Philippines  :   845  
##  (Other)       : 17367    (Other)       : 17376    (Other)       : 10278  
##  NA's          :  6713    NA's          :  6119    NA's          :  3393  
##                                        citizenship    
##   Foreign born- Not a citizen of U S         : 13401  
##   Foreign born- U S citizen by naturalization:  5855  
##   Native- Born abroad of American Parent(s)  :  1756  
##   Native- Born in Puerto Rico or U S Outlying:  1519  
##   Native- Born in the United States          :176992  
##                                                       
##                                                       
##  ownbusiness_or_self_employed fillinc_questionnaire_for_veterans_admin
##  Min.   :0.0000                No :  1593                             
##  1st Qu.:0.0000                Yes:   391                             
##  Median :0.0000               NA's:197539                             
##  Mean   :0.1754                                                       
##  3rd Qu.:0.0000                                                       
##  Max.   :2.0000                                                       
##                                                                       
##  veteran_benefits Weeks_worked_in_year      Year            Salary      
##  Min.   :0.000    Min.   : 0.00        Min.   :94.0    - 50000.:187141  
##  1st Qu.:2.000    1st Qu.: 0.00        1st Qu.:94.0    50000+. : 12382  
##  Median :2.000    Median : 8.00        Median :94.0                     
##  Mean   :1.515    Mean   :23.17        Mean   :94.5                     
##  3rd Qu.:2.000    3rd Qu.:52.00        3rd Qu.:95.0                     
##  Max.   :2.000    Max.   :52.00        Max.   :95.0                     
## 

Dimensions of train and test set

dim(df)
## [1] 199523     42
dim(df_test)
## [1] 99762    42

Feature Selection

nas <- inspectdf::inspect_na(df) #count percentage of NA's
todrop<-unlist(nas %>%
                 filter(pcnt>50) %>%
                 select(col_name)) # filtering features with NA percentage bigger than 50


near_zero_var <- nearZeroVar(df, freqCut = 99/1) 
near_zero_var <- names(df)[near_zero_var] # finding variables with almost zero variance


df <- df %>% 
  select(-c(todrop,near_zero_var))

df <- df %>%
  select(-instance_weight) # removing weight feature, we not gonna use it for prediction purposes

df <- df %>%  # same information with other variables
  select(-major_industry_code,-detailed_household_and_family_stat) 

df <- df %>% # most of values are "nofiler"
  select(-tax_filer_status,-live_in_house_1_year_ago)

Using missForest function to fill NA. we gonna use

registerDoParallel(cores=3)
dfn<- df %>% 
  select(-Salary) %>%
  missForest(maxiter=1,ntree=10, variablewise=T, parallelize = "forests")
##   missForest iteration 1 in progress...done!
df[,-20] <- dfn$ximp

Data visualization and analysis

ggplot(df, aes(x=Salary))+ geom_bar(aes(fill=Salary))

ggplot(df, aes(x=Age))+ geom_histogram(aes(colour=I("black"), fill=I("blue")),bins=20)

ggplot(df, aes(x=race))+ geom_bar(aes( colour=I("black"), fill=race))

ggplot(df, aes(x=sex))+ geom_bar(aes( colour=I("black"), fill=sex))

ggplot(df, aes(x=citizenship))+ geom_bar(aes( colour=I("black"), fill=citizenship)) + coord_flip() + theme(legend.position = "none")

ggplot(df, aes(x=as.factor(marital_status)))+ geom_bar(aes( colour=I("black"), fill=marital_status))

ggplot(df, aes(x=education))+ geom_bar(aes( colour=I("black"), fill=education)) + coord_flip() + theme(legend.position = "none")

## Data correlations

boxplot(Age~Salary, data= df , main = "Age vs Salary", 
        xlab = "Salary", ylab = "Age", col = "orange")

As we can see from the boxplot the age has important correlation with Salary. It looks like that you need to be older to earn more than 50.000.

boxplot(num_persons_worked_for_employer~Salary, data= df , main = "Num of person worked for employer vs Salary", 
        xlab = "Salary", ylab = "Num of person worked for employer", col = "Green")

Working in a place with more employees seems to be important for Salary

boxplot(Weeks_worked_in_year~Salary, data= df , main = "Weeks worked vs Salary", 
        xlab = "Salary", ylab = "Weeks worked in a year", col = "red")

Here we can see that in our database females are more than males but the males have significant more people earning more than 50.000. So we can see there is a correlation between sex and salary

ggplot(df, aes(x=as.factor(Salary))) + geom_bar(stat = "count", aes(fill=sex)) + xlab("Salary") + ggtitle("Salary with Sex") + theme_classic()

Race is also very important. it seems to be rare to earn more than 50k if you are not white.

qplot (Salary, data = df, fill = race) + facet_grid (. ~ race)

You need to have veteran benefits if you want to earn more than 50k

qplot (Salary, data = df, fill = as.factor(veteran_benefits) ) + facet_grid (. ~ veteran_benefits)

qplot (Salary, data = df, fill = marital_status) + facet_grid (. ~ marital_status)

df_test <- df_test %>% select(-c(todrop,near_zero_var,instance_weight,major_industry_code,
                                 detailed_household_and_family_stat,tax_filer_status,
                                 live_in_house_1_year_ago))
registerDoParallel(cores=3)
dfnt<- df_test %>% 
  select(-Salary) %>%
  missForest(maxiter=1,ntree=10, variablewise=T, parallelize = "forests")
##   missForest iteration 1 in progress...done!
df_test[,-20] <- dfnt$ximp

Binding the train and test database for feature engineering

Usually we need to split the data in train and test set after feature engineering, so we can avoid different kind of problems. One of the problems is that some times we have different number of levels or entries for train and test set regarding to the same feature. But this is not the correct strategy for all project, we always have to aware of data leakage

df$train <- T
df_test$train <- F

df_total <- rbind(df,df_test)

df_total$Salary <- ifelse(df_total$Salary==" - 50000.",0,1)

df_total$race <- ifelse(df_total$race==" White", "White", "Not White")

df_total$hispanic_Origin <- ifelse(df_total$hispanic_Origin==" All other", 0, 1)

df_total$citizenship <- ifelse(df_total$citizenship %in% 
                                 c(" Native- Born abroad of American Parent(s)",
                                   " Native- Born in Puerto Rico or U S Outlying",
                                   " Native- Born in the United States",
                                   " Foreign born- U S citizen by naturalization"),
                          "US citizenship", "No US citizenship")

levels(df_total$marital_status)[2:4] <- "Married"


levels(df_total$education)[levels(df_total$education) %in% c(" Children"," Less than 1st grade",
                                   " 1st 2nd 3rd or 4th grade",
                                   " 5th or 6th grade"," 7th and 8th grade",
                                   " 9th grade"," 10th grade"," 11th grade",
                                   " 12th grade no diploma" )] <- 0
levels(df_total$education)[levels(df_total$education) %in% c(" High school graduate", 
                                                             " Some college but no degree")] <- 1
levels(df_total$education)[levels(df_total$education) %in% 
                             c(" Associates degree-academic program",
                               " Prof school degree (MD DDS DVM LLB JD)",
                               " Associates degree-occup /vocational")] <- 2
levels(df_total$education)[levels(df_total$education)==" Bachelors degree(BA AB BS)"] <- 3
levels(df_total$education)[levels(df_total$education)==" Masters degree(MA MS MEng MEd MSW MBA)"] <- 4
levels(df_total$education)[levels(df_total$education)==" Doctorate degree(PhD EdD)"] <- 5


df_total <- df_total %>% mutate_if(is.character,factor)

vect <- c("Industry_code","occupation_code","num_persons_worked_for_employer",
          "ownbusiness_or_self_employed","Weeks_worked_in_year","Year","veteran_benefits")

df_total <- df_total %>% mutate_if(names(df_total) %in% vect, as.factor)

Splitting the database to train and test set

df_train <- df_total %>% filter(train==T) %>% select(-train)
df_test <- df_total %>% filter(train==F) %>% select(-train)

Connecting to Spark

sc <- spark_connect(master = "local")

Connecting the datasets to Spark

dfs_train <- sdf_copy_to(sc,df_train)
dfs_test <- sdf_copy_to(sc,df_test)

GRADIENT BOOSTED TREE

GBT_model <- dfs_train %>%
  ml_gbt_classifier(Salary~.)
GBT_pred <- ml_predict(GBT_model, dfs_test)

AREA UNDER ROC

GBT_pred2 <- ml_binary_classification_evaluator(GBT_pred)
GBT_pred2
## [1] 0.9239043

Confusion Matrix

cm <- table(pull(GBT_pred, label), pull(GBT_pred, prediction))
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##    
##         0     1
##   0 92964   612
##   1  4646  1540
##                                           
##                Accuracy : 0.9473          
##                  95% CI : (0.9459, 0.9487)
##     No Information Rate : 0.9784          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3485          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9524          
##             Specificity : 0.7156          
##          Pos Pred Value : 0.9935          
##          Neg Pred Value : 0.2489          
##              Prevalence : 0.9784          
##          Detection Rate : 0.9319          
##    Detection Prevalence : 0.9380          
##       Balanced Accuracy : 0.8340          
##                                           
##        'Positive' Class : 0               
## 
featureImport <- ml_tree_feature_importance(GBT_model)
featureImport[1:10,] %>% ggplot(aes(reorder(feature, importance),importance,fill=feature)) + 
  geom_bar(stat = "identity") + coord_flip() + ggtitle("Top 10 feature importance") + 
  theme(legend.position="none") + ylab("importance") + xlab("feature")