final-glm-salary.utf8

Logistic Regression Project

In this project we will be working with the UCI adult dataset. We will be attempting to predict if people in the data set belong in a certain class by salary, either making <=50k or >50k per year.

Get the Data

library(readr)
a<-read.csv('adult_sal.csv')

head(a)

##   X age    type_employer fnlwgt education education_num            marital
## 1 1  39        State-gov  77516 Bachelors            13      Never-married
## 2 2  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
## 3 3  38          Private 215646   HS-grad             9           Divorced
## 4 4  53          Private 234721      11th             7 Married-civ-spouse
## 5 5  28          Private 338409 Bachelors            13 Married-civ-spouse
## 6 6  37          Private 284582   Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital_gain capital_loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hr_per_week       country income
## 1          40 United-States  <=50K
## 2          13 United-States  <=50K
## 3          40 United-States  <=50K
## 4          40 United-States  <=50K
## 5          40          Cuba  <=50K
## 6          40 United-States  <=50K

str(a)

## 'data.frame':    32561 obs. of  16 variables:
##  $ X            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: chr  "State-gov" "Self-emp-not-inc" "Private" "Private" ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : chr  "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : chr  "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
##  $ occupation   : chr  "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race         : chr  "White" "White" "White" "Black" ...
##  $ sex          : chr  "Male" "Male" "Male" "Male" ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ country      : chr  "United-States" "United-States" "United-States" "United-States" ...
##  $ income       : chr  "<=50K" "<=50K" "<=50K" "<=50K" ...

summary(a)

##        X              age        type_employer          fnlwgt       
##  Min.   :    1   Min.   :17.00   Length:32561       Min.   :  12285  
##  1st Qu.: 8141   1st Qu.:28.00   Class :character   1st Qu.: 117827  
##  Median :16281   Median :37.00   Mode  :character   Median : 178356  
##  Mean   :16281   Mean   :38.58                      Mean   : 189778  
##  3rd Qu.:24421   3rd Qu.:48.00                      3rd Qu.: 237051  
##  Max.   :32561   Max.   :90.00                      Max.   :1484705  
##   education         education_num     marital           occupation       
##  Length:32561       Min.   : 1.00   Length:32561       Length:32561      
##  Class :character   1st Qu.: 9.00   Class :character   Class :character  
##  Mode  :character   Median :10.00   Mode  :character   Mode  :character  
##                     Mean   :10.08                                        
##                     3rd Qu.:12.00                                        
##                     Max.   :16.00                                        
##  relationship           race               sex             capital_gain  
##  Length:32561       Length:32561       Length:32561       Min.   :    0  
##  Class :character   Class :character   Class :character   1st Qu.:    0  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0  
##                                                           Mean   : 1078  
##                                                           3rd Qu.:    0  
##                                                           Max.   :99999  
##   capital_loss     hr_per_week      country             income         
##  Min.   :   0.0   Min.   : 1.00   Length:32561       Length:32561      
##  1st Qu.:   0.0   1st Qu.:40.00   Class :character   Class :character  
##  Median :   0.0   Median :40.00   Mode  :character   Mode  :character  
##  Mean   :  87.3   Mean   :40.44                                        
##  3rd Qu.:   0.0   3rd Qu.:45.00                                        
##  Max.   :4356.0   Max.   :99.00

Data Cleaning process ‘’type_employe column’’

table(a$type_employer)

## 
##                ?      Federal-gov        Local-gov     Never-worked 
##             1836              960             2093                7 
##          Private     Self-emp-inc Self-emp-not-inc        State-gov 
##            22696             1116             2541             1298 
##      Without-pay 
##               14

unemp <- function(x)
{
  x <- as.character(x)
  if(x=='Never-worked'|x=='Without-pay')
  {
    return('Unemployed')
  }else{
    return(x)
  }
}

a$type_employer<-sapply(a$type_employer,unemp)

table(a$type_employer)

## 
##                ?      Federal-gov        Local-gov          Private 
##             1836              960             2093            22696 
##     Self-emp-inc Self-emp-not-inc        State-gov       Unemployed 
##             1116             2541             1298               21

group_emp <- function(job){
    if (job=='Local-gov' | job=='State-gov'){
        return('SL-gov')
    }else if (job=='Self-emp-inc' | job=='Self-emp-not-inc'){
        return('self-emp')
    }else{
        return(job)
    }
}

a$type_employer<-sapply(a$type_employer,group_emp)

table(a$type_employer)

## 
##           ? Federal-gov     Private    self-emp      SL-gov  Unemployed 
##        1836         960       22696        3657        3391          21

‘Martial column’

table(a$marital)

## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  4443                    23                 14976 
## Married-spouse-absent         Never-married             Separated 
##                   418                 10683                  1025 
##               Widowed 
##                   993

group_marital <- function(mar){
    mar <- as.character(mar)
    
    # Not-Married
    if (mar=='Separated' | mar=='Divorced' | mar=='Widowed'){
        return('Not-Married')
    
    # Never-Married   
    }else if(mar=='Never-married'){
        return(mar)
    
     #Married
    }else{
        return('Married')
    }
}

a$marital<-sapply(a$marital,group_marital)

table(a$marital)

## 
##       Married Never-married   Not-Married 
##         15417         10683          6461

country column

table(a$country)

## 
##                          ?                   Cambodia 
##                        583                         19 
##                     Canada                      China 
##                        121                         75 
##                   Columbia                       Cuba 
##                         59                         95 
##         Dominican-Republic                    Ecuador 
##                         70                         28 
##                El-Salvador                    England 
##                        106                         90 
##                     France                    Germany 
##                         29                        137 
##                     Greece                  Guatemala 
##                         29                         64 
##                      Haiti         Holand-Netherlands 
##                         44                          1 
##                   Honduras                       Hong 
##                         13                         20 
##                    Hungary                      India 
##                         13                        100 
##                       Iran                    Ireland 
##                         43                         24 
##                      Italy                    Jamaica 
##                         73                         81 
##                      Japan                       Laos 
##                         62                         18 
##                     Mexico                  Nicaragua 
##                        643                         34 
## Outlying-US(Guam-USVI-etc)                       Peru 
##                         14                         31 
##                Philippines                     Poland 
##                        198                         60 
##                   Portugal                Puerto-Rico 
##                         37                        114 
##                   Scotland                      South 
##                         12                         80 
##                     Taiwan                   Thailand 
##                         51                         18 
##            Trinadad&Tobago              United-States 
##                         19                      29170 
##                    Vietnam                 Yugoslavia 
##                         67                         16

levels(a$country)

## NULL

asia <- c('China','Hong','India','Iran','Cambodia','Japan', 'Laos' ,
          'Philippines' ,'Vietnam' ,'Taiwan', 'Thailand')

north.America <- c('Canada','United-States','Puerto-Rico' )

europe <- c('England' ,'France', 'Germany' ,'Greece','Holand-Netherlands','Hungary',
            'Ireland','Italy','Poland','Portugal','Scotland','Yugoslavia')

latin.and.South.America <- c('Columbia','Cuba','Dominican-Republic','Ecuador',
                             'El-Salvador','Guatemala','Haiti','Honduras',
                             'Mexico','Nicaragua','Outlying-US(Guam-USVI-etc)','Peru',
                            'Jamaica','Trinadad&Tobago')
other <- c('South')

group_country <- function(cry){
    if (cry %in% asia){
        return('Asia')
    }else if (cry %in% north.America){
        return('North.America')
    }else if (cry %in% europe){
        return('Europe')
    }else if (cry %in% latin.and.South.America){
        return('Latin.and.South.America')
    }else{
        return('Other')      
    }
}

a$country <- sapply(a$country,group_country)

table(a$country)

## 
##                    Asia                  Europe Latin.and.South.America 
##                     671                     521                    1301 
##           North.America                   Other 
##                   29405                     663

str(a)

## 'data.frame':    32561 obs. of  16 variables:
##  $ X            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: chr  "SL-gov" "self-emp" "Private" "Private" ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : chr  "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : chr  "Never-married" "Married" "Not-Married" "Married" ...
##  $ occupation   : chr  "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race         : chr  "White" "White" "White" "Black" ...
##  $ sex          : chr  "Male" "Male" "Male" "Male" ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ country      : chr  "North.America" "North.America" "North.America" "North.America" ...
##  $ income       : chr  "<=50K" "<=50K" "<=50K" "<=50K" ...

a$type_employer <- sapply(a$type_employer,factor)
a$country <- sapply(a$country,factor)
a$marital <- sapply(a$marital,factor)
a$income <- sapply(a$income,factor)

str(a)

## 'data.frame':    32561 obs. of  16 variables:
##  $ X            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: Factor w/ 6 levels "SL-gov","self-emp",..: 1 2 3 3 3 3 3 2 3 3 ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : chr  "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : Factor w/ 3 levels "Never-married",..: 1 2 3 2 2 2 2 2 1 2 ...
##  $ occupation   : chr  "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race         : chr  "White" "White" "White" "Black" ...
##  $ sex          : chr  "Male" "Male" "Male" "Male" ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ country      : Factor w/ 5 levels "North.America",..: 1 1 1 1 2 1 2 1 1 1 ...
##  $ income       : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...

Missing data

library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

a[a=='?'] <- NA

table(a$type_employer)

## 
##      SL-gov    self-emp     Private Federal-gov           ?  Unemployed 
##        3391        3657       22696         960           0          21

missmap(a,y.at=c(1),y.labels = c(''),col=c('yellow','black'))

EDA

str(a)

## 'data.frame':    32561 obs. of  16 variables:
##  $ X            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: Factor w/ 6 levels "SL-gov","self-emp",..: 1 2 3 3 3 3 3 2 3 3 ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : chr  "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : Factor w/ 3 levels "Never-married",..: 1 2 3 2 2 2 2 2 1 2 ...
##  $ occupation   : chr  "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race         : chr  "White" "White" "White" "Black" ...
##  $ sex          : chr  "Male" "Male" "Male" "Male" ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ country      : Factor w/ 5 levels "North.America",..: 1 1 1 1 2 1 2 1 1 1 ...
##  $ income       : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

ggplot(a,aes(age))+geom_histogram(aes(fill=income),color='black',binwidth=1)+theme_bw()

ggplot(a,aes(hr_per_week))+geom_histogram()+theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

names(a)[names(a)=='country'] <-'region'

str(a)

## 'data.frame':    32561 obs. of  16 variables:
##  $ X            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: Factor w/ 6 levels "SL-gov","self-emp",..: 1 2 3 3 3 3 3 2 3 3 ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : chr  "Bachelors" "Bachelors" "HS-grad" "11th" ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : Factor w/ 3 levels "Never-married",..: 1 2 3 2 2 2 2 2 1 2 ...
##  $ occupation   : chr  "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
##  $ relationship : chr  "Not-in-family" "Husband" "Not-in-family" "Husband" ...
##  $ race         : chr  "White" "White" "White" "Black" ...
##  $ sex          : chr  "Male" "Male" "Male" "Male" ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ region       : Factor w/ 5 levels "North.America",..: 1 1 1 1 2 1 2 1 1 1 ...
##  $ income       : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...

ggplot(a,aes(region))+geom_bar(aes(fill=income),color='black')+theme_bw()

theme(axis.text.x = element_text(angle = 90, hjust = 1))

## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

head(a)

##   X age type_employer fnlwgt education education_num       marital
## 1 1  39        SL-gov  77516 Bachelors            13 Never-married
## 2 2  50      self-emp  83311 Bachelors            13       Married
## 3 3  38       Private 215646   HS-grad             9   Not-Married
## 4 4  53       Private 234721      11th             7       Married
## 5 5  28       Private 338409 Bachelors            13       Married
## 6 6  37       Private 284582   Masters            14       Married
##          occupation  relationship  race    sex capital_gain capital_loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hr_per_week                  region income
## 1          40           North.America  <=50K
## 2          13           North.America  <=50K
## 3          40           North.America  <=50K
## 4          40           North.America  <=50K
## 5          40 Latin.and.South.America  <=50K
## 6          40           North.America  <=50K

Train and test

library(caTools)
set.seed(101)
sample <- sample.split(a$income,SplitRatio=0.7)
train <-subset(a,sample == T)
test <-subset(a,sample == F)

Deploying model

model <- glm(income ~. ,family=binomial(logit),data = train)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(model)

## 
## Call:
## glm(formula = income ~ ., family = binomial(logit), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4277  -0.5145  -0.1894   0.0000   3.8037  
## 
## Coefficients: (1 not defined because of singularities)
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -7.533e+00  4.351e-01 -17.311  < 2e-16 ***
## X                              3.063e-06  2.270e-06   1.349 0.177248    
## age                            2.696e-02  2.029e-03  13.287  < 2e-16 ***
## type_employerself-emp         -1.110e-01  9.024e-02  -1.231 0.218492    
## type_employerPrivate           2.198e-01  7.284e-02   3.017 0.002554 ** 
## type_employerFederal-gov       7.061e-01  1.246e-01   5.669 1.44e-08 ***
## type_employerUnemployed       -1.452e+01  6.500e+02  -0.022 0.982177    
## fnlwgt                         5.958e-07  2.062e-07   2.889 0.003868 ** 
## education11th                 -1.237e-01  2.496e-01  -0.496 0.620056    
## education12th                  2.807e-01  3.211e-01   0.874 0.382009    
## education1st-4th              -8.216e-01  6.050e-01  -1.358 0.174468    
## education5th-6th              -1.079e+00  4.778e-01  -2.259 0.023883 *  
## education7th-8th              -9.080e-01  2.924e-01  -3.106 0.001899 ** 
## education9th                  -2.828e-01  3.020e-01  -0.936 0.349026    
## educationAssoc-acdm            1.094e+00  2.082e-01   5.253 1.50e-07 ***
## educationAssoc-voc             1.157e+00  2.005e-01   5.771 7.87e-09 ***
## educationBachelors             1.764e+00  1.856e-01   9.503  < 2e-16 ***
## educationDoctorate             2.902e+00  2.573e-01  11.279  < 2e-16 ***
## educationHS-grad               6.261e-01  1.800e-01   3.479 0.000503 ***
## educationMasters               2.061e+00  1.985e-01  10.384  < 2e-16 ***
## educationPreschool            -2.037e+01  2.719e+02  -0.075 0.940295    
## educationProf-school           2.549e+00  2.395e-01  10.644  < 2e-16 ***
## educationSome-college          9.107e-01  1.831e-01   4.974 6.55e-07 ***
## education_num                         NA         NA      NA       NA    
## maritalMarried                 1.413e+00  1.990e-01   7.103 1.22e-12 ***
## maritalNot-Married             4.855e-01  1.013e-01   4.795 1.63e-06 ***
## occupationArmed-Forces        -5.780e-01  1.915e+00  -0.302 0.762760    
## occupationCraft-repair         7.650e-02  9.605e-02   0.797 0.425740    
## occupationExec-managerial      7.988e-01  9.273e-02   8.614  < 2e-16 ***
## occupationFarming-fishing     -1.284e+00  1.740e-01  -7.382 1.56e-13 ***
## occupationHandlers-cleaners   -6.400e-01  1.692e-01  -3.782 0.000155 ***
## occupationMachine-op-inspct   -3.097e-01  1.226e-01  -2.527 0.011500 *  
## occupationOther-service       -9.062e-01  1.412e-01  -6.417 1.39e-10 ***
## occupationPriv-house-serv     -1.345e+01  1.972e+02  -0.068 0.945626    
## occupationProf-specialty       4.525e-01  9.833e-02   4.601 4.20e-06 ***
## occupationProtective-serv      5.354e-01  1.496e-01   3.580 0.000344 ***
## occupationSales                2.491e-01  9.942e-02   2.505 0.012237 *  
## occupationTech-support         6.851e-01  1.299e-01   5.276 1.32e-07 ***
## occupationTransport-moving    -1.732e-01  1.191e-01  -1.454 0.145924    
## relationshipNot-in-family     -7.552e-01  1.958e-01  -3.856 0.000115 ***
## relationshipOther-relative    -1.132e+00  2.677e-01  -4.229 2.35e-05 ***
## relationshipOwn-child         -1.711e+00  2.392e-01  -7.155 8.38e-13 ***
## relationshipUnmarried         -8.609e-01  2.187e-01  -3.936 8.29e-05 ***
## relationshipWife               1.423e+00  1.253e-01  11.357  < 2e-16 ***
## raceAsian-Pac-Islander         6.837e-01  3.356e-01   2.037 0.041657 *  
## raceBlack                      5.544e-01  2.971e-01   1.866 0.062064 .  
## raceOther                     -1.763e-01  4.457e-01  -0.395 0.692482    
## raceWhite                      6.905e-01  2.847e-01   2.426 0.015282 *  
## sexMale                        9.287e-01  9.579e-02   9.695  < 2e-16 ***
## capital_gain                   3.326e-04  1.289e-05  25.807  < 2e-16 ***
## capital_loss                   6.587e-04  4.586e-05  14.365  < 2e-16 ***
## hr_per_week                    3.157e-02  2.006e-03  15.742  < 2e-16 ***
## regionLatin.and.South.America -5.214e-01  1.595e-01  -3.269 0.001079 ** 
## regionAsia                    -2.803e-01  2.139e-01  -1.310 0.190084    
## regionOther                   -4.765e-01  1.635e-01  -2.913 0.003574 ** 
## regionEurope                   6.083e-02  1.524e-01   0.399 0.689829    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24139  on 21500  degrees of freedom
## Residual deviance: 13865  on 21446  degrees of freedom
##   (1292 observations deleted due to missingness)
## AIC: 13975
## 
## Number of Fisher Scoring iterations: 15

new.step.model <- step(model)

## Start:  AIC=13975.19
## income ~ X + age + type_employer + fnlwgt + education + education_num + 
##     marital + occupation + relationship + race + sex + capital_gain + 
##     capital_loss + hr_per_week + region

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
## Step:  AIC=13975.19
## income ~ X + age + type_employer + fnlwgt + education + marital + 
##     occupation + relationship + race + sex + capital_gain + capital_loss + 
##     hr_per_week + region

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##                 Df Deviance   AIC
## - X              1    13867 13975
## <none>                13865 13975
## - fnlwgt         1    13874 13982
## - race           4    13881 13983
## - region         4    13885 13987
## - marital        2    13921 14027
## - type_employer  4    13927 14029
## - sex            1    13963 14071
## - age            1    14044 14152
## - capital_loss   1    14079 14187
## - relationship   5    14114 14214
## - hr_per_week    1    14121 14229
## - occupation    13    14330 14414
## - education     15    14596 14676
## - capital_gain   1    15166 15274

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## 
## Step:  AIC=13975.01
## income ~ age + type_employer + fnlwgt + education + marital + 
##     occupation + relationship + race + sex + capital_gain + capital_loss + 
##     hr_per_week + region

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##                 Df Deviance   AIC
## <none>                13867 13975
## - fnlwgt         1    13875 13981
## - race           4    13883 13983
## - region         4    13887 13987
## - marital        2    13923 14027
## - type_employer  4    13929 14029
## - sex            1    13964 14070
## - age            1    14045 14151
## - capital_loss   1    14081 14187
## - relationship   5    14115 14213
## - hr_per_week    1    14123 14229
## - occupation    13    14332 14414
## - education     15    14598 14676
## - capital_gain   1    15168 15274

summary(new.step.model)

## 
## Call:
## glm(formula = income ~ age + type_employer + fnlwgt + education + 
##     marital + occupation + relationship + race + sex + capital_gain + 
##     capital_loss + hr_per_week + region, family = binomial(logit), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4361  -0.5142  -0.1895   0.0000   3.8162  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -7.482e+00  4.335e-01 -17.258  < 2e-16 ***
## age                            2.696e-02  2.029e-03  13.284  < 2e-16 ***
## type_employerself-emp         -1.129e-01  9.021e-02  -1.251 0.210831    
## type_employerPrivate           2.194e-01  7.282e-02   3.013 0.002588 ** 
## type_employerFederal-gov       7.056e-01  1.246e-01   5.665 1.47e-08 ***
## type_employerUnemployed       -1.449e+01  6.515e+02  -0.022 0.982253    
## fnlwgt                         5.938e-07  2.062e-07   2.880 0.003982 ** 
## education11th                 -1.227e-01  2.497e-01  -0.492 0.623038    
## education12th                  2.850e-01  3.210e-01   0.888 0.374646    
## education1st-4th              -8.209e-01  6.049e-01  -1.357 0.174767    
## education5th-6th              -1.079e+00  4.781e-01  -2.256 0.024066 *  
## education7th-8th              -9.036e-01  2.923e-01  -3.091 0.001993 ** 
## education9th                  -2.847e-01  3.021e-01  -0.942 0.346095    
## educationAssoc-acdm            1.096e+00  2.082e-01   5.262 1.42e-07 ***
## educationAssoc-voc             1.160e+00  2.005e-01   5.785 7.26e-09 ***
## educationBachelors             1.765e+00  1.856e-01   9.509  < 2e-16 ***
## educationDoctorate             2.903e+00  2.572e-01  11.286  < 2e-16 ***
## educationHS-grad               6.274e-01  1.800e-01   3.486 0.000491 ***
## educationMasters               2.064e+00  1.985e-01  10.394  < 2e-16 ***
## educationPreschool            -2.039e+01  2.712e+02  -0.075 0.940079    
## educationProf-school           2.552e+00  2.395e-01  10.657  < 2e-16 ***
## educationSome-college          9.129e-01  1.831e-01   4.985 6.19e-07 ***
## maritalMarried                 1.414e+00  1.990e-01   7.106 1.20e-12 ***
## maritalNot-Married             4.867e-01  1.013e-01   4.807 1.53e-06 ***
## occupationArmed-Forces        -5.706e-01  1.898e+00  -0.301 0.763750    
## occupationCraft-repair         7.865e-02  9.602e-02   0.819 0.412730    
## occupationExec-managerial      8.006e-01  9.271e-02   8.635  < 2e-16 ***
## occupationFarming-fishing     -1.282e+00  1.739e-01  -7.373 1.67e-13 ***
## occupationHandlers-cleaners   -6.382e-01  1.692e-01  -3.771 0.000162 ***
## occupationMachine-op-inspct   -3.097e-01  1.226e-01  -2.526 0.011526 *  
## occupationOther-service       -9.054e-01  1.412e-01  -6.413 1.43e-10 ***
## occupationPriv-house-serv     -1.344e+01  1.973e+02  -0.068 0.945699    
## occupationProf-specialty       4.537e-01  9.831e-02   4.615 3.93e-06 ***
## occupationProtective-serv      5.358e-01  1.495e-01   3.583 0.000340 ***
## occupationSales                2.494e-01  9.940e-02   2.509 0.012108 *  
## occupationTech-support         6.881e-01  1.299e-01   5.299 1.16e-07 ***
## occupationTransport-moving    -1.731e-01  1.191e-01  -1.453 0.146206    
## relationshipNot-in-family     -7.562e-01  1.958e-01  -3.862 0.000113 ***
## relationshipOther-relative    -1.133e+00  2.679e-01  -4.230 2.33e-05 ***
## relationshipOwn-child         -1.713e+00  2.392e-01  -7.159 8.11e-13 ***
## relationshipUnmarried         -8.615e-01  2.187e-01  -3.939 8.17e-05 ***
## relationshipWife               1.418e+00  1.252e-01  11.326  < 2e-16 ***
## raceAsian-Pac-Islander         6.837e-01  3.356e-01   2.037 0.041637 *  
## raceBlack                      5.533e-01  2.972e-01   1.862 0.062594 .  
## raceOther                     -1.757e-01  4.456e-01  -0.394 0.693325    
## raceWhite                      6.895e-01  2.847e-01   2.422 0.015454 *  
## sexMale                        9.261e-01  9.576e-02   9.670  < 2e-16 ***
## capital_gain                   3.325e-04  1.288e-05  25.808  < 2e-16 ***
## capital_loss                   6.584e-04  4.584e-05  14.364  < 2e-16 ***
## hr_per_week                    3.159e-02  2.006e-03  15.749  < 2e-16 ***
## regionLatin.and.South.America -5.190e-01  1.594e-01  -3.256 0.001128 ** 
## regionAsia                    -2.813e-01  2.138e-01  -1.316 0.188178    
## regionOther                   -4.787e-01  1.635e-01  -2.929 0.003403 ** 
## regionEurope                   5.966e-02  1.524e-01   0.392 0.695399    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24139  on 21500  degrees of freedom
## Residual deviance: 13867  on 21447  degrees of freedom
##   (1292 observations deleted due to missingness)
## AIC: 13975
## 
## Number of Fisher Scoring iterations: 15

test$predicted.income = predict(model, newdata=test, type="response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

table(test$income, test$predicted.income > 0.5)

##        
##         FALSE TRUE
##   <=50K  6405  518
##   >50K    920 1374

accuracy of our model

6405+1374/(6405+518+920+1374)

## [1] 6405.149

recall

6405/(6405+518)

## [1] 0.9251769

6405/(6405+920)

## [1] 0.8744027