Logistic Regression Project
In this project we will be working with the UCI adult dataset. We will be attempting to predict if people in the data set belong in a certain class by salary, either making <=50k or >50k per year.
Get the Data
library(readr)
a<-read.csv('adult_sal.csv')
head(a)
## X age type_employer fnlwgt education education_num marital
## 1 1 39 State-gov 77516 Bachelors 13 Never-married
## 2 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 3 38 Private 215646 HS-grad 9 Divorced
## 4 4 53 Private 234721 11th 7 Married-civ-spouse
## 5 5 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 6 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital_gain capital_loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hr_per_week country income
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
str(a)
## 'data.frame': 32561 obs. of 16 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ type_employer: chr "State-gov" "Self-emp-not-inc" "Private" "Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education_num: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital : chr "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
## $ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr "White" "White" "White" "Black" ...
## $ sex : chr "Male" "Male" "Male" "Male" ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ country : chr "United-States" "United-States" "United-States" "United-States" ...
## $ income : chr "<=50K" "<=50K" "<=50K" "<=50K" ...
summary(a)
## X age type_employer fnlwgt
## Min. : 1 Min. :17.00 Length:32561 Min. : 12285
## 1st Qu.: 8141 1st Qu.:28.00 Class :character 1st Qu.: 117827
## Median :16281 Median :37.00 Mode :character Median : 178356
## Mean :16281 Mean :38.58 Mean : 189778
## 3rd Qu.:24421 3rd Qu.:48.00 3rd Qu.: 237051
## Max. :32561 Max. :90.00 Max. :1484705
## education education_num marital occupation
## Length:32561 Min. : 1.00 Length:32561 Length:32561
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :10.00 Mode :character Mode :character
## Mean :10.08
## 3rd Qu.:12.00
## Max. :16.00
## relationship race sex capital_gain
## Length:32561 Length:32561 Length:32561 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Mode :character Median : 0
## Mean : 1078
## 3rd Qu.: 0
## Max. :99999
## capital_loss hr_per_week country income
## Min. : 0.0 Min. : 1.00 Length:32561 Length:32561
## 1st Qu.: 0.0 1st Qu.:40.00 Class :character Class :character
## Median : 0.0 Median :40.00 Mode :character Mode :character
## Mean : 87.3 Mean :40.44
## 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :4356.0 Max. :99.00
Data Cleaning process ‘’type_employe column’’
table(a$type_employer)
##
## ? Federal-gov Local-gov Never-worked
## 1836 960 2093 7
## Private Self-emp-inc Self-emp-not-inc State-gov
## 22696 1116 2541 1298
## Without-pay
## 14
unemp <- function(x)
{
x <- as.character(x)
if(x=='Never-worked'|x=='Without-pay')
{
return('Unemployed')
}else{
return(x)
}
}
a$type_employer<-sapply(a$type_employer,unemp)
table(a$type_employer)
##
## ? Federal-gov Local-gov Private
## 1836 960 2093 22696
## Self-emp-inc Self-emp-not-inc State-gov Unemployed
## 1116 2541 1298 21
group_emp <- function(job){
if (job=='Local-gov' | job=='State-gov'){
return('SL-gov')
}else if (job=='Self-emp-inc' | job=='Self-emp-not-inc'){
return('self-emp')
}else{
return(job)
}
}
a$type_employer<-sapply(a$type_employer,group_emp)
table(a$type_employer)
##
## ? Federal-gov Private self-emp SL-gov Unemployed
## 1836 960 22696 3657 3391 21
‘Martial column’
table(a$marital)
##
## Divorced Married-AF-spouse Married-civ-spouse
## 4443 23 14976
## Married-spouse-absent Never-married Separated
## 418 10683 1025
## Widowed
## 993
group_marital <- function(mar){
mar <- as.character(mar)
# Not-Married
if (mar=='Separated' | mar=='Divorced' | mar=='Widowed'){
return('Not-Married')
# Never-Married
}else if(mar=='Never-married'){
return(mar)
#Married
}else{
return('Married')
}
}
a$marital<-sapply(a$marital,group_marital)
table(a$marital)
##
## Married Never-married Not-Married
## 15417 10683 6461
country column
table(a$country)
##
## ? Cambodia
## 583 19
## Canada China
## 121 75
## Columbia Cuba
## 59 95
## Dominican-Republic Ecuador
## 70 28
## El-Salvador England
## 106 90
## France Germany
## 29 137
## Greece Guatemala
## 29 64
## Haiti Holand-Netherlands
## 44 1
## Honduras Hong
## 13 20
## Hungary India
## 13 100
## Iran Ireland
## 43 24
## Italy Jamaica
## 73 81
## Japan Laos
## 62 18
## Mexico Nicaragua
## 643 34
## Outlying-US(Guam-USVI-etc) Peru
## 14 31
## Philippines Poland
## 198 60
## Portugal Puerto-Rico
## 37 114
## Scotland South
## 12 80
## Taiwan Thailand
## 51 18
## Trinadad&Tobago United-States
## 19 29170
## Vietnam Yugoslavia
## 67 16
levels(a$country)
## NULL
asia <- c('China','Hong','India','Iran','Cambodia','Japan', 'Laos' ,
'Philippines' ,'Vietnam' ,'Taiwan', 'Thailand')
north.America <- c('Canada','United-States','Puerto-Rico' )
europe <- c('England' ,'France', 'Germany' ,'Greece','Holand-Netherlands','Hungary',
'Ireland','Italy','Poland','Portugal','Scotland','Yugoslavia')
latin.and.South.America <- c('Columbia','Cuba','Dominican-Republic','Ecuador',
'El-Salvador','Guatemala','Haiti','Honduras',
'Mexico','Nicaragua','Outlying-US(Guam-USVI-etc)','Peru',
'Jamaica','Trinadad&Tobago')
other <- c('South')
group_country <- function(cry){
if (cry %in% asia){
return('Asia')
}else if (cry %in% north.America){
return('North.America')
}else if (cry %in% europe){
return('Europe')
}else if (cry %in% latin.and.South.America){
return('Latin.and.South.America')
}else{
return('Other')
}
}
a$country <- sapply(a$country,group_country)
table(a$country)
##
## Asia Europe Latin.and.South.America
## 671 521 1301
## North.America Other
## 29405 663
str(a)
## 'data.frame': 32561 obs. of 16 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ type_employer: chr "SL-gov" "self-emp" "Private" "Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education_num: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital : chr "Never-married" "Married" "Not-Married" "Married" ...
## $ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr "White" "White" "White" "Black" ...
## $ sex : chr "Male" "Male" "Male" "Male" ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ country : chr "North.America" "North.America" "North.America" "North.America" ...
## $ income : chr "<=50K" "<=50K" "<=50K" "<=50K" ...
a$type_employer <- sapply(a$type_employer,factor)
a$country <- sapply(a$country,factor)
a$marital <- sapply(a$marital,factor)
a$income <- sapply(a$income,factor)
str(a)
## 'data.frame': 32561 obs. of 16 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ type_employer: Factor w/ 6 levels "SL-gov","self-emp",..: 1 2 3 3 3 3 3 2 3 3 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education_num: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital : Factor w/ 3 levels "Never-married",..: 1 2 3 2 2 2 2 2 1 2 ...
## $ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr "White" "White" "White" "Black" ...
## $ sex : chr "Male" "Male" "Male" "Male" ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ country : Factor w/ 5 levels "North.America",..: 1 1 1 1 2 1 2 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
Missing data
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
a[a=='?'] <- NA
table(a$type_employer)
##
## SL-gov self-emp Private Federal-gov ? Unemployed
## 3391 3657 22696 960 0 21
missmap(a,y.at=c(1),y.labels = c(''),col=c('yellow','black'))
EDA
str(a)
## 'data.frame': 32561 obs. of 16 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ type_employer: Factor w/ 6 levels "SL-gov","self-emp",..: 1 2 3 3 3 3 3 2 3 3 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education_num: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital : Factor w/ 3 levels "Never-married",..: 1 2 3 2 2 2 2 2 1 2 ...
## $ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr "White" "White" "White" "Black" ...
## $ sex : chr "Male" "Male" "Male" "Male" ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ country : Factor w/ 5 levels "North.America",..: 1 1 1 1 2 1 2 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ggplot(a,aes(age))+geom_histogram(aes(fill=income),color='black',binwidth=1)+theme_bw()
ggplot(a,aes(hr_per_week))+geom_histogram()+theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
names(a)[names(a)=='country'] <-'region'
str(a)
## 'data.frame': 32561 obs. of 16 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ type_employer: Factor w/ 6 levels "SL-gov","self-emp",..: 1 2 3 3 3 3 3 2 3 3 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education_num: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital : Factor w/ 3 levels "Never-married",..: 1 2 3 2 2 2 2 2 1 2 ...
## $ occupation : chr "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr "White" "White" "White" "Black" ...
## $ sex : chr "Male" "Male" "Male" "Male" ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ region : Factor w/ 5 levels "North.America",..: 1 1 1 1 2 1 2 1 1 1 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
ggplot(a,aes(region))+geom_bar(aes(fill=income),color='black')+theme_bw()
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## List of 1
## $ axis.text.x:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
head(a)
## X age type_employer fnlwgt education education_num marital
## 1 1 39 SL-gov 77516 Bachelors 13 Never-married
## 2 2 50 self-emp 83311 Bachelors 13 Married
## 3 3 38 Private 215646 HS-grad 9 Not-Married
## 4 4 53 Private 234721 11th 7 Married
## 5 5 28 Private 338409 Bachelors 13 Married
## 6 6 37 Private 284582 Masters 14 Married
## occupation relationship race sex capital_gain capital_loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hr_per_week region income
## 1 40 North.America <=50K
## 2 13 North.America <=50K
## 3 40 North.America <=50K
## 4 40 North.America <=50K
## 5 40 Latin.and.South.America <=50K
## 6 40 North.America <=50K
Train and test
library(caTools)
set.seed(101)
sample <- sample.split(a$income,SplitRatio=0.7)
train <-subset(a,sample == T)
test <-subset(a,sample == F)
Deploying model
model <- glm(income ~. ,family=binomial(logit),data = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = income ~ ., family = binomial(logit), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4277 -0.5145 -0.1894 0.0000 3.8037
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.533e+00 4.351e-01 -17.311 < 2e-16 ***
## X 3.063e-06 2.270e-06 1.349 0.177248
## age 2.696e-02 2.029e-03 13.287 < 2e-16 ***
## type_employerself-emp -1.110e-01 9.024e-02 -1.231 0.218492
## type_employerPrivate 2.198e-01 7.284e-02 3.017 0.002554 **
## type_employerFederal-gov 7.061e-01 1.246e-01 5.669 1.44e-08 ***
## type_employerUnemployed -1.452e+01 6.500e+02 -0.022 0.982177
## fnlwgt 5.958e-07 2.062e-07 2.889 0.003868 **
## education11th -1.237e-01 2.496e-01 -0.496 0.620056
## education12th 2.807e-01 3.211e-01 0.874 0.382009
## education1st-4th -8.216e-01 6.050e-01 -1.358 0.174468
## education5th-6th -1.079e+00 4.778e-01 -2.259 0.023883 *
## education7th-8th -9.080e-01 2.924e-01 -3.106 0.001899 **
## education9th -2.828e-01 3.020e-01 -0.936 0.349026
## educationAssoc-acdm 1.094e+00 2.082e-01 5.253 1.50e-07 ***
## educationAssoc-voc 1.157e+00 2.005e-01 5.771 7.87e-09 ***
## educationBachelors 1.764e+00 1.856e-01 9.503 < 2e-16 ***
## educationDoctorate 2.902e+00 2.573e-01 11.279 < 2e-16 ***
## educationHS-grad 6.261e-01 1.800e-01 3.479 0.000503 ***
## educationMasters 2.061e+00 1.985e-01 10.384 < 2e-16 ***
## educationPreschool -2.037e+01 2.719e+02 -0.075 0.940295
## educationProf-school 2.549e+00 2.395e-01 10.644 < 2e-16 ***
## educationSome-college 9.107e-01 1.831e-01 4.974 6.55e-07 ***
## education_num NA NA NA NA
## maritalMarried 1.413e+00 1.990e-01 7.103 1.22e-12 ***
## maritalNot-Married 4.855e-01 1.013e-01 4.795 1.63e-06 ***
## occupationArmed-Forces -5.780e-01 1.915e+00 -0.302 0.762760
## occupationCraft-repair 7.650e-02 9.605e-02 0.797 0.425740
## occupationExec-managerial 7.988e-01 9.273e-02 8.614 < 2e-16 ***
## occupationFarming-fishing -1.284e+00 1.740e-01 -7.382 1.56e-13 ***
## occupationHandlers-cleaners -6.400e-01 1.692e-01 -3.782 0.000155 ***
## occupationMachine-op-inspct -3.097e-01 1.226e-01 -2.527 0.011500 *
## occupationOther-service -9.062e-01 1.412e-01 -6.417 1.39e-10 ***
## occupationPriv-house-serv -1.345e+01 1.972e+02 -0.068 0.945626
## occupationProf-specialty 4.525e-01 9.833e-02 4.601 4.20e-06 ***
## occupationProtective-serv 5.354e-01 1.496e-01 3.580 0.000344 ***
## occupationSales 2.491e-01 9.942e-02 2.505 0.012237 *
## occupationTech-support 6.851e-01 1.299e-01 5.276 1.32e-07 ***
## occupationTransport-moving -1.732e-01 1.191e-01 -1.454 0.145924
## relationshipNot-in-family -7.552e-01 1.958e-01 -3.856 0.000115 ***
## relationshipOther-relative -1.132e+00 2.677e-01 -4.229 2.35e-05 ***
## relationshipOwn-child -1.711e+00 2.392e-01 -7.155 8.38e-13 ***
## relationshipUnmarried -8.609e-01 2.187e-01 -3.936 8.29e-05 ***
## relationshipWife 1.423e+00 1.253e-01 11.357 < 2e-16 ***
## raceAsian-Pac-Islander 6.837e-01 3.356e-01 2.037 0.041657 *
## raceBlack 5.544e-01 2.971e-01 1.866 0.062064 .
## raceOther -1.763e-01 4.457e-01 -0.395 0.692482
## raceWhite 6.905e-01 2.847e-01 2.426 0.015282 *
## sexMale 9.287e-01 9.579e-02 9.695 < 2e-16 ***
## capital_gain 3.326e-04 1.289e-05 25.807 < 2e-16 ***
## capital_loss 6.587e-04 4.586e-05 14.365 < 2e-16 ***
## hr_per_week 3.157e-02 2.006e-03 15.742 < 2e-16 ***
## regionLatin.and.South.America -5.214e-01 1.595e-01 -3.269 0.001079 **
## regionAsia -2.803e-01 2.139e-01 -1.310 0.190084
## regionOther -4.765e-01 1.635e-01 -2.913 0.003574 **
## regionEurope 6.083e-02 1.524e-01 0.399 0.689829
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24139 on 21500 degrees of freedom
## Residual deviance: 13865 on 21446 degrees of freedom
## (1292 observations deleted due to missingness)
## AIC: 13975
##
## Number of Fisher Scoring iterations: 15
new.step.model <- step(model)
## Start: AIC=13975.19
## income ~ X + age + type_employer + fnlwgt + education + education_num +
## marital + occupation + relationship + race + sex + capital_gain +
## capital_loss + hr_per_week + region
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=13975.19
## income ~ X + age + type_employer + fnlwgt + education + marital +
## occupation + relationship + race + sex + capital_gain + capital_loss +
## hr_per_week + region
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## - X 1 13867 13975
## <none> 13865 13975
## - fnlwgt 1 13874 13982
## - race 4 13881 13983
## - region 4 13885 13987
## - marital 2 13921 14027
## - type_employer 4 13927 14029
## - sex 1 13963 14071
## - age 1 14044 14152
## - capital_loss 1 14079 14187
## - relationship 5 14114 14214
## - hr_per_week 1 14121 14229
## - occupation 13 14330 14414
## - education 15 14596 14676
## - capital_gain 1 15166 15274
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=13975.01
## income ~ age + type_employer + fnlwgt + education + marital +
## occupation + relationship + race + sex + capital_gain + capital_loss +
## hr_per_week + region
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 13867 13975
## - fnlwgt 1 13875 13981
## - race 4 13883 13983
## - region 4 13887 13987
## - marital 2 13923 14027
## - type_employer 4 13929 14029
## - sex 1 13964 14070
## - age 1 14045 14151
## - capital_loss 1 14081 14187
## - relationship 5 14115 14213
## - hr_per_week 1 14123 14229
## - occupation 13 14332 14414
## - education 15 14598 14676
## - capital_gain 1 15168 15274
summary(new.step.model)
##
## Call:
## glm(formula = income ~ age + type_employer + fnlwgt + education +
## marital + occupation + relationship + race + sex + capital_gain +
## capital_loss + hr_per_week + region, family = binomial(logit),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4361 -0.5142 -0.1895 0.0000 3.8162
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.482e+00 4.335e-01 -17.258 < 2e-16 ***
## age 2.696e-02 2.029e-03 13.284 < 2e-16 ***
## type_employerself-emp -1.129e-01 9.021e-02 -1.251 0.210831
## type_employerPrivate 2.194e-01 7.282e-02 3.013 0.002588 **
## type_employerFederal-gov 7.056e-01 1.246e-01 5.665 1.47e-08 ***
## type_employerUnemployed -1.449e+01 6.515e+02 -0.022 0.982253
## fnlwgt 5.938e-07 2.062e-07 2.880 0.003982 **
## education11th -1.227e-01 2.497e-01 -0.492 0.623038
## education12th 2.850e-01 3.210e-01 0.888 0.374646
## education1st-4th -8.209e-01 6.049e-01 -1.357 0.174767
## education5th-6th -1.079e+00 4.781e-01 -2.256 0.024066 *
## education7th-8th -9.036e-01 2.923e-01 -3.091 0.001993 **
## education9th -2.847e-01 3.021e-01 -0.942 0.346095
## educationAssoc-acdm 1.096e+00 2.082e-01 5.262 1.42e-07 ***
## educationAssoc-voc 1.160e+00 2.005e-01 5.785 7.26e-09 ***
## educationBachelors 1.765e+00 1.856e-01 9.509 < 2e-16 ***
## educationDoctorate 2.903e+00 2.572e-01 11.286 < 2e-16 ***
## educationHS-grad 6.274e-01 1.800e-01 3.486 0.000491 ***
## educationMasters 2.064e+00 1.985e-01 10.394 < 2e-16 ***
## educationPreschool -2.039e+01 2.712e+02 -0.075 0.940079
## educationProf-school 2.552e+00 2.395e-01 10.657 < 2e-16 ***
## educationSome-college 9.129e-01 1.831e-01 4.985 6.19e-07 ***
## maritalMarried 1.414e+00 1.990e-01 7.106 1.20e-12 ***
## maritalNot-Married 4.867e-01 1.013e-01 4.807 1.53e-06 ***
## occupationArmed-Forces -5.706e-01 1.898e+00 -0.301 0.763750
## occupationCraft-repair 7.865e-02 9.602e-02 0.819 0.412730
## occupationExec-managerial 8.006e-01 9.271e-02 8.635 < 2e-16 ***
## occupationFarming-fishing -1.282e+00 1.739e-01 -7.373 1.67e-13 ***
## occupationHandlers-cleaners -6.382e-01 1.692e-01 -3.771 0.000162 ***
## occupationMachine-op-inspct -3.097e-01 1.226e-01 -2.526 0.011526 *
## occupationOther-service -9.054e-01 1.412e-01 -6.413 1.43e-10 ***
## occupationPriv-house-serv -1.344e+01 1.973e+02 -0.068 0.945699
## occupationProf-specialty 4.537e-01 9.831e-02 4.615 3.93e-06 ***
## occupationProtective-serv 5.358e-01 1.495e-01 3.583 0.000340 ***
## occupationSales 2.494e-01 9.940e-02 2.509 0.012108 *
## occupationTech-support 6.881e-01 1.299e-01 5.299 1.16e-07 ***
## occupationTransport-moving -1.731e-01 1.191e-01 -1.453 0.146206
## relationshipNot-in-family -7.562e-01 1.958e-01 -3.862 0.000113 ***
## relationshipOther-relative -1.133e+00 2.679e-01 -4.230 2.33e-05 ***
## relationshipOwn-child -1.713e+00 2.392e-01 -7.159 8.11e-13 ***
## relationshipUnmarried -8.615e-01 2.187e-01 -3.939 8.17e-05 ***
## relationshipWife 1.418e+00 1.252e-01 11.326 < 2e-16 ***
## raceAsian-Pac-Islander 6.837e-01 3.356e-01 2.037 0.041637 *
## raceBlack 5.533e-01 2.972e-01 1.862 0.062594 .
## raceOther -1.757e-01 4.456e-01 -0.394 0.693325
## raceWhite 6.895e-01 2.847e-01 2.422 0.015454 *
## sexMale 9.261e-01 9.576e-02 9.670 < 2e-16 ***
## capital_gain 3.325e-04 1.288e-05 25.808 < 2e-16 ***
## capital_loss 6.584e-04 4.584e-05 14.364 < 2e-16 ***
## hr_per_week 3.159e-02 2.006e-03 15.749 < 2e-16 ***
## regionLatin.and.South.America -5.190e-01 1.594e-01 -3.256 0.001128 **
## regionAsia -2.813e-01 2.138e-01 -1.316 0.188178
## regionOther -4.787e-01 1.635e-01 -2.929 0.003403 **
## regionEurope 5.966e-02 1.524e-01 0.392 0.695399
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24139 on 21500 degrees of freedom
## Residual deviance: 13867 on 21447 degrees of freedom
## (1292 observations deleted due to missingness)
## AIC: 13975
##
## Number of Fisher Scoring iterations: 15
test$predicted.income = predict(model, newdata=test, type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
table(test$income, test$predicted.income > 0.5)
##
## FALSE TRUE
## <=50K 6405 518
## >50K 920 1374
accuracy of our model
6405+1374/(6405+518+920+1374)
## [1] 6405.149
recall
6405/(6405+518)
## [1] 0.9251769
6405/(6405+920)
## [1] 0.8744027