In this project, we applied logistic regression to the UCI adult dataset to predict if an individual’s salary is above or below 50k per year.
library(dplyr)
library(tidyr)
library(qdap)
library(ggplot2)
adult <- read.csv("adult_sal.csv",na.strings = c("?",""))
# drop duplicated index column
adult <- adult %>%
select(-X)
Will work to reduce double digit factor variables education, occupation, country, type employer
str(adult)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ type_employer: Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ education_num: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital : Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ country : Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
### cleaning up factors
table(adult$type_employer)
##
## Federal-gov Local-gov Never-worked Private
## 960 2093 7 22696
## Self-emp-inc Self-emp-not-inc State-gov Without-pay
## 1116 2541 1298 14
We can combine “never-worked” and “without-pay” into unemployed factor.
adult.tidy <- adult # create tidy table for transformations
adult.tidy$type_employer <- gsub("Without-pay","Unemployed",adult.tidy$type_employer)
adult.tidy$type_employer <- gsub("Never-worked","Unemployed",adult.tidy$type_employer)
Combine “local-gove, and state-gov into SL-gov factor.
adult.tidy$type_employer <- gsub("Local-gov","SL-gov",adult.tidy$type_employer)
adult.tidy$type_employer <- gsub("State-gov","SL-gov",adult.tidy$type_employer)
Combine self-employed jobs into Self-emp factor.
adult.tidy$type_employer <- gsub("Self-emp-inc","Self-emp",adult.tidy$type_employer)
adult.tidy$type_employer <- gsub("Self-emp-not-inc","Self-emp",adult.tidy$type_employer)
View new type-employed factors.
adult.tidy$type_employer <- as.factor(adult.tidy$type_employer)
table(adult.tidy$type_employer)
##
## Federal-gov Private Self-emp SL-gov Unemployed
## 960 22696 3657 3391 21
table(adult.tidy$marital)
##
## Divorced Married-AF-spouse Married-civ-spouse
## 4443 23 14976
## Married-spouse-absent Never-married Separated
## 418 10683 1025
## Widowed
## 993
Combine marital factors into Not-married (no longer married) and Married factors. We’ll leave the Never-Married factor as is.
adult.tidy$marital <- gsub("Divorced","Not-Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Separated","Not-Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Widowed","Not-Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Married-AF-spouse","Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Married-civ-spouse","Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Married-spouse-absent","Married",adult.tidy$marital)
View new marital factors.
adult.tidy$marital <- as.factor(adult.tidy$marital)
table(adult.tidy$marital)
##
## Married Never-married Not-Married
## 15417 10683 6461
table(adult.tidy$country)
##
## Cambodia Canada
## 19 121
## China Columbia
## 75 59
## Cuba Dominican-Republic
## 95 70
## Ecuador El-Salvador
## 28 106
## England France
## 90 29
## Germany Greece
## 137 29
## Guatemala Haiti
## 64 44
## Holand-Netherlands Honduras
## 1 13
## Hong Hungary
## 20 13
## India Iran
## 100 43
## Ireland Italy
## 24 73
## Jamaica Japan
## 81 62
## Laos Mexico
## 18 643
## Nicaragua Outlying-US(Guam-USVI-etc)
## 34 14
## Peru Philippines
## 31 198
## Poland Portugal
## 60 37
## Puerto-Rico Scotland
## 114 12
## South Taiwan
## 80 51
## Thailand Trinadad&Tobago
## 18 19
## United-States Vietnam
## 29170 67
## Yugoslavia
## 16
We will combine countries into continent factors. Create vectors that group factors based on continent.
na <- c("Canada","United-States")
sa <- c("Columbia","Ecuador","El-Salvador","Guatemala","Honduras",
"Mexico","Nicaragua","Peru")
cb <- c("Dominican-Republic","Cuba","Haiti","Jamaica","Puerto-Rico","Trinadad&Tobago")
asia <- c("Cambodia","China","Hong","Outlying-US(Guam-USVI-etc)","Vietnam",
"Thailand","Taiwan","Philippines","Laos","Japan","South","Iran")
eu <- c("England","France","Germany","Greece","Holand-Netherlands","Hungary",
"India","Ireland","Italy","Poland","Portugal","Scotland","Yugoslavia")
Use mgsub to group countries into factors based on continents.
adult.tidy$country <- mgsub(na,"North America",adult.tidy$country)
adult.tidy$country <- mgsub(sa,"South America",adult.tidy$country)
adult.tidy$country <- mgsub(cb,"Carribean Islands",adult.tidy$country)
adult.tidy$country <- mgsub(asia,"Asia",adult.tidy$country)
adult.tidy$country <- mgsub(eu,"Europe",adult.tidy$country)
View new country factors.
adult.tidy$country <- as.factor(adult.tidy$country)
table(adult.tidy$country)
##
## Asia Asia America Carribean Islands Europe
## 665 978 423 621
## North America
## 29291
View occupation factors.
table(adult.tidy$occupation)
##
## Adm-clerical Armed-Forces Craft-repair Exec-managerial
## 3770 9 4099 4066
## Farming-fishing Handlers-cleaners Machine-op-inspct Other-service
## 994 1370 2002 3295
## Priv-house-serv Prof-specialty Protective-serv Sales
## 149 4140 649 3650
## Tech-support Transport-moving
## 928 1597
We will factor occupation variables into blue and white collar factors.
bc <- c("Armed-Forces","Craft-repair","Farming-fishing","Handlers-cleaners",
"Machine-op-inspct","Other-service","Priv-house-serv","Protective-serv",
"Transport-moving")
wc <- c("Adm-clerical","Exec-managerial","Prof-specialty","Sales","Tech-support")
adult.tidy$occupation <- mgsub(bc ,"Blue Collar",adult.tidy$occupation)
adult.tidy$occupation <- mgsub(wc ,"White Collar",adult.tidy$occupation)
View new occupation factors
adult.tidy$occupation <- as.factor(adult.tidy$occupation)
table(adult.tidy$occupation)
##
## Blue Collar White Collar
## 14164 16554
View education factors
table(adult.tidy$education)
##
## 10th 11th 12th 1st-4th 5th-6th
## 933 1175 433 168 333
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 646 514 1067 1382 5355
## Doctorate HS-grad Masters Preschool Prof-school
## 413 10501 1723 51 576
## Some-college
## 7291
Reduce education factors.
# clean eduction factors
table(adult.tidy$education)
##
## 10th 11th 12th 1st-4th 5th-6th
## 933 1175 433 168 333
## 7th-8th 9th Assoc-acdm Assoc-voc Bachelors
## 646 514 1067 1382 5355
## Doctorate HS-grad Masters Preschool Prof-school
## 413 10501 1723 51 576
## Some-college
## 7291
HS.Dropout <- c("10th","11th","12th","9th")
Elem.Dropout <-c("1st-4th","5th-6th","7th-8th","Preschool")
HS.Grad <- c("HS-grad")
Assc.Grad <- c("Assoc-acdm","Assoc-voc")
Col.Dropout <- c("Some-college")
Law.Medical <-c("Prof-school")
adult.tidy$education <- mgsub(HS.Dropout,"HS Dropout",adult.tidy$education)
adult.tidy$education <- mgsub(Elem.Dropout,"Elem Dropout",adult.tidy$education)
adult.tidy$education <- mgsub(HS.Grad,"HS",adult.tidy$education)
adult.tidy$education <- mgsub(Assc.Grad ,"Associate",adult.tidy$education)
adult.tidy$education <- mgsub(Col.Dropout ,"College Dropout",adult.tidy$education)
adult.tidy$education <- mgsub(Law.Medical ,"Med/Law",adult.tidy$education)
View new education factors
adult.tidy$education <- as.factor(adult.tidy$education)
table(adult.tidy$education)
##
## Associate Bachelors College Dropout Doctorate
## 2449 5355 7291 413
## Elem Dropout HS HS Dropout Masters
## 1198 10501 3055 1723
## Med/Law
## 576
Review final list of reduced variable factors.
str(adult.tidy)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ type_employer: Factor w/ 5 levels "Federal-gov",..: 4 3 2 2 2 2 2 3 2 2 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Factor w/ 9 levels "Associate","Bachelors",..: 2 2 6 7 2 8 7 6 8 2 ...
## $ education_num: int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital : Factor w/ 3 levels "Married","Never-married",..: 2 1 3 1 1 1 1 1 2 1 ...
## $ occupation : Factor w/ 2 levels "Blue Collar",..: 2 2 1 1 2 2 1 2 2 2 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_per_week : int 40 13 40 40 40 40 16 45 50 40 ...
## $ country : Factor w/ 5 levels "Asia","Asia America",..: 5 5 5 5 3 5 3 5 5 5 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
library(Amelia)
missmap(adult.tidy,y.at=c(1),y.labels = c(''),col=c('yellow','black'))
# percentage of missing data
mean(is.na(adult.tidy))
## [1] 0.008726186
colMeans(is.na(adult.tidy))
## age type_employer fnlwgt education education_num
## 0.00000000 0.05638647 0.00000000 0.00000000 0.00000000
## marital occupation relationship race sex
## 0.00000000 0.05660146 0.00000000 0.00000000 0.00000000
## capital_gain capital_loss hr_per_week country income
## 0.00000000 0.00000000 0.00000000 0.01790486 0.00000000
The dataset has .008726186% missing data. All the missing data is qualitative. We can also see that the percentage of missing data of each column. We will drop the missing data rows.
# the percentage of NA values is low enough to ignore in the case
adult.tidy <- na.omit(adult.tidy)
Visually verify missing data is removed.
# verify missing data is removed
missmap(adult.tidy,y.at=c(1),y.labels = c(''),col=c('yellow','black'))
library(ggplot2)
library(forcats)
ggplot(adult.tidy,aes(age, fill=income)) +
geom_histogram(alpha=0.5,color="black", binwidth = 1) + theme_bw() +
ggtitle("Histogram of Age and Income.")
# histogram of hours worked
ggplot(adult.tidy,aes(hr_per_week)) +
geom_histogram(fill='blue',color='black') + theme_bw() +
ggtitle("Histogram of Hours Worked")
# barplot by region
ggplot(adult.tidy,aes(x=fct_infreq(country), fill = income)) +
geom_bar(color='black') + xlab("Region") +
ggtitle("Barplot of Income by Region")
head(adult.tidy)
## age type_employer fnlwgt education education_num marital
## 1 39 SL-gov 77516 Bachelors 13 Never-married
## 2 50 Self-emp 83311 Bachelors 13 Married
## 3 38 Private 215646 HS 9 Not-Married
## 4 53 Private 234721 HS Dropout 7 Married
## 5 28 Private 338409 Bachelors 13 Married
## 6 37 Private 284582 Masters 14 Married
## occupation relationship race sex capital_gain capital_loss
## 1 White Collar Not-in-family White Male 2174 0
## 2 White Collar Husband White Male 0 0
## 3 Blue Collar Not-in-family White Male 0 0
## 4 Blue Collar Husband Black Male 0 0
## 5 White Collar Wife Black Female 0 0
## 6 White Collar Wife White Female 0 0
## hr_per_week country income
## 1 40 North America <=50K
## 2 13 North America <=50K
## 3 40 North America <=50K
## 4 40 North America <=50K
## 5 40 Carribean Islands <=50K
## 6 40 North America <=50K
library(caTools)
set.seed(101)
split <- sample.split(adult.tidy$income, SplitRatio = .7)
adult.train <- subset(adult.tidy,split == TRUE)
adult.test <- subset(adult.tidy,split == FALSE)
adult.model <- glm(income ~., family = binomial(link = 'logit'), data = adult.train)
summary(adult.model)
##
## Call:
## glm(formula = income ~ ., family = binomial(link = "logit"),
## data = adult.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2178 -0.5424 -0.2053 -0.0264 3.4901
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.160e+00 1.038e+00 -5.933 2.98e-09 ***
## age 2.609e-02 1.975e-03 13.210 < 2e-16 ***
## type_employerPrivate -4.260e-01 1.077e-01 -3.954 7.69e-05 ***
## type_employerSelf-emp -7.807e-01 1.185e-01 -6.587 4.47e-11 ***
## type_employerSL-gov -5.315e-01 1.216e-01 -4.372 1.23e-05 ***
## type_employerUnemployed -1.226e+01 1.424e+02 -0.086 0.93139
## fnlwgt 8.866e-07 2.070e-07 4.283 1.85e-05 ***
## educationBachelors 5.130e-01 1.572e-01 3.263 0.00110 **
## educationCollege Dropout 5.355e-02 1.457e-01 0.367 0.71332
## educationDoctorate 9.710e-01 4.315e-01 2.251 0.02441 *
## educationElem Dropout -7.945e-01 6.868e-01 -1.157 0.24739
## educationHS -2.360e-01 2.183e-01 -1.081 0.27967
## educationHS Dropout -6.100e-01 4.313e-01 -1.414 0.15726
## educationMasters 7.365e-01 2.421e-01 3.042 0.00235 **
## educationMed/Law 1.121e+00 3.463e-01 3.237 0.00121 **
## education_num 1.293e-01 8.403e-02 1.539 0.12382
## maritalNever-married -1.479e+00 1.970e-01 -7.510 5.92e-14 ***
## maritalNot-Married -8.687e-01 1.960e-01 -4.432 9.33e-06 ***
## occupationWhite Collar 6.463e-01 4.985e-02 12.965 < 2e-16 ***
## relationshipNot-in-family -8.005e-01 1.936e-01 -4.135 3.55e-05 ***
## relationshipOther-relative -9.839e-01 2.607e-01 -3.774 0.00016 ***
## relationshipOwn-child -1.866e+00 2.404e-01 -7.764 8.26e-15 ***
## relationshipUnmarried -1.072e+00 2.183e-01 -4.913 8.99e-07 ***
## relationshipWife 1.221e+00 1.230e-01 9.927 < 2e-16 ***
## raceAsian-Pac-Islander 5.551e-01 3.148e-01 1.763 0.07787 .
## raceBlack 1.363e-01 2.762e-01 0.493 0.62176
## raceOther 1.675e-01 4.132e-01 0.405 0.68521
## raceWhite 3.756e-01 2.623e-01 1.432 0.15207
## sexMale 9.184e-01 9.476e-02 9.692 < 2e-16 ***
## capital_gain 3.084e-04 1.234e-05 24.984 < 2e-16 ***
## capital_loss 6.615e-04 4.581e-05 14.438 < 2e-16 ***
## hr_per_week 3.040e-02 1.941e-03 15.662 < 2e-16 ***
## countryAsia America -5.147e-01 2.985e-01 -1.724 0.08471 .
## countryCarribean Islands 1.071e-01 3.119e-01 0.343 0.73140
## countryEurope 5.564e-01 2.413e-01 2.306 0.02113 *
## countryNorth America 5.532e-01 2.161e-01 2.559 0.01048 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23697 on 21113 degrees of freedom
## Residual deviance: 14026 on 21078 degrees of freedom
## AIC: 14098
##
## Number of Fisher Scoring iterations: 12
# choose a model by aic in a stepwise algorithm
adult.model.new <- step(adult.model)
## Start: AIC=14097.73
## income ~ age + type_employer + fnlwgt + education + education_num +
## marital + occupation + relationship + race + sex + capital_gain +
## capital_loss + hr_per_week + country
##
## Df Deviance AIC
## <none> 14026 14098
## - education_num 1 14028 14098
## - race 4 14036 14100
## - education 8 14051 14107
## - fnlwgt 1 14044 14114
## - country 4 14067 14131
## - type_employer 4 14085 14149
## - marital 2 14096 14164
## - sex 1 14123 14193
## - occupation 1 14195 14265
## - age 1 14202 14272
## - capital_loss 1 14240 14310
## - relationship 5 14261 14323
## - hr_per_week 1 14278 14348
## - capital_gain 1 15202 15272
summary(adult.model.new)
##
## Call:
## glm(formula = income ~ age + type_employer + fnlwgt + education +
## education_num + marital + occupation + relationship + race +
## sex + capital_gain + capital_loss + hr_per_week + country,
## family = binomial(link = "logit"), data = adult.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2178 -0.5424 -0.2053 -0.0264 3.4901
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.160e+00 1.038e+00 -5.933 2.98e-09 ***
## age 2.609e-02 1.975e-03 13.210 < 2e-16 ***
## type_employerPrivate -4.260e-01 1.077e-01 -3.954 7.69e-05 ***
## type_employerSelf-emp -7.807e-01 1.185e-01 -6.587 4.47e-11 ***
## type_employerSL-gov -5.315e-01 1.216e-01 -4.372 1.23e-05 ***
## type_employerUnemployed -1.226e+01 1.424e+02 -0.086 0.93139
## fnlwgt 8.866e-07 2.070e-07 4.283 1.85e-05 ***
## educationBachelors 5.130e-01 1.572e-01 3.263 0.00110 **
## educationCollege Dropout 5.355e-02 1.457e-01 0.367 0.71332
## educationDoctorate 9.710e-01 4.315e-01 2.251 0.02441 *
## educationElem Dropout -7.945e-01 6.868e-01 -1.157 0.24739
## educationHS -2.360e-01 2.183e-01 -1.081 0.27967
## educationHS Dropout -6.100e-01 4.313e-01 -1.414 0.15726
## educationMasters 7.365e-01 2.421e-01 3.042 0.00235 **
## educationMed/Law 1.121e+00 3.463e-01 3.237 0.00121 **
## education_num 1.293e-01 8.403e-02 1.539 0.12382
## maritalNever-married -1.479e+00 1.970e-01 -7.510 5.92e-14 ***
## maritalNot-Married -8.687e-01 1.960e-01 -4.432 9.33e-06 ***
## occupationWhite Collar 6.463e-01 4.985e-02 12.965 < 2e-16 ***
## relationshipNot-in-family -8.005e-01 1.936e-01 -4.135 3.55e-05 ***
## relationshipOther-relative -9.839e-01 2.607e-01 -3.774 0.00016 ***
## relationshipOwn-child -1.866e+00 2.404e-01 -7.764 8.26e-15 ***
## relationshipUnmarried -1.072e+00 2.183e-01 -4.913 8.99e-07 ***
## relationshipWife 1.221e+00 1.230e-01 9.927 < 2e-16 ***
## raceAsian-Pac-Islander 5.551e-01 3.148e-01 1.763 0.07787 .
## raceBlack 1.363e-01 2.762e-01 0.493 0.62176
## raceOther 1.675e-01 4.132e-01 0.405 0.68521
## raceWhite 3.756e-01 2.623e-01 1.432 0.15207
## sexMale 9.184e-01 9.476e-02 9.692 < 2e-16 ***
## capital_gain 3.084e-04 1.234e-05 24.984 < 2e-16 ***
## capital_loss 6.615e-04 4.581e-05 14.438 < 2e-16 ***
## hr_per_week 3.040e-02 1.941e-03 15.662 < 2e-16 ***
## countryAsia America -5.147e-01 2.985e-01 -1.724 0.08471 .
## countryCarribean Islands 1.071e-01 3.119e-01 0.343 0.73140
## countryEurope 5.564e-01 2.413e-01 2.306 0.02113 *
## countryNorth America 5.532e-01 2.161e-01 2.559 0.01048 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23697 on 21113 degrees of freedom
## Residual deviance: 14026 on 21078 degrees of freedom
## AIC: 14098
##
## Number of Fisher Scoring iterations: 12
# predictions off model
adult.test$predicted <- predict(adult.model.new,adult.test, type = 'response')
#confusion matrix
cm <- table(adult.test$income, adult.test$predicted >0.5)
print(cm)
##
## FALSE TRUE
## <=50K 6291 505
## >50K 930 1322
# accuracy
TN <- cm[[1]]
FN <- cm[[2]]
FP <- cm[[3]]
TP <- cm[[4]]
Total <- TN + FN + FP + TP
# accuracy - how often its correct
acc <- (TP + TN)/Total
#Misclassification Rate - how often its wrong
miss <- (FP+FN)/Total
#True Positive Rate: When it's actually yes, how often does it predict yes?
TPR <- TP/(FN+TP)
# True Negative rate
TNR <- TN/(TN+FP)
# precision
PPV <- TP/(TP+FP)
## Model accuracy is: 0.8414014
## Model miss classification is: 0.1585986
## Model true positive rate is: 0.5870337
## Model true negative rate is: 0.9256916
## Model precision is: 0.7235906