Summary

In this project, we applied logistic regression to the UCI adult dataset to predict if an individual’s salary is above or below 50k per year.

Read in the data

library(dplyr)
library(tidyr)
library(qdap)
library(ggplot2)
adult <- read.csv("adult_sal.csv",na.strings = c("?",""))

Cleaning data.

Drop duplicate index column.

# drop duplicated index column
adult <- adult %>%
        select(-X)

Reduce number of category factors.

Will work to reduce double digit factor variables education, occupation, country, type employer

str(adult)

## 'data.frame':    32561 obs. of  15 variables:
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation   : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race         : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ country      : Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income       : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...

Reduce type-employer variable factors.

View type-employer factors.

### cleaning up factors
table(adult$type_employer)

## 
##      Federal-gov        Local-gov     Never-worked          Private 
##              960             2093                7            22696 
##     Self-emp-inc Self-emp-not-inc        State-gov      Without-pay 
##             1116             2541             1298               14

We can combine “never-worked” and “without-pay” into unemployed factor.

adult.tidy <- adult # create tidy table for transformations
adult.tidy$type_employer <- gsub("Without-pay","Unemployed",adult.tidy$type_employer)
adult.tidy$type_employer <- gsub("Never-worked","Unemployed",adult.tidy$type_employer)

Combine “local-gove, and state-gov into SL-gov factor.

adult.tidy$type_employer <- gsub("Local-gov","SL-gov",adult.tidy$type_employer)
adult.tidy$type_employer <- gsub("State-gov","SL-gov",adult.tidy$type_employer)

Combine self-employed jobs into Self-emp factor.

adult.tidy$type_employer <- gsub("Self-emp-inc","Self-emp",adult.tidy$type_employer)
adult.tidy$type_employer <- gsub("Self-emp-not-inc","Self-emp",adult.tidy$type_employer)

View new type-employed factors.

adult.tidy$type_employer <- as.factor(adult.tidy$type_employer)
table(adult.tidy$type_employer)

## 
## Federal-gov     Private    Self-emp      SL-gov  Unemployed 
##         960       22696        3657        3391          21

Reduce Marital variable factors.

View marital factors.

table(adult.tidy$marital)

## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  4443                    23                 14976 
## Married-spouse-absent         Never-married             Separated 
##                   418                 10683                  1025 
##               Widowed 
##                   993

Combine marital factors into Not-married (no longer married) and Married factors. We’ll leave the Never-Married factor as is.

adult.tidy$marital <- gsub("Divorced","Not-Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Separated","Not-Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Widowed","Not-Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Married-AF-spouse","Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Married-civ-spouse","Married",adult.tidy$marital)
adult.tidy$marital <- gsub("Married-spouse-absent","Married",adult.tidy$marital)

View new marital factors.

adult.tidy$marital <- as.factor(adult.tidy$marital)
table(adult.tidy$marital)

## 
##       Married Never-married   Not-Married 
##         15417         10683          6461

Reduce country variable factors.

View country factors

table(adult.tidy$country)

## 
##                   Cambodia                     Canada 
##                         19                        121 
##                      China                   Columbia 
##                         75                         59 
##                       Cuba         Dominican-Republic 
##                         95                         70 
##                    Ecuador                El-Salvador 
##                         28                        106 
##                    England                     France 
##                         90                         29 
##                    Germany                     Greece 
##                        137                         29 
##                  Guatemala                      Haiti 
##                         64                         44 
##         Holand-Netherlands                   Honduras 
##                          1                         13 
##                       Hong                    Hungary 
##                         20                         13 
##                      India                       Iran 
##                        100                         43 
##                    Ireland                      Italy 
##                         24                         73 
##                    Jamaica                      Japan 
##                         81                         62 
##                       Laos                     Mexico 
##                         18                        643 
##                  Nicaragua Outlying-US(Guam-USVI-etc) 
##                         34                         14 
##                       Peru                Philippines 
##                         31                        198 
##                     Poland                   Portugal 
##                         60                         37 
##                Puerto-Rico                   Scotland 
##                        114                         12 
##                      South                     Taiwan 
##                         80                         51 
##                   Thailand            Trinadad&Tobago 
##                         18                         19 
##              United-States                    Vietnam 
##                      29170                         67 
##                 Yugoslavia 
##                         16

We will combine countries into continent factors. Create vectors that group factors based on continent.

na <- c("Canada","United-States")

sa <- c("Columbia","Ecuador","El-Salvador","Guatemala","Honduras",
        "Mexico","Nicaragua","Peru")

cb <- c("Dominican-Republic","Cuba","Haiti","Jamaica","Puerto-Rico","Trinadad&Tobago")

asia <- c("Cambodia","China","Hong","Outlying-US(Guam-USVI-etc)","Vietnam",
          "Thailand","Taiwan","Philippines","Laos","Japan","South","Iran")

eu <- c("England","France","Germany","Greece","Holand-Netherlands","Hungary",
        "India","Ireland","Italy","Poland","Portugal","Scotland","Yugoslavia")

Use mgsub to group countries into factors based on continents.

adult.tidy$country <- mgsub(na,"North America",adult.tidy$country)
adult.tidy$country <- mgsub(sa,"South America",adult.tidy$country)
adult.tidy$country <- mgsub(cb,"Carribean Islands",adult.tidy$country)
adult.tidy$country <- mgsub(asia,"Asia",adult.tidy$country)
adult.tidy$country <- mgsub(eu,"Europe",adult.tidy$country)

View new country factors.

adult.tidy$country <- as.factor(adult.tidy$country)
table(adult.tidy$country)

## 
##              Asia      Asia America Carribean Islands            Europe 
##               665               978               423               621 
##     North America 
##             29291

Reduce occupation variable factors.

View occupation factors.

table(adult.tidy$occupation)

## 
##      Adm-clerical      Armed-Forces      Craft-repair   Exec-managerial 
##              3770                 9              4099              4066 
##   Farming-fishing Handlers-cleaners Machine-op-inspct     Other-service 
##               994              1370              2002              3295 
##   Priv-house-serv    Prof-specialty   Protective-serv             Sales 
##               149              4140               649              3650 
##      Tech-support  Transport-moving 
##               928              1597

We will factor occupation variables into blue and white collar factors.

bc <- c("Armed-Forces","Craft-repair","Farming-fishing","Handlers-cleaners",
        "Machine-op-inspct","Other-service","Priv-house-serv","Protective-serv",
        "Transport-moving")
 
wc <- c("Adm-clerical","Exec-managerial","Prof-specialty","Sales","Tech-support")

adult.tidy$occupation <- mgsub(bc ,"Blue Collar",adult.tidy$occupation)
adult.tidy$occupation <- mgsub(wc ,"White Collar",adult.tidy$occupation)

View new occupation factors

adult.tidy$occupation <- as.factor(adult.tidy$occupation)
table(adult.tidy$occupation)

## 
##  Blue Collar White Collar 
##        14164        16554

Reduce education variable factors.

View education factors

table(adult.tidy$education)

## 
##         10th         11th         12th      1st-4th      5th-6th 
##          933         1175          433          168          333 
##      7th-8th          9th   Assoc-acdm    Assoc-voc    Bachelors 
##          646          514         1067         1382         5355 
##    Doctorate      HS-grad      Masters    Preschool  Prof-school 
##          413        10501         1723           51          576 
## Some-college 
##         7291

Reduce education factors.

# clean eduction factors
table(adult.tidy$education)

## 
##         10th         11th         12th      1st-4th      5th-6th 
##          933         1175          433          168          333 
##      7th-8th          9th   Assoc-acdm    Assoc-voc    Bachelors 
##          646          514         1067         1382         5355 
##    Doctorate      HS-grad      Masters    Preschool  Prof-school 
##          413        10501         1723           51          576 
## Some-college 
##         7291

HS.Dropout <- c("10th","11th","12th","9th")
Elem.Dropout <-c("1st-4th","5th-6th","7th-8th","Preschool")
HS.Grad <- c("HS-grad")
Assc.Grad <- c("Assoc-acdm","Assoc-voc")
Col.Dropout <- c("Some-college")
Law.Medical <-c("Prof-school")

adult.tidy$education <- mgsub(HS.Dropout,"HS Dropout",adult.tidy$education)
adult.tidy$education <- mgsub(Elem.Dropout,"Elem Dropout",adult.tidy$education)
adult.tidy$education <- mgsub(HS.Grad,"HS",adult.tidy$education)
adult.tidy$education <- mgsub(Assc.Grad ,"Associate",adult.tidy$education)
adult.tidy$education <- mgsub(Col.Dropout ,"College Dropout",adult.tidy$education)
adult.tidy$education <- mgsub(Law.Medical ,"Med/Law",adult.tidy$education)

View new education factors

adult.tidy$education <- as.factor(adult.tidy$education)
table(adult.tidy$education)

## 
##       Associate       Bachelors College Dropout       Doctorate 
##            2449            5355            7291             413 
##    Elem Dropout              HS      HS Dropout         Masters 
##            1198           10501            3055            1723 
##         Med/Law 
##             576

Review final list of reduced variable factors.

str(adult.tidy)

## 'data.frame':    32561 obs. of  15 variables:
##  $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ type_employer: Factor w/ 5 levels "Federal-gov",..: 4 3 2 2 2 2 2 3 2 2 ...
##  $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education    : Factor w/ 9 levels "Associate","Bachelors",..: 2 2 6 7 2 8 7 6 8 2 ...
##  $ education_num: int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital      : Factor w/ 3 levels "Married","Never-married",..: 2 1 3 1 1 1 1 1 2 1 ...
##  $ occupation   : Factor w/ 2 levels "Blue Collar",..: 2 2 1 1 2 2 1 2 2 2 ...
##  $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race         : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital_gain : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital_loss : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hr_per_week  : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ country      : Factor w/ 5 levels "Asia","Asia America",..: 5 5 5 5 3 5 3 5 5 5 ...
##  $ income       : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...

Working with missing values

Visualize missing column values with Amelia package

library(Amelia)
missmap(adult.tidy,y.at=c(1),y.labels = c(''),col=c('yellow','black'))

View missing data statistics

# percentage of missing data
mean(is.na(adult.tidy))

## [1] 0.008726186

colMeans(is.na(adult.tidy))

##           age type_employer        fnlwgt     education education_num 
##    0.00000000    0.05638647    0.00000000    0.00000000    0.00000000 
##       marital    occupation  relationship          race           sex 
##    0.00000000    0.05660146    0.00000000    0.00000000    0.00000000 
##  capital_gain  capital_loss   hr_per_week       country        income 
##    0.00000000    0.00000000    0.00000000    0.01790486    0.00000000

The dataset has .008726186% missing data. All the missing data is qualitative. We can also see that the percentage of missing data of each column. We will drop the missing data rows.

# the percentage of NA values is low enough to ignore in the case
adult.tidy <- na.omit(adult.tidy)

Visually verify missing data is removed.

# verify missing data is removed
missmap(adult.tidy,y.at=c(1),y.labels = c(''),col=c('yellow','black'))

Data visualization

Histogram of Age and Income.

library(ggplot2)
library(forcats)
ggplot(adult.tidy,aes(age, fill=income)) +
        geom_histogram(alpha=0.5,color="black", binwidth = 1) + theme_bw() +
        ggtitle("Histogram of Age and Income.")

Histogram of hours worked.

# histogram of hours worked
ggplot(adult.tidy,aes(hr_per_week)) +
        geom_histogram(fill='blue',color='black') + theme_bw() +
        ggtitle("Histogram of Hours Worked")

Barplot of income by region.

# barplot by region
ggplot(adult.tidy,aes(x=fct_infreq(country), fill = income)) +
        geom_bar(color='black') + xlab("Region") +
        ggtitle("Barplot of Income by Region")

Building the logistic model

View data.

head(adult.tidy)

##   age type_employer fnlwgt  education education_num       marital
## 1  39        SL-gov  77516  Bachelors            13 Never-married
## 2  50      Self-emp  83311  Bachelors            13       Married
## 3  38       Private 215646         HS             9   Not-Married
## 4  53       Private 234721 HS Dropout             7       Married
## 5  28       Private 338409  Bachelors            13       Married
## 6  37       Private 284582    Masters            14       Married
##     occupation  relationship  race    sex capital_gain capital_loss
## 1 White Collar Not-in-family White   Male         2174            0
## 2 White Collar       Husband White   Male            0            0
## 3  Blue Collar Not-in-family White   Male            0            0
## 4  Blue Collar       Husband Black   Male            0            0
## 5 White Collar          Wife Black Female            0            0
## 6 White Collar          Wife White Female            0            0
##   hr_per_week           country income
## 1          40     North America  <=50K
## 2          13     North America  <=50K
## 3          40     North America  <=50K
## 4          40     North America  <=50K
## 5          40 Carribean Islands  <=50K
## 6          40     North America  <=50K

Split the data inot test and training sets.

library(caTools)
set.seed(101)
split <- sample.split(adult.tidy$income, SplitRatio = .7)
adult.train <- subset(adult.tidy,split == TRUE)
adult.test <- subset(adult.tidy,split == FALSE)

Train the model

adult.model <- glm(income ~., family = binomial(link = 'logit'), data = adult.train)
summary(adult.model)

## 
## Call:
## glm(formula = income ~ ., family = binomial(link = "logit"), 
##     data = adult.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2178  -0.5424  -0.2053  -0.0264   3.4901  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.160e+00  1.038e+00  -5.933 2.98e-09 ***
## age                         2.609e-02  1.975e-03  13.210  < 2e-16 ***
## type_employerPrivate       -4.260e-01  1.077e-01  -3.954 7.69e-05 ***
## type_employerSelf-emp      -7.807e-01  1.185e-01  -6.587 4.47e-11 ***
## type_employerSL-gov        -5.315e-01  1.216e-01  -4.372 1.23e-05 ***
## type_employerUnemployed    -1.226e+01  1.424e+02  -0.086  0.93139    
## fnlwgt                      8.866e-07  2.070e-07   4.283 1.85e-05 ***
## educationBachelors          5.130e-01  1.572e-01   3.263  0.00110 ** 
## educationCollege Dropout    5.355e-02  1.457e-01   0.367  0.71332    
## educationDoctorate          9.710e-01  4.315e-01   2.251  0.02441 *  
## educationElem Dropout      -7.945e-01  6.868e-01  -1.157  0.24739    
## educationHS                -2.360e-01  2.183e-01  -1.081  0.27967    
## educationHS Dropout        -6.100e-01  4.313e-01  -1.414  0.15726    
## educationMasters            7.365e-01  2.421e-01   3.042  0.00235 ** 
## educationMed/Law            1.121e+00  3.463e-01   3.237  0.00121 ** 
## education_num               1.293e-01  8.403e-02   1.539  0.12382    
## maritalNever-married       -1.479e+00  1.970e-01  -7.510 5.92e-14 ***
## maritalNot-Married         -8.687e-01  1.960e-01  -4.432 9.33e-06 ***
## occupationWhite Collar      6.463e-01  4.985e-02  12.965  < 2e-16 ***
## relationshipNot-in-family  -8.005e-01  1.936e-01  -4.135 3.55e-05 ***
## relationshipOther-relative -9.839e-01  2.607e-01  -3.774  0.00016 ***
## relationshipOwn-child      -1.866e+00  2.404e-01  -7.764 8.26e-15 ***
## relationshipUnmarried      -1.072e+00  2.183e-01  -4.913 8.99e-07 ***
## relationshipWife            1.221e+00  1.230e-01   9.927  < 2e-16 ***
## raceAsian-Pac-Islander      5.551e-01  3.148e-01   1.763  0.07787 .  
## raceBlack                   1.363e-01  2.762e-01   0.493  0.62176    
## raceOther                   1.675e-01  4.132e-01   0.405  0.68521    
## raceWhite                   3.756e-01  2.623e-01   1.432  0.15207    
## sexMale                     9.184e-01  9.476e-02   9.692  < 2e-16 ***
## capital_gain                3.084e-04  1.234e-05  24.984  < 2e-16 ***
## capital_loss                6.615e-04  4.581e-05  14.438  < 2e-16 ***
## hr_per_week                 3.040e-02  1.941e-03  15.662  < 2e-16 ***
## countryAsia America        -5.147e-01  2.985e-01  -1.724  0.08471 .  
## countryCarribean Islands    1.071e-01  3.119e-01   0.343  0.73140    
## countryEurope               5.564e-01  2.413e-01   2.306  0.02113 *  
## countryNorth America        5.532e-01  2.161e-01   2.559  0.01048 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23697  on 21113  degrees of freedom
## Residual deviance: 14026  on 21078  degrees of freedom
## AIC: 14098
## 
## Number of Fisher Scoring iterations: 12

Use stepwise feature to select the best variables for the model

# choose a model by aic in a stepwise algorithm
adult.model.new <- step(adult.model)

## Start:  AIC=14097.73
## income ~ age + type_employer + fnlwgt + education + education_num + 
##     marital + occupation + relationship + race + sex + capital_gain + 
##     capital_loss + hr_per_week + country
## 
##                 Df Deviance   AIC
## <none>                14026 14098
## - education_num  1    14028 14098
## - race           4    14036 14100
## - education      8    14051 14107
## - fnlwgt         1    14044 14114
## - country        4    14067 14131
## - type_employer  4    14085 14149
## - marital        2    14096 14164
## - sex            1    14123 14193
## - occupation     1    14195 14265
## - age            1    14202 14272
## - capital_loss   1    14240 14310
## - relationship   5    14261 14323
## - hr_per_week    1    14278 14348
## - capital_gain   1    15202 15272

summary(adult.model.new)

## 
## Call:
## glm(formula = income ~ age + type_employer + fnlwgt + education + 
##     education_num + marital + occupation + relationship + race + 
##     sex + capital_gain + capital_loss + hr_per_week + country, 
##     family = binomial(link = "logit"), data = adult.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.2178  -0.5424  -0.2053  -0.0264   3.4901  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.160e+00  1.038e+00  -5.933 2.98e-09 ***
## age                         2.609e-02  1.975e-03  13.210  < 2e-16 ***
## type_employerPrivate       -4.260e-01  1.077e-01  -3.954 7.69e-05 ***
## type_employerSelf-emp      -7.807e-01  1.185e-01  -6.587 4.47e-11 ***
## type_employerSL-gov        -5.315e-01  1.216e-01  -4.372 1.23e-05 ***
## type_employerUnemployed    -1.226e+01  1.424e+02  -0.086  0.93139    
## fnlwgt                      8.866e-07  2.070e-07   4.283 1.85e-05 ***
## educationBachelors          5.130e-01  1.572e-01   3.263  0.00110 ** 
## educationCollege Dropout    5.355e-02  1.457e-01   0.367  0.71332    
## educationDoctorate          9.710e-01  4.315e-01   2.251  0.02441 *  
## educationElem Dropout      -7.945e-01  6.868e-01  -1.157  0.24739    
## educationHS                -2.360e-01  2.183e-01  -1.081  0.27967    
## educationHS Dropout        -6.100e-01  4.313e-01  -1.414  0.15726    
## educationMasters            7.365e-01  2.421e-01   3.042  0.00235 ** 
## educationMed/Law            1.121e+00  3.463e-01   3.237  0.00121 ** 
## education_num               1.293e-01  8.403e-02   1.539  0.12382    
## maritalNever-married       -1.479e+00  1.970e-01  -7.510 5.92e-14 ***
## maritalNot-Married         -8.687e-01  1.960e-01  -4.432 9.33e-06 ***
## occupationWhite Collar      6.463e-01  4.985e-02  12.965  < 2e-16 ***
## relationshipNot-in-family  -8.005e-01  1.936e-01  -4.135 3.55e-05 ***
## relationshipOther-relative -9.839e-01  2.607e-01  -3.774  0.00016 ***
## relationshipOwn-child      -1.866e+00  2.404e-01  -7.764 8.26e-15 ***
## relationshipUnmarried      -1.072e+00  2.183e-01  -4.913 8.99e-07 ***
## relationshipWife            1.221e+00  1.230e-01   9.927  < 2e-16 ***
## raceAsian-Pac-Islander      5.551e-01  3.148e-01   1.763  0.07787 .  
## raceBlack                   1.363e-01  2.762e-01   0.493  0.62176    
## raceOther                   1.675e-01  4.132e-01   0.405  0.68521    
## raceWhite                   3.756e-01  2.623e-01   1.432  0.15207    
## sexMale                     9.184e-01  9.476e-02   9.692  < 2e-16 ***
## capital_gain                3.084e-04  1.234e-05  24.984  < 2e-16 ***
## capital_loss                6.615e-04  4.581e-05  14.438  < 2e-16 ***
## hr_per_week                 3.040e-02  1.941e-03  15.662  < 2e-16 ***
## countryAsia America        -5.147e-01  2.985e-01  -1.724  0.08471 .  
## countryCarribean Islands    1.071e-01  3.119e-01   0.343  0.73140    
## countryEurope               5.564e-01  2.413e-01   2.306  0.02113 *  
## countryNorth America        5.532e-01  2.161e-01   2.559  0.01048 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23697  on 21113  degrees of freedom
## Residual deviance: 14026  on 21078  degrees of freedom
## AIC: 14098
## 
## Number of Fisher Scoring iterations: 12

Make predictions off the model

# predictions off model
adult.test$predicted <- predict(adult.model.new,adult.test, type = 'response')

Create Confusion matrix

#confusion matrix
cm <- table(adult.test$income, adult.test$predicted >0.5)
print(cm)

##        
##         FALSE TRUE
##   <=50K  6291  505
##   >50K    930 1322

Calculate model stats

# accuracy
TN <- cm[[1]]
FN <- cm[[2]]
FP <- cm[[3]]
TP <- cm[[4]]
Total <- TN + FN + FP + TP
# accuracy - how often its correct
acc <- (TP + TN)/Total
#Misclassification Rate -  how often its wrong
miss <- (FP+FN)/Total
#True Positive Rate: When it's actually yes, how often does it predict yes?
TPR <- TP/(FN+TP)
# True Negative rate
TNR <- TN/(TN+FP)
# precision
PPV <- TP/(TP+FP)

## Model accuracy is: 0.8414014

## Model miss classification is: 0.1585986

## Model true positive rate is: 0.5870337

## Model true negative rate is: 0.9256916

## Model precision is: 0.7235906

Logistic Regression

Wayne Tipton

February 1, 2018