R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

Logistic regression to predict income level using census data.

Logistic Regression Model

# Set the JVM heap size (4 GB), then load XLConnect to read the Excel (.xlsx) workbook
options(java.parameters = "-Xmx4g" )
library(XLConnect)
## Loading required package: XLConnectJars
## XLConnect 0.2-13 by Mirai Solutions GmbH [aut],
##   Martin Studer [cre],
##   The Apache Software Foundation [ctb, cph] (Apache POI),
##   Graph Builder [ctb, cph] (Curvesapi Java library)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com
library(xtable)
## Warning: package 'xtable' was built under R version 3.4.3
library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.2
# Open the Excel workbook and read its "Train Data" sheet into a data frame,
# treating the first row as column names.
# NOTE(review): the path is an absolute, machine-specific location.
workbook_path <- "C:/Users/Saurabh/Documents/Logistic Regression Test.xlsx"
census <- loadWorkbook(workbook_path)
census_data <- readWorksheet(census, sheet = "Train Data", header = TRUE)

# Explore the structure of the training data, then summarize it
str(census_data)
## 'data.frame':    32561 obs. of  15 variables:
##  $ Age            : num  39 50 38 53 28 37 49 52 31 42 ...
##  $ Workclass      : chr  " State-gov" " Self-emp-not-inc" " Private" " Private" ...
##  $ Fnlwgt         : num  77516 83311 215646 234721 338409 ...
##  $ Education      : chr  " Bachelors" " Bachelors" " HS-grad" " 11th" ...
##  $ Education...Num: num  13 13 9 7 13 14 5 9 14 13 ...
##  $ Marital.Status : chr  " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
##  $ Occupation     : chr  " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
##  $ Relationship   : chr  " Not-in-family" " Husband" " Not-in-family" " Husband" ...
##  $ Race           : chr  " White" " White" " White" " Black" ...
##  $ Sex            : chr  " Male" " Male" " Male" " Male" ...
##  $ Capital.Gain   : num  2174 0 0 0 0 ...
##  $ Capital.Loss   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hours.per.Week : num  40 13 40 40 40 40 16 45 50 40 ...
##  $ Native.Country : chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ Income         : chr  " <=50K" " <=50K" " <=50K" " <=50K" ...
# Count rows without any missing value (TRUE) versus rows containing an NA.
table(complete.cases(census_data))
## 
##  TRUE 
## 32561
head(census_data)
##   Age         Workclass Fnlwgt  Education Education...Num
## 1  39         State-gov  77516  Bachelors              13
## 2  50  Self-emp-not-inc  83311  Bachelors              13
## 3  38           Private 215646    HS-grad               9
## 4  53           Private 234721       11th               7
## 5  28           Private 338409  Bachelors              13
## 6  37           Private 284582    Masters              14
##        Marital.Status         Occupation   Relationship   Race     Sex
## 1       Never-married       Adm-clerical  Not-in-family  White    Male
## 2  Married-civ-spouse    Exec-managerial        Husband  White    Male
## 3            Divorced  Handlers-cleaners  Not-in-family  White    Male
## 4  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male
## 5  Married-civ-spouse     Prof-specialty           Wife  Black  Female
## 6  Married-civ-spouse    Exec-managerial           Wife  White  Female
##   Capital.Gain Capital.Loss Hours.per.Week Native.Country Income
## 1         2174            0             40  United-States  <=50K
## 2            0            0             13  United-States  <=50K
## 3            0            0             40  United-States  <=50K
## 4            0            0             40  United-States  <=50K
## 5            0            0             40           Cuba  <=50K
## 6            0            0             40  United-States  <=50K
# Convert the character columns (including the Income response) to factors so
# that summary() reports per-level counts for them.
cols <- c(
  "Workclass", "Education", "Marital.Status", "Occupation", "Relationship",
  "Race", "Sex", "Native.Country", "Income"
)
census_data[cols] <- lapply(census_data[cols], factor)
summary(census_data)
##       Age                    Workclass         Fnlwgt       
##  Min.   :17.00    Private         :22696   Min.   :  12285  
##  1st Qu.:28.00    Self-emp-not-inc: 2541   1st Qu.: 117827  
##  Median :37.00    Local-gov       : 2093   Median : 178356  
##  Mean   :38.58    ?               : 1836   Mean   : 189778  
##  3rd Qu.:48.00    State-gov       : 1298   3rd Qu.: 237051  
##  Max.   :90.00    Self-emp-inc    : 1116   Max.   :1484705  
##                  (Other)          :  981                    
##          Education     Education...Num                Marital.Status 
##   HS-grad     :10501   Min.   : 1.00    Divorced             : 4443  
##   Some-college: 7291   1st Qu.: 9.00    Married-AF-spouse    :   23  
##   Bachelors   : 5355   Median :10.00    Married-civ-spouse   :14976  
##   Masters     : 1723   Mean   :10.08    Married-spouse-absent:  418  
##   Assoc-voc   : 1382   3rd Qu.:12.00    Never-married        :10683  
##   11th        : 1175   Max.   :16.00    Separated            : 1025  
##  (Other)      : 5134                    Widowed              :  993  
##             Occupation            Relationship  
##   Prof-specialty :4140    Husband       :13193  
##   Craft-repair   :4099    Not-in-family : 8305  
##   Exec-managerial:4066    Other-relative:  981  
##   Adm-clerical   :3770    Own-child     : 5068  
##   Sales          :3650    Unmarried     : 3446  
##   Other-service  :3295    Wife          : 1568  
##  (Other)         :9541                          
##                   Race            Sex         Capital.Gain  
##   Amer-Indian-Eskimo:  311    Female:10771   Min.   :    0  
##   Asian-Pac-Islander: 1039    Male  :21790   1st Qu.:    0  
##   Black             : 3124                   Median :    0  
##   Other             :  271                   Mean   : 1078  
##   White             :27816                   3rd Qu.:    0  
##                                              Max.   :99999  
##                                                             
##   Capital.Loss    Hours.per.Week         Native.Country     Income     
##  Min.   :   0.0   Min.   : 1.00    United-States:29170    <=50K:24720  
##  1st Qu.:   0.0   1st Qu.:40.00    Mexico       :  643    >50K : 7841  
##  Median :   0.0   Median :40.00    ?            :  583                 
##  Mean   :  87.3   Mean   :40.44    Philippines  :  198                 
##  3rd Qu.:   0.0   3rd Qu.:45.00    Germany      :  137                 
##  Max.   :4356.0   Max.   :99.00    Canada       :  121                 
##                                   (Other)       : 1709
# Read the test data (its character columns are converted to factors after inspection)
census_test_data <- readWorksheet(census,sheet = "Test Data", header = TRUE)
str(census_test_data)
## 'data.frame':    16281 obs. of  15 variables:
##  $ Age            : chr  "25" "38" "28" "44" ...
##  $ Workclass      : chr  " Private" " Private" " Local-gov" " Private" ...
##  $ Fnlwgt         : num  226802 89814 336951 160323 103497 ...
##  $ Education      : chr  " 11th" " HS-grad" " Assoc-acdm" " Some-college" ...
##  $ Education...Num: num  7 9 12 10 10 6 9 15 10 4 ...
##  $ Marital.Status : chr  " Never-married" " Married-civ-spouse" " Married-civ-spouse" " Married-civ-spouse" ...
##  $ Occupation     : chr  " Machine-op-inspct" " Farming-fishing" " Protective-serv" " Machine-op-inspct" ...
##  $ Relationship   : chr  " Own-child" " Husband" " Husband" " Husband" ...
##  $ Race           : chr  " Black" " White" " White" " Black" ...
##  $ Sex            : chr  " Male" " Male" " Male" " Male" ...
##  $ Capital.Gain   : num  0 0 0 7688 0 ...
##  $ Capital.Loss   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hours.per.Week : num  40 50 40 40 30 30 40 32 40 10 ...
##  $ Native.Country : chr  " United-States" " United-States" " United-States" " United-States" ...
##  $ Income         : chr  " <=50K." " <=50K." " >50K." " >50K." ...
summary(census_test_data)
##      Age             Workclass             Fnlwgt       
##  Length:16281       Length:16281       Min.   :  13492  
##  Class :character   Class :character   1st Qu.: 116736  
##  Mode  :character   Mode  :character   Median : 177831  
##                                        Mean   : 189436  
##                                        3rd Qu.: 238384  
##                                        Max.   :1490400  
##   Education         Education...Num Marital.Status      Occupation       
##  Length:16281       Min.   : 1.00   Length:16281       Length:16281      
##  Class :character   1st Qu.: 9.00   Class :character   Class :character  
##  Mode  :character   Median :10.00   Mode  :character   Mode  :character  
##                     Mean   :10.07                                        
##                     3rd Qu.:12.00                                        
##                     Max.   :16.00                                        
##  Relationship           Race               Sex             Capital.Gain  
##  Length:16281       Length:16281       Length:16281       Min.   :    0  
##  Class :character   Class :character   Class :character   1st Qu.:    0  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0  
##                                                           Mean   : 1082  
##                                                           3rd Qu.:    0  
##                                                           Max.   :99999  
##   Capital.Loss    Hours.per.Week  Native.Country        Income         
##  Min.   :   0.0   Min.   : 1.00   Length:16281       Length:16281      
##  1st Qu.:   0.0   1st Qu.:40.00   Class :character   Class :character  
##  Median :   0.0   Median :40.00   Mode  :character   Mode  :character  
##  Mean   :  87.9   Mean   :40.39                                        
##  3rd Qu.:   0.0   3rd Qu.:45.00                                        
##  Max.   :3770.0   Max.   :99.00
# Apply the same character-to-factor conversion used for the training data,
# and coerce Age, which was read from the test sheet as character, to numeric.
# NOTE(review): the test Income levels carry a trailing "." (" <=50K." and
# " >50K."), unlike the training levels (" <=50K" and " >50K").
census_test_data[cols] <- lapply(census_test_data[cols], factor)
census_test_data$Age <- as.numeric(census_test_data$Age)
str(census_test_data)
## 'data.frame':    16281 obs. of  15 variables:
##  $ Age            : num  25 38 28 44 18 34 29 63 24 55 ...
##  $ Workclass      : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
##  $ Fnlwgt         : num  226802 89814 336951 160323 103497 ...
##  $ Education      : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ Education...Num: num  7 9 12 10 10 6 9 15 10 4 ...
##  $ Marital.Status : Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ Occupation     : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
##  $ Relationship   : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ Race           : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ Sex            : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ Capital.Gain   : num  0 0 0 7688 0 ...
##  $ Capital.Loss   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hours.per.Week : num  40 50 40 40 30 30 40 32 40 10 ...
##  $ Native.Country : Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
##  $ Income         : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...
summary(census_test_data)
##       Age                    Workclass         Fnlwgt       
##  Min.   :17.00    Private         :11210   Min.   :  13492  
##  1st Qu.:28.00    Self-emp-not-inc: 1321   1st Qu.: 116736  
##  Median :37.00    Local-gov       : 1043   Median : 177831  
##  Mean   :38.77    ?               :  963   Mean   : 189436  
##  3rd Qu.:48.00    State-gov       :  683   3rd Qu.: 238384  
##  Max.   :90.00    Self-emp-inc    :  579   Max.   :1490400  
##                  (Other)          :  482                    
##          Education    Education...Num                Marital.Status
##   HS-grad     :5283   Min.   : 1.00    Divorced             :2190  
##   Some-college:3587   1st Qu.: 9.00    Married-AF-spouse    :  14  
##   Bachelors   :2670   Median :10.00    Married-civ-spouse   :7403  
##   Masters     : 934   Mean   :10.07    Married-spouse-absent: 210  
##   Assoc-voc   : 679   3rd Qu.:12.00    Never-married        :5434  
##   11th        : 637   Max.   :16.00    Separated            : 505  
##  (Other)      :2491                    Widowed              : 525  
##             Occupation            Relationship 
##   Prof-specialty :2032    Husband       :6523  
##   Exec-managerial:2020    Not-in-family :4278  
##   Craft-repair   :2013    Other-relative: 525  
##   Sales          :1854    Own-child     :2513  
##   Adm-clerical   :1841    Unmarried     :1679  
##   Other-service  :1628    Wife          : 763  
##  (Other)         :4893                         
##                   Race            Sex         Capital.Gain  
##   Amer-Indian-Eskimo:  159    Female: 5421   Min.   :    0  
##   Asian-Pac-Islander:  480    Male  :10860   1st Qu.:    0  
##   Black             : 1561                   Median :    0  
##   Other             :  135                   Mean   : 1082  
##   White             :13946                   3rd Qu.:    0  
##                                              Max.   :99999  
##                                                             
##   Capital.Loss    Hours.per.Week         Native.Country      Income     
##  Min.   :   0.0   Min.   : 1.00    United-States:14662    <=50K.:12435  
##  1st Qu.:   0.0   1st Qu.:40.00    Mexico       :  308    >50K. : 3846  
##  Median :   0.0   Median :40.00    ?            :  274                  
##  Mean   :  87.9   Mean   :40.39    Philippines  :   97                  
##  3rd Qu.:   0.0   3rd Qu.:45.00    Puerto-Rico  :   70                  
##  Max.   :3770.0   Max.   :99.00    Germany      :   69                  
##                                   (Other)       :  801
summary(census_test_data$Workclass)
##                 ?       Federal-gov         Local-gov      Never-worked 
##               963               472              1043                 3 
##           Private      Self-emp-inc  Self-emp-not-inc         State-gov 
##             11210               579              1321               683 
##       Without-pay 
##                 7
# Check the distribution of income levels in the training data
table(census_data$Income)
## 
##  <=50K   >50K 
##  24720   7841
# Drop the Fnlwgt column from both the training and the test data set; it is
# judged to have little impact on income level.
census_data[["Fnlwgt"]] <- NULL
census_test_data[["Fnlwgt"]] <- NULL

library(ggplot2)
summary(census_data$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.00   28.00   37.00   38.58   48.00   90.00
#Exploring Numerical data and impact on dependent variable

# Box plot of age for each income level.
boxplot(Age ~ Income,
  data = census_data,
  col = "orange",
  xlab = "Income Levels",
  ylab = "Age",
  main = "Age distribution at different income levels"
)

# Histogram of age in one-year bins, filled by income level.
ggplot(census_data, aes(x = as.numeric(Age), group = Income, fill = Income)) +
  geom_histogram(binwidth = 1, color = "black")

summary(census_data$Education...Num)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    9.00   10.00   10.08   12.00   16.00
# Box plot of Education...Num for each income level.
boxplot(Education...Num ~ Income,
  data = census_data,
  col = "blue",
  xlab = "Income Levels",
  ylab = "Years of Education",
  main = "Years of Education for different income levels"
)

summary(census_data$Capital.Gain)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    1078       0   99999
# Histograms (10 bins) of capital loss and capital gain, filled by income
# level. Both columns are already numeric, so they are mapped directly; the
# x-axis label therefore shows the plain column name.
ggplot(census_data, aes(x = Capital.Loss, group = Income, fill = Income)) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Loss")

# This plot shows Capital.Gain; its title previously repeated "Capital Loss".
ggplot(census_data, aes(x = Capital.Gain, group = Income, fill = Income)) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Gain")

# proportion of observations with no capital gain
sum(census_data$Capital.Gain == 0) / nrow(census_data)
## [1] 0.9167102
# proportion of observations with no capital loss
sum(census_data$Capital.Loss == 0) / nrow(census_data)
## [1] 0.9533491
summary(census_data$Hours.per.Week)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   40.00   40.00   40.44   45.00   99.00
# Box plot of weekly working hours for each income level.
boxplot(Hours.per.Week ~ Income,
  data = census_data,
  col = "blue",
  xlab = "Income Levels",
  ylab = "Hours Per Week",
  main = "Hours Per week Vs Income Level"
)

# Pairwise correlations (cor() default method) between the numeric predictors.
numeric_cols <- c(
  "Age", "Education...Num", "Capital.Gain", "Capital.Loss", "Hours.per.Week"
)
corMat <- cor(census_data[, numeric_cols])
corMat
##                        Age Education...Num Capital.Gain Capital.Loss
## Age             1.00000000      0.03652719   0.07767450   0.05777454
## Education...Num 0.03652719      1.00000000   0.12263011   0.07992296
## Capital.Gain    0.07767450      0.12263011   1.00000000  -0.03161506
## Capital.Loss    0.05777454      0.07992296  -0.03161506   1.00000000
## Hours.per.Week  0.06875571      0.14812273   0.07840862   0.05425636
##                 Hours.per.Week
## Age                 0.06875571
## Education...Num     0.14812273
## Capital.Gain        0.07840862
## Capital.Loss        0.05425636
## Hours.per.Week      1.00000000
# Now check the categorical variables and their relationship with the dependent variable Income

table(census_data$Sex)
## 
##  Female    Male 
##   10771   21790
table(census_data[,c("Sex","Income")])
##          Income
## Sex        <=50K  >50K
##    Female   9592  1179
##    Male    15128  6662
# Bar charts of income-level counts, one facet per level of each categorical
# predictor. qplot() is deprecated in ggplot2 3.4.0; for a discrete x with no
# y it draws a bar chart, so the explicit ggplot() + geom_bar() form is used.
ggplot(census_data, aes(x = Income, fill = Workclass)) +
  geom_bar() +
  facet_grid(. ~ Workclass)

ggplot(census_data, aes(x = Income, fill = Occupation)) +
  geom_bar() +
  facet_grid(. ~ Occupation)

ggplot(census_data, aes(x = Income, fill = Marital.Status)) +
  geom_bar() +
  facet_grid(. ~ Marital.Status)

ggplot(census_data, aes(x = Income, fill = Relationship)) +
  geom_bar() +
  facet_grid(. ~ Relationship)

ggplot(census_data, aes(x = Income, fill = Education)) +
  geom_bar() +
  facet_grid(. ~ Education)

summary(census_data)
##       Age                    Workclass             Education    
##  Min.   :17.00    Private         :22696    HS-grad     :10501  
##  1st Qu.:28.00    Self-emp-not-inc: 2541    Some-college: 7291  
##  Median :37.00    Local-gov       : 2093    Bachelors   : 5355  
##  Mean   :38.58    ?               : 1836    Masters     : 1723  
##  3rd Qu.:48.00    State-gov       : 1298    Assoc-voc   : 1382  
##  Max.   :90.00    Self-emp-inc    : 1116    11th        : 1175  
##                  (Other)          :  981   (Other)      : 5134  
##  Education...Num                Marital.Status             Occupation  
##  Min.   : 1.00    Divorced             : 4443    Prof-specialty :4140  
##  1st Qu.: 9.00    Married-AF-spouse    :   23    Craft-repair   :4099  
##  Median :10.00    Married-civ-spouse   :14976    Exec-managerial:4066  
##  Mean   :10.08    Married-spouse-absent:  418    Adm-clerical   :3770  
##  3rd Qu.:12.00    Never-married        :10683    Sales          :3650  
##  Max.   :16.00    Separated            : 1025    Other-service  :3295  
##                   Widowed              :  993   (Other)         :9541  
##           Relationship                    Race            Sex       
##   Husband       :13193    Amer-Indian-Eskimo:  311    Female:10771  
##   Not-in-family : 8305    Asian-Pac-Islander: 1039    Male  :21790  
##   Other-relative:  981    Black             : 3124                  
##   Own-child     : 5068    Other             :  271                  
##   Unmarried     : 3446    White             :27816                  
##   Wife          : 1568                                              
##                                                                     
##   Capital.Gain    Capital.Loss    Hours.per.Week         Native.Country 
##  Min.   :    0   Min.   :   0.0   Min.   : 1.00    United-States:29170  
##  1st Qu.:    0   1st Qu.:   0.0   1st Qu.:40.00    Mexico       :  643  
##  Median :    0   Median :   0.0   Median :40.00    ?            :  583  
##  Mean   : 1078   Mean   :  87.3   Mean   :40.44    Philippines  :  198  
##  3rd Qu.:    0   3rd Qu.:   0.0   3rd Qu.:45.00    Germany      :  137  
##  Max.   :99999   Max.   :4356.0   Max.   :99.00    Canada       :  121  
##                                                   (Other)       : 1709  
##     Income     
##   <=50K:24720  
##   >50K : 7841  
##                
##                
##                
##                
## 
# Fit a logistic regression (binomial glm) of Income on every other column.
# NOTE(review): the summary reports two coefficients as NA because of
# singularities (Education...Num and Occupation Transport-moving).
# Education...Num is presumably a numeric recoding of Education - confirm and
# consider keeping only one of the two.

model1 <- glm(formula = Income ~ ., family = binomial, data = census_data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model1)
## 
## Call:
## glm(formula = Income ~ ., family = binomial, data = census_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.0695  -0.5056  -0.1825  -0.0250   3.7473  
## 
## Coefficients: (2 not defined because of singularities)
##                                             Estimate Std. Error z value
## (Intercept)                               -8.954e+00  4.394e-01 -20.376
## Age                                        2.514e-02  1.647e-03  15.266
## Workclass Federal-gov                      1.088e+00  1.537e-01   7.082
## Workclass Local-gov                        4.067e-01  1.402e-01   2.900
## Workclass Never-worked                    -1.049e+01  2.717e+02  -0.039
## Workclass Private                          5.910e-01  1.252e-01   4.722
## Workclass Self-emp-inc                     7.591e-01  1.495e-01   5.076
## Workclass Self-emp-not-inc                 9.711e-02  1.370e-01   0.709
## Workclass State-gov                        2.766e-01  1.517e-01   1.824
## Workclass Without-pay                     -1.222e+01  1.984e+02  -0.062
## Education 11th                             7.728e-02  2.107e-01   0.367
## Education 12th                             4.807e-01  2.642e-01   1.820
## Education 1st-4th                         -5.306e-01  4.899e-01  -1.083
## Education 5th-6th                         -2.553e-01  3.247e-01  -0.786
## Education 7th-8th                         -4.853e-01  2.320e-01  -2.091
## Education 9th                             -2.007e-01  2.611e-01  -0.769
## Education Assoc-acdm                       1.331e+00  1.763e-01   7.547
## Education Assoc-voc                        1.340e+00  1.693e-01   7.913
## Education Bachelors                        1.929e+00  1.574e-01  12.256
## Education Doctorate                        2.984e+00  2.142e-01  13.935
## Education HS-grad                          8.061e-01  1.533e-01   5.257
## Education Masters                          2.278e+00  1.678e-01  13.575
## Education Preschool                       -2.118e+01  3.696e+02  -0.057
## Education Prof-school                      2.783e+00  2.000e-01  13.918
## Education Some-college                     1.152e+00  1.555e-01   7.408
## Education...Num                                   NA         NA      NA
## Marital.Status Married-AF-spouse           2.696e+00  5.540e-01   4.867
## Marital.Status Married-civ-spouse          2.209e+00  2.655e-01   8.318
## Marital.Status Married-spouse-absent      -2.121e-02  2.301e-01  -0.092
## Marital.Status Never-married              -4.807e-01  8.747e-02  -5.495
## Marital.Status Separated                  -1.216e-01  1.640e-01  -0.742
## Marital.Status Widowed                     1.251e-01  1.537e-01   0.814
## Occupation Adm-clerical                    1.124e-01  9.913e-02   1.134
## Occupation Armed-Forces                   -1.002e+00  1.513e+00  -0.662
## Occupation Craft-repair                    1.810e-01  8.482e-02   2.134
## Occupation Exec-managerial                 8.955e-01  8.719e-02  10.271
## Occupation Farming-fishing                -9.019e-01  1.421e-01  -6.349
## Occupation Handlers-cleaners              -5.702e-01  1.458e-01  -3.910
## Occupation Machine-op-inspct              -1.732e-01  1.061e-01  -1.633
## Occupation Other-service                  -7.220e-01  1.244e-01  -5.802
## Occupation Priv-house-serv                -4.072e+00  1.668e+00  -2.441
## Occupation Prof-specialty                  6.265e-01  9.358e-02   6.694
## Occupation Protective-serv                 6.907e-01  1.303e-01   5.301
## Occupation Sales                           3.907e-01  9.009e-02   4.337
## Occupation Tech-support                    7.657e-01  1.193e-01   6.417
## Occupation Transport-moving                       NA         NA      NA
## Relationship Not-in-family                 5.763e-01  2.627e-01   2.193
## Relationship Other-relative               -3.691e-01  2.427e-01  -1.521
## Relationship Own-child                    -6.604e-01  2.601e-01  -2.539
## Relationship Unmarried                     4.440e-01  2.787e-01   1.593
## Relationship Wife                          1.362e+00  1.025e-01  13.283
## Race Asian-Pac-Islander                    6.792e-01  2.695e-01   2.520
## Race Black                                 4.758e-01  2.322e-01   2.049
## Race Other                                 2.071e-01  3.541e-01   0.585
## Race White                                 6.160e-01  2.214e-01   2.783
## Sex Male                                   8.694e-01  7.913e-02  10.988
## Capital.Gain                               3.196e-04  1.031e-05  31.008
## Capital.Loss                               6.457e-04  3.709e-05  17.409
## Hours.per.Week                             2.954e-02  1.621e-03  18.229
## Native.Country Cambodia                    1.510e+00  6.340e-01   2.382
## Native.Country Canada                      5.231e-01  2.949e-01   1.774
## Native.Country China                      -4.835e-01  3.935e-01  -1.229
## Native.Country Columbia                   -1.907e+00  8.250e-01  -2.312
## Native.Country Cuba                        5.732e-01  3.366e-01   1.703
## Native.Country Dominican-Republic         -1.645e+00  1.050e+00  -1.567
## Native.Country Ecuador                    -8.717e-02  7.258e-01  -0.120
## Native.Country El-Salvador                -4.002e-01  4.971e-01  -0.805
## Native.Country England                     4.743e-01  3.334e-01   1.423
## Native.Country France                      7.745e-01  5.270e-01   1.470
## Native.Country Germany                     6.218e-01  2.840e-01   2.189
## Native.Country Greece                     -8.340e-01  5.672e-01  -1.470
## Native.Country Guatemala                  -1.764e-02  7.583e-01  -0.023
## Native.Country Haiti                       1.160e-01  6.859e-01   0.169
## Native.Country Holand-Netherlands         -1.036e+01  8.827e+02  -0.012
## Native.Country Honduras                   -1.100e+00  2.429e+00  -0.453
## Native.Country Hong                        1.358e-01  6.786e-01   0.200
## Native.Country Hungary                     5.751e-02  7.729e-01   0.074
## Native.Country India                      -1.906e-01  3.285e-01  -0.580
## Native.Country Iran                        2.249e-01  4.508e-01   0.499
## Native.Country Ireland                     6.849e-01  6.445e-01   1.063
## Native.Country Italy                       9.944e-01  3.455e-01   2.878
## Native.Country Jamaica                     2.015e-01  4.630e-01   0.435
## Native.Country Japan                       5.850e-01  4.191e-01   1.396
## Native.Country Laos                       -3.611e-01  8.576e-01  -0.421
## Native.Country Mexico                     -2.950e-01  2.545e-01  -1.159
## Native.Country Nicaragua                  -4.926e-01  7.978e-01  -0.617
## Native.Country Outlying-US(Guam-USVI-etc) -1.204e+01  2.112e+02  -0.057
## Native.Country Peru                       -5.865e-01  8.591e-01  -0.683
## Native.Country Philippines                 6.273e-01  2.805e-01   2.236
## Native.Country Poland                      1.768e-01  4.205e-01   0.421
## Native.Country Portugal                    1.221e-01  6.339e-01   0.193
## Native.Country Puerto-Rico                -1.433e-01  4.035e-01  -0.355
## Native.Country Scotland                    1.828e-01  7.890e-01   0.232
## Native.Country South                      -8.763e-01  4.422e-01  -1.982
## Native.Country Taiwan                      2.392e-01  4.719e-01   0.507
## Native.Country Thailand                   -3.578e-01  8.351e-01  -0.428
## Native.Country Trinadad&Tobago            -2.083e-01  8.677e-01  -0.240
## Native.Country United-States               3.736e-01  1.380e-01   2.708
## Native.Country Vietnam                    -9.634e-01  6.163e-01  -1.563
## Native.Country Yugoslavia                  8.936e-01  6.814e-01   1.311
##                                           Pr(>|z|)    
## (Intercept)                                < 2e-16 ***
## Age                                        < 2e-16 ***
## Workclass Federal-gov                     1.42e-12 ***
## Workclass Local-gov                        0.00374 ** 
## Workclass Never-worked                     0.96921    
## Workclass Private                         2.34e-06 ***
## Workclass Self-emp-inc                    3.85e-07 ***
## Workclass Self-emp-not-inc                 0.47850    
## Workclass State-gov                        0.06815 .  
## Workclass Without-pay                      0.95089    
## Education 11th                             0.71378    
## Education 12th                             0.06877 .  
## Education 1st-4th                          0.27883    
## Education 5th-6th                          0.43166    
## Education 7th-8th                          0.03649 *  
## Education 9th                              0.44208    
## Education Assoc-acdm                      4.45e-14 ***
## Education Assoc-voc                       2.51e-15 ***
## Education Bachelors                        < 2e-16 ***
## Education Doctorate                        < 2e-16 ***
## Education HS-grad                         1.46e-07 ***
## Education Masters                          < 2e-16 ***
## Education Preschool                        0.95430    
## Education Prof-school                      < 2e-16 ***
## Education Some-college                    1.29e-13 ***
## Education...Num                                 NA    
## Marital.Status Married-AF-spouse          1.13e-06 ***
## Marital.Status Married-civ-spouse          < 2e-16 ***
## Marital.Status Married-spouse-absent       0.92656    
## Marital.Status Never-married              3.90e-08 ***
## Marital.Status Separated                   0.45837    
## Marital.Status Widowed                     0.41569    
## Occupation Adm-clerical                    0.25686    
## Occupation Armed-Forces                    0.50768    
## Occupation Craft-repair                    0.03283 *  
## Occupation Exec-managerial                 < 2e-16 ***
## Occupation Farming-fishing                2.17e-10 ***
## Occupation Handlers-cleaners              9.22e-05 ***
## Occupation Machine-op-inspct               0.10256    
## Occupation Other-service                  6.53e-09 ***
## Occupation Priv-house-serv                 0.01464 *  
## Occupation Prof-specialty                 2.17e-11 ***
## Occupation Protective-serv                1.15e-07 ***
## Occupation Sales                          1.45e-05 ***
## Occupation Tech-support                   1.39e-10 ***
## Occupation Transport-moving                     NA    
## Relationship Not-in-family                 0.02829 *  
## Relationship Other-relative                0.12831    
## Relationship Own-child                     0.01112 *  
## Relationship Unmarried                     0.11115    
## Relationship Wife                          < 2e-16 ***
## Race Asian-Pac-Islander                    0.01174 *  
## Race Black                                 0.04043 *  
## Race Other                                 0.55858    
## Race White                                 0.00539 ** 
## Sex Male                                   < 2e-16 ***
## Capital.Gain                               < 2e-16 ***
## Capital.Loss                               < 2e-16 ***
## Hours.per.Week                             < 2e-16 ***
## Native.Country Cambodia                    0.01723 *  
## Native.Country Canada                      0.07607 .  
## Native.Country China                       0.21920    
## Native.Country Columbia                    0.02077 *  
## Native.Country Cuba                        0.08860 .  
## Native.Country Dominican-Republic          0.11720    
## Native.Country Ecuador                     0.90440    
## Native.Country El-Salvador                 0.42076    
## Native.Country England                     0.15479    
## Native.Country France                      0.14166    
## Native.Country Germany                     0.02857 *  
## Native.Country Greece                      0.14144    
## Native.Country Guatemala                   0.98144    
## Native.Country Haiti                       0.86565    
## Native.Country Holand-Netherlands          0.99064    
## Native.Country Honduras                    0.65070    
## Native.Country Hong                        0.84134    
## Native.Country Hungary                     0.94069    
## Native.Country India                       0.56172    
## Native.Country Iran                        0.61785    
## Native.Country Ireland                     0.28793    
## Native.Country Italy                       0.00400 ** 
## Native.Country Jamaica                     0.66343    
## Native.Country Japan                       0.16277    
## Native.Country Laos                        0.67369    
## Native.Country Mexico                      0.24627    
## Native.Country Nicaragua                   0.53696    
## Native.Country Outlying-US(Guam-USVI-etc)  0.95454    
## Native.Country Peru                        0.49481    
## Native.Country Philippines                 0.02534 *  
## Native.Country Poland                      0.67408    
## Native.Country Portugal                    0.84726    
## Native.Country Puerto-Rico                 0.72249    
## Native.Country Scotland                    0.81674    
## Native.Country South                       0.04751 *  
## Native.Country Taiwan                      0.61221    
## Native.Country Thailand                    0.66834    
## Native.Country Trinadad&Tobago             0.81031    
## Native.Country United-States               0.00678 ** 
## Native.Country Vietnam                     0.11797    
## Native.Country Yugoslavia                  0.18974    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 35948  on 32560  degrees of freedom
## Residual deviance: 20582  on 32463  degrees of freedom
## AIC: 20778
## 
## Number of Fisher Scoring iterations: 13
# Predict on the test data with the fitted model
table(census_test_data$Income)
## 
##  <=50K.   >50K. 
##   12435    3846
PredictModel <- predict(model1, newdata = census_test_data, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
#pred_m <- rep('<=50K', length(PredictModel))
#pred_m[PredictModel>=.3] <- '>50K'

# Confusion matrix at a 0.3 probability cutoff: rows are the actual income
# levels, columns are whether the predicted probability reached the cutoff.
# The comparison is wrapped in a factor with both levels fixed so the table
# always has a FALSE and a TRUE column, even if every prediction falls on one
# side of the cutoff; the [ , 2] look-ups used for the metrics depend on that.
confusionmatrix_LR <- table(
  census_test_data$Income,
  factor(PredictModel >= 0.3, levels = c(FALSE, TRUE))
)
confusionmatrix_LR
##          
##           FALSE  TRUE
##    <=50K. 10474  1961
##    >50K.    844  3002
# Accuracy: correctly classified cases (the diagonal) over all cases.
Accuracy <- sum(diag(confusionmatrix_LR)) / sum(confusionmatrix_LR)
Accuracy
## [1] 0.8277133
# True positive rate: second-row cases predicted TRUE over all second-row cases.
tpr <- confusionmatrix_LR[2, 2] / sum(confusionmatrix_LR[2, ])
tpr
## [1] 0.7805512
# False positive rate: first-row cases predicted TRUE over all first-row cases.
fpr <- confusionmatrix_LR[1, 2] / sum(confusionmatrix_LR[1, ])
fpr
## [1] 0.1577
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.2
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Build the ROCR prediction object from the predicted probabilities and the
# actual test labels, then plot true positive rate against false positive rate.
pred1 <- prediction(predictions = PredictModel, labels = census_test_data$Income)
perf <- performance(pred1, measure = "tpr", x.measure = "fpr")
plot(perf)

# Area under the ROC curve.
auc_perf <- performance(pred1, measure = "auc")
as.numeric(auc_perf@y.values)
## [1] 0.9040029