This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
Logistic regression model to predict income level using census data.
Logistic Regression Model
# Raise the Java heap limit to 4 GB through the java.parameters option, then
# load XLConnect, which is used below to read the .xlsx workbook.
# NOTE(review): the option is presumably set first so that it is already in
# place when the JVM behind XLConnect starts -- confirm against the XLConnect docs.
options(java.parameters = "-Xmx4g" )
library(XLConnect)
## Loading required package: XLConnectJars
## XLConnect 0.2-13 by Mirai Solutions GmbH [aut],
## Martin Studer [cre],
## The Apache Software Foundation [ctb, cph] (Apache POI),
## Graph Builder [ctb, cph] (Curvesapi Java library)
## http://www.mirai-solutions.com ,
## http://miraisolutions.wordpress.com
library(xtable)
## Warning: package 'xtable' was built under R version 3.4.3
library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.2
# Open the Excel workbook and pull the "Train Data" sheet into a data frame.
census <- loadWorkbook(
  filename = "C:/Users/Saurabh/Documents/Logistic Regression Test.xlsx"
)
census_data <- readWorksheet(census, header = TRUE, sheet = "Train Data")
# Inspect the column types of the training data.
str(census_data)
## 'data.frame': 32561 obs. of 15 variables:
## $ Age : num 39 50 38 53 28 37 49 52 31 42 ...
## $ Workclass : chr " State-gov" " Self-emp-not-inc" " Private" " Private" ...
## $ Fnlwgt : num 77516 83311 215646 234721 338409 ...
## $ Education : chr " Bachelors" " Bachelors" " HS-grad" " 11th" ...
## $ Education...Num: num 13 13 9 7 13 14 5 9 14 13 ...
## $ Marital.Status : chr " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
## $ Occupation : chr " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
## $ Relationship : chr " Not-in-family" " Husband" " Not-in-family" " Husband" ...
## $ Race : chr " White" " White" " White" " Black" ...
## $ Sex : chr " Male" " Male" " Male" " Male" ...
## $ Capital.Gain : num 2174 0 0 0 0 ...
## $ Capital.Loss : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Hours.per.Week : num 40 13 40 40 40 40 16 45 50 40 ...
## $ Native.Country : chr " United-States" " United-States" " United-States" " United-States" ...
## $ Income : chr " <=50K" " <=50K" " <=50K" " <=50K" ...
# Tally rows without any missing value (TRUE) against rows containing an NA (FALSE).
table(complete.cases(census_data))
##
## TRUE
## 32561
# Preview the first rows of the training data.
head(census_data)
## Age Workclass Fnlwgt Education Education...Num
## 1 39 State-gov 77516 Bachelors 13
## 2 50 Self-emp-not-inc 83311 Bachelors 13
## 3 38 Private 215646 HS-grad 9
## 4 53 Private 234721 11th 7
## 5 28 Private 338409 Bachelors 13
## 6 37 Private 284582 Masters 14
## Marital.Status Occupation Relationship Race Sex
## 1 Never-married Adm-clerical Not-in-family White Male
## 2 Married-civ-spouse Exec-managerial Husband White Male
## 3 Divorced Handlers-cleaners Not-in-family White Male
## 4 Married-civ-spouse Handlers-cleaners Husband Black Male
## 5 Married-civ-spouse Prof-specialty Wife Black Female
## 6 Married-civ-spouse Exec-managerial Wife White Female
## Capital.Gain Capital.Loss Hours.per.Week Native.Country Income
## 1 2174 0 40 United-States <=50K
## 2 0 0 13 United-States <=50K
## 3 0 0 40 United-States <=50K
## 4 0 0 40 United-States <=50K
## 5 0 0 40 Cuba <=50K
## 6 0 0 40 United-States <=50K
# Convert the character columns (including the Income response) to factors,
# then summarise every column of the training data.
cols <- c(
  "Workclass", "Education", "Marital.Status", "Occupation", "Relationship",
  "Race", "Sex", "Native.Country", "Income"
)
census_data[cols] <- lapply(census_data[cols], as.factor)
summary(census_data)
## Age Workclass Fnlwgt
## Min. :17.00 Private :22696 Min. : 12285
## 1st Qu.:28.00 Self-emp-not-inc: 2541 1st Qu.: 117827
## Median :37.00 Local-gov : 2093 Median : 178356
## Mean :38.58 ? : 1836 Mean : 189778
## 3rd Qu.:48.00 State-gov : 1298 3rd Qu.: 237051
## Max. :90.00 Self-emp-inc : 1116 Max. :1484705
## (Other) : 981
## Education Education...Num Marital.Status
## HS-grad :10501 Min. : 1.00 Divorced : 4443
## Some-college: 7291 1st Qu.: 9.00 Married-AF-spouse : 23
## Bachelors : 5355 Median :10.00 Married-civ-spouse :14976
## Masters : 1723 Mean :10.08 Married-spouse-absent: 418
## Assoc-voc : 1382 3rd Qu.:12.00 Never-married :10683
## 11th : 1175 Max. :16.00 Separated : 1025
## (Other) : 5134 Widowed : 993
## Occupation Relationship
## Prof-specialty :4140 Husband :13193
## Craft-repair :4099 Not-in-family : 8305
## Exec-managerial:4066 Other-relative: 981
## Adm-clerical :3770 Own-child : 5068
## Sales :3650 Unmarried : 3446
## Other-service :3295 Wife : 1568
## (Other) :9541
## Race Sex Capital.Gain
## Amer-Indian-Eskimo: 311 Female:10771 Min. : 0
## Asian-Pac-Islander: 1039 Male :21790 1st Qu.: 0
## Black : 3124 Median : 0
## Other : 271 Mean : 1078
## White :27816 3rd Qu.: 0
## Max. :99999
##
## Capital.Loss Hours.per.Week Native.Country Income
## Min. : 0.0 Min. : 1.00 United-States:29170 <=50K:24720
## 1st Qu.: 0.0 1st Qu.:40.00 Mexico : 643 >50K : 7841
## Median : 0.0 Median :40.00 ? : 583
## Mean : 87.3 Mean :40.44 Philippines : 198
## 3rd Qu.: 0.0 3rd Qu.:45.00 Germany : 137
## Max. :4356.0 Max. :99.00 Canada : 121
## (Other) : 1709
# Read the "Test Data" sheet from the same workbook and inspect its structure.
# The character columns are converted to factors in a later step.
census_test_data <- readWorksheet(census, header = TRUE, sheet = "Test Data")
str(census_test_data)
## 'data.frame': 16281 obs. of 15 variables:
## $ Age : chr "25" "38" "28" "44" ...
## $ Workclass : chr " Private" " Private" " Local-gov" " Private" ...
## $ Fnlwgt : num 226802 89814 336951 160323 103497 ...
## $ Education : chr " 11th" " HS-grad" " Assoc-acdm" " Some-college" ...
## $ Education...Num: num 7 9 12 10 10 6 9 15 10 4 ...
## $ Marital.Status : chr " Never-married" " Married-civ-spouse" " Married-civ-spouse" " Married-civ-spouse" ...
## $ Occupation : chr " Machine-op-inspct" " Farming-fishing" " Protective-serv" " Machine-op-inspct" ...
## $ Relationship : chr " Own-child" " Husband" " Husband" " Husband" ...
## $ Race : chr " Black" " White" " White" " Black" ...
## $ Sex : chr " Male" " Male" " Male" " Male" ...
## $ Capital.Gain : num 0 0 0 7688 0 ...
## $ Capital.Loss : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Hours.per.Week : num 40 50 40 40 30 30 40 32 40 10 ...
## $ Native.Country : chr " United-States" " United-States" " United-States" " United-States" ...
## $ Income : chr " <=50K." " <=50K." " >50K." " >50K." ...
# Summarise the test data as read, before any type conversion.
summary(census_test_data)
## Age Workclass Fnlwgt
## Length:16281 Length:16281 Min. : 13492
## Class :character Class :character 1st Qu.: 116736
## Mode :character Mode :character Median : 177831
## Mean : 189436
## 3rd Qu.: 238384
## Max. :1490400
## Education Education...Num Marital.Status Occupation
## Length:16281 Min. : 1.00 Length:16281 Length:16281
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :10.00 Mode :character Mode :character
## Mean :10.07
## 3rd Qu.:12.00
## Max. :16.00
## Relationship Race Sex Capital.Gain
## Length:16281 Length:16281 Length:16281 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 0
## Mode :character Mode :character Mode :character Median : 0
## Mean : 1082
## 3rd Qu.: 0
## Max. :99999
## Capital.Loss Hours.per.Week Native.Country Income
## Min. : 0.0 Min. : 1.00 Length:16281 Length:16281
## 1st Qu.: 0.0 1st Qu.:40.00 Class :character Class :character
## Median : 0.0 Median :40.00 Mode :character Mode :character
## Mean : 87.9 Mean :40.39
## 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :3770.0 Max. :99.00
# Age was read as character in the test sheet, so coerce it to numeric; the
# character columns named in `cols` become factors.
census_test_data$Age <- as.numeric(census_test_data$Age)
census_test_data[cols] <- lapply(census_test_data[cols], as.factor)
str(census_test_data)
## 'data.frame': 16281 obs. of 15 variables:
## $ Age : num 25 38 28 44 18 34 29 63 24 55 ...
## $ Workclass : Factor w/ 9 levels " ?"," Federal-gov",..: 5 5 3 5 1 5 1 7 5 5 ...
## $ Fnlwgt : num 226802 89814 336951 160323 103497 ...
## $ Education : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 16 1 12 15 16 6 ...
## $ Education...Num: num 7 9 12 10 10 6 9 15 10 4 ...
## $ Marital.Status : Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
## $ Occupation : Factor w/ 15 levels " ?"," Adm-clerical",..: 8 6 12 8 1 9 1 11 9 4 ...
## $ Relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
## $ Race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
## $ Sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 2 2 2 1 2 ...
## $ Capital.Gain : num 0 0 0 7688 0 ...
## $ Capital.Loss : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Hours.per.Week : num 40 50 40 40 30 30 40 32 40 10 ...
## $ Native.Country : Factor w/ 41 levels " ?"," Cambodia",..: 39 39 39 39 39 39 39 39 39 39 ...
## $ Income : Factor w/ 2 levels " <=50K."," >50K.": 1 1 2 2 1 1 1 2 1 1 ...
# Summarise the test data again now that the character columns are factors
# and Age is numeric.
summary(census_test_data)
## Age Workclass Fnlwgt
## Min. :17.00 Private :11210 Min. : 13492
## 1st Qu.:28.00 Self-emp-not-inc: 1321 1st Qu.: 116736
## Median :37.00 Local-gov : 1043 Median : 177831
## Mean :38.77 ? : 963 Mean : 189436
## 3rd Qu.:48.00 State-gov : 683 3rd Qu.: 238384
## Max. :90.00 Self-emp-inc : 579 Max. :1490400
## (Other) : 482
## Education Education...Num Marital.Status
## HS-grad :5283 Min. : 1.00 Divorced :2190
## Some-college:3587 1st Qu.: 9.00 Married-AF-spouse : 14
## Bachelors :2670 Median :10.00 Married-civ-spouse :7403
## Masters : 934 Mean :10.07 Married-spouse-absent: 210
## Assoc-voc : 679 3rd Qu.:12.00 Never-married :5434
## 11th : 637 Max. :16.00 Separated : 505
## (Other) :2491 Widowed : 525
## Occupation Relationship
## Prof-specialty :2032 Husband :6523
## Exec-managerial:2020 Not-in-family :4278
## Craft-repair :2013 Other-relative: 525
## Sales :1854 Own-child :2513
## Adm-clerical :1841 Unmarried :1679
## Other-service :1628 Wife : 763
## (Other) :4893
## Race Sex Capital.Gain
## Amer-Indian-Eskimo: 159 Female: 5421 Min. : 0
## Asian-Pac-Islander: 480 Male :10860 1st Qu.: 0
## Black : 1561 Median : 0
## Other : 135 Mean : 1082
## White :13946 3rd Qu.: 0
## Max. :99999
##
## Capital.Loss Hours.per.Week Native.Country Income
## Min. : 0.0 Min. : 1.00 United-States:14662 <=50K.:12435
## 1st Qu.: 0.0 1st Qu.:40.00 Mexico : 308 >50K. : 3846
## Median : 0.0 Median :40.00 ? : 274
## Mean : 87.9 Mean :40.39 Philippines : 97
## 3rd Qu.: 0.0 3rd Qu.:45.00 Puerto-Rico : 70
## Max. :3770.0 Max. :99.00 Germany : 69
## (Other) : 801
# Counts per Workclass level in the test data.
summary(census_test_data[["Workclass"]])
## ? Federal-gov Local-gov Never-worked
## 963 472 1043 3
## Private Self-emp-inc Self-emp-not-inc State-gov
## 11210 579 1321 683
## Without-pay
## 7
# Class balance of the Income response in the training data.
table(census_data[["Income"]])
##
## <=50K >50K
## 24720 7841
# Drop Fnlwgt from both the training and the test set; the author judged it to
# have little bearing on income level.
census_data[["Fnlwgt"]] <- NULL
census_test_data[["Fnlwgt"]] <- NULL
library(ggplot2)
# Distribution summary of Age in the training data.
summary(census_data[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 28.00 37.00 38.58 48.00 90.00
# Explore the numeric predictors and how they relate to the response, Income.
# Box plot of Age for each income group.
boxplot(
  Age ~ Income,
  data = census_data,
  col = "orange",
  main = "Age distribution at different income levels",
  xlab = "Income Levels",
  ylab = "Age"
)
# Histogram of Age in one-year bins, filled by income group.
ggplot(census_data, aes(x = as.numeric(Age), group = Income, fill = Income)) +
  geom_histogram(binwidth = 1, color = "black")
summary(census_data[["Education...Num"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 9.00 10.00 10.08 12.00 16.00
# Box plot of years of education for each income group.
boxplot(
  Education...Num ~ Income,
  data = census_data,
  col = "blue",
  main = "Years of Education for different income levels",
  xlab = "Income Levels",
  ylab = "Years of Education"
)
summary(census_data[["Capital.Gain"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 1078 0 99999
# Histogram of Capital.Loss in 10 bins, filled by income group.
ggplot(census_data) +
  aes(x = as.numeric(Capital.Loss), group = Income, fill = Income) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Loss")
# Histogram of Capital.Gain in 10 bins, filled by income group.
# The title names Capital Gain; it previously repeated "Capital Loss", which
# mislabelled this plot.
ggplot(census_data) +
  aes(x = as.numeric(Capital.Gain), group = Income, fill = Income) +
  geom_histogram(bins = 10, color = "black") +
  ggtitle("Histogram of Capital Gain")
# Proportion of observations with no capital gain.
sum(census_data$Capital.Gain == 0) / length(census_data$Capital.Gain)
## [1] 0.9167102
# Share of training rows reporting zero capital loss.
sum(census_data$Capital.Loss == 0) / nrow(census_data)
## [1] 0.9533491
summary(census_data[["Hours.per.Week"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 40.00 40.00 40.44 45.00 99.00
# Box plot of weekly working hours for each income group.
boxplot(
  Hours.per.Week ~ Income,
  data = census_data,
  col = "blue",
  main = "Hours Per week Vs Income Level",
  xlab = "Income Levels",
  ylab = "Hours Per Week"
)
# Pairwise correlations among the numeric predictors.
corMat <- cor(
  census_data[c("Age", "Education...Num", "Capital.Gain", "Capital.Loss", "Hours.per.Week")]
)
corMat
## Age Education...Num Capital.Gain Capital.Loss
## Age 1.00000000 0.03652719 0.07767450 0.05777454
## Education...Num 0.03652719 1.00000000 0.12263011 0.07992296
## Capital.Gain 0.07767450 0.12263011 1.00000000 -0.03161506
## Capital.Loss 0.05777454 0.07992296 -0.03161506 1.00000000
## Hours.per.Week 0.06875571 0.14812273 0.07840862 0.05425636
## Hours.per.Week
## Age 0.06875571
## Education...Num 0.14812273
## Capital.Gain 0.07840862
## Capital.Loss 0.05425636
## Hours.per.Week 1.00000000
# Categorical predictors: counts by Sex, then Sex cross-tabulated with Income.
table(census_data[["Sex"]])
##
## Female Male
## 10771 21790
with(census_data, table(Sex, Income))
## Income
## Sex <=50K >50K
## Female 9592 1179
## Male 15128 6662
# Bar charts of Income counts, one facet per level of each categorical
# predictor. qplot() is deprecated in current ggplot2 releases, so the charts
# are built with ggplot() + geom_bar(), which draws the same count bars for a
# factor on the x axis.
ggplot(census_data, aes(x = Income, fill = Workclass)) +
  geom_bar() +
  facet_grid(. ~ Workclass)
ggplot(census_data, aes(x = Income, fill = Occupation)) +
  geom_bar() +
  facet_grid(. ~ Occupation)
ggplot(census_data, aes(x = Income, fill = Marital.Status)) +
  geom_bar() +
  facet_grid(. ~ Marital.Status)
ggplot(census_data, aes(x = Income, fill = Relationship)) +
  geom_bar() +
  facet_grid(. ~ Relationship)
ggplot(census_data, aes(x = Income, fill = Education)) +
  geom_bar() +
  facet_grid(. ~ Education)
# Summarise the training data once more now that Fnlwgt has been removed.
summary(census_data)
## Age Workclass Education
## Min. :17.00 Private :22696 HS-grad :10501
## 1st Qu.:28.00 Self-emp-not-inc: 2541 Some-college: 7291
## Median :37.00 Local-gov : 2093 Bachelors : 5355
## Mean :38.58 ? : 1836 Masters : 1723
## 3rd Qu.:48.00 State-gov : 1298 Assoc-voc : 1382
## Max. :90.00 Self-emp-inc : 1116 11th : 1175
## (Other) : 981 (Other) : 5134
## Education...Num Marital.Status Occupation
## Min. : 1.00 Divorced : 4443 Prof-specialty :4140
## 1st Qu.: 9.00 Married-AF-spouse : 23 Craft-repair :4099
## Median :10.00 Married-civ-spouse :14976 Exec-managerial:4066
## Mean :10.08 Married-spouse-absent: 418 Adm-clerical :3770
## 3rd Qu.:12.00 Never-married :10683 Sales :3650
## Max. :16.00 Separated : 1025 Other-service :3295
## Widowed : 993 (Other) :9541
## Relationship Race Sex
## Husband :13193 Amer-Indian-Eskimo: 311 Female:10771
## Not-in-family : 8305 Asian-Pac-Islander: 1039 Male :21790
## Other-relative: 981 Black : 3124
## Own-child : 5068 Other : 271
## Unmarried : 3446 White :27816
## Wife : 1568
##
## Capital.Gain Capital.Loss Hours.per.Week Native.Country
## Min. : 0 Min. : 0.0 Min. : 1.00 United-States:29170
## 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:40.00 Mexico : 643
## Median : 0 Median : 0.0 Median :40.00 ? : 583
## Mean : 1078 Mean : 87.3 Mean :40.44 Philippines : 198
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.:45.00 Germany : 137
## Max. :99999 Max. :4356.0 Max. :99.00 Canada : 121
## (Other) : 1709
## Income
## <=50K:24720
## >50K : 7841
##
##
##
##
##
# Fit a logistic regression of Income on every other column of census_data.
# NOTE(review): the printed summary reports two coefficients as not defined
# because of singularities (Education...Num and one Occupation level), so some
# predictors are redundant -- consider dropping Education...Num.
model1 <- glm(formula = Income ~ ., family = binomial, data = census_data)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the fitted model's coefficient table and deviance statistics.
summary(model1)
##
## Call:
## glm(formula = Income ~ ., family = binomial, data = census_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.0695 -0.5056 -0.1825 -0.0250 3.7473
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -8.954e+00 4.394e-01 -20.376
## Age 2.514e-02 1.647e-03 15.266
## Workclass Federal-gov 1.088e+00 1.537e-01 7.082
## Workclass Local-gov 4.067e-01 1.402e-01 2.900
## Workclass Never-worked -1.049e+01 2.717e+02 -0.039
## Workclass Private 5.910e-01 1.252e-01 4.722
## Workclass Self-emp-inc 7.591e-01 1.495e-01 5.076
## Workclass Self-emp-not-inc 9.711e-02 1.370e-01 0.709
## Workclass State-gov 2.766e-01 1.517e-01 1.824
## Workclass Without-pay -1.222e+01 1.984e+02 -0.062
## Education 11th 7.728e-02 2.107e-01 0.367
## Education 12th 4.807e-01 2.642e-01 1.820
## Education 1st-4th -5.306e-01 4.899e-01 -1.083
## Education 5th-6th -2.553e-01 3.247e-01 -0.786
## Education 7th-8th -4.853e-01 2.320e-01 -2.091
## Education 9th -2.007e-01 2.611e-01 -0.769
## Education Assoc-acdm 1.331e+00 1.763e-01 7.547
## Education Assoc-voc 1.340e+00 1.693e-01 7.913
## Education Bachelors 1.929e+00 1.574e-01 12.256
## Education Doctorate 2.984e+00 2.142e-01 13.935
## Education HS-grad 8.061e-01 1.533e-01 5.257
## Education Masters 2.278e+00 1.678e-01 13.575
## Education Preschool -2.118e+01 3.696e+02 -0.057
## Education Prof-school 2.783e+00 2.000e-01 13.918
## Education Some-college 1.152e+00 1.555e-01 7.408
## Education...Num NA NA NA
## Marital.Status Married-AF-spouse 2.696e+00 5.540e-01 4.867
## Marital.Status Married-civ-spouse 2.209e+00 2.655e-01 8.318
## Marital.Status Married-spouse-absent -2.121e-02 2.301e-01 -0.092
## Marital.Status Never-married -4.807e-01 8.747e-02 -5.495
## Marital.Status Separated -1.216e-01 1.640e-01 -0.742
## Marital.Status Widowed 1.251e-01 1.537e-01 0.814
## Occupation Adm-clerical 1.124e-01 9.913e-02 1.134
## Occupation Armed-Forces -1.002e+00 1.513e+00 -0.662
## Occupation Craft-repair 1.810e-01 8.482e-02 2.134
## Occupation Exec-managerial 8.955e-01 8.719e-02 10.271
## Occupation Farming-fishing -9.019e-01 1.421e-01 -6.349
## Occupation Handlers-cleaners -5.702e-01 1.458e-01 -3.910
## Occupation Machine-op-inspct -1.732e-01 1.061e-01 -1.633
## Occupation Other-service -7.220e-01 1.244e-01 -5.802
## Occupation Priv-house-serv -4.072e+00 1.668e+00 -2.441
## Occupation Prof-specialty 6.265e-01 9.358e-02 6.694
## Occupation Protective-serv 6.907e-01 1.303e-01 5.301
## Occupation Sales 3.907e-01 9.009e-02 4.337
## Occupation Tech-support 7.657e-01 1.193e-01 6.417
## Occupation Transport-moving NA NA NA
## Relationship Not-in-family 5.763e-01 2.627e-01 2.193
## Relationship Other-relative -3.691e-01 2.427e-01 -1.521
## Relationship Own-child -6.604e-01 2.601e-01 -2.539
## Relationship Unmarried 4.440e-01 2.787e-01 1.593
## Relationship Wife 1.362e+00 1.025e-01 13.283
## Race Asian-Pac-Islander 6.792e-01 2.695e-01 2.520
## Race Black 4.758e-01 2.322e-01 2.049
## Race Other 2.071e-01 3.541e-01 0.585
## Race White 6.160e-01 2.214e-01 2.783
## Sex Male 8.694e-01 7.913e-02 10.988
## Capital.Gain 3.196e-04 1.031e-05 31.008
## Capital.Loss 6.457e-04 3.709e-05 17.409
## Hours.per.Week 2.954e-02 1.621e-03 18.229
## Native.Country Cambodia 1.510e+00 6.340e-01 2.382
## Native.Country Canada 5.231e-01 2.949e-01 1.774
## Native.Country China -4.835e-01 3.935e-01 -1.229
## Native.Country Columbia -1.907e+00 8.250e-01 -2.312
## Native.Country Cuba 5.732e-01 3.366e-01 1.703
## Native.Country Dominican-Republic -1.645e+00 1.050e+00 -1.567
## Native.Country Ecuador -8.717e-02 7.258e-01 -0.120
## Native.Country El-Salvador -4.002e-01 4.971e-01 -0.805
## Native.Country England 4.743e-01 3.334e-01 1.423
## Native.Country France 7.745e-01 5.270e-01 1.470
## Native.Country Germany 6.218e-01 2.840e-01 2.189
## Native.Country Greece -8.340e-01 5.672e-01 -1.470
## Native.Country Guatemala -1.764e-02 7.583e-01 -0.023
## Native.Country Haiti 1.160e-01 6.859e-01 0.169
## Native.Country Holand-Netherlands -1.036e+01 8.827e+02 -0.012
## Native.Country Honduras -1.100e+00 2.429e+00 -0.453
## Native.Country Hong 1.358e-01 6.786e-01 0.200
## Native.Country Hungary 5.751e-02 7.729e-01 0.074
## Native.Country India -1.906e-01 3.285e-01 -0.580
## Native.Country Iran 2.249e-01 4.508e-01 0.499
## Native.Country Ireland 6.849e-01 6.445e-01 1.063
## Native.Country Italy 9.944e-01 3.455e-01 2.878
## Native.Country Jamaica 2.015e-01 4.630e-01 0.435
## Native.Country Japan 5.850e-01 4.191e-01 1.396
## Native.Country Laos -3.611e-01 8.576e-01 -0.421
## Native.Country Mexico -2.950e-01 2.545e-01 -1.159
## Native.Country Nicaragua -4.926e-01 7.978e-01 -0.617
## Native.Country Outlying-US(Guam-USVI-etc) -1.204e+01 2.112e+02 -0.057
## Native.Country Peru -5.865e-01 8.591e-01 -0.683
## Native.Country Philippines 6.273e-01 2.805e-01 2.236
## Native.Country Poland 1.768e-01 4.205e-01 0.421
## Native.Country Portugal 1.221e-01 6.339e-01 0.193
## Native.Country Puerto-Rico -1.433e-01 4.035e-01 -0.355
## Native.Country Scotland 1.828e-01 7.890e-01 0.232
## Native.Country South -8.763e-01 4.422e-01 -1.982
## Native.Country Taiwan 2.392e-01 4.719e-01 0.507
## Native.Country Thailand -3.578e-01 8.351e-01 -0.428
## Native.Country Trinadad&Tobago -2.083e-01 8.677e-01 -0.240
## Native.Country United-States 3.736e-01 1.380e-01 2.708
## Native.Country Vietnam -9.634e-01 6.163e-01 -1.563
## Native.Country Yugoslavia 8.936e-01 6.814e-01 1.311
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## Age < 2e-16 ***
## Workclass Federal-gov 1.42e-12 ***
## Workclass Local-gov 0.00374 **
## Workclass Never-worked 0.96921
## Workclass Private 2.34e-06 ***
## Workclass Self-emp-inc 3.85e-07 ***
## Workclass Self-emp-not-inc 0.47850
## Workclass State-gov 0.06815 .
## Workclass Without-pay 0.95089
## Education 11th 0.71378
## Education 12th 0.06877 .
## Education 1st-4th 0.27883
## Education 5th-6th 0.43166
## Education 7th-8th 0.03649 *
## Education 9th 0.44208
## Education Assoc-acdm 4.45e-14 ***
## Education Assoc-voc 2.51e-15 ***
## Education Bachelors < 2e-16 ***
## Education Doctorate < 2e-16 ***
## Education HS-grad 1.46e-07 ***
## Education Masters < 2e-16 ***
## Education Preschool 0.95430
## Education Prof-school < 2e-16 ***
## Education Some-college 1.29e-13 ***
## Education...Num NA
## Marital.Status Married-AF-spouse 1.13e-06 ***
## Marital.Status Married-civ-spouse < 2e-16 ***
## Marital.Status Married-spouse-absent 0.92656
## Marital.Status Never-married 3.90e-08 ***
## Marital.Status Separated 0.45837
## Marital.Status Widowed 0.41569
## Occupation Adm-clerical 0.25686
## Occupation Armed-Forces 0.50768
## Occupation Craft-repair 0.03283 *
## Occupation Exec-managerial < 2e-16 ***
## Occupation Farming-fishing 2.17e-10 ***
## Occupation Handlers-cleaners 9.22e-05 ***
## Occupation Machine-op-inspct 0.10256
## Occupation Other-service 6.53e-09 ***
## Occupation Priv-house-serv 0.01464 *
## Occupation Prof-specialty 2.17e-11 ***
## Occupation Protective-serv 1.15e-07 ***
## Occupation Sales 1.45e-05 ***
## Occupation Tech-support 1.39e-10 ***
## Occupation Transport-moving NA
## Relationship Not-in-family 0.02829 *
## Relationship Other-relative 0.12831
## Relationship Own-child 0.01112 *
## Relationship Unmarried 0.11115
## Relationship Wife < 2e-16 ***
## Race Asian-Pac-Islander 0.01174 *
## Race Black 0.04043 *
## Race Other 0.55858
## Race White 0.00539 **
## Sex Male < 2e-16 ***
## Capital.Gain < 2e-16 ***
## Capital.Loss < 2e-16 ***
## Hours.per.Week < 2e-16 ***
## Native.Country Cambodia 0.01723 *
## Native.Country Canada 0.07607 .
## Native.Country China 0.21920
## Native.Country Columbia 0.02077 *
## Native.Country Cuba 0.08860 .
## Native.Country Dominican-Republic 0.11720
## Native.Country Ecuador 0.90440
## Native.Country El-Salvador 0.42076
## Native.Country England 0.15479
## Native.Country France 0.14166
## Native.Country Germany 0.02857 *
## Native.Country Greece 0.14144
## Native.Country Guatemala 0.98144
## Native.Country Haiti 0.86565
## Native.Country Holand-Netherlands 0.99064
## Native.Country Honduras 0.65070
## Native.Country Hong 0.84134
## Native.Country Hungary 0.94069
## Native.Country India 0.56172
## Native.Country Iran 0.61785
## Native.Country Ireland 0.28793
## Native.Country Italy 0.00400 **
## Native.Country Jamaica 0.66343
## Native.Country Japan 0.16277
## Native.Country Laos 0.67369
## Native.Country Mexico 0.24627
## Native.Country Nicaragua 0.53696
## Native.Country Outlying-US(Guam-USVI-etc) 0.95454
## Native.Country Peru 0.49481
## Native.Country Philippines 0.02534 *
## Native.Country Poland 0.67408
## Native.Country Portugal 0.84726
## Native.Country Puerto-Rico 0.72249
## Native.Country Scotland 0.81674
## Native.Country South 0.04751 *
## Native.Country Taiwan 0.61221
## Native.Country Thailand 0.66834
## Native.Country Trinadad&Tobago 0.81031
## Native.Country United-States 0.00678 **
## Native.Country Vietnam 0.11797
## Native.Country Yugoslavia 0.18974
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 35948 on 32560 degrees of freedom
## Residual deviance: 20582 on 32463 degrees of freedom
## AIC: 20778
##
## Number of Fisher Scoring iterations: 13
# Class balance of the Income response in the test data.
table(census_test_data[["Income"]])
##
## <=50K. >50K.
## 12435 3846
# Score the test set with the fitted model on the response scale.
PredictModel <- predict(model1, type = "response", newdata = census_test_data)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Confusion matrix at a 0.3 cut-off: rows are the actual income class, columns
# flag whether the predicted value reaches the cut-off.
# Unused alternative kept for reference, which builds string labels instead:
#   pred_m <- rep('<=50K', length(PredictModel))
#   pred_m[PredictModel>=.3] <- '>50K'
confusionmatrix_LR <- table(census_test_data[["Income"]], PredictModel >= 0.3)
confusionmatrix_LR
##
## FALSE TRUE
## <=50K. 10474 1961
## >50K. 844 3002
# Accuracy: correctly classified rows (the diagonal) over all rows.
Accuracy <- sum(diag(confusionmatrix_LR)) / sum(confusionmatrix_LR)
Accuracy
## [1] 0.8277133
# True positive rate: share of row 2 (actual >50K) that falls in column 2 (TRUE).
tpr <- confusionmatrix_LR[2, 2] / sum(confusionmatrix_LR[2, ])
tpr
## [1] 0.7805512
# False positive rate: share of row 1 (actual <=50K) that falls in column 2 (TRUE).
fpr <- confusionmatrix_LR[1, 2] / sum(confusionmatrix_LR[1, ])
fpr
## [1] 0.1577
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.2
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Build a ROCR prediction object from the model scores and the actual test
# labels, plot true positive rate against false positive rate, and print the AUC.
pred1 <- prediction(predictions = PredictModel, labels = census_test_data$Income)
perf <- performance(pred1, measure = "tpr", x.measure = "fpr")
plot(perf)
as.numeric(slot(performance(pred1, measure = "auc"), "y.values"))
## [1] 0.9040029