\[ lm(horsepower ~ Displacement + Acceleration) \]
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.5.3
mpgtrain <- read.csv("auto_mpg_train.csv",header=TRUE)
attach(mpgtrain)
pairs(mpgtrain[1:7])
model1 <- lm(mpg~horsepower+weight+model.year+origin)
summary(model1)
##
## Call:
## lm(formula = mpg ~ horsepower + weight + model.year + origin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0372 -2.1947 -0.1314 1.7328 13.1326
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.447e+01 4.385e+00 -3.299 0.00107 **
## horsepower -1.314e-02 1.030e-02 -1.276 0.20295
## weight -5.668e-03 4.988e-04 -11.364 < 2e-16 ***
## model.year 7.175e-01 5.370e-02 13.362 < 2e-16 ***
## origin 1.116e+00 2.819e-01 3.959 9.15e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.454 on 343 degrees of freedom
## Multiple R-squared: 0.8158, Adjusted R-squared: 0.8136
## F-statistic: 379.7 on 4 and 343 DF, p-value: < 2.2e-16
1). This value shows the amount of variability in the estimated response variable that is explained by the model. In this case, almost 82% of mpg can be explained by independent variables cylinders, displacement, horsepower and weight. In multiple linear regression (i.e. when the number of variables is more than one), we need to adjust \(R^2\) according to the number of modeled variables because as the number of variable increases the \(R^2\) also grows regardless of the model being improved or not. As a result, the adjusted \(R^2\) would be a more accurate metric than the original \(R^2\).autotest <- read.csv("auto_mpg_test.csv",header = TRUE)
autotest$y <- predict(model1,autotest) # adding a new column conaining predicted value
attach(autotest)
## The following objects are masked from mpgtrain:
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
cat("Mean Square error based on Model:",mse(mpg,y))
## Mean Square error based on Model: 6.767472
attach(mpgtrain)
## The following objects are masked from autotest:
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
## The following objects are masked from mpgtrain (pos = 5):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
model2 <- lm(mpg ~ weight+model.year+origin)
summary(model2)
##
## Call:
## lm(formula = mpg ~ weight + model.year + origin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1389 -2.1770 -0.0552 1.7956 13.1857
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.602e+01 4.216e+00 -3.801 0.000170 ***
## weight -6.195e-03 2.791e-04 -22.200 < 2e-16 ***
## model.year 7.412e-01 5.041e-02 14.704 < 2e-16 ***
## origin 1.070e+00 2.799e-01 3.825 0.000155 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.457 on 344 degrees of freedom
## Multiple R-squared: 0.8149, Adjusted R-squared: 0.8133
## F-statistic: 504.8 on 3 and 344 DF, p-value: < 2.2e-16
autotest$y2 <- predict(model2,autotest)
attach(autotest)
## The following objects are masked from mpgtrain (pos = 3):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
## The following objects are masked from autotest (pos = 5):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight, y
## The following objects are masked from mpgtrain (pos = 6):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
cat("Mean Square error based on Model:",mse(mpg,y2))
## Mean Square error based on Model: 6.530443
mpgtrain$log_weight <- log(mpgtrain$weight,2) # creating a new column that contains log weight
autotest$log_weight <- log(autotest$weight,2) # creating a new column that contains log weight
attach(mpgtrain)
## The following objects are masked from autotest (pos = 3):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
## The following objects are masked from mpgtrain (pos = 4):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
## The following objects are masked from autotest (pos = 6):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
## The following objects are masked from mpgtrain (pos = 7):
##
## acceleration, car.name, cylinders, displacement, horsepower,
## model.year, mpg, origin, weight
model3 <- lm(mpg ~ log_weight+model.year+origin)
summary(model3)
##
## Call:
## lm(formula = mpg ~ log_weight + model.year + origin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1181 -2.0649 -0.0322 1.7316 12.9940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120.85808 8.17108 14.791 < 2e-16 ***
## log_weight -13.63674 0.55080 -24.758 < 2e-16 ***
## model.year 0.76540 0.04675 16.371 < 2e-16 ***
## origin 0.70236 0.26672 2.633 0.00884 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.233 on 344 degrees of freedom
## Multiple R-squared: 0.8381, Adjusted R-squared: 0.8367
## F-statistic: 593.7 on 3 and 344 DF, p-value: < 2.2e-16
mse(autotest$mpg,predict(model3,autotest))
## [1] 5.042397
incometrain <- read.csv("adult_income_train.csv",header = TRUE)
# Replacing "?" with "Missing Informative"
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.3
##
## Attaching package: 'plyr'
## The following object is masked from 'package:faraway':
##
## ozone
incometrain$workclass <- revalue(incometrain$workclass, c("?"="Missing Informative"))
incometrain$occupation <- revalue(incometrain$occupation, c("?"="Missing Informative"))
incometrain$native_country <- revalue(incometrain$native_country, c("?"="Missing Informative"))
model4 <- glm(income~.,family=binomial, data=incometrain)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model4)
##
## Call:
## glm(formula = income ~ ., family = binomial, data = incometrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.1131 -0.5027 -0.1823 -0.0336 3.8667
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -8.983e+00 3.817e-01 -23.534
## age 2.493e-02 1.421e-03 17.547
## workclassFederal-gov 1.199e+00 1.312e-01 9.142
## workclassLocal-gov 5.072e-01 1.191e-01 4.258
## workclassNever-worked -8.058e+00 8.524e+01 -0.095
## workclassPrivate 6.753e-01 1.056e-01 6.396
## workclassSelf-emp-inc 8.304e-01 1.272e-01 6.528
## workclassSelf-emp-not-inc 1.356e-01 1.160e-01 1.169
## workclassState-gov 3.109e-01 1.295e-01 2.401
## workclassWithout-pay -2.029e-01 7.916e-01 -0.256
## fnlwgt 7.803e-07 1.473e-07 5.298
## education11th 4.803e-02 1.834e-01 0.262
## education12th 4.517e-01 2.274e-01 1.987
## education1st-4th -6.381e-01 4.437e-01 -1.438
## education5th-6th -3.744e-01 2.842e-01 -1.317
## education7th-8th -4.565e-01 2.001e-01 -2.281
## education9th -1.979e-01 2.244e-01 -0.882
## educationAssoc-acdm 1.370e+00 1.525e-01 8.985
## educationAssoc-voc 1.297e+00 1.473e-01 8.808
## educationBachelors 1.930e+00 1.368e-01 14.113
## educationDoctorate 2.871e+00 1.849e-01 15.524
## educationHS-grad 8.032e-01 1.333e-01 6.027
## educationMasters 2.265e+00 1.454e-01 15.578
## educationPreschool -5.110e+00 3.713e+00 -1.376
## educationProf-school 2.782e+00 1.739e-01 16.001
## educationSome-college 1.168e+00 1.352e-01 8.642
## educational_num NA NA NA
## marital_statusMarried-AF-spouse 2.484e+00 4.762e-01 5.216
## marital_statusMarried-civ-spouse 2.324e+00 2.318e-01 10.028
## marital_statusMarried-spouse-absent 1.221e-01 1.898e-01 0.643
## marital_statusNever-married -4.299e-01 7.584e-02 -5.668
## marital_statusSeparated -1.449e-01 1.446e-01 -1.002
## marital_statusWidowed 8.315e-02 1.355e-01 0.613
## occupationAdm-clerical 9.467e-02 8.477e-02 1.117
## occupationArmed-Forces 3.553e-01 9.076e-01 0.391
## occupationCraft-repair 1.303e-01 7.292e-02 1.787
## occupationExec-managerial 8.604e-01 7.496e-02 11.477
## occupationFarming-fishing -8.822e-01 1.231e-01 -7.166
## occupationHandlers-cleaners -6.438e-01 1.247e-01 -5.161
## occupationMachine-op-inspct -1.953e-01 9.134e-02 -2.138
## occupationOther-service -7.800e-01 1.071e-01 -7.280
## occupationPriv-house-serv -2.508e+00 1.007e+00 -2.490
## occupationProf-specialty 6.173e-01 8.039e-02 7.679
## occupationProtective-serv 5.778e-01 1.130e-01 5.115
## occupationSales 3.531e-01 7.737e-02 4.564
## occupationTech-support 6.917e-01 1.018e-01 6.797
## occupationTransport-moving NA NA NA
## relationshipNot-in-family 5.902e-01 2.294e-01 2.573
## relationshipOther-relative -4.475e-01 2.163e-01 -2.069
## relationshipOwn-child -5.141e-01 2.249e-01 -2.286
## relationshipUnmarried 4.186e-01 2.437e-01 1.718
## relationshipWife 1.207e+00 8.796e-02 13.727
## raceAsian-Pac-Islander 8.455e-01 2.338e-01 3.616
## raceBlack 4.001e-01 2.033e-01 1.968
## raceOther 4.883e-01 2.904e-01 1.681
## raceWhite 6.173e-01 1.934e-01 3.192
## genderMale 7.743e-01 6.793e-02 11.398
## capital_gain 3.231e-04 9.041e-06 35.736
## capital_loss 6.397e-04 3.199e-05 19.999
## hours_per_week 2.867e-02 1.382e-03 20.745
## native_countryCambodia 9.898e-01 5.506e-01 1.798
## native_countryCanada 6.812e-01 2.420e-01 2.815
## native_countryChina -7.025e-01 3.243e-01 -2.167
## native_countryColumbia -2.253e+00 7.956e-01 -2.832
## native_countryCuba 3.318e-01 2.955e-01 1.123
## native_countryDominican-Republic -1.551e+00 7.610e-01 -2.038
## native_countryEcuador -4.960e-01 6.298e-01 -0.788
## native_countryEl-Salvador -6.264e-01 4.477e-01 -1.399
## native_countryEngland 5.152e-01 2.980e-01 1.729
## native_countryFrance 8.185e-01 4.584e-01 1.786
## native_countryGermany 2.507e-01 2.520e-01 0.995
## native_countryGreece -2.283e-01 4.052e-01 -0.563
## native_countryGuatemala -3.188e-01 7.477e-01 -0.426
## native_countryHaiti 1.927e-01 5.078e-01 0.379
## native_countryHoland-Netherlands -8.348e+00 3.247e+02 -0.026
## native_countryHonduras -1.413e+00 2.105e+00 -0.671
## native_countryHong -4.326e-01 5.976e-01 -0.724
## native_countryHungary 4.144e-01 6.326e-01 0.655
## native_countryIndia -2.766e-01 2.836e-01 -0.975
## native_countryIran 2.927e-01 4.009e-01 0.730
## native_countryIreland 1.276e+00 5.018e-01 2.542
## native_countryItaly 7.790e-01 2.991e-01 2.605
## native_countryJamaica 2.015e-01 4.142e-01 0.486
## native_countryJapan -6.937e-02 3.474e-01 -0.200
## native_countryLaos -1.304e+00 8.638e-01 -1.510
## native_countryMexico -5.924e-01 2.234e-01 -2.651
## native_countryNicaragua -9.403e-01 7.831e-01 -1.201
## native_countryOutlying-US(Guam-USVI-etc) -7.466e-01 1.080e+00 -0.692
## native_countryPeru -6.493e-01 6.353e-01 -1.022
## native_countryPhilippines 2.360e-01 2.417e-01 0.976
## native_countryPoland -2.304e-02 3.639e-01 -0.063
## native_countryPortugal 6.013e-01 4.451e-01 1.351
## native_countryPuerto-Rico -1.232e-01 3.323e-01 -0.371
## native_countryScotland -1.595e-01 7.555e-01 -0.211
## native_countrySouth -1.147e+00 3.835e-01 -2.991
## native_countryTaiwan -2.788e-02 4.148e-01 -0.067
## native_countryThailand -7.829e-01 6.973e-01 -1.123
## native_countryTrinadad&Tobago -1.156e+00 8.340e-01 -1.386
## native_countryUnited-States 2.445e-01 1.135e-01 2.155
## native_countryVietnam -9.150e-01 5.077e-01 -1.802
## native_countryYugoslavia 7.909e-01 6.123e-01 1.292
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## age < 2e-16 ***
## workclassFederal-gov < 2e-16 ***
## workclassLocal-gov 2.06e-05 ***
## workclassNever-worked 0.924684
## workclassPrivate 1.60e-10 ***
## workclassSelf-emp-inc 6.68e-11 ***
## workclassSelf-emp-not-inc 0.242303
## workclassState-gov 0.016340 *
## workclassWithout-pay 0.797719
## fnlwgt 1.17e-07 ***
## education11th 0.793465
## education12th 0.046958 *
## education1st-4th 0.150368
## education5th-6th 0.187776
## education7th-8th 0.022567 *
## education9th 0.377763
## educationAssoc-acdm < 2e-16 ***
## educationAssoc-voc < 2e-16 ***
## educationBachelors < 2e-16 ***
## educationDoctorate < 2e-16 ***
## educationHS-grad 1.67e-09 ***
## educationMasters < 2e-16 ***
## educationPreschool 0.168785
## educationProf-school < 2e-16 ***
## educationSome-college < 2e-16 ***
## educational_num NA
## marital_statusMarried-AF-spouse 1.83e-07 ***
## marital_statusMarried-civ-spouse < 2e-16 ***
## marital_statusMarried-spouse-absent 0.520216
## marital_statusNever-married 1.44e-08 ***
## marital_statusSeparated 0.316373
## marital_statusWidowed 0.539548
## occupationAdm-clerical 0.264077
## occupationArmed-Forces 0.695457
## occupationCraft-repair 0.074013 .
## occupationExec-managerial < 2e-16 ***
## occupationFarming-fishing 7.74e-13 ***
## occupationHandlers-cleaners 2.46e-07 ***
## occupationMachine-op-inspct 0.032506 *
## occupationOther-service 3.34e-13 ***
## occupationPriv-house-serv 0.012767 *
## occupationProf-specialty 1.61e-14 ***
## occupationProtective-serv 3.14e-07 ***
## occupationSales 5.03e-06 ***
## occupationTech-support 1.07e-11 ***
## occupationTransport-moving NA
## relationshipNot-in-family 0.010078 *
## relationshipOther-relative 0.038592 *
## relationshipOwn-child 0.022251 *
## relationshipUnmarried 0.085822 .
## relationshipWife < 2e-16 ***
## raceAsian-Pac-Islander 0.000299 ***
## raceBlack 0.049111 *
## raceOther 0.092677 .
## raceWhite 0.001414 **
## genderMale < 2e-16 ***
## capital_gain < 2e-16 ***
## capital_loss < 2e-16 ***
## hours_per_week < 2e-16 ***
## native_countryCambodia 0.072240 .
## native_countryCanada 0.004880 **
## native_countryChina 0.030269 *
## native_countryColumbia 0.004622 **
## native_countryCuba 0.261368
## native_countryDominican-Republic 0.041571 *
## native_countryEcuador 0.430972
## native_countryEl-Salvador 0.161737
## native_countryEngland 0.083770 .
## native_countryFrance 0.074147 .
## native_countryGermany 0.319802
## native_countryGreece 0.573207
## native_countryGuatemala 0.669817
## native_countryHaiti 0.704402
## native_countryHoland-Netherlands 0.979492
## native_countryHonduras 0.501972
## native_countryHong 0.469151
## native_countryHungary 0.512433
## native_countryIndia 0.329360
## native_countryIran 0.465257
## native_countryIreland 0.011025 *
## native_countryItaly 0.009196 **
## native_countryJamaica 0.626736
## native_countryJapan 0.841702
## native_countryLaos 0.131047
## native_countryMexico 0.008020 **
## native_countryNicaragua 0.229859
## native_countryOutlying-US(Guam-USVI-etc) 0.489229
## native_countryPeru 0.306739
## native_countryPhilippines 0.328968
## native_countryPoland 0.949509
## native_countryPortugal 0.176764
## native_countryPuerto-Rico 0.710764
## native_countryScotland 0.832756
## native_countrySouth 0.002779 **
## native_countryTaiwan 0.946418
## native_countryThailand 0.261537
## native_countryTrinadad&Tobago 0.165765
## native_countryUnited-States 0.031189 *
## native_countryVietnam 0.071501 .
## native_countryYugoslavia 0.196480
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 48173 on 43841 degrees of freedom
## Residual deviance: 27615 on 43743 degrees of freedom
## AIC: 27813
##
## Number of Fisher Scoring iterations: 11
incometest <- read.csv("adult_income_test.csv",header=TRUE)
incometest$incomepred <- round(predict(model4,incometest,type = 'response'))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Confusion Matrix
confusion.matrix <- as.matrix(table('Actual'=incometest$income,'Prediction'=(incometest$incomepred)))
confusion.matrix
## Prediction
## Actual 0 1
## <=50K 3507 264
## >50K 493 736
N <- nrow(incometest)
diag <- diag(confusion.matrix)
Accuracy <- sum(diag)/N
round(Accuracy*100,2)
## [1] 84.86
#precision, accuracy and recall
rowsums = apply(confusion.matrix, 1, sum)
colsums = apply(confusion.matrix, 2, sum)
Actual.Dist = rowsums / N
Predicted.Dist= colsums / N
Precision = diag / colsums
Recall = diag / rowsums
F1 = 2 * Precision * Recall / (Precision + Recall)
round(data.frame(Precision, Recall, F1, Actual.Dist, Predicted.Dist)*100,2)
## Precision Recall F1 Actual.Dist Predicted.Dist
## 0 87.67 93.00 90.26 75.42 80
## 1 73.60 59.89 66.04 24.58 20
library(plyr)
incomenew <- read.csv("adult_income_train.csv",header = TRUE)
incomenew$workclass <- revalue(incomenew$workclass, c("?"="Missing Informative","Never-worked"="Other","Without-pay"= "Other","Self-emp-not-inc"="Other","State-gov"="Other"))
incomenew$education <- revalue(incomenew$education, c("11th"="Other", "12th"= "Other", "1st-4th"= "Other","5th-6th"= "Other", "7th-8th"= "Other", "9th"= "Other","Preschool"= "Other"))
incomenew$marital_status <- revalue(incomenew$marital_status, c("Separated"="Other", "Widowed"= "Other","Never-married"="Other","Married-spouse-absent"="Other"))
incomenew$occupation <- revalue(incomenew$occupation, c("?"="Missing Informative","Adm-clerical"="Other","Armed-Forces"= "Other", "Craft-repair"="Other","Machine-op-inspct"="Other","Priv-house-serv" = "Other", "Transport-moving" = "Other"))
incomenew$relationship <- revalue(incomenew$relationship, c("Not-in-family"="Other", "Other-relative"= "Other", "Own-child"="Other", "Unmarried"= "Other"))
incomenew$race <- revalue(incomenew$race, c("Black"="Other", "White"= "Other"))
incomenew$native_country <- revalue(incomenew$native_country, c("?"="Missing Informative", "Cambodia"="Other", "China"="Other","Cuba"="Other", "Dominican-Republic"="Other", "Ecuador"="Other", "El-Salvador"="Other", "England"="Other", "France"="Other", "Germany"="Other", "Greece"="Other", "Guatemala"="Other", "Haiti"="Other", "Holand-Netherlands"="Other", "Honduras"="Other", "Hong"="Other", "Hungary"="Other", "India"="Other", "Iran"="Other", "Ireland"="Other", "Jamaica"="Other", "Japan"="Other", "Laos"="Other", "Nicaragua"="Other", "Outlying-US(Guam-USVI-etc)"="Other", "Peru"="Other", "Philippines"="Other", "Poland"="Other", "Portugal"="Other", "Puerto-Rico"="Other", "Scotland"="Other", "Taiwan"="Other", "Thailand"="Other", "Trinadad&Tobago"="Other", "United-States"="Other", "Vietnam"="Other", "Yugoslavia"="Other"))
model5 <- glm(income~.,family=binomial, data=incomenew)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model5)
##
## Call:
## glm(formula = income ~ ., family = binomial, data = incomenew)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.1311 -0.4979 -0.2017 -0.0513 3.6621
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.419e+00 4.364e-01 -21.586 < 2e-16
## age 2.797e-02 1.364e-03 20.507 < 2e-16
## workclassFederal-gov -7.296e+00 8.776e+01 -0.083 0.933741
## workclassLocal-gov -8.009e+00 8.776e+01 -0.091 0.927290
## workclassOther -8.311e+00 8.776e+01 -0.095 0.924548
## workclassPrivate -7.838e+00 8.776e+01 -0.089 0.928835
## workclassSelf-emp-inc -7.670e+00 8.776e+01 -0.087 0.930353
## fnlwgt 6.838e-07 1.455e-07 4.701 2.59e-06
## educationOther -7.154e-02 1.504e-01 -0.476 0.634276
## educationAssoc-acdm 2.722e-01 2.903e-01 0.938 0.348351
## educationAssoc-voc 3.860e-01 2.531e-01 1.525 0.127163
## educationBachelors 6.299e-01 3.193e-01 1.973 0.048537
## educationDoctorate 1.007e+00 4.531e-01 2.223 0.026234
## educationHS-grad 2.496e-01 1.816e-01 1.374 0.169328
## educationMasters 7.737e-01 3.610e-01 2.143 0.032100
## educationProf-school 1.072e+00 4.105e-01 2.610 0.009048
## educationSome-college 4.317e-01 2.129e-01 2.028 0.042555
## educational_num 1.926e-01 4.141e-02 4.651 3.31e-06
## marital_statusMarried-AF-spouse 1.688e+00 4.571e-01 3.692 0.000222
## marital_statusMarried-civ-spouse 1.462e+00 1.890e-01 7.739 1.00e-14
## marital_statusOther -4.048e-01 6.699e-02 -6.043 1.51e-09
## occupationOther 8.584e+00 8.776e+01 0.098 0.922083
## occupationExec-managerial 9.405e+00 8.776e+01 0.107 0.914655
## occupationFarming-fishing 7.614e+00 8.776e+01 0.087 0.930862
## occupationHandlers-cleaners 7.874e+00 8.776e+01 0.090 0.928504
## occupationOther-service 7.725e+00 8.776e+01 0.088 0.929859
## occupationProf-specialty 9.171e+00 8.776e+01 0.105 0.916772
## occupationProtective-serv 9.130e+00 8.776e+01 0.104 0.917143
## occupationSales 8.884e+00 8.776e+01 0.101 0.919364
## occupationTech-support 9.229e+00 8.776e+01 0.105 0.916249
## relationshipOther -4.034e-01 1.839e-01 -2.194 0.028242
## relationshipWife 1.173e+00 8.537e-02 13.743 < 2e-16
## raceAsian-Pac-Islander 5.503e-01 2.118e-01 2.599 0.009357
## raceOther 5.837e-01 1.932e-01 3.021 0.002523
## genderMale 7.439e-01 6.439e-02 11.552 < 2e-16
## capital_gain 3.236e-04 8.979e-06 36.035 < 2e-16
## capital_loss 6.435e-04 3.194e-05 20.146 < 2e-16
## hours_per_week 2.967e-02 1.361e-03 21.795 < 2e-16
## native_countryOther 2.110e-01 1.116e-01 1.891 0.058667
## native_countryCanada 6.847e-01 2.422e-01 2.827 0.004698
## native_countryColumbia -2.266e+00 7.948e-01 -2.851 0.004354
## native_countryItaly 7.880e-01 2.999e-01 2.627 0.008605
## native_countryMexico -5.424e-01 2.212e-01 -2.452 0.014210
## native_countrySouth -9.433e-01 3.738e-01 -2.524 0.011614
##
## (Intercept) ***
## age ***
## workclassFederal-gov
## workclassLocal-gov
## workclassOther
## workclassPrivate
## workclassSelf-emp-inc
## fnlwgt ***
## educationOther
## educationAssoc-acdm
## educationAssoc-voc
## educationBachelors *
## educationDoctorate *
## educationHS-grad
## educationMasters *
## educationProf-school **
## educationSome-college *
## educational_num ***
## marital_statusMarried-AF-spouse ***
## marital_statusMarried-civ-spouse ***
## marital_statusOther ***
## occupationOther
## occupationExec-managerial
## occupationFarming-fishing
## occupationHandlers-cleaners
## occupationOther-service
## occupationProf-specialty
## occupationProtective-serv
## occupationSales
## occupationTech-support
## relationshipOther *
## relationshipWife ***
## raceAsian-Pac-Islander **
## raceOther **
## genderMale ***
## capital_gain ***
## capital_loss ***
## hours_per_week ***
## native_countryOther .
## native_countryCanada **
## native_countryColumbia **
## native_countryItaly **
## native_countryMexico *
## native_countrySouth *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 48173 on 43841 degrees of freedom
## Residual deviance: 27871 on 43798 degrees of freedom
## AIC: 27959
##
## Number of Fisher Scoring iterations: 11
incometest2 <- read.csv("adult_income_test.csv",header=TRUE)
incometest2$workclass <- revalue(incometest2$workclass, c("Without-pay"= "Other","Self-emp-not-inc"="Other","State-gov"="Other"))
incometest2$education <- revalue(incometest2$education, c("11th"="Other", "12th"= "Other", "1st-4th"= "Other","5th-6th"= "Other", "7th-8th"= "Other", "9th"= "Other","Preschool"= "Other"))
incometest2$marital_status <- revalue(incometest2$marital_status, c("Separated"="Other", "Widowed"= "Other","Never-married"="Other","Married-spouse-absent"="Other"))
incometest2$occupation <- revalue(incometest2$occupation, c("Adm-clerical"="Other","Armed-Forces"= "Other", "Craft-repair"="Other","Machine-op-inspct"="Other","Priv-house-serv" = "Other", "Transport-moving" = "Other"))
incometest2$relationship <- revalue(incometest2$relationship, c("Not-in-family"="Other", "Other-relative"= "Other", "Own-child"="Other", "Unmarried"= "Other"))
incometest2$race <- revalue(incometest2$race, c("Black"="Other", "White"= "Other"))
incometest2$native_country <- revalue(incometest2$native_country, c("Cambodia"="Other", "China"="Other","Cuba"="Other", "Dominican-Republic"="Other", "Ecuador"="Other", "El-Salvador"="Other", "England"="Other", "France"="Other", "Germany"="Other", "Greece"="Other", "Guatemala"="Other", "Haiti"="Other", "Honduras"="Other", "Hong"="Other", "Hungary"="Other", "India"="Other", "Iran"="Other", "Ireland"="Other", "Jamaica"="Other", "Japan"="Other", "Laos"="Other", "Nicaragua"="Other", "Outlying-US(Guam-USVI-etc)"="Other", "Peru"="Other", "Philippines"="Other", "Poland"="Other", "Portugal"="Other", "Puerto-Rico"="Other", "Scotland"="Other", "Taiwan"="Other", "Thailand"="Other", "Trinadad&Tobago"="Other", "United-States"="Other", "Vietnam"="Other", "Yugoslavia"="Other"))
incometest2$incomepred <- round(predict(model5,incometest2,type = 'response'))
# Confusion Matrix
confusion.matrix <- as.matrix(table('Actual'=incometest2$income,'Prediction'=(incometest2$incomepred)))
confusion.matrix
## Prediction
## Actual 0 1
## <=50K 3500 271
## >50K 503 726
N <- nrow(incometest2)
diag <- diag(confusion.matrix)
Accuracy <- sum(diag)/N
round(Accuracy*100,2)
## [1] 84.52