Use LogisticRegression and CART methods to predict whether an individual’s earnings are above $50,000 using the other variables. Split the data set into train and test (60% train, 40% test) For test datasets use a threshold value of .5 while predicting Calculate accuracy of all two models
library(xlsx)
library(rJava)
library(xlsxjars)
library(caret)
library(dplyr)
library(rpart)
library(rpart.plot)
census<-read.csv2("/Users/apple/Desktop/census.csv",sep=",",head=T,na.strings = "NA",fill = T);
This is very essential for preprocessing the data.
DATA<-census
summary(DATA)
## age workclass education
## Min. :17.00 Private :22286 HS-grad :10368
## 1st Qu.:28.00 Self-emp-not-inc: 2499 Some-college: 7187
## Median :37.00 Local-gov : 2067 Bachelors : 5210
## Mean :38.58 ? : 1809 Masters : 1674
## 3rd Qu.:48.00 State-gov : 1279 Assoc-voc : 1366
## Max. :90.00 Self-emp-inc : 1074 11th : 1167
## (Other) : 964 (Other) : 5006
## educationnum maritalstatus occupation
## Min. : 1.00 Divorced : 4394 Prof-specialty :4038
## 1st Qu.: 9.00 Married-AF-spouse : 23 Craft-repair :4030
## Median :10.00 Married-civ-spouse :14692 Exec-managerial:3992
## Mean :10.07 Married-spouse-absent: 397 Adm-clerical :3721
## 3rd Qu.:12.00 Never-married :10488 Sales :3584
## Max. :16.00 Separated : 1005 Other-service :3212
## Widowed : 979 (Other) :9401
## relationship race sex
## Husband :12947 Amer-Indian-Eskimo: 311 Female:10608
## Not-in-family : 8156 Asian-Pac-Islander: 956 Male :21370
## Other-relative: 952 Black : 3028
## Own-child : 5005 Other : 253
## Unmarried : 3384 White :27430
## Wife : 1534
##
## capitalgain capitalloss hoursperweek nativecountry
## Min. : 0 Min. : 0.00 Min. : 1.00 United-States:29170
## 1st Qu.: 0 1st Qu.: 0.00 1st Qu.:40.00 Mexico : 643
## Median : 0 Median : 0.00 Median :40.00 Philippines : 198
## Mean : 1064 Mean : 86.74 Mean :40.42 Germany : 137
## 3rd Qu.: 0 3rd Qu.: 0.00 3rd Qu.:45.00 Canada : 121
## Max. :99999 Max. :4356.00 Max. :99.00 Puerto-Rico : 114
## (Other) : 1595
## over50k
## <=50K:24283
## >50K : 7695
##
##
##
##
##
head(DATA,5)
## age workclass education educationnum maritalstatus
## 1 39 State-gov Bachelors 13 Never-married
## 2 50 Self-emp-not-inc Bachelors 13 Married-civ-spouse
## 3 38 Private HS-grad 9 Divorced
## 4 53 Private 11th 7 Married-civ-spouse
## 5 28 Private Bachelors 13 Married-civ-spouse
## occupation relationship race sex capitalgain capitalloss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## hoursperweek nativecountry over50k
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
60% of the data is the training set and 40% is the test set.
inTrain <- createDataPartition( y = DATA$over50k, p = 0.6, list = FALSE)
training <- DATA[inTrain, ]
testing <- DATA[-inTrain,]
We see a correlation between the diffent parameters and the wage.
featurePlot(x=training[,c("age", "educationnum", "occupation")],
y = training$over50k,
plot = "pairs")
#age vs wage
qplot(age, over50k, data = training)
qplot(age, over50k, colour = education, data = training)
#Density plots
qplot(over50k, colour = education, data = training, geom="density")
logist <- glm(over50k ~., data = training, family = 'binomial')
prediction <- predict(logist, type="response", newdata = testing)
output <- table(prediction > 0.5, testing$over50k)
accuracy <- sum(diag(output)) / (sum(output))
accuracy
## [1] 0.8521617
CARTmodel = rpart(over50k ~. , data=training, method="class")
prediction2 <- predict(CARTmodel, type = "class", newdata = testing)
output2 <- table(prediction2, testing$over50k)
accuracy2 <- sum(diag(output2)) / (sum(output2))
accuracy2
## [1] 0.8452818
#the answer is:
ifelse(accuracy>accuracy2,"Logistic Regression is more accurate","CART is more accurate")
## [1] "Logistic Regression is more accurate"