library(rpart)
library(caret)
Loading required package: lattice
Loading required package: ggplot2
library(rpart.plot)
library(ROCR)
Loading required package: gplots
package ‘gplots’ was built under R version 3.5.2
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
rm(list=ls())
setwd("/Users/jayavarshini/Desktop/ms/sem5/dm/assing2//")
set.seed(1122)
# Load the Adult dataset
train <- read.csv("adult-train.csv", header=T, sep=",", comment.char = '#')
test <- read.csv("adult-test.csv", header=T, sep=",", comment.char = '#')
head(train)
# Check every column of the train and test datasets for "?" values
# (a compact one-line sapply alternative is sketched after these checks)
# Columns containing "?": workclass = 1836, occupation = 1843, native_country = 583
print(sum(train$age=="?"))
[1] 0
print(sum(train$workclass=="?"))
[1] 1836
print(sum(train$fnlwgt=="?"))
[1] 0
print(sum(train$education=="?"))
[1] 0
print(sum(train$education_num=="?"))
[1] 0
print(sum(train$marital_status=="?"))
[1] 0
print(sum(train$occupation=="?"))
[1] 1843
print(sum(train$relationship=="?"))
[1] 0
print(sum(train$race=="?"))
[1] 0
print(sum(train$capital_gain=="?"))
[1] 0
print(sum(train$capital_loss=="?"))
[1] 0
print(sum(train$hours_per_week=="?"))
[1] 0
print(sum(train$native_country=="?"))
[1] 583
print(sum(train$income=="?"))
[1] 0
# Row counts for the "?" entries: only workclass, occupation and native_country
# contain any; all other columns return zero rows
print(nrow(train[which(train$workclass %in% "?"), ]))
print(nrow(train[which(train$occupation %in% "?"), ]))
print(nrow(train[which(train$native_country %in% "?"), ]))
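The same check can be done more compactly; a minimal sketch (same logic, one call over all columns):
# Sketch: count "?" entries in every column of the training data at once
sapply(train, function(col) sum(col == "?"))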
# Remove rows containing "?" (an equivalent dplyr filter is sketched below)
train_df <- train[which(!train$native_country %in% "?"), ]
train_df <- train_df[which(!train_df$occupation %in% "?"), ]
train_df <- train_df[which(!train_df$workclass %in% "?"), ]
head(train_df)
test_df <- test[which(!test$native_country %in% "?"), ]
test_df <- test_df[which(!test_df$occupation %in% "?"), ]
test_df <- test_df[which(!test_df$workclass %in% "?"), ]
head(test_df)
print(sum(test_df$occupation=="?"))
[1] 7
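Since dplyr is already attached, the same row removal could also be written as a single chained filter; a sketch of that equivalent approach:
# Sketch: drop every row that has "?" in any of the three affected columns
train_df <- train %>% filter(workclass != "?", occupation != "?", native_country != "?")
test_df  <- test  %>% filter(workclass != "?", occupation != "?", native_country != "?")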
decision_tree <- rpart(income~.,data=train_df,method="class")
rpart.plot(decision_tree,extra=110,fallen.leaves = T,type=4)
summary(decision_tree)
Call:
rpart(formula = income ~ ., data = train_df, method = "class")
n= 30724
CP nsplit rel error xerror xstd
1 0.12980392 0 1.0000000 1.0000000 0.009908133
2 0.06379085 2 0.7403922 0.7403922 0.008884886
3 0.03725490 3 0.6766013 0.6766013 0.008575812
4 0.01000000 4 0.6393464 0.6393464 0.008382739
Variable importance
relationship marital_status capital_gain education education_num sex occupation
24 23 10 9 9 8 7
age hours_per_week
5 3
Node number 1: 30724 observations, complexity param=0.1298039
predicted class=<=50K expected loss=0.248991 P(node) =1
class counts: 23074 7650
probabilities: 0.751 0.249
left son=2 (16584 obs) right son=3 (14140 obs)
Primary splits:
relationship splits as RLLLLR, improve=2316.524, (0 missing)
marital_status splits as LRRLLLL, improve=2283.735, (0 missing)
capital_gain < 5119 to the left, improve=1581.058, (0 missing)
education_num < 12.5 to the left, improve=1215.390, (0 missing)
education splits as LLLLLLLLLRRLRLRL, improve=1215.390, (0 missing)
Surrogate splits:
marital_status splits as LRRLLLL, agree=0.993, adj=0.984, (0 split)
sex splits as LR, agree=0.690, adj=0.327, (0 split)
age < 33.5 to the left, agree=0.646, adj=0.231, (0 split)
occupation splits as LLLRRRLLLLRRLLR, agree=0.620, adj=0.175, (0 split)
hours_per_week < 43.5 to the left, agree=0.603, adj=0.137, (0 split)
Node number 2: 16584 observations, complexity param=0.0372549
predicted class=<=50K expected loss=0.06970574 P(node) =0.5397735
class counts: 15428 1156
probabilities: 0.930 0.070
left son=4 (16279 obs) right son=5 (305 obs)
Primary splits:
capital_gain < 7073.5 to the left, improve=500.5730, (0 missing)
education splits as LLLLLLLLLRRLRLRL, improve=143.7050, (0 missing)
education_num < 12.5 to the left, improve=143.7050, (0 missing)
occupation splits as LLLLRLLLLLRRLLL, improve=116.0916, (0 missing)
hours_per_week < 42.5 to the left, improve=108.8292, (0 missing)
Node number 3: 14140 observations, complexity param=0.1298039
predicted class=<=50K expected loss=0.4592645 P(node) =0.4602265
class counts: 7646 6494
probabilities: 0.541 0.459
left son=6 (9858 obs) right son=7 (4282 obs)
Primary splits:
education splits as LLLLLLLLLRRLRLRL, improve=913.0724, (0 missing)
education_num < 12.5 to the left, improve=913.0724, (0 missing)
occupation splits as LLRLRLLLLLRRRRL, improve=852.3953, (0 missing)
capital_gain < 5095.5 to the left, improve=717.4410, (0 missing)
capital_loss < 1782.5 to the left, improve=249.1241, (0 missing)
Surrogate splits:
education_num < 12.5 to the left, agree=1.000, adj=1.000, (0 split)
occupation splits as LLLLRLLLLLRLLLL, agree=0.792, adj=0.312, (0 split)
capital_gain < 7493 to the left, agree=0.714, adj=0.056, (0 split)
native_country splits as LLLRLLLLLRRLLLLRLLRRLLLRLLLLLRLLLLRRLLLLL, agree=0.705, adj=0.026, (0 split)
capital_loss < 1894.5 to the left, agree=0.703, adj=0.018, (0 split)
Node number 4: 16279 observations
predicted class=<=50K expected loss=0.05289023 P(node) =0.5298464
class counts: 15418 861
probabilities: 0.947 0.053
Node number 5: 305 observations
predicted class=>50K expected loss=0.03278689 P(node) =0.009927093
class counts: 10 295
probabilities: 0.033 0.967
Node number 6: 9858 observations, complexity param=0.06379085
predicted class=<=50K expected loss=0.3408399 P(node) =0.3208567
class counts: 6498 3360
probabilities: 0.659 0.341
left son=12 (9352 obs) right son=13 (506 obs)
Primary splits:
capital_gain < 5095.5 to the left, improve=438.8205, (0 missing)
occupation splits as LRLLRLLLLLRRRRL, improve=230.7125, (0 missing)
education splits as LLLLLLLRR--R-L-R, improve=163.8678, (0 missing)
education_num < 8.5 to the left, improve=163.8678, (0 missing)
age < 35.5 to the left, improve=131.5391, (0 missing)
Node number 7: 4282 observations
predicted class=>50K expected loss=0.268099 P(node) =0.1393699
class counts: 1148 3134
probabilities: 0.268 0.732
Node number 12: 9352 observations
predicted class=<=50K expected loss=0.3061377 P(node) =0.3043874
class counts: 6489 2863
probabilities: 0.694 0.306
Node number 13: 506 observations
predicted class=>50K expected loss=0.01778656 P(node) =0.01646921
class counts: 9 497
probabilities: 0.018 0.982
decision_tree$variable.importance
relationship marital_status capital_gain education education_num sex occupation
2316.52441 2280.31848 990.14343 913.07236 913.07236 756.39273 688.94624
age hours_per_week native_country capital_loss
534.73378 316.67904 23.88232 16.84557
The top three variables by importance are relationship, marital_status, and capital_gain.
The first split is made on relationship. The predicted class at the root node is <=50K, and the observations at the root node are distributed as <=50K: 23074 and >50K: 7650.
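The same top-three ranking can be read off programmatically from the fitted tree; a small sketch:
# Sketch: the three largest entries of the variable importance vector
head(sort(decision_tree$variable.importance, decreasing = TRUE), 3)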
predict_model <- predict(decision_tree, test_df,type = "class")
tab<-table(predict_model,test_df$income)
confusionMatrix(tab)
Confusion Matrix and Statistics
predict_model <=50K >50K
<=50K 21907 3724
>50K 1167 3926
Accuracy : 0.8408
95% CI : (0.8367, 0.8449)
No Information Rate : 0.751
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5208
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9494
Specificity : 0.5132
Pos Pred Value : 0.8547
Neg Pred Value : 0.7709
Prevalence : 0.7510
Detection Rate : 0.7130
Detection Prevalence : 0.8342
Balanced Accuracy : 0.7313
'Positive' Class : <=50K
Balanced Accuracy: 0.731 (the overall accuracy is 0.841).
Balanced Error Rate = 1 - Balanced Accuracy = 0.269
Sensitivity (TPR): 0.949 | Specificity (TNR): 0.513
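For reference, the balanced accuracy and balanced error follow directly from the sensitivity and specificity reported above; a quick sketch of the arithmetic:
# Sketch: balanced accuracy = mean of TPR and TNR (values taken from the caret output)
sens <- 0.9494
spec <- 0.5132
balanced_acc <- (sens + spec) / 2   # ~0.731
balanced_err <- 1 - balanced_acc    # ~0.269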
pred.rocr <- predict(decision_tree, newdata=test_df, type="prob")[,2]
f.pred <- prediction(pred.rocr, test_df$income)
f.perf <- performance(f.pred, "tpr", "fpr")
plot(f.perf, colorize=T, lwd=3)
abline(0,1)
auc <- performance(f.pred, measure = "auc")
print(auc)
An object of class "performance"
Slot "x.name":
[1] "None"
Slot "y.name":
[1] "Area under the ROC curve"
Slot "alpha.name":
[1] "none"
Slot "x.values":
list()
Slot "y.values":
[[1]]
[1] 0.8429366
Slot "alpha.values":
list()
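Rather than printing the whole performance object, the scalar AUC can be pulled from its y.values slot; a sketch:
# Sketch: extract just the AUC value from the ROCR performance object
print(auc@y.values[[1]])   # 0.8429366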
plotcp(decision_tree)
print("As we can see in the graphical rep of cross validation of error summary against the mean, The minimum value has already reached. There is no need to prune")
[1] "As we can see in the graphical rep of cross validation of error summary against the mean, The minimum value has already reached. There is no need to prune"
summary(train_df$income)
<=50K >50K
23074 7650
# Balance the training data by undersampling the majority class (<=50K)
new_train_less50 <- subset(train_df, train_df$income == "<=50K")
new_train_more50 <- subset(train_df, train_df$income == ">50K")
# Draw as many <=50K rows as there are >50K rows (7650)
sample_train <- sample(1:nrow(new_train_less50), 7650)
new_trainLT50k <- new_train_less50[sample_train, ]
new_train <- rbind(new_trainLT50k, new_train_more50)
head(new_train)
#summary(new_train$income)
tree2 <- rpart(income ~ .,data = new_train, method = "class")
rpart.plot(tree2, main = "New Tree",extra = 110,fallen.leaves = T,type = 4)
pred2 <- predict(tree2, test_df,type = "class")
t <- table(pred2,test_df$income)
confusionMatrix(t)
Confusion Matrix and Statistics
pred2 <=50K >50K
<=50K 17949 1279
>50K 5125 6371
Accuracy : 0.7916
95% CI : (0.787, 0.7961)
No Information Rate : 0.751
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5228
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.7779
Specificity : 0.8328
Pos Pred Value : 0.9335
Neg Pred Value : 0.5542
Prevalence : 0.7510
Detection Rate : 0.5842
Detection Prevalence : 0.6258
Balanced Accuracy : 0.8053
'Positive' Class : <=50K
Balanced Accuracy: 0.805
Balanced Error Rate = 1 - 0.805 = 0.195
Sensitivity (TPR): 0.778 | Specificity (TNR): 0.833
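These statistics can also be read programmatically from the caret confusionMatrix object instead of off the printout; a sketch (cm2 is just an illustrative name):
# Sketch: pull sensitivity, specificity and balanced accuracy from the caret object
cm2 <- confusionMatrix(t)
cm2$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]
cm2$overall["Accuracy"]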
pred.rocr2 <- predict(tree2, newdata=test_df, type="prob")[,2]
f.pred2 <- prediction(pred.rocr2, test_df$income)
f.perf2 <- performance(f.pred2, "tpr", "fpr")
plot(f.perf2, colorize=T, lwd=3)
abline(0,1)
auc <- performance(f.pred2, measure = "auc")
print(auc)
An object of class "performance"
Slot "x.name":
[1] "None"
Slot "y.name":
[1] "Area under the ROC curve"
Slot "alpha.name":
[1] "none"
Slot "x.values":
list()
Slot "y.values":
[[1]]
[1] 0.8451194
Slot "alpha.values":
list()
We see an increased balanced accuracy for the model in part (e) compared to the model in part (c). The sensitivity has decreased in model (e), while the specificity has increased, as has the positive predictive value. The AUC is almost the same for both models.
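A compact way to place the two AUCs side by side (auc_c and auc_e are illustrative names); a sketch:
# Sketch: AUC of the model from (c) versus the balanced model from (e)
auc_c <- performance(f.pred,  measure = "auc")@y.values[[1]]
auc_e <- performance(f.pred2, measure = "auc")@y.values[[1]]
print(c(model_c = auc_c, model_e = auc_e))   # ~0.843 vs ~0.845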