# Remove rows containing invalid (typo) responses in the survey items
for (i in 17:31){LandSatis <- LandSatis %>% filter(LandSatis[,i] %in% c("1","0")) %>% droplevels()}
for (i in 32:39){LandSatis <- LandSatis %>% filter(LandSatis[,i] %in% c("1","2","3","4","5")) %>% droplevels()}
#Convert numeric variable to factor
nsm<-colnames(LandSatis[16:39])
LandSatis[nsm] <- lapply(LandSatis[nsm], as.factor)
# recode numeric to categories
LandSatis %<>% mutate_at(c(16), recode, "1"="Agreed","2"="Agreed","3"="Neutral","4"="Disagreed","5"="Disagreed")
LandSatis %<>% mutate_at(c(17:31), recode, "1"="Yes","0"="No")
LandSatis %<>% mutate_at(c(32:39), recode, "5"="Agreed","4"="Agreed","3"="Neutral","2"="Disagreed","1"="Disagreed")
# Transform the factor-analysis (FA) satisfaction score into a two-level factor
histogram(LandSatis$satisfaction)
LandSatis$SatisClass <- cut(LandSatis$satisfaction, breaks=c(-Inf,-1,Inf), labels=c("Other", "Satisfied"))
#landtableau <- LandSatis %>% select(satisfaction,SatisClass,BenefitAttitude,Time.Over,FutureProtection,NoVisits,Income,Age,Upbringing,OccupationMG,ContactFrequency,Education,HowEnrol,PassToChild,CovOwnedYear,CovAreaPropBand)
#write.xlsx(landtableau, file = "landtableau.xlsx",append = FALSE)
LandSatis$satisfaction <- NULL
#summarizeColumns(LandSatis) %>% knitr::kable( caption = 'Feature Summary before Data Preprocessing')
summary(LandSatis)
## State CovOwnedYear X1MoreCov CovAreaPropBand OccupationMG
## NSW: 33 <10 yrs : 79 No :253 <50% : 83 Manager : 52
## TAS: 54 >21 yrs :123 Yes: 49 >90% :135 Other : 42
## VIC:215 11-20 yrs:100 51%-90%: 84 Professional: 79
## Retired :129
## Resident TimeonLand Upbringing MemberOtherOrgs HowEnrol
## No :134 Full :164 Both : 58 No : 86 Other : 33
## Yes:168 None : 63 Other: 27 Yes:216 Self :226
## Quarters: 75 Rural: 85 Successor: 43
## Urban:132
## BenefitAttitude ContactFrequency IncentivesReceived Time.Over
## Other : 29 multiple: 83 No :143 Increase: 45
## positive:273 None :157 Yes:159 Maintain:227
## Other : 62 Other : 30
##
## AmendCovent FutureProtection AbsenteeOwners ManagerAge Burning
## No :268 Agreed :257 No :296 No :247 No :271
## Yes: 34 Neutral : 32 Yes: 6 Yes: 55 Yes: 31
## Disagreed: 13
##
## ExpenseTimeMoney Helpless LackInformation DeclinedOverYear DisatWithHelp
## No :256 No :291 No :297 No :283 No :294
## Yes: 46 Yes: 11 Yes: 5 Yes: 19 Yes: 8
##
##
## NoVisits SeeOfficerMore Weak Minimal NoVisits.1
## No :263 No :293 No :275 No :286 No :282
## Yes: 39 Yes: 9 Yes: 27 Yes: 16 Yes: 20
##
##
## NotTailoredToProperty UnhelpfulUseless ConcernedFuture PassToChild
## No :294 No :296 Disagreed:207 Disagreed:170
## Yes: 8 Yes: 6 Neutral : 44 Neutral : 69
## Agreed : 51 Agreed : 63
##
## SellBuyNew IncreaseArea Subdividing Sell
## Disagreed: 12 Disagreed: 24 Disagreed: 11 Disagreed: 35
## Neutral : 52 Neutral : 92 Neutral : 19 Neutral : 76
## Agreed :238 Agreed :186 Agreed :272 Agreed :191
##
## ConsiderSelling MaySell Age Education
## Disagreed: 39 Disagreed:125 <50 : 31 >Undergrad :124
## Neutral : 32 Neutral : 55 >71 : 64 Other :103
## Agreed :231 Agreed :122 51-60: 81 Undergraduate: 75
## 61-70:126
## Retired Income SatisClass
## No :166 <37K :109 Other : 44
## Yes:136 37k-80K : 97 Satisfied:258
## 80k-180K: 67
## Other : 29
fit_rf = randomForest(SatisClass~., data=LandSatis,importance=TRUE)
varImpPlot(fit_rf,type=2,main="Features Ranked By Importance")
fit_rf$importance
## Other Satisfied MeanDecreaseAccuracy
## State -2.271711e-03 -3.149903e-05 -3.323058e-04
## CovOwnedYear 1.069239e-03 1.223750e-03 1.267698e-03
## X1MoreCov -1.565189e-05 -1.927658e-04 -1.748612e-04
## CovAreaPropBand -1.179209e-03 2.912580e-04 1.426324e-04
## OccupationMG -4.617731e-03 -5.146716e-04 -1.156627e-03
## Resident -1.757032e-03 1.274034e-03 8.490760e-04
## TimeonLand 1.010185e-03 1.691343e-03 1.619284e-03
## Upbringing 1.248123e-03 3.207033e-04 4.958115e-04
## MemberOtherOrgs 2.765345e-04 -5.531056e-04 -4.351715e-04
## HowEnrol 8.325114e-03 5.923504e-04 1.713127e-03
## BenefitAttitude 5.721786e-02 9.517837e-03 1.655063e-02
## ContactFrequency 1.068429e-02 1.612796e-03 2.915223e-03
## IncentivesReceived -1.032482e-04 1.194164e-04 7.203278e-05
## Time.Over 4.155242e-02 7.132788e-03 1.212065e-02
## AmendCovent 3.662055e-03 1.663917e-03 1.980241e-03
## FutureProtection 2.250188e-02 5.135048e-03 7.641352e-03
## AbsenteeOwners 0.000000e+00 0.000000e+00 0.000000e+00
## ManagerAge 4.587625e-04 -4.049949e-04 -2.152897e-04
## Burning -3.715034e-04 -2.128406e-04 -2.238222e-04
## ExpenseTimeMoney -1.895822e-03 2.802956e-05 -2.489073e-04
## Helpless 6.534523e-04 2.747660e-04 3.752960e-04
## LackInformation 0.000000e+00 0.000000e+00 0.000000e+00
## DeclinedOverYear 4.017486e-05 1.020213e-04 7.270849e-05
## DisatWithHelp -1.049180e-03 6.691172e-05 -5.030070e-05
## NoVisits 2.370717e-02 5.072336e-03 7.667364e-03
## SeeOfficerMore -7.087846e-04 -1.985652e-04 -2.605779e-04
## Weak 1.911765e-03 1.820855e-04 4.151863e-04
## Minimal 6.487544e-04 -2.787412e-04 -1.309483e-04
## NoVisits.1 9.438658e-03 7.189636e-04 2.033448e-03
## NotTailoredToProperty -2.894003e-04 -2.166893e-04 -2.318897e-04
## UnhelpfulUseless 8.352187e-04 5.459802e-04 5.573020e-04
## ConcernedFuture -8.651772e-04 4.195069e-05 -9.916223e-05
## PassToChild 3.919174e-03 4.440595e-04 9.681192e-04
## SellBuyNew -5.258585e-03 1.434090e-03 5.226562e-04
## IncreaseArea 1.161536e-03 7.415455e-04 7.730714e-04
## Subdividing 5.735634e-04 1.426731e-03 1.336366e-03
## Sell 2.317464e-03 1.567608e-03 1.684565e-03
## ConsiderSelling 3.668163e-03 9.033049e-04 1.253283e-03
## MaySell 6.177454e-04 3.465914e-03 3.143054e-03
## Age 1.074870e-03 -1.128036e-03 -8.577329e-04
## Education 3.842451e-03 9.857853e-04 1.397835e-03
## Retired 4.880687e-04 7.153517e-04 7.327152e-04
## Income 8.302325e-03 7.676570e-04 1.914841e-03
## MeanDecreaseGini
## State 1.486710329
## CovOwnedYear 1.989519606
## X1MoreCov 0.689492973
## CovAreaPropBand 1.779948773
## OccupationMG 2.361741958
## Resident 0.988275348
## TimeonLand 1.585686683
## Upbringing 2.663571013
## MemberOtherOrgs 0.806735131
## HowEnrol 1.982917946
## BenefitAttitude 5.573576430
## ContactFrequency 2.136235708
## IncentivesReceived 1.050537524
## Time.Over 5.623616419
## AmendCovent 1.570160213
## FutureProtection 4.794133874
## AbsenteeOwners 0.012744442
## ManagerAge 0.978587295
## Burning 0.376796978
## ExpenseTimeMoney 0.484663798
## Helpless 1.101908632
## LackInformation 0.006676662
## DeclinedOverYear 0.571262241
## DisatWithHelp 0.519660364
## NoVisits 3.450852536
## SeeOfficerMore 0.294262345
## Weak 0.996938408
## Minimal 0.931078074
## NoVisits.1 2.096107994
## NotTailoredToProperty 0.336544681
## UnhelpfulUseless 0.799762496
## ConcernedFuture 1.780131708
## PassToChild 2.127068075
## SellBuyNew 0.848020280
## IncreaseArea 1.175138378
## Subdividing 1.834545684
## Sell 1.956436860
## ConsiderSelling 1.871679099
## MaySell 1.886784967
## Age 3.128267931
## Education 2.124888763
## Retired 0.817925524
## Income 3.720646893
landFS14 <- LandSatis %>% select(SatisClass,BenefitAttitude,Time.Over,FutureProtection,NoVisits,Income,Age,Upbringing,OccupationMG,ContactFrequency,Education,HowEnrol,PassToChild,CovOwnedYear,CovAreaPropBand)
#write.xlsx(landFS14, file = "landFS14.xlsx",append = FALSE)
To investigate the data further in terms of classification, it is necessary to find the most significant relationships between the satisfaction class and each factor. First, the data are separated into training and testing sets. Decision Tree, Random Forest and Boosting are then used for classification on the training data, and their prediction accuracies are compared on the testing data. The model with the best accuracy can therefore be used to identify the satisfaction class.
A decision tree is a predictive model used in machine learning that displays an algorithm through a tree-like graph. It can be applied to both regression and classification problems; a classification tree predicts which class an observation belongs to.
Random forest is an ensemble learning method for classification. It is also an improvement over a single decision tree because it corrects for decision trees’ habit of overfitting to their training set.
Boosting is a robust machine learning technique that builds a strong learner from several weak learners, where a weak learner is defined as a classifier that is only slightly correlated with the true classification and a strong learner is one that is well correlated with it. The variant used in our machine learning process is Zhu’s method (SAMME).
model <- na.omit(landFS14)
set.seed(999)
alpha=0.8
d = sort(sample(nrow(model), nrow(model)*alpha))
train = model[d,]
test = model[-d,]
set.seed(123)
decisionTreeFit <- caret :: train(SatisClass ~.,data=train, method="rpart", trControl = trainControl(method="cv", number=10), tuneLength = 100, parms=list(split='information'))
# tuneLength tells caret how many candidate values of the main tuning parameter (cp) to evaluate
confusionMatrix(predict(decisionTreeFit),train$SatisClass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Other Satisfied
## Other 12 0
## Satisfied 30 199
##
## Accuracy : 0.8755
## 95% CI : (0.8271, 0.9144)
## No Information Rate : 0.8257
## P-Value [Acc > NIR] : 0.02203
##
## Kappa : 0.3978
## Mcnemar's Test P-Value : 1.192e-07
##
## Sensitivity : 0.28571
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.86900
## Prevalence : 0.17427
## Detection Rate : 0.04979
## Detection Prevalence : 0.04979
## Balanced Accuracy : 0.64286
##
## 'Positive' Class : Other
##
decisionTreeFit$bestTune
#summary(decisionTreeFinal)
Draw the decision tree using the best tuned complexity parameter.
decisionTreeFinal <- rpart(SatisClass ~ ., method = "class", cp = 0.05194805, data = train)
cols <- ifelse(decisionTreeFinal$frame$yval == 1, "darkred", "green4")
# dark red for nodes predicting "Other" (yval == 1), green for "Satisfied"
prp(decisionTreeFinal, main = "Final Decision Tree Model",
extra = 106, # display the probability of "Satisfied" and the percentage of observations
nn = TRUE, # display the node numbers
fallen.leaves = TRUE, # put the leaves at the bottom of the page
shadow.col = "gray", # shadows under the leaves
branch.lty = 3, # draw branches using dotted lines
branch = .5, # change the angle of the branch lines
faclen = 0, # faclen = 0 to print full factor names
trace = 1, # print the auto-calculated cex, xlim, ylim
split.cex = 1.2, # make the split text larger than the node text
col = cols, border.col = cols, # node colours as defined above
split.box.col = "lightgray", # light gray split boxes (default is white)
split.border.col = "darkgray", # dark gray border on the split boxes
split.round = .5) # round the split box corners a tad
## cex 1 xlim c(-0.2, 1.2) ylim c(-0.1, 1.1)
Next, tune mtry, the number of variables randomly sampled as candidates at each split.
set.seed(999)
mtry <- caret :: train(SatisClass ~ ., method = "rf", data = train, importance = T,trControl = trainControl(method = "cv", number = 10))
plot(mtry)
print(mtry)
## Random Forest
##
## 241 samples
## 14 predictor
## 2 classes: 'Other', 'Satisfied'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 216, 217, 217, 218, 217, 217, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8467899 0.1855140
## 16 0.8464420 0.3849219
## 30 0.8337464 0.3733800
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
The cross-validation results show that mtry = 2 gives the highest accuracy, although mtry = 16 achieves almost the same accuracy with a considerably higher Kappa. Below, the random forest is refitted on the training data with randomForest(), using mtry = 14 so that all 14 predictors are considered at each split.
set.seed(999)
randomForestFit <- randomForest(formula = SatisClass ~ ., ntree=300, data=train, mtry= 14)
plot(randomForestFit)
Finally, we fit the random forest model to the training data. Plotting the model shows that after about 70 trees the error changes very little, although it still fluctuates slightly.
print(randomForestFit)
##
## Call:
## randomForest(formula = SatisClass ~ ., data = train, ntree = 300, mtry = 14)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 14
##
## OOB estimate of error rate: 15.35%
## Confusion matrix:
## Other Satisfied class.error
## Other 19 23 0.54761905
## Satisfied 14 185 0.07035176
Printing the model shows that 14 variables were tried at each split and that the out-of-bag (OOB) estimate of the error rate is 15.35%. The model does not fit the data perfectly: a substantial share of the 'Other' class is misclassified (a class error of about 55%), which is caused by the small number of 'Other' observations relative to 'Satisfied'.
varImpPlot(randomForestFit, sort = T, main="Overall Variable Importance")
It is also important to examine the variable importance. This plot indicates which variables had the greatest impact in the classification model.
First we perform 10-fold cross-validation with caret's train function to find suitable parameters for the boosting model; the tuning code and its output are not shown here because they take a long time to run (a sketch of the tuning call is given below). The best tune uses maxdepth = 10 and 50 trees with the 'Zhu' (SAMME) coefficient type. There are some limitations at this stage: caret's default tuning grid only considers a maximum tree depth of 3, which may not capture enough information, while very deep trees can easily cause an overfitting problem. We therefore fix maxdepth = 10 and mfinal = 50 and use the error-evolution method to find the optimal number of iterations.
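Since the tuning code is omitted from the report, the following is only a minimal sketch of what such a search could look like, assuming caret's AdaBoost.M1 method (which wraps adabag::boosting); the grid values and object names (boostGrid, boostTune) are illustrative assumptions, not the authors' exact settings.
library(caret)
library(adabag)
# Illustrative tuning grid; caret's AdaBoost.M1 method tunes mfinal, maxdepth and coeflearn
boostGrid <- expand.grid(mfinal = c(50, 100, 150),
                         maxdepth = c(1, 2, 3),
                         coeflearn = c("Breiman", "Freund", "Zhu"))
set.seed(999)
boostTune <- caret::train(SatisClass ~ ., data = train,
                          method = "AdaBoost.M1",
                          trControl = trainControl(method = "cv", number = 10),
                          tuneGrid = boostGrid)
boostTune$bestTune # best combination of mfinal, maxdepth and coeflearn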
We now build two initial models using the parameters gathered above, differing only in the bootstrap weight-selection method (boos = FALSE versus boos = TRUE).
model1 <- boosting(SatisClass~., data=train, boos = F, mfinal = 50, coeflearn = 'Zhu',
control=rpart.control(maxdepth = 10))
model2 <- boosting(SatisClass~., data=train, boos = T, mfinal = 50, coeflearn = 'Zhu',
control=rpart.control(maxdepth = 10))
evoltrain=errorevol(model1,train)
evoltest=errorevol(model1,test)
plot(evoltest,evoltrain)
evoltrain=errorevol(model2,train)
evoltest=errorevol(model2,test)
plot(evoltest,evoltrain)
From the error-evolution plots we can observe that the best-fitting range is between 1 and 3 iterations when boos = F. We therefore extract the confusion matrix and fitting accuracy at this setting through cross-validation, and then use the optimal coefficients to fit the final boosting model for further study.
set.seed(5)
boostcv <- boosting.cv(SatisClass ~.,data=train, v=10, boos=F, mfinal=2, coeflearn="Zhu",
control=rpart.control(maxdepth = 10))
## i: 1 Tue Mar 12 17:52:52 2019
## i: 2 Tue Mar 12 17:52:53 2019
## i: 3 Tue Mar 12 17:52:55 2019
## i: 4 Tue Mar 12 17:52:56 2019
## i: 5 Tue Mar 12 17:52:57 2019
## i: 6 Tue Mar 12 17:52:58 2019
## i: 7 Tue Mar 12 17:52:59 2019
## i: 8 Tue Mar 12 17:53:00 2019
## i: 9 Tue Mar 12 17:53:01 2019
## i: 10 Tue Mar 12 17:53:02 2019
boostcv$confusion
## Observed Class
## Predicted Class Other Satisfied
## Other 16 9
## Satisfied 26 190
1-boostcv$error
## [1] 0.8547718
Finalmodel <- boosting(SatisClass ~.,data=train, v=10,boos =F, mfinal=2, coeflearn="Zhu",
control=rpart.control(maxdepth = 10))
mean(Finalmodel$class==test$SatisClass)
## [1] 0.8547718
There is a certain level of deviation between the cross-validated accuracy and the fitted model's accuracy, which possibly indicates overfitting. We decide to keep this model and discuss it further during model selection.
margins.train <- margins(Finalmodel,train)[[1]]
margins.test <- margins(Finalmodel, test)[[1]]
plot(sort(margins.train), (1:length(margins.train)) /length(margins.train), type = "l", xlim = c(-1,1),
main = "Margin cumulative distribution graph", xlab = "Margin",
ylab = "% observations", col = "blue3", lty = 2, lwd = 2)
abline(v = 0, col = "red", lty = 2, lwd = 2)
lines(sort(margins.test), (1:length(margins.test)) / length(margins.test),
type = "l", cex = .5, col = "green", lwd = 2)
legend("topleft", c("test", "train"), col = c("green", "blue3"),lty = 1:2, lwd = 2)
The margin plot shows the cumulative distribution of margins for the boosting classifier developed in this application. The margins of the training set are coloured blue and those of the test set green. For SAMME, roughly 20% of observations have negative margins, corresponding to training error, while more than 50% of observations in both the training and testing sets reach a positive margin.
Finalmodel$importance
## Age BenefitAttitude ContactFrequency CovAreaPropBand
## 9.102804 14.518112 7.901278 0.000000
## CovOwnedYear Education FutureProtection HowEnrol
## 6.746169 6.388062 6.162331 0.000000
## Income NoVisits OccupationMG PassToChild
## 10.011330 12.866171 0.000000 3.824880
## Time.Over Upbringing
## 22.478862 0.000000
imp = data.frame(names(Finalmodel$importance),Finalmodel$importance)
rownames(imp) = NULL
colnames(imp) = c("names","imp")
imp1 = imp[-c(which(imp$imp<1)),]
g <- ggplot(imp1, aes(x=reorder(names, imp),y = imp))
g + geom_bar( stat = "identity" ,width = 0.5) +coord_flip()+ theme(axis.text.x = element_text(vjust=0.6),
plot.caption = element_text(hjust = 0),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
plot.margin = unit(c(0,1,0,0), "cm"),
legend.position = "bottom")+
labs(title="Overall Variable Importance")
The variable importance shows which variables were influential and which were of little use in this study. Time.Over plays the most important role, while BenefitAttitude, NoVisits, Income and Age are important as well. Upbringing, HowEnrol, OccupationMG and CovAreaPropBand contribute nothing in this boosting model.
Fitted accuracy of the decision tree model on the training data.
decisionTreePredict <- predict(decisionTreeFinal, train, type = "class")
mean(decisionTreePredict==train$SatisClass)
## [1] 0.8755187
Fitted accuracy of the random forest model on the training data. (Random forest is a bagging technique, not a boosting technique. In boosting, as the name suggests, each learner learns from the previous ones, which in turn boosts the learning; the trees in a random forest are grown in parallel, with no interaction between them while they are built.)
randomForestPredict <- predict(randomForestFit, train, type='class')
mean(randomForestPredict==train$SatisClass)
## [1] 1
Fitted accuracy of the boosting model on the training data. (The fundamental difference between bagging and random forests is that in a random forest only a random subset of the features is considered at each node, and the best split is chosen from that subset, whereas in bagging all features are considered when splitting a node.)
boostingfit=predict.boosting(Finalmodel,train,type='class')
1-boostingfit$error
## [1] 0.8879668
Prediction Accuracy of Each Method
result=matrix(c(0.9672131, 0.9180328, 0.9344262, 0.8755187, 1, 0.8879668),nrow=3,ncol=2)
row.names(result)=c("Decision Tree","Random Forest","Boosting")
colnames(result)=c("Test set","Training set")
result
## Test set Training set
## Decision Tree 0.9672131 0.8755187
## Random Forest 0.9180328 1.0000000
## Boosting 0.9344262 0.8879668
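The test-set accuracies in the table above are entered manually; the following is a minimal sketch of how they could be reproduced on the held-out test set, assuming the model objects fitted earlier (the names dtTestAcc, rfTestAcc and bstTestAcc are illustrative, not from the original report).
# Sketch: test-set accuracy for each model (not the authors' original code)
dtTestAcc <- mean(predict(decisionTreeFinal, test, type = "class") == test$SatisClass)
rfTestAcc <- mean(predict(randomForestFit, test) == test$SatisClass)
bstTestAcc <- 1 - predict.boosting(Finalmodel, newdata = test)$error
round(c(DecisionTree = dtTestAcc, RandomForest = rfTestAcc, Boosting = bstTestAcc), 4)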
As the results show, the random forest reaches a test-set prediction accuracy of about 91.8% (with a cross-validated accuracy of roughly 85%), and it is the least biased of the three techniques. Although the testing results for the decision tree and boosting models are better than their training results, this can be considered a reasonable deviation since the testing data set is not large. Therefore, the random forest should be used to classify new data, as it offers the best overall performance.
confusionMatrix(predict(decisionTreeFit),train$SatisClass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Other Satisfied
## Other 12 0
## Satisfied 30 199
##
## Accuracy : 0.8755
## 95% CI : (0.8271, 0.9144)
## No Information Rate : 0.8257
## P-Value [Acc > NIR] : 0.02203
##
## Kappa : 0.3978
## Mcnemar's Test P-Value : 1.192e-07
##
## Sensitivity : 0.28571
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.86900
## Prevalence : 0.17427
## Detection Rate : 0.04979
## Detection Prevalence : 0.04979
## Balanced Accuracy : 0.64286
##
## 'Positive' Class : Other
##
print(randomForestFit)
##
## Call:
## randomForest(formula = SatisClass ~ ., data = train, ntree = 300, mtry = 14)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 14
##
## OOB estimate of error rate: 15.35%
## Confusion matrix:
## Other Satisfied class.error
## Other 19 23 0.54761905
## Satisfied 14 185 0.07035176
boostcv$confusion
## Observed Class
## Predicted Class Other Satisfied
## Other 16 9
## Satisfied 26 190
This part specifies an appropriate logistic regression model. A linear combination of all variables is used to initialise the model, backward elimination is then applied for feature selection, and the model is finalised by manually inspecting the significance levels of the remaining variables.
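The model-fitting code for this step is not echoed in the report; the following is a minimal sketch of what it presumably looked like, based on the glm call and step trace shown below (the object names mod.full and mod.step are assumptions).
library(car) # for Anova(); masks recode (dplyr), logit (psych) and some (purrr)
# Full model: SatisClass against all 14 selected features
mod.full <- glm(SatisClass ~ ., family = binomial(link = "logit"), data = landFS14)
summary(mod.full)
Anova(mod.full) # Type II tests for each term
# Backward elimination by AIC
mod.step <- step(mod.full, direction = "backward")
summary(mod.step)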
##
## Call:
## glm(formula = SatisClass ~ ., family = binomial(link = "logit"),
## data = landFS14)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.76209 0.04841 0.15797 0.35136 2.13338
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.996130 1.794132 -1.113 0.265885
## BenefitAttitudepositive 2.558253 0.740217 3.456 0.000548 ***
## Time.OverMaintain 0.636065 0.664939 0.957 0.338781
## Time.OverOther -1.514455 0.895921 -1.690 0.090954 .
## FutureProtectionNeutral -1.269952 0.722045 -1.759 0.078607 .
## FutureProtectionDisagreed -3.702910 1.004160 -3.688 0.000226 ***
## NoVisitsYes -1.955004 0.697485 -2.803 0.005064 **
## Income37k-80K 0.577299 0.704258 0.820 0.412372
## Income80k-180K 0.233914 0.736916 0.317 0.750923
## IncomeOther -0.777529 0.760815 -1.022 0.306796
## Age>71 0.763629 1.013552 0.753 0.451198
## Age51-60 2.698131 1.007492 2.678 0.007405 **
## Age61-70 1.461239 0.890387 1.641 0.100771
## UpbringingOther 0.049727 1.036073 0.048 0.961720
## UpbringingRural 0.655945 0.834693 0.786 0.431955
## UpbringingUrban -0.005301 0.684460 -0.008 0.993821
## OccupationMGOther -0.002758 1.051041 -0.003 0.997906
## OccupationMGProfessional 0.664719 0.940786 0.707 0.479842
## OccupationMGRetired -0.468876 0.859749 -0.545 0.585503
## ContactFrequencyNone -0.513333 0.562964 -0.912 0.361854
## ContactFrequencyOther 2.859863 1.723381 1.659 0.097025 .
## EducationOther -0.495481 0.623868 -0.794 0.427074
## EducationUndergraduate -0.453342 0.685384 -0.661 0.508328
## HowEnrolSelf 0.765066 0.903382 0.847 0.397056
## HowEnrolSuccessor 0.248562 0.963117 0.258 0.796345
## PassToChildNeutral -0.419658 0.669851 -0.626 0.530990
## PassToChildAgreed -0.567413 0.681375 -0.833 0.404987
## CovOwnedYear>21 yrs 1.348955 0.830791 1.624 0.104440
## CovOwnedYear11-20 yrs 0.255900 0.759610 0.337 0.736205
## CovAreaPropBand>90% 0.599616 0.721400 0.831 0.405869
## CovAreaPropBand51%-90% 0.309621 0.752650 0.411 0.680798
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 250.76 on 301 degrees of freedom
## Residual deviance: 127.32 on 271 degrees of freedom
## AIC: 189.32
##
## Number of Fisher Scoring iterations: 8
The full model for SatisClass shows that BenefitAttitude, FutureProtection, NoVisits and Age have a significant impact on satisfaction. Running an Anova (from the car library) on the model reveals that these terms have p-values below 0.05; they are statistically significant, so we choose not to omit them. We then use the step function to optimise the combination of variables in the model.
## Start: AIC=189.32
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Income + Age + Upbringing + OccupationMG + ContactFrequency +
## Education + HowEnrol + PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - Upbringing 3 128.31 184.31
## - OccupationMG 3 129.20 185.20
## - Income 3 129.54 185.54
## - CovAreaPropBand 2 128.02 186.02
## - Education 2 128.09 186.09
## - HowEnrol 2 128.13 186.13
## - PassToChild 2 128.15 186.15
## <none> 127.32 189.32
## - CovOwnedYear 2 131.35 189.35
## - ContactFrequency 2 135.03 193.03
## - Age 3 137.23 193.23
## - Time.Over 2 135.95 193.95
## - NoVisits 1 135.40 195.40
## - BenefitAttitude 1 139.76 199.76
## - FutureProtection 2 143.57 201.57
##
## Step: AIC=184.31
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Income + Age + OccupationMG + ContactFrequency +
## Education + HowEnrol + PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - Income 3 130.40 180.40
## - OccupationMG 3 130.50 180.50
## - CovAreaPropBand 2 128.79 180.79
## - HowEnrol 2 128.92 180.92
## - Education 2 129.07 181.07
## - PassToChild 2 129.66 181.66
## <none> 128.31 184.31
## - CovOwnedYear 2 132.40 184.40
## - Age 3 137.65 187.65
## - ContactFrequency 2 136.06 188.06
## - Time.Over 2 137.97 189.97
## - NoVisits 1 137.23 191.23
## - BenefitAttitude 1 140.03 194.03
## - FutureProtection 2 143.96 195.96
##
## Step: AIC=180.4
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + Education +
## HowEnrol + PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - HowEnrol 2 131.10 177.10
## - Education 2 131.29 177.29
## - CovAreaPropBand 2 131.45 177.45
## - PassToChild 2 131.54 177.54
## - OccupationMG 3 134.04 178.04
## <none> 130.40 180.40
## - CovOwnedYear 2 134.68 180.68
## - Age 3 140.19 184.19
## - ContactFrequency 2 138.73 184.73
## - Time.Over 2 140.23 186.23
## - BenefitAttitude 1 141.50 189.50
## - NoVisits 1 141.75 189.75
## - FutureProtection 2 149.84 195.84
##
## Step: AIC=177.1
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + Education +
## PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - Education 2 131.97 173.97
## - PassToChild 2 132.07 174.07
## - CovAreaPropBand 2 132.09 174.09
## - OccupationMG 3 134.72 174.72
## <none> 131.10 177.10
## - CovOwnedYear 2 138.27 180.27
## - Age 3 140.93 180.93
## - ContactFrequency 2 139.26 181.26
## - Time.Over 2 141.76 183.76
## - BenefitAttitude 1 143.46 187.46
## - NoVisits 1 144.00 188.00
## - FutureProtection 2 149.85 191.85
##
## Step: AIC=173.97
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + PassToChild +
## CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - PassToChild 2 132.93 170.93
## - CovAreaPropBand 2 133.17 171.17
## - OccupationMG 3 135.59 171.59
## <none> 131.97 173.97
## - CovOwnedYear 2 138.69 176.69
## - ContactFrequency 2 140.32 178.32
## - Age 3 142.40 178.40
## - Time.Over 2 143.91 181.91
## - BenefitAttitude 1 144.49 184.49
## - NoVisits 1 145.29 185.29
## - FutureProtection 2 149.96 187.96
##
## Step: AIC=170.93
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + CovOwnedYear +
## CovAreaPropBand
##
## Df Deviance AIC
## - CovAreaPropBand 2 134.41 168.41
## - OccupationMG 3 137.37 169.37
## <none> 132.93 170.93
## - CovOwnedYear 2 139.47 173.47
## - Age 3 142.66 174.66
## - ContactFrequency 2 140.87 174.87
## - Time.Over 2 144.78 178.78
## - NoVisits 1 146.21 182.21
## - BenefitAttitude 1 147.07 183.07
## - FutureProtection 2 150.92 184.92
##
## Step: AIC=168.4
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + CovOwnedYear
##
## Df Deviance AIC
## - OccupationMG 3 137.71 165.71
## <none> 134.41 168.41
## - CovOwnedYear 2 140.26 170.26
## - Age 3 143.84 171.84
## - ContactFrequency 2 142.40 172.40
## - Time.Over 2 146.74 176.74
## - NoVisits 1 146.76 178.76
## - BenefitAttitude 1 148.22 180.22
## - FutureProtection 2 152.75 182.75
##
## Step: AIC=165.71
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + ContactFrequency + CovOwnedYear
##
## Df Deviance AIC
## <none> 137.71 165.71
## - CovOwnedYear 2 143.78 167.78
## - Age 3 148.56 170.56
## - ContactFrequency 2 146.75 170.75
## - Time.Over 2 150.22 174.22
## - NoVisits 1 149.85 175.85
## - BenefitAttitude 1 152.15 178.15
## - FutureProtection 2 159.06 183.06
##
## Call:
## glm(formula = SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + ContactFrequency + CovOwnedYear, family = binomial(link = "logit"),
## data = landFS14)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.74026 0.07737 0.17708 0.39234 1.90179
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6674 1.0690 -0.624 0.532440
## BenefitAttitudepositive 2.3532 0.6282 3.746 0.000180 ***
## Time.OverMaintain 0.8141 0.5980 1.361 0.173408
## Time.OverOther -1.4157 0.7696 -1.840 0.065835 .
## FutureProtectionNeutral -1.7033 0.6338 -2.687 0.007202 **
## FutureProtectionDisagreed -3.5600 0.9334 -3.814 0.000137 ***
## NoVisitsYes -2.0828 0.5975 -3.486 0.000490 ***
## Age>71 -0.2533 0.7406 -0.342 0.732337
## Age51-60 1.7590 0.8342 2.109 0.034984 *
## Age61-70 0.8885 0.7178 1.238 0.215736
## ContactFrequencyNone -0.6438 0.5213 -1.235 0.216787
## ContactFrequencyOther 2.1547 1.2971 1.661 0.096693 .
## CovOwnedYear>21 yrs 1.4842 0.6559 2.263 0.023644 *
## CovOwnedYear11-20 yrs 0.4543 0.5778 0.786 0.431684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 250.76 on 301 degrees of freedom
## Residual deviance: 137.71 on 288 degrees of freedom
## AIC: 165.71
##
## Number of Fisher Scoring iterations: 7
According to the smallest AIC (165.71), the best model is formulated as: SatisClass ~ BenefitAttitude + Time.Over + FutureProtection + NoVisits + Age + ContactFrequency + CovOwnedYear. Running an Anova on this model reveals that the p-value for CovOwnedYear is close to 0.05; however, since some levels of CovOwnedYear are statistically significant, we choose not to omit it, and we rebuild the model as the final model shown below.
mod.final = glm(SatisClass ~ BenefitAttitude + Time.Over + FutureProtection + NoVisits + Age + ContactFrequency + CovOwnedYear, family = binomial(link = "logit"),
data = model)
summary(mod.final)
##
## Call:
## glm(formula = SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + ContactFrequency + CovOwnedYear, family = binomial(link = "logit"),
## data = model)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.74026 0.07737 0.17708 0.39234 1.90179
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6674 1.0690 -0.624 0.532440
## BenefitAttitudepositive 2.3532 0.6282 3.746 0.000180 ***
## Time.OverMaintain 0.8141 0.5980 1.361 0.173408
## Time.OverOther -1.4157 0.7696 -1.840 0.065835 .
## FutureProtectionNeutral -1.7033 0.6338 -2.687 0.007202 **
## FutureProtectionDisagreed -3.5600 0.9334 -3.814 0.000137 ***
## NoVisitsYes -2.0828 0.5975 -3.486 0.000490 ***
## Age>71 -0.2533 0.7406 -0.342 0.732337
## Age51-60 1.7590 0.8342 2.109 0.034984 *
## Age61-70 0.8885 0.7178 1.238 0.215736
## ContactFrequencyNone -0.6438 0.5213 -1.235 0.216787
## ContactFrequencyOther 2.1547 1.2971 1.661 0.096693 .
## CovOwnedYear>21 yrs 1.4842 0.6559 2.263 0.023644 *
## CovOwnedYear11-20 yrs 0.4543 0.5778 0.786 0.431684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 250.76 on 301 degrees of freedom
## Residual deviance: 137.71 on 288 degrees of freedom
## AIC: 165.71
##
## Number of Fisher Scoring iterations: 7
Anova(mod.final)
coef(mod.final)
## (Intercept) BenefitAttitudepositive
## -0.6673514 2.3531811
## Time.OverMaintain Time.OverOther
## 0.8141005 -1.4156674
## FutureProtectionNeutral FutureProtectionDisagreed
## -1.7033414 -3.5599500
## NoVisitsYes Age>71
## -2.0828187 -0.2533093
## Age51-60 Age61-70
## 1.7589539 0.8885433
## ContactFrequencyNone ContactFrequencyOther
## -0.6438065 2.1546702
## CovOwnedYear>21 yrs CovOwnedYear11-20 yrs
## 1.4841905 0.4543421
# Model diagnostic with odds ratio
model.coef <- summary(mod.final)$coefficients[, c("Estimate", "Std. Error", "Pr(>|z|)")]
# display coefficients of the final model together with their odds ratios
OR.mm <- exp(summary(mod.final)$coefficients[,1])
cbind("Odds Ratio (OR)" = round(OR.mm,4), round(model.coef,5))
## Odds Ratio (OR) Estimate Std. Error Pr(>|z|)
## (Intercept) 0.5131 -0.66735 1.06898 0.53244
## BenefitAttitudepositive 10.5190 2.35318 0.62822 0.00018
## Time.OverMaintain 2.2571 0.81410 0.59802 0.17341
## Time.OverOther 0.2428 -1.41567 0.76957 0.06584
## FutureProtectionNeutral 0.1821 -1.70334 0.63383 0.00720
## FutureProtectionDisagreed 0.0284 -3.55995 0.93336 0.00014
## NoVisitsYes 0.1246 -2.08282 0.59747 0.00049
## Age>71 0.7762 -0.25331 0.74063 0.73234
## Age51-60 5.8064 1.75895 0.83421 0.03498
## Age61-70 2.4316 0.88854 0.71776 0.21574
## ContactFrequencyNone 0.5253 -0.64381 0.52125 0.21679
## ContactFrequencyOther 8.6250 2.15467 1.29713 0.09669
## CovOwnedYear>21 yrs 4.4114 1.48419 0.65589 0.02364
## CovOwnedYear11-20 yrs 1.5751 0.45434 0.57781 0.43168
Variable importance for the variables in the final logistic regression model.
From the plot we can see the ten most important variables in the logistic regression model.
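The code that produced the importance plot is not shown in the report; the following is a rough sketch under the assumption that importance is measured by the absolute z-statistic of each fitted coefficient (a common proxy, e.g. what caret's varImp uses for linear models); the object names z.imp and imp.glm are illustrative.
library(ggplot2)
# Sketch: importance as the absolute z-statistic of each coefficient (intercept excluded)
z.imp <- abs(summary(mod.final)$coefficients[-1, "z value"])
imp.glm <- data.frame(names = names(z.imp), imp = z.imp)
imp.glm <- imp.glm[order(-imp.glm$imp), ][1:10, ] # keep the top 10
ggplot(imp.glm, aes(x = reorder(names, imp), y = imp)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_flip() +
  labs(title = "Variable Importance - Final Logistic Regression Model",
       x = NULL, y = "|z value|")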
ROC curves of the final model using the training dataset (top), the test dataset (middle) and the entire dataset (bottom).
A Receiver Operating Characteristic (ROC) curve plots the true positive rate against the false positive rate at different cut-off points of a diagnostic test, displaying the trade-off between sensitivity and specificity. The generated curve follows the left-hand and top borders of the ROC space, covering most of the area (AUC = 0.92), so it can be concluded that the model determines whether a landholder is Satisfied or Other with roughly 90% accuracy.
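The ROC code is likewise not shown; below is a minimal sketch for the entire dataset only, assuming the pROC package. It also defines prob.model, the vector of fitted probabilities that the cut-off search below relies on.
library(pROC)
# Fitted probabilities of being "Satisfied" for the (NA-free) modelling dataset
prob.model <- predict(mod.final, newdata = model, type = "response")
roc.model <- roc(model$SatisClass, prob.model, levels = c("Other", "Satisfied"))
plot(roc.model, main = "ROC Curve - Final Logistic Regression Model")
auc(roc.model)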
To find the optimal cut-off value:
cutoffs <- seq(0.1, 0.9, 0.02)
accuracy <- NULL
for (i in seq(along = cutoffs)) {
prediction <- ifelse(prob.model >= cutoffs[i], 1, 0) #Predicting for cut-off
accuracy <- c(accuracy, length(which(mod.final$y == prediction))/length(prediction) *
100)
}
plot(cutoffs, accuracy, pch = 19, type = "b", col = "steelblue", main = "Cutoff Value Performance in Final Model",
xlab = "Cutoff Level", ylab = "Accuracy %")
cutoffs[which.max(accuracy)]
Next, an appropriate threshold for the logistic regression model needs to be found. Forty-one candidate values between 0.1 and 0.9 are tested for accuracy, and the plot shows how the accuracy changes under different cut-off values.
Once the optimal cut-off value is determined, the confusion matrix can be generated. The accuracy of the prediction using the finalised model is 92.16%. Moreover, the no-information rate (the best guess given no information beyond the overall distribution of the classes being predicted) is lower than the accuracy, which shows that the model can be trusted to a certain extent.
library(caret)
# Get predictions for the entire dataset using the final model: probabilities
# greater than or equal to the cutoff (0.42) are classified as "Satisfied",
# otherwise "Other"
pre.class <- ifelse(prob.model >= 0.42, "Satisfied", "Other")
pre.class <- as.factor(pre.class)
table(pre.class)
# display the confusion matrix at the 0.42 cutoff
confusionMatrix(pre.class, landFS14$SatisClass)
We used logistic regression with a linear combination of variables to construct the model, optimised it with backward elimination, and checked its feasibility using ROC curves. In the end, only 7 of the 14 candidate variables are considered to have a significant effect on the probability of a landholder being Satisfied rather than Other: BenefitAttitude, Time.Over, FutureProtection, NoVisits, Age, ContactFrequency and CovOwnedYear formulate the model used to predict a landholder's satisfaction level. Roughly 92% of satisfaction levels can be predicted correctly when the corresponding information is given.
Surprisingly, some variables such as Income and Education were removed from the model during the elimination process.