# Remove rows containing invalid (typo) responses in the survey items
for (i in 17:31){LandSatis <- LandSatis %>% filter(LandSatis[,i] %in% c("1","0")) %>% droplevels()}
for (i in 32:39){LandSatis <- LandSatis %>% filter(LandSatis[,i] %in% c("1","2","3","4","5")) %>% droplevels()}
#Convert numeric variable to factor
nsm<-colnames(LandSatis[16:39])
LandSatis[nsm] <- lapply(LandSatis[nsm], as.factor)
# recode numeric to categories
LandSatis %<>% mutate_at(c(16), recode, "1"="Agreed","2"="Agreed","3"="Neutral","4"="Disagreed","5"="Disagreed")
LandSatis %<>% mutate_at(c(17:31), recode, "1"="Yes","0"="No")
LandSatis %<>% mutate_at(c(32:39), recode, "5"="Agreed","4"="Agreed","3"="Neutral","2"="Disagreed","1"="Disagreed")
# Transform the factor-analysis (FA) satisfaction score into a two-level factor
histogram(LandSatis$satisfaction)
LandSatis$SatisClass <- cut(LandSatis$satisfaction, breaks=c(-Inf,-1,Inf), labels=c("Other", "Satisfied"))
#landtableau <- LandSatis %>% select(satisfaction,SatisClass,BenefitAttitude,Time.Over,FutureProtection,NoVisits,Income,Age,Upbringing,OccupationMG,ContactFrequency,Education,HowEnrol,PassToChild,CovOwnedYear,CovAreaPropBand)
#write.xlsx(landtableau, file = "landtableau.xlsx",append = FALSE)
LandSatis$satisfaction <- NULL
#summarizeColumns(LandSatis) %>% knitr::kable( caption = 'Feature Summary before Data Preprocessing')
summary(LandSatis)
## State CovOwnedYear X1MoreCov CovAreaPropBand OccupationMG
## NSW: 33 <10 yrs : 79 No :253 <50% : 83 Manager : 52
## TAS: 54 >21 yrs :123 Yes: 49 >90% :135 Other : 42
## VIC:215 11-20 yrs:100 51%-90%: 84 Professional: 79
## Retired :129
## Resident TimeonLand Upbringing MemberOtherOrgs HowEnrol
## No :134 Full :164 Both : 58 No : 86 Other : 33
## Yes:168 None : 63 Other: 27 Yes:216 Self :226
## Quarters: 75 Rural: 85 Successor: 43
## Urban:132
## BenefitAttitude ContactFrequency IncentivesReceived Time.Over
## Other : 29 multiple: 83 No :143 Increase: 45
## positive:273 None :157 Yes:159 Maintain:227
## Other : 62 Other : 30
##
## AmendCovent FutureProtection AbsenteeOwners ManagerAge Burning
## No :268 Agreed :257 No :296 No :247 No :271
## Yes: 34 Neutral : 32 Yes: 6 Yes: 55 Yes: 31
## Disagreed: 13
##
## ExpenseTimeMoney Helpless LackInformation DeclinedOverYear DisatWithHelp
## No :256 No :291 No :297 No :283 No :294
## Yes: 46 Yes: 11 Yes: 5 Yes: 19 Yes: 8
##
##
## NoVisits SeeOfficerMore Weak Minimal NoVisits.1
## No :263 No :293 No :275 No :286 No :282
## Yes: 39 Yes: 9 Yes: 27 Yes: 16 Yes: 20
##
##
## NotTailoredToProperty UnhelpfulUseless ConcernedFuture PassToChild
## No :294 No :296 Disagreed:207 Disagreed:170
## Yes: 8 Yes: 6 Neutral : 44 Neutral : 69
## Agreed : 51 Agreed : 63
##
## SellBuyNew IncreaseArea Subdividing Sell
## Disagreed: 12 Disagreed: 24 Disagreed: 11 Disagreed: 35
## Neutral : 52 Neutral : 92 Neutral : 19 Neutral : 76
## Agreed :238 Agreed :186 Agreed :272 Agreed :191
##
## ConsiderSelling MaySell Age Education
## Disagreed: 39 Disagreed:125 <50 : 31 >Undergrad :124
## Neutral : 32 Neutral : 55 >71 : 64 Other :103
## Agreed :231 Agreed :122 51-60: 81 Undergraduate: 75
## 61-70:126
## Retired Income SatisClass
## No :166 <37K :109 Other : 44
## Yes:136 37k-80K : 97 Satisfied:258
## 80k-180K: 67
## Other : 29
fit_rf = randomForest(SatisClass~., data=LandSatis,importance=TRUE)
varImpPlot(fit_rf,type=2,main="Features Ranked By Importance")
fit_rf$importance
## Other Satisfied MeanDecreaseAccuracy
## State -2.271711e-03 -3.149903e-05 -3.323058e-04
## CovOwnedYear 1.069239e-03 1.223750e-03 1.267698e-03
## X1MoreCov -1.565189e-05 -1.927658e-04 -1.748612e-04
## CovAreaPropBand -1.179209e-03 2.912580e-04 1.426324e-04
## OccupationMG -4.617731e-03 -5.146716e-04 -1.156627e-03
## Resident -1.757032e-03 1.274034e-03 8.490760e-04
## TimeonLand 1.010185e-03 1.691343e-03 1.619284e-03
## Upbringing 1.248123e-03 3.207033e-04 4.958115e-04
## MemberOtherOrgs 2.765345e-04 -5.531056e-04 -4.351715e-04
## HowEnrol 8.325114e-03 5.923504e-04 1.713127e-03
## BenefitAttitude 5.721786e-02 9.517837e-03 1.655063e-02
## ContactFrequency 1.068429e-02 1.612796e-03 2.915223e-03
## IncentivesReceived -1.032482e-04 1.194164e-04 7.203278e-05
## Time.Over 4.155242e-02 7.132788e-03 1.212065e-02
## AmendCovent 3.662055e-03 1.663917e-03 1.980241e-03
## FutureProtection 2.250188e-02 5.135048e-03 7.641352e-03
## AbsenteeOwners 0.000000e+00 0.000000e+00 0.000000e+00
## ManagerAge 4.587625e-04 -4.049949e-04 -2.152897e-04
## Burning -3.715034e-04 -2.128406e-04 -2.238222e-04
## ExpenseTimeMoney -1.895822e-03 2.802956e-05 -2.489073e-04
## Helpless 6.534523e-04 2.747660e-04 3.752960e-04
## LackInformation 0.000000e+00 0.000000e+00 0.000000e+00
## DeclinedOverYear 4.017486e-05 1.020213e-04 7.270849e-05
## DisatWithHelp -1.049180e-03 6.691172e-05 -5.030070e-05
## NoVisits 2.370717e-02 5.072336e-03 7.667364e-03
## SeeOfficerMore -7.087846e-04 -1.985652e-04 -2.605779e-04
## Weak 1.911765e-03 1.820855e-04 4.151863e-04
## Minimal 6.487544e-04 -2.787412e-04 -1.309483e-04
## NoVisits.1 9.438658e-03 7.189636e-04 2.033448e-03
## NotTailoredToProperty -2.894003e-04 -2.166893e-04 -2.318897e-04
## UnhelpfulUseless 8.352187e-04 5.459802e-04 5.573020e-04
## ConcernedFuture -8.651772e-04 4.195069e-05 -9.916223e-05
## PassToChild 3.919174e-03 4.440595e-04 9.681192e-04
## SellBuyNew -5.258585e-03 1.434090e-03 5.226562e-04
## IncreaseArea 1.161536e-03 7.415455e-04 7.730714e-04
## Subdividing 5.735634e-04 1.426731e-03 1.336366e-03
## Sell 2.317464e-03 1.567608e-03 1.684565e-03
## ConsiderSelling 3.668163e-03 9.033049e-04 1.253283e-03
## MaySell 6.177454e-04 3.465914e-03 3.143054e-03
## Age 1.074870e-03 -1.128036e-03 -8.577329e-04
## Education 3.842451e-03 9.857853e-04 1.397835e-03
## Retired 4.880687e-04 7.153517e-04 7.327152e-04
## Income 8.302325e-03 7.676570e-04 1.914841e-03
## MeanDecreaseGini
## State 1.486710329
## CovOwnedYear 1.989519606
## X1MoreCov 0.689492973
## CovAreaPropBand 1.779948773
## OccupationMG 2.361741958
## Resident 0.988275348
## TimeonLand 1.585686683
## Upbringing 2.663571013
## MemberOtherOrgs 0.806735131
## HowEnrol 1.982917946
## BenefitAttitude 5.573576430
## ContactFrequency 2.136235708
## IncentivesReceived 1.050537524
## Time.Over 5.623616419
## AmendCovent 1.570160213
## FutureProtection 4.794133874
## AbsenteeOwners 0.012744442
## ManagerAge 0.978587295
## Burning 0.376796978
## ExpenseTimeMoney 0.484663798
## Helpless 1.101908632
## LackInformation 0.006676662
## DeclinedOverYear 0.571262241
## DisatWithHelp 0.519660364
## NoVisits 3.450852536
## SeeOfficerMore 0.294262345
## Weak 0.996938408
## Minimal 0.931078074
## NoVisits.1 2.096107994
## NotTailoredToProperty 0.336544681
## UnhelpfulUseless 0.799762496
## ConcernedFuture 1.780131708
## PassToChild 2.127068075
## SellBuyNew 0.848020280
## IncreaseArea 1.175138378
## Subdividing 1.834545684
## Sell 1.956436860
## ConsiderSelling 1.871679099
## MaySell 1.886784967
## Age 3.128267931
## Education 2.124888763
## Retired 0.817925524
## Income 3.720646893
landFS14 <- LandSatis %>% select(SatisClass,BenefitAttitude,Time.Over,FutureProtection,NoVisits,Income,Age,Upbringing,OccupationMG,ContactFrequency,Education,HowEnrol,PassToChild,CovOwnedYear,CovAreaPropBand)
#write.xlsx(landFS14, file = "landFS14.xlsx",append = FALSE)
To investigate the data further in terms of classification, it is necessary to find the most significant relationships between the satisfaction class and each factor. First, the data are separated into training and testing sets. Decision Tree, Random Forest and Boosting are then used for classification on the training data, and their prediction accuracies are compared on the testing data. The model with the best accuracy can therefore be used to identify the satisfaction class.
A decision tree is a predictive model used in machine learning that displays an algorithm through a tree-like graph. It can be applied to both regression and classification problems; a classification tree predicts which class an observation belongs to.
Random forest is an ensemble learning method for classification. It is also an improvement over a single decision tree because it corrects for decision trees’ habit of overfitting to their training set.
Boosting is a robust machine learning technique that builds a strong learner from several weak learners, where a weak learner is defined as a classifier that is only slightly correlated with the true classification and a strong learner is one that is well correlated with it. The variant used in our machine learning process is Zhu’s method (SAMME).
model <- na.omit(landFS14)
set.seed(999)
alpha=0.8
d = sort(sample(nrow(model), nrow(model)*alpha))
train = model[d,]
test = model[-d,]
set.seed(123)
decisionTreeFit <- caret :: train(SatisClass ~.,data=train, method="rpart", trControl = trainControl(method="cv", number=10), tuneLength = 100, parms=list(split='information'))
# tuneLength tells caret how many candidate values of the main tuning parameter (cp) to evaluate
confusionMatrix(predict(decisionTreeFit),train$SatisClass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Other Satisfied
## Other 12 0
## Satisfied 30 199
##
## Accuracy : 0.8755
## 95% CI : (0.8271, 0.9144)
## No Information Rate : 0.8257
## P-Value [Acc > NIR] : 0.02203
##
## Kappa : 0.3978
## Mcnemar's Test P-Value : 1.192e-07
##
## Sensitivity : 0.28571
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.86900
## Prevalence : 0.17427
## Detection Rate : 0.04979
## Detection Prevalence : 0.04979
## Balanced Accuracy : 0.64286
##
## 'Positive' Class : Other
##
decisionTreeFit$bestTune
#summary(decisionTreeFinal)
Draw the decision tree using the best tuned complexity parameter.
decisionTreeFinal <- rpart(SatisClass ~ ., method = "class", cp = 0.05194805, data = train)
cols <- ifelse(decisionTreeFinal$frame$yval == 1, "darkred", "green4")
# dark red for nodes predicting "Other" (yval == 1), green for "Satisfied"
prp(decisionTreeFinal, main = "Final Decision Tree Model",
extra = 106, # display the probability of "Satisfied" and the percentage of observations
nn = TRUE, # display the node numbers
fallen.leaves = TRUE, # put the leaves at the bottom of the page
shadow.col = "gray", # shadows under the leaves
branch.lty = 3, # draw branches using dotted lines
branch = .5, # change the angle of the branch lines
faclen = 0, # faclen = 0 to print full factor names
trace = 1, # print the auto-calculated cex, xlim, ylim
split.cex = 1.2, # make the split text larger than the node text
col = cols, border.col = cols, # node colours as defined above
split.box.col = "lightgray", # light gray split boxes (default is white)
split.border.col = "darkgray", # dark gray border on the split boxes
split.round = .5) # round the split box corners a tad
## cex 1 xlim c(-0.2, 1.2) ylim c(-0.1, 1.1)
Next, tune mtry, the number of variables randomly sampled as candidates at each split.
set.seed(999)
mtry <- caret :: train(SatisClass ~ ., method = "rf", data = train, importance = T,trControl = trainControl(method = "cv", number = 10))
plot(mtry)
print(mtry)
## Random Forest
##
## 241 samples
## 14 predictor
## 2 classes: 'Other', 'Satisfied'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 216, 217, 217, 218, 217, 217, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8467899 0.1855140
## 16 0.8464420 0.3849219
## 30 0.8337464 0.3733800
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
The cross-validation results show that mtry = 2 gives the highest accuracy, although mtry = 16 achieves almost the same accuracy with a considerably higher Kappa. Below, the random forest is refitted on the training data with randomForest(), using mtry = 14 so that all 14 predictors are considered at each split.
set.seed(999)
randomForestFit <- randomForest(formula = SatisClass ~ ., ntree=300, data=train, mtry= 14)
plot(randomForestFit)
Finally, we fit the random forest model to the training data. Plotting the model shows that after about 70 trees the error changes very little, although it still fluctuates slightly.
print(randomForestFit)
##
## Call:
## randomForest(formula = SatisClass ~ ., data = train, ntree = 300, mtry = 14)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 14
##
## OOB estimate of error rate: 15.35%
## Confusion matrix:
## Other Satisfied class.error
## Other 19 23 0.54761905
## Satisfied 14 185 0.07035176
Printing the model shows that 14 variables were tried at each split and that the out-of-bag (OOB) estimate of the error rate is 15.35%. The model does not fit the data perfectly: a substantial share of the 'Other' class is misclassified (a class error of about 55%), which is caused by the small number of 'Other' observations relative to 'Satisfied'.
varImpPlot(randomForestFit, sort = T, main="Overall Variable Importance")
It is also important to examine the variable importance. This plot indicates which variables had the greatest impact in the classification model.
First we perform 10-fold cross-validation with caret's train function to find suitable parameters for the boosting model; the tuning code and its output are not shown here because they take a long time to run (a sketch of the tuning call is given below). The best tune uses maxdepth = 10 and 50 trees with the 'Zhu' (SAMME) coefficient type. There are some limitations at this stage: caret's default tuning grid only considers a maximum tree depth of 3, which may not capture enough information, while very deep trees can easily cause an overfitting problem. We therefore fix maxdepth = 10 and mfinal = 50 and use the error-evolution method to find the optimal number of iterations.
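Since the tuning code is omitted from the report, the following is only a minimal sketch of what such a search could look like, assuming caret's AdaBoost.M1 method (which wraps adabag::boosting); the grid values and object names (boostGrid, boostTune) are illustrative assumptions, not the authors' exact settings.
library(caret)
library(adabag)
# Illustrative tuning grid; caret's AdaBoost.M1 method tunes mfinal, maxdepth and coeflearn
boostGrid <- expand.grid(mfinal = c(50, 100, 150),
                         maxdepth = c(1, 2, 3),
                         coeflearn = c("Breiman", "Freund", "Zhu"))
set.seed(999)
boostTune <- caret::train(SatisClass ~ ., data = train,
                          method = "AdaBoost.M1",
                          trControl = trainControl(method = "cv", number = 10),
                          tuneGrid = boostGrid)
boostTune$bestTune # best combination of mfinal, maxdepth and coeflearn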
We now build two initial models using the parameters gathered above, differing only in the bootstrap weight-selection method (boos = FALSE versus boos = TRUE).
model1 <- boosting(SatisClass~., data=train, boos = F, mfinal = 50, coeflearn = 'Zhu',
control=rpart.control(maxdepth = 10))
model2 <- boosting(SatisClass~., data=train, boos = T, mfinal = 50, coeflearn = 'Zhu',
control=rpart.control(maxdepth = 10))
evoltrain=errorevol(model1,train)
evoltest=errorevol(model1,test)
plot(evoltest,evoltrain)
evoltrain=errorevol(model2,train)
evoltest=errorevol(model2,test)
plot(evoltest,evoltrain)
From the error-evolution plots we can observe that the best-fitting range is between 1 and 3 iterations when boos = F. We therefore extract the confusion matrix and fitting accuracy at this setting through cross-validation, and then use the optimal coefficients to fit the final boosting model for further study.
set.seed(5)
boostcv <- boosting.cv(SatisClass ~.,data=train, v=10, boos=F, mfinal=2, coeflearn="Zhu",
control=rpart.control(maxdepth = 10))
## i: 1 Tue Mar 12 17:52:52 2019
## i: 2 Tue Mar 12 17:52:53 2019
## i: 3 Tue Mar 12 17:52:55 2019
## i: 4 Tue Mar 12 17:52:56 2019
## i: 5 Tue Mar 12 17:52:57 2019
## i: 6 Tue Mar 12 17:52:58 2019
## i: 7 Tue Mar 12 17:52:59 2019
## i: 8 Tue Mar 12 17:53:00 2019
## i: 9 Tue Mar 12 17:53:01 2019
## i: 10 Tue Mar 12 17:53:02 2019
boostcv$confusion
## Observed Class
## Predicted Class Other Satisfied
## Other 16 9
## Satisfied 26 190
1-boostcv$error
## [1] 0.8547718
Finalmodel <- boosting(SatisClass ~.,data=train, v=10,boos =F, mfinal=2, coeflearn="Zhu",
control=rpart.control(maxdepth = 10))
mean(Finalmodel$class==test$SatisClass)
## [1] 0.8547718
There is a certain level of deviation between the cross-validated accuracy and the fitted model's accuracy, which possibly indicates overfitting. We decide to keep this model and discuss it further during model selection.
margins.train <- margins(Finalmodel,train)[[1]]
margins.test <- margins(Finalmodel, test)[[1]]
plot(sort(margins.train), (1:length(margins.train)) /length(margins.train), type = "l", xlim = c(-1,1),
main = "Margin cumulative distribution graph", xlab = "Margin",
ylab = "% observations", col = "blue3", lty = 2, lwd = 2)
abline(v = 0, col = "red", lty = 2, lwd = 2)
lines(sort(margins.test), (1:length(margins.test)) / length(margins.test),
type = "l", cex = .5, col = "green", lwd = 2)
legend("topleft", c("test", "train"), col = c("green", "blue3"),lty = 1:2, lwd = 2)
The margin plot shows the cumulative distribution of margins for the boosting classifier developed in this application. The margins of the training set are coloured blue and those of the test set green. For SAMME, roughly 20% of observations have negative margins, corresponding to training error, while more than 50% of observations in both the training and testing sets reach a positive margin.
Finalmodel$importance
## Age BenefitAttitude ContactFrequency CovAreaPropBand
## 9.102804 14.518112 7.901278 0.000000
## CovOwnedYear Education FutureProtection HowEnrol
## 6.746169 6.388062 6.162331 0.000000
## Income NoVisits OccupationMG PassToChild
## 10.011330 12.866171 0.000000 3.824880
## Time.Over Upbringing
## 22.478862 0.000000
imp = data.frame(names(Finalmodel$importance),Finalmodel$importance)
rownames(imp) = NULL
colnames(imp) = c("names","imp")
imp1 = imp[-c(which(imp$imp<1)),]
g <- ggplot(imp1, aes(x=reorder(names, imp),y = imp))
g + geom_bar( stat = "identity" ,width = 0.5) +coord_flip()+ theme(axis.text.x = element_text(vjust=0.6),
plot.caption = element_text(hjust = 0),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
plot.margin = unit(c(0,1,0,0), "cm"),
legend.position = "bottom")+
labs(title="Overall Variable Importance")
The variable importance shows which variables were influential and which were of little use in this study. Time.Over plays the most important role, while BenefitAttitude, NoVisits, Income and Age are important as well. Upbringing, HowEnrol, OccupationMG and CovAreaPropBand contribute nothing in this boosting model.
Fitted accuracy of the decision tree model on the training data.
decisionTreePredict <- predict(decisionTreeFinal, train, type = "class")
mean(decisionTreePredict==train$SatisClass)
## [1] 0.8755187
Fitted accuracy of the random forest model on the training data. (Random forest is a bagging technique, not a boosting technique. In boosting, as the name suggests, each learner learns from the previous ones, which in turn boosts the learning; the trees in a random forest are grown in parallel, with no interaction between them while they are built.)
randomForestPredict <- predict(randomForestFit, train, type='class')
mean(randomForestPredict==train$SatisClass)
## [1] 1
Fitted accuracy of the boosting model on the training data. (The fundamental difference between bagging and random forests is that in a random forest only a random subset of the features is considered at each node, and the best split is chosen from that subset, whereas in bagging all features are considered when splitting a node.)
boostingfit=predict.boosting(Finalmodel,train,type='class')
1-boostingfit$error
## [1] 0.8879668
Prediction Accuracy of Each Method
result=matrix(c(0.9672131, 0.9180328, 0.9344262, 0.8755187, 1, 0.8879668),nrow=3,ncol=2)
row.names(result)=c("Decision Tree","Random Forest","Boosting")
colnames(result)=c("Test set","Training set")
result
## Test set Training set
## Decision Tree 0.9672131 0.8755187
## Random Forest 0.9180328 1.0000000
## Boosting 0.9344262 0.8879668
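The test-set accuracies in the table above are entered manually; the following is a minimal sketch of how they could be reproduced on the held-out test set, assuming the model objects fitted earlier (the names dtTestAcc, rfTestAcc and bstTestAcc are illustrative, not from the original report).
# Sketch: test-set accuracy for each model (not the authors' original code)
dtTestAcc <- mean(predict(decisionTreeFinal, test, type = "class") == test$SatisClass)
rfTestAcc <- mean(predict(randomForestFit, test) == test$SatisClass)
bstTestAcc <- 1 - predict.boosting(Finalmodel, newdata = test)$error
round(c(DecisionTree = dtTestAcc, RandomForest = rfTestAcc, Boosting = bstTestAcc), 4)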
As the results show, the random forest reaches a test-set prediction accuracy of about 91.8% (with a cross-validated accuracy of roughly 85%), and it is the least biased of the three techniques. Although the testing results for the decision tree and boosting models are better than their training results, this can be considered a reasonable deviation since the testing data set is not large. Therefore, the random forest should be used to classify new data, as it offers the best overall performance.
confusionMatrix(predict(decisionTreeFit),train$SatisClass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Other Satisfied
## Other 12 0
## Satisfied 30 199
##
## Accuracy : 0.8755
## 95% CI : (0.8271, 0.9144)
## No Information Rate : 0.8257
## P-Value [Acc > NIR] : 0.02203
##
## Kappa : 0.3978
## Mcnemar's Test P-Value : 1.192e-07
##
## Sensitivity : 0.28571
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.86900
## Prevalence : 0.17427
## Detection Rate : 0.04979
## Detection Prevalence : 0.04979
## Balanced Accuracy : 0.64286
##
## 'Positive' Class : Other
##
print(randomForestFit)
##
## Call:
## randomForest(formula = SatisClass ~ ., data = train, ntree = 300, mtry = 14)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 14
##
## OOB estimate of error rate: 15.35%
## Confusion matrix:
## Other Satisfied class.error
## Other 19 23 0.54761905
## Satisfied 14 185 0.07035176
boostcv$confusion
## Observed Class
## Predicted Class Other Satisfied
## Other 16 9
## Satisfied 26 190
This part specifies an appropriate logistic regression model. A linear combination of all variables is used to initialise the model, backward elimination is then applied for feature selection, and the model is finalised by manually inspecting the significance levels of the remaining variables.
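The model-fitting code for this step is not echoed in the report; the following is a minimal sketch of what it presumably looked like, based on the glm call and step trace shown below (the object names mod.full and mod.step are assumptions).
library(car) # for Anova(); masks recode (dplyr), logit (psych) and some (purrr)
# Full model: SatisClass against all 14 selected features
mod.full <- glm(SatisClass ~ ., family = binomial(link = "logit"), data = landFS14)
summary(mod.full)
Anova(mod.full) # Type II tests for each term
# Backward elimination by AIC
mod.step <- step(mod.full, direction = "backward")
summary(mod.step)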
##
## Call:
## glm(formula = SatisClass ~ ., family = binomial(link = "logit"),
## data = landFS14)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.76209 0.04841 0.15797 0.35136 2.13338
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.996130 1.794132 -1.113 0.265885
## BenefitAttitudepositive 2.558253 0.740217 3.456 0.000548 ***
## Time.OverMaintain 0.636065 0.664939 0.957 0.338781
## Time.OverOther -1.514455 0.895921 -1.690 0.090954 .
## FutureProtectionNeutral -1.269952 0.722045 -1.759 0.078607 .
## FutureProtectionDisagreed -3.702910 1.004160 -3.688 0.000226 ***
## NoVisitsYes -1.955004 0.697485 -2.803 0.005064 **
## Income37k-80K 0.577299 0.704258 0.820 0.412372
## Income80k-180K 0.233914 0.736916 0.317 0.750923
## IncomeOther -0.777529 0.760815 -1.022 0.306796
## Age>71 0.763629 1.013552 0.753 0.451198
## Age51-60 2.698131 1.007492 2.678 0.007405 **
## Age61-70 1.461239 0.890387 1.641 0.100771
## UpbringingOther 0.049727 1.036073 0.048 0.961720
## UpbringingRural 0.655945 0.834693 0.786 0.431955
## UpbringingUrban -0.005301 0.684460 -0.008 0.993821
## OccupationMGOther -0.002758 1.051041 -0.003 0.997906
## OccupationMGProfessional 0.664719 0.940786 0.707 0.479842
## OccupationMGRetired -0.468876 0.859749 -0.545 0.585503
## ContactFrequencyNone -0.513333 0.562964 -0.912 0.361854
## ContactFrequencyOther 2.859863 1.723381 1.659 0.097025 .
## EducationOther -0.495481 0.623868 -0.794 0.427074
## EducationUndergraduate -0.453342 0.685384 -0.661 0.508328
## HowEnrolSelf 0.765066 0.903382 0.847 0.397056
## HowEnrolSuccessor 0.248562 0.963117 0.258 0.796345
## PassToChildNeutral -0.419658 0.669851 -0.626 0.530990
## PassToChildAgreed -0.567413 0.681375 -0.833 0.404987
## CovOwnedYear>21 yrs 1.348955 0.830791 1.624 0.104440
## CovOwnedYear11-20 yrs 0.255900 0.759610 0.337 0.736205
## CovAreaPropBand>90% 0.599616 0.721400 0.831 0.405869
## CovAreaPropBand51%-90% 0.309621 0.752650 0.411 0.680798
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 250.76 on 301 degrees of freedom
## Residual deviance: 127.32 on 271 degrees of freedom
## AIC: 189.32
##
## Number of Fisher Scoring iterations: 8
The full model for SatisClass shows that BenefitAttitude, FutureProtection, NoVisits and Age have a significant impact on satisfaction. Running an Anova (from the car library) on the model reveals that these terms have p-values below 0.05; they are statistically significant, so we choose not to omit them. We then use the step function to optimise the combination of variables in the model.
## Start: AIC=189.32
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Income + Age + Upbringing + OccupationMG + ContactFrequency +
## Education + HowEnrol + PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - Upbringing 3 128.31 184.31
## - OccupationMG 3 129.20 185.20
## - Income 3 129.54 185.54
## - CovAreaPropBand 2 128.02 186.02
## - Education 2 128.09 186.09
## - HowEnrol 2 128.13 186.13
## - PassToChild 2 128.15 186.15
## <none> 127.32 189.32
## - CovOwnedYear 2 131.35 189.35
## - ContactFrequency 2 135.03 193.03
## - Age 3 137.23 193.23
## - Time.Over 2 135.95 193.95
## - NoVisits 1 135.40 195.40
## - BenefitAttitude 1 139.76 199.76
## - FutureProtection 2 143.57 201.57
##
## Step: AIC=184.31
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Income + Age + OccupationMG + ContactFrequency +
## Education + HowEnrol + PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - Income 3 130.40 180.40
## - OccupationMG 3 130.50 180.50
## - CovAreaPropBand 2 128.79 180.79
## - HowEnrol 2 128.92 180.92
## - Education 2 129.07 181.07
## - PassToChild 2 129.66 181.66
## <none> 128.31 184.31
## - CovOwnedYear 2 132.40 184.40
## - Age 3 137.65 187.65
## - ContactFrequency 2 136.06 188.06
## - Time.Over 2 137.97 189.97
## - NoVisits 1 137.23 191.23
## - BenefitAttitude 1 140.03 194.03
## - FutureProtection 2 143.96 195.96
##
## Step: AIC=180.4
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + Education +
## HowEnrol + PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - HowEnrol 2 131.10 177.10
## - Education 2 131.29 177.29
## - CovAreaPropBand 2 131.45 177.45
## - PassToChild 2 131.54 177.54
## - OccupationMG 3 134.04 178.04
## <none> 130.40 180.40
## - CovOwnedYear 2 134.68 180.68
## - Age 3 140.19 184.19
## - ContactFrequency 2 138.73 184.73
## - Time.Over 2 140.23 186.23
## - BenefitAttitude 1 141.50 189.50
## - NoVisits 1 141.75 189.75
## - FutureProtection 2 149.84 195.84
##
## Step: AIC=177.1
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + Education +
## PassToChild + CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - Education 2 131.97 173.97
## - PassToChild 2 132.07 174.07
## - CovAreaPropBand 2 132.09 174.09
## - OccupationMG 3 134.72 174.72
## <none> 131.10 177.10
## - CovOwnedYear 2 138.27 180.27
## - Age 3 140.93 180.93
## - ContactFrequency 2 139.26 181.26
## - Time.Over 2 141.76 183.76
## - BenefitAttitude 1 143.46 187.46
## - NoVisits 1 144.00 188.00
## - FutureProtection 2 149.85 191.85
##
## Step: AIC=173.97
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + PassToChild +
## CovOwnedYear + CovAreaPropBand
##
## Df Deviance AIC
## - PassToChild 2 132.93 170.93
## - CovAreaPropBand 2 133.17 171.17
## - OccupationMG 3 135.59 171.59
## <none> 131.97 173.97
## - CovOwnedYear 2 138.69 176.69
## - ContactFrequency 2 140.32 178.32
## - Age 3 142.40 178.40
## - Time.Over 2 143.91 181.91
## - BenefitAttitude 1 144.49 184.49
## - NoVisits 1 145.29 185.29
## - FutureProtection 2 149.96 187.96
##
## Step: AIC=170.93
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + CovOwnedYear +
## CovAreaPropBand
##
## Df Deviance AIC
## - CovAreaPropBand 2 134.41 168.41
## - OccupationMG 3 137.37 169.37
## <none> 132.93 170.93
## - CovOwnedYear 2 139.47 173.47
## - Age 3 142.66 174.66
## - ContactFrequency 2 140.87 174.87
## - Time.Over 2 144.78 178.78
## - NoVisits 1 146.21 182.21
## - BenefitAttitude 1 147.07 183.07
## - FutureProtection 2 150.92 184.92
##
## Step: AIC=168.4
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + OccupationMG + ContactFrequency + CovOwnedYear
##
## Df Deviance AIC
## - OccupationMG 3 137.71 165.71
## <none> 134.41 168.41
## - CovOwnedYear 2 140.26 170.26
## - Age 3 143.84 171.84
## - ContactFrequency 2 142.40 172.40
## - Time.Over 2 146.74 176.74
## - NoVisits 1 146.76 178.76
## - BenefitAttitude 1 148.22 180.22
## - FutureProtection 2 152.75 182.75
##
## Step: AIC=165.71
## SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + ContactFrequency + CovOwnedYear
##
## Df Deviance AIC
## <none> 137.71 165.71
## - CovOwnedYear 2 143.78 167.78
## - Age 3 148.56 170.56
## - ContactFrequency 2 146.75 170.75
## - Time.Over 2 150.22 174.22
## - NoVisits 1 149.85 175.85
## - BenefitAttitude 1 152.15 178.15
## - FutureProtection 2 159.06 183.06
##
## Call:
## glm(formula = SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + ContactFrequency + CovOwnedYear, family = binomial(link = "logit"),
## data = landFS14)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.74026 0.07737 0.17708 0.39234 1.90179
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6674 1.0690 -0.624 0.532440
## BenefitAttitudepositive 2.3532 0.6282 3.746 0.000180 ***
## Time.OverMaintain 0.8141 0.5980 1.361 0.173408
## Time.OverOther -1.4157 0.7696 -1.840 0.065835 .
## FutureProtectionNeutral -1.7033 0.6338 -2.687 0.007202 **
## FutureProtectionDisagreed -3.5600 0.9334 -3.814 0.000137 ***
## NoVisitsYes -2.0828 0.5975 -3.486 0.000490 ***
## Age>71 -0.2533 0.7406 -0.342 0.732337
## Age51-60 1.7590 0.8342 2.109 0.034984 *
## Age61-70 0.8885 0.7178 1.238 0.215736
## ContactFrequencyNone -0.6438 0.5213 -1.235 0.216787
## ContactFrequencyOther 2.1547 1.2971 1.661 0.096693 .
## CovOwnedYear>21 yrs 1.4842 0.6559 2.263 0.023644 *
## CovOwnedYear11-20 yrs 0.4543 0.5778 0.786 0.431684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 250.76 on 301 degrees of freedom
## Residual deviance: 137.71 on 288 degrees of freedom
## AIC: 165.71
##
## Number of Fisher Scoring iterations: 7
According to the smallest AIC (165.71), the best model is formulated as: SatisClass ~ BenefitAttitude + Time.Over + FutureProtection + NoVisits + Age + ContactFrequency + CovOwnedYear. Running an Anova on this model reveals that the p-value for CovOwnedYear is close to 0.05; however, since some levels of CovOwnedYear are statistically significant, we choose not to omit it, and we rebuild the model as the final model shown below.
mod.final = glm(SatisClass ~ BenefitAttitude + Time.Over + FutureProtection + NoVisits + Age + ContactFrequency + CovOwnedYear, family = binomial(link = "logit"),
data = model)
summary(mod.final)
##
## Call:
## glm(formula = SatisClass ~ BenefitAttitude + Time.Over + FutureProtection +
## NoVisits + Age + ContactFrequency + CovOwnedYear, family = binomial(link = "logit"),
## data = model)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.74026 0.07737 0.17708 0.39234 1.90179
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6674 1.0690 -0.624 0.532440
## BenefitAttitudepositive 2.3532 0.6282 3.746 0.000180 ***
## Time.OverMaintain 0.8141 0.5980 1.361 0.173408
## Time.OverOther -1.4157 0.7696 -1.840 0.065835 .
## FutureProtectionNeutral -1.7033 0.6338 -2.687 0.007202 **
## FutureProtectionDisagreed -3.5600 0.9334 -3.814 0.000137 ***
## NoVisitsYes -2.0828 0.5975 -3.486 0.000490 ***
## Age>71 -0.2533 0.7406 -0.342 0.732337
## Age51-60 1.7590 0.8342 2.109 0.034984 *
## Age61-70 0.8885 0.7178 1.238 0.215736
## ContactFrequencyNone -0.6438 0.5213 -1.235 0.216787
## ContactFrequencyOther 2.1547 1.2971 1.661 0.096693 .
## CovOwnedYear>21 yrs 1.4842 0.6559 2.263 0.023644 *
## CovOwnedYear11-20 yrs 0.4543 0.5778 0.786 0.431684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 250.76 on 301 degrees of freedom
## Residual deviance: 137.71 on 288 degrees of freedom
## AIC: 165.71
##
## Number of Fisher Scoring iterations: 7
Anova(mod.final)
coef(mod.final)
## (Intercept) BenefitAttitudepositive
## -0.6673514 2.3531811
## Time.OverMaintain Time.OverOther
## 0.8141005 -1.4156674
## FutureProtectionNeutral FutureProtectionDisagreed
## -1.7033414 -3.5599500
## NoVisitsYes Age>71
## -2.0828187 -0.2533093
## Age51-60 Age61-70
## 1.7589539 0.8885433
## ContactFrequencyNone ContactFrequencyOther
## -0.6438065 2.1546702
## CovOwnedYear>21 yrs CovOwnedYear11-20 yrs
## 1.4841905 0.4543421
# Model diagnostic with odds ratio
model.coef <- summary(mod.final)$coefficients[, c("Estimate", "Std. Error", "Pr(>|z|)")]
# display coefficients of the final model together with their odds ratios
OR.mm <- exp(summary(mod.final)$coefficients[,1])
cbind("Odds Ratio (OR)" = round(OR.mm,4), round(model.coef,5))
## Odds Ratio (OR) Estimate Std. Error Pr(>|z|)
## (Intercept) 0.5131 -0.66735 1.06898 0.53244
## BenefitAttitudepositive 10.5190 2.35318 0.62822 0.00018
## Time.OverMaintain 2.2571 0.81410 0.59802 0.17341
## Time.OverOther 0.2428 -1.41567 0.76957 0.06584
## FutureProtectionNeutral 0.1821 -1.70334 0.63383 0.00720
## FutureProtectionDisagreed 0.0284 -3.55995 0.93336 0.00014
## NoVisitsYes 0.1246 -2.08282 0.59747 0.00049
## Age>71 0.7762 -0.25331 0.74063 0.73234
## Age51-60 5.8064 1.75895 0.83421 0.03498
## Age61-70 2.4316 0.88854 0.71776 0.21574
## ContactFrequencyNone 0.5253 -0.64381 0.52125 0.21679
## ContactFrequencyOther 8.6250 2.15467 1.29713 0.09669
## CovOwnedYear>21 yrs 4.4114 1.48419 0.65589 0.02364
## CovOwnedYear11-20 yrs 1.5751 0.45434 0.57781 0.43168
Variable importance for the variables in the final logistic regression model.
From the plot we can see the ten most important variables in the logistic regression model.
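The code that produced the importance plot is not shown in the report; the following is a rough sketch under the assumption that importance is measured by the absolute z-statistic of each fitted coefficient (a common proxy, e.g. what caret's varImp uses for linear models); the object names z.imp and imp.glm are illustrative.
library(ggplot2)
# Sketch: importance as the absolute z-statistic of each coefficient (intercept excluded)
z.imp <- abs(summary(mod.final)$coefficients[-1, "z value"])
imp.glm <- data.frame(names = names(z.imp), imp = z.imp)
imp.glm <- imp.glm[order(-imp.glm$imp), ][1:10, ] # keep the top 10
ggplot(imp.glm, aes(x = reorder(names, imp), y = imp)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_flip() +
  labs(title = "Variable Importance - Final Logistic Regression Model",
       x = NULL, y = "|z value|")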
ROC curves of the final model using the training dataset (top), the test dataset (middle) and the entire dataset (bottom).
A Receiver Operating Characteristic (ROC) curve plots the true positive rate against the false positive rate at different cut-off points of a diagnostic test, displaying the trade-off between sensitivity and specificity. The generated curve follows the left-hand and top borders of the ROC space, covering most of the area (AUC = 0.92), so it can be concluded that the model determines whether a landholder is Satisfied or Other with roughly 90% accuracy.
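The ROC code is likewise not shown; below is a minimal sketch for the entire dataset only, assuming the pROC package. It also defines prob.model, the vector of fitted probabilities that the cut-off search below relies on.
library(pROC)
# Fitted probabilities of being "Satisfied" for the (NA-free) modelling dataset
prob.model <- predict(mod.final, newdata = model, type = "response")
roc.model <- roc(model$SatisClass, prob.model, levels = c("Other", "Satisfied"))
plot(roc.model, main = "ROC Curve - Final Logistic Regression Model")
auc(roc.model)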
To find the optimal cut-off value:
cutoffs <- seq(0.1, 0.9, 0.02)
accuracy <- NULL
for (i in seq(along = cutoffs)) {
prediction <- ifelse(prob.model >= cutoffs[i], 1, 0) #Predicting for cut-off
accuracy <- c(accuracy, length(which(mod.final$y == prediction))/length(prediction) *
100)
}
plot(cutoffs, accuracy, pch = 19, type = "b", col = "steelblue", main = "Cutoff Value Performance in Final Model",
xlab = "Cutoff Level", ylab = "Accuracy %")
cutoffs[which.max(accuracy)]
Next, an appropriate threshold for the logistic regression model needs to be found. Forty-one candidate values between 0.1 and 0.9 are tested for accuracy, and the plot shows how the accuracy changes under different cut-off values.
Once the optimal cut-off value is determined, the confusion matrix can be generated. The accuracy of the prediction using the finalised model is 92.16%. Moreover, the no-information rate (the best guess given no information beyond the overall distribution of the classes being predicted) is lower than the accuracy, which shows that the model can be trusted to a certain extent.
library(caret)
# Get predictions for the entire dataset using the final model: probabilities
# greater than or equal to the cutoff (0.42) are classified as "Satisfied",
# otherwise "Other"
pre.class <- ifelse(prob.model >= 0.42, "Satisfied", "Other")
pre.class <- as.factor(pre.class)
table(pre.class)
# display the confusion matrix at the 0.42 cutoff
confusionMatrix(pre.class, landFS14$SatisClass)
We used logistic regression with a linear combination of variables to construct the model, optimised it with backward elimination, and checked its feasibility using ROC curves. In the end, only 7 of the 14 candidate variables are considered to have a significant effect on the probability of a landholder being Satisfied rather than Other: BenefitAttitude, Time.Over, FutureProtection, NoVisits, Age, ContactFrequency and CovOwnedYear formulate the model used to predict a landholder's satisfaction level. Roughly 92% of satisfaction levels can be predicted correctly when the corresponding information is given.
Surprisingly, some variables such as Income and Education were removed from the model during the elimination process.