library(knitr)
knitr::opts_chunk$set(fig.width = 12,fig.height = 8)
library(readr)
# read the dataset
EssayData <- read_csv("Disputed_Essay_data.csv")
# Create dataset including only explanatory factors
DfNum <- EssayData[,-1:-2]
sum(!complete.cases(EssayData))
## [1] 0
This Dataset has no missing value, which is clean enough.
# Create vectors for Author labels
index <- rep(NA,85)
for (i in 1:85){
index[i] <- paste(substr(EssayData$author[i],1,2),i)
}
# Change rownames for the dataframe for futher demonstration of Clustering/PCA methods
rownames(DfNum) <- index
DFTree <- EssayData[,-2]
# Using stratified sampling methods to split the dataset into training and testing datasets
Ham <- DFTree[12:62,]
Mad <- DFTree[71:85,]
SizeH <- floor(nrow(Ham)*0.8) # sample size from Hamilton
SizeM <- floor(nrow(Mad)*0.8) # sample size from not Madison
#Sample index generating
H <- sample(12:62,SizeH,replace = F)
M <- sample(71:85,SizeM,replace = F)
# Training Data
TrDF <- DFTree[c(H,M),]
# Testing Data
TeDF <- DFTree[-c(H,M,63:70),]
TeDF <- TeDF[-1:-11,]
# Prediction Data
PreDF <- DFTree[1:11,]
colnames(TrDF) <- make.names(colnames(TrDF))
colnames(TeDF) <- make.names(colnames(TeDF))
colnames(PreDF) <- make.names(colnames(PreDF))
# Data Standardization
DfNum <- scale(DfNum,center = T,scale = T)
# summary(DfNum)
# Distance Function and visualization
library(factoextra)
# Initial Visualization
distance <- get_dist(DfNum,method = 'euclidean')
fviz_dist(distance,gradient = list(low='#00AFBB',mid='white',high='#FC4E07'))
From the graph, we can see that Dispute 1,3,4,5,6,7,8 is more similar to Maison,While dipute 2 is more similar to joint authorship-HM, and Ja 66, Ma 83,Ma76 also share more features with HM.
# Elbow method to decide Optimal Number of clusters
set.seed(8)
wss <- function(k){
return(kmeans(DfNum,k,nstart = 25)$tot.withinss)
}
k_value <- 1:10
wss_value <- purrr::map_dbl(k_value,wss)
plot(x=k_value,y=wss_value,
type = 'b',frame=F,
xlab = 'Number of clusters K',
ylab = 'Total within-clusters sum of square')
# There is no apparent bend(knee) in this plot, it's hard to decide the k clusters by this
# Using Silhouette method
fviz_nbclust(DfNum,kmeans,method = 'silhouette')+labs(sutitle='Silhouette method')
# Gap statistics
set.seed(123)
fviz_nbclust(DfNum,kmeans,nstart=25,method='gap_stat',nboot=50)+labs(subtitle = 'Gap Statistic method')
# From the gap statistics, either choose 2 or 4 as numbers of clusters
Observation: * There is bigger difference while using 2 clusters, which indicates groups between Ha and non-Han. * Dipute 1, 3, 6, 7 and 10 is both close to Halmiton. * Di 4,5,9 are exclusively close to Ma. * Di 2 is close to Ma and HM
# Using 4
km_output4 <- kmeans(DfNum,centers = 4,nstart = 25,iter.max = 1000,algorithm = 'Hartigan-Wong')
fviz_cluster(km_output4,data=DfNum)
# Using 2
km_output2 <- kmeans(DfNum,centers = 2,nstart = 25,iter.max = 1000,algorithm = 'Hartigan-Wong')
#str(km_output2)
fviz_cluster(km_output2,data=DfNum)
pca <- prcomp(DfNum,scale. = T,center = T)
#summary(pca)
pcaDf <- as.data.frame(pca$rotation)
PC 1-PC 10 collectively explain 49.83% of the original information
cluster <- km_output4$cluster
pcs$cluster <- cluster[match(rownames(pcs),names(cluster))]
library(ggplot2)
ggplot(pcs,aes(x=PC1,y=PC2))+geom_point(col=cluster)+geom_text(label=rownames(pcs))
Observation: * Articles of Joint Authorship are closer to Madison on the graph.
The following result shows that the Hierarchical Clustering is not that efficient for clustering compared with Kmeans due to the subtle difference. Not be able to produce desirable number of clusters by consistent comparison methods
hac_output <- hclust(dist(DfNum,method = 'euclidean'),method = 'complete')
plot(hac_output)
hac_cut2 <- cutree(hac_output,2)
for (i in 1:length(hac_cut2)) {
if(hac_cut2[i]!=km_output2$cluster[i]) print(names(hac_cut2)[i])
}
## [1] "di 1"
## [1] "di 2"
## [1] "di 3"
## [1] "di 4"
## [1] "di 5"
## [1] "di 6"
## [1] "di 8"
## [1] "di 9"
## [1] "di 10"
## [1] "di 11"
## [1] "Ha 15"
## [1] "Ha 36"
## [1] "Ha 62"
## [1] "HM 63"
## [1] "HM 64"
## [1] "HM 65"
## [1] "Ja 66"
## [1] "Ja 67"
## [1] "Ja 70"
## [1] "Ma 71"
## [1] "Ma 72"
## [1] "Ma 73"
## [1] "Ma 74"
## [1] "Ma 75"
## [1] "Ma 76"
## [1] "Ma 77"
## [1] "Ma 78"
## [1] "Ma 79"
## [1] "Ma 80"
## [1] "Ma 81"
## [1] "Ma 82"
## [1] "Ma 83"
## [1] "Ma 84"
## [1] "Ma 85"
hac_cut4 <- cutree(hac_output,4)
for (i in 1:length(hac_cut4)) {
if(hac_cut4[i]!=km_output4$cluster[i]) print(names(hac_cut4)[i])
}
## [1] "di 1"
## [1] "di 2"
## [1] "di 3"
## [1] "di 4"
## [1] "di 5"
## [1] "di 6"
## [1] "di 7"
## [1] "di 8"
## [1] "di 9"
## [1] "di 10"
## [1] "di 11"
## [1] "Ha 12"
## [1] "Ha 13"
## [1] "Ha 14"
## [1] "Ha 15"
## [1] "Ha 16"
## [1] "Ha 17"
## [1] "Ha 18"
## [1] "Ha 19"
## [1] "Ha 20"
## [1] "Ha 21"
## [1] "Ha 22"
## [1] "Ha 23"
## [1] "Ha 24"
## [1] "Ha 25"
## [1] "Ha 26"
## [1] "Ha 27"
## [1] "Ha 28"
## [1] "Ha 29"
## [1] "Ha 30"
## [1] "Ha 31"
## [1] "Ha 32"
## [1] "Ha 33"
## [1] "Ha 34"
## [1] "Ha 35"
## [1] "Ha 36"
## [1] "Ha 37"
## [1] "Ha 38"
## [1] "Ha 39"
## [1] "Ha 40"
## [1] "Ha 41"
## [1] "Ha 42"
## [1] "Ha 43"
## [1] "Ha 44"
## [1] "Ha 45"
## [1] "Ha 46"
## [1] "Ha 47"
## [1] "Ha 48"
## [1] "Ha 49"
## [1] "Ha 50"
## [1] "Ha 51"
## [1] "Ha 52"
## [1] "Ha 53"
## [1] "Ha 54"
## [1] "Ha 55"
## [1] "Ha 56"
## [1] "Ha 57"
## [1] "Ha 58"
## [1] "Ha 59"
## [1] "Ha 60"
## [1] "Ha 61"
## [1] "Ha 62"
## [1] "HM 64"
## [1] "HM 65"
## [1] "Ja 68"
## [1] "Ja 69"
## [1] "Ja 70"
## [1] "Ma 71"
## [1] "Ma 72"
## [1] "Ma 73"
## [1] "Ma 74"
## [1] "Ma 75"
## [1] "Ma 77"
## [1] "Ma 78"
## [1] "Ma 79"
## [1] "Ma 80"
## [1] "Ma 81"
## [1] "Ma 82"
## [1] "Ma 83"
## [1] "Ma 84"
## [1] "Ma 85"
However, from intuitively reading the graph, some observations are as follows: * Dispute 1 could be ha or Ma * Dispute 3,4,5,6,10,11 are very likely to be Ma * Dispute 7,8,9 are more likely to be Ha * Dispute 2 could be by HM co-auther
library(caret)
library(rpart)
dt_model <- train(author~.,data=TrDF,metric='Accuracy',method='rpart')
print(dt_model)
## CART
##
## 52 samples
## 70 predictors
## 2 classes: 'Hamilton', 'Madison'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 52, 52, 52, 52, 52, 52, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0000000 0.9120824 0.7542551
## 0.4583333 0.8987491 0.7188121
## 0.9166667 0.8186825 0.3628433
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
print(dt_model$finalModel)
## n= 52
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 52 12 Hamilton (0.76923077 0.23076923)
## 2) upon>=0.019 39 0 Hamilton (1.00000000 0.00000000) *
## 3) upon< 0.019 13 1 Madison (0.07692308 0.92307692) *
dt_pred <- predict(dt_model,newdata=TeDF,type = 'prob')
dt_pred
## Hamilton Madison
## 1 1.00000000 0.0000000
## 2 1.00000000 0.0000000
## 3 1.00000000 0.0000000
## 4 1.00000000 0.0000000
## 5 1.00000000 0.0000000
## 6 1.00000000 0.0000000
## 7 1.00000000 0.0000000
## 8 1.00000000 0.0000000
## 9 1.00000000 0.0000000
## 10 1.00000000 0.0000000
## 11 1.00000000 0.0000000
## 12 0.07692308 0.9230769
## 13 0.07692308 0.9230769
## 14 0.07692308 0.9230769
dt_pred2 <- predict(dt_model,newdata=TeDF,type = 'raw')
dt_pred2
## [1] Hamilton Hamilton Hamilton Hamilton Hamilton Hamilton Hamilton
## [8] Hamilton Hamilton Hamilton Hamilton Madison Madison Madison
## Levels: Hamilton Madison
table(dt_pred2,TeDF$author)
##
## dt_pred2 Hamilton Madison
## Hamilton 11 0
## Madison 0 3
# Accuracy of this model is 100%.
precision <- posPredValue(as.factor(dt_pred2), as.factor(TeDF$author), positive = "Hamilton")
recall <- sensitivity(as.factor(dt_pred2), as.factor(TeDF$author), positive = "Hamilton")
f <- 2 * precision * recall / (precision + recall)
sprintf("Precision is %.2f; recall is %.2f; F measure if %.2f",precision, recall, f)
## [1] "Precision is 1.00; recall is 1.00; F measure if 1.00"
dt_model_tune <- train(author~.,data=TrDF,method='rpart',
metric='Accuracy',
tuneLength=8)
print(dt_model_tune$finalModel)
## n= 52
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 52 12 Hamilton (0.76923077 0.23076923)
## 2) upon>=0.019 39 0 Hamilton (1.00000000 0.00000000) *
## 3) upon< 0.019 13 1 Madison (0.07692308 0.92307692) *
dt_predictTune <- predict(dt_model_tune,newdata = TeDF,type = 'prob')
head(dt_predictTune)
## Hamilton Madison
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
dt_model_postprune <- prune(dt_model$finalModel,cp=0.1)
print(dt_model_postprune)
## n= 52
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 52 12 Hamilton (0.76923077 0.23076923)
## 2) upon>=0.019 39 0 Hamilton (1.00000000 0.00000000) *
## 3) upon< 0.019 13 1 Madison (0.07692308 0.92307692) *
dt_predictTune <- predict(dt_model_postprune,newdata = TeDF,type = 'prob')
dt_predictTune
## Hamilton Madison
## 1 1.00000000 0.0000000
## 2 1.00000000 0.0000000
## 3 1.00000000 0.0000000
## 4 1.00000000 0.0000000
## 5 1.00000000 0.0000000
## 6 1.00000000 0.0000000
## 7 1.00000000 0.0000000
## 8 1.00000000 0.0000000
## 9 1.00000000 0.0000000
## 10 1.00000000 0.0000000
## 11 1.00000000 0.0000000
## 12 0.07692308 0.9230769
## 13 0.07692308 0.9230769
## 14 0.07692308 0.9230769
## the decrease of cp does not do harm to the model fitting and prediction precision. The accuracy is still 100%.
plot(dt_model$finalModel)
text(dt_model$finalModel)
library(rattle)
fancyRpartPlot(dt_model$finalModel)
library(rpart.plot)
prp(dt_model$finalModel)
Based on the model, there are 75% of articles written by Hamilton, 25% written by Madison in the training dataset. The probability of cases in Halmiton that are classified rightly is 100%, while that in Madison is 92%.
dt_judge <- predict(dt_model,newdata = PreDF,type = 'prob')
dt_judge
## Hamilton Madison
## 1 0.07692308 0.9230769
## 2 0.07692308 0.9230769
## 3 0.07692308 0.9230769
## 4 0.07692308 0.9230769
## 5 0.07692308 0.9230769
## 6 0.07692308 0.9230769
## 7 0.07692308 0.9230769
## 8 0.07692308 0.9230769
## 9 0.07692308 0.9230769
## 10 0.07692308 0.9230769
## 11 0.07692308 0.9230769
dt_judge2 <- predict(dt_model,newdata = PreDF,type = 'raw')
dt_judge2
## [1] Madison Madison Madison Madison Madison Madison Madison Madison
## [9] Madison Madison Madison
## Levels: Hamilton Madison
Comparison between cluster analysis:
Comparison between cluster analysis and decision tree: