Distance between documents: Dist(V, W) = 1/2 * (KL(V, AVG) + KL(W, AVG)), where AVG = (V + W)/2 and KL(P, Q) = sum_i p_i * log(p_i / q_i) is the Kullback-Leibler divergence.
This Jensen-Shannon distance is symmetric in V and W, and we use it below as a measure of dissimilarity between documents and between groups of documents.
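As a quick worked example with made-up two-term distributions: take V = (1, 0) and W = (0, 1). Then AVG = (1/2, 1/2), KL(V, AVG) = 1 * log(1/(1/2)) = log 2, and by symmetry KL(W, AVG) = log 2, so Dist(V, W) = 1/2 * (log 2 + log 2) = log 2, the largest possible value, attained when the two documents share no terms.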
library("knitr")
opts_chunk$set(comment = "test", results = "hide", warning = FALSE, error = FALSE,
    message = FALSE)
computeSJDistance = function(tf, df, terms, logdf = TRUE, verbose = TRUE) {
    # tf    - numeric matrix of term frequencies, numTerms rows and numDocs columns
    # df    - numeric vector of length numTerms: number of docs containing each term
    # terms - character vector of the unique words, length numTerms (kept for
    #         reference; not used in the computation itself)
    numTerms = nrow(tf)
    numDocs = ncol(tf)
    # normalize each document to relative frequencies, then apply idf weighting
    tfn = t(tf)/colSums(tf)
    if (logdf) {
        tfidfs = t(tfn) * (log(numDocs) - log(df))
    } else {
        tfidfs = numDocs * (t(tfn)/df)
    }
    # fill the symmetric matrix of pairwise Jensen-Shannon distances
    D.SJ = matrix(0, numDocs, numDocs)
    for (i in 1:(numDocs - 1)) {
        for (j in (i + 1):numDocs) {
            D.SJ[i, j] = D.SJ[j, i] = D.JensenShannon(tfidfs[, i], tfidfs[, j])
        }
        if (verbose)
            print(i)
    }
    return(D.SJ)
}
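To make the expected input shape concrete, here is a minimal smoke test on made-up counts (three stems, two documents); it relies on the D.* helpers defined next, so run it after them.
# Minimal smoke test on made-up counts: 3 stems, 2 documents
toyTf <- matrix(c(2, 1, 0,
                  0, 1, 3), nrow = 3)
toyDf <- rowSums(toyTf > 0)  # document frequency of each stem
computeSJDistance(tf = toyTf, df = toyDf, terms = c("tax", "war", "peac"),
    logdf = FALSE, verbose = FALSE)
# returns a symmetric 2 x 2 matrix with zeros on the diagonal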
D.BasicKullbackLeibler = function(p, q) {
    # KL(p, q) = sum p * log(p/q), dropping entries where either vector is zero
    tmp = !(p == 0 | q == 0)
    p = p[tmp]
    q = q[tmp]
    sum(-p * log(q) + p * log(p))
}
D.JensenShannon = function(p, q) {
    # average distribution; named 'm' rather than 'T' to avoid shadowing TRUE
    m = 0.5 * (p + q)
    0.5 * D.BasicKullbackLeibler(p, m) + 0.5 * D.BasicKullbackLeibler(q, m)
}
library("SnowballC")
setwd("~/Desktop/Homework_backup/datamining/R_practice/HW7")
# Skip the first 273 lines, which are front matter rather than speeches.
union <- scan("unif.txt", what = character(), sep = "\n", strip.white = TRUE,
    skip = 273, blank.lines.skip = TRUE)
singleString <- paste(union, collapse = " ")
unionspeeches <- unlist(strsplit(singleString, "\\*{3}"))  # speeches are separated by ***
unionspeeches <- unionspeeches[1:12]  # this example processes only the first 12 speeches
# Clean the text to obtain a list of 12 speeches, each represented as a
# character vector of words.
speechsentence <- lapply(unionspeeches, function(x) unlist(strsplit(x, "\\.")))
speechclean <- lapply(speechsentence, function(x) gsub("[']|[[:digit:]]|Applause", "", x))
speechcleanlow <- lapply(speechclean, function(x) tolower(x))
# split on punctuation or spaces (the extra alternatives in the pattern were redundant)
sentencesplit <- lapply(speechcleanlow, function(x) unlist(strsplit(x, "[[:punct:]]| ")))
wordsplit <- lapply(sentencesplit, function(x) x[x != ""])  # drop empty tokens
# Stemming
stemword <- lapply(wordsplit, function(x) wordStem(x, language = "english"))
bagword <- unique(unlist(stemword))
bag <- length(bagword)
n <- 0
tf <- matrix(0, ncol = 12, nrow = bag)
for (w in bagword) {
    n <- n + 1
    # exact match on the stem; grep() would count partial regex matches instead
    tf[n, ] <- sapply(stemword, function(x) sum(x == w))
}
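Two quick sanity checks on the resulting matrix: every row should be a stem, every column a speech, and with exact counting each column sum should equal that speech's token count.
# One row per unique stem, one column per speech
stopifnot(nrow(tf) == bag, ncol(tf) == 12)
# Each column sum equals the number of tokens in that speech
stopifnot(all(colSums(tf) == sapply(stemword, length)))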
# Per-document relative term frequencies. computeSJDistance repeats this
# normalization internally, so normTermFreq is computed here for reference.
normTermFreq = matrix(0, nrow = nrow(tf), ncol = ncol(tf))
wordsInDoc = colSums(tf)
for (i in 1:ncol(tf)) {
    normTermFreq[, i] = tf[, i]/wordsInDoc[i]
}
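The same normalization can be written without the loop using sweep(); this is an equivalent alternative, not a change to the method:
# Equivalent vectorized form: divide each column by its sum
normTermFreqAlt <- sweep(tf, 2, colSums(tf), "/")
# all.equal(normTermFreq, normTermFreqAlt) should be TRUE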
# Number of documents containing each term: this is the document frequency
# expected by computeSJDistance's df argument, not an idf weight.
docFreq = apply(tf, 1, function(x) sum(x > 0))
distMatrix = computeSJDistance(tf = tf, df = docFreq, terms = bagword, logdf = FALSE)
documents = as.dist(distMatrix)  # Jensen-Shannon distances, not similarities
hc = hclust(documents)  # complete linkage (hclust's default)
hc2 = hclust(documents, "single")  # single linkage, for comparison
Typically the tree is drawn so that the height of each branch is proportional to the dissimilarity between the two groups it merges.
plot(hc, main = "Cluster Dendrogram (complete linkage)", sub = "first 12 speeches from unionspeeches")
plot(hc2, main = "Cluster Dendrogram (single linkage)", sub = "first 12 speeches from unionspeeches")
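If concrete groups are wanted from the dendrogram, the tree can be cut at a chosen number of clusters; k = 3 below is an arbitrary illustration, not a value from the assignment:
# Cut the complete-linkage tree into k groups (k = 3 chosen arbitrarily)
groups <- cutree(hc, k = 3)
table(groups)  # cluster sizes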