TEXT MINING OF UNION SPEECHES

BASIC TEXT MINING TECHNIQUES, SIMILARITY ANALYSIS, AND CLUSTERING BASED ON THE KULLBACK-LEIBLER MEASUREMENT

Prerequisite: basic knowledge of information theory, in particular the Kullback-Leibler and Jensen-Shannon divergences used below.
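For reference, these are the standard definitions (stated here for convenience; the R functions below implement exactly these quantities, dropping components where either distribution is zero):

\[
D_{\mathrm{KL}}(p \,\|\, q) = \sum_i p_i \log \frac{p_i}{q_i},
\qquad
D_{\mathrm{JS}}(p, q) = \tfrac{1}{2} D_{\mathrm{KL}}(p \,\|\, m) + \tfrac{1}{2} D_{\mathrm{KL}}(q \,\|\, m),
\quad m = \tfrac{1}{2}(p + q).
\]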

library(knitr)  # opts_chunk is knitr's chunk-option setter
opts_chunk$set(comment = "test", results = "hide", warning = FALSE, error = FALSE, 
    message = FALSE)

Functions

computeSJDistance = function(tf, df, terms, logdf = TRUE, verbose = TRUE) {
    # tf    - numeric matrix, numTerms rows x numDocs cols: term counts per document
    # df    - numeric vector, length numTerms: number of documents containing each term
    # terms - character vector, length numTerms: the unique words (not used in the computation)

    numTerms = nrow(tf)
    numDocs = ncol(tf)

    # normalize each document's counts to relative frequencies, then weight
    # each term by its (log-)inverse document frequency
    tfn = t(tf)/colSums(tf)
    if (logdf) {
        tfidfs = t(tfn) * (log(numDocs) - log(df))
    } else {
        tfidfs = numDocs * (t(tfn)/df)
    }

    # pairwise Jensen-Shannon divergences between the document columns
    D.SJ = matrix(0, numDocs, numDocs)
    for (i in 1:(numDocs - 1)) {
        for (j in (i + 1):numDocs) {
            D.SJ[i, j] = D.SJ[j, i] = D.JensenShannon(tfidfs[, i], tfidfs[, j])
        }
        if (verbose) 
            print(i)
    }
    return(D.SJ)
}

D.BasicKullbackLeibler = function(p, q) {
    # KL divergence sum(p * log(p/q)), restricted to components where both
    # p and q are nonzero (the 0 * log 0 terms are dropped)
    tmp = !(p == 0 | q == 0)
    p = p[tmp]
    q = q[tmp]

    sum(-p * log(q) + p * log(p))
}

D.JensenShannon = function(p, q) {
    # symmetrized KL divergence against the midpoint distribution
    # (named m rather than T, so TRUE's shorthand is not masked)
    m = 0.5 * (p + q)
    0.5 * D.BasicKullbackLeibler(p, m) + 0.5 * D.BasicKullbackLeibler(q, m)
}
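A quick sanity check on toy vectors (made up for illustration, not drawn from the speech data): the divergence of a distribution with itself is zero, and the measure is symmetric in its arguments.

p <- c(0.5, 0.3, 0.2)
q <- c(0.2, 0.3, 0.5)
D.JensenShannon(p, p)                             # 0
all.equal(D.JensenShannon(p, q), D.JensenShannon(q, p))  # TRUE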

library("SnowballC")
setwd("~/Desktop/Homework_backup/datamining/R_practice/HW7")
# skip first 273 lines which are not speeches part.
union <- scan("unif.txt", what = character(), sep = "\n", strip.white = c(T), 
    skip = 273, blank.lines.skip = TRUE)
singleString <- paste(union, collapse = " ")
unionspeeches <- unlist(strsplit(singleString, "\\*{3}"))  #speeches are seperated by ***
unionspeeches <- unionspeeches[1:12]  #this example will only process first 12 speeches
# Clean the text: the result is a list of 12 speeches, each represented as a
# character vector of words.
speechsentence <- lapply(unionspeeches, function(x) unlist(strsplit(x, "\\.")))
speechclean <- lapply(speechsentence, function(x) gsub("[']|[[:digit:]]|Applause", 
    "", x))
speechcleanlow <- lapply(speechclean, function(x) tolower(x))
# split on any punctuation character or space; the original pattern also listed
# two-character "punct + space" alternatives, but splitting on single characters
# yields the same words once the empty strings are dropped in the next step
sentencesplit <- lapply(speechcleanlow, function(x) unlist(strsplit(x, "[[:punct:]]| ")))
wordsplit <- lapply(sentencesplit, function(x) x[x != ""])
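To see what the pipeline produces, here are the same cleaning steps applied to a made-up snippet (hypothetical input, for illustration only):

toy <- "We built 4 bridges. (Applause.) That's progress!"
toy <- tolower(gsub("[']|[[:digit:]]|Applause", "", toy))
unlist(strsplit(toy, "[[:punct:]]| "))  # yields the words plus "" entries, which are then dropped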

# Stemming
stemword <- lapply(wordsplit, function(x) wordStem(x, language = "english"))
bagword <- unique(unlist(stemword))
bag <- length(bagword)
tf <- matrix(0, ncol = 12, nrow = bag)
# count exact occurrences of each stem in each speech; the original used
# grep(), which also matches substrings (e.g. "art" inside "part") and
# therefore overcounts
for (n in seq_len(bag)) {
    tf[n, ] <- sapply(stemword, function(x) sum(x == bagword[n]))
}
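An equivalent vectorized way to build the same matrix is a one-liner with base R's table(); the factor levels pin the row order to bagword (a cross-check sketch, not part of the original analysis):

tf2 <- sapply(stemword, function(x) table(factor(x, levels = bagword)))
all.equal(tf2, tf, check.attributes = FALSE)  # should be TRUE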

# per-document relative term frequencies (computeSJDistance recomputes these
# internally; kept here for reference)
wordsInDoc = colSums(tf)
normTermFreq = sweep(tf, 2, wordsInDoc, "/")

# document frequency: the number of documents containing each term (passed to
# computeSJDistance as its df argument; despite the name, this is not yet an
# inverse document frequency)
idf = apply(tf, 1, function(x) sum(x > 0))

# pairwise Jensen-Shannon dissimilarities (a distance-like matrix, despite the name)
simMatrix = computeSJDistance(tf = tf, df = idf, terms = bagword, logdf = FALSE)
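Two properties worth verifying before clustering (a sanity check, not part of the original script): the matrix should be symmetric with a zero diagonal, as as.dist() expects.

isSymmetric(simMatrix)     # TRUE: D.SJ[i, j] is assigned to D.SJ[j, i] as well
all(diag(simMatrix) == 0)  # TRUE: the loop never touches the diagonal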

documents = as.dist(simMatrix)
hc = hclust(documents)             # complete linkage (the hclust default)
hc2 = hclust(documents, "single")  # single linkage, for comparison
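Once a tree is built, base R's cutree() turns it into a flat grouping; a sketch with an arbitrary choice of k = 3:

groups <- cutree(hc, k = 3)
split(1:12, groups)  # speech indices in each of the three clusters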

Typically the tree is drawn so that the height of each branch is proportional to the dissimilarity between the two groups it merges.

plot(hc, main = "Cluster Dendrogram (complete linkage)", sub = "first 12 speeches from unionspeech")

[Figure: dendrogram of the 12 speeches, complete linkage]

plot(hc2, main = "Cluster Dendrogram (single linkage)", sub = "first 12 speeches from unionspeech")

[Figure: dendrogram of the 12 speeches, single linkage]