Now that we are doing text mining, we will create our own term-document matrix (termDocMatrix).

We also did this in class, when we analyzed the structure of the "I Have a Dream" speech in terms of its use of positive and negative words. In that analysis, however, we treated all positive words the same (e.g., "good" counted the same as "great"). This might not be appropriate: perhaps strongly positive (and negative) words should count for more than mildly positive (and negative) ones. For example, "I loved the movie" is probably a stronger statement than "I liked the movie".

There is a different word file, known as the AFINN word list, that rates each word on a scale from -5 (most negative) to +5 (most positive).

Your task for this homework is to adapt the lab we did in class to compute a score for the MLK speech using the AFINN word list (instead of the separate positive and negative word lists).
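At a high level, the weighted score we are after is the sum of each matched word's count times its AFINN value, divided by the total number of words. A minimal sketch of that idea, using made-up counts and assumed AFINN-like values (not the real data used below):

# illustrative sketch only: toy counts and assumed AFINN-style scores
toyCounts <- c(loved = 1, liked = 1, terrible = 1)
toyScores <- c(loved = 3, liked = 2, terrible = -3)
sum(toyCounts * toyScores) / sum(toyCounts)  # positive overall, since the positives outweigh the negative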

# read in the data using read.delim() 

# change column names to "Word" and "Score"
library(tm)
## Loading required package: NLP
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(devtools)
## Loading required package: usethis
library(tidyr)
library(readr)
  # read the AFINN file; with sep = " " the tab-separated word/score pairs land in one column,
  # so separate() is used next to split them on the tab character
AFINNwordlist <- read.delim("https://cjacks04.github.io/687/Datasets/AFINN111.txt",
                            sep = " ")
Splinter <- separate(AFINNwordlist, col = abandon..2, into = c("Word", "Score"), sep = "\t")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 16 rows [339,
## 354, 502, 764, 765, 768, 984, 1115, 1117, 1525, 1615, 1622, 1624, 1934, 1993,
## 2088].
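As an aside, if the file is a plain tab-delimited list with no header row (an assumption about its layout), the word/score pairs could be read in directly, avoiding the separate() step and the rows it could not split. A sketch of that alternative, stored under a different name so it does not interfere with the code below:

# alternative read (sketch): treat the file as tab-delimited with no header
afinnAlt <- read.delim("https://cjacks04.github.io/687/Datasets/AFINN111.txt",
                       header = FALSE, sep = "\t", col.names = c("Word", "Score"))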
# read in text file MLK.txt  

Speech <-"http://www.coreybjackson.com/687/Datasets/MLKspeech.txt"
mlk <-scan(Speech, character(0),sep = "\n")
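Since the quarter splits later in this homework assume the speech comes in as 29 paragraphs, a quick sanity check of what scan() returned may be worth running (illustrative only):

length(mlk)   # number of paragraphs read; the splits further down assume 29
head(mlk, 1)  # peek at the first paragraph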

  # interpret each element of "mlk" as a document and create a vector source
words.vec <-VectorSource(mlk)

  # create a Corpus, a "Bag of Words"
words.corpus <-Corpus(words.vec)

 # first step transformation: make all of the letters in "words.corpus" lowercase
words.corpus <- tm_map(words.corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(words.corpus, content_transformer(tolower)):
## transformation drops documents
  # second step transformation: remove the punctuation in "words.corpus"
words.corpus <- tm_map(words.corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(words.corpus, removePunctuation): transformation
## drops documents
  # third step transformation: remove numbers in "words.corpus"
words.corpus <- tm_map(words.corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(words.corpus, removeNumbers): transformation
## drops documents
  # final step transformation: take out the "stop" words, such as "the", "a" and "at"
words.corpus <- tm_map(words.corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(words.corpus, removeWords, stopwords("english")):
## transformation drops documents
  # create a term-document matrix "tdm"
tdm <- TermDocumentMatrix(words.corpus)

  # convert tdm into a matrix called "m"
m <- as.matrix(tdm)
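To confirm the shape of the term-document matrix (one row per distinct term, one column per paragraph), a quick inspection such as the following could be added:

dim(m)        # number of terms by number of documents (paragraphs)
m[1:5, 1:3]   # a few raw counts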

  # create a list of counts for each word named "wordCounts"
wordCounts <-rowSums(m)

  # sort words in "wordCounts" by frequency
wordCounts <- sort(wordCounts, decreasing = TRUE)

  # check the first ten items in "wordCounts" to see if it is built correctly
head(wordCounts, 10)
##    will freedom   negro     one     let    ring     day   dream  nation    come 
##      26      20      13      13      13      12      11      11      10      10
  # calculate the total number of words
Totalwords <- sum(wordCounts)
words <- names(wordCounts)
  # keep only the speech words that also appear in the AFINN list
matched <- match(words, Splinter$Word, nomatch = 0)
mCounts <- wordCounts[which(matched != 0)]
match <- data.frame(names(mCounts), mCounts, row.names = c(1:length(mCounts)))
colnames(match) <- c("word", "counts")
  # merge the matched words with their AFINN scores
mergedTable <- merge(match, Splinter, by.x = "word", by.y = "Word")
mergedTable$Score <- as.numeric(mergedTable$Score)
mergedTable$Total <- mergedTable$counts * mergedTable$Score
  # calculate the overall score: AFINN-weighted counts divided by the total word count
Score <- sum(mergedTable$Total, na.rm = TRUE) / Totalwords
  # print the overall score
Score
## [1] 0.1117717
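As an optional check on which words drive this score, the merged table can be sorted by the size of each word's contribution (a diagnostic sketch using the objects built above):

# words with the largest positive or negative contributions to the overall score
head(mergedTable[order(-abs(mergedTable$Total)), ], 10)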
  # re-read the speech so the untransformed paragraphs can be split into quarters and scored separately
mlk <- readLines("http://www.coreybjackson.com/687/Datasets/MLKspeech.txt")
words.vec <- VectorSource(mlk)
  # create a Corpus, which is a "Bag of Words"
words.corpus <- Corpus(words.vec)
    # define "cutpoint_l" as the first cut points; round the number to get an interger
    cutpoint_l <- round(length(words.corpus)/4)
    View(cutpoint_l)
    words.corpus1 <- words.corpus[1:7]
    words.corpus2 <- words.corpus[8:15]
    words.corpus3 <- words.corpus[16:23]
    words.corpus4 <- words.corpus[24:29]
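The same split could also be derived from cutpoint_l instead of hard-coding the indices; a sketch (the quarter boundaries come out slightly different from the hand-picked ones above):

n <- length(words.corpus)
breaks <- c(1, cutpoint_l + 1, 2 * cutpoint_l + 1, 3 * cutpoint_l + 1, n + 1)
quarters <- lapply(1:4, function(i) words.corpus[breaks[i]:(breaks[i + 1] - 1)])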
  # create a function to calculate the AFINN score for each quarter
  TextMine <- function(q){
    # corpus transformations, applied to the quarter passed in as "q"
    words.corpus <- tm_map(q, content_transformer(tolower))
    words.corpus <- tm_map(words.corpus, removePunctuation)
    words.corpus <- tm_map(words.corpus, removeNumbers)
    words.corpus <- tm_map(words.corpus, removeWords, stopwords("english"))
    # create term document matrix
    tdm <- TermDocumentMatrix(words.corpus)
    m <- as.matrix(tdm)
    # calculate a list of counts for each word
    wordCounts <- rowSums(m)
    wordCounts <- sort(wordCounts, decreasing=TRUE)
    # calculate total words
    totalWords <- sum(wordCounts)
    # locate the speech words that appear in the AFINN list
    words <- names(wordCounts)
    matched <- match(words, Splinter$Word, nomatch = 0)
    mCounts <- wordCounts[which(matched != 0)]
    match <- data.frame(names(mCounts),mCounts,row.names = c(1:length(mCounts)))
    colnames(match)<-c("word","counts")
    # merge matched words with Afinn scores
    mergedTable <- merge(match, Splinter, by.x = "word" ,by.y = "Word")
    mergedTable$Score <- as.numeric(mergedTable$Score)
    mergedTable$Total <- mergedTable$counts * mergedTable$Score
    # calculate the overall score for this quarter
    Score <- sum(mergedTable$Total, na.rm = TRUE) / totalWords
    # return the result
    return(Score)
  }
  # apply function to first quarter
PlotOne <-TextMine(words.corpus1)
## Warning in tm_map.SimpleCorpus(words.corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeWords, stopwords("english")):
## transformation drops documents
  # apply function to second quarter
PlotTwo<-TextMine(words.corpus2)
## Warning in tm_map.SimpleCorpus(words.corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeWords, stopwords("english")):
## transformation drops documents
  # apply function to third quarter
PlotThree<-TextMine(words.corpus3)
## Warning in tm_map.SimpleCorpus(words.corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeWords, stopwords("english")):
## transformation drops documents
  # apply function to fourth quarter
PlotFour <-TextMine(words.corpus4)
## Warning in tm_map.SimpleCorpus(words.corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeNumbers): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(words.corpus, removeWords, stopwords("english")):
## transformation drops documents
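The four calls above could also be written as a single sapply over the quarters (the same tm_map warnings would still be printed); a sketch using the objects already defined:

quarterScores <- sapply(list(words.corpus1, words.corpus2, words.corpus3, words.corpus4), TextMine)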
  # combine the scores of the four quarters into one vector
QuarterScores <- c(PlotOne, PlotTwo, PlotThree, PlotFour)
  # create a bar plot of the four quarterly scores
barplot(QuarterScores, names.arg = c("Q1", "Q2", "Q3", "Q4"),
        ylab = "AFINN score")