Step 1: Read in the positive and negative word files

Pos <- "https://cjacks04.github.io/687/Datasets/positive-words.txt"
Pos <-scan(Pos, character(0),sep = "\n")
Read 2006 items
Neg <-"https://cjacks04.github.io/687/Datasets/negative-words.txt"
Neg <-scan(Neg, character(0), sep = "\n")
Read 4783 items

Step 2: Process in the MLK speech

Speech <-"http://www.coreybjackson.com/687/Datasets/MLKspeech.txt"
MLK <-scan(Speech, character(0),sep = "\n")
Read 29 items
library(tm)
Loading required package: NLP
words.vec <-VectorSource(MLK)
words.corpus <-Corpus(words.vec)
words.corpus <-tm_map(words.corpus, content_transformer(tolower))
transformation drops documents
words.corpus <- tm_map(words.corpus, removePunctuation)
transformation drops documents
words.corpus <- tm_map(words.corpus, removeNumbers)
transformation drops documents
words.corpus <- tm_map(words.corpus, removeWords, stopwords("english"))
transformation drops documents
tdm <- TermDocumentMatrix(words.corpus)
m <- as.matrix(tdm)
wordcounts <-rowSums(m)
words <- sort(wordcounts, decreasing = TRUE)
words <- names(wordcounts)

Step 3: Determine how many positive words were in the speech

totalWords <- sum(wordcounts)
pTotal/totalWords
[1] 0.1129608

Step 4: Determine how many negative words were in the speech

# Hint: one way to do this is to use the ‘match’ function on the list of words from Step 2 and the negative words in the list from the import. 
matchedN <- match(words, Neg, nomatch = 0)
ntotal <-sum(wordcounts[which(matchedN != 0)])
print(ntotal)
ntotal/totalWords 

Step 5: Redo the ‘positive’ and ‘negative’ calculations for each 25% of the speech

---
title: "Lab 10: Text Mining"
author: 
- Tesslyn Knapp
- Derrick Espadas
date: "`r Sys.time()`"
output: html_notebook
---

### Step 1: Read in the positive and negative word files
```{r}

# Create two vectors of words, one for the positive words and one for the negative words. The positive words can be found here: "https://cjacks04.github.io/687/Datasets/positive-words.txt" and negative words here: "https://cjacks04.github.io/687/Datasets/negative-words.txt". You should use the scan() function, which reads data into a vector or list from the console or a file. You'll need three arguments: (1) the file name/path, (2) character(0), which tells scan() to read each item as a character (as opposed to integer or some other data type), and (3) the sep argument to tell R how the data are separated, e.g., \n 

# Note that when reading in the files, there might be lines at the start and/or the end that will need to be removed (i.e., you should clean your data if needed). 
# Positive opinion words: one character element per line of the remote file.
Pos <- scan(
  "https://cjacks04.github.io/687/Datasets/positive-words.txt",
  what = character(0),
  sep = "\n"
)
# Negative opinion words, read the same way.
Neg <- scan(
  "https://cjacks04.github.io/687/Datasets/negative-words.txt",
  what = character(0),
  sep = "\n"
)

```

### Step 2: Process in the MLK speech 
```{r}
# Read the MLK text file using the readLines() function. Only the URL is required.
# Inspect the vector above. Some lines are blank "". Remove these.
# Create a term matrix. There are several steps here beginning with creating a vector source and making text transformations. (Check chapter 14 where sba is transformed)
# Create a list of counts for each word 

# Attach tm before any of its functions are used.
library(tm)

# Download the speech; each element of MLK is one line of the text file.
Speech <- "http://www.coreybjackson.com/687/Datasets/MLKspeech.txt"
MLK <- scan(Speech, character(0), sep = "\n")

# Build a corpus with one document per line of the speech.
words.vec <- VectorSource(MLK)
words.corpus <- Corpus(words.vec)

# Normalise the text: lower-case, then strip punctuation, digits and
# English stop words so that only content words remain.
words.corpus <- tm_map(words.corpus, content_transformer(tolower))
words.corpus <- tm_map(words.corpus, removePunctuation)
words.corpus <- tm_map(words.corpus, removeNumbers)
words.corpus <- tm_map(words.corpus, removeWords, stopwords("english"))

# Term-document matrix: rows are terms, columns are documents (lines).
tdm <- TermDocumentMatrix(words.corpus)
m <- as.matrix(tdm)

# Frequency of every term across the whole speech, most frequent first.
# The sorted result is stored back into wordcounts (previously it was
# assigned to `words` and immediately overwritten, so the sort was lost).
wordcounts <- sort(rowSums(m), decreasing = TRUE)

# Terms in the same order as wordcounts, so positions line up for match().
words <- names(wordcounts)
```

### Step 3: Determine how many positive words were in the speech  
```{r}
# Hint: one way to do this is to use the ‘match’ function on the list of words from Step 2 and the positive words in the list from the import. 

  # sum the total number of words and store the value to "totalWords"
  
  # create a vector "words" that contains all the words in "wordCounts"
  
  # locate which words in "mlk" were positive (appeared in positive-word list)
  
  # calculate the total number of positive words in "mlk" speech (in wordCounts) and assign the number to the variable "pTotal". The which() function on words the vector above will give you the index number. 
 
  # view the total number of positive words (95 positive words in the speech)
   
  # view the percentage of positive words (11.29608% of the speech words are positive)
# Total number of (processed) words in the speech.
totalWords <- sum(wordcounts)

# Position of each speech term in the positive list; 0 marks "not found".
matchedP <- match(words, Pos, nomatch = 0)

# Add up the frequencies of the terms that were found in the positive list.
pTotal <- sum(wordcounts[matchedP > 0])
print(pTotal)

# Share of the speech made up of positive words.
pTotal / totalWords
```

### Step 4: Determine how many negative words were in the speech 
```{r}
# Hint: one way to do this is to use the ‘match’ function on the list of words from Step 2 and the negative words in the list from the import. 
# Position of each speech term in the negative list; 0 marks "not found".
matchedN <- match(words, Neg, nomatch = 0)

# Add up the frequencies of the terms that were found in the negative list.
ntotal <- sum(wordcounts[matchedN > 0])
print(ntotal)

# Share of the speech made up of negative words.
ntotal / totalWords

```

### Step 5: Redo the ‘positive’ and ‘negative’ calculations for each 25% of the speech 
```{r}
# Compare the results (e.g., a simple bar chart of the 4 numbers). I recommend extracting quarters of the speech, storing each quarter in a vector and then conducting the calculations over each quarter. 
# Positive and negative word shares for one slice of the corpus.
#
# corpus   : a tm corpus (here, a subset of words.corpus)
# posWords : character vector of positive words
# negWords : character vector of negative words
#
# Returns a named numeric vector c(positive = , negative = ) where each
# value is the summed frequency of the matching terms divided by the total
# number of terms in the slice.
sentimentRatios <- function(corpus, posWords, negWords) {
  counts <- rowSums(as.matrix(TermDocumentMatrix(corpus)))
  terms <- names(counts)
  total <- sum(counts)
  c(
    positive = sum(counts[terms %in% posWords]) / total,
    negative = sum(counts[terms %in% negWords]) / total
  )
}

# Split the documents (lines of the speech) into four consecutive,
# non-overlapping quarters. The bounds are derived from the corpus length
# instead of being hard-coded; for the 29-line speech this yields
# 1:7, 8:14, 15:21 and 22:29. (The earlier version used 9:21 for the third
# quarter, which counted lines 9-14 twice.)
nDocs <- length(words.corpus)
stopifnot(nDocs >= 4)
quarterId <- ceiling(seq_len(nDocs) * 4 / nDocs)
quarterDocs <- split(seq_len(nDocs), quarterId)

# One column per quarter, rows "positive" and "negative". Every quarter is
# scored against its own term list, which fixes the earlier reuse of the
# second quarter's match vector for quarters three and four.
quarterRatios <- vapply(
  quarterDocs,
  function(idx) sentimentRatios(words.corpus[idx], Pos, Neg),
  numeric(2)
)
colnames(quarterRatios) <- paste0("Q", seq_len(ncol(quarterRatios)))
quarterRatios

# Positive share per quarter.
Posi <- unname(quarterRatios["positive", ])
barplot(
  Posi,
  names.arg = colnames(quarterRatios),
  main = "Positive word ratio by quarter of the speech",
  ylab = "Positive words / total words"
)

# Negative share per quarter.
Negi <- unname(quarterRatios["negative", ])
barplot(
  Negi,
  names.arg = colnames(quarterRatios),
  main = "Negative word ratio by quarter of the speech",
  ylab = "Negative words / total words"
)
```

