library(tm)
library(dplyr)
library(stringr)
library(qdap)
library(ggplot2)
library(wordcloud)
library(RWeka)
library(caTools)

Introduction

This project performs a preliminary analysis of text data taken from the HC Corpora corpus. The data covers four locales (en_US, de_DE, ru_RU and fi_FI); only en_US is used here. It consists of blogs, tweets and news items. For this analysis a 10% sample is taken as the basis for a predictive model of the next word, on the grounds that a sample drawn from a large population is representative of that population and supports appropriate inferences. This 10% sample is divided into a training set (75%) and a test set (25%).

The training samples of tweets, blogs and news items are ingested into a corpus, which is then cleaned of extra whitespace, punctuation, stopwords and so on. Profanities are also removed. Detailed analysis is then performed on word frequencies and letter frequencies.

Input the data from the tweets, blogs and news

Create samp_tw.txt with sample tweets, samp_blogs.txt with sample blogs and samp_news.txt with a sample of news items. For each source:

  1. Read the file
  2. Sample 10% of the lines (tweets, blogs and news items)
  3. Divide this sample into a training set (75%) and a test set (25%)
  4. Clean special characters
  5. Write to the appropriate text file
# Create samp_tw_train.txt and samp_tw_test.txt with the sampled tweets
con_trn <- file("./train/samp_tw_train.txt", "wt") 
con_test <- file("./test/samp_tw_test.txt", "wt") 
lines <- readLines("../en_US/en_US.twitter.txt")
samp_tw <- sample(lines, size=length(lines)*0.10, replace=FALSE)
samp_tw <- gsub("[^0-9a-zA-Z .?,!']", "", samp_tw)   # keep letters, digits, spaces and basic punctuation
split <- sample.split(samp_tw, SplitRatio=3/4)
train <- samp_tw[split]
test <- samp_tw[!split]   # sample.split returns a logical vector
writeLines(train,con=con_trn,sep="\n")
writeLines(test,con=con_test,sep="\n")
close(con_trn)
close(con_test)

# Create samp_blog_train.txt and samp_blog_test.txt with sample text from the blogs
con_trn <- file("./train/samp_blog_train.txt", "wt") 
con_test <- file("./test/samp_blog_test.txt", "wt") 
lines <- readLines("../en_US/en_US.blogs.txt")
samp_blog <- sample(lines, size=length(lines)*0.10, replace=FALSE)
samp_blog <- gsub("[^0-9a-zA-Z .?,!']", "", samp_blog)
split <- sample.split(samp_blog, SplitRatio=3/4)
train <- samp_blog[split]
test <- samp_blog[!split]
writeLines(train,con=con_trn,sep="\n")
writeLines(test,con=con_test,sep="\n")
close(con_trn)
close(con_test)


# Create samp_news_train.txt and samp_news_test.txt with sample text from the news items
con_trn <- file("./train/samp_news_train.txt", "wt") 
con_test <- file("./test/samp_news_test.txt", "wt")
lines <- readLines("../en_US/en_US.news.txt")
samp_news <- sample(lines, size=length(lines)*0.10, replace=FALSE)
samp_news <- gsub("[^0-9a-zA-Z .?,!']", "", samp_news)
split <- sample.split(samp_news, SplitRatio=3/4)
train <- samp_news[split]
test <- samp_news[!split]
writeLines(train,con=con_trn,sep="\n")
writeLines(test,con=con_test,sep="\n")
close(con_trn)
close(con_test)
# Print the length of the samples from the tweets, blogs and news data
print(length(samp_tw))
## [1] 118007
print(length(samp_blog))
## [1] 44964
print(length(samp_news))
## [1] 3862

Create a corpus

Create a corpus of the sample documents (tweets, blogs and news) from the ./data directory. The corpus will include samp_tw.txt, samp_blogs.txt and samp_news.txt.

cname <- "./data"
docs <- Corpus(DirSource(cname), readerControl=list(language="en_US"))

Explore the corpus

inspect(docs)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 10242905
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 764212
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 7997482

Clean the Corpus

Clean the Corpus with the following tasks

  1. Remove extra white space
  2. Remove punctuation symbols
  3. Remove numbers
  4. Change case to lower
  5. Remove stop words like ‘i’,‘me’,‘my’,‘ours’ etc
  6. Stem the document
  7. Convert to plain text
docs <- tm_map(docs,stripWhitespace)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)   
docs <- tm_map(docs, content_transformer(tolower))   # wrap base tolower so document metadata is preserved
docs <- tm_map(docs, removeWords, stopwords("english"))  
docs <- tm_map(docs, stemDocument) 
docs <- tm_map(docs, PlainTextDocument)   

Remove profane words

Read a list of about 362 profane words, convert it to a character vector and remove these words from the corpus.

profanity <- read.table("./profanity/profanity.txt",sep="\n")
badwords <- as.character(profanity$V1)
docs <- tm_map(docs, removeWords, badwords)

Create Document Term Matrix

For further investigation the corpus is converted into a Document Term Matrix (DTM), which has the documents as rows, the words (terms) as columns and the frequency of occurrence of each term as the cell values. After creating the DTM, explore it using inspect().

dtm <- DocumentTermMatrix(docs)  
inspect(dtm[1:3,3000:3020])
## <<DocumentTermMatrix (documents: 3, terms: 21)>>
## Non-/sparse entries: 29/34
## Sparsity           : 54%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           amarillo amaro amarosa amaryllis amas amasebal amasin amass
##   character(0)        5     0       0         2    0        0      0     1
##   character(0)        0     0       0         0    0        0      0     0
##   character(0)        0     1       1         1    2        1      2     1
##               Terms
## Docs           amassed amasummit amateur amateurish amateurly amateurs
##   character(0)       0         0      24          2         1        3
##   character(0)       0         0       3          0         0        1
##   character(0)       2         1       4          0         0        1
##               Terms
## Docs           amawake amayzayn amaz amazayn amaze amazebal amazeballs
##   character(0)       0        0   28       0     7        0          0
##   character(0)       0        0    0       0     0        0          0
##   character(0)       1        1  173       1    10        1          4
dtm
## <<DocumentTermMatrix (documents: 3, terms: 116901)>>
## Non-/sparse entries: 161678/189025
## Sparsity           : 54%
## Maximal term length: 109
## Weighting          : term frequency (tf)

Create Term Document Matrix (TDM) (transpose of DTM)

Also create a Term Document Matrix (TDM) and explore the words and the corresponding frequencies

tdm <- TermDocumentMatrix(docs)
inspect(tdm[3000:3020,1:3])
## <<TermDocumentMatrix (terms: 21, documents: 3)>>
## Non-/sparse entries: 29/34
## Sparsity           : 54%
## Maximal term length: 10
## Weighting          : term frequency (tf)
## 
##             Docs
## Terms        character(0) character(0) character(0)
##   amarillo              5            0            0
##   amaro                 0            0            1
##   amarosa               0            0            1
##   amaryllis             2            0            1
##   amas                  0            0            2
##   amasebal              0            0            1
##   amasin                0            0            2
##   amass                 1            0            1
##   amassed               0            0            2
##   amasummit             0            0            1
##   amateur              24            3            4
##   amateurish            2            0            0
##   amateurly             1            0            0
##   amateurs              3            1            1
##   amawake               0            0            1
##   amayzayn              0            0            1
##   amaz                 28            0          173
##   amazayn               0            0            1
##   amaze                 7            0           10
##   amazebal              0            0            1
##   amazeballs            0            0            4

Consolidate the frequencies of words

  1. Compute the total frequency of each word by taking column sums
  2. Order the frequencies in ascending order
  3. Show the words with the lowest and the highest frequencies
  4. Tabulate how many words occur at each of the 20 lowest and the 20 highest frequency values
freq <- colSums(as.matrix(dtm))   
length(freq)   
## [1] 116901
ord <- order(freq) 
freq[head(ord)]   
##             aaaa   aaaaaaandwhere      aaaaaalllll       aaaaahhhhh 
##                1                1                1                1 
##        aaaaandgo aaaaannnnnnddddd 
##                1                1
freq[tail(ord)]   
##   get   can   one  will  like  just 
##  9341  9445 10136 10757 11105 12673
head(table(freq), 20)  
## freq
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 67773 13543  6721  4078  2937  2133  1607  1440  1138   944   841   705 
##    13    14    15    16    17    18    19    20 
##   643   608   506   417   453   374   386   290
tail(table(freq), 20)  
## freq
##  4950  5262  5405  5480  5481  5672  6623  7153  7158  7228  7300  7563 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  7607  7609  9341  9445 10136 10757 11105 12673 
##     1     1     1     1     1     1     1     1

Plot Words and frequencies

plot(log(table(freq)), xlab="Frequency of word",ylab="log(number of words)", main="log(number words) vs word frequencies")

Save DTM as a CSV for future processing

m <- as.matrix(dtm)   
dim(m)  
## [1]      3 116901
write.csv(m, file="./misc/dtm.csv") 
dtms <- removeSparseTerms(dtm, 0.46) # drop terms that are absent from more than 46% of the documents

Explore word frequencies

freq <- colSums(as.matrix(dtms))   
freq <- sort(colSums(as.matrix(dtms)),decreasing=TRUE)
head(freq,20)
##   just   like   will    one    can    get   good   time   dont    day 
##  12673  11105  10757  10136   9445   9341   7609   7607   7563   7300 
##   love    now   know    new    see  great   back people  think   make 
##   7228   7158   7153   6623   5672   5481   5480   5405   5262   4950
tail(freq,20)
##  youngers    younot     youto   yucatan     zachs      zane      zani 
##         2         2         2         2         2         2         2 
##    zazzle    zealot zechariah     zeiss    zeroed     zevon   ziggler 
##         2         2         2         2         2         2         2 
##      zimm  zippered   zooming     zotto      zues    zuzana 
##         2         2         2         2         2         2

Create a data frame of words and word frequencies

wf <- data.frame(word=names(freq), freq=freq)   
head(wf)
##      word  freq
## just just 12673
## like like 11105
## will will 10757
## one   one 10136
## can   can  9445
## get   get  9341
p <- ggplot(subset(wf, freq>5000), aes(word, freq))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))   
p 

Create word cloud showing more frequent words in large font

set.seed(142)   
wordcloud(names(freq), freq, min.freq=1000) 

wordcloud(names(freq), freq, min.freq=1000, scale=c(5, .1), colors=brewer.pal(6, "Dark2")) 

Use package qdap to compute frequencies of words, letters

words <- dtm %>% as.matrix %>% colnames %>% (function(x) x[nchar(x) < 20])
head(words)
## [1] "aaa"            "aaaa"           "aaaaaaandwhere" "aaaaaalllll"   
## [5] "aaaaahhhhh"     "aaaaandgo"
tail(words)
## [1] "zymurgy" "zynga"   "zyrtec"  "zzere"   "zzt"     "zzzzz"
summary(nchar(words))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   6.000   8.000   7.899  10.000  19.000

Plot the number of words and the number of characters in word

a <- table(nchar(words))
barplot(a, col="blue", xlab="Number of characters", ylab="Number of words", main="Number of words vs number of characters")

Letter frequencies in Corpus

Plot the letter frequencies in the Corpus

words %>%
    str_split("") %>%
    sapply(function(x) x[-1]) %>%
    unlist %>%
    dist_tab %>%
    mutate(Letter=factor(toupper(interval),
                         levels=toupper(interval[order(freq)]))) %>%
    ggplot(aes(Letter, weight=percent)) +
    geom_bar() +
    coord_flip() +
    ylab("Proportion") +
    scale_y_continuous(breaks=seq(0, 12, 2),
                       label=function(x) paste0(x, "%"),
                       expand=c(0,0), limits=c(0,12))

Create unigram model

Create a unigram model. Calculate the word probabilities and arrange the words in descending order of probability.

tdm <- TermDocumentMatrix(docs)
tdm <- removeSparseTerms(tdm, 0.54) 
a <- as.matrix(tdm)
b <- rowSums(a)
unigram <- as.data.frame(b)
colnames(unigram) <- c("freq")
unigram$term <- rownames(unigram)
unigram$prob <- with(unigram, freq/sum(freq))
unigram <- arrange(unigram,desc(prob))

Determine the size of the unigram model

Display the size and the 10 most frequent words and the 10 least frequent words

dim(unigram)
## [1] 32897     3
head(unigram,10)
##     freq term        prob
## 1  12673 just 0.007416832
## 2  11105 like 0.006499165
## 3  10757 will 0.006295499
## 4  10136  one 0.005932061
## 5   9445  can 0.005527656
## 6   9341  get 0.005466790
## 7   7609 good 0.004453143
## 8   7607 time 0.004451972
## 9   7563 dont 0.004426221
## 10  7300  day 0.004272301
tail(unigram,10)
##       freq     term         prob
## 32888    2    zeiss 1.170493e-06
## 32889    2   zeroed 1.170493e-06
## 32890    2    zevon 1.170493e-06
## 32891    2  ziggler 1.170493e-06
## 32892    2     zimm 1.170493e-06
## 32893    2 zippered 1.170493e-06
## 32894    2  zooming 1.170493e-06
## 32895    2    zotto 1.170493e-06
## 32896    2     zues 1.170493e-06
## 32897    2   zuzana 1.170493e-06

Create bigram model

Calculate the bigram probabilities. Arrange the bigrams in descending order of probability.

BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm <- TermDocumentMatrix(docs, control = list(tokenize = BigramTokenizer))
tdm <- removeSparseTerms(tdm, 0.64) 
a <- as.matrix(tdm)
b <- rowSums(a)
bigram <- as.data.frame(b)
colnames(bigram) <- c("freq")
bigram$term <- rownames(bigram)
bigram$prob <- with(bigram, freq/sum(freq))
bigram <- arrange(bigram,desc(prob))

Determine the size of the bigram model

Display the size and the 10 most frequent bigrams and the 10 least frequent bigrams

dim(bigram)
## [1] 79690     3
head(bigram,10)
##    freq            term        prob
## 1  1097       right now 0.002449213
## 2   948       cant wait 0.002116549
## 3   877       dont know 0.001958031
## 4   698      last night 0.001558387
## 5   626        im going 0.001397637
## 6   593       feel like 0.001323959
## 7   554 looking forward 0.001236886
## 8   491        new york 0.001096229
## 9   455         can get 0.001015854
## 10  448  happy birthday 0.001000225
tail(bigram,10)
##       freq              term         prob
## 79681    2           yup can 4.465292e-06
## 79682    2         yup first 4.465292e-06
## 79683    2       yup kicking 4.465292e-06
## 79684    2        zach braff 4.465292e-06
## 79685    2        zen garden 4.465292e-06
## 79686    2 zimmerman florida 4.465292e-06
## 79687    2          zip file 4.465292e-06
## 79688    2         zone also 4.465292e-06
## 79689    2          zone can 4.465292e-06
## 79690    2       zone people 4.465292e-06

Create trigram model

Calculate the trigram probabilities. Arrange the trigrams in descending order of probability.

TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer))
tdm <- removeSparseTerms(tdm, 0.66) 
a <- as.matrix(tdm)
b <- rowSums(a)
trigram <- as.data.frame(b)
colnames(trigram) <- c("freq")
trigram$term <- rownames(trigram)
trigram$prob <- with(trigram, freq/sum(freq))
trigram <- arrange(trigram,desc(prob))

Determine the size of the trigram model

Display the size and the 10 most frequent trigrams and the 10 least frequent trigrams

dim(trigram)
## [1] 8327    3
head(trigram,10)
##    freq               term        prob
## 1   192  happy mothers day 0.006960052
## 2   186      cant wait see 0.006742551
## 3   110        let us know 0.003987530
## 4   103     happy new year 0.003733778
## 5    89     im pretty sure 0.003226274
## 6    67     dont even know 0.002428768
## 7    52      cinco de mayo 0.001885014
## 8    51      new york city 0.001848764
## 9    45      cant wait get 0.001631262
## 10   45 im looking forward 0.001631262
tail(trigram,10)
##      freq                 term         prob
## 8318    2    youre ready youre 7.250054e-05
## 8319    2 youre really getting 7.250054e-05
## 8320    2      youve done good 7.250054e-05
## 8321    2      youve got going 7.250054e-05
## 8322    2        youve got one 7.250054e-05
## 8323    2      youve got right 7.250054e-05
## 8324    2        youve got two 7.250054e-05
## 8325    2     youve read first 7.250054e-05
## 8326    2       youve read one 7.250054e-05
## 8327    2      yr old daughter 7.250054e-05

Interesting observations

  1. The 5 most frequent words in the corpus are just, like, will, one and can.
  2. The 5 most frequent letters are E, A, I, N and O (a quick check is sketched below).
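
The letter ranking is read off the letter-frequency plot above. As a quick sanity check, here is a minimal base R sketch; it assumes the words character vector built in the qdap section and simply tabulates every character, so it approximates rather than reproduces the exact pipeline used for the plot.

# Quick sanity check of the letter ranking (assumes the `words` vector from the qdap section)
letter_counts <- sort(table(unlist(strsplit(words, ""))), decreasing=TRUE)
head(letter_counts, 5)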

Conclusion

This document ingests a sample of tweets, blogs and news items and performs preliminary exploratory analysis on it. The following was done:

  1. A sample of tweets, blogs and news items is taken and stored
  2. These samples are then ingested into a corpus
  3. The corpus is cleaned of stopwords, punctuation, whitespace and profanity
  4. A Document Term Matrix (DTM) is created
  5. The DTM is used to analyze the frequencies of the words
  6. Barplots and a word cloud are created to depict the word frequencies
  7. The frequencies of letters are also plotted
  8. Unigram, bigram and trigram models are created
  9. The sizes of the unigram, bigram and trigram models are computed, and the most frequent and least frequent unigrams, bigrams and trigrams are displayed
  10. This sets the model up to predict the next word using Markov chains and the Katz backoff algorithm; a minimal backoff-style lookup is sketched below
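
As a pointer towards that next step, below is a minimal, illustrative backoff-style lookup over the unigram, bigram and trigram data frames built above (each sorted by descending prob, with a term column). It is only a sketch, not full Katz backoff: it falls back from trigrams to bigrams to the most frequent unigram, applies no discounting, and the function name predict_next is chosen here purely for illustration.

# Illustrative backoff-style next-word lookup (no discounting; not full Katz backoff)
predict_next <- function(phrase, trigram, bigram, unigram) {
    w <- tolower(unlist(strsplit(phrase, "\\s+")))
    n <- length(w)
    # 1. Try trigrams beginning with the last two words of the phrase
    if (n >= 2) {
        hit <- trigram[grepl(paste0("^", w[n-1], " ", w[n], " "), trigram$term), ]
        if (nrow(hit) > 0) return(sub(".* ", "", hit$term[1]))
    }
    # 2. Back off to bigrams beginning with the last word
    hit <- bigram[grepl(paste0("^", w[n], " "), bigram$term), ]
    if (nrow(hit) > 0) return(sub(".* ", "", hit$term[1]))
    # 3. Back off to the single most frequent unigram
    unigram$term[1]
}
predict_next("cant wait", trigram, bigram, unigram)   # returns "see" given the trigram table above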