library(stringr)
library(tm)
library(dplyr)
library(qdap)
library(ggplot2)
library(wordcloud)
library(RWeka)
library(caTools)
This project performs preliminary analysis of text data taken from a corpus called HC Corpora. The data comes from four locales: en_US, de_DE, ru_RU and fi_FI. Only en_US is used here; it consists of blogs, tweets and news items. For this analysis a 10% sample is taken for creating a predictive model of the next word, on the premise that a sample drawn from a large population is representative of that population and appropriate inferences can be made from it. The 10% sample is divided into a training set (75%) and a test set (25%).
The training samples of tweets, blogs and news items are ingested into a corpus, which is then cleaned of extra whitespace, punctuation, stopwords, etc., and profanities are removed. Detailed analysis is then done on word frequencies, word lengths and letter frequencies.
Create samp_tw.txt with a sample of tweets, samp_blogs.txt with a sample of blog posts, and samp_news.txt with a sample of news items. Each sample is split 75/25 into training and test files.
# Create samp_tw_train.txt and samp_tw_test.txt with the sample tweets
con_trn <- file("./train/samp_tw_train.txt", "wt")
con_test <- file("./test/samp_tw_test.txt", "wt")
lines <- readLines("../en_US/en_US.twitter.txt")
samp_tw <- sample(lines, size = length(lines) * 0.10, replace = FALSE)
# Keep only letters, digits, spaces, apostrophes and basic sentence punctuation
samp_tw <- gsub("[^0-9A-Za-z .,?!']", "", samp_tw)
split <- sample.split(samp_tw, SplitRatio = 3/4)  # logical vector: TRUE marks training lines
train <- samp_tw[split]
test <- samp_tw[!split]
writeLines(train,con=con_trn,sep="\n")
writeLines(test,con=con_test,sep="\n")
close(con_trn)
close(con_test)
# Create samp_blog_train.txt and samp_blog_test.txt with sample text from the blogs
con_trn <- file("./train/samp_blog_train.txt", "wt")
con_test <- file("./test/samp_blog_test.txt", "wt")
lines <- readLines("../en_US/en_US.blogs.txt")
samp_blog <- sample(lines, size = length(lines) * 0.10, replace = FALSE)
samp_blog <- gsub("[^0-9A-Za-z .,?!']", "", samp_blog)
split <- sample.split(samp_blog, SplitRatio = 3/4)
train <- samp_blog[split]
test <- samp_blog[!split]
writeLines(train,con=con_trn,sep="\n")
writeLines(test,con=con_test,sep="\n")
close(con_trn)
close(con_test)
# Create samp_news_train.txt and samp_news_test.txt with sample text from the news items
con_trn <- file("./train/samp_news_train.txt", "wt")
con_test <- file("./test/samp_news_test.txt", "wt")
lines <- readLines("../en_US/en_US.news.txt")
samp_news <- sample(lines, size = length(lines) * 0.10, replace = FALSE)
samp_news <- gsub("[^0-9A-Za-z .,?!']", "", samp_news)
split <- sample.split(samp_news, SplitRatio = 3/4)
train <- samp_news[split]
test <- samp_news[!split]
writeLines(train,con=con_trn,sep="\n")
writeLines(test,con=con_test,sep="\n")
close(con_trn)
close(con_test)
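The three blocks above repeat the same sample/clean/split/write pattern. As a refactoring sketch only (not part of the original analysis; the function name sample_and_split and its arguments are hypothetical), the steps could be factored into a helper:
sample_and_split <- function(infile, prefix, frac = 0.10, ratio = 3/4) {
    lines <- readLines(infile)
    samp <- sample(lines, size = length(lines) * frac, replace = FALSE)
    samp <- gsub("[^0-9A-Za-z .,?!']", "", samp)        # same character filter as above
    split <- sample.split(samp, SplitRatio = ratio)     # logical vector: TRUE = training line
    writeLines(samp[split],  file.path("./train", paste0(prefix, "_train.txt")))
    writeLines(samp[!split], file.path("./test",  paste0(prefix, "_test.txt")))
    invisible(samp)
}
# e.g. samp_tw <- sample_and_split("../en_US/en_US.twitter.txt", "samp_tw")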
# Print the length of the samples from the tweets, blogs and news data
print(length(samp_tw))
## [1] 118007
print(length(samp_blog))
## [1] 44964
print(length(samp_news))
## [1] 3862
Create a corpus of the sample documents (tweets, blogs and news) from the ./data directory. The corpus includes samp_tw.txt, samp_blogs.txt and samp_news.txt.
cname <- "./data"
docs <- Corpus(DirSource(cname), readerControl=list(language="en_US"))
inspect(docs)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 10242905
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 764212
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 7997482
Clean the corpus with the following transformations: strip extra whitespace, remove punctuation, numbers and English stopwords, convert to lower case, and stem the documents.
docs <- tm_map(docs,stripWhitespace)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, stemDocument)
docs <- tm_map(docs, PlainTextDocument)  # re-wrap documents as PlainTextDocument after the base-R transformations above
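As a quick sanity check (illustrative only; the one-sentence corpus below is made up and not part of the sample data), the same cleaning steps can be applied to a tiny in-memory corpus and the result inspected. Here tolower is wrapped in content_transformer() so the toy corpus keeps its document class without the PlainTextDocument re-wrap.
tiny <- VCorpus(VectorSource("The 3 QUICK brown foxes were jumping, loudly!!"))
tiny <- tm_map(tiny, stripWhitespace)
tiny <- tm_map(tiny, removePunctuation)
tiny <- tm_map(tiny, removeNumbers)
tiny <- tm_map(tiny, content_transformer(tolower))
tiny <- tm_map(tiny, removeWords, stopwords("english"))
tiny <- tm_map(tiny, stemDocument)
as.character(tiny[[1]])   # numbers, punctuation and stopwords removed; remaining words stemmed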
Read a list of about 362 profane words, create it as a character vector, and remove these words from the corpus.
profanity <- read.table("./profanity/profanity.txt",sep="\n")
badwords <- as.character(profanity$V1)
docs <- tm_map(docs, removeWords, badwords)
For further investigation, the corpus is converted into a Document Term Matrix (DTM), which has one row per document and one column per term, with each cell holding the frequency of that term in that document. After creating the DTM, explore it using inspect.
dtm <- DocumentTermMatrix(docs)
inspect(dtm[1:3,3000:3020])
## <<DocumentTermMatrix (documents: 3, terms: 21)>>
## Non-/sparse entries: 29/34
## Sparsity : 54%
## Maximal term length: 10
## Weighting : term frequency (tf)
##
## Terms
## Docs amarillo amaro amarosa amaryllis amas amasebal amasin amass
## character(0) 5 0 0 2 0 0 0 1
## character(0) 0 0 0 0 0 0 0 0
## character(0) 0 1 1 1 2 1 2 1
## Terms
## Docs amassed amasummit amateur amateurish amateurly amateurs
## character(0) 0 0 24 2 1 3
## character(0) 0 0 3 0 0 1
## character(0) 2 1 4 0 0 1
## Terms
## Docs amawake amayzayn amaz amazayn amaze amazebal amazeballs
## character(0) 0 0 28 0 7 0 0
## character(0) 0 0 0 0 0 0 0
## character(0) 1 1 173 1 10 1 4
dtm
## <<DocumentTermMatrix (documents: 3, terms: 116901)>>
## Non-/sparse entries: 161678/189025
## Sparsity : 54%
## Maximal term length: 109
## Weighting : term frequency (tf)
Also create a Term Document Matrix (TDM), the transpose of the DTM with terms in rows and documents in columns, and explore the words and their corresponding frequencies.
tdm <- TermDocumentMatrix(docs)
inspect(tdm[3000:3020,1:3])
## <<TermDocumentMatrix (terms: 21, documents: 3)>>
## Non-/sparse entries: 29/34
## Sparsity : 54%
## Maximal term length: 10
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0) character(0) character(0)
## amarillo 5 0 0
## amaro 0 0 1
## amarosa 0 0 1
## amaryllis 2 0 1
## amas 0 0 2
## amasebal 0 0 1
## amasin 0 0 2
## amass 1 0 1
## amassed 0 0 2
## amasummit 0 0 1
## amateur 24 3 4
## amateurish 2 0 0
## amateurly 1 0 0
## amateurs 3 1 1
## amawake 0 0 1
## amayzayn 0 0 1
## amaz 28 0 173
## amazayn 0 0 1
## amaze 7 0 10
## amazebal 0 0 1
## amazeballs 0 0 4
freq <- colSums(as.matrix(dtm))
length(freq)
## [1] 116901
ord <- order(freq)
freq[head(ord)]
## aaaa aaaaaaandwhere aaaaaalllll aaaaahhhhh
## 1 1 1 1
## aaaaandgo aaaaannnnnnddddd
## 1 1
freq[tail(ord)]
## get can one will like just
## 9341 9445 10136 10757 11105 12673
head(table(freq), 20)
## freq
## 1 2 3 4 5 6 7 8 9 10 11 12
## 67773 13543 6721 4078 2937 2133 1607 1440 1138 944 841 705
## 13 14 15 16 17 18 19 20
## 643 608 506 417 453 374 386 290
tail(table(freq), 20)
## freq
## 4950 5262 5405 5480 5481 5672 6623 7153 7158 7228 7300 7563
## 1 1 1 1 1 1 1 1 1 1 1 1
## 7607 7609 9341 9445 10136 10757 11105 12673
## 1 1 1 1 1 1 1 1
plot(log(table(freq)), xlab="Frequency of word",ylab="log(number of words)", main="log(number words) vs word frequencies")
m <- as.matrix(dtm)
dim(m)
## [1] 3 116901
write.csv(m, file="./misc/dtm.csv")
dtms <- removeSparseTerms(dtm, 0.46) # keep only terms with sparsity below 46%, i.e. terms present in at least 2 of the 3 documents
freq <- sort(colSums(as.matrix(dtms)), decreasing = TRUE)
head(freq,20)
## just like will one can get good time dont day
## 12673 11105 10757 10136 9445 9341 7609 7607 7563 7300
## love now know new see great back people think make
## 7228 7158 7153 6623 5672 5481 5480 5405 5262 4950
tail(freq,20)
## youngers younot youto yucatan zachs zane zani
## 2 2 2 2 2 2 2
## zazzle zealot zechariah zeiss zeroed zevon ziggler
## 2 2 2 2 2 2 2
## zimm zippered zooming zotto zues zuzana
## 2 2 2 2 2 2
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
## word freq
## just just 12673
## like like 11105
## will will 10757
## one one 10136
## can can 9445
## get get 9341
p <- ggplot(subset(wf, freq>5000), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
set.seed(142)
wordcloud(names(freq), freq, min.freq=1000)
wordcloud(names(freq), freq, min.freq=1000, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
words <- dtm %>% as.matrix %>% colnames %>% (function(x) x[nchar(x) < 20])  # keep only terms shorter than 20 characters
head(words)
## [1] "aaa" "aaaa" "aaaaaaandwhere" "aaaaaalllll"
## [5] "aaaaahhhhh" "aaaaandgo"
tail(words)
## [1] "zymurgy" "zynga" "zyrtec" "zzere" "zzt" "zzzzz"
summary(nchar(words))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 6.000 8.000 7.899 10.000 19.000
a <- table(nchar(words))
barplot(a,col="blue",xlab="Number of characters",ylab="Number of words",main="No words vs no. characters")
Plot the letter frequencies in the Corpus
words %>%
str_split("") %>%
sapply(function(x) x[-1]) %>%
unlist %>%
dist_tab %>%
mutate(Letter=factor(toupper(interval),
levels=toupper(interval[order(freq)]))) %>%
ggplot(aes(Letter, weight=percent)) +
geom_bar() +
coord_flip() +
ylab("Proportion") +
scale_y_continuous(breaks=seq(0, 12, 2),
label=function(x) paste0(x, "%"),
expand=c(0,0), limits=c(0,12))
Create a unigram model. Calculate the word probabilities and arrange the words in descending order of probability.
tdm <- TermDocumentMatrix(docs)
tdm <- removeSparseTerms(tdm, 0.54)
a <- as.matrix(tdm)
b <- rowSums(a)
unigram <- as.data.frame(b)
colnames(unigram) <- c("freq")
unigram$term <- rownames(unigram)
unigram$prob <- unigram$freq / sum(unigram$freq)
unigram <- arrange(unigram,desc(prob))
Display the size and the 10 most frequent words and the 10 least frequent words
dim(unigram)
## [1] 32897 3
head(unigram,10)
## freq term prob
## 1 12673 just 0.007416832
## 2 11105 like 0.006499165
## 3 10757 will 0.006295499
## 4 10136 one 0.005932061
## 5 9445 can 0.005527656
## 6 9341 get 0.005466790
## 7 7609 good 0.004453143
## 8 7607 time 0.004451972
## 9 7563 dont 0.004426221
## 10 7300 day 0.004272301
tail(unigram,10)
## freq term prob
## 32888 2 zeiss 1.170493e-06
## 32889 2 zeroed 1.170493e-06
## 32890 2 zevon 1.170493e-06
## 32891 2 ziggler 1.170493e-06
## 32892 2 zimm 1.170493e-06
## 32893 2 zippered 1.170493e-06
## 32894 2 zooming 1.170493e-06
## 32895 2 zotto 1.170493e-06
## 32896 2 zues 1.170493e-06
## 32897 2 zuzana 1.170493e-06
Calculate the bigram probabilities and arrange the bigrams in descending order of probability.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm <- TermDocumentMatrix(docs, control = list(tokenize = BigramTokenizer))
tdm <- removeSparseTerms(tdm, 0.64)
a <- as.matrix(tdm)
b <- rowSums(a)
bigram <- as.data.frame(b)
colnames(bigram) <- c("freq")
bigram$term <- rownames(bigram)
bigram$prob <- bigram$freq / sum(bigram$freq)
bigram <- arrange(bigram,desc(prob))
Display the size and the 10 most frequent bigrams and the 10 least frequent bigrams
dim(bigram)
## [1] 79690 3
head(bigram,10)
## freq term prob
## 1 1097 right now 0.002449213
## 2 948 cant wait 0.002116549
## 3 877 dont know 0.001958031
## 4 698 last night 0.001558387
## 5 626 im going 0.001397637
## 6 593 feel like 0.001323959
## 7 554 looking forward 0.001236886
## 8 491 new york 0.001096229
## 9 455 can get 0.001015854
## 10 448 happy birthday 0.001000225
tail(bigram,10)
## freq term prob
## 79681 2 yup can 4.465292e-06
## 79682 2 yup first 4.465292e-06
## 79683 2 yup kicking 4.465292e-06
## 79684 2 zach braff 4.465292e-06
## 79685 2 zen garden 4.465292e-06
## 79686 2 zimmerman florida 4.465292e-06
## 79687 2 zip file 4.465292e-06
## 79688 2 zone also 4.465292e-06
## 79689 2 zone can 4.465292e-06
## 79690 2 zone people 4.465292e-06
Calculate the trigram probabilities and arrange the trigrams in descending order of probability.
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer))
tdm <- removeSparseTerms(tdm, 0.66)
a <- as.matrix(tdm)
b <- rowSums(a)
trigram <- as.data.frame(b)
colnames(trigram) <- c("freq")
trigram$term <- rownames(trigram)
trigram$prob <- trigram$freq / sum(trigram$freq)
trigram <- arrange(trigram,desc(prob))
Display the size and the 10 most frequent trigrams and the 10 least frequent trigrams
dim(trigram)
## [1] 8327 3
head(trigram,10)
## freq term prob
## 1 192 happy mothers day 0.006960052
## 2 186 cant wait see 0.006742551
## 3 110 let us know 0.003987530
## 4 103 happy new year 0.003733778
## 5 89 im pretty sure 0.003226274
## 6 67 dont even know 0.002428768
## 7 52 cinco de mayo 0.001885014
## 8 51 new york city 0.001848764
## 9 45 cant wait get 0.001631262
## 10 45 im looking forward 0.001631262
tail(trigram,10)
## freq term prob
## 8318 2 youre ready youre 7.250054e-05
## 8319 2 youre really getting 7.250054e-05
## 8320 2 youve done good 7.250054e-05
## 8321 2 youve got going 7.250054e-05
## 8322 2 youve got one 7.250054e-05
## 8323 2 youve got right 7.250054e-05
## 8324 2 youve got two 7.250054e-05
## 8325 2 youve read first 7.250054e-05
## 8326 2 youve read one 7.250054e-05
## 8327 2 yr old daughter 7.250054e-05
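As a forward-looking sketch of how these tables might feed the planned next-word predictor (illustrative only; the predict_next helper below is hypothetical and assumes the unigram, bigram and trigram data frames built above are in memory), one can look up the highest-probability trigrams matching the last two words typed, back off to bigrams matching the last word, and finally fall back to the most frequent unigrams. This is a simple lookup with backoff, not a smoothed language model.
predict_next <- function(phrase, n = 3) {
    w <- tolower(unlist(strsplit(phrase, "\\s+")))
    last2 <- paste(tail(w, 2), collapse = " ")
    last1 <- tail(w, 1)
    # Trigrams whose first two words match the end of the phrase
    hits <- trigram[grepl(paste0("^", last2, " "), trigram$term), ]
    if (nrow(hits) == 0)   # back off to bigrams starting with the last word
        hits <- bigram[grepl(paste0("^", last1, " "), bigram$term), ]
    if (nrow(hits) == 0)   # back off to the most frequent unigrams
        return(head(unigram$term, n))
    # The tables are already sorted by probability, so return the final word
    # of the top n matching n-grams
    sapply(strsplit(head(hits$term, n), " "), tail, 1)
}
# e.g. predict_next("I cant wait")   # the trigram table above suggests "see", "get", ...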
This document ingests a sample of tweets, blogs and news items and performs preliminary exploratory analysis on it. The following was done: