The data comes from a corpus called HC Corpora (www.corpora.heliohost.org). It is downloaded from the "Data" link.
The goal of this project is:
Questions to consider:
library(stringi)
library(dplyr)
library(tm)
library(wordcloud)
library(ggplot2)
library(gridExtra)
library(RWeka)
The data is first downloaded and extracted. Then, only the English-language (en_US) dataset is considered for the analysis.
## Read one corpus file into a character vector, one element per line.
##
## The connection is opened in binary mode ("rb") so that a stray control
## character inside the file cannot be mistaken for end-of-file, which would
## silently truncate the read. Embedded NUL bytes are skipped instead of
## raising warnings, and the lines are marked as UTF-8.
##
## Arguments:
##   path  Path to the text file.
## Returns: character vector of the file's lines.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  ## Always release the connection, even if readLines() fails
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
twitter <- read_corpus("en_US.twitter.txt")
blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
To get a sense of the data set, some basic features, such as file size in bytes, number of lines, number of words and average word count per line, are summarised in a table.
## Summary table with one row per source: file size on disk, number of lines
## read, total number of words, and the average number of words per line.
details <- data.frame(
  Name = c("twitter", "blogs", "news"),
  Size_Bytes = file.size(
    c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt")
  ),
  Length = lengths(list(twitter, blogs, news)),
  Word_count = vapply(
    list(twitter, blogs, news),
    function(lines) sum(stri_count_words(lines)),
    integer(1)
  )
) %>%
  mutate(Words_per_line = Word_count / Length)
print(details)
## Name Size_Bytes Length Word_count Words_per_line
## 1 twitter 167105338 2360148 30218125 12.80349
## 2 blogs 210160014 899288 38154238 42.42716
## 3 news 205811889 77259 2693898 34.86840
The table shows that the files are too large to process in full, so the analysis is performed on a random sample of the data set.
We have three different data files from three sources. Considering limitations such as file size and speed, a random sample of only 1000 lines is drawn from each data set.
The full list of profanity words can be downloaded and extracted from the "badwords" link; these words are removed from the sample during cleaning.
## Fix the RNG seed so that the sample, and every table and plot derived from
## it, can be reproduced. The seed value itself is arbitrary.
set.seed(12345)
## Draw up to 1000 lines from each source (fewer only if a source is shorter
## than that, where sample() would otherwise fail) and pool them.
sample_data <- c(
  sample(twitter, min(1000, length(twitter))),
  sample(blogs, min(1000, length(blogs))),
  sample(news, min(1000, length(news)))
)
## Profanity list, one entry per line; it is removed from the text during cleaning
badwords <- readLines("full-list-of-bad-words-text-file_2018_03_26.txt")
## Clean raw text lines before tokenisation.
##
## Steps, in order: strip punctuation, drop characters that cannot be
## converted to ASCII, strip digits, lower-case, blank out single-letter
## words, remove profanity plus a few short fragments, and collapse runs of
## whitespace.
##
## Arguments:
##   x          Raw text lines (coerced to character with paste()).
##   profanity  Character vector of words to remove. Defaults to the global
##              `badwords` list, so existing one-argument calls are unchanged.
## Returns: character vector of cleaned text.
text_data <- function(x, profanity = badwords) {
  text <- paste(x)
  text <- removePunctuation(text)
  ## Removing special characters
  text <- iconv(text, "UTF-8", "ASCII", sub = "")
  text <- removeNumbers(text)
  ## Converting to lower case
  text <- tolower(text)
  ## Replace single-letter words with a space
  text <- gsub("\\b[a-z]\\b{1}", replacement = " ", text)
  ## "s", "ve" and "m" are presumably contraction fragments -- TODO confirm
  text <- removeWords(text, c(profanity, "s", "ve", "m"))
  ## Return the result visibly instead of ending on an assignment
  stripWhitespace(text)
}
## Clean the pooled sample
text <- text_data(sample_data)
## Removing stopwords
text1 <- removeWords(text, stopwords("english"))
text1 <- stripWhitespace(text1)
## Word cloud of at most 50 words, most frequent placed first. It is drawn
## from `text`, i.e. the cleaned text before the stop-word removal above.
wordcloud(text, random.order = FALSE, max.words = 50, colors = rainbow(3))
Using the functions described below, we generate unigrams, bigrams and trigrams from the cleaned data and plot the most frequent ones.
## Stopwords
## Input for the n-gram helpers: the cleaned, stop-word-free lines, passed on
## as-is. They must not be tokenised here first. NGramTokenizer() with its
## default control emits every 1-, 2- and 3-gram, and tokenising that mixed
## output a second time counts each word once for every n-gram it appears in,
## which inflates the unigram and bigram frequencies.
ns_gram <- text1
## Unigram frequency table.
##
## Arguments:
##   x  Character vector of text to tokenise. The argument is now actually
##      used, rather than the global `ns_gram` being read regardless of it.
## Returns: data frame with columns `x` (the unigram) and `Freq` (its count),
##   sorted by descending `Freq`.
grams1 <- function(x) {
  tokens <- NGramTokenizer(x, Weka_control(min = 1, max = 1))
  ## Naming the table dimension keeps the token column called `x`
  counts <- data.frame(table(x = tokens))
  arrange(counts, desc(Freq))
}
## Bigram frequency table.
##
## Arguments:
##   x  Character vector of text to tokenise. The argument is now actually
##      used, rather than the global `ns_gram` being read regardless of it.
## Returns: data frame with columns `x` (the bigram) and `Freq` (its count),
##   sorted by descending `Freq`.
grams2 <- function(x) {
  tokens <- NGramTokenizer(x, Weka_control(min = 2, max = 2))
  ## Naming the table dimension keeps the token column called `x`
  counts <- data.frame(table(x = tokens))
  arrange(counts, desc(Freq))
}
## Trigram frequency table.
##
## Arguments:
##   x  Character vector of text to tokenise. The argument is now actually
##      used, rather than the global `ns_gram` being read regardless of it.
## Returns: data frame with columns `x` (the trigram) and `Freq` (its count),
##   sorted by descending `Freq`.
grams3 <- function(x) {
  tokens <- NGramTokenizer(x, Weka_control(min = 3, max = 3))
  ## Naming the table dimension keeps the token column called `x`
  counts <- data.frame(table(x = tokens))
  arrange(counts, desc(Freq))
}
## Frequency tables (columns `x` and `Freq`, most frequent first) for the
## one-, two- and three-word n-grams.
unigrams <- grams1(ns_gram)
bigrams <- grams2(ns_gram)
trigrams <- grams3(ns_gram)
## Horizontal bar chart of the most frequent n-grams.
##
## Arguments:
##   ngram_freq  Data frame with columns `x` (n-gram) and `Freq` (count),
##               sorted by descending `Freq`.
##   gram_label  Label used for the axis and the title, e.g. "Uni-grams".
##   top_n       Number of n-grams to show (default 20).
## Returns: a ggplot object.
plot_top_ngrams <- function(ngram_freq, gram_label, top_n = 20) {
  ## head() does not pad with NA rows when fewer than `top_n` n-grams exist
  top <- head(ngram_freq, top_n)
  ggplot(top, aes(x = reorder(x, Freq), y = Freq)) +
    ## show.legend = FALSE replaces the deprecated guides(fill = FALSE)
    geom_col(aes(fill = x), show.legend = FALSE) +
    ## Print each count at the base of its bar
    geom_text(aes(y = 1, label = Freq),
              hjust = 0, vjust = 0.5, size = 4, colour = "black",
              fontface = "bold") +
    xlab(gram_label) + ylab("Frequency") +
    ggtitle(paste("Top", top_n, gram_label)) +
    coord_flip() +
    theme_bw()
}
## Unigram Plotting
p1s <- plot_top_ngrams(unigrams, "Uni-grams")
## Bigram Plotting
p2s <- plot_top_ngrams(bigrams, "Bi-grams")
## Trigram Plotting
p3s <- plot_top_ngrams(trigrams, "Tri-grams")
## Show the three charts side by side
grid.arrange(p1s, p2s, p3s, ncol = 3, top = "When stopwords are excluded")
## Ratio of distinct unigrams to total unigram occurrences. Despite the name,
## the value is a fraction, not a percentage.
unique_word_percent <- nrow(unigrams)/sum(unigrams$Freq)
unique_word_percent
## [1] 0.0518642
## Number of leading rows of a frequency table needed to reach a given share
## of the total frequency.
##
## Arguments:
##   x              Data frame with a numeric `Freq` column. Rows are consumed
##                  in the order given, so sort by descending `Freq` to get
##                  the smallest vocabulary.
##   percent_cover  Target share of the total frequency, as a fraction
##                  (e.g. 0.5 for 50%).
## Returns: the index of the first row at which the running total reaches
##   `percent_cover * sum(x$Freq)`, or NULL if `x` has no rows or the target
##   is never reached.
coverage <- function(x, percent_cover) {
  ## Work in doubles so that large counts cannot overflow integer arithmetic
  freq <- as.numeric(x$Freq)
  if (length(freq) == 0) {
    return(NULL)
  }
  ## The total is computed once instead of on every loop iteration
  reached <- which(cumsum(freq) >= percent_cover * sum(freq))
  if (length(reached) == 0) {
    return(NULL)
  }
  reached[1]
}
## Number of distinct words needed to cover 50% and 90% of all word instances
coverage(unigrams, 0.5)
## [1] 968
coverage(unigrams, 0.9)
## [1] 8559
We need 968 words to cover 50% of all word instances in the language and 8559 words to cover 90% of all word instances in the language.
The number of unique words required grows steeply, roughly exponentially, as the target coverage increases.
The best way to identify words from foreign languages is to compare the data against a well-known dictionary for the language. Since most of the data considered is written in English, this exploratory analysis does not attempt it.
The number of unique words needed grows close to exponentially with coverage. We can therefore reduce the number of low-frequency unique words by substituting them with more common synonyms.