Exploratory data analysis

Loading the data sets and counting the number of entries

We will first analyze the three US English data sets: blogs, news and tweets. Let us start by counting the number of lines in each file.
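The variables blogs, news and tweets used below hold the paths to the three input files, and the tokenization and plotting steps rely on a handful of packages. A minimal setup sketch (the package list is an assumption about what was loaded earlier; the paths are taken from the printed output below):

# packages assumed for the rest of the analysis (an assumption, not shown in the original)
library(dplyr)      # piping and counting
library(tidytext)   # unnest_tokens for n-grams
library(ggplot2)    # bar charts of the top n-grams
library(tm)         # stopwords("en")

# paths to the English corpora, as they appear in the printed output
blogs  <- "./final/en_US/en_US.blogs.txt"
news   <- "./final/en_US/en_US.news.txt"
tweets <- "./final/en_US/en_US.twitter.txt"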

# getFileLength counts the number of lines in the input file
getFileLength <- function(filepath) {
  con <- file(filepath, "r")
  lineNum <- 0
  # read one line at a time so the whole file never sits in memory
  while (TRUE) {
    line <- readLines(con, n = 1)
    if (length(line) == 0) {
      break
    }
    lineNum <- lineNum + 1
  }
  close(con)
  return(lineNum)
}
# count the number of lines in each file
lenBlogs<-getFileLength(blogs)
lenTweets<-getFileLength(tweets)
lenNews<-getFileLength(news)

print(paste('The number of lines in the file', blogs, 'is', lenBlogs))
## [1] "The number of lines in the file ./final/en_US/en_US.blogs.txt is 899288"
print(paste('The number of lines in the file', tweets, 'is', lenTweets))
## [1] "The number of lines in the file ./final/en_US/en_US.twitter.txt is 2360148"
print(paste('The number of lines in the file', news, 'is', lenNews))
## [1] "The number of lines in the file ./final/en_US/en_US.news.txt is 1010242"
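As an aside, the same counts could be obtained in a single call, at the cost of loading a whole file into memory at once; the line-by-line loop above avoids that. A possible shortcut (not used in the original run):

# concise but memory-hungry alternative to getFileLength()
lenBlogsAlt <- length(readLines(blogs))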

The files are quite big, so we will take only a small percentage of lines from each file for further analysis.

Sampling the datasets

Now let us look in a bit more detail at the word counts per file and their distribution. For this purpose, we will load only a 1% sample from each file, selecting lines with a binomial draw, and save each sample to a text file for later use.
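Since rbinom draws a fresh random sample on every run, fixing the RNG seed before sampling would make the 1% subsets reproducible. This is only a suggestion and was not necessarily part of the original run:

# optional: fix the seed so the binomial sampling is reproducible
set.seed(1234)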

# readFileSamples reads the file line by line and keeps roughly 1% of the lines,
# using a binomial draw to decide whether each line is kept
readFileSamples <- function(filepath, len) {
  con <- file(filepath, "r")
  # one 0/1 indicator per line, with a 1% probability of keeping the line
  sample <- rbinom(len, 1, .01)
  lines <- c()
  for (i in 1:len) {
    line <- readLines(con, n = 1)
    if (length(line) == 0) {
      break
    }
    if (sample[i] == 1) {
      lines <- c(lines, line)
    }
  }
  close(con)
  return(lines)
}

#generate random samples using binomial distribution for blogs
linesBlogs<-readFileSamples(blogs,lenBlogs)
#write sample files
d<-lapply(linesBlogs, write, file="samplesblogs.txt", append=T)

#generate random samples using binomial distribution for tweets
linesTweets<-readFileSamples(tweets,lenTweets)
#write sample files
d<-lapply(linesTweets, write, file="samplestweets.txt", append=T)

#generate random samples using binomial distribution for news
linesNews<-readFileSamples(news,lenNews)
#write sample files
d<-lapply(linesNews, write, file="samplesnews.txt", append=T)
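Because the 1% samples are saved to samplesblogs.txt, samplestweets.txt and samplesnews.txt, a later session could skip the expensive sampling step and simply reload them, for example:

# reload the saved samples in a later session instead of resampling
linesBlogs  <- readLines("samplesblogs.txt")
linesTweets <- readLines("samplestweets.txt")
linesNews   <- readLines("samplesnews.txt")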

# combine the three samples into a single data frame
linesDFBlog<-as.data.frame(linesBlogs)
colnames(linesDFBlog)<-('text')
linesDFNews<-as.data.frame(linesNews)
colnames(linesDFNews)<-('text')
linesDFTweets<-as.data.frame(linesTweets)
colnames(linesDFTweets)<-('text')
linesAll<-rbind(linesDFBlog,linesDFNews)
linesAll<-rbind(linesAll,linesDFTweets)
# load English stop words
stopwords = data.frame(word = stopwords("en"))

# removes any word in words_to_remove from a single line of text
remove_words_from_text <- function(text) {
  text <- unlist(strsplit(text, " "))
  paste(text[!text %in% words_to_remove], collapse = " ")
}
words_to_remove <- stopwords$word
linesAll$text <- lapply(linesAll$text, remove_words_from_text)
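To illustrate what remove_words_from_text does on a single line (the exact result depends on the stop word list, so this is only indicative):

# stop words such as "this", "is", "a", "of" and "the" are dropped
remove_words_from_text("this is a quick test of the cleaner")
# expected to return something like "quick test cleaner"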

Top frequencies for words

allTextUniGrams <- linesAll %>% unnest_tokens(word, text) %>% anti_join(stopwords)
## Joining, by = "word"
frequency = allTextUniGrams %>% count(word) %>% arrange(desc(n))
topFrequency = head(frequency, 10)
print(topFrequency)
##      word    n
## 1    time 2164
## 2     day 1750
## 3    love 1633
## 4  people 1592
## 5    said 1294
## 6       3 1086
## 7       2 1073
## 8       1 1022
## 9      rt  895
## 10   life  885
topFrequency %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "slateblue") +
  xlab(NULL) +
  coord_flip()

Top frequencies for bigrams

allTextBiGrams <- linesAll %>% unnest_tokens(word, text, token = 'ngrams', n = 2) %>% anti_join(stopwords)
frequency = allTextBiGrams %>% count(word) %>% arrange(desc(n))
topFrequency = head(frequency, 10)
print(topFrequency)
##       word    n
## 1   of the 4405
## 2   in the 4084
## 3   to the 2171
## 4  for the 2070
## 5   on the 1992
## 6    to be 1652
## 7   at the 1426
## 8  and the 1286
## 9     in a 1221
## 10    is a 1067
topFrequency %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "slateblue") +
  xlab(NULL) +
  coord_flip()

Top frequencies for trigrams

allTextTriGrams <- linesAll %>% unnest_tokens(word, text, token = 'ngrams', n = 3) %>% anti_join(stopwords)
frequency = allTextTriGrams %>% count(word) %>% arrange(desc(n))
topFrequency = head(frequency, 10)
print(topFrequency)
##              word    n
## 1            <NA> 1195
## 2      one of the  325
## 3        a lot of  314
## 4  thanks for the  238
## 5         to be a  181
## 6     going to be  164
## 7       i want to  161
## 8      the end of  161
## 9      as well as  156
## 10     out of the  154
topFrequency %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "slateblue") +
  xlab(NULL) +
  coord_flip()