We will first analyze the three US English data sets: blogs, news and tweets. Let us start by counting the number of lines in each file.
# getFileLength counts the number of lines in the input file,
# reading one line at a time so the whole file is never held in memory
getFileLength <- function(filepath) {
  con <- file(filepath, "r")
  lineNum <- 0
  while (TRUE) {
    line <- readLines(con, n = 1)
    if (length(line) == 0) {
      break
    }
    lineNum <- lineNum + 1
  }
  close(con)
  return(lineNum)
}
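Reading a single line per iteration is simple but slow on files of this size. A minimal sketch of a faster variant, assuming we are willing to read in chunks of 10,000 lines at a time (the chunk size is an arbitrary choice, not something used in the analysis below):
# sketch: count lines by reading the file in larger chunks
getFileLengthChunked <- function(filepath, chunkSize = 10000) {
  con <- file(filepath, "r")
  lineNum <- 0
  repeat {
    chunk <- readLines(con, n = chunkSize)
    if (length(chunk) == 0) break
    lineNum <- lineNum + length(chunk)
  }
  close(con)
  return(lineNum)
}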
# calculate the number of lines in each file
lenBlogs <- getFileLength(blogs)
lenTweets <- getFileLength(tweets)
lenNews <- getFileLength(news)
print(paste('The number of lines in the file', blogs, 'is', lenBlogs))
## [1] "The number of lines in the file ./final/en_US/en_US.blogs.txt is 899288"
print(paste('The number of lines in the file', tweets, 'is', lenTweets))
## [1] "The number of lines in the file ./final/en_US/en_US.twitter.txt is 2360148"
print(paste('The number of lines in the file', news, 'is', lenNews))
## [1] "The number of lines in the file ./final/en_US/en_US.news.txt is 1010242"
The files are quite big, so we will work with only a percentage of each file for further analysis.
Now let us look a bit more in detail and check the word counts per file and their distribution. For this purpose, we will load a 1% sample from each file, selecting lines with a binomial (coin-flip) draw, and save each sample to a text file for later use.
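Because the 1% sample is drawn at random, fixing the seed before sampling makes the sample reproducible across runs. A minimal sketch (the seed value 1234 is an arbitrary choice, not part of the original analysis):
set.seed(1234)  # assumed seed, only for reproducibility of the sample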
# readFileSamples returns roughly 1% of the lines of the input file,
# keeping line i when a Bernoulli draw with p = 0.01 comes up 1
readFileSamples <- function(filepath, len) {
  con <- file(filepath, "r")
  sample <- rbinom(len, 1, 0.01)
  lines <- c()
  for (i in 1:len) {
    line <- readLines(con, n = 1)
    if (length(line) == 0) {
      break
    }
    if (sample[i] == 1) {
      lines <- c(lines, line)
    }
  }
  close(con)
  return(lines)
}
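Growing `lines` with `c()` inside the loop works but is slow for files of this size. A minimal alternative sketch, assuming the machine has enough memory to hold a whole file at once (which it may not for the Twitter file):
# sketch: read the whole file, then keep ~1% of the lines with a logical mask
readFileSamplesInMemory <- function(filepath) {
  allLines <- readLines(filepath)
  keep <- rbinom(length(allLines), 1, 0.01) == 1
  allLines[keep]
}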
# draw a ~1% random sample of the blogs file and write it out
linesBlogs <- readFileSamples(blogs, lenBlogs)
writeLines(linesBlogs, "samplesblogs.txt")
# draw a ~1% random sample of the tweets file and write it out
linesTweets <- readFileSamples(tweets, lenTweets)
writeLines(linesTweets, "samplestweets.txt")
# draw a ~1% random sample of the news file and write it out
linesNews <- readFileSamples(news, lenNews)
writeLines(linesNews, "samplesnews.txt")
# combine the three samples into a single data frame with one 'text' column
linesDFBlog <- data.frame(text = linesBlogs, stringsAsFactors = FALSE)
linesDFNews <- data.frame(text = linesNews, stringsAsFactors = FALSE)
linesDFTweets <- data.frame(text = linesTweets, stringsAsFactors = FALSE)
linesAll <- rbind(linesDFBlog, linesDFNews, linesDFTweets)
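If we later wanted to compare the three corpora instead of pooling them, a small sketch of how a source label could be carried along (the `source` column is an assumption; nothing below uses it):
# sketch: tag each sampled line with the corpus it came from
linesTagged <- rbind(
  data.frame(source = "blogs", text = linesBlogs, stringsAsFactors = FALSE),
  data.frame(source = "news", text = linesNews, stringsAsFactors = FALSE),
  data.frame(source = "twitter", text = linesTweets, stringsAsFactors = FALSE)
)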
Next, we remove common English stop words from the combined sample and tokenize it into unigrams, bigrams and trigrams to see which terms occur most often.
# load English stop words into a data frame (one 'word' column, used by anti_join below)
stopwords <- data.frame(word = stopwords("en"))
# remove stop words from a single line of text
remove_words_from_text <- function(text) {
  text <- unlist(strsplit(text, " "))
  paste(text[!text %in% words_to_remove], collapse = " ")
}
words_to_remove <- stopwords$word
linesAll$text <- sapply(linesAll$text, remove_words_from_text)
Let us first look at the most frequent single words (unigrams).
allTextUniGrams <- linesAll %>% unnest_tokens(word, text) %>% anti_join(stopwords)
## Joining, by = "word"
frequency = allTextUniGrams %>% count(word) %>% arrange(desc(n))
topFrequency = head(frequency, 10)
print(topFrequency)
## word n
## 1 time 2164
## 2 day 1750
## 3 love 1633
## 4 people 1592
## 5 said 1294
## 6 3 1086
## 7 2 1073
## 8 1 1022
## 9 rt 895
## 10 life 885
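Several of the top unigrams are bare numbers, and "rt" is a Twitter retweet marker rather than a real word. A minimal sketch of one way to drop them before plotting (the regular expression and the extra exclusion list are assumptions, not part of the pipeline above):
# sketch: drop purely numeric tokens and the Twitter "rt" marker
library(stringr)
cleanedFrequency <- frequency %>%
  filter(!str_detect(word, "^[0-9]+$"), !word %in% c("rt"))
head(cleanedFrequency, 10)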
topFrequency %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col(fill = "slateblue") +
xlab(NULL) +
coord_flip()
Next we look at the most frequent bigrams in the combined sample.
allTextBiGrams <- linesAll %>% unnest_tokens(word, text, token = "ngrams", n = 2) %>% anti_join(stopwords)
frequency = allTextBiGrams %>% count(word) %>% arrange(desc(n))
topFrequency = head(frequency, 10)
print(topFrequency)
## word n
## 1 of the 4405
## 2 in the 4084
## 3 to the 2171
## 4 for the 2070
## 5 on the 1992
## 6 to be 1652
## 7 at the 1426
## 8 and the 1286
## 9 in a 1221
## 10 is a 1067
topFrequency %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col(fill = "slateblue") +
xlab(NULL) +
coord_flip()
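Note that the `anti_join(stopwords)` step has no effect on bigrams: the tokens are two-word strings, so they never match single stop words, which is why the table above is dominated by pairs like "of the" and "in the". A sketch of how the bigrams could instead be filtered on their individual words (this is not applied in the analysis above):
# sketch: split each bigram into its two words and drop pairs containing a stop word
library(tidyr)
filteredBiGrams <- allTextBiGrams %>%
  separate(word, into = c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stopwords$word, !word2 %in% stopwords$word) %>%
  unite(word, word1, word2, sep = " ")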
Finally, we look at the most frequent trigrams.
allTextTriGrams <- linesAll %>% unnest_tokens(word, text, token = "ngrams", n = 3) %>% anti_join(stopwords)
frequency = allTextTriGrams %>% count(word) %>% arrange(desc(n))
topFrequency = head(frequency, 10)
print(topFrequency)
## word n
## 1 <NA> 1195
## 2 one of the 325
## 3 a lot of 314
## 4 thanks for the 238
## 5 to be a 181
## 6 going to be 164
## 7 i want to 161
## 8 the end of 161
## 9 as well as 156
## 10 out of the 154
topFrequency %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col(fill = "slateblue") +
xlab(NULL) +
coord_flip()
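The top entry in the trigram table is `<NA>`: `unnest_tokens` returns `NA` for lines that contain fewer than three words, so short tweets contribute missing trigrams. A one-line sketch of how these could be dropped before counting (again, not applied above):
# sketch: drop NA tokens produced by lines shorter than the n-gram size
allTextTriGrams <- allTextTriGrams %>% filter(!is.na(word))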