library(dplyr)
library(tidytext)
library(RColorBrewer)
setwd("C:/Users/JW-RA2018/Desktop/Coursera/final/en_US")
# Read a file through a binary connection so that stray control characters
# do not cut the read short; lines are decoded as UTF-8.
readBinary <- function(filePath) {
  f <- file(filePath, "rb", encoding = "UTF-8")
  res <- readLines(f, encoding = "UTF-8")
  close(f)
  res
}
news <- tibble(text=readBinary('en_US.news.txt'))
blog <- tibble(text=readBinary('en_US.blogs.txt'))
twitter <- tibble(text=readBinary('en_US.twitter.txt'))
news_token <- news %>% unnest_tokens(word, text)
blog_token <- blog %>% unnest_tokens(word, text)
twitter_token <- twitter %>% unnest_tokens(word, text)
# Read the profanity list and reshape it into a one-column data frame
# (one word per row), then trim leading spaces left over from the commas.
profanity <- data.frame('text' = t(read.csv('profanity.csv', header = FALSE)), stringsAsFactors = FALSE)
row.names(profanity) <- 1:nrow(profanity)
profanity$text <- gsub('^ ', '', profanity$text)
news_clean <- news_token %>% anti_join(profanity, by=c('word'='text')) %>% anti_join(get_stopwords(), by='word')
blog_clean <- blog_token %>% anti_join(profanity, by=c('word'='text')) %>% anti_join(get_stopwords(), by='word')
twitter_clean <- twitter_token %>% anti_join(profanity, by=c('word'='text')) %>% anti_join(get_stopwords(), by='word')
The Twitter file has the most lines (2,360,148), followed by the news file (1,010,242) and the blog file (899,288).
data.frame('news' = c(nrow(news), nrow(news_clean)), 'blogs' = c(nrow(blog), nrow(blog_clean)),
           'tweets' = c(nrow(twitter), nrow(twitter_clean)), row.names = c('Lines #', 'Words #'))
## news blogs tweets
## Lines # 1010242 899288 2360148
## Words # 20493173 19584892 17121096
We can also look at the length distribution of the tweets, measured in characters. Not surprisingly, none is longer than 140 characters.
hist(nchar(twitter$text), col='pink', xlab='# characters', main='Length Distribution of Tweets')
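As a quick sanity check (a minimal one-liner that just verifies the cap stated above):
# Length of the longest tweet; it should not exceed 140 characters
max(nchar(twitter$text))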
With respect to word distribution, the most frequent word in the news is ‘said’, whereas on blogs and Twitter the most frequent words are ‘one’ and ‘just’, respectively.
news_table <- sort(table(news_clean), decreasing=TRUE)
blog_table <- sort(table(blog_clean), decreasing=TRUE)
twitter_table <- sort(table(twitter_clean), decreasing=TRUE)
head(news_table, 8)
## news_clean
## said one year new two can also first
## 250418 88794 76765 70773 63867 58924 58786 57866
head(blog_table, 8)
## blog_clean
## one just like can time get know now
## 127287 100793 100442 98420 90918 71093 60496 60358
head(twitter_table, 8)
## twitter_clean
## just like get love good day can thanks
## 151115 122455 112459 106721 101026 91710 89847 89660
The top 8 most frequent words for each file are summarized in the three figures below. For news articles, the most frequent word ‘said’ appears far more often than the other seven words, whereas the tweet frequencies decrease much more smoothly.
barplot(news_table[1:8], col=rev(brewer.pal(8, 'Reds')), ylab='frequency', main='Top 8 Most Frequent Words in News')
barplot(blog_table[1:8], col=rev(brewer.pal(8, 'Blues')), ylab='frequency', main='Top 8 Most Frequent Words on Blogs')
barplot(twitter_table[1:8], col=rev(brewer.pal(8, 'Purples')), ylab='frequency', main='Top 8 Most Frequent Words on Twitter')
un <- length(news_table)
ub <- length(blog_table)
ut <- length(twitter_table)
unidf <- data.frame('news' = un, 'blogs' = ub, 'tweets' = ut, row.names = '# unique words')
unidf
## news blogs tweets
## # unique words 283854 319101 369398
There are 283,854 unique words in the news file, 319,101 in the blogs file and 369,398 in the tweets.
Of all the unique words appearing in the news, the 1,182 most frequent ones are enough to cover just over 50% of all word instances.
# Append cumulative counts and cumulative proportions to a frequency table,
# then show the first rows where the cumulative proportion reaches 50%.
cum <- function(table){
  df <- data.frame(table)
  df$Cum <- cumsum(df$Freq)
  df$Port <- df$Cum / sum(table)
  head(df[which(df$Port >= 0.5), ])
}
cum(news_table)
## news_clean Freq Cum Port
## 1182 majority 3196 10247597 0.5000493
## 1183 card 3191 10250788 0.5002050
## 1184 al 3189 10253977 0.5003606
## 1185 items 3189 10257166 0.5005162
## 1186 response 3185 10260351 0.5006717
## 1187 terms 3180 10263531 0.5008268
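More generally, we can ask how many of the most frequent unique words are needed to cover any given fraction of the word instances. A minimal sketch (coverage_count is an illustrative helper name, not used elsewhere in this report):
# Number of top-frequency words needed to cover a proportion p of all word instances
coverage_count <- function(tbl, p) {
  which(cumsum(as.numeric(tbl)) / sum(tbl) >= p)[1]
}
coverage_count(news_table, 0.5)  # should reproduce the 1182 figure above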
The table below summarizes, for each file, the number of unique words needed to cover over 50% of the word instances:
n <- as.numeric(row.names(cum(news_table)))[1]
b <- as.numeric(row.names(cum(blog_table)))[1]
t <- as.numeric(row.names(cum(twitter_table)))[1]
rbind(unidf, '# unique words to cover 50%+' = c(n, b, t))
## news blogs tweets
## # unique words 283854 319101 369398
## # unique words to cover 50%+ 1182 973 543
Because the full data sets are large and slow to process, we randomly sample 10,000 lines from each file to get a rough idea of the n-gram distributions.
set.seed(7735)
news_small <- news[sample(1:nrow(news), 10000), ]
blog_small <- blog[sample(1:nrow(blog), 10000), ]
twitter_small <- twitter[sample(1:nrow(twitter), 10000), ]
When building the n-grams, we again filter out profanity and stop words.
# Tokenize the text into k-grams and filter out profanity and stop words
grams <- function(data, k){
  data %>%
    unnest_tokens(word, text, token = 'ngrams', n = k) %>%
    anti_join(profanity, by = c('word' = 'text')) %>%
    anti_join(get_stopwords(), by = 'word')
}
n2 <- grams(news_small, 2)
n3 <- grams(news_small, 3)
t2 <- grams(twitter_small, 2)
t3 <- grams(twitter_small, 3)
b2 <- grams(blog_small, 2)
b3 <- grams(blog_small, 3)
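Note that anti-joining on the whole n-gram only removes n-grams that exactly match an entry in the profanity or stop-word lists. If we instead wanted to drop any n-gram that merely contains such a word, one option is to split each n-gram into its component words first. A minimal sketch, assuming the tidyr package is installed (grams_strict and bad_words are illustrative names, not used elsewhere in this report):
library(tidyr)
# Words to exclude: the profanity list plus the stop-word list
bad_words <- c(profanity$text, get_stopwords()$word)
# Stricter variant of grams(): drop a k-gram if any of its words is in bad_words
grams_strict <- function(data, k){
  data %>%
    unnest_tokens(word, text, token = 'ngrams', n = k) %>%
    filter(!is.na(word)) %>%                                  # drop lines too short to form a k-gram
    separate(word, paste0('w', 1:k), sep = ' ', remove = FALSE) %>%
    filter(if_all(all_of(paste0('w', 1:k)), ~ !.x %in% bad_words)) %>%
    select(word)
}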
Below are three figures showing the top 8 most frequent 2-grams and 3-grams in each file:
tb <- function(df){sort(table(df), decreasing=TRUE)[1:8]}
tbn2 <- tb(n2)
tbn3 <- tb(n3)
tbb2 <- tb(b2)
tbb3 <- tb(b3)
tbt2 <- tb(t2)
tbt3 <- tb(t3)
# Draw the 2-gram and 3-gram frequency bar charts in a 2 x 1 panel layout
plt <- function(table1, table2, color, type){
  par(mfrow = c(2, 1), mar = c(5, 4, 4, 1))
  barplot(table1, col = rev(brewer.pal(8, color)), ylab = 'Frequency', main = paste('2-gram', type))
  barplot(table2, col = rev(brewer.pal(8, color)), ylab = 'Frequency', main = paste('3-gram', type))
}
plt(tbn2, tbn3, 'Reds', 'News')
plt(tbb2, tbb3, 'Blues', 'Blogs')
plt(tbt2, tbt3, 'Purples', 'Tweets')