Capstone Project: Exploring Data

Zhu Yiwen

April 9th, 2019

1. Data Preparation

library(dplyr)
library(tidytext)
library(RColorBrewer)

1.1 Read the lines of each file

setwd("C:/Users/JW-RA2018/Desktop/Coursera/final/en_US")

# Open in binary mode so embedded control characters (such as the stray SUB
# character in en_US.news.txt) do not truncate readLines() on Windows
readBinary <- function(filePath) {
  f <- file(filePath, "rb", encoding = "UTF-8")
  res <- readLines(f, encoding = "UTF-8")
  close(f)
  res
}

news <- tibble(text=readBinary('en_US.news.txt'))
blog <- tibble(text=readBinary('en_US.blogs.txt'))
twitter <- tibble(text=readBinary('en_US.twitter.txt'))

1.2 Tokenize the lines by words

news_token <- news %>% unnest_tokens(word, text)
blog_token <- blog %>% unnest_tokens(word, text)
twitter_token <- twitter %>% unnest_tokens(word, text)

1.3 Remove profanity words and stop words

# The profanity list is stored as a single CSV row, so transpose it into a
# one-column data frame and strip the leading space left by read.csv()
profanity <- data.frame('text' = t(read.csv('profanity.csv', header=FALSE)), stringsAsFactors = FALSE)
row.names(profanity) <- 1:nrow(profanity)
profanity$text <- gsub('^ ', '', profanity$text)

news_clean <- news_token %>% anti_join(profanity, by=c('word'='text')) %>% anti_join(get_stopwords(), by='word')
blog_clean <- blog_token %>% anti_join(profanity, by=c('word'='text')) %>% anti_join(get_stopwords(), by='word')
twitter_clean <- twitter_token %>% anti_join(profanity, by=c('word'='text')) %>% anti_join(get_stopwords(), by='word')
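
As a quick sanity check, the anti-joins should leave no stop words behind:

sum(news_clean$word %in% get_stopwords()$word)  # expected to be 0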

2. Summary statistics of each file

2.1 Basic Counts

The Twitter file has the most lines, 2360148, followed by the news file with 1010242 and the blogs file with 899288. The word counts below are token counts after removing profanity and stop words.

data.frame('news'=c(nrow(news), nrow(news_clean)), 'blogs'=c(nrow(blog), nrow(blog_clean)), 
           'tweets'=c(nrow(twitter), nrow(twitter_clean)), row.names=c('Lines #', 'Words #'))
##             news    blogs   tweets
## Lines #  1010242   899288  2360148
## Words # 20493173 19584892 17121096

We can also take a look at the length distribution of the tweets in terms of character count. Not surprisingly, none is longer than 140 characters.

hist(nchar(twitter$text), col='pink', xlab='# characters', main='Length Distribution of Tweets')
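
We can verify the 140-character limit directly:

max(nchar(twitter$text))  # should not exceed 140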

2.2 Word distribution

With respect to word distribution, the most frequent word appearing in the news is ‘said’, whereas on blogs and Twitter the most frequent words are ‘one’ and ‘just’, respectively.

news_table <- sort(table(news_clean), decreasing=TRUE)
blog_table <- sort(table(blog_clean), decreasing=TRUE)
twitter_table <- sort(table(twitter_clean), decreasing=TRUE)

head(news_table, 8)
## news_clean
##   said    one   year    new    two    can   also  first 
## 250418  88794  76765  70773  63867  58924  58786  57866
head(blog_table, 8)
## blog_clean
##    one   just   like    can   time    get   know    now 
## 127287 100793 100442  98420  90918  71093  60496  60358
head(twitter_table, 8)
## twitter_clean
##   just   like    get   love   good    day    can thanks 
## 151115 122455 112459 106721 101026  91710  89847  89660

The top 8 most frequent words for each file are summarized in the three figures below. For news articles, the most frequent word ‘said’ appears far more often than the remaining seven, whereas the tweet frequencies decrease much more smoothly.

barplot(news_table[1:8], col=rev(brewer.pal(8, 'Reds')), ylab='frequency', main='Top 8 Most Frequent Words in News')

barplot(blog_table[1:8], col=rev(brewer.pal(8, 'Blues')), ylab='frequency', main='Top 8 Most Frequent Words on Blogs')

barplot(twitter_table[1:8], col=rev(brewer.pal(8, 'Purples')), ylab='frequency', main='Top 8 Most Frequent Words on Twitter')

2.3 Unique words

un <- length(news_table)
ub <- length(blog_table)
ut <- length(twitter_table)
unidf <- data.frame('news' = un, 'blogs' = ub, 'tweets' = ut, row.names = '# unique words')
unidf
##                  news  blogs tweets
## # unique words 283854 319101 369398

There are 283854 unique words for news, 319101 unique words for blogs and 369398 for tweets.

Of all the unique words appearing in the news, the 1182 most frequent are needed to cover just over 50% of all word occurrences.

# Given a sorted frequency table, compute the cumulative share of all word
# occurrences and show the first rows at which coverage reaches 50%
cum <- function(table){
  df <- data.frame(table)
  df$Cum <- cumsum(df$Freq)
  df$Port <- df$Cum/sum(table)
  head(df[which(df$Port>=0.5), ])
}

cum(news_table)
##      news_clean Freq      Cum      Port
## 1182   majority 3196 10247597 0.5000493
## 1183       card 3191 10250788 0.5002050
## 1184         al 3189 10253977 0.5003606
## 1185      items 3189 10257166 0.5005162
## 1186   response 3185 10260351 0.5006717
## 1187      terms 3180 10263531 0.5008268

The table below summarizes the number of unique words needed for each file:

n <- as.numeric(row.names(cum(news_table)))[1]
b <- as.numeric(row.names(cum(blog_table)))[1]
tw <- as.numeric(row.names(cum(twitter_table)))[1]  # 'tw' avoids masking base::t()
rbind(unidf, '# unique words to cover 50%+' = c(n, b, tw))
##                                news  blogs tweets
## # unique words               283854 319101 369398
## # unique words to cover 50%+   1182    973    543
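
As a quick additional sketch, we can also draw the full coverage curve from the sorted frequency table (here for news), with the 50% threshold marked:

cov <- cumsum(as.numeric(news_table))/sum(news_table)  # cumulative share of word occurrences
plot(cov, type='l', log='x', xlab='word rank (log scale)', ylab='cumulative coverage', main='Word Coverage Curve: News')
abline(h=0.5, lty=2)  # 50% coverage line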

3. More in-depth exploration: n-grams

Because the huge data volume slows things down, we randomly sample 10000 lines from each file (30000 observations in total) to get a rough idea of the n-gram distributions.

set.seed(7735)
news_small <- news[sample(1:nrow(news), 10000), ]
blog_small <- blog[sample(1:nrow(blog), 10000), ]
twitter_small <- twitter[sample(1:nrow(twitter), 10000), ]

We should also remove n-grams that contain profanity or stop words.

grams <- function(data, k){
  # an exact-match anti_join can only drop an n-gram that equals a list
  # entry, so instead drop any n-gram containing a profanity or stop word
  bad <- c(profanity$text, get_stopwords()$word)
  data %>% unnest_tokens(word, text, token='ngrams', n=k) %>%
    filter(!is.na(word)) %>%
    filter(!sapply(strsplit(word, ' '), function(w) any(w %in% bad)))
}

n2 <- grams(news_small, 2)
n3 <- grams(news_small, 3)
t2 <- grams(twitter_small, 2)
t3 <- grams(twitter_small, 3)
b2 <- grams(blog_small, 2)
b3 <- grams(blog_small, 3)
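
As an aside, the same phrase counts could also be obtained with dplyr's count() instead of the base-R tables used below, e.g. for the news bigrams:

n2 %>% count(word, sort=TRUE) %>% head(8)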

Below are three figures showing the top 8 most frequent phrases appearing in each file:

tb <- function(df){sort(table(df), decreasing=TRUE)[1:8]}  # top 8 phrases by frequency

tbn2 <- tb(n2)
tbn3 <- tb(n3)
tbb2 <- tb(b2)
tbb3 <- tb(b3)
tbt2 <- tb(t2)
tbt3 <- tb(t3)
plt <- function(table1, table2, color, type){
  par(mfrow=c(2,1), mar=c(5,4,4,1))
  barplot(table1, col=rev(brewer.pal(8, color)), ylab='Frequency', main=paste('2-gram', type))
  barplot(table2, col=rev(brewer.pal(8, color)), ylab='Frequency', main=paste('3-gram', type))
}

plt(tbn2, tbn3, 'Reds', 'News')

plt(tbb2, tbb3, 'Blues', 'Blogs')

plt(tbt2, tbt3, 'Purples', 'Tweets')