Note: Code chunks are hidden by default. Please click the buttons to show each code chunk.
There are three US English datasets: twitter, blogs, and news. Each file weighs roughly 160 to 200 MB. The number of lines in each dataset is reported in the summary table below, with twitter containing over 2 million lines. Because of twitter's 140-character limit, no twitter line exceeds 140 characters, whereas lines reach up to 5760 characters in the news dataset and 40835 characters in the blogs dataset.
# importing data
folder <- "..\\Coursera-SwiftKey\\final\\en_US"
file_twitter <- "en_US.twitter.txt"
twitter <- readLines(file.path(folder, file_twitter))
file_blogs <- "en_US.blogs.txt"
blogs <- readLines(file.path(folder, file_blogs))
file_news <- "en_US.news.txt"
news <- readLines(file.path(folder, file_news))
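As a quick check of the line counts and maximum line lengths quoted above, a minimal sketch like the one below could reproduce them, assuming twitter, blogs and news were read in as in the previous chunk; the line_stats helper is introduced here only for illustration and is not part of the report's own code.
# sketch: line count and maximum character length per line for each dataset
line_stats <- function(x) {
n <- nchar(x, allowNA = TRUE)  # NA for lines with invalid encoding
c(lines = length(x), max_chars = max(n, na.rm = TRUE))
}
rbind(twitter = line_stats(twitter),
blogs = line_stats(blogs),
news = line_stats(news))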
sizes <- c(utils:::format.object_size(file.size(file.path(folder, file_twitter)), "auto"),
utils:::format.object_size(file.size(file.path(folder, file_blogs)), "auto"),
utils:::format.object_size(file.size(file.path(folder, file_news)), "auto")
)
dataset_summary <- data.frame(dataset = c("twitter", "blogs", "news"),
files = c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt"),
size = sizes,
length = c(length(twitter), length(blogs), length(news))
)
DT::datatable(dataset_summary, rownames = FALSE)
Due to the large size of the files, the data need to be sampled to avoid overloading the computer. A random sample of 10000 lines is drawn from each dataset.
set.seed(50)
twitter.sampling <- twitter[sample(seq_along(twitter), size = 10000)]
set.seed(50)
blogs.sampling <- blogs[sample(seq_along(blogs), size = 10000)]
set.seed(50)
news.sampling <- news[sample(seq_along(news), size = 10000)]
Each sample requires cleaning: removing punctuation and converting the text to lower case. The preprocess function in the ngram package handles this clean-up.
## turning into readable ASCII
twitter_sampling_cl1 <- iconv(twitter.sampling, "latin1", "ASCII", sub = "")
blogs_sampling_cl1 <- iconv(blogs.sampling, "latin1", "ASCII", sub = "")
news_sampling_cl1 <- iconv(news.sampling, "latin1", "ASCII", sub = "")
## cleaning the data
twitter.sampling_cl2 <- ngram::preprocess(ngram::concatenate(twitter_sampling_cl1),
remove.punct = TRUE)
blogs.sampling_cl2 <- ngram::preprocess(ngram::concatenate(blogs_sampling_cl1),
remove.punct = TRUE)
news.sampling_cl2 <- ngram::preprocess(ngram::concatenate(news_sampling_cl1),
remove.punct = TRUE)
After cleaning, the strings were tokenized before further analysis.
# tokenize
twitter.sampling_cl3 <- tau::tokenize(twitter.sampling_cl2)
blogs.sampling_cl3 <- tau::tokenize(blogs.sampling_cl2)
news.sampling_cl3 <- tau::tokenize(news.sampling_cl2)
# remove empty and whitespace-only tokens
twitter.sampling_cl3 <- twitter.sampling_cl3[!(twitter.sampling_cl3 == "" | twitter.sampling_cl3 == " ")]
blogs.sampling_cl3 <- blogs.sampling_cl3[!(blogs.sampling_cl3 == "" | blogs.sampling_cl3 == " ")]
news.sampling_cl3 <- news.sampling_cl3[!(news.sampling_cl3 == "" | news.sampling_cl3 == " ")]
For each dataset (twitter, blogs, and news), the distribution of words was calculated. The top words that together cover 50 % of all word occurrences in each dataset are shown in a wordcloud, and the top 10 words are also shown in a barplot.
# distribution of words
twitter_done <- twitter.sampling_cl3  # cleaned, tokenized twitter sample
dist.twitter <- as.data.frame(table(twitter_done))
dist.twitter <- dist.twitter[order(dist.twitter$Freq, decreasing = TRUE), ]
## words that cover 50 % of frequency
total.twitter <- sum(dist.twitter$Freq)
oc_cumsum.twitter <- cumsum(dist.twitter$Freq)
index.twitter <- which(oc_cumsum.twitter > total.twitter * 0.5)[1]
wordcloud::wordcloud(as.character(dist.twitter$twitter_done[1:index.twitter]),
dist.twitter$Freq[1:index.twitter])
Top words that cover 50 % of the appearance frequency:
## top 10
dist.twitter$percent <- dist.twitter$Freq/total.twitter * 100
barplot(dist.twitter$percent[1:10], names.arg = dist.twitter$twitter_done[1:10], las = 2,
ylab = "%", cex.lab = 1.5, main = "Top 10 most used words in twitter")
# distribution of words
blogs_done <- blogs.sampling_cl3  # cleaned, tokenized blogs sample
dist.blogs <- as.data.frame(table(blogs_done))
dist.blogs <- dist.blogs[order(dist.blogs$Freq, decreasing = TRUE), ]
## words that cover 50 % of frequency
total.blogs <- sum(dist.blogs$Freq)
oc_cumsum.blogs <- cumsum(dist.blogs$Freq)
index.blogs <- which(oc_cumsum.blogs > total.blogs * 0.5)[1]
wordcloud::wordcloud(as.character(dist.blogs$blogs_done[1:index.blogs]), dist.blogs$Freq[1:index.blogs])
Top words that cover 50 % of the appearance frequency:
## top 10
dist.blogs$percent <- dist.blogs$Freq/total.blogs * 100
barplot(dist.blogs$percent[1:10], names.arg = dist.blogs$blogs_done[1:10], las = 2,
ylab = "%", cex.lab = 1.5, main = "Top 10 most used words in blogs")
# distribution of words
news_done <- news.sampling_cl3  # cleaned, tokenized news sample
dist.news <- as.data.frame(table(news_done))
dist.news <- dist.news[order(dist.news$Freq, decreasing = TRUE), ]
## words that cover 50 % of frequency
total.news <- sum(dist.news$Freq)
oc_cumsum.news <- cumsum(dist.news$Freq)
index.news <- which(oc_cumsum.news > total.news * 0.5)[1]
wordcloud::wordcloud(as.character(dist.news$news_done[1:index.news]), dist.news$Freq[1:index.news])
Top words that cover 50 % of the appearance frequency:
## top 10
dist.news$percent <- dist.news$Freq/total.news * 100
barplot(dist.news$percent[1:10], names.arg = dist.news$news_done[1:10], las = 2,
ylab = "%", cex.lab = 1.5, main = "Top 10 most used words in news")
One interesting result is that when the coverage threshold is lowered from 50 % to 10 %, the same five words are extracted from all three datasets: the, to, i, a, you.
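For reference, a minimal sketch of how that 10 % check could be reproduced from the frequency tables computed above; the coverage_words helper is hypothetical and not part of the original analysis.
# hypothetical helper: most frequent words whose cumulative frequency
# just exceeds the given coverage threshold (tables are already sorted)
coverage_words <- function(dist, word_col, threshold = 0.1) {
total <- sum(dist$Freq)
idx <- which(cumsum(dist$Freq) > total * threshold)[1]
as.character(dist[[word_col]][1:idx])
}
coverage_words(dist.twitter, "twitter_done", 0.1)
coverage_words(dist.blogs, "blogs_done", 0.1)
coverage_words(dist.news, "news_done", 0.1)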