This report summarizes exploratory analysis of the en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt files from the provided en_US folder. The properties of the individual files were analyzed, e.g. file size, line lengths, and number of lines. A training data set was then created by combining all three files and contains 30% of randomly selected lines from each. This data set was cleaned: non-word characters, city/event names, and extra spaces were removed, and the words were reduced to their stems. Finally, the word frequency distribution and the ten most frequent 1-grams, 2-grams, and 3-grams were analyzed.
Comparing the three files, the blog and news files have the largest file sizes, followed by Twitter. Similarly, the longest line is found in the blog file, followed by the news and Twitter files. The Twitter file has the most lines, followed by the blog and news files. The word frequency distribution shows that a limited number of words occur very frequently. The word cloud and the bar plot show that the most frequent word is “one”, followed by “will”, “like”, “time”, “just”, “can”, “get”, “make”, “year”, and “day”. The most frequent 2-gram and 3-gram were “year old” and “two year ago”.
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(ggplot2)
library(stringr)
library(tidyverse)
library(R.utils)
library(RWeka)
library(ngram)
library(tau)
library(clValid)
library(quanteda.textstats)
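All of the packages above are available on CRAN. As a convenience, any package missing from the local library can be installed before loading; the snippet below is an illustrative helper and not part of the original analysis.
# Install any of the required packages that are not yet available locally (illustrative)
pkgs <- c("tm", "SnowballC", "wordcloud", "RColorBrewer", "ggplot2", "stringr", "tidyverse",
          "R.utils", "RWeka", "ngram", "tau", "clValid", "quanteda.textstats")
missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing) > 0) install.packages(missing)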
# File size
FsizeB <- file.info("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.blogs.txt")
FsizeN <- file.info("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.news.txt")
FsizeT <- file.info("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.twitter.txt")
FS_B <- FsizeB$size
FS_N <- FsizeN$size
FS_T <- FsizeT$size
# Count lines
LN_B <- NROW(readLines("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.blogs.txt"))
LN_N <- NROW(readLines("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.news.txt"))
LN_T <- NROW(readLines("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.twitter.txt"))
#Find longest line
con1 = file("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.blogs.txt", "r")
lenUSblog <- nchar(readLines(con1))
MLn_B <- max(lenUSblog)
close(con1)
con2 = file("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.news.txt", "r")
lenUSnews <- nchar(readLines(con2))
MLn_N <- max(lenUSnews)
close(con2)
con3 = file("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.twitter.txt", "r")
lenUStwitter <- nchar(readLines(con3))
MLn_T <- max(lenUStwitter)
close(con3)
BlogI <- data.frame(FS_B, MLn_B, LN_B)
colnames(BlogI) <- c('FileSize', 'MaximumLength', 'LineNumbers')
NewsI <- data.frame(FS_N, MLn_N, LN_N)
colnames(NewsI) <- c('FileSize', 'MaximumLength', 'LineNumbers')
TwitterI <- data.frame(FS_T, MLn_T, LN_T)
colnames(TwitterI) <- c('FileSize', 'MaximumLength', 'LineNumbers')
BlogI <- t(BlogI); NewsI <- t(NewsI); TwitterI <- t(TwitterI)
AllTx <- cbind(BlogI, NewsI, TwitterI)
colnames(AllTx) <- c("Blog", "News", "Twitter")
as.data.frame(AllTx)
## Blog News Twitter
## FileSize 210160014 205811889 167105338
## MaximumLength 40835 5760 213
## LineNumbers 899288 77259 2360148
Comparing the three files, the blog and news files have similar sizes, both larger than the Twitter file. Similarly, the longest line is found in the blog text file, followed by the news and Twitter text files. The Twitter text file has the most lines, followed by the blog and news files.
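Since each source file is read several times above, the same statistics could also be gathered in a single pass per file. The helper below is a sketch only (the function fileStats is hypothetical and was not used to produce the table above); it assumes the files fit in memory, as they do here.
# Hypothetical single-pass helper: file size, longest line, and line count for one file
fileStats <- function(path) {
  txt <- readLines(path, skipNul = TRUE)
  c(FileSize = file.info(path)$size,
    MaximumLength = max(nchar(txt)),
    LineNumbers = length(txt))
}
# Example: fileStats("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.blogs.txt")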
A training data set containing 30% of randomly selected lines was created from each text file. The training files were then written to disk and imported as a single corpus object.
#Creating a random vector for sampling
#Blog
n30B <- round(0.30 * LN_B)
nB <- 1:LN_B
set.seed(123)
ntrainB <- as.numeric(sample(nB, n30B))
ntestB <- as.numeric(setdiff(nB, ntrainB))
#News
n30N <- round(0.30 * LN_N)
nN <- 1:LN_N
set.seed(123)
ntrainN <- as.numeric(sample(nN, n30N))
ntestN <- as.numeric(setdiff(nN, ntrainN))
#Twitter
n30T <- round(0.30 * LN_T)
nT <- 1:LN_T
set.seed(123)
ntrainT <- as.numeric(sample(nT, n30T))
ntestT <- as.numeric(setdiff(nT, ntrainT))
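As a quick sanity check (not part of the original script), the training and test indices for each file should be disjoint and together cover every line; shown here for the blog file.
# Sanity check (illustrative): the blog train/test split is disjoint and complete
length(intersect(ntrainB, ntestB)) == 0
length(ntrainB) + length(ntestB) == LN_B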
#Create the training sets
#Blog
con1 = file("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.blogs.txt", "r")
USBlog <- readLines(con1)
close(con1)
USBlog_trainB <- USBlog[ntrainB]
USBlog_testB <- USBlog[ntestB]
#Write data sets
write.table(USBlog_trainB, file = "C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Train/us_train.txt", sep = "\n",row.names = TRUE, col.names = FALSE)
write.table(USBlog_testB, file = "C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Test/us_test.txt", sep = "\n",row.names = TRUE, col.names = FALSE)
#News
con2 = file("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.news.txt", "r")
USNews <- readLines(con2)
close(con2)
USBlog_trainN <- USNews[ntrainN]
USBlog_testN <- USNews[ntestN]
write.table(USBlog_trainN, file = "C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Train/us_train_news.txt", sep = "\n",row.names = TRUE, col.names = FALSE)
write.table(USBlog_testN, file = "C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Test/us_test_news.txt", sep = "\n",row.names = TRUE, col.names = FALSE)
#Twitter
con3 = file("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/en_US.twitter.txt", "r")
USTwitter <- readLines(con3)
close(con3)
USTwitter_train <- USTwitter[ntrainT]
USTwitter_test <- USTwitter[ntestT]
write.table(USTwitter_train, file = "C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Train/us_train_twitter.txt", sep = "\n",row.names = TRUE, col.names = FALSE)
write.table(USTwitter_test, file = "C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Test/us_test_twitter.txt", sep = "\n",row.names = TRUE, col.names = FALSE)
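Note that write.table with its defaults also stores quoting and, with row.names = TRUE, row numbers in the text files; these are later stripped by the number and punctuation removal steps. A simpler alternative, shown as a sketch only and not used for the results in this report, is writeLines, which writes one element per line with no quoting.
# Alternative (not run for this report): write plain text lines without quotes or row names
# writeLines(USBlog_trainB, "C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Train/us_train.txt")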
#Create one corpus object with all three files
US_train <- VCorpus(DirSource("C:/Users/Kristin.Butler/Desktop/Coursera/Capstone/final/en_US/US_Train", encoding = "UTF-8"), readerControl = list(language = "en"))
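A quick check (illustrative, not in the original script) confirms that the corpus holds one document per training file.
# Illustrative check: the corpus should contain three documents (blogs, news, twitter)
print(US_train)
length(US_train) == 3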
The text files were then cleaned: selected characters and names were replaced with spaces; numbers, punctuation, stop words, and extra white space were removed; and the words were reduced to their stem words.
#Replacing selected characters ("/", "@", "|", ":)", "!", "-") and selected names with spaces
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
EN_blog <- tm_map(US_train, toSpace, "/")
EN_blog <- tm_map(EN_blog, toSpace, "@")
EN_blog <- tm_map(EN_blog, toSpace, "\\|")
EN_blog <- tm_map(EN_blog, toSpace, ":)")
EN_blog <- tm_map(EN_blog, toSpace, "!")
EN_blog <- tm_map(EN_blog, toSpace, "-")
EN_blog <- tm_map(EN_blog, toSpace, "Amazon")
EN_blog <- tm_map(EN_blog, toSpace, "New York")
EN_blog <- tm_map(EN_blog, toSpace, "NY")
EN_blog <- tm_map(EN_blog, toSpace, "C")
EN_blog <- tm_map(EN_blog, toSpace, "EU")
EN_blog <- tm_map(EN_blog, toSpace, "World War")
# Transforming everything to lowercase
EN_blog <- tm_map(EN_blog, content_transformer(tolower))
# Removing numbers
EN_blog <- tm_map(EN_blog, removeNumbers)
# Removing punctuation
EN_blog <- tm_map(EN_blog, removePunctuation)
# Removing stop words
EN_blog <- tm_map(EN_blog, removeWords, stopwords("english"))
# Stripping any extra white space:
EN_blog <- tm_map(EN_blog, stripWhitespace)
#EN_blog[[1]]$content
#Collapse words to a common root (stemming)
EN_blog <- tm_map(EN_blog, stemDocument)
# Stripping any extra white space:
EN_blog <- tm_map(EN_blog, stripWhitespace)
#EN_blog[[1]]$content
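To verify the effect of the cleaning steps, the first few lines of the first cleaned document can be inspected (a sketch; the commented-out EN_blog[[1]]$content above serves the same purpose).
# Inspect a few cleaned lines of the first document (illustrative)
writeLines(head(content(EN_blog[[1]]), 3))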
A document-term matrix was created, from which the frequency of each word was determined. The matrix was then sorted by frequency, and subsets with the top 10, 30, 50, and 75 words were created.
#Create a document term matrix
DTM <- DocumentTermMatrix(EN_blog)
FREQ <- colSums(as.matrix(DTM))
ATx <- as.data.frame(FREQ)
ATx$Words <- rownames(ATx)
colnames(ATx) <- c("Freq", "Word")
#Arrange and select top 10, 30, 50, 75 frequent words.
top10 <- head(arrange(ATx, desc(Freq)),n=10)
top30 <- head(arrange(ATx, desc(Freq)),n=30)
top50<- head(arrange(ATx, desc(Freq)),n=50)
top75<- head(arrange(ATx, desc(Freq)),n=75)
The word frequency distribution shows that a limited number of words have a high frequency (Figure 1).
#Word frequency distribution
plot(sort(FREQ, decreasing = T),col="turquoise2",main="Word Frequencies", xlab="Word Rank", ylab = "Word Frequency", sub ="Figure 1. Word frequency distribution")
The word cloud and the bar plot show that the most frequent word is “one”, followed by “will”, “like”, “time”, “just”, “can”, “get”, “make”, “year”, and “day” (Figure 2).
#Word cloud
wordcloud(words = top50$Word, freq = top50$Freq, min.freq = 1000, max.words=100, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Paired"))
#Bar plot of 10 most frequent words
ggplot(top10, aes(reorder(Word, Freq), Freq)) +
  geom_bar(stat = "identity", fill = "violet") + coord_flip() +
  xlab("Terms") + ylab("Frequency") +
  ggtitle("Frequencies of top 10 words") +
  theme_classic() +
  theme(legend.position = "none", plot.title = element_text(face = "bold", hjust = 0.5)) +
  labs(caption = "Figure 2. Top 10 frequent words of text")
The most frequent 2-gram and 3-gram were “year old” and “two year ago”, as seen in Figures 3 and 4.
# create a vector of all three texts
str <- concatenate(lapply(EN_blog,"[", 1))
USBlog_trainG <- preprocess(str, remove.punct=TRUE, remove.numbers = TRUE, fix.spacing = TRUE)
# Find the most frequent two and three words
ng2 <- ngram(USBlog_trainG, n=2)
ng3 <- ngram(USBlog_trainG, n=3)
# create a matrix to make a bar-plot
W2 <- head(get.phrasetable(ng2), n=10)
W3 <- head(get.phrasetable(ng3), n=10)
W2$ngrams <- as.factor(W2$ngrams)
W2$freq <- as.numeric(W2$freq)
W3$ngrams <- as.factor(W3$ngrams)
W3$freq <- as.numeric(W3$freq)
#Plot the results
FW2 <- ggplot(data = W2, aes(x = reorder(ngrams, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "turquoise2") +
  ylab("Frequency") +
  xlab("2-gram words") +
  ggtitle("Frequency of 2-grams") +
  theme_classic() +
  theme(legend.position = "none", plot.title = element_text(face = "bold", hjust = 0.5)) +
  coord_flip() +
  labs(caption = "Figure 3. Top 10 frequent 2-grams")
FW2
FW3 <- ggplot(data = W3, aes(x = reorder(ngrams, freq), y = freq)) +
  geom_bar(stat = "identity", fill = "orchid2") +
  ylab("Frequency") +
  xlab("3-gram words") +
  ggtitle("Frequency of 3-grams") +
  theme_classic() +
  theme(legend.position = "none", plot.title = element_text(face = "bold", hjust = 0.5)) +
  coord_flip() +
  labs(caption = "Figure 4. Top 10 frequent 3-grams")
FW3
# Write function to calculate coverage
CalCoverage <- function(DTM, coverage){
  sums <- as.data.frame(colSums(as.matrix(DTM)))
  sums <- rownames_to_column(sums)
  colnames(sums) <- c("term", "count")
  sums <- arrange(sums, desc(count))
  NeededFreq <- coverage * sum(sums$count)
  for(i in 1:nrow(sums)){
    if(sum(sums$count[1:i]) >= NeededFreq){
      return(i)
    }
  }
}
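The loop above walks the frequency-sorted counts and returns the first rank whose running total reaches the requested share of all word occurrences. An equivalent loop-free version, shown as a sketch that returns the same value, can be written with cumsum.
# Vectorised equivalent (illustrative): first rank whose cumulative count reaches the target
CalCoverage2 <- function(DTM, coverage){
  counts <- sort(colSums(as.matrix(DTM)), decreasing = TRUE)
  unname(which(cumsum(counts) >= coverage * sum(counts))[1])
}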
#Calculate coverage of 50 and 90%
D50 <- CalCoverage(DTM, 0.5)
D90 <- CalCoverage(DTM, 0.9)
D50
## [1] 581
D90
## [1] 7369
In a frequency-sorted dictionary, 581 unique words are needed to cover 50% of all word instances, and 7369 unique words are needed to cover 90%.
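Using the CalCoverage function above, the same calculation can be repeated for a range of coverage levels in one call; the snippet below is illustrative only.
# Number of top-ranked words needed for several coverage levels (illustrative)
coverage_levels <- c(0.25, 0.50, 0.75, 0.90)
sapply(coverage_levels, function(p) CalCoverage(DTM, p))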