This report is a short exploratory analysis of the text data to be used for the Capstone project of the Data Science Specialization, along with a description of plans for the word prediction algorithm. The goals of this report are to: demonstrate that the data has been downloaded and successfully loaded; create a basic report of summary statistics about the data sets; and report any interesting findings gathered so far.
R packages
library(stringi)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
Read the downloaded files
blogsdata <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
newsdata <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitterdata <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
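If the reported line count for en_US.news.txt seems too low, the file may contain embedded control characters that make readLines stop early on some platforms; a commonly used workaround (offered here as a suggestion, not something observed in this run) is to read it through a binary-mode connection:
con <- file("en_US.news.txt", open = "rb")
newsdata <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)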
File sizes (in bytes)
dimblogs <- file.info("en_US.blogs.txt")$size
dimnews <- file.info("en_US.news.txt")$size
dimtwitter <- file.info("en_US.twitter.txt")$size
Number of lines
length_blogs <- length(blogsdata)
length_news <- length(newsdata)
length_twitter <- length(twitterdata)
Number of characters
numberchar_blogs <- sum(nchar(blogsdata))
numberchar_news <- sum(nchar(newsdata))
numberchar_twitter <- sum(nchar(twitterdata))
Number of words
numberword_blogs <- sum(stri_count_words(blogsdata))
numberword_news <- sum(stri_count_words(newsdata))
numberword_twitter <- sum(stri_count_words(twitterdata))
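As an optional cross-check of the line and character counts above, stringi offers a one-call summary per file (a supplementary sketch, not part of the original tally):
stri_stats_general(blogsdata)  # returns Lines, LinesNEmpty, Chars, CharsNWhite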
Summary statistics about the data sets
data.frame(data_type = c("blog", "news", "twitter"),
           number.lines = c(length_blogs, length_news, length_twitter),
           number.characters = c(numberchar_blogs, numberchar_news, numberchar_twitter),
           number.words = c(numberword_blogs, numberword_news, numberword_twitter),
           size_bytes = c(dimblogs, dimnews, dimtwitter))
##   data_type number.lines number.characters number.words size_bytes
## 1      blog       899288         206824505     37546239  210160014
## 2      news      1010242         203223159     34762395  205811889
## 3   twitter      2360148         162096031     30093372  167105338
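The raw byte counts are hard to read at a glance; converting them to megabytes (a presentation step added here, derived from the sizes above) makes the scale clearer:
round(c(blogs = dimblogs, news = dimnews, twitter = dimtwitter) / 1024^2, 1)
##   blogs    news twitter
##   200.4   196.3   159.4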
The files are very large, so we extract a small random sample (0.5% of each file) for the exploratory analysis.
set.seed(12345)
twitter_sample <- sample(twitterdata, length(twitterdata) * 0.005, replace = FALSE)
blogs_sample <- sample(blogsdata, length(blogsdata) * 0.005, replace = FALSE)
news_sample <- sample(newsdata, length(newsdata) * 0.005, replace = FALSE)
data_sample <- c(twitter_sample, blogs_sample, news_sample)
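Since the full data sets are no longer needed for the rest of the exploration, the memory they occupy can be released (an optional housekeeping step, not in the original script):
rm(blogsdata, newsdata, twitterdata)
invisible(gc())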
Corpus cleaning
corpus <- VCorpus(VectorSource(data_sample))
corpus1 <- tm_map(corpus, removePunctuation)
corpus2 <- tm_map(corpus1, removeNumbers)
corpus3 <- tm_map(corpus2, content_transformer(tolower))  # content_transformer keeps the documents valid PlainTextDocuments
corpus4 <- tm_map(corpus3, removeWords, stopwords("english"))
corpus5 <- tm_map(corpus4, stripWhitespace)
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus5))),
                           decreasing = TRUE), 10)
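As a quick sanity check of the cleaning (an extra step added here), the first couple of cleaned documents can be inspected:
lapply(corpus5[1:2], as.character)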
Barplot of the most frequent words
barplot(frequentWords,
        main = "Sample Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count",
        las = 2,
        col = "lightblue")
Histogram of the top-word frequencies
hist(frequentWords,
     main = "Frequencies of the Most Frequent Words",
     xlab = "Count",
     col = "green")
Word cloud of the most frequent words
matrix1 <- as.matrix(TermDocumentMatrix(corpus5))
word.cloud <- sort(rowSums(matrix1), decreasing = TRUE)
cloud1 <- data.frame(word = names(word.cloud), freq = word.cloud)
wordcloud(cloud1$word, cloud1$freq,
          max.words = 100, min.freq = 100,
          random.order = FALSE, rot.per = 0.25,
          colors = brewer.pal(8, "Dark2"))
Conclusion: the ten most frequent words in the corpus are:
head(frequentWords, 10)
## will just said one like can get time new good
## 1575 1560 1525 1413 1314 1258 1190 1071 915 856
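Next steps: for the word prediction algorithm mentioned in the introduction, the plan is to build n-gram frequency tables from the sampled text and predict the next word as the most frequent continuation of the preceding words. Below is a minimal bigram-counting sketch using stringi; it is an illustrative assumption about the approach, not the final implementation:
words <- unlist(stri_extract_all_words(stri_trans_tolower(data_sample)))
# pair each word with its successor (note: for this rough sketch, pairs
# also span line boundaries, which the final version should avoid)
bigrams <- paste(head(words, -1), tail(words, -1))
head(sort(table(bigrams), decreasing = TRUE), 10)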