Overview

This report is a short analysis of the text data to be used for the Capstone project of the Data Science Specialization, along with a description of plans for the word prediction algorithm. The goals of this project are to demonstrate that the data have been downloaded and successfully loaded, to create a basic report of summary statistics about the data sets, and to report any interesting findings gathered so far.

Data analysis

R packages

library(stringi)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)

Read the downloaded files

blogsdata <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
newsdata <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitterdata <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
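
Note: on some platforms (notably Windows), readLines can stop early on en_US.news.txt because the file contains embedded control characters. If the line count for the news file looks too low, reading through a binary connection is a common workaround (a sketch, not required on every system):

con <- file("en_US.news.txt", open = "rb")  # binary mode reads past embedded control characters
newsdata <- readLines(con, warn = FALSE, encoding = "UTF-8")
close(con)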

File sizes

dimblogs <- file.info("en_US.blogs.txt")$size
dimnews <- file.info("en_US.news.txt")$size
dimtwitter <- file.info("en_US.twitter.txt")$size

Number of lines

lengthblogs <- length(blogsdata)
lengthnews <- length(newsdata)
lengthtwitter <- length(twitterdata)

Number of characters

numberchar_blogs <- sum(nchar(blogsdata))
numberchar_news <- sum(nchar(newsdata))
numberchar_twitter <- sum(nchar(twitterdata))

Number of words

numberword_blogs <- sum(stri_count_words(blogsdata)) 
numberword_news <- sum(stri_count_words(newsdata))  
numberword_twitter <- sum(stri_count_words(twitterdata))

Summary statistics about the data sets

data.frame(data_type = c("blog", "news", "twitter"),
           number.lines = c(lengthblogs, lengthnews, lengthtwitter),
           number.characters = c(numberchar_blogs, numberchar_news, numberchar_twitter),
           number.words = c(numberword_blogs, numberword_news, numberword_twitter),
           size_bytes = c(dimblogs, dimnews, dimtwitter))
##   data_type number.lines number.characters number.words size_bytes
## 1      blog       899288         206824505     37546239  210160014
## 2      news      1010242         203223159     34762395  205811889
## 3   twitter      2360148         162096031     30093372  167105338
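
The size column is in bytes, since file.info()$size reports bytes. Dividing by 1024^2 converts to megabytes, giving roughly 200 MB for blogs, 196 MB for news, and 159 MB for twitter:

round(c(blogs = dimblogs, news = dimnews, twitter = dimtwitter) / 1024^2, 1)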

The files are very large, so we extract a small sample (0.5% of the lines in each file) for the exploratory analysis.

set.seed(12345)
twitter_sample  <- sample(twitterdata, length(twitterdata) * 0.005, replace = FALSE)
blogs_sample    <- sample(blogsdata, length(blogsdata) * 0.005, replace = FALSE)
news_sample     <- sample(newsdata, length(newsdata) * 0.005, replace = FALSE)
data_sample <- c(twitter_sample, blogs_sample, news_sample)

Corpus cleaning

corpus <- VCorpus(VectorSource(data_sample))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))  # content_transformer keeps the corpus structure intact
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
frequentWords <- head(sort(rowSums(as.matrix(TermDocumentMatrix(corpus))), decreasing = TRUE), 10)
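
As a sanity check on these counts, tm's findFreqTerms() lists every term that appears at least a given number of times; the cutoff of 1000 below is arbitrary, chosen only for illustration:

tdm <- TermDocumentMatrix(corpus)
findFreqTerms(tdm, lowfreq = 1000)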

Barplot of the most frequent words

barplot(frequentWords, 
        main = "Sample Data: Most Frequent Words", 
        xlab = "Word", 
        ylab = "Count",
        col = "lightblue")

Histogram of the most frequent word counts

hist(frequentWords,
     main = "Counts of the Ten Most Frequent Words",
     xlab = "Count",
     col = "green")

Wordcloud of the most frequent words

matrix1 <- TermDocumentMatrix(corpus)
matrix1 <- as.matrix(matrix1)
word.cloud <- sort(rowSums(matrix1), decreasing = TRUE) 
cloud1 <- data.frame(word = names(word.cloud), freq = word.cloud)
wordcloud(cloud1$word, cloud1$freq, max.words = 100, min.freq = 100, random.order = FALSE, rot.per = .25, colors = brewer.pal(8, "Dark2"))

Conclusion: the most frequently used words in the corpus are:

head(frequentWords, 10)
## will just said  one like  can  get time  new good 
## 1575 1560 1525 1413 1314 1258 1190 1071  915  856
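
Next steps: one natural plan for the word prediction algorithm is an n-gram model built from frequencies in this sample. As a minimal sketch (the variable names are illustrative, and this simple version also pairs words across line boundaries), bigrams can be counted with stringi alone:

words <- unlist(stri_extract_all_words(stri_trans_tolower(data_sample)))
bigrams <- paste(head(words, -1), tail(words, -1))  # adjacent word pairs
head(sort(table(bigrams), decreasing = TRUE), 5)    # most common bigrams

The prediction idea is then: given the last word typed, look up the most frequent bigram (or trigram) that starts with it and suggest its final word.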