Capstone Project - Exploratory Data Analysis

This is the exploration of the data. In this first part I'm exploring basic statistics such as the number of lines, the number of words, and the maximum and minimum number of words per line in the three files (blogs, news and tweets).

#Exploratory Data
library(stringi)
# NOTE(review): this silences every warning for the rest of the session;
# consider handling the specific warnings instead.
options(warn=-1)

#READ FILES
blogs_path <- "Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news_path <- "Coursera-SwiftKey/final/en_US/en_US.news.txt"
twitter_path <- "Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

# Read one file as a vector of whitespace-separated tokens.
#
# path: location of the text file.
# Returns a character vector with one element per token.
scan_words <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  # quote = "" : the input is free text, so a leading ' or " must be kept as
  # an ordinary character instead of opening a quoted field.
  scan(con, what = "character", quote = "", skipNul = TRUE,
       encoding = "UTF-8")
}

EN_blogs_words <- scan_words(blogs_path)
EN_news_words <- scan_words(news_path)
EN_tweets_words <- scan_words(twitter_path)

# Each connection is opened in binary mode and closed as soon as its lines
# have been read.
# NOTE(review): a text-mode read presumably stopped early at a control
# character in the news file on some platforms (the recorded news line count
# is far below the other two files) -- confirm after re-running.
blogs_file <- file(blogs_path, "rb")
EN_blogs_lines <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE)
close(blogs_file)

news_file <- file(news_path, "rb")
EN_news_lines <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE)
close(news_file)

twitter_file <- file(twitter_path, "rb")
EN_twitter_lines <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
close(twitter_file)

# DATA STATS
# General line/character summary for each source, one column per source.
Blogs_stats <- stri_stats_general(EN_blogs_lines)
News_stats <- stri_stats_general(EN_news_lines)
Tweets_stats <- stri_stats_general(EN_twitter_lines)
Length <- cbind(Blogs_stats, News_stats, Tweets_stats)

# Words per line, where a "word" is any piece between single spaces.
words_blog <- lengths(strsplit(EN_blogs_lines, " "))
words_news <- lengths(strsplit(EN_news_lines, " "))
words_twitter <- lengths(strsplit(EN_twitter_lines, " "))

# Longest and shortest line (in words) for each source.
num_words <- data.frame(
  maxnumwords_perline = c(max(words_blog), max(words_news),
                          max(words_twitter)),
  minnumwords_perline = c(min(words_blog), min(words_news),
                          min(words_twitter)),
  row.names = c("blogs", "news", "twitter")
)

# Final table: line count, total words, then max/min words per line.
# Only the "Lines" row of the general summary is kept.
Stats <- rbind(
  Length["Lines", , drop = FALSE],
  NumWords = c(sum(words_blog), sum(words_news), sum(words_twitter)),
  t(num_words)
)
Stats
##                     Blogs_stats News_stats Tweets_stats
## Lines                    899288      77259      2360148
## NumWords               37334131    2643969     30373543
## maxnumwords_perline        6630       1031           47
## minnumwords_perline           1          1            1

Some plots

It's useful to see all of this information in plots.

suppressPackageStartupMessages(library(plotly))

# Reshape for plotting: one row per source, one column per statistic, with
# the source label moved out of the row names into `Category`.
Stats <- as.data.frame(t(Stats))
Stats$Category <- rownames(Stats)
rownames(Stats) <- NULL

# Grouped bars per source: number of lines next to number of words.
# The y axis is logarithmic so both series stay readable on one chart.
fig <- plot_ly(x = Stats$Category, y = Stats$Lines, type = "bar",
               name = "Number of lines")
fig <- fig %>% add_trace(y = Stats$NumWords, name = 'Number of words')
fig <- fig %>% layout(yaxis = list(title = 'Count (log axis)', type = "log"),
                      barmode = 'group', title = "Basic Stats")
fig

Simplifying the sample

Since the data set is really big, I'm taking a random sample of only 5000 lines per file.

# Fix the RNG seed so the same lines are drawn every time the report is run.
set.seed(1234)

# Draw up to 5000 lines per source without replacement; the size is capped at
# the number of available lines so a short input does not raise an error.
sample_size <- 5000
BlogsSimple <- sample(EN_blogs_lines,
                      size = min(sample_size, length(EN_blogs_lines)))
NewsSimple <- sample(EN_news_lines,
                     size = min(sample_size, length(EN_news_lines)))
TweetsSimple <- sample(EN_twitter_lines,
                       size = min(sample_size, length(EN_twitter_lines)))

High Frequency words

We are entering the text mining world. In text mining, a corpus is defined as “a collection of written texts, especially the entire works of a particular author or a body of writing on a particular subject”. I'm going to create two functions; this first one converts my text into a corpus and then cleans it.

library(tm); library(SnowballC); library(stringr)
## Warning: package 'stringr' was built under R version 4.0.2
#Cleaning the data
# Turn raw text lines into a cleaned, stemmed corpus.
#
# text: character vector; all elements are pasted into one single document.
# Returns the corpus after these steps, in order: every non-alphanumeric
# character replaced by a space, numbers removed, lower-cased, English stop
# words removed, punctuation removed, whitespace collapsed, words stemmed.
#
# NOTE(review): apostrophes are replaced by spaces before stop words are
# removed, so contractions presumably no longer match the stop-word list --
# confirm whether that is intended.
CorpusClean <- function(text) {
  # The pipeline is the function's value, so the corpus is returned visibly.
  paste(text, collapse = " ") %>%
    str_replace_all("[^[:alnum:]]", " ") %>%
    VectorSource() %>%
    Corpus() %>%
    tm_map(removeNumbers) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)
}

This function allows me to find the most frequently used words.

# Count how often each term occurs and rank the terms, most frequent first.
#
# text: object handed to DocumentTermMatrix() (the script passes the cleaned
#       corpora).
# Returns a data frame with a numeric column `Frequency` and a character
# column `Words`; the terms are also used as row names.
Words_freq <- function (text) {
    # Column sums of the document-term matrix give the total count per term.
    term_totals <- colSums(as.matrix(DocumentTermMatrix(text)))
    ranked <- sort(term_totals, decreasing = TRUE)
    data.frame(Frequency = ranked,
               Words = as.character(names(ranked)),
               stringsAsFactors = FALSE)
}

Trying my functions

I'm trying out my functions to create interactive barplots.

options(warn=-1)
# ggplot() is used below but the visible script never attaches ggplot2
# directly, so it is loaded explicitly here.
library(ggplot2)

# Clean each sample and tabulate its word frequencies.
BlogsClean <- CorpusClean(BlogsSimple)
BlogsFreq <- Words_freq(BlogsClean)

NewsClean <- CorpusClean(NewsSimple)
NewsFreq <- Words_freq(NewsClean)

TweetsClean <- CorpusClean(TweetsSimple)
TweetsFreq <- Words_freq(TweetsClean)

# Bar chart of the most frequent words in one frequency table.
#
# freq:        data frame with columns `Words` and `Frequency`, already
#              sorted by decreasing frequency.
# source_name: label appended to the plot title.
# n:           number of top rows to show; head() returns fewer rows, with no
#              NA padding, when the table is shorter than n.
# Returns the ggplot object; it is drawn when the call is a top-level
# expression.
plot_top_words <- function(freq, source_name, n = 20) {
  ggplot(data = head(freq, n),
         aes(x = reorder(Words, -Frequency), y = Frequency, fill = Words)) +
    geom_bar(stat = "identity") +
    labs(x = "Words", title = paste("High Frequency words in", source_name))
}

#Blogs Freq
plot_top_words(BlogsFreq, "Blogs")

#News Freq
plot_top_words(NewsFreq, "News")

#Tweets Freq
plot_top_words(TweetsFreq, "Tweets")