This is the exploration of the data. In this first part I'm exploring basic statistics such as the number of lines, the number of words, and the maximum and minimum number of words per line in the three text files.
#Exploratory Data
library(stringi)
options(warn=-1)
# READ FILES
# Whitespace-separated tokens of each file (scan() with what = "character"
# splits its input on whitespace).
EN_blogs_words <- scan(file = "Coursera-SwiftKey/final/en_US/en_US.blogs.txt", what = "character")
EN_news_words <- scan(file = "Coursera-SwiftKey/final/en_US/en_US.news.txt", what = "character")
EN_tweets_words <- scan(file = "Coursera-SwiftKey/final/en_US/en_US.twitter.txt", what = "character")

# Line-level text of each file.
# - The connections are opened in binary mode ("rb"): in text mode a read can
#   stop early at an embedded control character (e.g. Ctrl-Z / 0x1A on
#   Windows), silently truncating the file.
# - skipNul = TRUE skips embedded NUL bytes instead of ending the current line
#   at them.
# - encoding = "UTF-8" marks the resulting strings as UTF-8.
# - Every connection is closed as soon as it has been read, so no file handle
#   is left open.
blogs_file <- file("Coursera-SwiftKey/final/en_US/en_US.blogs.txt", "rb")
EN_blogs_lines <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE)
close(blogs_file)

news_file <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt", "rb")
EN_news_lines <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE)
close(news_file)

twitter_file <- file("Coursera-SwiftKey/final/en_US/en_US.twitter.txt", "rb")
EN_twitter_lines <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
close(twitter_file)
# DATA STATS
# General per-file statistics computed by stringi on the raw lines
# (one named count vector per source; its first entry is "Lines").
Blogs_stats <- stri_stats_general(str = EN_blogs_lines)
News_stats <- stri_stats_general(str = EN_news_lines)
Tweets_stats <- stri_stats_general(str = EN_twitter_lines)

# One column per source, one row per statistic.
Length <- cbind(
  Blogs_stats = Blogs_stats,
  News_stats = News_stats,
  Tweets_stats = Tweets_stats
)
# words # number of words per line
# A "word" here is a token separated by a single space. lengths() returns an
# integer vector for any input, unlike sapply(..., length), whose return type
# changes on empty input.
words_blog <- lengths(strsplit(EN_blogs_lines, " "))
words_news <- lengths(strsplit(EN_news_lines, " "))
words_twitter <- lengths(strsplit(EN_twitter_lines, " "))

# Longest and shortest line (in words) for each source, one row per source.
num_words <- data.frame(
  rbind(
    c(max(words_blog), min(words_blog)),
    c(max(words_news), min(words_news)),
    c(max(words_twitter), min(words_twitter))
  ),
  row.names = c("blogs", "news", "twitter")
)
colnames(num_words) <- c("maxnumwords_perline", "minnumwords_perline")

# Total number of words per source, as a single labelled row.
NumWords <- cbind(sum(words_blog), sum(words_news), sum(words_twitter))
rownames(NumWords) <- "NumWords"

# Summary table: the "Lines" row of the general stats, the word totals and the
# per-line extremes. Rows are selected and labelled by name rather than by
# position, so the table does not depend on the row order of `Length`.
# rbind() takes the column names from its first argument.
Stats <- rbind(Length["Lines", , drop = FALSE], NumWords, t(num_words))
Stats
## Blogs_stats News_stats Tweets_stats
## Lines 899288 77259 2360148
## NumWords 37334131 2643969 30373543
## maxnumwords_perline 6630 1031 47
## minnumwords_perline 1 1 1
It's useful to see all of this information in plots.
suppressPackageStartupMessages(library(plotly))

# Reshape the summary so that each source is a row and each statistic a
# column, and move the source names into a regular `Category` column.
Stats <- as.data.frame(t(Stats))
Stats$Category <- rownames(Stats)
rownames(Stats) <- NULL

# Grouped bar chart: number of lines and number of words per source.
# No `text` argument is passed: no `text` variable is defined in this script,
# so `text = text` would hand plotly the graphics::text function rather than
# labels.
fig <- plot_ly(
  x = Stats$Category,
  y = Stats$Lines,
  type = "bar",
  name = "Number of lines"
)
fig <- fig %>% add_trace(y = Stats$NumWords, name = "Number of words")

# Log-scaled y axis so that both series stay readable despite the different
# magnitudes of line counts and word counts.
fig <- fig %>% layout(
  yaxis = list(title = "Count (log axis)", type = "log"),
  barmode = "group",
  title = "Basic Stats"
)
fig
Since the data set is really big, I'm taking a random sample of only 5000 lines per file.
# Fix the RNG seed so the sampled lines, and every result derived from them,
# are the same on every run.
set.seed(1234)

# Number of lines to draw from each source; capped at the number of available
# lines because sample() without replacement errors when size > length(x).
sample_size <- 5000
BlogsSimple <- sample(EN_blogs_lines, size = min(sample_size, length(EN_blogs_lines)))
NewsSimple <- sample(EN_news_lines, size = min(sample_size, length(EN_news_lines)))
TweetsSimple <- sample(EN_twitter_lines, size = min(sample_size, length(EN_twitter_lines)))
We are entering the text mining world. In this field, a corpus is defined as “a collection of written texts, especially the entire works of a particular author or a body of writing on a particular subject”. I'm going to create 2 functions; the first one converts my text into a corpus and cleans it.
library(tm); library(SnowballC); library(stringr)
## Warning: package 'stringr' was built under R version 4.0.2
# Cleaning the data
# CorpusClean: turn a character vector into a cleaned, stemmed tm corpus.
#
# Args:
#   text: character vector of lines; all elements are pasted together, so the
#         resulting corpus holds a single document.
# Returns:
#   The tm corpus (returned visibly) after these steps, in order:
#     1. typographic apostrophes are converted to "'",
#     2. every character that is neither alphanumeric nor "'" becomes a space,
#     3. numbers are removed and the text is lower-cased,
#     4. English stop words are removed,
#     5. remaining punctuation (the apostrophes) is removed,
#     6. whitespace is collapsed and words are stemmed.
#
# Apostrophes are kept until after step 4 on purpose: the English stop-word
# list contains contractions such as "don't" and "i'm". If apostrophes were
# stripped first, those contractions would no longer match and fragments like
# "don" would end up in the word counts.
CorpusClean <- function(text) {
  paste(text, collapse = " ") %>%
    str_replace_all("[\u2018\u2019]", "'") %>%
    str_replace_all("[^[:alnum:]']", " ") %>%
    VectorSource() %>%
    Corpus() %>%
    tm_map(removeNumbers) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)
}
This function allows me to find the most frequently used words.
# Words_freq: count how often each term occurs in a corpus.
#
# Args:
#   text: a tm corpus (e.g. the result of CorpusClean()).
# Returns:
#   A data frame sorted by decreasing count, with the terms as row names and
#   two columns: "Frequency" (total count of the term) and "Words" (the term).
Words_freq <- function(text) {
  # Document-term matrix as a dense matrix: one column per term.
  term_matrix <- as.matrix(DocumentTermMatrix(text))

  # Total count of each term over all documents, most frequent first.
  totals <- sort(colSums(term_matrix), decreasing = TRUE)

  freq <- as.data.frame(totals)
  names(freq)[1] <- "Frequency"
  freq[["Words"]] <- rownames(freq)
  freq
}
I'm now using my functions to create bar plots of the most frequent words.
# Turn off warning output (warn = -1) from this point on.
options(warn = -1)

# Step 1: build a cleaned corpus from each sample.
BlogsClean <- CorpusClean(text = BlogsSimple)
NewsClean <- CorpusClean(text = NewsSimple)
TweetsClean <- CorpusClean(text = TweetsSimple)

# Step 2: term-frequency table for each cleaned corpus.
BlogsFreq <- Words_freq(text = BlogsClean)
NewsFreq <- Words_freq(text = NewsClean)
TweetsFreq <- Words_freq(text = TweetsClean)
# plot_top_words: bar chart of the most frequent words of one source.
#
# Args:
#   freq:        data frame with columns "Words" and "Frequency", sorted by
#                decreasing frequency (the output of Words_freq()).
#   source_name: label of the source, used in the plot title.
#   top_n:       number of words to show (default 20).
# Returns:
#   A ggplot object; it is drawn when printed (e.g. when called at top level).
#
# head() is used instead of freq[1:top_n, ] so that a table with fewer than
# `top_n` terms does not produce NA rows.
# NOTE(review): ggplot2 is not loaded explicitly in this script; it is assumed
# to be attached already (presumably via library(plotly)) -- confirm.
plot_top_words <- function(freq, source_name, top_n = 20) {
  ggplot(
    data = head(freq, top_n),
    aes(x = reorder(Words, -Frequency), y = Frequency, fill = Words)
  ) +
    geom_bar(stat = "identity") +
    labs(x = "Words", title = paste("High Frequency words in", source_name))
}

# Blogs Freq
plot_top_words(BlogsFreq, "Blogs")
# News Freq
plot_top_words(NewsFreq, "News")
# Tweets Freq
plot_top_words(TweetsFreq, "Tweets")