#Executive Summary
This project aims to:

1. Demonstrate that the data has been downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings so far.

The project follows these steps:

1. Data loading.
2. A quick initial preview using random data samples.
3. Data cleaning.
4. Basic summary statistics.
5. Bar plots and word clouds for 1-4 word sequences.
6. A summary of the observations in this work.

#Loading libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
library(tm)
## Warning: package 'tm' was built under R version 4.0.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.0.3
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.3
library(stringi)
library(knitr)
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.3
#Producing some descriptive statistics
To gain some insight into the dataset, we first produce basic statistics on the text in the data_blogs, data_news and data_twitter files: the total number of lines and words, and the maximum and average word counts for each file. Character counts could be computed in the same way (see the sketch after the summary table).
data_twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
data_blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
data_news <- readLines("en_US.news.txt", encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
#profanity <- readLines("bad_words.txt", warn = FALSE, skipNul = TRUE)
#Basic Summary and Statistics
dataset <- c("Blogs", "News", "Twitter")
wordcount_data_blogs <- stri_count_words(data_blogs)
wordcount_data_news <- stri_count_words(data_news)
wordcount_data_twitter <- stri_count_words(data_twitter)
data_linecount <- c(length(data_blogs), length(data_news), length(data_twitter))
data_wordcount <- c(sum(wordcount_data_blogs), sum(wordcount_data_news), sum(wordcount_data_twitter))
data_max_wordcount <- c(max(wordcount_data_blogs), max(wordcount_data_news), max(wordcount_data_twitter))
data_avg_wordcount <- c(mean(wordcount_data_blogs), mean(wordcount_data_news), mean(wordcount_data_twitter))
data_summary <- data.frame(dataset, data_linecount, data_wordcount, data_max_wordcount, data_avg_wordcount)
data_names <- c("Dataset", "Lines", "Total Words", "Max Words", "Average Word Count")
kable(data_summary, digits = 1, col.names = data_names, align = 'c')
| Dataset | Lines | Total Words | Max Words | Average Word Count |
|---|---|---|---|---|
| Blogs | 899288 | 37546239 | 6726 | 41.8 |
| News | 77259 | 2674536 | 1123 | 34.6 |
| Twitter | 2360148 | 30093413 | 47 | 12.8 |
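As a complement to the word counts above, per-file character counts could be added with stringi as well. A minimal sketch, reusing the objects already loaded:

# Hedged sketch: total characters per file via stri_length() on each line
data_charcount <- c(sum(stri_length(data_blogs)),
                    sum(stri_length(data_news)),
                    sum(stri_length(data_twitter)))
data_charcount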
#Getting a Preview of the Data
To get a quick glance at the data, we take a random sample of 0.4% of the lines from each file, since the computer being used has only 1 GB of RAM.
set.seed(22)
sample_data <- c(sample(data_blogs, length(data_blogs) * 0.004),
                 sample(data_news, length(data_news) * 0.004),
                 sample(data_twitter, length(data_twitter) * 0.004))
corpus <- Corpus(VectorSource(sample_data))
#cleaning the data
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x)) # replace matches with a space so words are not fused together
corpus <- tm_map(corpus, toSpace, "[^[:print:]]")
## Warning in tm_map.SimpleCorpus(corpus, toSpace, "[^[:print:]]"): transformation
## drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
corpus_no_stopwords <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus_no_stopwords <- tm_map(corpus_no_stopwords, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus_no_stopwords, stripWhitespace):
## transformation drops documents
#Exploratory Data Analysis
TwoGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
ThreeGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Build a frequency data frame from a term-document matrix, dropping sparse terms
ngram_freqdf <- function(tdm, sparsity){
freq <- sort(rowSums(as.matrix(removeSparseTerms(tdm, sparsity))), decreasing = TRUE)
return(data.frame(word = names(freq), freq = freq))
}
tdm_onegram <- TermDocumentMatrix(corpus)
onegram_freqdf <- ngram_freqdf(tdm_onegram, 0.99)
twogram_tdm <- TermDocumentMatrix(corpus, control = list(tokenize = TwoGramTokenizer))
twogram_freqdf <- ngram_freqdf(twogram_tdm, 0.99)
tdm_threegram <- TermDocumentMatrix(corpus, control = list(tokenize = ThreeGramTokenizer))
threegram_freqdf <- ngram_freqdf(tdm_threegram, 0.999)
onegramNS_tdm <- TermDocumentMatrix(corpus_no_stopwords)
onegramNS_freqdf <- ngram_freqdf(onegramNS_tdm, 0.99)
twogramNS_tdm <- TermDocumentMatrix(corpus_no_stopwords, control = list(tokenize = TwoGramTokenizer))
twogramNS_freqdf <- ngram_freqdf(twogramNS_tdm, 0.999)
threegramNS_tdm <- TermDocumentMatrix(corpus_no_stopwords, control = list(tokenize = ThreeGramTokenizer))
threegramNS_freqdf <- ngram_freqdf(threegramNS_tdm, 0.9999)
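The executive summary also mentions 4-word sequences. They are not analysed below, but they could be tokenized in the same way; a minimal sketch reusing ngram_freqdf:

FourGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
tdm_fourgram <- TermDocumentMatrix(corpus, control = list(tokenize = FourGramTokenizer))
fourgram_freqdf <- ngram_freqdf(tdm_fourgram, 0.9999)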
ngram_barplot <- function(df, title){
dfsub <- df[1:15, ]
ggplot(dfsub, aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") +
labs(x = "Words", y = "Count", title = title) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))
}
barplot_onegram <- ngram_barplot(onegram_freqdf,"Top 15 Words (All)")
barplot_onegramNS <- ngram_barplot(onegramNS_freqdf,"Top 15 Words (No stopwords)")
barplot_twogram <- ngram_barplot(twogram_freqdf,"Top 15 2-grams (All)")
barplot_twogramNS <- ngram_barplot(twogramNS_freqdf,"Top 15 2-grams (No stopwords)")
barplot_threegram <- ngram_barplot(threegram_freqdf,"Top 15 3-grams (All)")
barplot_threegramNS <- ngram_barplot(threegramNS_freqdf,"Top 15 3-grams (No stopwords)")
grid.arrange(barplot_onegram, barplot_onegramNS, ncol = 1)
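The word clouds promised in the executive summary are not shown yet. A minimal sketch, assuming the wordcloud and RColorBrewer packages are installed, based on the stopword-free unigram frequencies:

library(wordcloud)
library(RColorBrewer)
# Word cloud of the most frequent words after stop word removal
wordcloud(words = onegramNS_freqdf$word, freq = onegramNS_freqdf$freq,
          max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))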