Synopsis

A short overview of the current status of the data IO processing and the exploratory data analysis, covering the most common words and n-grams and the scope of the population data.

if(!exists('US.twitter')) {
  # read the raw Twitter file in binary mode so special bytes do not truncate the read
  con <- file("final/en_US/en_US.twitter.txt", "rb", encoding = "UTF-8")
  type_data <- readLines(con, skipNul = FALSE, encoding = "UTF-8")
  close(con)
  US.twitter <- data.frame(type_data, stringsAsFactors = FALSE)
  remove(type_data)
}
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line
## 1274086 appears to contain an embedded nul
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line
## 1759032 appears to contain an embedded nul
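The embedded-nul warnings are harmless here, but if those lines should be read without complaint, readLines() can drop the nul bytes directly via its documented skipNul argument. A minimal sketch, leaving the rest of the pipeline unchanged:

# alternative: drop embedded nul bytes while reading instead of warning
con <- file("final/en_US/en_US.twitter.txt", "rb", encoding = "UTF-8")
type_data <- readLines(con, skipNul = TRUE, encoding = "UTF-8")
close(con)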
if(!exists('US.news')) {
  # same binary-mode read for the news file
  con <- file('final/en_US/en_US.news.txt', "rb", encoding = "UTF-8")
  type_data <- readLines(con, skipNul = FALSE)
  close(con)
  US.news <- data.frame(type_data, stringsAsFactors = FALSE)
  remove(type_data)
}

if(!exists('US.blogs')) {
  # same binary-mode read for the blogs file
  con <- file('final/en_US/en_US.blogs.txt', "rb", encoding = "UTF-8")
  type_data <- readLines(con, skipNul = FALSE)
  close(con)
  US.blogs <- data.frame(type_data, stringsAsFactors = FALSE)
  remove(type_data)
}

Exploratory Data Analysis of the Corpus Data

Number of lines

library(plyr)
## Warning: package 'plyr' was built under R version 3.6.1
# nrow() returns the line count directly as a number
# (summary(df)[1] would only give a "Length:N" display string)
lines.US.TW <- nrow(US.twitter)
lines.US.BL <- nrow(US.blogs)
lines.US.NW <- nrow(US.news)
lines <- c(lines.US.TW, lines.US.BL, lines.US.NW)

Object sizes

osize.US.TW <- object.size(US.twitter)
osize.US.BL <- object.size(US.blogs)
osize.US.NW <- object.size(US.news)
osize = c(osize.US.TW, osize.US.BL, osize.US.NW)
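For a more readable report, object.size() results can also be formatted in megabytes; format() on an object_size value accepts a units argument. A small convenience, not required by the analysis:

# report object sizes in megabytes rather than raw bytes
format(osize.US.TW, units = "Mb")
format(osize.US.BL, units = "Mb")
format(osize.US.NW, units = "Mb")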

Line frequencies

library(dplyr)
library(tidytext)
# count how often each distinct line occurs; n > 1 marks duplicated lines
if(!exists('count.US.TW')) {
  US.twitter %>% count(type_data, sort = TRUE) -> count.US.TW
}
if(!exists('count.US.BL')) {
  US.blogs %>% count(type_data, sort = TRUE) -> count.US.BL
}
if(!exists('count.US.NW')) {
  US.news %>% count(type_data, sort = TRUE) -> count.US.NW
}
counts <- c(count.US.TW, count.US.BL, count.US.NW)
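As a quick plausibility check, the most frequently duplicated lines can be inspected directly; a small sketch using the counts just computed:

# the most frequently repeated Twitter lines (n = number of occurrences)
count.US.TW %>% filter(n > 1) %>% head(5)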

Word counts & 2-gram counts

library(dplyr)
library(tidytext)

# tokenize each corpus into single words and count token frequencies
if(!exists('count.US.TW.words')) {
  count.US.TW.words <- unnest_tokens(US.twitter, token, type_data, token = "words") %>%
    count(token, sort = TRUE)
}
if(!exists('count.US.BL.words')) {
  count.US.BL.words <- unnest_tokens(US.blogs, token, type_data, token = "words") %>%
    count(token, sort = TRUE)
}
if(!exists('count.US.NW.words')) {
  count.US.NW.words <- unnest_tokens(US.news, token, type_data, token = "words") %>%
    count(token, sort = TRUE)
}

words <- c(count.US.TW.words, count.US.BL.words, count.US.NW.words)
head(count.US.TW.words, 15)
## # A tibble: 15 x 2
##    token      n
##    <chr>  <int>
##  1 the   937405
##  2 to    788645
##  3 i     723447
##  4 a     611358
##  5 you   548089
##  6 and   438538
##  7 for   385348
##  8 in    380376
##  9 of    359635
## 10 is    358775
## 11 it    295087
## 12 my    291906
## 13 on    278022
## 14 that  234661
## 15 me    202547
head(count.US.BL.words, 15)
## # A tibble: 15 x 2
##    token       n
##    <chr>   <int>
##  1 the   1854232
##  2 and   1093307
##  3 to    1068623
##  4 a      899316
##  5 of     876475
##  6 i      769248
##  7 in     597444
##  8 that   459691
##  9 is     431696
## 10 it     400818
## 11 for    363211
## 12 you    295592
## 13 with   286464
## 14 was    278127
## 15 on     275921
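Unsurprisingly, the top tokens in every source are English stop words. If content words were of interest instead, tidytext ships a stop_words lexicon that can be removed with an anti-join; a minimal sketch (count.US.TW.content is a new, illustrative object name):

# drop common English stop words before ranking tokens
count.US.TW.content <- count.US.TW.words %>%
  anti_join(stop_words, by = c("token" = "word"))
head(count.US.TW.content, 15)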
# 2-grams over the full Twitter population
if(!exists('US.TW.ng2')) {
  US.TW.ng2 <- US.twitter %>% unnest_tokens(input = 'type_data', ngrams, token = "ngrams", n = 2)
}
if(!exists('count.US.TW.ng2')) {
  count.US.TW.ng2 <- US.TW.ng2 %>% count(ngrams, sort = TRUE)
}
#head(count.US.TW.ng2, 15)

library(ggplot2)
# bar chart of the 40 most frequent 2-grams (count.US.TW.ng2 is sorted by n)
ggplot(data = count.US.TW.ng2[1:40,], aes(x = ngrams, y = n, fill = n)) +
    geom_bar(stat = "identity") +
    guides(fill = FALSE) +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle('Top 40 2-grams, US Twitter full population') # main title
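Because ggplot2 orders a character x axis alphabetically, the bars above do not appear in frequency order. A small variation, assuming the same count.US.TW.ng2 object, sorts them by count:

# keep bars in frequency order instead of alphabetical order
ggplot(count.US.TW.ng2[1:40,], aes(x = reorder(ngrams, -n), y = n, fill = n)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90)) +
    xlab("ngrams")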

df <- data.frame(cbind(lines, osize), row.names = c('US Twitter', 'US Blogs', 'US News'))
print(df)
##              lines     osize
## US Twitter 2360148 334485424
## US Blogs    899288 267759320
## US News    1010242 269841680
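Token totals per source would complement the line counts and object sizes; a quick sketch, assuming the word-count tables computed above:

# total number of word tokens per source
sum(count.US.TW.words$n)
sum(count.US.BL.words$n)
sum(count.US.NW.words$n)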

Findings on the population

The word and n-gram frequency analysis shows the dominance of short, simple words in everyday use. The most popular tweets are short and consist mostly of thanks, replies to thanks, and Happy Birthday wishes.

Generate Random Samples and Combine into a Corpus

In order to achieve acceptable memory usage and processing time, both during the data science process and in the Shiny app that will run later, we take an unbiased random sample. A sample size of 5% has been used for so-called 'gold standard' corpora in other projects. As our population of documents is large enough, we use this 5% and expect roughly 95% confidence under a Poisson model of word occurrence. We will see later whether this is in line with the achievable memory and performance constraints; there is no point in restricting the data scope too tightly up front. For the object US.TW.ng2, a data.table saves approximately 50% memory compared to a data.frame.
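As a back-of-the-envelope check on that confidence claim (illustrative only; it treats word occurrence per line as independent binomial draws, which is a simplifying assumption):

# approximate 95% margin of error when estimating the share p of lines
# containing a given word, from a 5% sample of the US Twitter lines
n <- 0.05 * 2360148   # sampled lines
p <- 0.01             # example: a word present in 1% of all lines
1.96 * sqrt(p * (1 - p) / n)   # about 0.00057, i.e. roughly 0.94%..1.06%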

library(data.table)
## Warning: package 'data.table' was built under R version 3.6.1
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# select random sample rows of the dataframe 
set.seed(88)
US.twitter2 <- sample_frac(US.twitter,0.05)
object.size(US.twitter2)
## 16895008 bytes
US.TW.ng2 <- US.twitter2 %>% unnest_tokens(input = 'type_data', ngrams, token = "ngrams", n = 2)
US.TW.ng4 <- US.twitter2 %>% unnest_tokens(input = 'type_data', ngrams, token = "ngrams", n = 4)
object.size(US.TW.ng2)
## 159894488 bytes
# as data.table
US.TW.ng2 <- data.table(US.TW.ng2)
object.size(US.TW.ng2)
## 52454440 bytes
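Note that data.table(US.TW.ng2) creates a copy before the original data.frame is garbage-collected. If that temporary memory peak matters, the same package offers setDT(), which converts in place:

# alternative: convert the data.frame in place, avoiding the intermediate copy
setDT(US.TW.ng2)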
# note: count.US.TW.ng2 already exists from the full-population run above, so this
# guard skips recomputation; the head() below therefore shows full-population counts
if(!exists('count.US.TW.ng2')) {
  count.US.TW.ng2 <- US.TW.ng2 %>% count(ngrams, sort = TRUE)
}

head(count.US.TW.ng2, 15)
## # A tibble: 15 x 2
##    ngrams         n
##    <chr>      <int>
##  1 in the     78376
##  2 for the    74016
##  3 of the     56979
##  4 on the     48582
##  5 to be      47099
##  6 to the     43530
##  7 thanks for 43016
##  8 at the     37271
##  9 i love     35925
## 10 going to   34276
## 11 have a     33764
## 12 thank you  33404
## 13 if you     33257
## 14 i have     31576
## 15 i am       29840
# sample blogs and news at the same 5% rate
US.blogs2 <- sample_frac(US.blogs, 0.05)
US.news2 <- sample_frac(US.news, 0.05)

# combine the three samples into a single corpus as a data.table
US.corpus <- data.table(rbind(US.twitter2, US.blogs2, US.news2))
# lines in corpus
nrow(US.corpus)
## [1] 213483
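For the downstream Shiny app it may be convenient to persist the sampled corpus once instead of re-reading and re-sampling the raw files; a minimal sketch with base R serialization (the file name is a placeholder):

# persist the sampled corpus for reuse, e.g. by the Shiny app
saveRDS(US.corpus, file = "US.corpus.rds")
# later: US.corpus <- readRDS("US.corpus.rds")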

Reproducibility

This R Markdown file is complete and runnable. However, rendering only succeeded from the console with:

library("knitr")
rmarkdown::render("Capstone-Week2-Milestone-Report.Rmd")