A short overview of the current status of the processed data IO and exploratory data analysis, covering the most common words and n-grams and the scope of the population data.
if(!exists('US.twitter')) {
  # read the raw Twitter file once; the guard avoids re-reading on re-knit
  con <- file("final/en_US/en_US.twitter.txt", "rb", encoding = "UTF-8")
  type_data <- readLines(con, skipNul = FALSE, encoding = "UTF-8")
  close(con)
  US.twitter <- data.frame(type_data, stringsAsFactors = FALSE)
  remove(type_data)
}
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line 167155
## appears to contain an embedded nul
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line 268547
## appears to contain an embedded nul
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line
## 1274086 appears to contain an embedded nul
## Warning in readLines(con, skipNul = FALSE, encoding = "UTF-8"): line
## 1759032 appears to contain an embedded nul
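The embedded-nul warnings above are harmless for our purposes, but they could be silenced by letting readLines drop the nul bytes instead; a minimal sketch, assuming the same file:

# optional: silently drop embedded nul bytes instead of warning about them
con <- file("final/en_US/en_US.twitter.txt", "rb", encoding = "UTF-8")
type_data <- readLines(con, skipNul = TRUE, encoding = "UTF-8")
close(con)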
if(!exists('US.news')) {
  con <- file('final/en_US/en_US.news.txt', "rb", encoding = "UTF-8")
  type_data <- readLines(con, skipNul = FALSE, encoding = "UTF-8")
  close(con)
  US.news <- data.frame(type_data, stringsAsFactors = FALSE)
  remove(type_data)
}
if(!exists('US.blogs')) {
  con <- file('final/en_US/en_US.blogs.txt', "rb", encoding = "UTF-8")
  type_data <- readLines(con, skipNul = FALSE, encoding = "UTF-8")
  close(con)
  US.blogs <- data.frame(type_data, stringsAsFactors = FALSE)
  remove(type_data)
}
Number of lines
library(plyr)
## Warning: package 'plyr' was built under R version 3.6.1
# nrow() gives the actual line count (summary()[1] only yields the string "Length:<n>")
lines.US.TW <- nrow(US.twitter)
lines.US.BL <- nrow(US.blogs)
lines.US.NW <- nrow(US.news)
lines <- c(lines.US.TW, lines.US.BL, lines.US.NW)
Object sizes
osize.US.TW <- object.size(US.twitter)
osize.US.BL <- object.size(US.blogs)
osize.US.NW <- object.size(US.news)
osize = c(osize.US.TW, osize.US.BL, osize.US.NW)
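The raw byte counts are hard to read at a glance; base R can print object.size results in megabytes directly; a small sketch:

# human-readable sizes (the format method for object_size supports units)
format(osize.US.TW, units = "MB")
format(osize.US.BL, units = "MB")
format(osize.US.NW, units = "MB")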
Line frequencies
library(dplyr)
library(tidytext)
if(!exists('count.US.TW')) {
  count.US.TW <- US.twitter %>% count(type_data, sort = TRUE)
}
if(!exists('count.US.BL')) {
  count.US.BL <- US.blogs %>% count(type_data, sort = TRUE)
}
if(!exists('count.US.NW')) {
  count.US.NW <- US.news %>% count(type_data, sort = TRUE)
}
# list() keeps the three count tables intact; c() would splice them into one list of columns
counts <- list(count.US.TW, count.US.BL, count.US.NW)
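The per-line counts make it easy to inspect the most duplicated entries in each source; a quick sketch:

# the most frequently repeated lines per source
head(count.US.TW, 5)
head(count.US.BL, 5)
head(count.US.NW, 5)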
Word counts & 2-gram counts
library(dplyr)
library(tidytext)
if(!exists('count.US.TW.words')) {
  count.US.TW.words <- unnest_tokens(US.twitter, output = token, input = type_data, token = "words") %>%
    count(token, sort = TRUE)
}
if(!exists('count.US.BL.words')) {
  count.US.BL.words <- unnest_tokens(US.blogs, output = token, input = type_data, token = "words") %>%
    count(token, sort = TRUE)
}
if(!exists('count.US.NW.words')) {
  count.US.NW.words <- unnest_tokens(US.news, output = token, input = type_data, token = "words") %>%
    count(token, sort = TRUE)
}
words <- list(count.US.TW.words, count.US.BL.words, count.US.NW.words)
head(count.US.TW.words, 15)
## # A tibble: 15 x 2
## token n
## <chr> <int>
## 1 the 937405
## 2 to 788645
## 3 i 723447
## 4 a 611358
## 5 you 548089
## 6 and 438538
## 7 for 385348
## 8 in 380376
## 9 of 359635
## 10 is 358775
## 11 it 295087
## 12 my 291906
## 13 on 278022
## 14 that 234661
## 15 me 202547
head(count.US.BL.words, 15)
## # A tibble: 15 x 2
## token n
## <chr> <int>
## 1 the 1854232
## 2 and 1093307
## 3 to 1068623
## 4 a 899316
## 5 of 876475
## 6 i 769248
## 7 in 597444
## 8 that 459691
## 9 is 431696
## 10 it 400818
## 11 for 363211
## 12 you 295592
## 13 with 286464
## 14 was 278127
## 15 on 275921
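As the tables show, the top tokens are almost exclusively English stop words. To surface content words, one could filter them out with tidytext's stop_words lexicon; a sketch (the join maps our token column onto the lexicon's word column):

# drop stop words to reveal content words
count.US.TW.words %>%
  anti_join(stop_words, by = c("token" = "word")) %>%
  head(15)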
# ngrams, n = 2
if(!exists('US.TW.ng2')) {
  US.TW.ng2 <- US.twitter %>% unnest_tokens(input = 'type_data', ngrams, token = "ngrams", n = 2)
}
if(!exists('count.US.TW.ng2')) {
  count.US.TW.ng2 <- US.TW.ng2 %>% count(ngrams, sort = TRUE)
}
#head(count.US.TW.ng2, 15)
library(ggplot2)
ggplot(data=count.US.TW.ng2[1:40,], aes(x=reorder(ngrams, -n), y=n, fill=n)) +
  geom_bar(stat="identity") +                 # bar chart, ordered by descending count
  guides(fill=FALSE) +
  theme(axis.text.x=element_text(angle=90)) +
  xlab('ngrams') +
  ggtitle('Top 40 ngrams (n = 2), US Twitter, full population') # main title
df <- data.frame(cbind(lines, osize), row.names = c('US Twitter', 'US Blogs', 'US News'))
print(df)
##              lines     osize
## US Twitter 2360148 334485424
## US Blogs    899288 267759320
## US News    1010242 269841680
The word and n-gram frequency analysis shows the dominance of simple, short words in everyday use. The most frequent tweets are short and consist of thanks, replies to thanks, and Happy Birthday wishes.
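The thanking pattern can be checked directly in the bigram counts; a quick sketch using the count table built above:

# bigrams containing 'thank', e.g. 'thanks for', 'thank you'
count.US.TW.ng2 %>% filter(grepl("thank", ngrams)) %>% head(10)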
In order to achieve acceptable memory usage and processing time, both in the data-science process and in the Shiny app to be run later, we will take a random, unbiased sample. A sample size of 5% has been used for a so-called 'gold standard' corpus in other projects. As our population of documents is large enough, we will use this 5% and aim for 95% confidence under a Poisson approximation. We will see later whether this is in line with the achievable memory and performance constraints; there is no sense in restricting the data scope too tightly upfront. For the US.TW.ng2 object below, data.table needs only about a third of the memory of the equivalent data frame (roughly 52 MB versus 160 MB).
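As a rough plausibility check on the 5% sample size: under a Poisson approximation, an n-gram expected to occur lambda times in the sample has a relative 95% confidence half-width of about 1.96/sqrt(lambda); a sketch using the full-population count of 'in the' (78376, shown in the bigram table below):

# under a Poisson approximation, relative 95% half-width is ~ 1.96/sqrt(lambda)
lambda <- 0.05 * 78376   # expected sample count of 'in the'
1.96 / sqrt(lambda)      # ~0.031, i.e. about 3% relative error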
library(data.table)
## Warning: package 'data.table' was built under R version 3.6.1
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# select random sample rows of the dataframe
set.seed(88)
US.twitter2 <- sample_frac(US.twitter, 0.05)
object.size(US.twitter2)
## 16895008 bytes
US.TW.ng2 <- US.twitter2 %>% unnest_tokens(input = 'type_data', ngrams, token = "ngrams", n = 2)
US.TW.ng4 <- US.twitter2 %>% unnest_tokens(input = 'type_data', ngrams, token = "ngrams", n = 4)
object.size(US.TW.ng2)
## 159894488 bytes
# as data.table
US.TW.ng2 <- data.table(US.TW.ng2)
object.size(US.TW.ng2)
## 52454440 bytes
if(!exists('count.US.TW.ng2')) {
  # note: count.US.TW.ng2 was already computed on the full population above,
  # so this guard keeps those counts; the head below shows population counts
  count.US.TW.ng2 <- US.TW.ng2 %>% count(ngrams, sort = TRUE)
}
head(count.US.TW.ng2, 15)
## # A tibble: 15 x 2
## ngrams n
## <chr> <int>
## 1 in the 78376
## 2 for the 74016
## 3 of the 56979
## 4 on the 48582
## 5 to be 47099
## 6 to the 43530
## 7 thanks for 43016
## 8 at the 37271
## 9 i love 35925
## 10 going to 34276
## 11 have a 33764
## 12 thank you 33404
## 13 if you 33257
## 14 i have 31576
## 15 i am 29840
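Since US.TW.ng2 is now a data.table, the same frequency count can also be written in native data.table syntax, which avoids the intermediate dplyr copy; a sketch:

# equivalent count in data.table syntax, sorted descending
count.dt <- US.TW.ng2[, .N, by = ngrams][order(-N)]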
# sampling the remaining sources
US.blogs2 <- sample_frac(US.blogs, 0.05)
US.news2 <- sample_frac(US.news, 0.05)
US.corpus <- data.table(rbind(US.twitter2, US.blogs2, US.news2))
# lines in corpus
nrow(US.corpus)
## [1] 213483
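For the later Shiny app, the sampled corpus can be persisted once and reloaded cheaply on startup; a minimal sketch (the file name is illustrative):

# persist the sampled corpus; reload with readRDS("US.corpus.rds")
saveRDS(US.corpus, "US.corpus.rds")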
This R Markdown file is complete and runnable. However, rendering only succeeded from the console with:
library("knitr")
rmarkdown::render("Capstone-Week2-Milestone-Report.Rmd")