Twitter Dataset

First let’s examine the en_US.twitter.txt file content.

# Per-line statistics for the raw text file:
#   lineCount   - number of lines read
#   lengthCount - number of characters in each line (str_length)
#   wordCount   - number of space-separated fields in each line
inputFile <- "en_US.twitter.txt"

# Binary mode keeps a text-mode read from ending early on a stray control
# character; skipNul = TRUE drops embedded NUL bytes instead of cutting the
# current line short at the first one.
con <- file(inputFile, open = "rb")
allLines <- tryCatch(
  readLines(con, warn = FALSE, skipNul = TRUE),
  finally = close(con)  # release the connection even if the read fails
)

# Vectorised over all lines: no per-line readLines() call and no
# element-by-element growth of the result vectors.
lineCount <- length(allLines)
lengthCount <- str_length(allLines)
wordCount <- lengths(strsplit(allLines, " ", fixed = TRUE))

rm(allLines)  # the raw lines are not needed once the counts exist

The file contains 2360148 lines, 162384825 characters and 30373543 words.

# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  xlab("") +
  ggtitle("Histogram of Character Length")

# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  xlab("") +
  ggtitle("Histogram of Word Count")

Now we’ll examine the most frequent words.

# Load the corpus as a single string for tokenisation.
# NOTE(review): the word counts previously reported for this file are far
# smaller than its reported size suggests, which is consistent with the text
# being cut off at an embedded NUL byte - confirm by re-running. Reading in
# binary mode with skipNul = TRUE avoids that; encoding = "UTF-8" marks the
# strings as UTF-8.
con <- file(inputFile, open = "rb")
my_text <- tryCatch(
  paste(
    readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE),
    collapse = "\n"
  ),
  finally = close(con)  # release the connection even if the read fails
)

# One row per word token, with stop words removed. The join key is named
# explicitly so no "Joining, by = ..." message is emitted and the result does
# not depend on which columns the two tables happen to share.
tidy_text <- tibble(text = my_text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Render the 20 most frequent remaining words as a styled table.
tidy_text %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  kable() %>%
  kable_styling()
word n
love 7577
day 6590
rt 6278
time 5581
lol 4932
3 3876
people 3713
follow 3423
happy 3399
tonight 3191
2 3065
night 2885
hope 2461
life 2365
twitter 2219
game 2214
im 2205
week 2117
tomorrow 2034
wait 1948

News Dataset

Let’s repeat the same analysis for the en_US.news.txt file content.

# Per-line statistics for the raw text file:
#   lineCount   - number of lines read
#   lengthCount - number of characters in each line (str_length)
#   wordCount   - number of space-separated fields in each line
inputFile <- "en_US.news.txt"

# Binary mode keeps a text-mode read from ending early on a stray control
# character; skipNul = TRUE drops embedded NUL bytes instead of cutting the
# current line short at the first one.
# NOTE(review): the line and word totals previously reported for this file
# look too small next to its word-frequency counts, which suggests the
# text-mode read stopped before the end of the file - confirm by re-running.
con <- file(inputFile, open = "rb")
allLines <- tryCatch(
  readLines(con, warn = FALSE, skipNul = TRUE),
  finally = close(con)  # release the connection even if the read fails
)

# Vectorised over all lines: no per-line readLines() call and no
# element-by-element growth of the result vectors.
lineCount <- length(allLines)
lengthCount <- str_length(allLines)
wordCount <- lengths(strsplit(allLines, " ", fixed = TRUE))

rm(allLines)  # the raw lines are not needed once the counts exist

The file contains 77259 lines, 15683765 characters and 2643969 words.

# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  xlab("") +
  ggtitle("Histogram of Character Length")

# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  xlab("") +
  ggtitle("Histogram of Word Count")

Next, we examine the most frequent words in this dataset.

# Load the whole file as a single string.
my_text <- read_file(inputFile)

# One row per word token, with stop words removed. The join key is named
# explicitly so no "Joining, by = ..." message is emitted and the result does
# not depend on which columns the two tables happen to share.
tidy_text <- tibble(text = my_text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Render the 20 most frequent remaining words as a styled table.
tidy_text %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  kable() %>%
  kable_styling()
word n
time 57062
people 47666
city 37953
1 37292
school 35498
game 34949
percent 34690
day 31901
2 31784
million 30914
home 30601
county 30172
10 29686
team 28952
season 28494
3 27192
police 27094
u.s 23434
p.m 23336
public 22744

Blogs Dataset

Finally we’ll examine the en_US.blogs.txt file content.

# Per-line statistics for the raw text file:
#   lineCount   - number of lines read
#   lengthCount - number of characters in each line (str_length)
#   wordCount   - number of space-separated fields in each line
inputFile <- "en_US.blogs.txt"

# Binary mode keeps a text-mode read from ending early on a stray control
# character; skipNul = TRUE drops embedded NUL bytes instead of cutting the
# current line short at the first one.
con <- file(inputFile, open = "rb")
allLines <- tryCatch(
  readLines(con, warn = FALSE, skipNul = TRUE),
  finally = close(con)  # release the connection even if the read fails
)

# Vectorised over all lines: no per-line readLines() call and no
# element-by-element growth of the result vectors.
lineCount <- length(allLines)
lengthCount <- str_length(allLines)
wordCount <- lengths(strsplit(allLines, " ", fixed = TRUE))

rm(allLines)  # the raw lines are not needed once the counts exist

The file contains 899288 lines, 208361438 characters and 37334131 words.

# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  xlab("") +
  ggtitle("Histogram of Character Length")

# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  xlab("") +
  ggtitle("Histogram of Word Count")

Next, we examine the most frequent words in this dataset.

# Load the whole file as a single string.
my_text <- read_file(inputFile)

# One row per word token, with stop words removed. The join key is named
# explicitly so no "Joining, by = ..." message is emitted and the result does
# not depend on which columns the two tables happen to share.
tidy_text <- tibble(text = my_text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
## Joining, by = "word"
# Render the 20 most frequent remaining words as a styled table.
tidy_text %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  kable() %>%
  kable_styling()
word n
time 90918
people 59574
day 52372
love 45230
life 41251
it’s 38657
1 30907
2 29561
world 29305
i’m 29189
don’t 28389
book 28147
home 27944
week 26317
feel 24451
god 22368
3 22020
lot 21550
read 21380
days 20160