Milestone Report

Twitter Dataset

First let’s examine the en_US.twitter.txt file content.

inputFile <- "en_US.twitter.txt"

# Open in binary mode so the read cannot stop early at a control byte
# (text mode on Windows treats an embedded 0x1A byte as end-of-file).
# readLines() accepts LF, CRLF and CR line endings in either mode.
con  <- file(inputFile, open = "rb")

# Read every line in one call instead of one readLines() call per line.
# skipNul = TRUE drops embedded NUL bytes rather than silently cutting the
# line at the first one (the original suppressed that warning via warn = FALSE).
allLines <- readLines(con, warn = FALSE, skipNul = TRUE)

close(con)

# Vectorised per-line statistics; no vector is grown inside a loop.
lineCount <- length(allLines)                              # number of lines
lengthCount <- str_length(allLines)                        # characters per line
wordCount <- lengths(strsplit(allLines, " ", fixed = TRUE)) # space-separated pieces per line

# The raw lines are not needed once the counts exist; free the memory.
rm(allLines)

The file contains 2360148 lines, 162384825 characters and 30373543 words.

# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  labs(x = "", title = "Histogram of Character Length")

# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  labs(x = "", title = "Histogram of Word Count")

Now we’ll examine the most frequent words.

# Load the whole file as one UTF-8 string.
# read_file() can return text that is cut off at the first embedded NUL byte.
# NOTE(review): the top counts reported for this file look far too small for
# a file of ~30 million words, which suggests that happened here - confirm.
# Reading the raw bytes and dropping NULs keeps the complete text.
raw_bytes <- readBin(inputFile, what = "raw", n = file.size(inputFile))
my_text <- rawToChar(raw_bytes[raw_bytes != as.raw(0)])
Encoding(my_text) <- "UTF-8"
rm(raw_bytes)

# One row per word token; rows whose word appears in stop_words are dropped.
tidy_text <- my_text %>%
  tibble(text=.) %>% 
  unnest_tokens(word,text) %>% 
  anti_join(stop_words)

## Joining, by = "word"

# Twenty most frequent remaining words, rendered as a styled table.
kable_styling(
  kable(
    head(count(tidy_text, word, sort = TRUE), n = 20)
  )
)

word	n
love	7577
day	6590
rt	6278
time	5581
lol	4932
3	3876
people	3713
follow	3423
happy	3399
tonight	3191
2	3065
night	2885
hope	2461
life	2365
twitter	2219
game	2214
im	2205
week	2117
tomorrow	2034
wait	1948

News Dataset

Let’s repeat the same analysis for the en_US.news.txt file content.

inputFile <- "en_US.news.txt"

# Open in binary mode. NOTE(review): the 77259 lines / 2.6M words reported
# for this file look truncated (the most frequent word alone is counted
# 57062 times from the same file); text mode on Windows treats an embedded
# 0x1A byte as end-of-file, which binary mode avoids.
# readLines() accepts LF, CRLF and CR line endings in either mode.
con  <- file(inputFile, open = "rb")

# Read every line in one call instead of one readLines() call per line.
# skipNul = TRUE drops embedded NUL bytes rather than silently cutting the
# line at the first one (the original suppressed that warning via warn = FALSE).
allLines <- readLines(con, warn = FALSE, skipNul = TRUE)

close(con)

# Vectorised per-line statistics; no vector is grown inside a loop.
lineCount <- length(allLines)                              # number of lines
lengthCount <- str_length(allLines)                        # characters per line
wordCount <- lengths(strsplit(allLines, " ", fixed = TRUE)) # space-separated pieces per line

# The raw lines are not needed once the counts exist; free the memory.
rm(allLines)

The file contains 77259 lines, 15683765 characters and 2643969 words.

# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  labs(x = "", title = "Histogram of Character Length")

# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  labs(x = "", title = "Histogram of Word Count")

The most frequent words in this dataset are shown below.

# Load the whole file as a single string.
my_text <- read_file(inputFile)

# One row per word token; rows whose word appears in stop_words are dropped.
tidy_text <- tibble(text = my_text) %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(y = stop_words)

## Joining, by = "word"

# Twenty most frequent remaining words, rendered as a styled table.
kable_styling(
  kable(
    head(count(tidy_text, word, sort = TRUE), n = 20)
  )
)

word	n
time	57062
people	47666
city	37953
1	37292
school	35498
game	34949
percent	34690
day	31901
2	31784
million	30914
home	30601
county	30172
10	29686
team	28952
season	28494
3	27192
police	27094
u.s	23434
p.m	23336
public	22744

Blogs Dataset

Finally we’ll examine the en_US.blogs.txt file content.

inputFile <- "en_US.blogs.txt"

# Open in binary mode so the read cannot stop early at a control byte
# (text mode on Windows treats an embedded 0x1A byte as end-of-file).
# readLines() accepts LF, CRLF and CR line endings in either mode.
con  <- file(inputFile, open = "rb")

# Read every line in one call instead of one readLines() call per line.
# skipNul = TRUE drops embedded NUL bytes rather than silently cutting the
# line at the first one (the original suppressed that warning via warn = FALSE).
allLines <- readLines(con, warn = FALSE, skipNul = TRUE)

close(con)

# Vectorised per-line statistics; no vector is grown inside a loop.
lineCount <- length(allLines)                              # number of lines
lengthCount <- str_length(allLines)                        # characters per line
wordCount <- lengths(strsplit(allLines, " ", fixed = TRUE)) # space-separated pieces per line

# The raw lines are not needed once the counts exist; free the memory.
rm(allLines)

The file contains 899288 lines, 208361438 characters and 37334131 words.

# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  labs(x = "", title = "Histogram of Character Length")

# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  labs(x = "", title = "Histogram of Word Count")

The most frequent words in this dataset are shown below.

# Load the whole file as a single string.
my_text <- read_file(inputFile)

# One row per word token; rows whose word appears in stop_words are dropped.
tidy_text <- tibble(text = my_text) %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(y = stop_words)

## Joining, by = "word"

# Twenty most frequent remaining words, rendered as a styled table.
kable_styling(
  kable(
    head(count(tidy_text, word, sort = TRUE), n = 20)
  )
)

word	n
time	90918
people	59574
day	52372
love	45230
life	41251
it’s	38657
1	30907
2	29561
world	29305
i’m	29189
don’t	28389
book	28147
home	27944
week	26317
feel	24451
god	22368
3	22020
lot	21550
read	21380
days	20160

Milestone Report - Week 2

Bruno Tavares

7 de julho de 2020

Twitter Dataset

News Dataset

Blogs Dataset