First, let’s examine the contents of the en_US.twitter.txt file.
# Per-line statistics for the corpus file: number of lines, characters per
# line, and space-separated words per line.
inputFile <- "en_US.twitter.txt"
# Binary mode keeps the read free of platform text-mode translation; on
# Windows a text-mode connection can hit end-of-file early at a Ctrl-Z
# (0x1A) byte inside the data, silently dropping the rest of the file.
con <- file(inputFile, open = "rb")
# One vectorised read instead of growing two vectors a line at a time.
# skipNul = TRUE drops embedded nul bytes rather than cutting the line short
# at the first one; tryCatch(finally = ) closes the connection even when the
# read fails.
allLines <- tryCatch(
  readLines(con, warn = FALSE, skipNul = TRUE),
  finally = close(con)
)
lineCount <- length(allLines)
lengthCount <- str_length(allLines)
# strsplit() returns one character vector per line; lengths() counts the
# pieces, giving 0 for an empty line.
wordCount <- lengths(strsplit(allLines, " "))
The file contains 2360148 lines, 162384825 characters and 30373543 words.
# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  xlab("") +
  ggtitle("Histogram of Character Length")
# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  xlab("") +
  ggtitle("Histogram of Word Count")
Now we’ll examine the most frequent words.
# Load the file into my_text, wrap it in a one-column tibble, tokenise the
# `text` column into a `word` column, and drop every row that has a match in
# stop_words. No `by` is supplied, so the join picks its key from the shared
# column names and reports it in a message.
my_text <- read_file(inputFile)
tidy_text <- anti_join(
  unnest_tokens(tibble(text = my_text), word, text),
  stop_words
)
## Joining, by = "word"
# Count occurrences of each word, most frequent first, and render the first
# 20 rows as a styled table.
tidy_text %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  kable() %>%
  kable_styling()
| word | n |
|---|---|
| love | 7577 |
| day | 6590 |
| rt | 6278 |
| time | 5581 |
| lol | 4932 |
| 3 | 3876 |
| people | 3713 |
| follow | 3423 |
| happy | 3399 |
| tonight | 3191 |
| 2 | 3065 |
| night | 2885 |
| hope | 2461 |
| life | 2365 |
| | 2219 |
| game | 2214 |
| im | 2205 |
| week | 2117 |
| tomorrow | 2034 |
| wait | 1948 |
Let’s repeat the same analysis for the en_US.news.txt file content.
# Per-line statistics for the corpus file: number of lines, characters per
# line, and space-separated words per line.
inputFile <- "en_US.news.txt"
# Binary mode keeps the read free of platform text-mode translation; on
# Windows a text-mode connection can hit end-of-file early at a Ctrl-Z
# (0x1A) byte inside the data, silently dropping the rest of the file.
con <- file(inputFile, open = "rb")
# One vectorised read instead of growing two vectors a line at a time.
# skipNul = TRUE drops embedded nul bytes rather than cutting the line short
# at the first one; tryCatch(finally = ) closes the connection even when the
# read fails.
allLines <- tryCatch(
  readLines(con, warn = FALSE, skipNul = TRUE),
  finally = close(con)
)
lineCount <- length(allLines)
lengthCount <- str_length(allLines)
# strsplit() returns one character vector per line; lengths() counts the
# pieces, giving 0 for an empty line.
wordCount <- lengths(strsplit(allLines, " "))
The file contains 77259 lines, 15683765 characters and 2643969 words.
# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  xlab("") +
  ggtitle("Histogram of Character Length")
# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  xlab("") +
  ggtitle("Histogram of Word Count")
Here are the most frequent words in this dataset.
# Load the file into my_text, wrap it in a one-column tibble, tokenise the
# `text` column into a `word` column, and drop every row that has a match in
# stop_words. No `by` is supplied, so the join picks its key from the shared
# column names and reports it in a message.
my_text <- read_file(inputFile)
tidy_text <- anti_join(
  unnest_tokens(tibble(text = my_text), word, text),
  stop_words
)
## Joining, by = "word"
# Count occurrences of each word, most frequent first, and render the first
# 20 rows as a styled table.
tidy_text %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  kable() %>%
  kable_styling()
| word | n |
|---|---|
| time | 57062 |
| people | 47666 |
| city | 37953 |
| 1 | 37292 |
| school | 35498 |
| game | 34949 |
| percent | 34690 |
| day | 31901 |
| 2 | 31784 |
| million | 30914 |
| home | 30601 |
| county | 30172 |
| 10 | 29686 |
| team | 28952 |
| season | 28494 |
| 3 | 27192 |
| police | 27094 |
| u.s | 23434 |
| p.m | 23336 |
| public | 22744 |
Finally we’ll examine the en_US.blogs.txt file content.
# Per-line statistics for the corpus file: number of lines, characters per
# line, and space-separated words per line.
inputFile <- "en_US.blogs.txt"
# Binary mode keeps the read free of platform text-mode translation; on
# Windows a text-mode connection can hit end-of-file early at a Ctrl-Z
# (0x1A) byte inside the data, silently dropping the rest of the file.
con <- file(inputFile, open = "rb")
# One vectorised read instead of growing two vectors a line at a time.
# skipNul = TRUE drops embedded nul bytes rather than cutting the line short
# at the first one; tryCatch(finally = ) closes the connection even when the
# read fails.
allLines <- tryCatch(
  readLines(con, warn = FALSE, skipNul = TRUE),
  finally = close(con)
)
lineCount <- length(allLines)
lengthCount <- str_length(allLines)
# strsplit() returns one character vector per line; lengths() counts the
# pieces, giving 0 for an empty line.
wordCount <- lengths(strsplit(allLines, " "))
The file contains 899288 lines, 208361438 characters and 37334131 words.
# Distribution of characters per line, in bins 10 characters wide.
ggplot(data.frame(x = lengthCount), aes(x = x)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  xlab("") +
  ggtitle("Histogram of Character Length")
# Distribution of words per line, split into 10 bins.
ggplot(data.frame(x = wordCount), aes(x = x)) +
  geom_histogram(bins = 10, color = "black", fill = "darkorchid") +
  xlab("") +
  ggtitle("Histogram of Word Count")
Here are the most frequent words in this dataset.
# Load the file into my_text, wrap it in a one-column tibble, tokenise the
# `text` column into a `word` column, and drop every row that has a match in
# stop_words. No `by` is supplied, so the join picks its key from the shared
# column names and reports it in a message.
my_text <- read_file(inputFile)
tidy_text <- anti_join(
  unnest_tokens(tibble(text = my_text), word, text),
  stop_words
)
## Joining, by = "word"
# Count occurrences of each word, most frequent first, and render the first
# 20 rows as a styled table.
tidy_text %>%
  count(word, sort = TRUE) %>%
  head(n = 20) %>%
  kable() %>%
  kable_styling()
| word | n |
|---|---|
| time | 90918 |
| people | 59574 |
| day | 52372 |
| love | 45230 |
| life | 41251 |
| it’s | 38657 |
| 1 | 30907 |
| 2 | 29561 |
| world | 29305 |
| i’m | 29189 |
| don’t | 28389 |
| book | 28147 |
| home | 27944 |
| week | 26317 |
| feel | 24451 |
| god | 22368 |
| 3 | 22020 |
| lot | 21550 |
| read | 21380 |
| days | 20160 |