In this project I will display my understanding of the data set and provide reproducible steps to download, tidy, summarize, and visualize the text data.
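The data set is located here (the hyperlink is missing from this rendering); the code below expects the extracted files under `./final/en_US/`.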
# Setting up the workspace
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
# Downloading, extracting, and reading the text data

# Read one corpus file and return its lines as a character vector.
#
# path: location of the text file to read.
#
# The file is read through a connection opened in binary mode ("rb") rather
# than by passing the path straight to readLines(), which reads in text mode.
# NOTE(review): the text-mode read of the news file warned about an
# "incomplete final line" and produced word counts an order of magnitude
# below the other two sets, which points to the read stopping early --
# presumably at an embedded end-of-file control byte; confirm by comparing
# length(news) before and after this change. The news tables recorded
# further down were produced from the text-mode read and need a re-knit.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("./final/en_US/en_US.blogs.txt")
news <- read_corpus("./final/en_US/en_US.news.txt")
twitter <- read_corpus("./final/en_US/en_US.twitter.txt")
# Strip every run of digits from each text set so that numbers do not
# distort the word-level analysis that follows.
blogs <- gsub(pattern = "[0-9]+", replacement = "", x = blogs)
news <- gsub(pattern = "[0-9]+", replacement = "", x = news)
twitter <- gsub(pattern = "[0-9]+", replacement = "", x = twitter)
# Convert each character vector into a one-column tibble (column `text`).
# `tibble()` is used because `data_frame()` is deprecated as of tibble 1.1.0.
news_df <- tibble(text = news)
blogs_df <- tibble(text = blogs)
twitter_df <- tibble(text = twitter)

# Tokenize each tibble into a tidy table with one word per row
# (column `word`).
news_tidy <- news_df %>% unnest_tokens(word, text)
blogs_tidy <- blogs_df %>% unnest_tokens(word, text)
twitter_tidy <- twitter_df %>% unnest_tokens(word, text)
I will perform a basic text analysis of the three text sets, including word frequencies and plots, using the tidytext and ggplot2 packages.
# Tabulate how often each word occurs in the news set, most frequent first.
count(news_tidy, word, sort = TRUE)
## # A tibble: 79,459 x 2
## word n
## <chr> <int>
## 1 the 151717
## 2 to 69757
## 3 and 68605
## 4 a 67426
## 5 of 59315
## 6 in 51895
## 7 for 27166
## 8 that 26384
## 9 is 21973
## 10 on 20815
## # ... with 79,449 more rows
The results above show that the highest-frequency words are stop words (e.g. "the", "to", "and"), which should be removed to get more meaningful results.
# Load the `stop_words` data set and drop every news token that appears in it.
# `by = "word"` names the join key explicitly instead of relying on the
# implicit natural join, which also prints a "Joining, by" message.
data("stop_words")
news_tidy <- news_tidy %>% anti_join(stop_words, by = "word")
# Recount the remaining words, most frequent first.
news_tidy %>% count(word, sort = TRUE)
## # A tibble: 78,758 x 2
## word n
## <chr> <int>
## 1 time 4474
## 2 people 3673
## 3 city 2902
## 4 school 2702
## 5 percent 2635
## 6 game 2591
## 7 day 2477
## 8 home 2438
## 9 million 2377
## 10 county 2262
## # ... with 78,748 more rows
# Drop every blog token that appears in `stop_words`.
# `by = "word"` names the join key explicitly instead of relying on the
# implicit natural join, which also prints a "Joining, by" message.
blogs_tidy <- blogs_tidy %>% anti_join(stop_words, by = "word")
# Recount the remaining words, most frequent first.
blogs_tidy %>% count(word, sort = TRUE)
## # A tibble: 296,711 x 2
## word n
## <chr> <int>
## 1 time 90918
## 2 people 59576
## 3 day 52378
## 4 love 45233
## 5 life 41251
## 6 it’s 38657
## 7 world 29305
## 8 i’m 29189
## 9 don’t 28389
## 10 book 28151
## # ... with 296,701 more rows
# Drop every Twitter token that appears in `stop_words`.
# `by = "word"` names the join key explicitly instead of relying on the
# implicit natural join, which also prints a "Joining, by" message.
twitter_tidy <- twitter_tidy %>% anti_join(stop_words, by = "word")
# Recount the remaining words, most frequent first.
twitter_tidy %>% count(word, sort = TRUE)
## # A tibble: 344,710 x 2
## word n
## <chr> <int>
## 1 love 106738
## 2 day 92800
## 3 rt 89557
## 4 time 76806
## 5 lol 70137
## 6 people 52043
## 7 happy 49009
## 8 follow 48117
## 9 tonight 44701
## 10 night 41446
## # ... with 344,700 more rows
# Build a horizontal bar chart of the most frequent words in a tidy token
# table.
#
# tidy_words: table with one token per row in a column named `word`.
# min_count:  only words occurring strictly more than this many times are
#             drawn.
# title:      plot title.
#
# Returns the ggplot object; calling the function at top level prints it.
plot_word_counts <- function(tidy_words, min_count, title) {
  tidy_words %>%
    count(word, sort = TRUE) %>%
    filter(n > min_count) %>%
    # Order the factor levels by count so the bars are sorted in the plot.
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n, fill = n)) +
    ggtitle(title) +
    theme(
      plot.title = element_text(color = "blue", size = 14, face = "bold.italic")
    ) +
    geom_col() +
    xlab(NULL) +
    # Flip the axes so the words are listed on the vertical axis.
    coord_flip()
}

plot_word_counts(
  news_tidy, 2000,
  "Word Frequency Count For The News Text Set"
)
plot_word_counts(
  blogs_tidy, 28000,
  "Word Frequency Count For The Blogs Text Set"
)
plot_word_counts(
  twitter_tidy, 41000,
  "Word Frequency Count For The Twitter Text Set"
)
Let's make some more plots; this time we will use the wordcloud package.
# Word cloud of up to 100 words from the news set.
# No stop-word join is needed here: the stop words were already removed when
# `news_tidy` was reassigned after the first frequency table.
news_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Warning in wordcloud(word, n, max.words = 100): police could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): city could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): months could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): found could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): tuesday could not be fit on
## page. It will not be plotted.
# Word cloud of up to 100 words from the blogs set.
# No stop-word join is needed here: the stop words were already removed when
# `blogs_tidy` was reassigned earlier.
blogs_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
# Word cloud of up to 100 words from the Twitter set.
# No stop-word join is needed here: the stop words were already removed when
# `twitter_tidy` was reassigned earlier.
twitter_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))