Introduction

In this project I will display my understanding of the data set and provide reproducible steps to download, tidy, summarize, and visualize the text data.

The Data

The data set is located here

#Setting up the work space

library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.6.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer

#Downloading, extracting, and reading the text data

# Read one corpus file as UTF-8 lines through a binary-mode connection.
#
# Why binary mode: the original text-mode read of the news file warned
# "incomplete final line found", and the news word counts in this report are
# far smaller than the blogs/twitter counts. That pattern matches text-mode
# readLines() on Windows stopping at an embedded SUB (0x1A) byte, i.e. the
# news corpus was presumably truncated -- TODO confirm by comparing
# length(news) before and after this change.
#
# NOTE: the news outputs printed later in this report (and the n > 2000 plot
# threshold) were produced from the text-mode read; re-knit to refresh them.
#
# path: location of the text file to read.
# Returns a character vector with one element per line; embedded NUL bytes
# are skipped (skipNul = TRUE).
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Release the connection even if readLines() fails
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("./final/en_US/en_US.blogs.txt")
news <- read_corpus("./final/en_US/en_US.news.txt")
twitter <- read_corpus("./final/en_US/en_US.twitter.txt")

Preparing the data for Analysis

# Strip runs of digits so numbers do not appear as tokens in the word counts
news <- gsub("[0-9]+", "", news)
blogs <- gsub("[0-9]+", "", blogs)
twitter <- gsub("[0-9]+", "", twitter)
# Wrap each character vector in a one-column tibble (column `text`, one row
# per line read). tibble() replaces data_frame(), which is deprecated as of
# tibble 1.1.0 and emitted a deprecation warning in the original run.
news_df <- tibble(text = news)
blogs_df <- tibble(text = blogs)
twitter_df <- tibble(text = twitter)
# Tokenize each tibble into tidy form: one row per word, in a column `word`
news_tidy <- news_df %>% unnest_tokens(word, text)
blogs_tidy <- blogs_df %>% unnest_tokens(word, text)
twitter_tidy <- twitter_df %>% unnest_tokens(word, text)

Analysis

I will perform a basic text analysis of the three text sets, covering word frequencies and plots, using the tidytext and ggplot2 packages.

# Tally every news token and list the most frequent ones first
count(news_tidy, word, sort = TRUE)
## # A tibble: 79,459 x 2
##    word       n
##    <chr>  <int>
##  1 the   151717
##  2 to     69757
##  3 and    68605
##  4 a      67426
##  5 of     59315
##  6 in     51895
##  7 for    27166
##  8 that   26384
##  9 is     21973
## 10 on     20815
## # ... with 79,449 more rows

We see in the results above that the highest frequency words are stop words that should be removed to get more meaningful results.

# Load the stop-word lexicon and drop its words from each tidy corpus, then
# print the remaining words ranked by frequency.
# The join key is given explicitly (by = "word") so the join does not depend
# on whichever columns the two tables happen to share, and no "Joining, by"
# message is emitted.
data("stop_words")
news_tidy <- news_tidy %>% anti_join(stop_words, by = "word")
news_tidy %>% count(word, sort = TRUE)
## # A tibble: 78,758 x 2
##    word        n
##    <chr>   <int>
##  1 time     4474
##  2 people   3673
##  3 city     2902
##  4 school   2702
##  5 percent  2635
##  6 game     2591
##  7 day      2477
##  8 home     2438
##  9 million  2377
## 10 county   2262
## # ... with 78,748 more rows
blogs_tidy <- blogs_tidy %>% anti_join(stop_words, by = "word")
# NOTE(review): contractions written with a typographic apostrophe ("it’s",
# "i’m", "don’t") still appear in the blogs output below, presumably because
# the stop-word lexicon spells them with a plain ASCII apostrophe. Consider
# normalising apostrophes before tokenizing -- TODO confirm.
blogs_tidy %>% count(word, sort = TRUE)
## # A tibble: 296,711 x 2
##    word       n
##    <chr>  <int>
##  1 time   90918
##  2 people 59576
##  3 day    52378
##  4 love   45233
##  5 life   41251
##  6 it’s   38657
##  7 world  29305
##  8 i’m    29189
##  9 don’t  28389
## 10 book   28151
## # ... with 296,701 more rows
twitter_tidy <- twitter_tidy %>% anti_join(stop_words, by = "word")
twitter_tidy %>% count(word, sort = TRUE)
## # A tibble: 344,710 x 2
##    word         n
##    <chr>    <int>
##  1 love    106738
##  2 day      92800
##  3 rt       89557
##  4 time     76806
##  5 lol      70137
##  6 people   52043
##  7 happy    49009
##  8 follow   48117
##  9 tonight  44701
## 10 night    41446
## # ... with 344,700 more rows

Visualizing the Words

# Horizontal bar chart of the most frequent words in one tidy corpus.
#
# The three charts previously repeated the same pipeline verbatim; they now
# share this helper so the layout is defined once.
#
# tidy_words:   tibble with one token per row in a column named `word`.
# min_count:    only words occurring more than this many times are drawn.
# source_label: corpus name inserted into the plot title.
# Returns the ggplot object; calling it at top level draws the chart.
plot_word_frequency <- function(tidy_words, min_count, source_label) {
  plot_title <- paste("Word Frequency Count For The", source_label, "Text Set")
  tidy_words %>%
    count(word, sort = TRUE) %>%
    filter(n > min_count) %>%
    # Order the factor levels by count so bars are sorted by frequency
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n, fill = n)) +
    ggtitle(plot_title) +
    theme(
      plot.title = element_text(color = "blue", size = 14, face = "bold.italic")
    ) +
    geom_col() +
    xlab(NULL) +
    coord_flip()
}

plot_word_frequency(news_tidy, 2000, "News")

plot_word_frequency(blogs_tidy, 28000, "Blogs")

plot_word_frequency(twitter_tidy, 41000, "Twitter")

Let's make some more plots; this time we will use the wordcloud package.

# Word cloud of up to 100 news words, sized by frequency.
# Stop words were already removed from news_tidy earlier in this report, so
# the redundant second anti_join (and its "Joining" message) is dropped.
# Per the warnings below, words that cannot be placed are left out of the plot.
news_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Warning in wordcloud(word, n, max.words = 100): police could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): city could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): months could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): found could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): tuesday could not be fit on
## page. It will not be plotted.

# Word cloud of up to 100 blog words, sized by frequency.
# Stop words were already removed from blogs_tidy earlier in this report, so
# the redundant second anti_join (and its "Joining" message) is dropped.
blogs_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# Word cloud of up to 100 twitter words, sized by frequency.
# Stop words were already removed from twitter_tidy earlier in this report,
# so the redundant second anti_join (and its "Joining" message) is dropped.
twitter_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))