Data Science Capstone Project

The goal of this project is to analyse three data sets: the en_US blogs, news, and Twitter text files.

This document was rendered on August 27, 2019 at 00:57:04.

The approach will be:

Step 1: Read the text data

Step 2: Analyse the key attributes of the data

Step 3: Tokenise the data sets

Step 4: Summarise the data (a sketch of Steps 3 and 4 follows this list)

Step 5: Produce graphs

Step 6: Build word clouds
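As an illustration of Steps 3 and 4, here is a minimal tidytext sketch on a small sample. The 1,000-line sample size and the object names are illustrative only; the full pipeline is in the Appendix.

library(dplyr)
library(tidytext)

## read a small sample of one file (sample size chosen only for illustration)
sample_lines <- readLines("en_US.blogs.txt", n = 1000, encoding = "UTF-8")

## tokenise to one word per row, drop stop words, and count word frequencies
sample_counts <- tibble::tibble(text = sample_lines) %>%
        unnest_tokens(word, text) %>%
        anti_join(stop_words, by = "word") %>%
        count(word, sort = TRUE)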

## [1] "By reviewing words with top 5 frequency, sees similar words in the three files"
## [1] "Top 5 Blog data"
## # A tibble: 5 x 2
##   word       n
##   <chr>  <int>
## 1 time   90636
## 2 people 60435
## 3 day    52412
## 4 love   45142
## 5 life   41497
## [1] "Top 5 News data"
## # A tibble: 5 x 2
##   word       n
##   <chr>  <int>
## 1 time   57048
## 2 people 47829
## 3 city   38611
## 4 school 35797
## 5 game   34995
## [1] "Top 5 Twitter data"
## # A tibble: 5 x 2
##   word       n
##   <chr>  <int>
## 1 love   99209
## 2 day    85963
## 3 time   71571
## 4 lol    67232
## 5 people 47970

Next Step: Predictive Analysis

The next step is to use the n-gram and backoff models built in previous tasks to build and evaluate a predictive model. The goal is to make the model both efficient and accurate.
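A minimal sketch of that approach, assuming a bigram table built with tidytext and a simple backoff to unigram frequencies. The function names, the input column name text, and the top-3 cutoff are illustrative, not the final model.

library(dplyr)
library(tidyr)
library(tidytext)

## build bigram and unigram frequency tables from a data frame with a text column
build_ngrams <- function(text_df) {
        bigrams <- text_df %>%
                unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
                separate(bigram, c("w1", "w2"), sep = " ") %>%
                count(w1, w2, sort = TRUE)
        unigrams <- text_df %>%
                unnest_tokens(word, text) %>%
                count(word, sort = TRUE)
        list(bigrams = bigrams, unigrams = unigrams)
}

## predict the next word from the bigram table, backing off to the most
## frequent unigrams when the context word has not been seen
predict_next <- function(model, context, k = 3) {
        hits <- model$bigrams %>% filter(w1 == context)
        if (nrow(hits) > 0) return(head(hits$w2, k))
        head(model$unigrams$word, k)
}

In practice the tables would be built from all three sources, higher-order n-grams would be added, and the model would be evaluated for prediction accuracy and lookup speed.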

Appendix

## start with a clean environment
rm(list = ls())

## loading the required libraries
library(dplyr)
library(data.table)
library(purrr)
library(tidytext)
library(stringr)
library(tm)
library(NLP)
library(R.utils)
library(sp)
library(ggplot2)
library(gdata)
library(readtext)
library(DT)
library(wordcloud)
library(reshape2)
library(ggpubr)

## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##Creating a dataframe for storing exploratory data
ex_data <- data.frame(matrix(vector(), 0, 3,
                             dimnames=list(c(), c("File", "Attribute", "Value"))),
                      stringsAsFactors=F)

##checking the file size
##us_blog
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "file size blog",
                               Value=humanReadable(c(file.info("en_US.blogs.txt")$size), 
                                                   width=4, units="auto", standard=c("SI")))
##us_news
ex_data <- ex_data %>% add_row(File = "news", Attribute = "file size news",
                               Value=humanReadable(c(file.info("en_US.news.txt")$size),
                                                   width=4, units="auto", standard=c("SI")))
##us_twitter
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "file size twitter",
                               Value=humanReadable(c(file.info("en_US.twitter.txt")$size), 
                                                   width=4, units="auto", standard=c("SI")))

##reading the files
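## quote = "" disables quote handling so stray quotation marks in the text
## do not break the line parsing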
blog <- read.table("en_US.blogs.txt", 
                      header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
                      nrows = -1, row.names = NULL
                      ,col.names=c("blog"),
                      stringsAsFactors = FALSE) %>% mutate(line = row_number())
news <- read.table("en_US.news.txt", 
                      header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
                      nrows = -1, row.names = NULL
                      ,col.names=c("news"),
                      stringsAsFactors = FALSE) %>% mutate(line = row_number())
twitter <- read.table("en_US.twitter.txt", 
                         header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
                         nrows = -1, row.names = NULL
                         ,col.names=c("twitter"),
                         stringsAsFactors = FALSE)  %>% mutate(line = row_number())

## printing the values
datatable(as.matrix(ex_data))
##checking the number of lines in each file
##us_blog
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "lines in blog",
                               Value=length(blog[[1]]))
##us_news
ex_data <- ex_data %>% add_row(File = "news", Attribute = "lines in news",
                               Value=length(news[[1]]))
##us_twitter
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "lines in twitter",
                               Value=length(twitter[[1]]))

## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##finding the length of the longest line in each data set
##us_blog
blog_con <- file("en_US.blogs.txt","r")
blog_lenn <- nchar(readLines(blog_con))
close(blog_con)
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "length of longest line in blog",
                               Value=max(blog_lenn))
##us_news
news_con <- file("en_US.news.txt","r")
news_lenn <- nchar(readLines(news_con))
close(news_con)
ex_data <- ex_data %>% add_row(File = "news", Attribute = "length of longest line in news",
                               Value=max(news_lenn))
##us_twitter
twitter_con <- file("en_US.twitter.txt","r")
twitter_lenn <- nchar(readLines(twitter_con))
close(twitter_con)
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "length of longest line in twitter",
                               Value=max(twitter_lenn))

## delete variables not required
rm(blog_con,blog_lenn,news_con,news_lenn,twitter_con,twitter_lenn)

## printing the values
datatable(as.matrix(ex_data))
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##patterns to clean from the text
exp1 <- "https://t.co/[A-Za-z\\d]+"                      ## shortened twitter links
exp2 <- "http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"   ## other links, HTML entities, retweet markers
exp3 <- "[^[:alnum:]]"                                   ## non-alphanumeric characters
exp4 <- "[[:punct:]]"                                    ## punctuation
exp5 <- "[0-9]+"                                         ## digits
## combine the patterns into a single alternation
replace_exp <- paste(exp1, exp2, exp3, exp4, exp5, sep = "|")
## tokenising pattern: keep words, hashtags, mentions and internal apostrophes
unnest_exp <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

## Restructuring it in the one-token-per-row format 
blog_token <- blog %>% 
        filter(!str_detect(blog, "^RT")) %>%
        mutate(blog = str_replace_all(blog, replace_exp, " ")) %>%
        unnest_tokens(word, blog, token = "regex", pattern = unnest_exp) %>% 
        anti_join(stop_words) %>%
        filter(!word %in% stop_words$word,
               str_detect(word, "[a-z]"))
news_token <- news %>% 
        filter(!str_detect(news, "^RT")) %>%
        mutate(news = str_replace_all(news, replace_exp, " ")) %>%
        unnest_tokens(word, news, token = "regex", pattern = unnest_exp) %>% 
        anti_join(stop_words) %>%
        filter(!word %in% stop_words$word,
               str_detect(word, "[a-z]"))
twitter_token <- twitter %>%
        filter(!str_detect(twitter, "^RT")) %>%
        mutate(twitter = str_replace_all(twitter, replace_exp, " ")) %>%
        unnest_tokens(word, twitter, token = "regex", pattern = unnest_exp) %>% 
        anti_join(stop_words) %>%
        filter(!word %in% stop_words$word,
               str_detect(word, "[a-z]"))

## Counting word usage frequencies in each data set
blog_token_count <- blog_token %>% count(word, sort = TRUE)
news_token_count <- news_token %>% count(word, sort = TRUE)
twitter_token_count <- twitter_token %>% count(word, sort = TRUE)

## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

## Finding the lines that contain the words love and hate
love <- sapply(colnames(twitter), function(x) grep("love", twitter[,x], ignore.case = FALSE,fixed = TRUE))
hate <- sapply(colnames(twitter), function(x) grep("hate", twitter[,x], ignore.case = FALSE,fixed = TRUE))

## storing into ex_data dataframe
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "love usage in twitter",
                               Value=length(love[[1]]))
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "hate usage in twitter",
                               Value=length(hate[[1]]))
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "hate usage in twitter",
                               Value=length(love[[1]])/length(hate[[1]]))

## finding the line with the word biostats and counting an exact phrase
biostats<- sapply(colnames(twitter), function(x) grep("biostats", twitter[,x], ignore.case = FALSE,fixed = TRUE))
exact_character <- sapply(colnames(twitter), function(x) grep("A computer once beat me at chess, but it was no match for me at kickboxing", twitter[,x], ignore.case = FALSE,fixed = TRUE))
## storing into ex_data dataframe
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "line with word biostats in twitter",
                               Value=twitter[biostats$twitter,])
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "occurence of a phrase in twitter",
                               Value=length(exact_character$twitter))

## delete variables not required
rm(biostats,love,hate,exact_character,exp1,exp2,exp3,exp4,exp5,replace_exp,unnest_exp)

## printing the values
datatable(as.matrix(ex_data))
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##Creating plot for each data set
##blog plot
blog_plot <- blog_token_count %>%
        filter(n > 25000) %>%
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_bar(stat="identity", fill="lightsteelblue3", colour="lightsteelblue3") +
        #geom_col() +
        xlab(NULL) +
        coord_flip()

##news plot
news_plot <- news_token_count %>%
        filter(n > 25000) %>%
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_bar(stat="identity", fill="darkolivegreen4", colour="darkolivegreen4") +
        #geom_col() +
        xlab(NULL) +
        coord_flip()

##twitter plot
twitter_plot <- twitter_token_count %>%
        filter(n > 25000) %>%
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_bar(stat="identity", fill="indianred3", colour="indianred3") +
        #geom_col() +
        xlab(NULL) +
        coord_flip()

## merging the three plots together
figure <- ggarrange(blog_plot, news_plot, twitter_plot,
                    labels = c("Blog", "News", "Twitter"),
                    ncol = 3, nrow = 1)

## delete variables not required
rm(blog_plot, news_plot, twitter_plot)

figure
## Creating a wordcloud
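## two clouds per data set: a frequency word cloud, and a comparison cloud
## that joins the counts with the Bing sentiment lexicon and splits the words
## into negative (indianred3) and positive (lightsteelblue3) halves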
options(warn=-1) ## switch off warnings
## creating subset of data for wordcloud
blog_newdata <- subset(blog_token_count, n >= 5000 )
news_newdata <- subset(news_token_count, n >= 6000 )
twitter_newdata <- subset(twitter_token_count, n >= 5000 )

##blog
blog_wordcloud <- blog_newdata %>% 
        with(wordcloud(word, n, max.words = Inf,rot.per=0.35,
                       fixed.asp=TRUE,colors=brewer.pal(8, "Dark2")))

blog_wordcloud_sent <- blog_newdata %>%
        inner_join(get_sentiments("bing")) %>%
        count(word, sentiment, sort = TRUE) %>%
        acast(word ~ sentiment, value.var = "n", fill = 0) %>%
        comparison.cloud(colors = c("indianred3","lightsteelblue3"),
                         max.words = Inf,rot.per=0.35, title.size=1)

blog_wordcloud
blog_wordcloud_sent

##news
news_wordcloud <- news_newdata %>% 
        with(wordcloud(word, n, max.words = Inf,rot.per=0.35,
                       fixed.asp=TRUE,colors=brewer.pal(8, "Dark2")))

news_wordcloud_sent <- news_newdata %>%
        inner_join(get_sentiments("bing")) %>%
        count(word, sentiment, sort = TRUE) %>%
        acast(word ~ sentiment, value.var = "n", fill = 0) %>%
        comparison.cloud(colors = c("indianred3","lightsteelblue3"),
                         max.words = Inf,rot.per=0.35, title.size=1)

news_wordcloud
news_wordcloud_sent

##twitter

twitter_wordcloud <- twitter_newdata %>% 
        with(wordcloud(word, n, max.words = Inf,rot.per=0.35,
    fixed.asp=TRUE,colors=brewer.pal(8, "Dark2")))

twitter_wordcloud_sent <- twitter_newdata %>%
        inner_join(get_sentiments("bing")) %>%
        count(word, sentiment, sort = TRUE) %>%
        acast(word ~ sentiment, value.var = "n", fill = 0) %>%
        comparison.cloud(colors = c("indianred3","lightsteelblue3"),
                         max.words = Inf,rot.per=0.35, title.size=1)

twitter_wordcloud
twitter_wordcloud_sent

options(warn=0) ##switch on warnings

## Drawing conclusions

print("By reviewing words with top 5 frequency, sees similar words in the three files")
print("Top 5 Blog data")
head(blog_token_count,5)
print("Top 5 News data")
head(news_token_count,5)
print("Top 5 Twitter data")
head(twitter_token_count,5)