Data Science Capstone Project

The goal of this project is to analyse three data sets: the en_US blogs, news, and Twitter text files.

This document was rendered on August 27, 2019 at 00:57:04.

The approach will be:

Step 1: Read the text data

Step 2: Analyse the key attributes of the data

Step 3: Tokenise the data sets

Step 4: Summarise the data (a sketch of Steps 3 and 4 follows this list)

Step 5: Produce graphs

Step 6: Build word clouds
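As an illustration of Steps 3 and 4, here is a minimal tidytext sketch on a small sample. The 1,000-line sample size and the object names are illustrative only; the full pipeline is in the Appendix.

library(dplyr)
library(tidytext)

## read a small sample of one file (sample size chosen only for illustration)
sample_lines <- readLines("en_US.blogs.txt", n = 1000, encoding = "UTF-8")

## tokenise to one word per row, drop stop words, and count word frequencies
sample_counts <- tibble::tibble(text = sample_lines) %>%
        unnest_tokens(word, text) %>%
        anti_join(stop_words, by = "word") %>%
        count(word, sort = TRUE)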

## [1] "By reviewing words with top 5 frequency, sees similar words in the three files"
## [1] "Top 5 Blog data"
## # A tibble: 5 x 2
##   word       n
##   <chr>  <int>
## 1 time   90636
## 2 people 60435
## 3 day    52412
## 4 love   45142
## 5 life   41497
## [1] "Top 5 News data"
## # A tibble: 5 x 2
##   word       n
##   <chr>  <int>
## 1 time   57048
## 2 people 47829
## 3 city   38611
## 4 school 35797
## 5 game   34995
## [1] "Top 5 Twitter data"
## # A tibble: 5 x 2
##   word       n
##   <chr>  <int>
## 1 love   99209
## 2 day    85963
## 3 time   71571
## 4 lol    67232
## 5 people 47970

Next Step: Predictive Analysis

The next step is to use the n-gram and backoff models built in previous tasks to build and evaluate a predictive model. The goal is to make the model both efficient and accurate.
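A minimal sketch of that approach, assuming a bigram table built with tidytext and a simple backoff to unigram frequencies. The function names, the input column name text, and the top-3 cutoff are illustrative, not the final model.

library(dplyr)
library(tidyr)
library(tidytext)

## build bigram and unigram frequency tables from a data frame with a text column
build_ngrams <- function(text_df) {
        bigrams <- text_df %>%
                unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
                separate(bigram, c("w1", "w2"), sep = " ") %>%
                count(w1, w2, sort = TRUE)
        unigrams <- text_df %>%
                unnest_tokens(word, text) %>%
                count(word, sort = TRUE)
        list(bigrams = bigrams, unigrams = unigrams)
}

## predict the next word from the bigram table, backing off to the most
## frequent unigrams when the context word has not been seen
predict_next <- function(model, context, k = 3) {
        hits <- model$bigrams %>% filter(w1 == context)
        if (nrow(hits) > 0) return(head(hits$w2, k))
        head(model$unigrams$word, k)
}

In practice the tables would be built from all three sources, higher-order n-grams would be added, and the model would be evaluated for prediction accuracy and lookup speed.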

Appendix

## start with a clean environment
rm(list = ls())

## loading the required libraries
library(dplyr)
library(data.table)
library(purrr)
library(tidytext)
library(stringr)
library(tm)
library(NLP)
library(R.utils)
library(sp)
library(ggplot2)
library(gdata)
library(readtext)
library(DT)
library(wordcloud)
library(reshape2)
library(ggpubr)

## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##Creating a dataframe for storing exploratory data
ex_data <- data.frame(matrix(vector(), 0, 3,
                             dimnames=list(c(), c("File", "Attribute", "Value"))),
                      stringsAsFactors=F)

##checking the file size
##us_blog
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "file size blog",
                               Value=humanReadable(c(file.info("en_US.blogs.txt")$size), 
                                                   width=4, units="auto", standard=c("SI")))
##us_news
ex_data <- ex_data %>% add_row(File = "news", Attribute = "file size news",
                               Value=humanReadable(c(file.info("en_US.news.txt")$size),
                                                   width=4, units="auto", standard=c("SI")))
##us_twitter
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "file size twitter",
                               Value=humanReadable(c(file.info("en_US.twitter.txt")$size), 
                                                   width=4, units="auto", standard=c("SI")))

##reading the files
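## quote = "" disables quote handling so stray quotation marks in the text
## do not break the line parsing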
blog <- read.table("en_US.blogs.txt", 
                      header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
                      nrows = -1, row.names = NULL
                      ,col.names=c("blog"),
                      stringsAsFactors = FALSE) %>% mutate(line = row_number())
news <- read.table("en_US.news.txt", 
                      header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
                      nrows = -1, row.names = NULL
                      ,col.names=c("news"),
                      stringsAsFactors = FALSE) %>% mutate(line = row_number())
twitter <- read.table("en_US.twitter.txt", 
                         header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
                         nrows = -1, row.names = NULL
                         ,col.names=c("twitter"),
                         stringsAsFactors = FALSE)  %>% mutate(line = row_number())

## printing the values
datatable(as.matrix(ex_data))
##checking the number of lines in each file
##us_blog
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "lines in blog",
                               Value=length(blog[[1]]))
##us_news
ex_data <- ex_data %>% add_row(File = "news", Attribute = "lines in news",
                               Value=length(news[[1]]))
##us_twitter
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "lines in twitter",
                               Value=length(twitter[[1]]))

## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##finding the length of the longest line in each data set
##us_blog
blog_con <- file("en_US.blogs.txt","r")
blog_lenn <- nchar(readLines(blog_con))
close(blog_con)
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "length of longest line in blog",
                               Value=max(blog_lenn))
##us_news
news_con <- file("en_US.news.txt","r")
news_lenn <- nchar(readLines(news_con))
close(news_con)
ex_data <- ex_data %>% add_row(File = "news", Attribute = "length of longest line in news",
                               Value=max(news_lenn))
##us_twitter
twitter_con <- file("en_US.twitter.txt","r")
twitter_lenn <- nchar(readLines(twitter_con))
close(twitter_con)
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "length of longest line in twitter",
                               Value=max(twitter_lenn))

## delete variables not required
rm(blog_con,blog_lenn,news_con,news_lenn,twitter_con,twitter_lenn)

## printing the values
datatable(as.matrix(ex_data))
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##patterns to clean from the text
exp1 <- "https://t.co/[A-Za-z\\d]+"                      ## shortened twitter links
exp2 <- "http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"   ## other links, HTML entities, retweet markers
exp3 <- "[^[:alnum:]]"                                   ## non-alphanumeric characters
exp4 <- "[[:punct:]]"                                    ## punctuation
exp5 <- "[0-9]+"                                         ## digits
## combine the patterns into a single alternation
replace_exp <- paste(exp1, exp2, exp3, exp4, exp5, sep = "|")
## tokenising pattern: keep words, hashtags, mentions and internal apostrophes
unnest_exp <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

## Restructuring it in the one-token-per-row format 
blog_token <- blog %>% 
        filter(!str_detect(blog, "^RT")) %>%
        mutate(blog = str_replace_all(blog, replace_exp, " ")) %>%
        unnest_tokens(word, blog, token = "regex", pattern = unnest_exp) %>% 
        anti_join(stop_words) %>%
        filter(!word %in% stop_words$word,
               str_detect(word, "[a-z]"))
news_token <- news %>% 
        filter(!str_detect(news, "^RT")) %>%
        mutate(news = str_replace_all(news, replace_exp, " ")) %>%
        unnest_tokens(word, news, token = "regex", pattern = unnest_exp) %>% 
        anti_join(stop_words) %>%
        filter(!word %in% stop_words$word,
               str_detect(word, "[a-z]"))
twitter_token <- twitter %>%
        filter(!str_detect(twitter, "^RT")) %>%
        mutate(twitter = str_replace_all(twitter, replace_exp, " ")) %>%
        unnest_tokens(word, twitter, token = "regex", pattern = unnest_exp) %>% 
        anti_join(stop_words) %>%
        filter(!word %in% stop_words$word,
               str_detect(word, "[a-z]"))

## Counting word usage frequencies in each data set
blog_token_count <- blog_token %>% count(word, sort = TRUE)
news_token_count <- news_token %>% count(word, sort = TRUE)
twitter_token_count <- twitter_token %>% count(word, sort = TRUE)

## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

## Finding the lines that contain the words love and hate
love <- sapply(colnames(twitter), function(x) grep("love", twitter[,x], ignore.case = FALSE,fixed = TRUE))
hate <- sapply(colnames(twitter), function(x) grep("hate", twitter[,x], ignore.case = FALSE,fixed = TRUE))

## storing into ex_data dataframe
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "love usage in twitter",
                               Value=length(love[[1]]))
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "hate usage in twitter",
                               Value=length(hate[[1]]))
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "hate usage in twitter",
                               Value=length(love[[1]])/length(hate[[1]]))

## finding the line with the word biostats and counting an exact phrase
biostats<- sapply(colnames(twitter), function(x) grep("biostats", twitter[,x], ignore.case = FALSE,fixed = TRUE))
exact_character <- sapply(colnames(twitter), function(x) grep("A computer once beat me at chess, but it was no match for me at kickboxing", twitter[,x], ignore.case = FALSE,fixed = TRUE))
## storing into ex_data dataframe
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "line with word biostats in twitter",
                               Value=twitter[biostats$twitter,])
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "occurence of a phrase in twitter",
                               Value=length(exact_character$twitter))

## delete variables not required
rm(biostats,love,hate,exact_character,exp1,exp2,exp3,exp4,exp5,replace_exp,unnest_exp)

## printing the values
datatable(as.matrix(ex_data))
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")

##Creating plot for each data set
##blog plot
blog_plot <- blog_token_count %>%
        filter(n > 25000) %>%
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_bar(stat="identity", fill="lightsteelblue3", colour="lightsteelblue3") +
        #geom_col() +
        xlab(NULL) +
        coord_flip()

##news plot
news_plot <- news_token_count %>%
        filter(n > 25000) %>%
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_bar(stat="identity", fill="darkolivegreen4", colour="darkolivegreen4") +
        #geom_col() +
        xlab(NULL) +
        coord_flip()

##twitter plot
twitter_plot <- twitter_token_count %>%
        filter(n > 25000) %>%
        mutate(word = reorder(word, n)) %>%
        ggplot(aes(word, n)) +
        geom_bar(stat="identity", fill="indianred3", colour="indianred3") +
        #geom_col() +
        xlab(NULL) +
        coord_flip()

## merging the three plots together
figure <- ggarrange(blog_plot, news_plot, twitter_plot,
                    labels = c("Blog", "News", "Twitter"),
                    ncol = 3, nrow = 1)

## delete variables not required
rm(blog_plot, news_plot, twitter_plot)

figure
## Creating a wordcloud
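## two clouds per data set: a frequency word cloud, and a comparison cloud
## that joins the counts with the Bing sentiment lexicon and splits the words
## into negative (indianred3) and positive (lightsteelblue3) halves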
options(warn=-1) ## switch off warnings
## creating subset of data for wordcloud
blog_newdata <- subset(blog_token_count, n >= 5000 )
news_newdata <- subset(news_token_count, n >= 6000 )
twitter_newdata <- subset(twitter_token_count, n >= 5000 )

##blog
blog_wordcloud <- blog_newdata %>% 
        with(wordcloud(word, n, max.words = Inf,rot.per=0.35,
                       fixed.asp=TRUE,colors=brewer.pal(8, "Dark2")))

blog_wordcloud_sent <- blog_newdata %>%
        inner_join(get_sentiments("bing")) %>%
        count(word, sentiment, sort = TRUE) %>%
        acast(word ~ sentiment, value.var = "n", fill = 0) %>%
        comparison.cloud(colors = c("indianred3","lightsteelblue3"),
                         max.words = Inf,rot.per=0.35, title.size=1)

blog_wordcloud
blog_wordcloud_sent

##news
news_wordcloud <- news_newdata %>% 
        with(wordcloud(word, n, max.words = Inf,rot.per=0.35,
                       fixed.asp=TRUE,colors=brewer.pal(8, "Dark2")))

news_wordcloud_sent <- news_newdata %>%
        inner_join(get_sentiments("bing")) %>%
        count(word, sentiment, sort = TRUE) %>%
        acast(word ~ sentiment, value.var = "n", fill = 0) %>%
        comparison.cloud(colors = c("indianred3","lightsteelblue3"),
                         max.words = Inf,rot.per=0.35, title.size=1)

news_wordcloud
news_wordcloud_sent

##twitter

twitter_wordcloud <- twitter_newdata %>% 
        with(wordcloud(word, n, max.words = Inf,rot.per=0.35,
    fixed.asp=TRUE,colors=brewer.pal(8, "Dark2")))

twitter_wordcloud_sent <- twitter_newdata %>%
        inner_join(get_sentiments("bing")) %>%
        count(word, sentiment, sort = TRUE) %>%
        acast(word ~ sentiment, value.var = "n", fill = 0) %>%
        comparison.cloud(colors = c("indianred3","lightsteelblue3"),
                         max.words = Inf,rot.per=0.35, title.size=1)

twitter_wordcloud
twitter_wordcloud_sent

options(warn=0) ##switch on warnings

## Drawing conclusions

print("By reviewing words with top 5 frequency, sees similar words in the three files")
print("Top 5 Blog data")
head(blog_token_count,5)
print("Top 5 News data")
head(news_token_count,5)
print("Top 5 Twitter data")
head(twitter_token_count,5)