The goal of this project is to analyse three data sets: en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt.
This document was rendered on August 27, 2019 at 00:57:04.
## [1] "By reviewing words with top 5 frequency, sees similar words in the three files"
## [1] "Top 5 Blog data"
## # A tibble: 5 x 2
##   word        n
##   <chr>   <int>
## 1 time    90636
## 2 people  60435
## 3 day     52412
## 4 love    45142
## 5 life    41497
## [1] "Top 5 News data"
## # A tibble: 5 x 2
##   word        n
##   <chr>   <int>
## 1 time    57048
## 2 people  47829
## 3 city    38611
## 4 school  35797
## 5 game    34995
## [1] "Top 5 Twitter data"
## # A tibble: 5 x 2
##   word        n
##   <chr>   <int>
## 1 love    99209
## 2 day     85963
## 3 time    71571
## 4 lol     67232
## 5 people  47970
The next step is to use the n-gram and backoff models built in the previous tasks to build and evaluate the predictive model. The goal is to make the model both efficient and accurate.
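As a rough sketch of the intended backoff behaviour, the example below looks up the last two words of a phrase in a trigram table and falls back to a bigram table (and finally a single common word) on a miss. The table names trigram_freq and bigram_freq and their columns w1, w2, w3 and n are assumptions standing in for the frequency tables built in the previous tasks.

## minimal backoff sketch; trigram_freq and bigram_freq are assumed n-gram frequency tables
predict_next <- function(prev1, prev2, trigram_freq, bigram_freq) {
  ## try the trigram table first: most frequent w3 following (prev1, prev2)
  hit <- trigram_freq %>% filter(w1 == prev1, w2 == prev2) %>% arrange(desc(n))
  if (nrow(hit) > 0) return(hit$w3[1])
  ## back off to the bigram table: most frequent w2 following prev2
  hit <- bigram_freq %>% filter(w1 == prev2) %>% arrange(desc(n))
  if (nrow(hit) > 0) return(hit$w2[1])
  ## final fallback: a single assumed high-frequency word
  "the"
}
## example call: predict_next("happy", "new", trigram_freq, bigram_freq)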
## start with a clean environment
rm(list = ls())
## loading the required libraries
library(dplyr)
library(data.table)
library(purrr)
library(tidytext)
library(stringr)
library(tm)
library(NLP)
library(R.utils)
library(sp)
library(ggplot2)
library(gdata)
library(readtext)
library(DT)
library(wordcloud)
library(reshape2)
library(ggpubr)
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")
##Creating a dataframe for storing exploratory data
ex_data <- data.frame(matrix(vector(), 0, 3,
dimnames=list(c(), c("File", "Attribute", "Value"))),
stringsAsFactors=F)
##checking the file size
##us_blog
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "file size blog",
Value=humanReadable(c(file.info("en_US.blogs.txt")$size),
width=4, units="auto", standard=c("SI")))
##us_news
ex_data <- ex_data %>% add_row(File = "news", Attribute = "file size news",
Value=humanReadable(c(file.info("en_US.news.txt")$size),
width=4, units="auto", standard=c("SI")))
##us_twitter
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "file size twitter",
Value=humanReadable(c(file.info("en_US.twitter.txt")$size),
width=4, units="auto", standard=c("SI")))
##reading the files
blog <- read.table("en_US.blogs.txt",
header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
nrows = -1, row.names = NULL
,col.names=c("blog"),
stringsAsFactors = FALSE) %>% mutate(line = row_number())
news <- read.table("en_US.news.txt",
header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
nrows = -1, row.names = NULL
,col.names=c("news"),
stringsAsFactors = FALSE) %>% mutate(line = row_number())
twitter <- read.table("en_US.twitter.txt",
header = FALSE, encoding = "UTF-8", sep = "\t", quote = "",
nrows = -1, row.names = NULL
,col.names=c("twitter"),
stringsAsFactors = FALSE) %>% mutate(line = row_number())
## printing the values
datatable(as.matrix(ex_data))
##checking the number of lines in each file
##us_blog
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "lines in blog",
Value=length(blog[[1]]))
##us_news
ex_data <- ex_data %>% add_row(File = "news", Attribute = "lines in news",
Value=length(news[[1]]))
##us_twitter
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "lines in twitter",
Value=length(twitter[[1]]))
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")
##finding the length of the longest line in each data set
##us_blog
blog_con <- file("en_US.blogs.txt","r")
blog_lenn <- nchar(readLines(blog_con))
close(blog_con)
ex_data <- ex_data %>% add_row(File = "blogs", Attribute = "length of longest line in blog",
Value=max(blog_lenn))
##us_news
news_con <- file("en_US.news.txt","r")
news_lenn <- nchar(readLines(news_con))
close(news_con)
ex_data <- ex_data %>% add_row(File = "news", Attribute = "length of longest line in news",
Value=max(news_lenn))
##us_twitter
twitter_con <- file("en_US.twitter.txt","r")
twitter_lenn <- nchar(readLines(twitter_con))
close(twitter_con)
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "length of longest line in twitter",
Value=max(twitter_lenn))
## delete variables not required
rm(blog_con,blog_lenn,news_con,news_lenn,twitter_con,twitter_lenn)
## printing the values
datatable(as.matrix(ex_data))
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")
##regular expression patterns used to clean up the data
exp1 <- "https://t.co/[A-Za-z\\d]+|"
exp2 <- "http://[A-Za-z\\d]+|&|<|>|RT|https"
exp3 <- "[^[:alnum:]]"
exp4 <- "[[:punct:]]"
exp5 <- "[0-9]+"
replace_exp <- paste0(exp1, paste(exp2, exp3, exp4, exp5, sep = "|")) ## join the patterns into one alternation
unnest_exp <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
## Restructuring it in the one-token-per-row format
blog_token <- blog %>%
filter(!str_detect(blog, "^RT")) %>%
mutate(blog = str_replace_all(blog, replace_exp, " ")) %>%
unnest_tokens(word, blog, token = "regex", pattern = unnest_exp) %>%
anti_join(stop_words) %>%
filter(!word %in% stop_words$word,
str_detect(word, "[a-z]"))
news_token <- news %>%
filter(!str_detect(news, "^RT")) %>%
mutate(news = str_replace_all(news, replace_exp, " ")) %>%
unnest_tokens(word, news, token = "regex", pattern = unnest_exp) %>%
anti_join(stop_words) %>%
filter(!word %in% stop_words$word,
str_detect(word, "[a-z]"))
twitter_token <- twitter %>%
filter(!str_detect(twitter, "^RT")) %>%
mutate(twitter = str_replace_all(twitter, replace_exp, " ")) %>%
unnest_tokens(word, twitter, token = "regex", pattern = unnest_exp) %>%
anti_join(stop_words) %>%
filter(!word %in% stop_words$word,
str_detect(word, "[a-z]"))
## Counting how often each word is used
blog_token_count <- blog_token %>% count(word, sort = TRUE)
news_token_count <- news_token %>% count(word, sort = TRUE)
twitter_token_count <- twitter_token %>% count(word, sort = TRUE)
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")
## Counting the lines that contain the words love and hate
love <- sapply(colnames(twitter), function(x) grep("love", twitter[,x], ignore.case = FALSE,fixed = TRUE))
hate <- sapply(colnames(twitter), function(x) grep("hate", twitter[,x], ignore.case = FALSE,fixed = TRUE))
## storing into ex_data dataframe
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "love usage in twitter",
Value=length(love[[1]]))
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "hate usage in twitter",
Value=length(hate[[1]]))
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "hate usage in twitter",
Value=length(love[[1]])/length(hate[[1]]))
## finding the line with the word biostats and the occurrence of an exact phrase
biostats<- sapply(colnames(twitter), function(x) grep("biostats", twitter[,x], ignore.case = FALSE,fixed = TRUE))
exact_character <- sapply(colnames(twitter), function(x) grep("A computer once beat me at chess, but it was no match for me at kickboxing", twitter[,x], ignore.case = FALSE,fixed = TRUE))
## storing into ex_data dataframe
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "line with word biostats in twitter",
Value=twitter[biostats$twitter,])
ex_data <- ex_data %>% add_row(File = "twitter", Attribute = "occurence of a phrase in twitter",
Value=length(exact_character$twitter))
## delete variables not required
rm(biostats,love,hate,exact_character,exp1,exp2,exp3,exp4,exp5,replace_exp,unnest_exp)
## printing the values
datatable(as.matrix(ex_data))
## setting the working directory
setwd("~/Documents/Education/Coursera/R Prog/10/files")
##Creating a plot of the most frequent words in each data set
##blog plot
blog_plot <- blog_token_count %>%
filter(n > 25000) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_bar(stat="identity", fill="lightsteelblue3", colour="lightsteelblue3") +
#geom_col() +
xlab(NULL) +
coord_flip()
##news plot
news_plot <- news_token_count %>%
filter(n > 25000) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_bar(stat="identity", fill="darkolivegreen4", colour="darkolivegreen4") +
#geom_col() +
xlab(NULL) +
coord_flip()
##twitter plot
twitter_plot <- twitter_token_count %>%
filter(n > 25000) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_bar(stat="identity", fill="indianred3", colour="indianred3") +
#geom_col() +
xlab(NULL) +
coord_flip()
## merging the three plots together
figure <- ggarrange(blog_plot, news_plot, twitter_plot,
labels = c("Blog", "News", "Twitter"),
ncol = 3, nrow = 1)
## delete variables not required
rm(blog_plot, news_plot, twitter_plot)
figure
## Creating a wordcloud
options(warn=-1) ## switch off warnings
## creating subset of data for wordcloud
blog_newdata <- subset(blog_token_count, n >= 5000 )
news_newdata <- subset(news_token_count, n >= 6000 )
twitter_newdata <- subset(twitter_token_count, n >= 5000 )
##blog
## wordcloud() and comparison.cloud() draw as a side effect and return NULL,
## so the clouds are plotted directly instead of assigning and printing the result
blog_newdata %>%
  with(wordcloud(word, n, max.words = Inf, rot.per = 0.35,
                 fixed.asp = TRUE, colors = brewer.pal(8, "Dark2")))
blog_newdata %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("indianred3", "lightsteelblue3"),
                   max.words = Inf, rot.per = 0.35, title.size = 1)
##news
news_newdata %>%
  with(wordcloud(word, n, max.words = Inf, rot.per = 0.35,
                 fixed.asp = TRUE, colors = brewer.pal(8, "Dark2")))
news_newdata %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("indianred3", "lightsteelblue3"),
                   max.words = Inf, rot.per = 0.35, title.size = 1)
##twitter
twitter_newdata %>%
  with(wordcloud(word, n, max.words = Inf, rot.per = 0.35,
                 fixed.asp = TRUE, colors = brewer.pal(8, "Dark2")))
twitter_newdata %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("indianred3", "lightsteelblue3"),
                   max.words = Inf, rot.per = 0.35, title.size = 1)
options(warn=0) ##switch on warnings
## Drawing conclusions
print("By reviewing words with top 5 frequency, sees similar words in the three files")
print("Top 5 Blog data")
head(blog_token_count,5)
print("Top 5 News data")
head(news_token_count,5)
print("Top 5 Twitter data")
head(twitter_token_count,5)
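The overlap can be checked explicitly by intersecting the most frequent words of the three counts; the snippet below is purely illustrative and the top-20 cut-off is an arbitrary choice.

## words that appear among the 20 most frequent in all three data sets
Reduce(intersect, list(head(blog_token_count$word, 20),
                       head(news_token_count$word, 20),
                       head(twitter_token_count$word, 20)))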