# Basic Information of the Dataset
library(stringi)   # stri_stats_general / stri_stats_latex for the summary table
library(knitr)     # kable
library(tm)        # corpus construction and cleaning
filepath1 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
filepath2 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.news.txt"
filepath3 <- "A:/MEERA HARRIS/Coursera/Capstone Project/week1/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
# Read blogs data in binary mode
conn <- file(filepath1, open = "rb"); blogs <- readLines(conn, encoding = "UTF-8", skipNul = TRUE); close(conn)
# Read news data in binary mode
conn <- file(filepath2, open = "rb"); news <- readLines(conn, encoding = "UTF-8", skipNul = TRUE); close(conn)
# Read twitter data in binary mode
conn <- file(filepath3, open = "rb"); twitter <- readLines(conn, encoding = "UTF-8", skipNul = TRUE); close(conn)
# Remove temporary variable
rm(conn)
# Basic Summary Table
sumTabl <- data.frame(
  file = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  fileinMB = c(file.info(filepath1)$size / 1024^2,
               file.info(filepath2)$size / 1024^2,
               file.info(filepath3)$size / 1024^2),
  t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
          Words = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]))
)
kable(sumTabl)
file | fileinMB | Lines | LinesNEmpty | Chars | CharsNWhite | Words |
---|---|---|---|---|---|---|
en_US.blogs | 200.4242 | 899288 | 899288 | 206824382 | 170389539 | 37570839 |
en_US.news | 196.2775 | 1010242 | 1010242 | 203223154 | 169860866 | 34494539 |
en_US.twitter | 159.3641 | 2360148 | 2360148 | 162096241 | 134082806 | 30451170 |
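# iconv with sub = "" drops characters that cannot be converted to ASCII (emoji, curly quotes, accented letters) so the tokenizers below see plain text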
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
set.seed(565)
sample_data <- c(sample(blogs, length(blogs) * 0.0004),
                 sample(news, length(news) * 0.0055),
                 sample(twitter, length(twitter) * 0.0001))
corpus <- VCorpus(VectorSource(sample_data))
rm(blogs,news,twitter)
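# Quick sanity check (optional; the exact figures depend on the seed and sampling fractions above)
length(sample_data)
format(object.size(sample_data), units = "MB")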
# Cleaning the Sample corpus
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
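# Optional: inspect a few cleaned documents to confirm the transformations behaved as expected
lapply(corpus[1:3], as.character)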
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## said said 1422
## year year 663
## will will 656
## one one 556
## new new 444
## time time 429
## like like 398
## get get 391
## state state 380
## say say 361
library(memisc)
library(wordcloud)
library(RColorBrewer)
par(bg = "black")
wordcloud(d$word[1:600], d$freq[1:600], col = terrain.colors(600, alpha = 0.9), random.order = FALSE, rot.per = 0.3)
title(main = "Most Used Words in the Corpus", font.main = 1, col.main = "cornsilk3", cex.main = 1.0)
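# Optional: par(bg = "black") persists for later base-graphics plots, so reset it here
par(bg = "white")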
# Sentiment Analysis of the Corpus
library(syuzhet)
d <- get_nrc_sentiment(sample_data)
td <- data.frame(t(d))
# rowSums adds up each sentiment's counts across all sampled documents
td_new <- data.frame(rowSums(td))
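# Toy illustration of rowSums (not part of the pipeline): each row is summed, so 1+3+5 = 9 and 2+4+6 = 12
rowSums(matrix(1:6, nrow = 2))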
# Transformation and cleaning
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
td_new2 <- td_new[1:8, ]  # keep the eight NRC emotions; drop the negative/positive valence rows
# Visualisation
library("ggplot2")
qplot(sentiment, data=td_new2, weight=count, geom="bar",fill=sentiment)+ggtitle("Corpus sentiments")
### N-gram Tokenization
Finally, the NGramTokenizer and TermDocumentMatrix functions were used to collect term patterns and their frequencies, both for single words and for words linked to other specific words. The frequency estimation covered unigrams, bigrams, and trigrams for use in building the prediction model.
library(RWeka)
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
uni_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = uni_tokenizer))
bi_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = bi_tokenizer))
tri_matrix <- TermDocumentMatrix(corpus, control = list(tokenize = tri_tokenizer))
uni_corpus <- findFreqTerms(uni_matrix, lowfreq = 50)
bi_corpus <- findFreqTerms(bi_matrix, lowfreq = 10)
tri_corpus <- findFreqTerms(tri_matrix, lowfreq = 5)
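# Optional: number of n-grams retained after the frequency cut-offs above
sapply(list(unigrams = uni_corpus, bigrams = bi_corpus, trigrams = tri_corpus), length)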
uni_corpus_freq <- rowSums(as.matrix(uni_matrix[uni_corpus,]))
uni_corpus_freq <- data.frame(word=names(uni_corpus_freq), frequency=uni_corpus_freq)
bi_corpus_freq <- rowSums(as.matrix(bi_matrix[bi_corpus,]))
bi_corpus_freq <- data.frame(word=names(bi_corpus_freq), frequency=bi_corpus_freq)
tri_corpus_freq <- rowSums(as.matrix(tri_matrix[tri_corpus,]))
tri_corpus_freq <- data.frame(word=names(tri_corpus_freq), frequency=tri_corpus_freq)
head(uni_corpus_freq)
## word frequency
## abl abl 60
## accord accord 129
## account account 51
## activ activ 55
## age age 62
## agenc agenc 63
plotNgrams <- function(data, title, num) {
  # Keep the top `num` n-grams and plot them in decreasing order of frequency
  df2 <- data[order(-data$frequency), ][1:num, ]
  ggplot(df2, aes(x = reorder(word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "red", colour = "black", width = 0.80) +
    labs(title = title, x = "Words", y = "Count") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
plotNgrams(uni_corpus_freq, "Top Unigrams", 20)
plotNgrams(bi_corpus_freq, "Top Bigrams", 20)
plotNgrams(tri_corpus_freq, "Top Trigrams", 8)
</plotNgrams>
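# Optional next step: persist the n-gram frequency tables for the prediction model (the .rds file names below are placeholders)
saveRDS(uni_corpus_freq, "unigram_freq.rds")
saveRDS(bi_corpus_freq, "bigram_freq.rds")
saveRDS(tri_corpus_freq, "trigram_freq.rds")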