## Introduction

The purpose of this report is to apply the learnings from the previous courses in this specialization as practice for the upcoming capstone project. From reading the data, through cleaning and organizing it, to exploratory analysis and the prediction algorithm, all topics will be covered in this report. The report also tries to explain how the author arrived at each piece of code and what the purpose of each chunk is, so that even beginners can follow the idea behind it.

## Preparing data

The data can be accessed at this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The files are downloaded and extracted into the working directory.
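If the files are not yet on disk, a minimal sketch for downloading and extracting them could look like the following (the local file name is an assumption; adjust the paths to your own working directory).

zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"                         # assumed local file name
if (!file.exists(zip_file)) {
  download.file(zip_url, destfile = zip_file, mode = "wb")  # binary mode for a zip archive
}
unzip(zip_file, exdir = ".")                                # extract into the working directory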

library(tm)
## Loading required package: NLP
library(NLP)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(slam)
library(RWeka)
library(stringi)
library(knitr)

## Reading data

blog_p <- paste("C:/Users/geraldherbosa/Documents/", "en_US.blogs.txt",sep="")
news_p <- paste("C:/Users/geraldherbosa/Documents/", "en_US.news.txt",sep="")
twit_p <-  paste("C:/Users/geraldherbosa/Documents/", "en_US.twitter.txt", sep="")

blog <- readLines(blog_p, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_p, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(news_p, encoding = "UTF-8", skipNul = TRUE): incomplete
## final line found on 'C:/Users/geraldherbosa/Documents/en_US.news.txt'
twit <- readLines(twit_p, encoding = "UTF-8", skipNul = TRUE)
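The warning about an incomplete final line is harmless here; readLines still returns every line of the news file. If you want to avoid it, one common workaround (a sketch, not part of the original report) is to read the file through a binary connection:

con <- file(news_p, open = "rb")                            # binary connection avoids the end-of-file check
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)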

## Looking at the characteristics of the data

These are large files, so it is important to get a grasp of the basic properties of the data in order to speed up the later calculations.

st <- data.frame(
  FileName=c("en_US.blogs","en_US.news","en_US.twitter"),
  FileSizeinMB=c(file.info(blog_p)$size/1024^2,
                 file.info(news_p)$size/1024^2,
                 file.info(twit_p)$size/1024^2),
  t(rbind(sapply(list(blog,news,twit), stri_stats_general),  # line and character counts
          WordCount=
            sapply(list(blog, news, twit),
                   stri_stats_latex)[4,]))                   # row 4 of stri_stats_latex is the word count
)
kable(st)
| FileName      | FileSizeinMB | Lines   | LinesNEmpty | Chars     | CharsNWhite | WordCount |
|---------------|--------------|---------|-------------|-----------|-------------|-----------|
| en_US.blogs   | 200.4242     | 899288  | 899288      | 206824382 | 170389539   | 37570839  |
| en_US.news    | 196.2775     | 77259   | 77259       | 15639408  | 13072698    | 2651432   |
| en_US.twitter | 159.3641     | 2360148 | 2360148     | 162096241 | 134082806   | 30451170  |

## Data Sampling

This chunk of code limits the amount of data to be processed, since the full files are huge; roughly 1% of the lines of each file are kept.

blog_s <- blog[rbinom(length(blog)*.01, length(blog), .5)]
news_s <- news[rbinom(length(news)*.01, length(news), .5)]
twit_s <- twit[rbinom(length(twit)*.01, length(twit), .5)]
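Note that rbinom draws indices that cluster around the middle of each file, so the lines above are not a uniform random sample. A sketch of a more conventional approach (an alternative, not the code used in this report; the seed value is arbitrary) would use sample for a uniform draw and a seed for reproducibility:

set.seed(123)                                               # assumed seed, for reproducibility
blog_s <- blog[sample(length(blog), round(length(blog) * 0.01))]
news_s <- news[sample(length(news), round(length(news) * 0.01))]
twit_s <- twit[sample(length(twit), round(length(twit) * 0.01))]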

# Remove curly quotes and ellipsis characters from the samples
blog_s <- stri_replace_all_regex(blog_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")
news_s <- stri_replace_all_regex(news_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")
twit_s <- stri_replace_all_regex(twit_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")

write.csv(blog_s, file = "C:/Users/geraldherbosa/Documents/blog_s.csv", row.names = FALSE)
write.csv(news_s, file = "C:/Users/geraldherbosa/Documents/news_s.csv", row.names = FALSE)
write.csv(twit_s, file = "C:/Users/geraldherbosa/Documents/twit_s.csv", row.names = FALSE)
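One thing to watch out for: DirSource below reads every file in the directory it is given, so pointing it at the same Documents folder that still holds the original full-size .txt files would pull those into the corpus as well. A sketch of one way to keep the corpus limited to the samples (the subdirectory name is an assumption) is to write them into their own folder and build the corpus from that folder instead:

sample_dir <- "C:/Users/geraldherbosa/Documents/sample"     # hypothetical folder holding only the sampled files
dir.create(sample_dir, showWarnings = FALSE)
writeLines(blog_s, file.path(sample_dir, "blog_s.txt"))
writeLines(news_s, file.path(sample_dir, "news_s.txt"))
writeLines(twit_s, file.path(sample_dir, "twit_s.txt"))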

## Corpus

corpus <- Corpus(DirSource("C:/Users/geraldherbosa/Documents"), readerControl = list(reader=readPlain, language="en_US"))

# Create functions to transform the data
URL_r <- function(x) gsub("http[[:alnum:]]*","",x)            # remove URLs starting with http
sign_r <- function(x) gsub("[[:punct:]]","",x)                 # remove punctuation
num_r <- function(x) gsub("[[:digit:]]","",x)                  # remove digits
apo_r <- function(x) gsub("'","",x)                            # remove apostrophes
nonascii_r <- function(x) iconv(x, "latin1", "ASCII", sub="")  # drop non-ASCII characters
rep_r <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x) # collapse letters repeated 3+ times to 2
case_r <- function(x) sapply(x,tolower)                        # convert to lowercase
space_r <- function(x) gsub("\\s+"," ",x)                      # collapse multiple spaces

#Transformation
corpus<-tm_map(corpus,content_transformer(apo_r)) #Apostrophe remove
corpus<-tm_map(corpus,content_transformer(num_r)) #Number remove
corpus<-tm_map(corpus,content_transformer(URL_r)) #URL remove
corpus<-tm_map(corpus,content_transformer(sign_r)) #Punctuation remove (apostrophes already handled above)
corpus<-tm_map(corpus,content_transformer(nonascii_r)) #Non-ASCII remove
corpus<-tm_map(corpus,content_transformer(case_r)) #Uppercase to lowercase
corpus<-tm_map(corpus,content_transformer(rep_r)) #Repeated-letter runs in words shortened
corpus<-tm_map(corpus,content_transformer(space_r)) #Multiple spaces collapsed
corpus<-tm_map(corpus,removeWords,stopwords("english")) #Common English stop words remove
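As a quick sanity check (not part of the original pipeline), the first few lines of one cleaned document can be inspected to confirm the transformations behaved as expected:

# Peek at the first few lines of the first document in the cleaned corpus
writeLines(head(content(corpus[[1]])))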

## Wordcloud

wordcloud(corpus, max.words=75, random.order=TRUE, rot.per=.15, colors=colorRampPalette(brewer.pal(9,"Reds"))(32), scale=c(3, .3))

## Data Plotting

The purpose of this plot is to compare the words most frequently used across the Twitter, blog, and news samples.

corp_tdm <- TermDocumentMatrix(corpus)
corp_tdm2 <- as.matrix(corp_tdm)
corp_tdm3 <- rowSums(corp_tdm2)                 # total frequency of each term across all documents
corp_tdm3 <- sort(corp_tdm3, decreasing = TRUE) # sort so the most frequent terms come first
barplot(corp_tdm3[1:25], col = "Purple", las = 2, main = "Word Data Frequency")
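Converting the full term-document matrix with as.matrix can use a lot of memory on larger samples. Since the slam package is already loaded, a sketch of a more memory-friendly alternative (not the code used above; corp_freq is a hypothetical name) is to sum the sparse matrix directly:

# Row sums computed on the sparse matrix, without as.matrix()
corp_freq <- sort(slam::row_sums(corp_tdm), decreasing = TRUE)
barplot(corp_freq[1:25], col = "Purple", las = 2, main = "Word Data Frequency")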