The purpose of this report is to apply the learnings from the previous courses in this specialization as practice for the upcoming capstone project. From reading the data, through cleaning and organizing, to exploratory analysis and the algorithm, all topics will be covered in this report. The report also tries to explain how the author arrived at each piece of code and what each chunk is for, so that even beginners can follow the idea behind it.
## Preparing data
The data used can be accessed at this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip The files are downloaded and extracted into the working directory.
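If the files are not already on disk, they can also be fetched programmatically. The following is only a minimal sketch: the zip file name and the existence check are assumptions, not part of the original workflow.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb") # binary mode for Windows
  unzip("Coursera-SwiftKey.zip")                                      # extracts the en_US text files
}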
library(tm)
## Loading required package: NLP
library(NLP)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(slam)
library(RWeka)
library(stringi)
library(knitr)
## Reading data
blog_p <- paste("C:/Users/geraldherbosa/Documents/", "en_US.blogs.txt",sep="")
news_p <- paste("C:/Users/geraldherbosa/Documents/", "en_US.news.txt",sep="")
twit_p <- paste("C:/Users/geraldherbosa/Documents/", "en_US.twitter.txt", sep="")
blog <- readLines(blog_p, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_p, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(news_p, encoding = "UTF-8", skipNul = TRUE): incomplete
## final line found on 'C:/Users/geraldherbosa/Documents/en_US.news.txt'
twit <- readLines(twit_p, encoding = "UTF-8", skipNul = TRUE)
These are large files, so it is important to get a grasp of the basic properties of the data in order to speed up the later processing steps.
st <- data.frame(
  FileName = c("en_US.blogs", "en_US.news", "en_US.twitter"),
  FileSizeinMB = c(file.info(blog_p)$size / 1024^2,
                   file.info(news_p)$size / 1024^2,
                   file.info(twit_p)$size / 1024^2),
  t(rbind(sapply(list(blog, news, twit), stri_stats_general),
          WordCount = sapply(list(blog, news, twit), stri_stats_latex)[4, ]))
)
kable(st)
| FileName | FileSizeinMB | Lines | LinesNEmpty | Chars | CharsNWhite | WordCount |
|---|---|---|---|---|---|---|
| en_US.blogs | 200.4242 | 899288 | 899288 | 206824382 | 170389539 | 37570839 |
| en_US.news | 196.2775 | 77259 | 77259 | 15639408 | 13072698 | 2651432 |
| en_US.twitter | 159.3641 | 2360148 | 2360148 | 162096241 | 134082806 | 30451170 |
## Data Sampling
This chunk of code limits the amount of data to be processed, since the full files are very large; a note on the sampling approach, with an alternative sketch, follows the chunk.
blog_s <- blog[rbinom(length(blog)*.01, length(blog), .5)]
news_s <- news[rbinom(length(news)*.01, length(news), .5)]
twit_s <- twit[rbinom(length(twit)*.01, length(twit), .5)]
blog_s <- stri_replace_all_regex(blog_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")
news_s <- stri_replace_all_regex(news_s, "\u2018|\u2019|\u2026|\u201c|\u201d","")
twit_s <- stri_replace_all_regex(twit_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")
write.csv(blog_s, file = "C:/Users/geraldherbosa/Documents/blog_s.csv", row.names = FALSE)
write.csv(news_s, file = "C:/Users/geraldherbosa/Documents/news_s.csv", row.names = FALSE)
write.csv(twit_s, file = "C:/Users/geraldherbosa/Documents/twit_s.csv", row.names = FALSE)
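As a side note, rbinom() in the chunk above returns line indices that cluster tightly around the middle of each file and can repeat, so the result is not a uniform random sample. If a uniform 1% sample is preferred, the first three lines could instead use sample(); this is only a sketch, and the seed value is an arbitrary assumption for reproducibility.
set.seed(1234)  # assumed seed, only so the sample can be reproduced
blog_s <- blog[sample(length(blog), floor(length(blog) * 0.01))]
news_s <- news[sample(length(news), floor(length(news) * 0.01))]
twit_s <- twit[sample(length(twit), floor(length(twit) * 0.01))]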
## Corpus
corpus <- Corpus(DirSource("C:/Users/geraldherbosa/Documents"), readerControl = list(reader=readPlain, language="en_US"))
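Note that DirSource() reads every file in the Documents folder, which still contains the full-size .txt files as well as the sampled CSVs. If the intent is to build the corpus from the samples only, one alternative sketch is to construct it in memory with tm's VCorpus() and VectorSource(), keeping one document per source; collapsing each sample into a single document is an assumption made here for illustration.
# Alternative sketch: one corpus document per sampled source, built in memory
corpus <- VCorpus(VectorSource(c(paste(blog_s, collapse = " "),
                                 paste(news_s, collapse = " "),
                                 paste(twit_s, collapse = " "))))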
# Create functions to transform the data
URL_r <- function(x) gsub("http[[:alnum:]]*","",x)
sign_r <- function(x) gsub("[[:punct:]]","",x)
num_r <- function(x) gsub("[[:digit:]]","",x)
apo_r <- function(x) gsub("'","",x)
nonascii_r <- function(x) iconv(x, "latin1", "ASCII", sub="")
rep_r <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x)
case_r <- function(x) sapply(x,tolower)
space_r <- function(x) gsub("\\s+"," ",x)
#Transformation
corpus <- tm_map(corpus, content_transformer(apo_r))       # Remove apostrophes
corpus <- tm_map(corpus, content_transformer(num_r))       # Remove numbers
corpus <- tm_map(corpus, content_transformer(URL_r))       # Remove URLs
corpus <- tm_map(corpus, content_transformer(sign_r))      # Remove punctuation
corpus <- tm_map(corpus, content_transformer(nonascii_r))  # Remove non-ASCII characters
corpus <- tm_map(corpus, content_transformer(case_r))      # Convert uppercase to lowercase
corpus <- tm_map(corpus, content_transformer(rep_r))       # Collapse letters repeated three or more times
corpus <- tm_map(corpus, content_transformer(space_r))     # Collapse multiple spaces
corpus <- tm_map(corpus, removeWords, stopwords("english")) # Remove common English stop words
## Wordcloud
wordcloud(corpus, max.words=75, random.order=TRUE, rot.per=.15, colors=colorRampPalette(brewer.pal(9,"Reds"))(32), scale=c(3, .3))
## Data Plotting
The purpose of this plot is to compare the words most frequently used across Twitter, blogs, and news.
corp_tdm <- TermDocumentMatrix(corpus)           # Term-document matrix from the cleaned corpus
corp_tdm2 <- as.matrix(corp_tdm)
corp_tdm3 <- rowSums(corp_tdm2)                  # Total frequency of each term across documents
corp_tdm3 <- sort(corp_tdm3, decreasing = TRUE)  # Sort terms by frequency so [1:25] gives the top 25
barplot(corp_tdm3[1:25], col = "Purple", las = 2, main = "Word Data Frequency")
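Since ggplot2 is loaded but not otherwise used, the same top-25 frequencies could also be drawn with it. The sketch below is illustrative only; the data-frame name and styling choices are assumptions.
freq_df <- data.frame(word = names(corp_tdm3)[1:25], freq = corp_tdm3[1:25])
ggplot(freq_df, aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "purple") +                                 # bar per word, tallest first
  labs(x = NULL, y = "Frequency", title = "Word Data Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))    # vertical labels, as with las = 2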