## Introduction

The purpose of this report is to apply the learnings from the previous courses in this specialization as practice for the upcoming capstone project. From reading the data, through cleaning and organizing it, to exploratory analysis and the prediction algorithm, all topics will be covered in this report. The report also tries to explain how the author arrived at each piece of code and what the purpose of each chunk is, so that even beginners can follow the idea behind it.

## Preparing data

The data can be accessed at this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The files are downloaded and extracted into the working directory.
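If the files are not yet on disk, a minimal sketch for downloading and extracting them could look like the following (the local file name is an assumption; adjust the paths to your own working directory).

zip_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"                         # assumed local file name
if (!file.exists(zip_file)) {
  download.file(zip_url, destfile = zip_file, mode = "wb")  # binary mode for a zip archive
}
unzip(zip_file, exdir = ".")                                # extract into the working directory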

library(tm)
## Loading required package: NLP
library(NLP)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(slam)
library(RWeka)
library(stringi)
library(knitr)

## Reading data

blog_p <- paste("C:/Users/geraldherbosa/Documents/", "en_US.blogs.txt",sep="")
news_p <- paste("C:/Users/geraldherbosa/Documents/", "en_US.news.txt",sep="")
twit_p <-  paste("C:/Users/geraldherbosa/Documents/", "en_US.twitter.txt", sep="")

blog <- readLines(blog_p, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_p, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(news_p, encoding = "UTF-8", skipNul = TRUE): incomplete
## final line found on 'C:/Users/geraldherbosa/Documents/en_US.news.txt'
twit <- readLines(twit_p, encoding = "UTF-8", skipNul = TRUE)
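The warning about an incomplete final line is harmless here; readLines still returns every line of the news file. If you want to avoid it, one common workaround (a sketch, not part of the original report) is to read the file through a binary connection:

con <- file(news_p, open = "rb")                            # binary connection avoids the end-of-file check
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)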

## Looking at the characteristics of the data

These are large files, so it is important to get a grasp of the basic properties of the data in order to speed up the later calculations.

st <- data.frame(
  FileName=c("en_US.blogs","en_US.news","en_US.twitter"),
  FileSizeinMB=c(file.info(blog_p)$size/1024^2,
                 file.info(news_p)$size/1024^2,
                 file.info(twit_p)$size/1024^2),
  t(rbind(sapply(list(blog,news,twit), stri_stats_general),  # line and character counts
          WordCount=
            sapply(list(blog, news, twit),
                   stri_stats_latex)[4,]))                   # row 4 of stri_stats_latex is the word count
)
kable(st)
| FileName      | FileSizeinMB | Lines   | LinesNEmpty | Chars     | CharsNWhite | WordCount |
|---------------|--------------|---------|-------------|-----------|-------------|-----------|
| en_US.blogs   | 200.4242     | 899288  | 899288      | 206824382 | 170389539   | 37570839  |
| en_US.news    | 196.2775     | 77259   | 77259       | 15639408  | 13072698    | 2651432   |
| en_US.twitter | 159.3641     | 2360148 | 2360148     | 162096241 | 134082806   | 30451170  |

## Data Sampling

This chunk of code limits the amount of data to be processed, since the full files are huge; roughly 1% of the lines of each file are kept.

blog_s <- blog[rbinom(length(blog)*.01, length(blog), .5)]
news_s <- news[rbinom(length(news)*.01, length(news), .5)]
twit_s <- twit[rbinom(length(twit)*.01, length(twit), .5)]
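Note that rbinom draws indices that cluster around the middle of each file, so the lines above are not a uniform random sample. A sketch of a more conventional approach (an alternative, not the code used in this report; the seed value is arbitrary) would use sample for a uniform draw and a seed for reproducibility:

set.seed(123)                                               # assumed seed, for reproducibility
blog_s <- blog[sample(length(blog), round(length(blog) * 0.01))]
news_s <- news[sample(length(news), round(length(news) * 0.01))]
twit_s <- twit[sample(length(twit), round(length(twit) * 0.01))]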

# Remove curly quotes and ellipsis characters from the samples
blog_s <- stri_replace_all_regex(blog_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")
news_s <- stri_replace_all_regex(news_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")
twit_s <- stri_replace_all_regex(twit_s, "\u2018|\u2026|\u201c|\u201d|\u2019","")

write.csv(blog_s, file = "C:/Users/geraldherbosa/Documents/blog_s.csv", row.names = FALSE)
write.csv(news_s, file = "C:/Users/geraldherbosa/Documents/news_s.csv", row.names = FALSE)
write.csv(twit_s, file = "C:/Users/geraldherbosa/Documents/twit_s.csv", row.names = FALSE)
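One thing to watch out for: DirSource below reads every file in the directory it is given, so pointing it at the same Documents folder that still holds the original full-size .txt files would pull those into the corpus as well. A sketch of one way to keep the corpus limited to the samples (the subdirectory name is an assumption) is to write them into their own folder and build the corpus from that folder instead:

sample_dir <- "C:/Users/geraldherbosa/Documents/sample"     # hypothetical folder holding only the sampled files
dir.create(sample_dir, showWarnings = FALSE)
writeLines(blog_s, file.path(sample_dir, "blog_s.txt"))
writeLines(news_s, file.path(sample_dir, "news_s.txt"))
writeLines(twit_s, file.path(sample_dir, "twit_s.txt"))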

## Corpus

corpus <- Corpus(DirSource("C:/Users/geraldherbosa/Documents"), readerControl = list(reader=readPlain, language="en_US"))

# Create functions to transform the data
URL_r <- function(x) gsub("http[[:alnum:]]*","",x)            # remove URLs starting with http
sign_r <- function(x) gsub("[[:punct:]]","",x)                 # remove punctuation
num_r <- function(x) gsub("[[:digit:]]","",x)                  # remove digits
apo_r <- function(x) gsub("'","",x)                            # remove apostrophes
nonascii_r <- function(x) iconv(x, "latin1", "ASCII", sub="")  # drop non-ASCII characters
rep_r <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x) # collapse letters repeated 3+ times to 2
case_r <- function(x) sapply(x,tolower)                        # convert to lowercase
space_r <- function(x) gsub("\\s+"," ",x)                      # collapse multiple spaces

#Transformation
corpus<-tm_map(corpus,content_transformer(apo_r)) #Apostrophe remove
corpus<-tm_map(corpus,content_transformer(num_r)) #Number remove
corpus<-tm_map(corpus,content_transformer(URL_r)) #URL remove
corpus<-tm_map(corpus,content_transformer(sign_r)) #Punctuation remove (apostrophes already handled above)
corpus<-tm_map(corpus,content_transformer(nonascii_r)) #Non-ASCII remove
corpus<-tm_map(corpus,content_transformer(case_r)) #Uppercase to lowercase
corpus<-tm_map(corpus,content_transformer(rep_r)) #Repeated-letter runs in words shortened
corpus<-tm_map(corpus,content_transformer(space_r)) #Multiple spaces collapsed
corpus<-tm_map(corpus,removeWords,stopwords("english")) #Common English stop words remove
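As a quick sanity check (not part of the original pipeline), the first few lines of one cleaned document can be inspected to confirm the transformations behaved as expected:

# Peek at the first few lines of the first document in the cleaned corpus
writeLines(head(content(corpus[[1]])))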

## Wordcloud

wordcloud(corpus, max.words=75, random.order=TRUE, rot.per=.15, colors=colorRampPalette(brewer.pal(9,"Reds"))(32), scale=c(3, .3))

## Data Plotting

The purpose of this plot is to compare the words most frequently used across the Twitter, blog, and news samples.

corp_tdm <- TermDocumentMatrix(corpus)
corp_tdm2 <- as.matrix(corp_tdm)
corp_tdm3 <- rowSums(corp_tdm2)                 # total frequency of each term across all documents
corp_tdm3 <- sort(corp_tdm3, decreasing = TRUE) # sort so the most frequent terms come first
barplot(corp_tdm3[1:25], col = "Purple", las = 2, main = "Word Data Frequency")
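Converting the full term-document matrix with as.matrix can use a lot of memory on larger samples. Since the slam package is already loaded, a sketch of a more memory-friendly alternative (not the code used above; corp_freq is a hypothetical name) is to sum the sparse matrix directly:

# Row sums computed on the sparse matrix, without as.matrix()
corp_freq <- sort(slam::row_sums(corp_tdm), decreasing = TRUE)
barplot(corp_freq[1:25], col = "Purple", las = 2, main = "Word Data Frequency")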