Coursera Data Science Capstone: Milestone Report

Setting environment / Creando ambiente de trabajo

#install.packages('devtools') 
library(devtools) 
## Loading required package: usethis
#slam_url <- "https://cran.r-project.org/src/contrib/Archive/slam/slam_0.1-37.tar.gz" 
#install_url(slam_url)
#install.packages('pacman')
library(pacman)
#install.packages('slam')
library(slam)
#install.packages('NLP')
library(NLP)
#install.packages('rJava')
library(rJava)
library(downloader)
## 
## Attaching package: 'downloader'
## The following object is masked from 'package:devtools':
## 
##     source_url
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(stringi)
library(tm)
library(NLP)
library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# NOTE(review): hard-coded absolute path; setwd() raises an error on any
# machine where this directory does not exist. All relative paths used
# afterwards ("./projectData", ...) are resolved against this directory.
setwd("D:/Documents/Johns Hopkings University/10 Data Science Capstone/Milestone Report")

Starting process / Iniciando proceso

# Fetch the SwiftKey archive (once), unpack it (once), and load the three
# en_US text files as character vectors with one element per line.
if (!file.exists("./projectData")) {
  dir.create("./projectData")
}
Website <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# mode = "wb" keeps the zip byte-exact on Windows.
if (!file.exists("./projectData/Coursera-SwiftKey.zip")) {
  download.file(Website, destfile = "./projectData/Coursera-SwiftKey.zip", mode = "wb")
}

if (!file.exists("./projectData/final")) {
  unzip(zipfile = "./projectData/Coursera-SwiftKey.zip", exdir = "./projectData")
}
setwd("./projectData/final/en_US")

# Read every line of `path` through a binary-mode connection.
#
# Passing a file name straight to readLines() opens it in text mode; on
# Windows a text-mode read can stop early at an embedded control character
# (SUB, 0x1A). The news line count of 77259 recorded in this report is
# consistent with such a truncated read, so outputs recorded before this
# change may under-count the news file.
#
# path: file to read. Returns a character vector, one element per line.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8")
}

twitter <- read_text_lines("en_US.twitter.txt")
blogs <- read_text_lines("en_US.blogs.txt")
news <- read_text_lines("en_US.news.txt")

Show length / Verificando tamaño

# Number of lines read from each source file (one vector element per line).
length(twitter)
## [1] 2360148
length(blogs)
## [1] 899288
length(news)
## [1] 77259

Making a word count analysis / Contabilizando palabras

# Per-file summary: line count, word count and character count.
# stri_stats_latex() returns a named vector of counts; only "Words" is used.
twitterwords <- stri_stats_latex(twitter)["Words"]
blogswords <- stri_stats_latex(blogs)["Words"]
newswords <- stri_stats_latex(news)["Words"]
nchar_twitter <- sum(nchar(twitter))
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
# Every column must list its values in the same order as "File Name"
# (twitter, blogs, news). The word and character columns used to be ordered
# blogs, news, twitter, which attached each figure to the wrong file.
data.frame("File Name" = c("twitter", "blogs", "news"),
           "num.lines" = c(length(twitter), length(blogs), length(news)),
           "num.words" = c(sum(twitterwords), sum(blogswords), sum(newswords)),
           "Num of character" = c(nchar_twitter, nchar_blogs, nchar_news))
##   File.Name num.lines num.words Num.of.character
## 1   twitter   2360148  30451128        162096031
## 2     blogs    899288  37570839        206824505
## 3      news     77259   2651432         15639408

Making an exploratory analysis / Realizando un análisis exploratorio

# Build a cleaned corpus from a 1% random sample of each source.
# The seed is fixed so the sample is reproducible.
set.seed(10000)

# Drop every byte that is not plain ASCII (sub = "" deletes instead of
# substituting), which removes emoji, accented letters and curly quotes.
blogs_c <- iconv(blogs, "latin1", "ASCII", sub = "")
news_c <- iconv(news, "latin1", "ASCII", sub = "")
twitter_c <- iconv(twitter, "latin1", "ASCII", sub = "")

# Sampling order (twitter, blogs, news) is kept as-is so the random draws
# for a given seed do not change.
sampledata <- c(sample(twitter_c, length(twitter_c) * 0.01),
                sample(blogs_c, length(blogs_c) * 0.01),
                sample(news_c, length(news_c) * 0.01))
corpus <- VCorpus(VectorSource(sampledata))

# Replace everything matching `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# URLs: match up to the next whitespace. The earlier pattern used a greedy
# "(.*)" that could run on to the last ".xyz" anywhere in the line, deleting
# ordinary text, while leaving any path after the host name behind.
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://[^[:space:]]+")

# @mentions: "\\s" is not a whitespace class inside a bracket expression for
# gsub()'s default regex engine, so "[^\\s]" did not stop at spaces. The
# POSIX class [:space:] is used instead.
corpus <- tm_map(corpus, toSpace, "@[^[:space:]]+")

# Base functions must be wrapped in content_transformer() so that tm_map()
# keeps each element a text document rather than a bare character vector.
corpus <- tm_map(corpus, content_transformer(tolower))
# Stop words are removed before punctuation so contractions still match.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
# Kept from the original pipeline: re-wrap every element as PlainTextDocument.
corpus <- tm_map(corpus, PlainTextDocument)

# Flatten the cleaned documents into a one-column data frame for inspection.
corpusresult <- data.frame(text = unlist(sapply(corpus, '[', "content")),
                           stringsAsFactors = FALSE)
head(corpusresult)
##                                                                                       text
## 1                                      tiger woods poker night phil hellmuth doyle brunson
## 2                                            suddenly feel start checking retirement homes
## 3                         acutely aware fact major blowers building sound good microphones
## 4                                                                        seem right avatar
## 5                                 new empathic approach used one high school great results
## 6 many diagnosed altzheimers old head injury ever wonder pres reagan subdural hematoma tbi

Drawing the graphs / Trazando gráficos

# Unigrams: bar chart of the (up to) 15 most frequent single words.

# Tokenizer producing n-grams of exactly one word.
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
# Keep only terms occurring at least 1000 times so the dense matrix built
# below stays small.
unigramcorpus <- findFreqTerms(unigramtab, lowfreq = 1000)
unigramcorpusnum <- rowSums(as.matrix(unigramtab[unigramcorpus, ]))
unigramcorpustab <- data.frame(Word = names(unigramcorpusnum),
                               frequency = unigramcorpusnum)
unigramcorpussort <- unigramcorpustab[order(-unigramcorpustab$frequency), ]
# head() returns at most 15 rows; indexing with [1:15, ] would pad the data
# with all-NA rows whenever fewer than 15 terms pass the threshold.
ggplot(head(unigramcorpussort, 15), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("brown")) +
  labs(title = "Unigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))

# Bigrams: bar chart of the (up to) 12 most frequent two-word sequences.

# Tokenizer producing n-grams of exactly two words.
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
# Keep only bigrams occurring at least 80 times.
bigramcorpus <- findFreqTerms(bigramtab, lowfreq = 80)
bigramcorpusnum <- rowSums(as.matrix(bigramtab[bigramcorpus, ]))
bigramcorpustab <- data.frame(Word = names(bigramcorpusnum),
                              frequency = bigramcorpusnum)
bigramcorpussort <- bigramcorpustab[order(-bigramcorpustab$frequency), ]
# head() returns at most 12 rows. The earlier [1:12, ] indexing padded the
# data with all-NA rows when fewer than 12 bigrams passed the threshold,
# which made ggplot2 warn "Removed 1 rows containing missing values".
ggplot(head(bigramcorpussort, 12), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("orange")) +
  labs(title = "Bigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))

# Trigrams: bar chart of the (up to) 10 most frequent three-word sequences.

# Tokenizer producing n-grams of exactly three words.
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
# Keep only trigrams occurring at least 10 times.
trigramcorpus <- findFreqTerms(trigramtab, lowfreq = 10)
trigramcorpusnum <- rowSums(as.matrix(trigramtab[trigramcorpus, ]))
trigramcorpustab <- data.frame(Word = names(trigramcorpusnum),
                               frequency = trigramcorpusnum)
trigramcorpussort <- trigramcorpustab[order(-trigramcorpustab$frequency), ]
# head() returns at most 10 rows. The earlier [1:10, ] indexing padded the
# data with all-NA rows when fewer than 10 trigrams passed the threshold,
# which made ggplot2 warn "Removed 4 rows containing missing values".
ggplot(head(trigramcorpussort, 10), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("blue")) +
  labs(title = "Trigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))