The major objective of the capstone project is to build a predictive text model. The data used for this project consist of English text from blogs, news articles, and Twitter posts (https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip).
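For reproducibility, the archive can be downloaded and unpacked with base R before the files are read. This is a minimal sketch: the local file name Coursera-SwiftKey.zip is an assumption, and the folder structure inside the archive may need adjusting to match the en_US/ paths used below.
# Download and unzip the SwiftKey dataset if it is not already present
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}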
library(tm)
library(NLP)
library(stringi)
library(slam)
library(SnowballC)
library(ngram)
library(ggplot2)
library(cowplot)
library(wordcloud)
library(dplyr)
library(knitr)
#Reading the file "en_US.blogs.txt"
file1 <- "en_US/en_US.blogs.txt"; con <- file(file1,open = "rb")
usblog <- readLines(con, skipNul = TRUE); close(con)
#Reading the file "en_US.news.txt"
file2 <- "en_US/en_US.news.txt"; con <- file(file2,open = "rb")
usnews <- readLines(con,skipNul = TRUE); close(con)
#Reading the file "en_US.twitter.txt"
file3 <- "en_US/en_US.twitter.txt"; con <- file(file3,open = "rb")
ustwitter <- readLines(con,skipNul = TRUE); close(con)
# Number of lines in each file
nblines <- sapply(list(usblog,usnews,ustwitter),length)
# stri_stats_latex() returns the number of word characters in row 1 and the number of words in row 4
textstats <- sapply(list(usblog, usnews, ustwitter), stri_stats_latex)
countwords <- textstats[4, ]
countcharacters <- textstats[1, ]
stat_sum <- cbind(c("blog", "news", "twitter"), nblines, countcharacters, countwords)
# Combine the summary statistics into a table
stat_table <- as.data.frame(stat_sum)
# Set readable column names
colnames(stat_table) <- c("file", "Nb_lines", "Total_Characters", "Total_Words")
knitr::kable(stat_table)
| file | Nb_lines | Total_Characters | Total_Words |
|---|---|---|---|
| blog | 899288 | 163325412 | 37865888 |
| news | 1010242 | 162566126 | 34678691 |
| twitter | 2360148 | 125769474 | 30578933 |
As the table shows, the full dataset is very large. I have therefore decided to work with a smaller random sample (roughly 1% of each file).
# Draw a ~1% random sample from each file (seed fixed for reproducibility)
set.seed(1234)
sampleBlogs <- sample(usblog, round(length(usblog) * 0.01))
sampleusnews <- sample(usnews, round(length(usnews) * 0.01))
sampleustwitter <- sample(ustwitter, round(length(ustwitter) * 0.01))
# Remove curly quotes and ellipsis characters (Unicode punctuation)
sampleBlogs <- stri_replace_all_regex(sampleBlogs, "\u2018|\u2026|\u201c|\u201d|\u2019","")
sampleusnews <- stri_replace_all_regex(sampleusnews, "\u2018|\u2026|\u201c|\u201d|\u2019","")
sampleustwitter <- stri_replace_all_regex(sampleustwitter, "\u2018|\u2026|\u201c|\u201d|\u2019","")
write.csv(sampleBlogs, file = "./sampleBlogs1.csv", row.names = FALSE)
write.csv(sampleusnews, file = "./sampleusnews1.csv", row.names = FALSE)
write.csv(sampleustwitter, file = "./sampleustwitter1.csv", row.names = FALSE)
The sampled files are then loaded into a corpus and cleaned.
corpus <- Corpus(DirSource("./"), readerControl = list(reader=readPlain, language="en_US"))
# Drop non-ASCII characters, then convert all text to lower case
corpusClean <- tm_map(corpus, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = "")))
corpusClean <- tm_map(corpusClean, content_transformer(tolower))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusClean <- tm_map(corpusClean, toSpace, "[/@:()*&!?_#$|-]") ## replace special characters with a space
# Remove common word endings for English (stemming)
corpusClean <-tm_map(corpusClean, stemDocument)
# Remove URLs
removeURL <- function(x) gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", "", x)
corpusClean <- tm_map(corpusClean, content_transformer(removeURL))
# Remove digits
removedigit <- function(x) gsub("[[:digit:]]", "", x)
corpusClean <- tm_map(corpusClean, content_transformer(removedigit))
corpusClean <- tm_map(corpusClean, removePunctuation)
# Collapse repeated whitespace into single spaces
corpusClean <- tm_map(corpusClean, stripWhitespace)
# Remove a few frequent words explicitly, then the standard English stop words (for, very, and, of, are, etc.)
corpusClean <- tm_map(corpusClean, removeWords, c("the", "will", "The", "also", "that", "and", "for", "in", "is", "it", "not", "to"))
corpusClean <- tm_map(corpusClean, removeWords, stopwords("english"))
# The document-term matrix
dtm <- DocumentTermMatrix(corpusClean)
dtm
## <<DocumentTermMatrix (documents: 5, terms: 24985)>>
## Non-/sparse entries: 33543/91382
## Sparsity : 73%
## Maximal term length: 126105
## Weighting : term frequency (tf)
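As a quick check on the matrix (not part of the analysis above), the most frequent terms can be listed with tm's findFreqTerms(); the threshold of 1000 occurrences is an arbitrary assumption.
# Terms that occur at least 1000 times across the sample documents
findFreqTerms(dtm, lowfreq = 1000)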
# The term-document matrix (the transpose of the document-term matrix)
tdm <- TermDocumentMatrix(corpusClean)
tdm
## <<TermDocumentMatrix (terms: 24985, documents: 5)>>
## Non-/sparse entries: 33543/91382
## Sparsity : 73%
## Maximal term length: 126105
## Weighting : term frequency (tf)
# Word cloud of the 100 most frequent terms in the cleaned corpus
wordcloud(corpusClean, max.words=100, random.order=TRUE, rot.per=.15, colors=colorRampPalette(brewer.pal(4,"Set1"))(32), scale=c(3, .3))
# Term frequencies across the corpus, sorted in decreasing order
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
wf <- data.frame(word=names(freq), freq=freq)
# Bar chart of terms that appear more than 1500 times
subset(wf, freq > 1500) %>%
ggplot(aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat="identity") +
theme(axis.text.x=element_text(angle=45, hjust=1))+
xlab("word")