The major objective of the capstone project is to build a predictive text model. The data used for this project consist of English text from blogs, news articles, and Twitter posts (https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip).
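For reproducibility, the archive can be downloaded and unpacked with base R before the files are read. This is a minimal sketch: the local file name Coursera-SwiftKey.zip is an assumption, and the folder structure inside the archive may need adjusting to match the en_US/ paths used below.
# Download and unzip the SwiftKey dataset if it is not already present
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}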
library(tm)
library(NLP)
library(stringi)
library(slam)
library(SnowballC)
library(ngram)
library(ggplot2)
library(cowplot)
library(wordcloud)
library(dplyr)
library(knitr)
#Reading the file "en_US.blogs.txt"
file1 <- "en_US/en_US.blogs.txt"; con <- file(file1,open = "rb")
usblog <- readLines(con, skipNul = TRUE); close(con)
#Reading the file "en_US.news.txt"
file2 <- "en_US/en_US.news.txt"; con <- file(file2,open = "rb")
usnews <- readLines(con,skipNul = TRUE); close(con)
#Reading the file "en_US.twitter.txt"
file3 <- "en_US/en_US.twitter.txt"; con <- file(file3,open = "rb")
ustwitter <- readLines(con,skipNul = TRUE); close(con)
# Number of lines in each file
nblines <- sapply(list(usblog,usnews,ustwitter),length)
# stri_stats_latex() returns the number of word characters in row 1 and the number of words in row 4
textstats <- sapply(list(usblog, usnews, ustwitter), stri_stats_latex)
countwords <- textstats[4, ]
countcharacters <- textstats[1, ]
stat_sum <- cbind(c("blog", "news", "twitter"), nblines, countcharacters, countwords)
# Combine the summary statistics into a table
stat_table <- as.data.frame(stat_sum)
# Set readable column names
colnames(stat_table) <- c("file", "Nb_lines", "Total_Characters", "Total_Words")
knitr::kable(stat_table)
| file | Nb_lines | Total_Characters | Total_Words |
|---|---|---|---|
| blog | 899288 | 163325412 | 37865888 |
| news | 1010242 | 162566126 | 34678691 |
| twitter | 2360148 | 125769474 | 30578933 |
As the table shows, the full dataset is very large. I have therefore decided to work with a smaller random sample (roughly 1% of each file).
# Draw a ~1% random sample from each file (seed fixed for reproducibility)
set.seed(1234)
sampleBlogs <- sample(usblog, round(length(usblog) * 0.01))
sampleusnews <- sample(usnews, round(length(usnews) * 0.01))
sampleustwitter <- sample(ustwitter, round(length(ustwitter) * 0.01))
# Remove curly quotes and ellipsis characters (Unicode punctuation)
sampleBlogs <- stri_replace_all_regex(sampleBlogs, "\u2018|\u2026|\u201c|\u201d|\u2019","")
sampleusnews <- stri_replace_all_regex(sampleusnews, "\u2018|\u2026|\u201c|\u201d|\u2019","")
sampleustwitter <- stri_replace_all_regex(sampleustwitter, "\u2018|\u2026|\u201c|\u201d|\u2019","")
write.csv(sampleBlogs, file = "./sampleBlogs1.csv", row.names = FALSE)
write.csv(sampleusnews, file = "./sampleusnews1.csv", row.names = FALSE)
write.csv(sampleustwitter, file = "./sampleustwitter1.csv", row.names = FALSE)
The sampled files are then loaded into a corpus and cleaned.
corpus <- Corpus(DirSource("./"), readerControl = list(reader=readPlain, language="en_US"))
# Drop non-ASCII characters, then convert all text to lower case
corpusClean <- tm_map(corpus, content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = "")))
corpusClean <- tm_map(corpusClean, content_transformer(tolower))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpusClean <- tm_map(corpusClean, toSpace, "[/@:()*&!?_#$|-]") ## replace special characters with a space
# Remove common word endings for English (stemming)
corpusClean <-tm_map(corpusClean, stemDocument)
# Remove URLs
removeURL <- function(x) gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", "", x)
corpusClean <- tm_map(corpusClean, content_transformer(removeURL))
# Remove digits
removedigit <- function(x) gsub("[[:digit:]]", "", x)
corpusClean <- tm_map(corpusClean, content_transformer(removedigit))
corpusClean <- tm_map(corpusClean, removePunctuation)
# Collapse repeated whitespace into single spaces
corpusClean <- tm_map(corpusClean, stripWhitespace)
# Remove a few frequent words explicitly, then the standard English stop words (for, very, and, of, are, etc.)
corpusClean <- tm_map(corpusClean, removeWords, c("the", "will", "The", "also", "that", "and", "for", "in", "is", "it", "not", "to"))
corpusClean <- tm_map(corpusClean, removeWords, stopwords("english"))
# The document-term matrix
dtm <- DocumentTermMatrix(corpusClean)
dtm
## <<DocumentTermMatrix (documents: 5, terms: 24985)>>
## Non-/sparse entries: 33543/91382
## Sparsity : 73%
## Maximal term length: 126105
## Weighting : term frequency (tf)
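As a quick check on the matrix (not part of the analysis above), the most frequent terms can be listed with tm's findFreqTerms(); the threshold of 1000 occurrences is an arbitrary assumption.
# Terms that occur at least 1000 times across the sample documents
findFreqTerms(dtm, lowfreq = 1000)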
# The term-document matrix (the transpose of the document-term matrix)
tdm <- TermDocumentMatrix(corpusClean)
tdm
## <<TermDocumentMatrix (terms: 24985, documents: 5)>>
## Non-/sparse entries: 33543/91382
## Sparsity : 73%
## Maximal term length: 126105
## Weighting : term frequency (tf)
# Word cloud of the 100 most frequent terms in the cleaned corpus
wordcloud(corpusClean, max.words=100, random.order=TRUE, rot.per=.15, colors=colorRampPalette(brewer.pal(4,"Set1"))(32), scale=c(3, .3))
# Term frequencies across the corpus, sorted in decreasing order
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
wf <- data.frame(word=names(freq), freq=freq)
# Bar chart of terms that appear more than 1500 times
subset(wf, freq > 1500) %>%
ggplot(aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat="identity") +
theme(axis.text.x=element_text(angle=45, hjust=1))+
xlab("word")