The goal of this project is to show that we have become familiar with the data and that we are on track to create the prediction algorithm.
Loading all required libraries:
library(R.utils)
library(tm)
library(SnowballC)
library(ggplot2)
library(gridExtra)
library(RWeka)
library(textcat)
library(stringr)
library(stringi)
Loading source code for different functions needed in the project:
#Load modules to draw data
source("DrawFunctions.R")
#Load modules to format data and clean corpus data
source("TextFunctions.R")
We will download the files to a local directory and clean them by removing punctuation and non-alphanumeric symbols.
datasetDirectory <- "./dataset/final"
sampleCleaned <- function(sampleInput, badwords)
{
    # Replace punctuation and any remaining non-alphanumeric symbols with spaces
    sampleInput <- gsub(pattern = "[[:punct:]]", x = sampleInput, replacement = " ")
    sampleInput <- gsub(pattern = "[^[:alnum:]]", x = sampleInput, replacement = " ")
    # Remove each profanity word, matched on word boundaries, ignoring case
    for (i in badwords) {
        pat <- paste0("\\<", i, "\\>")
        sampleInput <- gsub(pattern = pat, x = sampleInput, replacement = "", ignore.case = TRUE)
    }
    return(sampleInput)
}
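As a quick sanity check, here is how sampleCleaned behaves on made-up input (the example strings and the one-word bad-word list are purely illustrative, not part of the dataset):

# Illustrative inputs only
example <- c("Hello, world! This is a #test...",
             "Symbols like $%& become spaces; digits 123 survive")
sampleCleaned(example, badwords = c("test"))
# Punctuation and symbols are replaced by spaces, and the word "test" is stripped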
# Download and unpack the dataset (and the profanity list) if not already present
fileBadWords <- "http://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
if (!file.exists(datasetDirectory))
{
    dir.create("./dataset", showWarnings = FALSE)
    fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
    download.file(fileUrl, destfile = "./dataset/Coursera-SwiftKey.zip")
    unzip("./dataset/Coursera-SwiftKey.zip", exdir = "./dataset")
    download.file(fileBadWords, destfile = "./dataset/badWords.txt")
}
We will generate small sample files from the originals using a binomial distribution, in order to speed up reading and cleaning the corpus, since processing the full files is computationally expensive on a personal computer.
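The idea is that rbinom draws an independent 0/1 value for each line; coerced to a logical vector, it becomes a mask that keeps each line with the given probability. A toy example (illustrative values only):

# Keep each of 10 lines with probability 0.3
set.seed(1)                             # for a reproducible toy draw
mask <- as.logical(rbinom(10, 1, 0.3))  # TRUE ~ keep, FALSE ~ drop
letters[1:10][mask]                     # subset by the mask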
fileBlogs <- "./dataset/final/en_US/en_US.blogs.txt"
fileNews <- "./dataset/final/en_US/en_US.news.txt"
fileTwitter <- "./dataset/final/en_US/en_US.twitter.txt"
fileSampleBlogs <- "./dataset/sample/en_US.sample.blogs.txt"
fileSampleNews <- "./dataset/sample/en_US.sample.news.txt"
fileSampleTwitter <- "./dataset/sample/en_US.sample.twitter.txt"
badwords <- readLines(fileBadWords)
# Check whether the sample files already exist
if (!file.exists(fileSampleBlogs) || !file.exists(fileSampleNews) || !file.exists(fileSampleTwitter))
{
    dir.create("./dataset/sample", showWarnings = FALSE)
    linesBlogs <- as.integer(countLines(fileBlogs))
    linesNews <- as.integer(countLines(fileNews))
    linesTwitter <- as.integer(countLines(fileTwitter))
    # Draw logical masks keeping roughly 1% of blog/Twitter lines and 10% of news lines
    set.seed(12345)  # for reproducible samples
    xBlogs <- as.logical(rbinom(linesBlogs, 1, 0.01))
    xNews <- as.logical(rbinom(linesNews, 1, 0.1))
    xTwitter <- as.logical(rbinom(linesTwitter, 1, 0.01))
    # Subset data
    connTwitter <- file(fileTwitter)
    sampleTwitter <- readLines(connTwitter, skipNul = TRUE, warn = FALSE)[xTwitter]
    close(connTwitter)
    # Clean the sample and write it out
    sampleTwitter <- sampleCleaned(sampleTwitter, badwords)
    connSampleTwitter <- file(fileSampleTwitter)
    write(sampleTwitter, connSampleTwitter)
    close(connSampleTwitter)
    # Subset data
    connBlogs <- file(fileBlogs)
    sampleBlogs <- readLines(connBlogs, skipNul = TRUE, warn = FALSE)[xBlogs]
    close(connBlogs)
    # Clean the sample and write it out
    sampleBlogs <- sampleCleaned(sampleBlogs, badwords)
    connSampleBlogs <- file(fileSampleBlogs)
    write(sampleBlogs, connSampleBlogs)
    close(connSampleBlogs)
    # Subset data
    connNews <- file(fileNews)
    sampleNews <- readLines(connNews, skipNul = TRUE, warn = FALSE)[xNews]
    close(connNews)
    # Clean the sample and write it out
    sampleNews <- sampleCleaned(sampleNews, badwords)
    connSampleNews <- file(fileSampleNews)
    write(sampleNews, connSampleNews)
    close(connSampleNews)
    # Delete unnecessary objects
    rm(xBlogs, xNews, xTwitter)
}
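The three subset/clean/write blocks above are identical apart from the file names and sampling rate; a small helper could remove the duplication. This is just a sketch of the same logic (makeSample is a hypothetical name, not part of the sourced modules):

# Hypothetical helper: sample, clean and write one file in a single call
makeSample <- function(fileIn, fileOut, rate, badwords) {
    n <- as.integer(countLines(fileIn))
    keep <- as.logical(rbinom(n, 1, rate))
    conn <- file(fileIn)
    sampleLines <- readLines(conn, skipNul = TRUE, warn = FALSE)[keep]
    close(conn)
    write(sampleCleaned(sampleLines, badwords), fileOut)
}
# makeSample(fileBlogs, fileSampleBlogs, 0.01, badwords)
# makeSample(fileNews, fileSampleNews, 0.1, badwords)
# makeSample(fileTwitter, fileSampleTwitter, 0.01, badwords)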
We now build the corpus from the sample files, reusing the profanity list loaded earlier. We also clean the corpus by removing punctuation, stop words, and other noise.
directorySample <- "./dataset/sample"
# Reuse the profanity list read earlier, dropping its first line
badwordsvector <- badwords[2:length(badwords)]
corpusWord <- VCorpus(DirSource(directorySample), readerControl = list(reader = readPlain, language = "en"))
corpusWord <- cleanCorpus(corpusWord)
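cleanCorpus is defined in TextFunctions.R, which is not shown here. A minimal version, assuming it applies the standard tm transformations, might look like this (a sketch, not the actual sourced code):

cleanCorpus <- function(corpus) {
    corpus <- tm_map(corpus, content_transformer(tolower))      # lower-case all text
    corpus <- tm_map(corpus, removePunctuation)                 # strip punctuation
    corpus <- tm_map(corpus, removeNumbers)                     # strip digits
    corpus <- tm_map(corpus, removeWords, stopwords("english")) # drop English stop words
    corpus <- tm_map(corpus, stripWhitespace)                   # collapse repeated spaces
    return(corpus)
}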
# Get a separate document for blogs, Twitter and news
for (i in 1:length(corpusWord))
{
    if (meta(corpusWord[[i]], "id") == "en_US.sample.blogs.txt") {
        documentBlogs <- corpusWord[i]
    }
    if (meta(corpusWord[[i]], "id") == "en_US.sample.news.txt") {
        documentNews <- corpusWord[i]
    }
    if (meta(corpusWord[[i]], "id") == "en_US.sample.twitter.txt") {
        documentTwitter <- corpusWord[i]
    }
}
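For reference, the same selection can be written without the explicit loop by collecting the document ids first:

# Equivalent, loop-free lookup of each document by its id
ids <- sapply(seq_along(corpusWord), function(i) meta(corpusWord[[i]], "id"))
documentBlogs   <- corpusWord[which(ids == "en_US.sample.blogs.txt")]
documentNews    <- corpusWord[which(ids == "en_US.sample.news.txt")]
documentTwitter <- corpusWord[which(ids == "en_US.sample.twitter.txt")]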
We now get some summaries of the different corpora (Twitter, blogs, news):
linesBlog <- readLines(fileSampleBlogs)
linesTwitter <- readLines(fileSampleTwitter)
linesNews <- readLines(fileSampleNews)
# Lines and words from files
length(linesBlog)
## [1] 9073
length(linesTwitter)
## [1] 23399
length(linesNews)
## [1] 100762
nwordsBlog <- stri_count_words(linesBlog)
nwordsTwitter <- stri_count_words(linesTwitter)
nwordsNews <- stri_count_words(linesNews)
summary(nwordsBlog)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 29.00 42.07 59.00 1197.00
summary(nwordsTwitter)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.00 12.00 12.99 19.00 39.00
summary(nwordsNews)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 1.00 3.59 1.00 543.00
# Summaries from corpus
summary(corpusWord)
## Length Class Mode
## en_US.sample.blogs.txt 2 PlainTextDocument list
## en_US.sample.news.txt 2 PlainTextDocument list
## en_US.sample.twitter.txt 2 PlainTextDocument list
inspect(corpusWord)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 1160789
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 931503
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 953279
summary(corpusWord[[1]])
## Length Class Mode
## content 9073 -none- character
## meta 7 TextDocumentMeta list
summary(corpusWord[[2]])
## Length Class Mode
## content 100762 -none- character
## meta 7 TextDocumentMeta list
summary(corpusWord[[3]])
## Length Class Mode
## content 23399 -none- character
## meta 7 TextDocumentMeta list
Finally, we will plot some of the data: the most frequent words overall and in the Twitter, blogs, and news files.
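The plots themselves are produced by the helpers in DrawFunctions.R, which are not shown here. A minimal sketch of one such frequency plot, assuming a simple TermDocumentMatrix approach, would be:

# Sketch only: top-20 most frequent words across the whole sample corpus
tdm <- TermDocumentMatrix(corpusWord)
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
top <- data.frame(word = names(freq)[1:20], count = freq[1:20])
ggplot(top, aes(x = reorder(word, count), y = count)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(x = "Word", y = "Frequency", title = "Most frequent words (all samples)")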