library(tm)
## Loading required package: NLP
# Fetch the Coursera-SwiftKey archive (only if it is not already on disk),
# extract it (only if the extracted files are missing), and load the English
# news file into `news`, one element per line.
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(fileUrl, destfile = "Coursera-SwiftKey.zip", method = "curl")
}
# Extracting the archive is slow and rewrites every file, so skip it when the
# three files this script reads are already present.
if (!all(file.exists(c("final/en_US/en_US.news.txt",
                       "final/en_US/en_US.twitter.txt",
                       "final/en_US/en_US.blogs.txt")))) {
  unzip("Coursera-SwiftKey.zip")
}
# Size of the news file in MiB (bytes / 1024^2), formatted as a string.
fileSize <- format(file.info("final/en_US/en_US.news.txt")$size / (1024^2), digits = 0)
# NOTE(review): this file is opened in binary mode ("rb") while the other two
# data files use text mode ("rt") -- presumably deliberate; confirm before
# changing it.
con <- file("final/en_US/en_US.news.txt", "rb")
news <- readLines(con)
close(con)
Cleaning data: create a corpus and make it all lowercase, with no numbers and no punctuation
# Sample the data. The sample size is capped at the number of available lines
# so that sample() does not fail when fewer than 10000 lines were read.
news <- sample(news, min(10000, length(news)))
# Create corpus
corpus <- Corpus(VectorSource(news))
# To lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words BEFORE stripping punctuation: the English stop word list
# contains contractions such as "don't" and "i'm". Once the apostrophes are
# removed they become "dont" / "im", which no longer match the list and would
# be left in the corpus.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Remove punctuation marks
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Collapse the runs of whitespace left behind by the removals above
corpus <- tm_map(corpus, stripWhitespace)
Create a term matrix and count terms
# Count how often each term occurs across all sampled news lines and plot the
# ten most frequent terms.
#
# The term-document matrix is stored sparsely as triplets (i = term index,
# j = document index, v = count). Summing v grouped by i gives the same totals
# as rowSums(as.matrix(tdm)) without allocating a dense terms x documents
# matrix, which can need several gigabytes for 10000 documents.
frequentWords <- local({
  tdm <- TermDocumentMatrix(corpus)
  termCounts <- vapply(split(tdm$v, tdm$i), sum, numeric(1))
  # split() names each group by its term index; map those back to the terms.
  names(termCounts) <- Terms(tdm)[as.integer(names(termCounts))]
  head(sort(termCounts, decreasing = TRUE), 10)
})
barplot(frequentWords,
        main = "News Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
Twitter data:
# Size of the Twitter file in MiB (bytes / 1024^2), formatted as a string.
fileSize <- format(file.info("final/en_US/en_US.twitter.txt")$size / (1024^2), digits = 0)
# Load the Twitter file into `twitter`, one element per line.
con <- file("final/en_US/en_US.twitter.txt", "rt")
# A few lines of this file contain embedded nul bytes (readLines() previously
# warned about lines 167155, 268547, 1274086 and 1759032). skipNul = TRUE
# drops the nul bytes and keeps the rest of each affected line, instead of
# warning and cutting the line short at the nul.
twitter <- readLines(con, skipNul = TRUE)
close(con)
Cleaning data: create a corpus and make it all lowercase, with no numbers and no punctuation
# Sample the data. The sample size is capped at the number of available lines
# so that sample() does not fail when fewer than 10000 lines were read.
twitter <- sample(twitter, min(10000, length(twitter)))
# Create corpus
corpus <- Corpus(VectorSource(twitter))
# To lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words BEFORE stripping punctuation: the English stop word list
# contains contractions such as "don't" and "i'm". Once the apostrophes are
# removed they become "dont" / "im", which no longer match the list and would
# be left in the corpus.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Remove punctuation marks
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Collapse the runs of whitespace left behind by the removals above
corpus <- tm_map(corpus, stripWhitespace)
Now that the corpus is clean, we can create a term matrix and find the most frequent terms
# Count how often each term occurs across all sampled tweets and plot the ten
# most frequent terms.
#
# Note: TermDocumentMatrix() is called with its default settings, so no
# minimum word length option is passed here. By default tm only keeps terms of
# at least three characters; pass control = list(wordLengths = c(1, Inf)) if
# one- and two-letter words should be counted as well.
#
# The term-document matrix is stored sparsely as triplets (i = term index,
# j = document index, v = count). Summing v grouped by i gives the same totals
# as rowSums(as.matrix(tdm)) without allocating a dense terms x documents
# matrix, which can need several gigabytes for 10000 documents.
frequentWords <- local({
  tdm <- TermDocumentMatrix(corpus)
  termCounts <- vapply(split(tdm$v, tdm$i), sum, numeric(1))
  # split() names each group by its term index; map those back to the terms.
  names(termCounts) <- Terms(tdm)[as.integer(names(termCounts))]
  head(sort(termCounts, decreasing = TRUE), 10)
})
barplot(frequentWords,
        main = "Twitter Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
# Blog data: record the file size in MiB (bytes / 2^20) as a formatted string,
# then read every line of the blogs file into `blogs` through a text-mode
# connection that is closed once reading is done.
fileSize <- format(file.size("final/en_US/en_US.blogs.txt") / 2^20, digits = 0)
con <- file(description = "final/en_US/en_US.blogs.txt", open = "rt")
blogs <- readLines(con = con)
close(con)
Cleaning data: create a corpus and make it all lowercase, with no numbers and no punctuation
# Sample the data. The sample size is capped at the number of available lines
# so that sample() does not fail when fewer than 10000 lines were read.
blogs <- sample(blogs, min(10000, length(blogs)))
# Create corpus
corpus <- Corpus(VectorSource(blogs))
# To lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words BEFORE stripping punctuation: the English stop word list
# contains contractions such as "don't" and "i'm". Once the apostrophes are
# removed they become "dont" / "im", which no longer match the list and would
# be left in the corpus.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Remove punctuation marks
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Collapse the runs of whitespace left behind by the removals above
corpus <- tm_map(corpus, stripWhitespace)
# Count how often each term occurs across all sampled blog lines and plot the
# ten most frequent terms.
#
# The term-document matrix is stored sparsely as triplets (i = term index,
# j = document index, v = count). Summing v grouped by i gives the same totals
# as rowSums(as.matrix(tdm)) without allocating a dense terms x documents
# matrix, which can need several gigabytes for 10000 documents.
frequentWords <- local({
  tdm <- TermDocumentMatrix(corpus)
  termCounts <- vapply(split(tdm$v, tdm$i), sum, numeric(1))
  # split() names each group by its term index; map those back to the terms.
  names(termCounts) <- Terms(tdm)[as.integer(names(termCounts))]
  head(sort(termCounts, decreasing = TRUE), 10)
})
barplot(frequentWords,
        main = "Blog Data: Most Frequent Words",
        xlab = "Word",
        ylab = "Count")
More models: n-grams (bigrams and trigrams). Create a prediction model.