Download the data and load it into the environment.
# Download files
# The working directory below is specific to the original author's machine.
# It is only applied when it exists, so the script does not abort elsewhere;
# in that case every relative path resolves against the current directory.
project_dir <- "C:/Users/shiqyang/Documents/Data Science Course/Code"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
capstoneDatasetUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFileName <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFileName)) {
  # mode = "wb" keeps the archive byte-exact regardless of platform.
  download.file(capstoneDatasetUrl, zipFileName, method = "auto", mode = "wb")
}
# Define file paths and names
fileblog <- "final/en_US/en_US.blogs.txt"
filetwit <- "final/en_US/en_US.twitter.txt"
filenews <- "final/en_US/en_US.news.txt"
# Unzip the files (only when at least one of the three inputs is missing)
if (!all(file.exists(c(fileblog, filetwit, filenews)))) {
  unzip(zipFileName)
}
# Load the data into memory
# Read every line of a UTF-8 text file.
#   path: path of the file to read.
# Returns a character vector with one element per line.
# The file is read through a binary-mode connection with skipNul = TRUE:
#   * skipNul drops the embedded nul bytes that previously produced
#     "appears to contain an embedded nul" warnings on the twitter file;
#   * binary mode avoids a text-mode read stopping early at a control
#     character, which is the likely cause of the earlier
#     "incomplete final line" warning on the news file -- TODO confirm by
#     comparing length(news) with the line count of the file on disk.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
blogs <- read_text_lines(fileblog)
twitter <- read_text_lines(filetwit)
news <- read_text_lines(filenews)
Basic report of summary statistics about the data sets, and any interesting findings amassed so far.
#install.packages("NLP")
#install.packages("tm")
#install.packages("wordcloud")
#install.packages("ngram")
library(stringi)
library(ggplot2)
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm)
## Warning: package 'tm' was built under R version 3.6.2
#library(RWeka)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.2
## Loading required package: RColorBrewer
library(ngram)
# Total number of words in each data set. The "## [1] ..." lines are the
# values printed by the original run.
# NOTE(review): the news total is an order of magnitude below the other two;
# this may mean the news file was only partially read -- verify.
wordcount(x = blogs)
## [1] 37334131
wordcount(x = news)
## [1] 2643969
wordcount(x = twitter)
## [1] 30373543
# sampling data
# Draw a reproducible 0.1% random sample of lines from each source and pool
# them into a single character vector.
set.seed(12345)
test_data <- c(
  sample(blogs, length(blogs) * 0.001),
  sample(news, length(news) * 0.001),
  sample(twitter, length(twitter) * 0.001)
)
# clean data
# Drop every character that cannot be represented in ASCII.
testdata <- iconv(test_data, "UTF-8", "ASCII", sub = "")
sample_corpus <- VCorpus(VectorSource(testdata))
# tolower() is a plain character function, so it is wrapped in
# content_transformer() to keep each element a text document. This replaces
# the former trailing tm_map(..., PlainTextDocument) step, which only existed
# to re-wrap the documents after a bare tolower.
sample_corpus <- tm_map(sample_corpus, content_transformer(tolower))
sample_corpus <- tm_map(sample_corpus, removePunctuation)
sample_corpus <- tm_map(sample_corpus, removeNumbers)
# Whitespace is collapsed last so that gaps left behind by the removed
# punctuation and digits are cleaned up as well.
sample_corpus <- tm_map(sample_corpus, stripWhitespace)
# Create some 1-gram, 2-gram, and 3-gram tokenizers
# Build a tokenizer for word n-grams of a fixed size.
#   n: number of consecutive words per term.
# Returns a function of one argument `x` that splits `x` into words with
# words(), forms the n-grams with ngrams(), and returns them as an unnamed
# character vector in which the words of each n-gram are joined by one space.
make_ngram_tokenizer <- function(n) {
  force(n)
  function(x) {
    grams <- ngrams(words(x), n)
    unlist(lapply(grams, paste, collapse = " "), use.names = FALSE)
  }
}
UnigramTokenizer <- make_ngram_tokenizer(1)
BigramTokenizer <- make_ngram_tokenizer(2)
TrigramTokenizer <- make_ngram_tokenizer(3)
# Run the corpora through the tokenizers.
# Build the control list shared by the three term-document matrices.
#   tokenizer: function turning one document into a character vector of terms.
# wordLengths is set explicitly to c(1, Inf): the package default keeps only
# terms of at least three characters, which silently removes very common
# one- and two-letter words (e.g. "a", "i", "to", "of") from the unigram counts.
tdm_control <- function(tokenizer) {
  list(tokenize = tokenizer, wordLengths = c(1, Inf))
}
btdm1 <- TermDocumentMatrix(sample_corpus, control = tdm_control(UnigramTokenizer))
btdm2 <- TermDocumentMatrix(sample_corpus, control = tdm_control(BigramTokenizer))
btdm3 <- TermDocumentMatrix(sample_corpus, control = tdm_control(TrigramTokenizer))
# Compute term frequencies and draw a bar plot of the most frequent terms.
#
# Arguments:
#   theCorpus: an object that as.matrix() converts to a numeric matrix with
#              one row per term (row names are the terms) -- in this script a
#              term-document matrix.
#   top_n:     number of most frequent terms shown in the bar plot
#              (default 10, the previously hard-coded value).
#   main:      title of the bar plot (default: the previously hard-coded title).
#
# Returns a data frame with columns `word` (character) and `freq` (row sums),
# sorted by decreasing frequency and covering all terms, not only the plotted
# ones.
showCorpusInfo <- function(theCorpus, top_n = 10, main = "Most frequent words") {
  m <- as.matrix(theCorpus)
  v <- sort(rowSums(m), decreasing = TRUE)
  # stringsAsFactors = FALSE keeps `word` a character column on every R version.
  d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)
  # head() never pads with NA rows when fewer than `top_n` terms exist,
  # unlike indexing with 1:top_n.
  top <- head(d, top_n)
  if (nrow(top) > 0) {
    barplot(top$freq,
      las = 2, names.arg = top$word,
      col = "lightblue", main = main,
      ylab = "Word frequencies"
    )
  }
  d
}
# Plot a word cloud from a frequency table.
#
# Arguments:
#   d:         data frame with a `word` column (terms) and a `freq` column
#              (their counts).
#   min_freq:  terms with a lower frequency are not plotted (default 40, the
#              previously hard-coded value).
#   max_words: maximum number of terms plotted (default 200, as before).
#
# Returns whatever wordcloud() returns; called for its plotting side effect.
createAcloud <- function(d, min_freq = 40, max_words = 200) {
  wordcloud(
    words = d$word, freq = d$freq, min.freq = min_freq,
    max.words = max_words, random.order = FALSE, rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
}
# Bar plots of the ten most frequent 1-grams, 2-grams and 3-grams; each call
# also returns the full frequency table for its n-gram size.
d1 <- showCorpusInfo(theCorpus = btdm1)

d2 <- showCorpusInfo(theCorpus = btdm2)

d3 <- showCorpusInfo(theCorpus = btdm3)

# Word clouds for the 1-gram and 2-gram frequency tables, in that order.
invisible(lapply(list(d1, d2), createAcloud))
