Instructions

The motivation for this project is to:

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Download the data and successfully load it into the environment.

# Download files
setwd("C:/Users/shiqyang/Documents/Data Science Course/Code")
capstoneDatasetUrl<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFileName <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFileName))
        download.file(capstoneDatasetUrl, zipFileName, method = "auto")

# Define file paths and names
fileblog <- "final/en_US/en_US.blogs.txt"
filetwit <- "final/en_US/en_US.twitter.txt"
filenews <- "final/en_US/en_US.news.txt"

# Unzip the files
if (!file.exists(fileblog) || !file.exists(filetwit) || !file.exists(filenews) )
    unzip(zipFileName)

# Load the data into memory
blogs   <- readLines(fileblog, encoding="UTF-8")
twitter <- readLines(filetwit, encoding="UTF-8")
## Warning in readLines(filetwit, encoding = "UTF-8"): line 167155 appears to
## contain an embedded nul
## Warning in readLines(filetwit, encoding = "UTF-8"): line 268547 appears to
## contain an embedded nul
## Warning in readLines(filetwit, encoding = "UTF-8"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(filetwit, encoding = "UTF-8"): line 1759032 appears to
## contain an embedded nul
news    <- readLines(filenews, encoding="UTF-8")
## Warning in readLines(filenews, encoding = "UTF-8"): incomplete final line
## found on 'final/en_US/en_US.news.txt'
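The warnings above are benign for this report, but they can be avoided. A possible alternative (a sketch, not part of the run above): skipNul = TRUE drops the embedded nul characters in the Twitter file, and reading the news file through a binary connection avoids the incomplete-final-line warning.

# Optional: re-read the problem files without warnings (sketch, not run here)
twitter <- readLines(filetwit, encoding = "UTF-8", skipNul = TRUE)
con <- file(filenews, open = "rb")
news <- readLines(con, encoding = "UTF-8")
close(con)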

Basic report of summary statistics about the data sets, and interesting findings amassed so far

#install.packages("NLP")
#install.packages("tm")
#install.packages("wordcloud")
#install.packages("ngram")
library(stringi)
library(ggplot2)
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(tm)
## Warning: package 'tm' was built under R version 3.6.2
#library(RWeka)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.2
## Loading required package: RColorBrewer
library(ngram)

# count the number of words in each dataset
wordcount(blogs)
## [1] 37334131
wordcount(news)
## [1] 2643969
wordcount(twitter)
## [1] 30373543
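The raw counts above can also be summarized per source. The sketch below (not part of the original run, so its output is not shown) uses stringi, which is already loaded, to tabulate line and word counts side by side.

# Optional per-source summary table (sketch; output not shown)
data.frame(
    source = c("blogs", "news", "twitter"),
    lines  = c(length(blogs), length(news), length(twitter)),
    words  = c(sum(stri_count_words(blogs)),
               sum(stri_count_words(news)),
               sum(stri_count_words(twitter)))
)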
# Take a 0.1% sample of each data source
set.seed(12345)
test_data <- c(sample(blogs, length(blogs) * 0.001),
              sample(news, length(news) * 0.001),
              sample(twitter, length(twitter) * 0.001)
          )

# Clean the data
testdata <- iconv(test_data, "UTF-8", "ASCII", sub = "")
sample_corpus <- VCorpus(VectorSource(testdata))
# tolower is a base function, not a tm transformation, so wrap it in
# content_transformer() to keep the documents as PlainTextDocument objects
sample_corpus <- tm_map(sample_corpus, content_transformer(tolower))
sample_corpus <- tm_map(sample_corpus, stripWhitespace)
sample_corpus <- tm_map(sample_corpus, removePunctuation)
sample_corpus <- tm_map(sample_corpus, removeNumbers)
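A quick sanity check (a sketch; output not shown) that the transformations took effect is to print the first cleaned document.

# Sanity check: inspect the first cleaned document
writeLines(as.character(sample_corpus[[1]]))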

# Create some 1-gram, 2-gram, and 3-gram tokenizers
UnigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
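To illustrate what these tokenizers produce, applying the bigram tokenizer to a hypothetical four-word document should yield three overlapping word pairs:

# Illustration only (hypothetical input, not part of the analysis)
BigramTokenizer(PlainTextDocument("the quick brown fox"))
# expected: "the quick" "quick brown" "brown fox"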

# Run the corpus through each tokenizer.
btdm1 <- TermDocumentMatrix(sample_corpus, control = list(tokenize = UnigramTokenizer))
btdm2 <- TermDocumentMatrix(sample_corpus, control = list(tokenize = BigramTokenizer))
btdm3 <- TermDocumentMatrix(sample_corpus, control = list(tokenize = TrigramTokenizer))
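The trigram matrix in particular grows quickly with sample size. If memory becomes a problem, tm's removeSparseTerms() could prune rarely occurring terms (an optional step, not used in this report):

# Optional: drop very sparse trigrams to save memory (not used below)
btdm3_small <- removeSparseTerms(btdm3, 0.9999)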


# Create a function that computes term frequencies from a term-document
# matrix and draws a bar plot of the 10 most frequent terms.
showCorpusInfo <- function(theTDM)
{
    m <- as.matrix(theTDM)
    v <- sort(rowSums(m), decreasing = TRUE)
    d <- data.frame(word = names(v), freq = v)
    barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")
    return(d)
}

# Create a function to plot a word cloud.
createAcloud <- function(d)
{
    minf = 40
    wordcloud(words = d$word, freq = d$freq, min.freq = minf,
                  max.words=200, random.order=FALSE, rot.per=0.35, 
                  colors=brewer.pal(8, "Dark2"))
}

# Plot the top 10 1-grams, 2-grams and 3-grams

d1<-showCorpusInfo(btdm1)

d2<-showCorpusInfo(btdm2)

d3<-showCorpusInfo(btdm3)

# Plot word clouds for the 1-grams and 2-grams
createAcloud(d1)

createAcloud(d2)

Plans for creating a prediction algorithm and Shiny app

The bar plots and word clouds show that a small set of common words dominates all three sources. My next steps are:

Research other data-cleansing techniques to find a good balance between removing noise and keeping useful signal.

Change the sample size and compare the results under different cleansing techniques.

Research different prediction models, for example n-gram models with back-off and smoothing (a first naive sketch follows below).
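As a deliberately naive sketch of the prediction idea (not the final algorithm), the trigram frequency table d3 built above could be used to look up the most frequent completion of a two-word prefix. A real model will need back-off to bigrams and unigrams, plus smoothing for unseen n-grams.

# Naive next-word lookup in the trigram table d3 (sketch only; assumes the
# prefix contains no regex metacharacters and d3 is sorted by frequency)
predictNext <- function(prefix, d3)
{
    hits <- d3[grepl(paste0("^", prefix, " "), d3$word), ]
    if (nrow(hits) == 0) return(NA_character_)
    # return the last word of the most frequent matching trigram
    tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1)
}
# example usage: predictNext("one of", d3)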