Instructions

The motivation for this project is to:

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Download the data and successfully load it into the environment.

# Download files
setwd("C:/Users/shiqyang/Documents/Data Science Course/Code")
capstoneDatasetUrl<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFileName <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFileName))
        download.file(capstoneDatasetUrl, zipFileName, method = "auto")

# Define file paths and names
fileblog <- "final/en_US/en_US.blogs.txt"
filetwit <- "final/en_US/en_US.twitter.txt"
filenews <- "final/en_US/en_US.news.txt"

# Unzip the files
if (!file.exists(fileblog) || !file.exists(filetwit) || !file.exists(filenews) )
    unzip(zipFileName)

# Load the data into memory
blogs   <- readLines(fileblog, encoding="UTF-8")
twitter <- readLines(filetwit, encoding="UTF-8")
## Warning in readLines(filetwit, encoding = "UTF-8"): line 167155 appears to
## contain an embedded nul
## Warning in readLines(filetwit, encoding = "UTF-8"): line 268547 appears to
## contain an embedded nul
## Warning in readLines(filetwit, encoding = "UTF-8"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines(filetwit, encoding = "UTF-8"): line 1759032 appears to
## contain an embedded nul
news    <- readLines(filenews, encoding="UTF-8")
## Warning in readLines(filenews, encoding = "UTF-8"): incomplete final line
## found on 'final/en_US/en_US.news.txt'
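The warnings above are benign for this report, but they can be avoided. A possible alternative (a sketch, not part of the run above): skipNul = TRUE drops the embedded nul characters in the Twitter file, and reading the news file through a binary connection avoids the incomplete-final-line warning.

# Optional: re-read the problem files without warnings (sketch, not run here)
twitter <- readLines(filetwit, encoding = "UTF-8", skipNul = TRUE)
con <- file(filenews, open = "rb")
news <- readLines(con, encoding = "UTF-8")
close(con)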

Basic report of summary statistics about the data sets, and interesting findings amassed so far

#install.packages("NLP")
#install.packages("tm")
#install.packages("wordcloud")
#install.packages("ngram")
library(stringi)
library(ggplot2)
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(tm)
## Warning: package 'tm' was built under R version 3.6.2
#library(RWeka)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.2
## Loading required package: RColorBrewer
library(ngram)

# count the number of words in each dataset
wordcount(blogs)
## [1] 37334131
wordcount(news)
## [1] 2643969
wordcount(twitter)
## [1] 30373543
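The raw counts above can also be summarized per source. The sketch below (not part of the original run, so its output is not shown) uses stringi, which is already loaded, to tabulate line and word counts side by side.

# Optional per-source summary table (sketch; output not shown)
data.frame(
    source = c("blogs", "news", "twitter"),
    lines  = c(length(blogs), length(news), length(twitter)),
    words  = c(sum(stri_count_words(blogs)),
               sum(stri_count_words(news)),
               sum(stri_count_words(twitter)))
)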
# Take a 0.1% sample of each data source
set.seed(12345)
test_data <- c(sample(blogs, length(blogs) * 0.001),
              sample(news, length(news) * 0.001),
              sample(twitter, length(twitter) * 0.001)
          )

# Clean the data
testdata <- iconv(test_data, "UTF-8", "ASCII", sub = "")
sample_corpus <- VCorpus(VectorSource(testdata))
# tolower is a base function, not a tm transformation, so wrap it in
# content_transformer() to keep the documents as PlainTextDocument objects
sample_corpus <- tm_map(sample_corpus, content_transformer(tolower))
sample_corpus <- tm_map(sample_corpus, stripWhitespace)
sample_corpus <- tm_map(sample_corpus, removePunctuation)
sample_corpus <- tm_map(sample_corpus, removeNumbers)
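A quick sanity check (a sketch; output not shown) that the transformations took effect is to print the first cleaned document.

# Sanity check: inspect the first cleaned document
writeLines(as.character(sample_corpus[[1]]))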

# Create some 1-gram, 2-gram, and 3-gram tokenizers
UnigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
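To illustrate what these tokenizers produce, applying the bigram tokenizer to a hypothetical four-word document should yield three overlapping word pairs:

# Illustration only (hypothetical input, not part of the analysis)
BigramTokenizer(PlainTextDocument("the quick brown fox"))
# expected: "the quick" "quick brown" "brown fox"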

# Run the corpus through each tokenizer.
btdm1 <- TermDocumentMatrix(sample_corpus, control = list(tokenize = UnigramTokenizer))
btdm2 <- TermDocumentMatrix(sample_corpus, control = list(tokenize = BigramTokenizer))
btdm3 <- TermDocumentMatrix(sample_corpus, control = list(tokenize = TrigramTokenizer))
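The trigram matrix in particular grows quickly with sample size. If memory becomes a problem, tm's removeSparseTerms() could prune rarely occurring terms (an optional step, not used in this report):

# Optional: drop very sparse trigrams to save memory (not used below)
btdm3_small <- removeSparseTerms(btdm3, 0.9999)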


# Create a function that computes term frequencies from a term-document
# matrix and draws a bar plot of the 10 most frequent terms.
showCorpusInfo <- function(theTDM)
{
    m <- as.matrix(theTDM)
    v <- sort(rowSums(m), decreasing = TRUE)
    d <- data.frame(word = names(v), freq = v)
    barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")
    return(d)
}

# Create a function to plot a word cloud.
createAcloud <- function(d)
{
    minf = 40
    wordcloud(words = d$word, freq = d$freq, min.freq = minf,
                  max.words=200, random.order=FALSE, rot.per=0.35, 
                  colors=brewer.pal(8, "Dark2"))
}

# Plot the top 10 1-grams, 2-grams and 3-grams

d1<-showCorpusInfo(btdm1)

d2<-showCorpusInfo(btdm2)

d3<-showCorpusInfo(btdm3)

# Plot word clouds for the 1-grams and 2-grams
createAcloud(d1)

createAcloud(d2)

Plans for creating a prediction algorithm and Shiny app

The bar plots and word clouds show that a small set of common words dominates all three sources. My next steps are:

Research other data-cleansing techniques to find a good balance between removing noise and keeping useful signal.

Change the sample size and compare the results under different cleansing techniques.

Research different prediction models, for example n-gram models with back-off and smoothing (a first naive sketch follows below).
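As a deliberately naive sketch of the prediction idea (not the final algorithm), the trigram frequency table d3 built above could be used to look up the most frequent completion of a two-word prefix. A real model will need back-off to bigrams and unigrams, plus smoothing for unseen n-grams.

# Naive next-word lookup in the trigram table d3 (sketch only; assumes the
# prefix contains no regex metacharacters and d3 is sorted by frequency)
predictNext <- function(prefix, d3)
{
    hits <- d3[grepl(paste0("^", prefix, " "), d3$word), ]
    if (nrow(hits) == 0) return(NA_character_)
    # return the last word of the most frequent matching trigram
    tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1)
}
# example usage: predictNext("one of", d3)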