Overview

The goal of this project is to demonstrate that you have gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that you have downloaded the data and successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Load Libraries for Environment Setup

library(ggplot2)
library(knitr)
library(RWeka)
library(stringi)
library(SnowballC)
library(tm)
library(dplyr)

Load and Summarize Data

# Download the SwiftKey data set once and extract it into ./final/
if (!file.exists("Coursera-SwiftKey.zip")){
        download.file(url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                      destfile = "Coursera-SwiftKey.zip")
        unzip("Coursera-SwiftKey.zip")
}

blogs <- readLines("./final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines("./final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("./final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
stats <- data.frame(
        FileName = c("blogs", "news", "twitter"),
        # Note: object.size() reports the in-memory size, not the on-disk file size
        FileSize = sapply(list(blogs, news, twitter), function(x){format(object.size(x), "MB")}),
        t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
                Words = sapply(list(blogs, news, twitter), stri_stats_latex)[4,])
        )
)

stats
##   FileName FileSize   Lines LinesNEmpty     Chars CharsNWhite    Words
## 1    blogs 255.4 Mb  899288      899288 206824382   170389539 37570839
## 2     news  19.8 Mb   77259       77259  15639408    13072698  2651432
## 3  twitter   319 Mb 2360148     2360148 162096241   134082806 30451170
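
Note that en_US.news.txt is reported with far fewer lines (77,259) than expected for a file of its size; this is most likely because readLines() stops at an embedded control character when the file is opened in text mode (a known issue on Windows). A possible workaround, sketched here under that assumption, is to read the file through a binary connection:

# Reading through a binary connection avoids early truncation at embedded control characters
con <- file("./final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)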

Extract Data Sample

set.seed(1001)
sampleSize <- 0.01

blogsSub <- sample(blogs, length(blogs) * sampleSize)
newsSub <- sample(news, length(news) * sampleSize)
twitterSub <- sample(twitter, length(twitter) * sampleSize)

# Combine the three subsets into the single sample used to build the corpus
sampleData <- c(blogsSub, newsSub, twitterSub)

sampleStats <- data.frame(
        FileName = c("blogsSub", "newsSub", "twitterSub", "sampleData"),
        FileSize = sapply(list(blogsSub, newsSub, twitterSub, sampleData), function(x){format(object.size(x), "MB")}),
        t(rbind(sapply(list(blogsSub, newsSub, twitterSub, sampleData), stri_stats_general),
                Words = sapply(list(blogsSub, newsSub, twitterSub, sampleData), stri_stats_latex)[4,])
        )
)

sampleStats
##     FileName FileSize Lines LinesNEmpty   Chars CharsNWhite  Words
## 1   blogsSub   2.6 Mb  8992        8992 2083795     1717050 377945
## 2    newsSub   0.2 Mb   772         772  154332      128930  26336
## 3 twitterSub   3.2 Mb 23601       23601 1621004     1340228 304838
## 4 sampleData     6 Mb 33365       33365 3835341     3167327 704226

Clean Data by Building a Corpus

corpus <- VCorpus(VectorSource(sampleData))
cleanCorpus <- tm_map(corpus, content_transformer(tolower))   # lower-case all text
cleanCorpus <- tm_map(cleanCorpus, removePunctuation)         # strip punctuation
cleanCorpus <- tm_map(cleanCorpus, removeNumbers)             # strip digits
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace)           # collapse repeated whitespace
cleanCorpus <- tm_map(cleanCorpus, PlainTextDocument)         # ensure documents remain plain text
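
To illustrate what these transformations do, the same tm functions can be applied directly to a single made-up string (a hypothetical example, not drawn from the corpus):

example <- "Check out 3 DOGS, 2 cats!!"          # hypothetical input for illustration
removeNumbers(removePunctuation(tolower(example)))
# yields "check out  dogs  cats"; stripWhitespace() then collapses the doubled spaces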

Construct and Analyze N-grams

# Define RWeka n-gram tokenizers for unigrams, bigrams, and trigrams
uniTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uniMatrix <- TermDocumentMatrix(cleanCorpus, control = list(tokenize = uniTokenizer))
biMatrix <- TermDocumentMatrix(cleanCorpus, control = list(tokenize = biTokenizer))
triMatrix <- TermDocumentMatrix(cleanCorpus, control = list(tokenize = triTokenizer))
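
A quick sanity check at this point is to compare the dimensions of the three matrices (distinct n-grams by documents):

# Rows are distinct n-grams, columns are documents in the sample corpus
sapply(list(unigrams = uniMatrix, bigrams = biMatrix, trigrams = triMatrix), dim)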

# Frequency analysis: keep only n-grams that appear at least 20 times in the sample
uniCorpus <- findFreqTerms(uniMatrix, lowfreq = 20)
biCorpus <- findFreqTerms(biMatrix, lowfreq = 20)
triCorpus <- findFreqTerms(triMatrix, lowfreq = 20)

uniCorpusFreq <- rowSums(as.matrix(uniMatrix[uniCorpus,]))
uniCorpusFreq <- data.frame(word = names(uniCorpusFreq), frequency = uniCorpusFreq)
biCorpusFreq <- rowSums(as.matrix(biMatrix[biCorpus,]))
biCorpusFreq <- data.frame(word = names(biCorpusFreq), frequency = biCorpusFreq)
triCorpusFreq <- rowSums(as.matrix(triMatrix[triCorpus,]))
triCorpusFreq <- data.frame(word = names(triCorpusFreq), frequency = triCorpusFreq)
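
Converting a TermDocumentMatrix to a dense matrix can be memory-intensive, which is why only the frequent terms are subset above. Should that still become a bottleneck, sparse row sums are available through the slam package that tm already depends on; a sketch:

# Sparse row sums over the full vocabulary, without building a dense matrix
uniAllFreq <- slam::row_sums(uniMatrix)
head(sort(uniAllFreq, decreasing = TRUE))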

head(uniCorpusFreq)
##            word frequency
## “the       “the        60
## “we         “we        29
## ability ability        36
## able       able       190
## about     about      2176
## above     above       107
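
The first rows show tokens such as “the and “we: typographic (curly) quotes survive the cleaning because removePunctuation() only strips ASCII punctuation by default. Assuming a reasonably recent version of tm, the punctuation step above could instead use Unicode character properties:

# ucp = TRUE removes Unicode punctuation (curly quotes, dashes, ellipses) as well
cleanCorpus <- tm_map(cleanCorpus, removePunctuation, ucp = TRUE)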

Graphical Representation of Data

# Sort the frequency tables in descending order of frequency for plotting
uniCorpusFreqDescend <- arrange(uniCorpusFreq, desc(frequency))
biCorpusFreqDescend <- arrange(biCorpusFreq, desc(frequency))
triCorpusFreqDescend <- arrange(triCorpusFreq, desc(frequency))
uniBar <- ggplot(data = uniCorpusFreqDescend[1:20, ], aes(x = reorder(word, -frequency), y = frequency)) +
        geom_bar(stat = "identity", fill = "black") +
        xlab("Terms") +
        ylab("Frequency") +
        ggtitle("Unigram Frequency") +
        theme(plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 60, hjust = 1))
biBar <- ggplot(data = biCorpusFreqDescend[1:20, ], aes(x = reorder(word, -frequency), y = frequency)) +
        geom_bar(stat = "identity", fill = "purple") +
        xlab("Terms") +
        ylab("Frequency") +
        ggtitle("Bigram Frequency") +
        theme(plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 60, hjust = 1))
triBar <- ggplot(data = triCorpusFreqDescend[1:20, ], aes(x = reorder(word, -frequency), y = frequency)) +
        geom_bar(stat = "identity", fill = "red") +
        xlab("Terms") +
        ylab("Frequency") +
        ggtitle("Trigram Frequency") +
        theme(plot.title = element_text(hjust = 0.5),
              axis.text.x = element_text(angle = 60, hjust = 1))
uniBar

biBar

triBar

Hence, looking at the three diagrams, one can determine which words or groups of words occur most frequently across the sampled blog, news, and Twitter text.
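
These n-gram frequency tables are the starting point for the planned prediction algorithm: given the last one or two words a user has typed, the idea is to look up the most frequent matching trigram, back off to bigrams, and finally to the most frequent unigrams. A minimal sketch of that back-off lookup, using a hypothetical helper predictNext() and the tables built above (not the final implementation):

# Hypothetical back-off lookup over the frequency tables built above
predictNext <- function(phrase, n = 3) {
        words <- unlist(strsplit(tolower(phrase), "\\s+"))
        last2 <- paste(tail(words, 2), collapse = " ")
        last1 <- tail(words, 1)
        # 1. trigrams whose first two words match the end of the phrase
        hits <- triCorpusFreqDescend[startsWith(triCorpusFreqDescend$word, paste0(last2, " ")), ]
        # 2. back off to bigrams whose first word matches the last word typed
        if (nrow(hits) == 0)
                hits <- biCorpusFreqDescend[startsWith(biCorpusFreqDescend$word, paste0(last1, " ")), ]
        # 3. final fallback: the most frequent single words
        if (nrow(hits) == 0)
                return(head(uniCorpusFreqDescend$word, n))
        # return only the predicted (final) word of each matching n-gram
        sapply(strsplit(head(hits$word, n), " "), tail, 1)
}
predictNext("one of the")

The Shiny app would then wrap such a lookup behind a simple text box, returning the top few suggestions as the user types.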

END