Problem

The first step in analyzing any new data set is figuring out (a) what data you have and (b) what standard tools and models are used for that type of data.

This exercise uses the files named LOCALE.blogs.txt, where LOCALE is each of the four locales en_US, de_DE, ru_RU and fi_FI. The data come from a corpus called HC Corpora; see the About the Corpora reading for more details. The files have been language-filtered but may still contain some foreign text.
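
A minimal sketch for locating each locale's blog file (assuming the archive is unzipped into per-locale folders, as in the Data section below; the locales and blogFiles names are mine):

locales   <- c("en_US", "de_DE", "ru_RU", "fi_FI")
blogFiles <- file.path("Coursera-SwiftKey", "final", locales,
                       paste0(locales, ".blogs.txt"))
# readLines(blogFiles[1], n = 3)   # peek at a few en_US lines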

In this capstone we will be applying data science in the area of natural language processing. As a first step toward working on this project, you should familiarize yourself with Natural Language Processing, Text Mining, and the associated tools in R.

Libraries

library(plyr)
library(magrittr)
library(stringr)
library(stringi)
library(tm)
## Loading required package: NLP
library(RWeka)
library(SnowballC)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Data

This is the training data that will serve as the basis for most of the capstone. Download the data from the Coursera site, not from external websites, using the link below:

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, "Coursera-SwiftKey.zip")
}
# Unzip the archive (skipped if already extracted)
if(!dir.exists("Coursera-SwiftKey")) {
  unzip("Coursera-SwiftKey.zip", exdir = "Coursera-SwiftKey")
}
# Define file paths
filepath.blog <- "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
filepath.twit <- "./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
filepath.news <- "./Coursera-SwiftKey/final/en_US/en_US.news.txt"
# Load the datasets into memory
con <- file(filepath.blog, "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
con <- file(filepath.twit, "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# Open the news file in binary mode so readLines gets past an embedded
# control character that otherwise truncates the file on some platforms
con <- file(filepath.news, open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
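
Before reading everything into memory it can help to check the on-disk size of each file; a quick base-R sketch, using the paths defined above:

# File sizes in megabytes
round(file.info(c(filepath.blog, filepath.twit, filepath.news))$size / 1024^2, 1)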

Exploring the data

stri_stats_general( blogs )
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   206824382   170389539
stri_stats_general( twitter )
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162096241   134082806
stri_stats_general( news )
##       Lines LinesNEmpty       Chars CharsNWhite 
##     1010242     1010242   203223154   169860866
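
For a side-by-side view, the same kind of counts can be collected into one data frame (a small convenience sketch; the line counts should match the stri_stats_general() output above):

data.frame(source = c("blogs", "twitter", "news"),
           lines  = c(length(blogs), length(twitter), length(news)),
           chars  = c(sum(nchar(blogs)), sum(nchar(twitter)), sum(nchar(news))))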

### Blogs

blog_words <- stri_count_words(blogs)
summary(blog_words)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
qplot(blog_words)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### News

news_words <- stri_count_words(news)
summary(news_words)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00
qplot(news_words)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Twitter

twit_words <- stri_count_words(twitter)
summary( twit_words)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
qplot(twit_words)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
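
The message above appears because stat_bin() defaults to 30 bins. Since the tweets here top out at 47 words, an explicit binwidth of one word (an arbitrary but natural choice) gives a cleaner histogram and silences the message:

# One bin per word count
qplot(twit_words, binwidth = 1)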

Sampling the corpus for further analysis

# Set the seed so the corpus sampling is reproducible
set.seed(1000)
# Sample 1000 lines from the blogs
sampleBlogs <- blogs[sample(1:length(blogs), 1000)]
head(sampleBlogs, 3)
## [1] "Mum perches at the back behind Dad."                       
## [2] "As you peered at the doomed and haggard faces of your men?"
## [3] "unless you're in your work cubicle or something,"
sampleTwitter <- twitter[sample(1:length(twitter), 1000)]
head(sampleTwitter,3)
## [1] "Great moments are born from great opportunity. And that's what you have tomorrow, ryan. Out of 10 tryouts you might be cut 9"               
## [2] "I have a friend with an opportunity in the fast growing pet industry. Her business has taken off and she needs help! You would work with e…"
## [3] "Why haven't the have a blackout at Staples for the playoffs? Or a purple out? Oh or old school yellow out!"
sampleNews <- news[sample(1:length(news), 1000)]
head(sampleNews, 3)
## [1] "In court petitions, both Hinmon and Dekraai objected to Burke's request to become a co-guardian. They said he is not a biological relative, he lives in New York and has had limited contact with the boy. Hinmon petitioned to replace him as a co-guardian."                                                                                                                                                                                                                                                                          
## [2] "That's why the latest municipality to join the street-legal golf cart craze is something of a shocker."                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [3] "I began to wonder (as Dmitry does every day) where I could find someone who might need a cab on a Monday afternoon. I strategized. It was also, sitting in the front seat and watching his day pass by, obvious what the pitfalls were – namely, imagine driving for hours and waiting for someone to raise a hand and get your attention. Sounds self-evident, but until you sit beside a cabbie actually doing this you don’t entirely grasp the gapping maw of time and near-existential melancholy that goes into a day of driving."
# Consolidate the sample files
sampleData <- c(sampleTwitter, sampleNews, sampleBlogs)
# Remove the original datasets from memory
rm(blogs); rm(twitter); rm(news)
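
An alternative to a fixed 1000-line sample is to keep each line with a fixed probability, so the sample scales with the size of each source. A hypothetical helper (it would have to run before the rm() calls above; the 1% rate is arbitrary):

# Keep each line independently with probability `frac`
sampleLines <- function(x, frac = 0.01) {
  x[rbinom(length(x), size = 1, prob = frac) == 1]
}
# e.g. sampleBlogs <- sampleLines(blogs)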

Cleaning the Corpus

myCorpus <- Corpus(VectorSource(sampleData))
# Custom transformation: replace every match of a pattern with a space
toSpace  <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Strip quotes, slashes, '@' and pipe characters
myCorpus <- tm_map(myCorpus, toSpace,"\"|/|@|\\|")
## Warning in tm_map.SimpleCorpus(myCorpus, toSpace, "\"|/|@|\\|"): transformation
## drops documents
myCorpus <- tm_map(myCorpus, content_transformer(stringi::stri_trans_tolower))
## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(stringi::stri_trans_tolower)): transformation drops
## documents
myCorpus <- tm_map(myCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents
myCorpus <- tm_map(myCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation drops
## documents
# Remove stop words
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents
myCorpusDF <-data.frame(text = unlist(sapply(myCorpus, identity)), 
                        stringsAsFactors = FALSE)
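
The repeated "transformation drops documents" warnings come from tm_map.SimpleCorpus() and, as far as I can tell, are spurious here: no documents are actually dropped, which is easy to verify:

# Sanity check: all 3000 sampled lines survived the transformations
length(myCorpus)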

# Tokenize the text into n-grams of size `grams` and return the `top`
# most frequent ones with their counts
findNGrams <- function(corp, grams, top) {
  ngram <- NGramTokenizer(corp, Weka_control(min = grams, max = grams,
                                             delimiters = " \\r\\n\\t.,;:\"()?!"))
  ngram <- data.frame(table(ngram))
  ngram <- ngram[order(ngram$Freq, decreasing = TRUE), ][1:top, ]
  colnames(ngram) <- c("Words", "Count")
  ngram
}

# NGramTokenizer expects a character vector, so pass the text column
monoGrams   <- findNGrams(myCorpusDF$text, 1, 100)
biGrams     <- findNGrams(myCorpusDF$text, 2, 100)
triGrams    <- findNGrams(myCorpusDF$text, 3, 100)
quadriGrams <- findNGrams(myCorpusDF$text, 4, 100)

Plotting the n-grams

# Number of n-grams to show in each graph
n <- 20
# Plot the top n of each n-gram size
ggplot(monoGrams[1:n,], aes(Words, Count))   + geom_bar(stat = "identity",fill='blue') + 
        coord_flip()

ggplot(biGrams[1:n,], aes(Words, Count))     + geom_bar(stat = "identity",fill='blue') + 
        coord_flip()

ggplot(triGrams[1:n,], aes(Words, Count))    + geom_bar(stat = "identity",fill='blue') + 
        coord_flip()

ggplot(quadriGrams[1:n,], aes(Words, Count)) + geom_bar(stat = "identity", fill='blue') + 
        coord_flip()
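
Note that aes(Words, Count) orders the bars alphabetically. If a frequency-sorted chart is preferred, reorder() on the x aesthetic does it; a variant sketch for the bigrams:

# Sort the bars by frequency instead of alphabetically
ggplot(biGrams[1:n,], aes(reorder(Words, Count), Count)) +
        geom_bar(stat = "identity", fill = 'blue') +
        coord_flip() +
        xlab("Words")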