Task 0: Understanding the Problem
Natural language processing (NLP) is a field of computer science, artificial intelligence, and linguistics concerned with the interactions between computers and human (natural) languages. As such, NLP is related to the area of human–computer interaction. The goal of the Data Science Specialization Capstone Project is to create a predictive text algorithm in R: given a user's text input, the algorithm predicts which word is most likely to be entered next.
Task 1: Data Acquisition and Cleaning
Make sure that you set the path for your working directory to the location where your files are stored. We want to load the files inside the “en_US” folder.
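For example (substitute the path where you unzipped the data on your machine):
setwd("/Users/meganwilliams/Desktop/DataScienceCapstone/final2")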
Download Data and Import Datasets
load(file ="/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.rdata")
load(file ="/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.rdata")
load(file ="/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.rdata")
NOTE: To save time, I preloaded my datasets. If I were instead pulling the data from the source file, the download and extraction would look like this:
# Specify the source and destination of the download
#destination_file <- "Coursera-SwiftKey.zip"
#source_file <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Execute the download
#download.file(source_file, destination_file)
# Extract the files from the zip file
#unzip(destination_file)
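A small guard (a sketch reusing the variable names above) would skip the download when the zip is already present:
if (!file.exists(destination_file)) {
  download.file(source_file, destination_file)
  unzip(destination_file)
}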
library(stringi)
# inspect the data
list.files("/Users/meganwilliams/Desktop/DataScienceCapstone/final2")
## [1] "badword.csv" "corpus.rda" "de_DE"
## [4] "en_US" "en.txt" "fi_FI"
## [7] "one.gram.20.Rdata" "ru_RU" "three.gram.20.Rdata"
## [10] "two.gram.20.Rdata"
list.files("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US")
## [1] "en_US.blogs.csv" "en_US.blogs.rdata" "en_US.blogs.txt"
## [4] "en_US.news.csv" "en_US.news.rdata" "en_US.news.txt"
## [7] "en_US.twitter.csv" "en_US.twitter.rdata" "en_US.twitter.txt"
# import the news, twitter, and blogs datasets in text mode
news <- readLines("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.txt", encoding = "UTF-8")
twitter <- readLines("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt", encoding = "UTF-8")
blogs <- readLines("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.txt", encoding = "UTF-8")
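If readLines complains about embedded nul characters (the Twitter file is known to trigger this warning), adding skipNul = TRUE to the call drops them, e.g.:
twitter <- readLines("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)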
Summary Statistics for the Twitter, Blogs, and News Datasets
1. Twitter
#Number of lines in the file
system("wc -l /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt")
#Word count
system("wc -w /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt",intern=TRUE)
## [1] " 30374206 /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt"
#Character count
system("wc -c /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt",intern=TRUE)
## [1] " 167105338 /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt"
#Preview of data layout (first five lines)
readLines("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.twitter.txt", n = 5)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
## [4] "So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)"
## [5] "Words from a complete stranger! Made my birthday even better :)"
2. Blogs
#Number of lines in the file
system("wc -l /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.txt")
#Word Count
system("wc -w /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.txt",intern=TRUE)
## [1] " 37334690 /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.txt"
#Character count
system("wc -c /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.txt",intern=TRUE)
## [1] " 210160014 /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.txt"
#Preview of data layout (first five lines)
readLines(file("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.blogs.txt","r"), 5)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
## [4] "so anyways, i am going to share some home decor inspiration that i have been storing in my folder on the puter. i have all these amazing images stored away ready to come to life when we get our home."
## [5] "With graduation season right around the corner, Nancy has whipped up a fun set to help you out with not only your graduation cards and gifts, but any occasion that brings on a change in one's life. I stamped the images in Memento Tuxedo Black and cut them out with circle Nestabilities. I embossed the kraft and red cardstock with TE's new Stars Impressions Plate, which is double sided and gives you 2 fantastic patterns. You can see how to use the Impressions Plates in this tutorial Taylor created. Just one pass through your die cut machine using the Embossing Pad Kit is all you need to do - super easy!"
3. News
#Number of lines in the file
system("wc -l /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.txt")
#Word Count
system("wc -w /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.txt",intern=TRUE)
## [1] " 34372720 /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.txt"
#Character count
system("wc -c /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.txt",intern=TRUE)
## [1] " 205811889 /Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.txt"
#Preview of data layout (first five lines)
readLines(file("/Users/meganwilliams/Desktop/DataScienceCapstone/final2/en_US/en_US.news.txt","r"), 5)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
## [4] "The Alaimo Group of Mount Holly was up for a contract last fall to evaluate and suggest improvements to Trenton Water Works. But campaign finance records released this week show the two employees donated a total of $4,500 to the political action committee (PAC) Partners for Progress in early June. Partners for Progress reported it gave more than $10,000 in both direct and in-kind contributions to Mayor Tony Mack in the two weeks leading up to his victory in the mayoral runoff election June 15."
## [5] "And when it's often difficult to predict a law's impact, legislators should think twice before carrying any bill. Is it absolutely necessary? Is it an issue serious enough to merit their attention? Will it definitely not make the situation worse?"
Summary Table
##          Lines     Words     Characters
## Twitter  2360148   30374206  167105338
## Blogs    899288    37334690  210160014
## News     1010242   34372720  205811889
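The counts above were gathered with wc; a pure-R sketch builds the same table (the word and character totals will differ slightly, since wc tokenizes on whitespace and counts bytes, newlines included):
library(stringi)
sources <- list(Twitter = twitter, Blogs = blogs, News = news)
data.frame(Lines      = sapply(sources, length),
           Words      = sapply(sources, function(x) sum(stri_count_words(x))),
           Characters = sapply(sources, function(x) sum(stri_length(x))))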
Sampling
Because the full datasets are so large, sample just 10% of each file.
set.seed(48)
news.s <- sample(news, round(length(news) * 0.10), replace = FALSE)
twitter.s <- sample(twitter, round(length(twitter) * 0.10), replace = FALSE)
blogs.s <- sample(blogs, round(length(blogs) * 0.10), replace = FALSE)
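An equivalent approach flips a biased coin for each line instead of drawing a fixed-size sample; a hypothetical helper:
# keep each line independently with probability p
sample_lines <- function(x, p = 0.10) x[rbinom(length(x), 1, p) == 1]
# news.s <- sample_lines(news)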
hist(stri_length(news.s), breaks = 100, main = "Characters per Line (News)", xlab = "Number of Characters", ylab = "Frequency")
hist(stri_length(twitter.s), breaks = 100, main = "Characters per Line (Twitter)", xlab = "Number of Characters", ylab = "Frequency")
hist(stri_length(blogs.s), breaks = 100, main = "Characters per Line (Blogs)", xlab = "Number of Characters", ylab = "Frequency")
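The plots above show characters per line, since stri_length measures string length. For a words-per-line view, stri_count_words gives the count directly, e.g.:
hist(stri_count_words(news.s), breaks = 100, main = "Words per Line (News)", xlab = "Number of Words", ylab = "Frequency")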
Data Cleaning
Combine the three samples into a single corpus, then remove numbers, special characters (like those pesky hashtags), punctuation, extra white space, and English stop words.
library(tm)
library(SnowballC)
#tbn <- c(news.s, twitter.s, blogs.s)
#corpus <- Corpus(VectorSource(tbn))
# split decimals such as "3.14" so removeNumbers catches both parts
#remove.decimals <- function(x) {gsub("([0-9]*)\\.([0-9]+)", "\\1 \\2", x)}
#remove.hashtags <- function(x) {gsub("#[a-zA-Z0-9]+", " ", x)}
# strips non-word characters (despite its name, this is not a language filter)
#remove.nonenglish <- function(x) {gsub("\\W+", " ", x)}
#corpus <- tm_map(corpus, remove.decimals)
#corpus <- tm_map(corpus, removeNumbers)
#corpus <- tm_map(corpus, remove.hashtags)  # run before remove.nonenglish strips the '#'
#corpus <- tm_map(corpus, remove.nonenglish)
#corpus <- tm_map(corpus, stripWhitespace)
#corpus <- tm_map(corpus, removePunctuation)
#corpus <- tm_map(corpus, tolower)
#corpus <- tm_map(corpus, removeWords, stopwords("english"))
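The chunk above is commented out because the cleaned corpus is loaded from disk in Task 2. For reference, here is a self-contained sketch of the same pipeline on a toy vector, written for current versions of tm (which require plain functions to be wrapped in content_transformer):
library(tm)
toy <- VCorpus(VectorSource(c("Check out #rstats, pi is 3.14!", "   SO   many    spaces   ")))
toy <- tm_map(toy, content_transformer(function(x) gsub("#[a-zA-Z0-9]+", " ", x)))  # drop hashtags
toy <- tm_map(toy, removeNumbers)
toy <- tm_map(toy, removePunctuation)
toy <- tm_map(toy, content_transformer(tolower))
toy <- tm_map(toy, removeWords, stopwords("english"))
toy <- tm_map(toy, stripWhitespace)
inspect(toy)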
Remove Profane and Offensive Words
#badwords <- read.csv("/Users/meganwilliams/Desktop/DataScienceCapstone/en.csv", header = F)
#badwords <- as.character(badwords$V1)  # removeWords expects a character vector
#corpus <- tm_map(corpus, removeWords,badwords)
Task 2: Exploratory Data Analysis
# load the cleaned corpus saved from Task 1
require(mgcv)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.7-26. For overview type 'help("mgcv-package")'.
load(file = "/Users/meganwilliams/Desktop/DataScienceCapstone/final2/corpus.rda")
Tokenization
One-Gram Tokenization
options( java.parameters = "-Xmx4g" )
library(rJava)
library(RWeka)
##
## Attaching package: 'RWeka'
##
## The following objects are masked from 'package:foreign':
##
## read.arff, write.arff
options(mc.cores = 1)
one.gram_Tokenizer <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1))
one.gram <- data.frame(table(one.gram_Tokenizer))
one.gram.sort <- one.gram[order(one.gram$Freq,decreasing = TRUE),]
one.gram.20 <- one.gram.sort[1:20,]
colnames(one.gram.20) <-c("Word","Frequency")
Bi-Gram Tokenization
options( java.parameters = "-Xmx4g" )
library(rJava)
library(RWeka)
options(mc.cores = 2)
two.gram_Tokenizer <- NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
two.gram <- data.frame(table(two.gram_Tokenizer))
two.gram.sort <- two.gram[order(two.gram$Freq,decreasing = TRUE),]
two.gram.20 <- two.gram.sort[1:20,]
colnames(two.gram.20) <-c("Word","Frequency")
Tri-Gram Tokenization
options( java.parameters = "-Xmx4g" )
library(rJava)
library(RWeka)
options(mc.cores = 3)
three.gram_Tokenizer <- NGramTokenizer(corpus, Weka_control(min = 3, max = 3))
three.gram <- data.frame(table(three.gram_Tokenizer))
three.gram.sort <- three.gram[order(three.gram$Freq,decreasing = TRUE),]
three.gram.20 <- three.gram.sort[1:20,]
colnames(three.gram.20) <-c("Word","Frequency")
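The three chunks above differ only in the n-gram length; a small helper function (a sketch using the same RWeka calls) would factor out the repetition:
top_ngrams <- function(text, n, k = 20) {
  tok <- NGramTokenizer(text, Weka_control(min = n, max = n))
  tab <- data.frame(table(tok))
  colnames(tab) <- c("Word", "Frequency")
  head(tab[order(tab$Frequency, decreasing = TRUE), ], k)
}
# e.g. one.gram.20 <- top_ngrams(corpus, 1)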
Top 20 Single Words
library(ggplot2)
ggplot(one.gram.20, aes(x = reorder(Word, -Frequency), y = Frequency)) + geom_bar(stat = "identity", fill = "blue") + geom_text(aes(label = Frequency), vjust = -0.2) + xlab("Word")
Top 20 Double Words
ggplot(two.gram.20, aes(x = reorder(Word, -Frequency), y = Frequency)) + geom_bar(stat = "identity", fill = "lightblue") + geom_text(aes(label = Frequency), vjust = -0.2) + xlab("Word") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
Top 20 Triple Words
ggplot(three.gram.20, aes(x = reorder(Word, -Frequency), y = Frequency)) + geom_bar(stat = "identity", fill = "green") + geom_text(aes(label = Frequency), vjust = -0.2) + xlab("Word") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
Word Cloud for Single and Double Words
par(mfrow = c(1,2))
library(wordcloud)
## Loading required package: Rcpp
## Loading required package: RColorBrewer
wordcloud(one.gram.sort[,1],freq=one.gram.sort[,2],scale=c(5,1),random.order=F,rot.per=0.5,min.freq=100,colors=brewer.pal(8,"Dark2"))
wordcloud(two.gram.sort[,1],freq=two.gram.sort[,2],scale=c(5,1),random.order=F,rot.per=0.5,min.freq=100,colors=brewer.pal(8,"Dark2"))
Future Directions
Address apostrophe issues (removePunctuation strips apostrophes, so contractions like "don't" become "dont").
Build a predictive model using the techniques I've learned in this course thus far.
Build a Shiny app to showcase the prediction algorithm.
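As a first step toward that model, here is a minimal backoff sketch over n-gram tables shaped like the ones above (columns Word and Frequency, sorted by decreasing frequency; the full .sort tables would need the same column renaming applied to the top-20 versions). It is illustrative only:
predict_next <- function(phrase, bigrams, trigrams) {
  words <- tolower(strsplit(trimws(phrase), "\\s+")[[1]])
  n <- length(words)
  if (n == 0) return(NA_character_)
  if (n >= 2) {
    # try the trigram table first: match "w1 w2 " as a prefix
    hits <- trigrams[startsWith(as.character(trigrams$Word), paste(words[n - 1], words[n], "")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Word[1])))
  }
  # back off to the bigram table
  hits <- bigrams[startsWith(as.character(bigrams$Word), paste(words[n], "")), ]
  if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Word[1])))
  NA_character_  # no match found
}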