This milestone report covers the collection and manipulation of the data, including an exploratory analysis of word counts. A sample of the data is converted into a corpus, and examples of keyword and contextual phrase searches are included.
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(stringi)
library(tm)
library(slam)
library(ggplot2)
library(quanteda)
library(readtext)
The raw data was downloaded and extracted.
# Check for zip file and download if necessary
if (!file.exists("C:/Users/Gary Clarke/Downloads/Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "C:/Users/Gary Clarke/Downloads/Coursera-SwiftKey.zip")
}
# Check for the extracted data directory and unzip if necessary
# (list = TRUE would only list the archive contents, so it is omitted)
if (!file.exists("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US")) {
  unzip("C:/Users/Gary Clarke/Downloads/Coursera-SwiftKey.zip",
        exdir = "C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey")
}
conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/en_US.blogs.txt")
blogs <- readLines(conn, encoding = "UTF-8")
close(conn)
conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/en_US.news.txt")
news <- readLines(conn, encoding = "UTF-8")
close(conn)
conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/en_US.twitter.txt")
tweets <- readLines(conn, encoding = "UTF-8")
close(conn)
rm(conn)
The number of words per line was computed for each source and used to graph the results. All three distributions are skewed to the right, suggesting that short sentences and messages dominate.
# Words per line for each source
WPL <- lapply(list(blogs, news, tweets), stri_count_words)
# Compute summary statistics for each data type
rawStats <- data.frame(
  File = c("blogs", "news", "twitter"),
  t(rbind(sapply(list(blogs, news, tweets), stri_stats_general),
          TotalWords = sapply(list(blogs, news, tweets), stri_stats_latex)[4, ])),
  # Words-per-line summary
  WPL_computed = rbind(summary(WPL[[1]]), summary(WPL[[2]]), summary(WPL[[3]]))
)
print(rawStats)
## File Lines LinesNEmpty Chars CharsNWhite TotalWords
## 1 blogs 899288 899288 206824382 170389539 37570839
## 2 news 77259 77259 15639408 13072698 2651432
## 3 twitter 2360148 2360148 162096031 134082634 30451128
## WPL_computed.Min. WPL_computed.1st.Qu. WPL_computed.Median WPL_computed.Mean
## 1 0 9 28 41.75107
## 2 1 19 32 34.61779
## 3 1 7 12 12.75063
## WPL_computed.3rd.Qu. WPL_computed.Max.
## 1 60 6726
## 2 46 1123
## 3 18 47
qplot(WPL[[1]],geom="histogram",main="Histogram for US Blogs",
xlab="Number of Words",ylab="Frequency",binwidth=10)
qplot(WPL[[2]],geom="histogram",main="Histogram for US News",
xlab="Number of Words",ylab="Frequency",binwidth=10)
qplot(WPL[[3]],geom="histogram",main="Histogram for US Tweets",
xlab="Number of Words",ylab="Frequency",binwidth=1)
rm(WPL);rm(rawStats)
## Sampling raw data
samplesize <- 35000  # sample size per source
set.seed(3206)       # ensure reproducibility
# Collect the raw data sets and create a list to hold the samples
data <- list(blogs, news, tweets)
sample <- list()
# Take a random sample of lines from each raw data set and clean it
for (i in 1:length(data)) {
  # Create the sample data set
  Filter <- sample(1:length(data[[i]]), samplesize, replace = FALSE)
  sample[[i]] <- data[[i]][Filter]
  # Remove unusual (non-ASCII) characters; iconv is vectorised over the lines
  sample[[i]] <- iconv(sample[[i]], "latin1", "ASCII", sub = "")
}
rm(blogs)
rm(news)
rm(tweets)
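The three cleaned samples were combined and saved to a single text file so they can be read back in for the next step. That save is not shown in the chunk above, so the line below is only a minimal sketch, assuming the same sample.txt path that is read later:
# Sketch only: write the combined sample to the path read in the next chunk
writeLines(unlist(sample), "C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/sample.txt")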
The corpus was created and then converted into a quanteda corpus so that quanteda's functionality could be used to work with the data. The data was tokenised using the tokens() function; tokens are the building blocks of natural language processing and are used to prepare a vocabulary.
conn <- file("C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/sample.txt")
text <- readLines(conn, encoding = "UTF-8")
## Warning in readLines(conn, encoding = "UTF-8"): incomplete final line found
## on 'C:/Users/Gary Clarke/Desktop/Coursera/data/Coursera-Swiftkey/final/en_US/
## sample.txt'
close(conn)
docs <- Corpus(VectorSource(text))
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
docs <- tm_map(docs, stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
corp_quanteda <- corpus(docs)
token_docs <- tokens(corp_quanteda, remove_punct = TRUE)
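As a quick, optional check of the vocabulary the tokens provide (not part of the original analysis), quanteda's ntoken(), dfm() and nfeat() can summarise the tokenised sample:
# Sketch only: overall token count and vocabulary size of the tokenised sample
sum(ntoken(token_docs))   # total tokens across all sampled lines
nfeat(dfm(token_docs))    # number of distinct word types (vocabulary size)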
Examples are shown below of searching for a single keyword and for phrases in the data.
## Keyword in context
kw_politics <- kwic(token_docs, pattern = "oil")
head(kw_politics, 10)
##
## [text220, 37] reduc it depend on foreign | oil |
## [text388, 49] accent with orang and oliv | oil |
## [text630, 12] pursu an all-of-the-abov strategy Coal | Oil |
## [text1248, 46] break to the Big Five | oil |
## [text1341, 58] and onion cook in oliv | oil |
## [text1571, 1] | Oil |
## [text1667, 11] medium-high heat about inch of | oil |
## [text1789, 31] the world economi by caus | oil |
## [text1879, 13] in March show that crude | oil |
## [text2157, 29] drill into rich vein of | oil |
##
##
## gelato and a molten chocol
## Natur gas Solar Wind And
## compani that report record profit
## Another winner and the cutlet
## price fell cent to$
## to F
## price to skyrocket It also
## product on public land onshor
## under the Pennsylvania countryside
## Phrase search
kw_phrase <- kwic(token_docs, pattern = phrase(c("as often as not*", "Oil field*")))
head(kw_phrase, 20)
##
## [text36006, 8:9] fracking are move from the | oil field |
## [text36315, 21:22] been the privat of the | oil fields |
## [text44722, 37:38] on bomb run target Hitler | oil field |
##
## to the board room
## If the govern had given
## in Romania accord to U.S
The plan going forward is to use quanteda to create n-grams, summarise the frequency and use of the words (which are now tokens), and build a predictive model. This model will then be used in a Shiny app to recommend the next word based on the word the user inputs.
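To illustrate that plan, the sketch below shows one way the existing token_docs object could be turned into bigram frequency counts with quanteda and queried for a naive next-word suggestion. It is only an outline, not the final model, and the suggest_next() helper is hypothetical rather than part of this report.
# Sketch only: bigram counts from the existing tokens, plus a naive lookup
dfm_bigrams <- dfm(tokens_ngrams(token_docs, n = 2))  # bigrams joined with "_"
bigram_freq <- colSums(dfm_bigrams)                   # named frequency vector
# Hypothetical helper: most frequent bigram beginning with the user's word
suggest_next <- function(word, freq = bigram_freq) {
  hits <- freq[startsWith(names(freq), paste0(tolower(word), "_"))]
  if (length(hits) == 0) return(NA_character_)
  sub("^[^_]+_", "", names(hits)[which.max(hits)])
}
suggest_next("crude")  # e.g. could return "oil" for this sample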