This document highlights the initial exploration steps in dealing with the SwiftKey dataset.
Initial steps, loading libraries
# init
# Clear the workspace (hidden, dot-prefixed objects are kept, as with a plain
# ls()), then run the garbage collector, which also prints a memory summary.
rm(list = ls(all.names = FALSE))
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 379299 10.2 592000 15.9 460000 12.3
## Vcells 383730 3.0 786432 6.0 677413 5.2
# libs
# Attach every package the analysis needs, installing it first when missing.
# require() only returns FALSE for an absent package; it does not attach the
# package once install.packages() has finished. Without the explicit library()
# call, a first run on a fresh machine would install the packages and then fail
# at the first ggplot()/VCorpus()/... call.
required_packages <- c("ggplot2", "tm", "stringi", "wordcloud", "RColorBrewer")
for (pkg in required_packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    # library() stops with an error if the installation did not succeed
    library(pkg, character.only = TRUE)
  }
}
#if (!require("ngram")){ install.packages("ngram") }
#if (!require("RWeka")){ install.packages("RWeka") }
#if (!require("Rgraphviz")){ install.packages("Rgraphviz") }
download large file and unzip, only if needed.
# download file, if not present already
zip_path <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_path)) {
  src_zip_file <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  # mode = "wb" forces a binary transfer so the archive is never mangled by
  # text-mode line-ending translation
  download.file(src_zip_file, destfile = zip_path, mode = "wb")
}
# Extract whenever the unpacked "final" directory is missing. Previously the
# archive was only extracted right after a download, so an already-present zip
# without its extracted files made every later file read fail.
if (!dir.exists("final")) {
  unzip(zip_path)
}
# look at the source files
# List the archive's entries (name, length, date) without extracting anything
zip_files <- unzip("Coursera-SwiftKey.zip", list = TRUE)
# the date column is not needed
zip_files[["Date"]] <- NULL
# characters 7-8 of a path such as "final/de_DE/de_DE.twitter.txt" hold the
# two-letter language code
zip_files[["Language"]] <- substr(zip_files[["Name"]], 7, 8)
# length converted to Mb (1 Mb = 1024^2)
zip_files[["Length_in_Mb"]] <- zip_files[["Length"]] / (1024^2)
# keep only the entries with a non-zero length
has_content <- zip_files[["Length"]] > 0
zip_files <- zip_files[has_content, ]
Unzipped file sizes:
# Horizontal bar chart: one bar per file, bar length = size in Mb,
# fill colour = language
ggplot(
  data = zip_files,
  mapping = aes(x = Name, y = Length_in_Mb, fill = Language)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_light() +
  labs(x = "", y = "File size in Mb")
Let’s load the Twitter files in memory
# only load twitter
zip_files <- zip_files[grep("twitter", zip_files$Name), ]
# Read every remaining file completely into memory and bind its lines to a
# variable named after the file (e.g. `en_US.twitter.txt`); later chunks refer
# to the data by these dynamically created names.
# only run this part if you have enough RAM
for (txt_path in zip_files$Name) {
  # Handing the path to readLines() lets it open and close its own connection,
  # so nothing is left open if reading fails. The loop variable is no longer
  # called `file`, which masked base::file().
  file_content <- readLines(txt_path, encoding = "UTF-8")
  print(paste("File:", txt_path, "has in-memory size of:"))
  print(object.size(file_content), units = "Mb")
  # dynamically create variable names
  assign(basename(txt_path), file_content)
  # release the temporary copy before the next file is read
  file_content <- NULL
}
## [1] "File: final/de_DE/de_DE.twitter.txt has in-memory size of:"
## 112.8 Mb
## [1] "File: final/ru_RU/ru_RU.twitter.txt has in-memory size of:"
## 130.9 Mb
## [1] "File: final/en_US/en_US.twitter.txt has in-memory size of:"
## 257.2 Mb
## [1] "File: final/fi_FI/fi_FI.twitter.txt has in-memory size of:"
## 36.4 Mb
Number of characters per line in each file, a pattern emerges:
# Histogram of the number of characters per line for each language,
# arranged on a 2 x 2 grid
saved_par <- par(mfrow = c(2, 2))
for (dataset in c("fi_FI.twitter.txt", "de_DE.twitter.txt",
                  "en_US.twitter.txt", "ru_RU.twitter.txt")) {
  # spell out the title and axis label hist() would derive from the call
  axis_label <- paste0("nchar(", dataset, ")")
  hist(nchar(get(dataset)), breaks = 30, col = rainbow(50),
       main = paste("Histogram of", axis_label), xlab = axis_label)
}
# restore the previous plotting layout
par(saved_par)
Number of words per tweet. Finnish tweets contain the most words per tweet, followed by German ones. US tweeters are lazy, almost as lazy as the Russians.
# Histogram of the number of words per tweet for each language (2 x 2 grid)
saved_par <- par(mfrow = c(2, 2))
panel_titles <- c(
  "fi_FI.twitter.txt" = "Finnish",
  "de_DE.twitter.txt" = "German",
  "en_US.twitter.txt" = "USA",
  "ru_RU.twitter.txt" = "Russia"
)
for (dataset in names(panel_titles)) {
  hist(stri_count_words(get(dataset)), breaks = 30, col = rainbow(50),
       main = panel_titles[[dataset]],
       # same x-axis label hist() would derive from the call
       xlab = paste0("stri_count_words(", dataset, ")"))
}
# shared caption written into the outer margin of the whole figure
mtext("Number of words per Tweet for each dataset", side = 3, line = -21, outer = TRUE)
par(saved_par)
prepare sample dataset, check if EN distribution is similar:
# sampling 1000 tweets
# fixed seed so the same 1000 tweets are drawn on every run
set.seed(1984^2)
# draw 1000 positions without replacement and pick those tweets
sampled_positions <- sample.int(length(en_US.twitter.txt), 1000)
df <- en_US.twitter.txt[sampled_positions]
# cleanup memory
# report the in-memory size of the full English dataset
print(object.size(en_US.twitter.txt), units = "Mb")
## 257.2 Mb
# drop every object whose name contains "twitter" (the full datasets)
twitter_objects <- ls(pattern = "twitter")
rm(list = twitter_objects)
# does the sampled histogram match the main one?
hist(stri_count_words(df), breaks = 30, col = rainbow(50),
     main = "en_US words per tweet")
Explore sampled dataset using the tm library
# text mining on sampled data
# each sampled tweet becomes one document of a volatile (in-memory) corpus
corp <- VCorpus(VectorSource(df))
# writeCorpus(corp) # would write one file per document to disk
# summary of the first two documents
inspect(corp[seq_len(2)])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 134
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 64
# raw text of the first document in the corpus
as.character(corp[[1L]])
## [1] "Just got done doing #Homework....now im wide awake needing somebody to text #SN: ytf do I like doing #Homework lol it gets me going!!!"
# raw text of the first three documents, as a list keyed by document id
lapply(corp[seq_len(3)], as.character)
## $`1`
## [1] "Just got done doing #Homework....now im wide awake needing somebody to text #SN: ytf do I like doing #Homework lol it gets me going!!!"
##
## $`2`
## [1] "M , I think today's just not that day.. I had too much time ...."
##
## $`3`
## [1] "We got this"
# start the tm transformations with tm_map
# collapse runs of whitespace
corp <- tm_map(corp, stripWhitespace)
# convert to lowercase (currently disabled)
# NOTE(review): with lowercasing switched off, capitalised stop words such as
# "I", "We" and "Just" are still present after the stop-word removal further
# down (see the recorded output) - confirm this is intended.
#corp <- tm_map(corp, tolower)
#corp <- tm_map(corp, content_transformer(tolower))
# strip punctuation characters
corp <- tm_map(corp, removePunctuation)
# remove stopwords
# number of entries in tm's English stop-word list
length(stopwords("en"))
## [1] 174
# drop the English stop words from every document
english_stopwords <- stopwords("en")
corp <- tm_map(corp, removeWords, english_stopwords)
# stemming: reduce each word to its stem
corp <- tm_map(corp, stemDocument)
# inspect the prepared Corpus (first three documents)
lapply(corp[seq_len(3)], as.character)
## $`1`
## [1] "Just got done Homeworknow im wide awak need somebodi text SN ytf I like Homework lol get go"
##
## $`2`
## [1] "M I think today just day I much time"
##
## $`3`
## [1] "We got"
build a document term matrix
# Create a Document-Term matrix (documents in rows, terms in columns)
dtm <- DocumentTermMatrix(corp)
# peek at the top-left 3 x 3 corner
corner <- seq_len(3)
inspect(dtm[corner, corner])
## <<DocumentTermMatrix (documents: 3, terms: 3)>>
## Non-/sparse entries: 0/9
## Sparsity : 100%
## Maximal term length: 5
## Weighting : term frequency (tf)
##
## Terms
## Docs ’ll ’re “dont
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
# find words occurring at least 10 times
findFreqTerms(dtm, lowfreq = 10)
## [1] "actual" "also" "amaz" "and" "awesom" "back"
## [7] "bad" "best" "better" "big" "book" "bring"
## [13] "but" "can" "cant" "come" "day" "dont"
## [19] "end" "enjoy" "even" "ever" "everyon" "fan"
## [25] "feel" "first" "follow" "friend" "fuck" "fun"
## [31] "game" "get" "give" "gonna" "good" "got"
## [37] "great" "guy" "haha" "happi" "hard" "hate"
## [43] "hit" "home" "hope" "ill" "its" "ive"
## [49] "just" "keep" "know" "last" "let" "life"
## [55] "like" "littl" "live" "lol" "look" "love"
## [61] "make" "man" "may" "mean" "miss" "morn"
## [67] "much" "need" "never" "new" "next" "nice"
## [73] "night" "now" "one" "parti" "peopl" "person"
## [79] "play" "pleas" "realli" "right" "run" "say"
## [85] "see" "show" "start" "sure" "take" "talk"
## [91] "tell" "thank" "that" "the" "thing" "think"
## [97] "this" "thought" "time" "today" "tomorrow" "tonight"
## [103] "tri" "tweet" "twitter" "use" "wait" "want"
## [109] "watch" "way" "week" "well" "what" "will"
## [115] "win" "wish" "work" "yeah" "year" "yes"
## [121] "you" "your"
# find associations: terms whose correlation with "facebook" is at least 0.3
findAssocs(dtm, terms = "facebook", corlimit = 0.3)
## $facebook
## ’re chemjobb company’
## 0.71 0.71 0.71
## gasp youshouldbeembarrass couldnt
## 0.71 0.71 0.50
## doe main websit
## 0.50 0.41 0.35
# remove sparse terms (terms occurring only in very few documents)
dtm <- removeSparseTerms(dtm, 0.98)
#head( inspect(dtm) )
#
# Dimensions (documents x terms) of the reduced matrix. The matrix was already
# pruned with the same threshold on the line above, so pruning it again and
# expanding it into a dense data frame just to read its size was redundant
# work; dim() on the sparse matrix reports the same two numbers.
dim(dtm)
## [1] 1000 39
# Create data frame: one row per document, one column per remaining term
labeledTerms <- as.data.frame(as.matrix(dtm))
# the most frequent terms
# sort() orders ascending, so the 20 most frequent terms are at the END of the
# sorted totals; head() used to pick the 20 least frequent ones instead.
term_totals <- sort(colSums(labeledTerms))
dotchart(tail(term_totals, 20))
# build wordcloud from the per-term totals
wordcloud(colnames(labeledTerms), colSums(labeledTerms), scale = c(2, 0.25))
# wordcloud directly out of the original corpus
wordcloud(corp, max.words = 100, random.order = FALSE)
# build wordcloud
# 9-step blue-green palette without its two lightest shades
pal <- brewer.pal(9, "BuGn")[-(1:2)]
wordcloud(
  colnames(labeledTerms), colSums(labeledTerms),
  scale = c(8, .3), min.freq = 2, max.words = 100, random.order = TRUE,
  rot.per = .15, colors = pal, vfont = c("sans serif", "plain")
)
# or directly from corpus
wordcloud(
  corp,
  max.words = 100, random.order = FALSE,
  scale = c(8, .3), min.freq = 2,
  rot.per = .15, colors = pal, vfont = c("sans serif", "plain")
)
build a term document matrix to identify bigrams in the sampled set
# build bigrams from the corpus
# Tokenizer handed to TermDocumentMatrix(): takes whatever `words(x)` yields,
# forms the consecutive 2-grams with `ngrams()` and pastes each one together
# with a single space. Returns an unnamed character vector.
BigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), 2)
  joined_pairs <- lapply(word_pairs, paste, collapse = " ")
  unlist(joined_pairs, use.names = FALSE)
}
# term-document matrix whose "terms" are the bigrams produced by the tokenizer
bigram_control <- list(tokenize = BigramTokenizer)
tdm <- TermDocumentMatrix(corp, control = bigram_control)
# inspecting the bigrams: terms 100-110 across the first ten documents
inspect(tdm[100:110, seq_len(10)])
## <<TermDocumentMatrix (terms: 11, documents: 10)>>
## Non-/sparse entries: 0/110
## Sparsity : 100%
## Maximal term length: 8
## Weighting : term frequency (tf)
##
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10
## 2k miami 0 0 0 0 0 0 0 0 0 0
## 2nd time 0 0 0 0 0 0 0 0 0 0
## 2x pick 0 0 0 0 0 0 0 0 0 0
## 3 day 0 0 0 0 0 0 0 0 0 0
## 3 did 0 0 0 0 0 0 0 0 0 0
## 3 dog 0 0 0 0 0 0 0 0 0 0
## 3 final 0 0 0 0 0 0 0 0 0 0
## 3 ill 0 0 0 0 0 0 0 0 0 0
## 3 joint 0 0 0 0 0 0 0 0 0 0
## 3 morn 0 0 0 0 0 0 0 0 0 0
## 3 vega 0 0 0 0 0 0 0 0 0 0
# bigrams that occur at least 5 times
findFreqTerms(tdm, lowfreq = 5)
## [1] "follow back" "good morn" "i cant" "i didnt"
## [5] "i dont" "i get" "i hate" "i just"
## [9] "i know" "i like" "i love" "i need"
## [13] "i think" "i thought" "i want" "i will"
## [17] "i wish" "im go" "im gonna" "just got"
## [21] "last night" "look forward" "right now" "thank follow"
## [25] "yes i" "you know"
build a term document matrix to identify trigrams in the sampled set
# build trigrams from the corpus
# Tokenizer handed to TermDocumentMatrix(): takes whatever `words(x)` yields,
# forms the consecutive 3-grams with `ngrams()` and pastes each one together
# with a single space. Returns an unnamed character vector.
TrigramTokenizer <- function(x) {
  word_triples <- ngrams(words(x), 3)
  joined_triples <- lapply(word_triples, paste, collapse = " ")
  unlist(joined_triples, use.names = FALSE)
}
# term-document matrix whose "terms" are the trigrams produced by the tokenizer
trigram_control <- list(tokenize = TrigramTokenizer)
tdm <- TermDocumentMatrix(corp, control = trigram_control)
# inspecting the trigrams: those that occur at least twice
findFreqTerms(tdm, lowfreq = 2)
## [1] "back back im" "back im back"
## [3] "follow everyon follow" "happi new year"
## [5] "i dont know" "i need new"
## [7] "i wish i" "im back back"
## [9] "just cant stand" "just got done"
## [11] "look forward see" "say right now"
## [13] "stay tune come" "tell hacki joke"
## [15] "thank stay tune" "today great day"
## [17] "tomorrow go back" "yes i will"