Capstone Project - Data Exploration - Understand frequencies of words and word pairs

This document highlights the initial exploration steps in dealing with the SwiftKey dataset.

Initial steps, loading libraries

# init
# Remove every object from the current workspace, then run the garbage
# collector; gc() also prints a memory-usage summary.
# NOTE(review): rm(list = ls()) also wipes anything the caller already had in
# the session -- confirm this script is only ever run on its own.
rm(list = ls())
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 379299 10.2     592000 15.9   460000 12.3
## Vcells 383730  3.0     786432  6.0   677413  5.2

# libs
# Attach each required package, installing it first when it is missing.
# require() returns FALSE instead of failing when a package is absent, and
# install.packages() only installs -- it does not attach -- so library() is
# called after a fresh install; library() stops with an error if the package
# still cannot be loaded.
for (pkg in c("ggplot2", "tm", "stringi", "wordcloud", "RColorBrewer")) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}
# disabled packages, kept for reference
#if (!require("ngram")){ install.packages("ngram") }
#if (!require("RWeka")){ install.packages("RWeka") }
#if (!require("Rgraphviz")){ install.packages("Rgraphviz") }

Download the large data file and unzip it, only if needed.

# download file, if not present already
zip_path <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_path)) {
  src_zip_file <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  # request binary transfer explicitly: the archive is not a text file
  download.file(src_zip_file, destfile = zip_path, mode = "wb")
}

# extract whenever any file of the archive is missing on disk, not only right
# after a download: the archive may already be present while the extracted
# files were never created or have been deleted
archive_members <- unzip(zip_path, list = TRUE)$Name
# entries ending in "/" are directories, only real files are checked
archive_members <- archive_members[!grepl("/$", archive_members)]
if (!all(file.exists(archive_members))) {
  unzip(zip_path)
}

# look at the source files
# list the archive members without extracting them
# (TRUE spelled out: the shorthand T is an ordinary, reassignable variable)
zip_files <- unzip("Coursera-SwiftKey.zip", list = TRUE)
zip_files$Date <- NULL
# characters 7-8 of the member path are taken as the language code,
# e.g. "de" in "final/de_DE/de_DE.twitter.txt"
zip_files$Language <- substr(zip_files$Name, 7, 8)
# size in bytes converted to Mb
zip_files$Length_in_Mb <- zip_files$Length / (1024^2)
# keep only members that have content (zero-length entries are dropped)
zip_files <- zip_files[zip_files$Length > 0, ]

Unzipped file sizes:

# horizontal bar chart: one bar per archive member, filled by language
size_plot <- ggplot(zip_files, aes(x = Name, y = Length_in_Mb, fill = Language))
size_plot +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_light() +
  labs(x = "", y = "File size in Mb")

Let’s load the Twitter files in memory

# only load twitter
zip_files <- zip_files[grep("twitter", zip_files$Name), ]

# this section reads all the files in memory
# using dynamical variable names
# only run this part if you have enough RAM
# (the loop variable is not called `file`, so it no longer masks file())
for (src_path in zip_files$Name) {
  con <- file(src_path, "r")
  # close the connection even when readLines() fails, so it cannot leak
  file_content <- tryCatch(
    readLines(con, encoding = "UTF-8"),
    finally = close(con)
  )
  print(paste("File:", src_path, "has in-memory size of:"))
  print(object.size(file_content), units = "Mb")

  # dynamically create variable names: the object is named after the file,
  # e.g. `en_US.twitter.txt`
  assign(basename(src_path), file_content)
  file_content <- NULL
}

## [1] "File: final/de_DE/de_DE.twitter.txt has in-memory size of:"
## 112.8 Mb
## [1] "File: final/ru_RU/ru_RU.twitter.txt has in-memory size of:"
## 130.9 Mb
## [1] "File: final/en_US/en_US.twitter.txt has in-memory size of:"
## 257.2 Mb
## [1] "File: final/fi_FI/fi_FI.twitter.txt has in-memory size of:"
## 36.4 Mb

Number of characters per line in each file, a pattern emerges:

# 2 x 2 plotting grid; par() returns the previous settings so they can be
# restored once the four histograms are drawn
old_par <- par(mfrow = c(2, 2))
hist(nchar(fi_FI.twitter.txt), breaks = 30, col = rainbow(50))
hist(nchar(de_DE.twitter.txt), breaks = 30, col = rainbow(50))
hist(nchar(en_US.twitter.txt), breaks = 30, col = rainbow(50))
hist(nchar(ru_RU.twitter.txt), breaks = 30, col = rainbow(50))

# back to the previous graphics settings
par(old_par)

Number of words per tweet. Finnish tweets contain the most words per tweet, followed by German ones. US Twitter users are lazy, almost as lazy as the Russians.

# 2 x 2 plotting grid, one histogram of words per tweet for each language
old_par <- par(mfrow = c(2, 2))
hist(stri_count_words(fi_FI.twitter.txt), breaks = 30, col = rainbow(50),
     main = "Finnish")
hist(stri_count_words(de_DE.twitter.txt), breaks = 30, col = rainbow(50),
     main = "German")
hist(stri_count_words(en_US.twitter.txt), breaks = 30, col = rainbow(50),
     main = "USA")
hist(stri_count_words(ru_RU.twitter.txt), breaks = 30, col = rainbow(50),
     main = "Russia")
# shared caption written into the outer margin of the whole grid
mtext("Number of words per Tweet for each dataset",
      side = 3, line = -21, outer = TRUE)

# back to the previous graphics settings
par(old_par)

Prepare a sample dataset and check whether the distribution of the EN sample is similar:

# sampling 1000 tweets
# fixed seed, so the same sample is drawn on every run
set.seed(1984^2)
df <- sample(en_US.twitter.txt, size = 1000)

# cleanup memory
# report the in-memory size of the full en_US dataset
print(object.size(en_US.twitter.txt), units = "Mb")

## 257.2 Mb

# drop every object whose name contains "twitter" (the full datasets)
loaded_sets <- ls(pattern = "twitter")
rm(list = loaded_sets)

# does the sampled histogram match the main one?
hist(stri_count_words(df), breaks = 30, col = rainbow(50),
     main = "en_US words per tweet")

Explore sampled dataset using the tm library

# text mining on sampled data
# every element of the sampled character vector becomes one document
tweet_source <- VectorSource(df)
corp <- VCorpus(tweet_source)
# writeCorpus(corp) # writes one file per document (1000 here) on disk
# summary of the first two documents
inspect(corp[1:2])

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 2
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 134
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 64

# raw text of the first document in the corpus
as.character(corp[[1]])

## [1] "Just got done doing #Homework....now im wide awake needing somebody to text #SN: ytf do I like doing #Homework lol it gets me going!!!"

# raw text of the first three documents, as a list named by document id
lapply(corp[seq_len(3)], as.character)

## $`1`
## [1] "Just got done doing #Homework....now im wide awake needing somebody to text #SN: ytf do I like doing #Homework lol it gets me going!!!"
## 
## $`2`
## [1] "M , I think today's just not that day.. I had too much time ...."
## 
## $`3`
## [1] "We got this"

# start the tm transformations with tm_map
# NOTE(review): lowercasing is disabled below, and the recorded output after
# stop-word removal still contains capitalised words such as "I" and "We" --
# presumably the stop-word list only matches lowercase; confirm whether
# skipping the lowercase step is intended.

# strip extra whitespace
corp <- tm_map(corp, FUN = stripWhitespace)

# convert to lowercase
#corp <- tm_map(corp, tolower)
#corp <- tm_map(corp, content_transformer(tolower))

# Remove punctuation
corp <- tm_map(corp, FUN = removePunctuation)

# remove stopwords
# size of the English stop-word list
length(stopwords("en"))

## [1] 174

# drop every word that appears in the English stop-word list
en_stopwords <- stopwords("en")
corp <- tm_map(corp, removeWords, en_stopwords)

# stemming
corp <- tm_map(corp, stemDocument)

# inspect the prepared Corpus
lapply(corp[seq_len(3)], as.character)

## $`1`
## [1] "Just got done  Homeworknow im wide awak need somebodi  text SN ytf  I like  Homework lol  get  go"
## 
## $`2`
## [1] "M  I think today just   day I   much time"
## 
## $`3`
## [1] "We got"

Build a document-term matrix

# Create a Document-Term matrix
dtm <- DocumentTermMatrix(corp)

# look at the first three documents and the first three terms
inspect(dtm[seq_len(3), seq_len(3)])

## <<DocumentTermMatrix (documents: 3, terms: 3)>>
## Non-/sparse entries: 0/9
## Sparsity           : 100%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs ll re dont
##    1   0   0     0
##    2   0   0     0
##    3   0   0     0

# find words occurring at least 10 times
findFreqTerms(dtm, lowfreq = 10)

##   [1] "actual"   "also"     "amaz"     "and"      "awesom"   "back"    
##   [7] "bad"      "best"     "better"   "big"      "book"     "bring"   
##  [13] "but"      "can"      "cant"     "come"     "day"      "dont"    
##  [19] "end"      "enjoy"    "even"     "ever"     "everyon"  "fan"     
##  [25] "feel"     "first"    "follow"   "friend"   "fuck"     "fun"     
##  [31] "game"     "get"      "give"     "gonna"    "good"     "got"     
##  [37] "great"    "guy"      "haha"     "happi"    "hard"     "hate"    
##  [43] "hit"      "home"     "hope"     "ill"      "its"      "ive"     
##  [49] "just"     "keep"     "know"     "last"     "let"      "life"    
##  [55] "like"     "littl"    "live"     "lol"      "look"     "love"    
##  [61] "make"     "man"      "may"      "mean"     "miss"     "morn"    
##  [67] "much"     "need"     "never"    "new"      "next"     "nice"    
##  [73] "night"    "now"      "one"      "parti"    "peopl"    "person"  
##  [79] "play"     "pleas"    "realli"   "right"    "run"      "say"     
##  [85] "see"      "show"     "start"    "sure"     "take"     "talk"    
##  [91] "tell"     "thank"    "that"     "the"      "thing"    "think"   
##  [97] "this"     "thought"  "time"     "today"    "tomorrow" "tonight" 
## [103] "tri"      "tweet"    "twitter"  "use"      "wait"     "want"    
## [109] "watch"    "way"      "week"     "well"     "what"     "will"    
## [115] "win"      "wish"     "work"     "yeah"     "year"     "yes"     
## [121] "you"      "your"

# find associations (i.e., terms which correlate) with the term "facebook",
# keeping those with a correlation of at least 0.3
findAssocs(dtm, terms = "facebook", corlimit = 0.3)

## $facebook
##                  re             chemjobb             company 
##                 0.71                 0.71                 0.71 
##                 gasp youshouldbeembarrass              couldnt 
##                 0.71                 0.71                 0.50 
##                  doe                 main               websit 
##                 0.50                 0.41                 0.35

# remove sparse terms (terms occurring only in very few documents)
# 0.98 is the maximal sparsity allowed for a term to be kept
dtm <- removeSparseTerms(dtm, 0.98)
#head( inspect(dtm) )
#
# dimensions (documents x terms) of the reduced matrix; the matrix is already
# reduced, so there is no need to filter it a second time or to convert it to
# a dense data frame just to read its size
dim(dtm)

## [1] 1000   39

# Create data frame
# dense documents x terms table of term counts
labeledTerms <- as.data.frame(as.matrix(dtm))

# the most frequent terms
# sort() is ascending by default, so the 20 largest totals sit at the tail;
# head() would pick the 20 least frequent terms instead
dotchart(tail(sort(colSums(labeledTerms)), 20))

# build wordcloud
# term names and their total counts over all documents, computed once and
# shared by both term-based clouds
term_names <- colnames(labeledTerms)
term_totals <- colSums(labeledTerms)
wordcloud(term_names, term_totals, scale = c(2, 0.25))

# wordcloud directly out of the original corpus
wordcloud(corp, max.words = 100, random.order = FALSE)

# build wordcloud
# 9-colour "BuGn" palette without its two lightest entries
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:2)]
# (TRUE spelled out: the shorthand T is an ordinary, reassignable variable)
wordcloud(term_names, term_totals,
          scale = c(8, .3), min.freq = 2, max.words = 100,
          random.order = TRUE,
          rot.per = .15, colors = pal, vfont = c("sans serif", "plain"))

# or directly from corpus
wordcloud(corp, max.words = 100, random.order = FALSE,
          scale = c(8, .3), min.freq = 2,
          rot.per = .15, colors = pal, vfont = c("sans serif", "plain"))

build a term document matrix to identify bigrams in the sampled set

# build bigrams from the corpus
# Tokenizer passed to TermDocumentMatrix(): takes the words of document `x`
# and returns every run of 2 consecutive words as one space-separated string
# (an unnamed character vector).
BigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), 2)
  unlist(lapply(word_pairs, paste, collapse = " "), use.names = FALSE)
}

# term-document matrix whose terms are the bigrams produced by BigramTokenizer
bigram_control <- list(tokenize = BigramTokenizer)
tdm <- TermDocumentMatrix(corp, control = bigram_control)

# inspecting the bigrams
# terms 100 to 110 against the first ten documents
inspect(tdm[100:110, 1:10])

## <<TermDocumentMatrix (terms: 11, documents: 10)>>
## Non-/sparse entries: 0/110
## Sparsity           : 100%
## Maximal term length: 8
## Weighting          : term frequency (tf)
## 
##           Docs
## Terms      1 2 3 4 5 6 7 8 9 10
##   2k miami 0 0 0 0 0 0 0 0 0  0
##   2nd time 0 0 0 0 0 0 0 0 0  0
##   2x pick  0 0 0 0 0 0 0 0 0  0
##   3 day    0 0 0 0 0 0 0 0 0  0
##   3 did    0 0 0 0 0 0 0 0 0  0
##   3 dog    0 0 0 0 0 0 0 0 0  0
##   3 final  0 0 0 0 0 0 0 0 0  0
##   3 ill    0 0 0 0 0 0 0 0 0  0
##   3 joint  0 0 0 0 0 0 0 0 0  0
##   3 morn   0 0 0 0 0 0 0 0 0  0
##   3 vega   0 0 0 0 0 0 0 0 0  0

# bigrams occurring at least 5 times
findFreqTerms(tdm, lowfreq = 5)

##  [1] "follow back"  "good morn"    "i cant"       "i didnt"     
##  [5] "i dont"       "i get"        "i hate"       "i just"      
##  [9] "i know"       "i like"       "i love"       "i need"      
## [13] "i think"      "i thought"    "i want"       "i will"      
## [17] "i wish"       "im go"        "im gonna"     "just got"    
## [21] "last night"   "look forward" "right now"    "thank follow"
## [25] "yes i"        "you know"

Build a term-document matrix to identify trigrams in the sampled set

# build trigrams from the corpus
# Tokenizer passed to TermDocumentMatrix(): takes the words of document `x`
# and returns every run of 3 consecutive words as one space-separated string
# (an unnamed character vector).
TrigramTokenizer <- function(x) {
  word_triples <- ngrams(words(x), 3)
  unlist(lapply(word_triples, paste, collapse = " "), use.names = FALSE)
}

# term-document matrix whose terms are the trigrams produced by TrigramTokenizer
trigram_control <- list(tokenize = TrigramTokenizer)
tdm <- TermDocumentMatrix(corp, control = trigram_control)

# inspecting the trigrams
# trigrams occurring at least twice
findFreqTerms(tdm, lowfreq = 2)

##  [1] "back back im"          "back im back"         
##  [3] "follow everyon follow" "happi new year"       
##  [5] "i dont know"           "i need new"           
##  [7] "i wish i"              "im back back"         
##  [9] "just cant stand"       "just got done"        
## [11] "look forward see"      "say right now"        
## [13] "stay tune come"        "tell hacki joke"      
## [15] "thank stay tune"       "today great day"      
## [17] "tomorrow go back"      "yes i will"

Capstone Project - Data Exploration - Understand frequencies of words and word pairs

Calin Uioreanu