EDA on docs

Looking at docs corpus and analysing themes.

#Let's go. Load corpus... 
docs <- VCorpus(DirSource("../raw_data/doc_corpus"))

num.docs <- length(docs)

Write a standard function to clean the documents

clean_docs <- function(docs) {
  
  output = docs
  
  #Remove punctuation - replace punctuation marks with " "
  output = tm_map(output, removePunctuation)
  
  #Transform to lower case
  output = tm_map(output,content_transformer(tolower))
  
  #Strip digits
  output = tm_map(output, removeNumbers)
  
  #Remove stopwords from standard stopword list 
  output = tm_map(output, removeWords, stopwords("english"))
  
  #Create a custom set of stopwords that weren't removed by the standard stopwords list
  myStopwords <- c("can",
                   "also",
                   "get",
                   "see",
                   "may",
                   "much",
                   "now",
                   "said",
                   "will",
                   "way",
                   "well",
                   "howev",
                   "say",
                   "one",
                   "use")
  
  #Remove the custom stopwords
  output = tm_map(output, removeWords, myStopwords)
  
  #Strip whitespace (cosmetic?)
  output = tm_map(output, stripWhitespace)
  
  return(output)
  
}

clean.docs <- clean_docs(docs)

A bit of basic analysis of documents with this function

document_info <- function(documents, clean.documents) {
  
  num.documents = length(clean.documents)
  
  for(i in 1:num.documents) {
    document.meta = clean.documents[[i]]$meta
    document.content = clean.documents[[i]]$content
    raw.document.content = documents[[i]]$content
    
    if (i == 1) {
      document.summary = data.frame(id = document.meta$id, language = document.meta$language, datetimestamp = document.meta$datetimestamp, raw.characters = nchar(raw.document.content), clean.characters = nchar(document.content))
    } else {
      document.temp = data.frame(id = document.meta$id, language = document.meta$language, datetimestamp = document.meta$datetimestamp, raw.characters = nchar(raw.document.content), clean.characters = nchar(document.content))
      document.summary = rbind(document.summary, document.temp)
    }
  }
  
  return(document.summary)
}

#running the function on raw docs and clean docs
document_info(docs, clean.docs)

##           id language       datetimestamp raw.characters clean.characters
## 1  Doc01.txt       en 2018-10-23 05:57:07          15842            11001
## 2  Doc02.txt       en 2018-10-23 05:57:07          12900             9199
## 3  Doc03.txt       en 2018-10-23 05:57:07           5119             3364
## 4  Doc04.txt       en 2018-10-23 05:57:07           2999             1774
## 5  Doc05.txt       en 2018-10-23 05:57:07          10122             7020
## 6  Doc06.txt       en 2018-10-23 05:57:07          11949             8282
## 7  Doc07.txt       en 2018-10-23 05:57:07           5072             3404
## 8  Doc08.txt       en 2018-10-23 05:57:07          19088            13569
## 9  Doc09.txt       en 2018-10-23 05:57:07          13908             9261
## 10 Doc10.txt       en 2018-10-23 05:57:07           7310             4941
## 11 Doc11.txt       en 2018-10-23 05:57:07          26707            17968
## 12 Doc12.txt       en 2018-10-23 05:57:07           9770             6511
## 13 Doc13.txt       en 2018-10-23 05:57:07           6253             3170
## 14 Doc14.txt       en 2018-10-23 05:57:07          12612             7553
## 15 Doc15.txt       en 2018-10-23 05:57:07           8602             5994
## 16 Doc16.txt       en 2018-10-23 05:57:07           9145             6275
## 17 Doc17.txt       en 2018-10-23 05:57:07           9381             6056
## 18 Doc18.txt       en 2018-10-23 05:57:07          13956             9362
## 19 Doc19.txt       en 2018-10-23 05:57:07          34921            22566
## 20 Doc20.txt       en 2018-10-23 05:57:07          25250            16499
## 21 Doc21.txt       en 2018-10-23 05:57:07          20814            13589
## 22 Doc22.txt       en 2018-10-23 05:57:07          19994            13431
## 23 Doc23.txt       en 2018-10-23 05:57:07           9338             6217
## 24 Doc24.txt       en 2018-10-23 05:57:07           7275             5026
## 25 Doc25.txt       en 2018-10-23 05:57:07          13068             8860
## 26 Doc26.txt       en 2018-10-23 05:57:07          13548             9296
## 27 Doc27.txt       en 2018-10-23 05:57:07          10144             6897
## 28 Doc28.txt       en 2018-10-23 05:57:07           5559             3970
## 29 Doc29.txt       en 2018-10-23 05:57:07          20603            14742
## 30 Doc30.txt       en 2018-10-23 05:57:07          17404            12241
## 31 Doc31.txt       en 2018-10-23 05:57:07           7042             4644
## 32 Doc32.txt       en 2018-10-23 05:57:07          11917             8341
## 33 Doc33.txt       en 2018-10-23 05:57:07          12876             8604
## 34 Doc34.txt       en 2018-10-23 05:57:07          13419             8668
## 35 Doc35.txt       en 2018-10-23 05:57:07          27493            16897
## 36 Doc36.txt       en 2018-10-23 05:57:07           6037             3957
## 37 Doc37.txt       en 2018-10-23 05:57:07           6871             4418
## 38 Doc38.txt       en 2018-10-23 05:57:07          10482             6619
## 39 Doc39.txt       en 2018-10-23 05:57:07          16881            11082
## 40 Doc40.txt       en 2018-10-23 05:57:07          18455            11735
## 41 Doc41.txt       en 2018-10-23 05:57:07          14453             9365

Insights on the corpus

clean.docs <- tm_map(clean.docs,stemDocument)
#inspect
writeLines(as.character(clean.docs[[num.docs]]))

## essay uncertainti project task estim describ task estim correspond probabl distribut put simpli task estim actual rang possibl complet time probabl occurr specifi distribut know distribut possibl answer question probabl task complet within x day reliabl predict depend faith distribut captur actual spread task durat – therein lie least coupl problem first probabl distribut task durat general hard characteris lack reliabl data estim good estim histor data usual avail second mani realist distribut complic mathemat form hard characteris manipul problem compound fact project consist sever task durat estim possibl complic distribut first issu usual address fit distribut point estim optimist pessimist like time pert refin estim distribut gain experi second issu tackl mont carlo techniqu involv simul task number time use appropri distribut calcul expect complet time base result aim post present examplebas introduct mont carlo simul project task durat although aim keep thing reason simpl beyond highschool math basic understand probabl ill cover fair bit ground given id best start brief descript approach reader know come mont carlo simul umbrella term cover rang approach random sampl simul event describ known probabl distribut first task specifi probabl distribut howev mention earlier general unknown task durat simplic ill assum task durat uncertainti describ accur use triangular probabl distribut – distribut particular easi handl mathemat point view advantag use triangular distribut simul result valid easili use triangular distribut isnt limit method describ appli arbitrarili shape distribut import techniqu use simul happen multipl task strung togeth project schedul ill cover futur post final ill demonstr mont carlo simul method appli singl task describ triangular distribut although simul overkil case question regard durat answer exact without use simul exampl serv illustr step involv simul complex case – compris task andor involv complic distribut without ado let begin journey describ triangular distribut triangular distribut let assum there project task need person go reckon take hour complet like complet time hour estim come number isnt import stage – mayb there guesswork mayb pad mayb realli base experi what import three number correspond minimum like maximum time keep discuss general call tmin tml tmax respect back estim specif number later probabl associ time sinc tmin tmax correspond minimum maximum time probabl associ zero wasnt zero nonzero probabl complet time less tmin greater tmax – isnt possibl note consequ assumpt probabl vari continu – take nonzero valu p tmin must take valu slight less p – greater – t slight smaller tmin far like time tml concern definit probabl attain highest valu time tml assum probabl describ triangular function distribut must form shown figur figur figur triangular distribut simul need know equat describ distribut although wikipedia tell us answer mouseclick instruct figur first note area triangl must equal task must finish time tmin tmax consequ fractimesbasetimesaltitudefractimestmaxtmintimesptmlldotsldot ptml probabl correspond time tml bit rearrang ptmlfractmaxtminldotsldot deriv probabl time t lie tmax tml note fracttminptfractmltminptmlldotsldot consequ fact ratio either side equat equal slope line join point tmin tml ptml figur figur substitut simplifi bit obtain ptfracttmintmltmintmaxtmindotsldot tminleq t leq tml similar fashion show probabl time lie tml tmax given ptfractmaxttmaxtmltmaxtmindotsldot tmlleq t leq tmax equat togeth describ probabl distribut function pdf time tmin tmax anoth quantiti interest cumul distribut function cdf probabl p task complet time t reiter pdf pt probabl task finish time t wherea cdf pt probabl task complet time t cdf pt essenti sum probabl tmin t t tmin area triangl apex tmin t t pt use formula area triangl base time height equat ptfracttmintmltmintmaxtminldotsldot tminleq t leq tml note t geq tml area curv equal total area minus area enclos triangl base t tmax pt fractmaxttmaxtmltmaxtminldotsldot tmlleq t leq tmax expect pt start valu tmin increas monoton attain valu tmax end section let plug number quot estim start section tmin tml tmax result pdf cdf shown figur figur triangular pdf tmin tml tmax figur triangular cdf tmin tml tmax figur – triangular cdf tmin tml tmax mont carlo simul singl task ok busi end essay – simul ill first outlin simul procedur discuss result case task describ previous section triangular distribut tmin tml tmax note use tk solver – mathemat packag creat univers technic system – simul tk solver builtin backsolv capabl extrem help solv equat come simul calcul excel spreadsheet skill here simul procedur generat random number treat number cumul probabl pt simul run technic note use random number generat come tk solver packag algorithm use generat describ excel random number generat even better find time t correspond pt solv equat t result valu t time taken complet task technic note solv equat t isnt straightforward t appear sever place equat two option solv t numer techniqu bisect newtonraphson method b backsolv goal seek function excel mathemat packag use backsolv capabl tk solver obtain t random valu p generat tk solver backsolv equat automat – fiddl around numer method – make attract option kind calcul repeat step n time n suffici larg number – calcul n use triangular distribut paramet tmin tml tmax gave valu pt t exampl simul run proceed here data first simul run random number generat return call valu pt time correspond cumul probabl obtain solv equat numer t gave t call shown figur complet time first run figur figur complet simul run sort bin correspond time interv hrs start thrs t hrs result histogram shown figur bar correspond number simul run fall within time interv figur distribut simul run figur distribut simul run might expect look like triangular distribut shown figur differ though figur plot probabl continu function time wherea figur plot number simul run step function time convinc two realli let look cumul probabl tml – ie probabl task complet within hrs equat ptml correspond number simul simpli number simul run complet time less equal hrs divid total number simul run simul come agreement perfect convinc enough just sure perform simul number time – generat sever set random number – took averag predict ptml agreement theori simul improv expect wrap limit triangular distribut impos upper cutoff tmax longtail distribut therefor realist end though form distribut neither techniqu appli distribut real question obtain reliabl distribut estim there easi answer start three point estim pert fit triangular complic distribut although best histor data absenc alway start reason guess point refin experi anoth point worth mention simul done level higher indivdu task brilliant book – waltz bear manag risk softwar project – de marco lister demonstr mont carlo method simul various aspect project – veloc time cost etc – project level oppos task level believ better perform simul lowest possibl level although lot work – main reason easier less errorpron estim individu task entir project nevertheless high level simul use reliabl data base remiss didnt mention various mont carlo packag avail market ive never use account theyr pretti good commerci packag exampl product random number generat sampl techniqu far sophist simpl one ive use exampl final admit exampl describ post complic demonstr obvious – start triangular distribut got back triangular distribut via simul point howev illustr method show yield expect result situat answer known futur post ill appli method complex situat exampl multipl task seri parallel depend rule thrown good measur although ill triangular distribut individu task result far obvious simul method realli start shine complex increas wait later hope exampl help illustr mont carlo method use simul project task

#end of preprocessing

#Create document-term matrix
dtm <- DocumentTermMatrix(clean.docs)

#summary
dtm

## <<DocumentTermMatrix (documents: 41, terms: 4492)>>
## Non-/sparse entries: 18623/165549
## Sparsity           : 90%
## Maximal term length: 114
## Weighting          : term frequency (tf)

#4442 terms in the 41 docs
#inspect segment of document term matrix
inspect(dtm[1:10,995:1006])

## <<DocumentTermMatrix (documents: 10, terms: 12)>>
## Non-/sparse entries: 13/107
## Sparsity           : 89%
## Maximal term length: 8
## Weighting          : term frequency (tf)
## Sample             :
##            Terms
## Docs        defocus degre delay deleg delib deliber deliv deliver deliveri
##   Doc01.txt       0     0     0     0     0       0     0       0        0
##   Doc02.txt       1     0     0     0     1       0     0       0        0
##   Doc03.txt       0     0     0     0     0       0     1       0        1
##   Doc04.txt       0     0     0     0     0       0     0       0        0
##   Doc05.txt       0     1     1     0     0       0     0       2        0
##   Doc06.txt       0     0     0     0     0       1     1       0        0
##   Doc07.txt       0     0     0     0     0       0     0       4        1
##   Doc08.txt       0     1     0     0     0       0     0       0        0
##   Doc09.txt       0     0     0     0     0       0     0       0        0
##   Doc10.txt       0     0     0     0     0       0     0       0        0
##            Terms
## Docs        demand
##   Doc01.txt      0
##   Doc02.txt      3
##   Doc03.txt      0
##   Doc04.txt      0
##   Doc05.txt      0
##   Doc06.txt      0
##   Doc07.txt      0
##   Doc08.txt      0
##   Doc09.txt      0
##   Doc10.txt      0

#collapse matrix by summing over columns - this gets total counts (over all docs) for each term
freq <- colSums(as.matrix(dtm))
#length should be total number of terms
length(freq)

## [1] 4492

#create sort order (asc)
ord <- order(freq,decreasing=TRUE)
#inspect most frequently occurring terms
freq[head(ord)]

## project    risk   manag   figur    time     use 
##     579     538     507     335     333     299

#write to disk and inspect file
write.csv(file="../clean_data/freq.csv",freq[ord])

Including Plots

#inspect least frequently occurring terms
freq[tail(ord)]

##        ysqrtsum             yyy           zero…             zip 
##               1               1               1               1 
##            zoom zsqrtxyzsqrtxyz 
##               1               1

#list most frequent terms. Lower bound specified as second argument
findFreqTerms(dtm,lowfreq=80)

##  [1] "algorithm"  "approach"   "argument"   "articl"     "author"    
##  [6] "base"       "best"       "case"       "chang"      "cluster"   
## [11] "complet"    "correl"     "data"       "day"        "decis"     
## [16] "describ"    "develop"    "differ"     "discuss"    "distribut" 
## [21] "doc"        "document"   "estim"      "exampl"     "figur"     
## [26] "first"      "follow"     "function"   "good"       "group"     
## [31] "howev"      "ibi"        "idea"       "ill"        "import"    
## [36] "inform"     "interest"   "issu"       "knowledg"   "like"      
## [41] "look"       "make"       "manag"      "mani"       "map"       
## [46] "mean"       "method"     "model"      "need"       "note"      
## [51] "number"     "often"      "organ"      "organis"    "paper"     
## [56] "point"      "possibl"    "post"       "practic"    "probabl"   
## [61] "problem"    "process"    "project"    "question"   "reason"    
## [66] "result"     "risk"       "set"        "simul"      "system"    
## [71] "take"       "task"       "techniqu"   "term"       "thing"     
## [76] "time"       "topic"      "two"        "understand" "use"       
## [81] "valu"       "word"       "work"

You can also embed plots, for example:

#correlations
findAssocs(dtm,"algorithm",0.9)

## $algorithm
##    out nstart purist rownam 
##   0.97   0.90   0.90   0.90

findAssocs(dtm,"argument",0.75)

## $argument
##        map       issu    respond     applic   question compendium 
##       0.94       0.90       0.80       0.79       0.79       0.78 
##   issuebas        ibi 
##       0.77       0.76

findAssocs(dtm,"cluster",0.9)

## $cluster
##                           agglom                         already… 
##                             1.00                             1.00 
##                            annoy                      asmatrixdtm 
##                             1.00                             1.00 
##                             bolt                           branch 
##                             1.00                             1.00 
##                     britishaussi                          categor 
##                             1.00                             1.00 
##                         centroid                             chug 
##                             1.00                             1.00 
##                         clearest                          closest 
##                             1.00                             1.00 
##                         clusplot                clusplotasmatrixd 
##                             1.00                             1.00 
##               clustersylabwithin                           colort 
##                             1.00                             1.00 
##                         concaten                         configur 
##                             1.00                             1.00 
##    contenttransformergsubpattern                cuserskailashdocu 
##                             1.00                             1.00 
##                              dab                             dabi 
##                             1.00                             1.00 
##                        dendogram                       dendrogram 
##                             1.00                             1.00 
##                          denomin                             dens 
##                             1.00                             1.00 
##                             dist                      distancebas 
##                             1.00                             1.00 
##                      distancerel                      distancewis 
##                             1.00                             1.00 
##                            distm                              dxi 
##                             1.00                             1.00 
##                           educat                            elbow 
##                             1.00                             1.00 
##                        euclidean                            excus 
##                             1.00                             1.00 
##                          flatten                          geometr 
##                             1.00                             1.00 
##                           hclust               hclustdmethodwardd 
##                             1.00                             1.00 
##                             hood                          insofar 
##                             1.00                             1.00 
##                       intraclust                             kfit 
##                             1.00                             1.00 
##                        kfitclust                            kmean 
##                             1.00                             1.00 
##                          kmeansd                     libraryclust 
##                             1.00                             1.00 
##                          longest                           means… 
##                             1.00                             1.00 
##                      miningspeak     ncharrownamesmncharrownamesm 
##                             1.00                             1.00 
##                              nut                             onus 
##                             1.00                             1.00 
##                          optimis                          optimum 
##                             1.00                             1.00 
##  pastesubstringrownamesmrepnrowm                        plotgroup 
##                             1.00                             1.00 
##           postlookrightnowthinkv                     potentiallyy 
##                             1.00                             1.00 
##                            rambl                       recthclust 
##                             1.00                             1.00 
##                  recthclustgroup                           recurs 
##                             1.00                             1.00 
##                           reduct                        rownamesm 
##                             1.00                             1.00 
##                             scan                           shadet 
##                             1.00                             1.00 
##                          sharpli                          shorten 
##                             1.00                             1.00 
##                        showstopp                           silent 
##                             1.00                             1.00 
##                            split                             sqrt 
##                             1.00                             1.00 
##                            sqrtx                        steadfast 
##                             1.00                             1.00 
##                 straightforward–               substringrownamesm 
##                             1.00                             1.00 
##                           subtre sumkmeansdcentersinstartwithinss 
##                             1.00                             1.00 
##                        territori                      toolsinstal 
##                             1.00                             1.00 
##                          truncat                          twodocu 
##                             1.00                             1.00 
##                            typeb                             ward 
##                             1.00                             1.00 
##                        welldefin                      withinclust 
##                             1.00                             1.00 
##                         withinss                              wss 
##                             1.00                             1.00 
##                             wssi                         xlabnumb 
##                             1.00                             1.00 
##                              xxx                              yyy 
##                             1.00                             1.00 
##                          princip                         hierarch 
##                             0.99                             0.98 
##                          distanc                             merg 
##                             0.97                             0.97 
##                          minimis 
##                             0.95

findAssocs(dtm,"project",0.69)

## $project
##   lose dollar    win 
##   0.71   0.70   0.70

findAssocs(dtm,"discuss",0.75)

## $discuss
## werner 
##   0.79

findAssocs(dtm,"model",0.75)

## $model
## overst   test  engin  field 
##   0.86   0.85   0.81   0.76

findAssocs(dtm,"risk",0.7)

## $risk
## numeric(0)

findAssocs(dtm,"question",0.7)

## $question
## argument      map  respond      con     idea     issu     pros issuebas 
##     0.79     0.77     0.77     0.76     0.74     0.73     0.72     0.71

findAssocs(dtm,"document",0.9)

## $document
##      contenttransformerfunctionx                      gsubpattern 
##                             0.97                             0.97 
##                       inenterpri                        librarytm 
##                             0.97                             0.97 
##                 stopwordsenglish                         tmmapdoc 
##                             0.97                             0.97 
## tmmapdocscontenttransformertolow                tmmapdocsstemdocu 
##                             0.97                             0.97 
##           contenttransformergsub                              doc 
##                             0.96                             0.95 
##                           replac         writelinesascharacterdoc 
##                             0.95                             0.95 
##                       removeword                           tospac 
##                             0.94                             0.94 
##                          pattern 
##                             0.91

#histogram
wf = data.frame(term=names(freq), occurrences=freq)

ggplot(subset(wf, occurrences>200), aes(term, occurrences)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1))

#order by frequency
ggplot(subset(wf, occurrences>200), aes(reorder(term,occurrences), occurrences)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1))

#wordcloud
#setting the same seed each time ensures consistent look across clouds
set.seed(42)

#limit words by specifying min frequency
wordcloud(names(freq),freq, max.words=40)

#...add color
wordcloud(names(freq),freq,max.words=40,colors=brewer.pal(6,"Dark2"))

#Making bigrams of words in the corpus

#to see what ngrams does, try running ngrams(words(docs[[1]]$content),2), which
#returns bigrams for the first document in the corpus
BigramTokenizer <-  function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
#create DTM
dtmbi <- DocumentTermMatrix(clean.docs, control = list(tokenize = BigramTokenizer))
freqbi <- colSums(as.matrix(dtmbi))
#length should be total number of terms
length(freqbi)

## [1] 34967

#create sort order (asc)
ordbi <- order(freqbi,decreasing=TRUE)
#inspect most frequently occurring terms
freqbi[head(ordbi)]

## project manag    risk manag  complet time  doc tmmapdoc   figur figur 
##           138           132           105            64            61 
##    mont carlo 
##            61

TF-IDF

dtm_tfidf <- DocumentTermMatrix(clean.docs, control = list(weighting = weightTfIdf))
#note that the weighting is normalised by default (that is, the term frequencies in a
#document are normalised by the number of terms in the document)
#summary
dtm_tfidf

## <<DocumentTermMatrix (documents: 41, terms: 4492)>>
## Non-/sparse entries: 18623/165549
## Sparsity           : 90%
## Maximal term length: 114
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)

#inspect segment of document term matrix
inspect(dtm_tfidf[1:10,1000:1006])

## <<DocumentTermMatrix (documents: 10, terms: 7)>>
## Non-/sparse entries: 8/62
## Sparsity           : 89%
## Maximal term length: 8
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample             :
##            Terms
## Docs           deliber delight       deliv     deliver    deliveri delv
##   Doc01.txt 0.00000000       0 0.000000000 0.000000000 0.000000000    0
##   Doc02.txt 0.00000000       0 0.000000000 0.000000000 0.000000000    0
##   Doc03.txt 0.00000000       0 0.007332425 0.000000000 0.008110029    0
##   Doc04.txt 0.00000000       0 0.000000000 0.000000000 0.000000000    0
##   Doc05.txt 0.00000000       0 0.000000000 0.008712678 0.000000000    0
##   Doc06.txt 0.00343308       0 0.003103910 0.000000000 0.000000000    0
##   Doc07.txt 0.00000000       0 0.000000000 0.036015174 0.008013251    0
##   Doc08.txt 0.00000000       0 0.000000000 0.000000000 0.000000000    0
##   Doc09.txt 0.00000000       0 0.000000000 0.000000000 0.000000000    0
##   Doc10.txt 0.00000000       0 0.000000000 0.000000000 0.000000000    0
##            Terms
## Docs            demand
##   Doc01.txt 0.00000000
##   Doc02.txt 0.01221744
##   Doc03.txt 0.00000000
##   Doc04.txt 0.00000000
##   Doc05.txt 0.00000000
##   Doc06.txt 0.00000000
##   Doc07.txt 0.00000000
##   Doc08.txt 0.00000000
##   Doc09.txt 0.00000000
##   Doc10.txt 0.00000000

#collapse matrix by summing over columns - this gets total weights (over all docs) for each term
wt_tot_tfidf <- colSums(as.matrix(dtm_tfidf))
#length should be total number of terms
length(wt_tot_tfidf )

## [1] 4492

#create sort order (asc)
ord_tfidf <- order(wt_tot_tfidf,decreasing=TRUE)
#inspect most frequently occurring terms
wt_tot_tfidf[head(ord_tfidf)]

##      risk distribut     simul       map       ibi     eleph 
## 0.6829182 0.3715409 0.3447371 0.3124370 0.2917337 0.2783366

#write to disk and inspect file
write.csv(file="../clean_data/wt_tot_tfidf.csv",wt_tot_tfidf[ord_tfidf])
#inspect least weighted terms
wt_tot_tfidf[tail(ord_tfidf)]

##                                             wickham 
##                                         0.001895137 
##                            wordcloudnamesfreqrfreqr 
##                                         0.001895137 
## wordcloudnamesfreqrfreqrminfreqcolorsbrewerpaldark″ 
##                                         0.001895137 
##                                            writelin 
##                                         0.001895137 
##                                              youtub 
##                                         0.001895137 
##                                                 zip 
##                                         0.001895137

#correlations - compare to dtm generated by  tf and tf/truncated weighting
#"project" at correlation level of 0.6
findAssocs(dtm_tfidf,"risk",0.5)

## $risk
##     affect methodolog     threat   unambigu       awar    guidanc 
##       0.60       0.58       0.55       0.55       0.54       0.54 
##     impact      prior       sale      seven     attent     expect 
##       0.54       0.53       0.53       0.53       0.52       0.52 
##      handl       huge    analysi 
##       0.52       0.51       0.50

findAssocs(dtm_tfidf,"distribution",0.8)

## $distribution
## numeric(0)

findAssocs(dtm_tfidf,"eleph",0.8)

## $eleph
##           amok          clout           dead           deni      disappear 
##           1.00           1.00           1.00           1.00           1.00 
##        fantasi          havoc      inconveni          invis         mahout 
##           1.00           1.00           1.00           1.00           1.00 
##         moment      pachyderm       patholog           pray         rampag 
##           1.00           1.00           1.00           1.00           1.00 
## unforeseeable–          wreak      exemplifi        imposit           room 
##           1.00           1.00           0.99           0.99           0.99 
##          appar       metaphor       mistaken         murphi         reluct 
##           0.98           0.98           0.98           0.98           0.98 
##           wake          blind         danger           fast           game 
##           0.98           0.97           0.97           0.97           0.97 
##       foreseen          magic          ignor          assur         situat 
##           0.96           0.96           0.95           0.94           0.94 
##         strang         affair         eventu           wish        fraught 
##           0.94           0.93           0.93           0.93           0.92 
##         friend           hope         werent            wil           hand 
##           0.92           0.92           0.92           0.92           0.90 
##           walk       whatsoev           pass        anticip            bad 
##           0.90           0.90           0.89           0.88           0.88 
##           fear           quit         resolv         occurr          handl 
##           0.87           0.87           0.86           0.84           0.81

findAssocs(dtm_tfidf,"wickham",0.8)

## $wickham
##                                                …add 
##                                                1.00 
##                                             …output 
##                                                1.00 
##                                               addon 
##                                                1.00 
##                                              adject 
##                                                1.00 
##                                             aesterm 
##                                                1.00 
##                                             aesthet 
##                                                1.00 
##                                            alphabet 
##                                                1.00 
##                                               anali 
##                                                1.00 
##                                              anytim 
##                                                1.00 
##                                              appeal 
##                                                1.00 
##                                architectarchitectur 
##                                                1.00 
##                                                 asc 
##                                                1.00 
##                                            asmatrix 
##                                                1.00 
##                                              assort 
##                                                1.00 
##                                             augment 
##                                                1.00 
##                                           backslash 
##                                                1.00 
##                                            backyard 
##                                                1.00 
##                                                 bag 
##                                                1.00 
##                                              banana 
##                                                1.00 
##                                              bigram 
##                                                1.00 
##                                              brewer 
##                                                1.00 
##                                          casesensit 
##                                                1.00 
##                                                chop 
##                                                1.00 
##                                              cleans 
##                                                1.00 
##                                             cleanup 
##                                                1.00 
##                                               colon 
##                                                1.00 
##                                               color 
##                                                1.00 
##                                 colsumsasmatrixdtmr 
##                                                1.00 
##                                      commandsscript 
##                                                1.00 
##                                               compo 
##                                                1.00 
##                                            conjunct 
##                                                1.00 
##                                              consol 
##                                                1.00 
##                                             contigu 
##                                                1.00 
##                             controllistwordlengthsc 
##                                                1.00 
##                                             cooccur 
##                                                1.00 
##                                            cooccurr 
##                                                1.00 
##                                           copynpast 
##                                                1.00 
##                                              cosmet 
##                                                1.00 
##                                              crimin 
##                                                1.00 
##                                              crinkl 
##                                                1.00 
##                                               crude 
##                                                1.00 
##                                           ctrlenter 
##                                                1.00 
##                                                 cur 
##                                                1.00 
##                                               cuser 
##                                                1.00 
##                                          cusersdocu 
##                                                1.00 
##                                            cutnpast 
##                                                1.00 
##                                             defacto 
##                                                1.00 
##                                          descriptor 
##                                                1.00 
##                                               dirti 
##                                                1.00 
##                                             downsid 
##                                                1.00 
##                                                dtm– 
##                                                1.00 
##                                                dtmr 
##                                                1.00 
##                                                dtms 
##                                                1.00 
##                                                 eas 
##                                                1.00 
##                                               escap 
##                                                1.00 
##                                              exerci 
##                                                1.00 
##                                                expo 
##                                                1.00 
##                                                 faq 
##                                                1.00 
##                                              feiner 
##                                                1.00 
##                                              files… 
##                                                1.00 
##                                           findassoc 
##                                                1.00 
##                              findassocsdtmrenterpri 
##                                                1.00 
##                               findassocsdtmrproject 
##                                                1.00 
##                                findassocsdtmrsystem 
##                                                1.00 
##                                        findfreqterm 
##                                                1.00 
##                            findfreqtermsdtmrlowfreq 
##                                                1.00 
##                                               flash 
##                                                1.00 
##                                         freqheadord 
##                                                1.00 
##                                               freqr 
##                                                1.00 
##                                       freqrheadordr 
##                                                1.00 
##                                       freqrtailordr 
##                                                1.00 
##                                         freqtailord 
##                                                1.00 
##                                             geombar 
##                                                1.00 
##                                    geombarstatident 
##                                                1.00 
##                                        gettransform 
##                                                1.00 
##                                             getwdto 
##                                                1.00 
##                                              ggplot 
##                                                1.00 
##                                      ggplotsubsetwf 
##                                                1.00 
##                                             grammat 
##                                                1.00 
##                                              grappl 
##                                                1.00 
##                                                gsub 
##                                                1.00 
##                                              hadley 
##                                                1.00 
##                                             heritag 
##                                                1.00 
##                                               hjust 
##                                                1.00 
##                                            horizont 
##                                                1.00 
##                                               hypen 
##                                                1.00 
##                                              hyphen 
##                                                1.00 
##                                                 ide 
##                                                1.00 
##                                              increa 
##                                                1.00 
##                                               index 
##                                                1.00 
##                                               infam 
##                                                1.00 
##                                                ingo 
##                                                1.00 
##                                          inspectdtm 
##                                                1.00 
##                                     intenttopurchas 
##                                                1.00 
##                                               invok 
##                                                1.00 
##                                            italicis 
##                                                1.00 
##                                              laptop 
##                                                1.00 
##                                               lemma 
##                                                1.00 
##                                              lemmat 
##                                                1.00 
##                                         lengthfreqr 
##                                                1.00 
##                                       libraryggplot 
##                                                1.00 
##                                    librarysnowballc 
##                                                1.00 
##                                    librarywordcloud 
##                                                1.00 
##                                              licens 
##                                                1.00 
##                                                limb 
##                                                1.00 
##                                            listglob 
##                                                1.00 
##                                                 log 
##                                                1.00 
##                                                mate 
##                                                1.00 
##                                             minfreq 
##                                                1.00 
##                                             months… 
##                                                1.00 
##                                            movement 
##                                                1.00 
##                                           multiword 
##                                                1.00 
##                                                noun 
##                                                1.00 
##                                               onlin 
##                                                1.00 
##                             orderfreqrdecreasingtru 
##                                                1.00 
##                                                ordr 
##                                                1.00 
##                                            painless 
##                                                1.00 
##                                              paper… 
##                                                1.00 
##                                             permiss 
##                                                1.00 
##                                              permit 
##                                                1.00 
##                                                 pos 
##                                                1.00 
##                                             postcod 
##                                                1.00 
##                                             preambl 
##                                                1.00 
##                                              proxim 
##                                                1.00 
##                                          rcolorbrew 
##                                                1.00 
##                                           reinspect 
##                                                1.00 
##                                           rollsroyc 
##                                                1.00 
##                                             rscript 
##                                                1.00 
##                                         rudimentari 
##                                                1.00 
##                                              script 
##                                                1.00 
##                                              sensit 
##                                                1.00 
##                                               setse 
##                                                1.00 
##                                               setwd 
##                                                1.00 
##                                            setwdcus 
##                                                1.00 
##                                              shown… 
##                                                1.00 
##                                             simple… 
##                                                1.00 
##                                               slash 
##                                                1.00 
##                                           snowballc 
##                                                1.00 
##                                               socal 
##                                                1.00 
##                                         spacehyphen 
##                                                1.00 
##                                              start… 
##                                                1.00 
##                                           statident 
##                                                1.00 
##                                                 std 
##                                                1.00 
##                                            stemdocu 
##                                                1.00 
##                                              street 
##                                                1.00 
##                                         subcategori 
##                                                1.00 
##                                              sydney 
##                                                1.00 
##                                              tagger 
##                                                1.00 
##                                                 tdm 
##                                                1.00 
##                                                tdms 
##                                                1.00 
##                                             tedious 
##                                                1.00 
##                                              tedium 
##                                                1.00 
##                                          termoccurr 
##                                                1.00 
##                                             textmin 
##                                                1.00 
##                       themeaxistextxelementtextangl 
##                                                1.00 
##                                              ticket 
##                                                1.00 
##                                                 tip 
##                                                1.00 
##                                               token 
##                                                1.00 
##                                               tolow 
##                                                1.00 
##                                            transpos 
##                                                1.00 
##                                                trip 
##                                                1.00 
##                                             twoword 
##                                                1.00 
##                                                unix 
##                                                1.00 
##                                               unnam 
##                                                1.00 
##                                               unzip 
##                                                1.00 
##                                             variant 
##                                                1.00 
##                                                vast 
##                                                1.00 
##                                             vcorpus 
##                                                1.00 
##                                                verb 
##                                                1.00 
##           wfdataframetermnamesfreqroccurrencesfreqr 
##                                                1.00 
##                                           wordcloud 
##                                                1.00 
##                            wordcloudnamesfreqrfreqr 
##                                                1.00 
## wordcloudnamesfreqrfreqrminfreqcolorsbrewerpaldark″ 
##                                                1.00 
##                                            writelin 
##                                                1.00 
##                                              youtub 
##                                                1.00 
##                                                zeno 
##                                                1.00 
##                                                 zip 
##                                                1.00 
##                                              window 
##                                                0.98 
##                                    contenttransform 
##                                                0.97 
##                                           transform 
##                                                0.95 
##                                               getwd 
##                                                0.94 
##                                                freq 
##                                                0.93 
##                                              folder 
##                                                0.92 
##                                                stem 
##                                                0.91 
##                                             inspect 
##                                                0.90 
##                                               maxim 
##                                                0.90 
##                                            nonspars 
##                                                0.90 
##                                            sparsiti 
##                                                0.90 
##                                             version 
##                                                0.90 
##                                                text 
##                                                0.88 
##                                             rstudio 
##                                                0.87 
##                                            frequent 
##                                                0.84 
##                                             snippet 
##                                                0.84 
##                                               zulli 
##                                                0.84 
##                                              behold 
##                                                0.81 
##                                         deconstruct 
##                                                0.81 
##                                                 nlp 
##                                                0.81 
##                                            redeploy 
##                                                0.81 
##                                             synergi 
##                                                0.81 
##                                            enterpri 
##                                                0.80 
##                                              organi 
##                                                0.80 
##                                        removepunctu 
##                                                0.80

findAssocs(dtm_tfidf,"hubbard",0.8)

## $hubbard
##        –techniqu         academia             acid            advoc 
##             0.84             0.84             0.84             0.84 
##              aon            arbit            arena             bell 
##             0.84             0.84             0.84             0.84 
##       blackschol            broke           calibr      combinatori 
##             0.84             0.84             0.84             0.84 
##        commonmod            damag            debri       dictionari 
##             0.84             0.84             0.84             0.84 
##        downright      dunningkrug       empiricist         extrapol 
##             0.84             0.84             0.84             0.84 
##          eyebrow          fattail      firstperson           fissil 
##             0.84             0.84             0.84             0.84 
##            flesh           flight         followup     futuredirect 
##             0.84             0.84             0.84             0.84 
##         gaussian              gut            hackl        heavylift 
##             0.84             0.84             0.84             0.84 
##          herring          hydraul        incomplet            insur 
##             0.84             0.84             0.84             0.84 
##           intang          invalid         languish            lobbi 
##             0.84             0.84             0.84             0.84 
##        metropoli          neutron   nonprobabilist     oneinbillion 
##             0.84             0.84             0.84             0.84 
##       overconfid           pcbase          pension         penultim 
##             0.84             0.84             0.84             0.84 
##           piecem            plane         plethora        portfolio 
##             0.84             0.84             0.84             0.84 
## programportfolio           protiv            quant            radar 
##             0.84             0.84             0.84             0.84 
##              raw             rear       selfassess         simultan 
##             0.84             0.84             0.84             0.84 
##      spectacular        stanislaw            stock      stockmarket 
##             0.84             0.84             0.84             0.84 
##          stratif             tese            trait           trivia 
##             0.84             0.84             0.84             0.84 
##           turbin             ulam        unanticip           unfold 
##             0.84             0.84             0.84             0.84 
##             usag          useless            crash        incorrect 
##             0.84             0.84             0.83             0.83 
##          analyst 
##             0.81

findAssocs(dtm_tfidf,"scapegoat",0.8)

## $scapegoat
##                                               …activ 
##                                                 1.00 
##                                                alarm 
##                                                 1.00 
##                                                 bake 
##                                                 1.00 
##                                                breed 
##                                                 1.00 
##                                               causat 
##                                                 1.00 
##                                            centralis 
##                                                 1.00 
##                                                chees 
##                                                 1.00 
##                                            chernobyl 
##                                                 1.00 
##                                             colloqui 
##                                                 1.00 
##                                               combat 
##                                                 1.00 
##                                          convenient… 
##                                                 1.00 
##                                              dormant 
##                                                 1.00 
##                                                drain 
##                                                 1.00 
##                                                 emot 
##                                                 1.00 
##                                                equip 
##                                                 1.00 
##                                             erstwhil 
##                                                 1.00 
##                                              evidenc 
##                                                 1.00 
##                                             frontlin 
##                                                 1.00 
##                                                 fund 
##                                                 1.00 
## httppatientsafetyedduhsdukeedumoduleeswisscheesehtml 
##                                                 1.00 
##                                                 hunt 
##                                                 1.00 
##                                        illthoughtout 
##                                                 1.00 
##                                                layer 
##                                                 1.00 
##                                                medic 
##                                                 1.00 
##                                              medicin 
##                                                 1.00 
##                                            momentari 
##                                                 1.00 
##                                             mosquito 
##                                                 1.00 
##                                              nuclear 
##                                                 1.00 
##                                              occurs… 
##                                                 1.00 
##                                             overwork 
##                                                 1.00 
##                                             pathogen 
##                                                 1.00 
##                                                recur 
##                                                 1.00 
##                                               remedi 
##                                                 1.00 
##                                                slice 
##                                                 1.00 
##                                               soviet 
##                                                 1.00 
##                                                swamp 
##                                                 1.00 
##                                                 swat 
##                                                 1.00 
##                                                swiss 
##                                                 1.00 
##                                          temporarili 
##                                                 1.00 
##                                              traffic 
##                                                 1.00 
##                                              uncoupl 
##                                                 1.00 
##                                                union 
##                                                 1.00 
##                                                unsaf 
##                                                 1.00 
##                                               violat 
##                                                 1.00 
##                                                error 
##                                                 0.99 
##                                                 hole 
##                                                 0.99 
##                                               defens 
##                                                 0.97 
##                                                 safe 
##                                                 0.97 
##                                                 laps 
##                                                 0.96 
##                                                staff 
##                                                 0.96 
##                                           standardis 
##                                                 0.96 
##                                               latent 
##                                                 0.94 
##                                                 rise 
##                                                 0.94 
##                                               target 
##                                                 0.91 
##                                                 caus 
##                                                 0.90 
##                                               condit 
##                                                 0.90 
##                                               mishap 
##                                                 0.89 
##                                                 poor 
##                                                 0.87 
##                                                trace 
##                                                 0.87 
##                                                agent 
##                                                 0.86 
##                                                 cite 
##                                                 0.86 
##                                               faulti 
##                                                 0.86 
##                                                plant 
##                                                 0.86 
##                                                under 
##                                                 0.85 
##                                              prevent 
##                                                 0.84 
##                                            unaddress 
##                                                 0.84 
##                                               learnt 
##                                                 0.83 
##                                            unrealist 
##                                                 0.83 
##                                             approach 
##                                                 0.82 
##                                             procedur 
##                                                 0.82

#notice the difference!

#histogram
wf=data.frame(term=names(wt_tot_tfidf),weights=wt_tot_tfidf)
#library(ggplot2)
ggplot(subset(wf, wt_tot_tfidf>.1), aes(reorder(term,weights), weights)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1))

#WOrdcloud of TF-IDF - to see weighted words

#wordcloud
#library(wordcloud)
#setting the same seed each time ensures consistent look across clouds
set.seed(42)
#limit words by specifying min total wt
wordcloud(names(wt_tot_tfidf),wt_tot_tfidf, max.words=100)

#...add color
wordcloud(names(wt_tot_tfidf),wt_tot_tfidf,max.words=100,colors=brewer.pal(6,"Dark2"))

#play with different values of max.words
#try specifying min.freq instead of max.words

Heirarchical clustering

#Create document-term matrix
cluster <- DocumentTermMatrix(clean.docs)
## start clustering specific code
#convert dtm to matrix (what format is the dtm stored in?)
cluster.matrix <- as.matrix(cluster)
#write as csv file
write.csv(cluster.matrix,file="../clean_data/ClusterAsMatrix.csv")

#shorten rownames for display purposes
#rownames(m_cluster) <- paste(substring(rownames(m),1,3),rep("..",nrow(m)),
                    #substring(rownames(m_cluster),
                    #nchar(rownames(m_cluster))-12,nchar(rownames(m_cluster))-4))
#compute distance between document vectors
hcluster.distance <- dist(cluster.matrix, method="euclidean")
#run hierarchical clustering using Ward's method (explore other options later)
hcluster.groups <- hclust(hcluster.distance,method="ward.D")
#plot, use hang to ensure that labels fall below tree
plot(hcluster.groups, hang=-1)
#cut into 2 subtrees. Try 3,4,5,6 cuts; comment on your results
rect.hclust(hcluster.groups,2)

hcluster.hclusters <- cutree(hcluster.groups,2)
write.csv(hcluster.hclusters,"../clean_data/hclusters.csv")

Cosine distance clusters

##try another distance measure
cosineSim <- function(x){
  as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cosine <- cosineSim(cluster.matrix)
cosine.distance <- cosine

#run hierarchical clustering using cosine distance
cosine.groups <- hclust(cosine.distance, method="ward.D")
#plot, use hang to ensure that labels fall below tree
plot(cosine.groups, hang=-1)
#cut into 2 subtrees.
rect.hclust(cosine.groups, 2)

cosine.hclusters <- cutree(cosine.groups, 2)
write.csv(cosine.hclusters, "../clean_data/hclusters_cosine.csv")

KMeans clustering

#kmeans clustering
#kmeans - run with nstart=100 and k=2,3,5 to compare results with hclust
kfit <- kmeans(hcluster.distance, centers=3, nstart=100)
#plot - need library cluster
clusplot(as.matrix(hcluster.distance), kfit$cluster, color=T, shade=T, labels=2, lines=0)

#print contents of kfit
print(kfit)

## K-means clustering with 3 clusters of sizes 4, 12, 25
## 
## Cluster means:
##   Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt
## 1  188.6253  192.3036 167.62497  170.8201 181.28787  188.8120 175.09968
## 2  115.5392  118.3771 111.57906  115.9203 124.96098  115.2198 112.95505
## 3  109.6623  112.5678  68.79821   67.9816  96.08796  107.8372  77.87122
##   Doc08.txt Doc09.txt Doc10.txt Doc11.txt Doc12.txt Doc13.txt Doc14.txt
## 1  193.8233  206.3925 174.51699  196.9089 171.25725 169.29243  174.8946
## 2  127.6042  129.8400 117.87510  149.3241 121.89970 123.75103  135.6713
## 3  130.6559  136.5729  81.37006  136.6676  77.71224  74.68835   95.6443
##   Doc15.txt Doc16.txt Doc17.txt Doc18.txt Doc19.txt Doc20.txt Doc21.txt
## 1 174.26716 170.61832 172.97231  174.9302  148.4990  149.9209  177.1190
## 2 124.44654 122.62525 129.59150  137.9235  173.5920  181.7467  143.7394
## 3  86.84732  78.28215  85.33809  100.8456  161.3057  165.5252  132.7690
##   Doc22.txt Doc23.txt Doc24.txt Doc25.txt Doc26.txt Doc27.txt Doc28.txt
## 1  171.4038 169.05986 173.44495 173.05071 177.67272 175.77795  171.5950
## 2  141.7953 123.73378 124.14625 118.61670 131.41773 128.48509  118.7266
## 3  127.6210  78.78967  79.71905  80.78633  90.93077  86.37011   72.9920
##   Doc29.txt Doc30.txt Doc31.txt Doc32.txt Doc33.txt Doc34.txt Doc35.txt
## 1  199.1312  190.7290 177.19832 182.91260  184.7659 174.80883  144.3683
## 2  138.0648  132.4550 128.97569 131.46912  137.3207 127.35684  206.7825
## 3  133.9439  117.9196  86.22049  97.33003  101.1966  86.52654  198.5078
##   Doc36.txt Doc37.txt Doc38.txt Doc39.txt Doc40.txt Doc41.txt
## 1 164.70797  163.4464  152.3760  173.4409  133.4520  147.0604
## 2 122.61888  122.9342  126.4871  147.7589  179.7954  134.4065
## 3  76.19227   78.1793  100.6445  136.4362  165.6052  115.6609
## 
## Clustering vector:
## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt 
##         2         2         3         3         3         2         3 
## Doc08.txt Doc09.txt Doc10.txt Doc11.txt Doc12.txt Doc13.txt Doc14.txt 
##         2         2         3         2         3         3         3 
## Doc15.txt Doc16.txt Doc17.txt Doc18.txt Doc19.txt Doc20.txt Doc21.txt 
##         3         3         3         3         1         1         2 
## Doc22.txt Doc23.txt Doc24.txt Doc25.txt Doc26.txt Doc27.txt Doc28.txt 
##         2         3         3         3         3         3         3 
## Doc29.txt Doc30.txt Doc31.txt Doc32.txt Doc33.txt Doc34.txt Doc35.txt 
##         2         2         3         3         3         3         1 
## Doc36.txt Doc37.txt Doc38.txt Doc39.txt Doc40.txt Doc41.txt 
##         3         3         3         2         1         2 
## 
## Within cluster sum of squares by cluster:
## [1] 169865.3 367619.1 321561.2
##  (between_SS / total_SS =  55.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

#print cluster sizes
kfit$size

## [1]  4 12 25

#print clusters (members)
kfit$cluster

## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt 
##         2         2         3         3         3         2         3 
## Doc08.txt Doc09.txt Doc10.txt Doc11.txt Doc12.txt Doc13.txt Doc14.txt 
##         2         2         3         2         3         3         3 
## Doc15.txt Doc16.txt Doc17.txt Doc18.txt Doc19.txt Doc20.txt Doc21.txt 
##         3         3         3         3         1         1         2 
## Doc22.txt Doc23.txt Doc24.txt Doc25.txt Doc26.txt Doc27.txt Doc28.txt 
##         2         3         3         3         3         3         3 
## Doc29.txt Doc30.txt Doc31.txt Doc32.txt Doc33.txt Doc34.txt Doc35.txt 
##         2         2         3         3         3         3         1 
## Doc36.txt Doc37.txt Doc38.txt Doc39.txt Doc40.txt Doc41.txt 
##         3         3         3         2         1         2

#write clusters to csv file
write.csv(kfit$cluster,file="../clean_data/KMClustGroups2.csv")
#sum of squared distance between cluster centers 
kfit$betweenss

## [1] 1053721

#sum of squared distance within a cluster (this are the quantities that the algorithm
#attempts to minimise)
kfit$withinss

## [1] 169865.3 367619.1 321561.2

#kmeans - how to determine optimal number of clusters?

#One approach: look for "elbow" in plot of summed intra-cluster distances (withinss) as fn of k

wss <- 2:(length(clean.docs)-1)

for (i in 2:(length(clean.docs)-1)) {
  wss[i] <- sum(kmeans(hcluster.distance,centers=i,nstart=25)$withinss)
}

plot(2:(length(clean.docs)-1), wss[2:(length(clean.docs)-1)], type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")

Igraph

library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

m<-as.matrix(dtm)

#Map filenames to matrix row numbers
#these numbers will be used to reference files in the network graph
filekey <- cbind(1:length(docs),rownames(m))
write.csv(filekey,"filekey.csv",row.names = FALSE)
#have a look at file
rownames(m) <- 1:length(docs)
#compute cosine similarity between document vectors
#converting to distance matrix sets diagonal elements to 0
cosineSim <- function(x){
  as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cs <- cosineSim(m)

#adjacency matrix: set entries below a certain threshold to 0.
#We choose half the magnitude of the largest element of the matrix
#as the cutoff. This is an arbitrary choice
cs[cs < max(cs)/2] <- 0
cs <- round(cs,3)
#write to disk
write.csv(as.matrix(cs),file="../clean_data/AdjacencyMatrix.csv")
#open it and have a look

build that igraph

# build a graph from the above matrix
#mode is undirected because similarity is a bidirectional relationship
igraph <- graph.adjacency(as.matrix(cs), weighted=T, mode = "undirected")
#Plot a Graph
# set seed to make the layout reproducible
set.seed(42)
#one of many possible layouts, see igraph docs
layout1 <- layout.fruchterman.reingold(igraph)
#basic plot with no weighting - fruchtermann reingold weighting
plot(igraph, layout=layout1)

#another layout
plot(igraph, layout=layout.kamada.kawai)

# 19,20,21,22 all related
#24,23,18,27,31,34,24 all sit out on their own
#33 and 26 relate back to 25, which relates back to 9 and 29
#36 and 37 relate back to 41, which has other relations

fast.greedy

#Community detection - Fast/Greedy
comm_fg <- fastgreedy.community(igraph)
comm_fg$membership

##  [1]  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5  5  6  2  2  2  2  7
## [24]  8  1  1  9  4  4  4 10  4  1 11  3  3  3  3  3  3  3

V(igraph)$color <- comm_fg$membership
plot(igraph, layout=layout.kamada.kawai)

community_mapping <- cbind(as.data.frame(filekey, row.names = F),comm_fg$membership)
community_mapping

##    V1        V2 comm_fg$membership
## 1   1 Doc01.txt                  4
## 2   2 Doc02.txt                  4
## 3   3 Doc03.txt                  4
## 4   4 Doc04.txt                  4
## 5   5 Doc05.txt                  4
## 6   6 Doc06.txt                  4
## 7   7 Doc07.txt                  4
## 8   8 Doc08.txt                  4
## 9   9 Doc09.txt                  4
## 10 10 Doc10.txt                  4
## 11 11 Doc11.txt                  5
## 12 12 Doc12.txt                  5
## 13 13 Doc13.txt                  5
## 14 14 Doc14.txt                  5
## 15 15 Doc15.txt                  5
## 16 16 Doc16.txt                  5
## 17 17 Doc17.txt                  5
## 18 18 Doc18.txt                  6
## 19 19 Doc19.txt                  2
## 20 20 Doc20.txt                  2
## 21 21 Doc21.txt                  2
## 22 22 Doc22.txt                  2
## 23 23 Doc23.txt                  7
## 24 24 Doc24.txt                  8
## 25 25 Doc25.txt                  1
## 26 26 Doc26.txt                  1
## 27 27 Doc27.txt                  9
## 28 28 Doc28.txt                  4
## 29 29 Doc29.txt                  4
## 30 30 Doc30.txt                  4
## 31 31 Doc31.txt                 10
## 32 32 Doc32.txt                  4
## 33 33 Doc33.txt                  1
## 34 34 Doc34.txt                 11
## 35 35 Doc35.txt                  3
## 36 36 Doc36.txt                  3
## 37 37 Doc37.txt                  3
## 38 38 Doc38.txt                  3
## 39 39 Doc39.txt                  3
## 40 40 Doc40.txt                  3
## 41 41 Doc41.txt                  3

#24, 18 and 31 are their own community
#34 is similar to 38 and the other greens

Louvain

#Community detection - Louvain
comm_lv <- cluster_louvain(igraph)
comm_lv$membership

##  [1]  1  1  1  1  9  1  1  1  1  1  2  2  2  2  2  2  2  3  4  4  4  4  5
## [24]  6  9  9  7  9  9  9  8  9  9 10 11 11 11 11 11 11 11

V(igraph)$color <- comm_lv$membership
plot(igraph, layout=layout.kamada.kawai)

community_mapping <- cbind(community_mapping,comm_lv$membership)
community_mapping

##    V1        V2 comm_fg$membership comm_lv$membership
## 1   1 Doc01.txt                  4                  1
## 2   2 Doc02.txt                  4                  1
## 3   3 Doc03.txt                  4                  1
## 4   4 Doc04.txt                  4                  1
## 5   5 Doc05.txt                  4                  9
## 6   6 Doc06.txt                  4                  1
## 7   7 Doc07.txt                  4                  1
## 8   8 Doc08.txt                  4                  1
## 9   9 Doc09.txt                  4                  1
## 10 10 Doc10.txt                  4                  1
## 11 11 Doc11.txt                  5                  2
## 12 12 Doc12.txt                  5                  2
## 13 13 Doc13.txt                  5                  2
## 14 14 Doc14.txt                  5                  2
## 15 15 Doc15.txt                  5                  2
## 16 16 Doc16.txt                  5                  2
## 17 17 Doc17.txt                  5                  2
## 18 18 Doc18.txt                  6                  3
## 19 19 Doc19.txt                  2                  4
## 20 20 Doc20.txt                  2                  4
## 21 21 Doc21.txt                  2                  4
## 22 22 Doc22.txt                  2                  4
## 23 23 Doc23.txt                  7                  5
## 24 24 Doc24.txt                  8                  6
## 25 25 Doc25.txt                  1                  9
## 26 26 Doc26.txt                  1                  9
## 27 27 Doc27.txt                  9                  7
## 28 28 Doc28.txt                  4                  9
## 29 29 Doc29.txt                  4                  9
## 30 30 Doc30.txt                  4                  9
## 31 31 Doc31.txt                 10                  8
## 32 32 Doc32.txt                  4                  9
## 33 33 Doc33.txt                  1                  9
## 34 34 Doc34.txt                 11                 10
## 35 35 Doc35.txt                  3                 11
## 36 36 Doc36.txt                  3                 11
## 37 37 Doc37.txt                  3                 11
## 38 38 Doc38.txt                  3                 11
## 39 39 Doc39.txt                  3                 11
## 40 40 Doc40.txt                  3                 11
## 41 41 Doc41.txt                  3                 11

#it sees the previous yellow as the same as the orange, but sees 34 as relating to 14, 17 etc
#slightly different communities

another way with Network Graphs

#lets weight the nodes and edges
#set label (not really necessary)
#V=vertex, E=edge
V(igraph)$label <- V(igraph)$name
#Vertex size proportional to number of connections
V(igraph)$size <- degree(igraph)*.6
#Vertex label size proportional to number of connections
V(igraph)$label.cex <-  degree(igraph) / max(degree(igraph))+ .8
#label colour default black
V(igraph)$label.color <- "black"
#Vertex color organe
V(igraph)$color <- "orange"
#edge color grey
E(igraph)$color <- "grey"
#edge width proportional to similarity (weight)
E(igraph)$width <- E(igraph)$weight*7
# plot the graph in layout1 (fruchtermann reingold)
plot(igraph, layout=layout1)

#output is quite ugly. Explore igraph to see how you
#can fix it
#9 is important, so is 6 and 7

#lets weight the nodes and edges
#set label (not really necessary)
#V=vertex, E=edge
V(igraph)$label <- V(igraph)$name
#Vertex size proportional to number of connections
V(igraph)$size <- degree(igraph)*.6
#Vertex label size proportional to number of connections
V(igraph)$label.cex <-  degree(igraph) / max(degree(igraph))+ .6
#label colour default black
V(igraph)$label.color <- "black"
#Vertex color organe
V(igraph)$color <- "orange"
#edge color grey
E(igraph)$color <- "grey"
#edge width proportional to similarity (weight)
E(igraph)$width <- E(igraph)$weight*5
# plot the graph in layout1 (fruchtermann reingold)
plot(igraph, layout=layout.auto)

#output is quite ugly. Explore igraph to see how you
#can fix it
plot(igraph, layout=layout1)

#LSA

library(lsa)
#Create term-document matrix (lsa expects a TDM rather than a DTM)
tdm <- TermDocumentMatrix(clean.docs)
#summary
tdm

## <<TermDocumentMatrix (terms: 4492, documents: 41)>>
## Non-/sparse entries: 18623/165549
## Sparsity           : 90%
## Maximal term length: 114
## Weighting          : term frequency (tf)

#inspect segment of document term matrix
inspect(tdm[1000:1006,1:10])

## <<TermDocumentMatrix (terms: 7, documents: 10)>>
## Non-/sparse entries: 8/62
## Sparsity           : 89%
## Maximal term length: 8
## Weighting          : term frequency (tf)
## Sample             :
##           Docs
## Terms      Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt
##   deliber          0         0         0         0         0         1
##   delight          0         0         0         0         0         0
##   deliv            0         0         1         0         0         1
##   deliver          0         0         0         0         2         0
##   deliveri         0         0         1         0         0         0
##   delv             0         0         0         0         0         0
##   demand           0         3         0         0         0         0
##           Docs
## Terms      Doc07.txt Doc08.txt Doc09.txt Doc10.txt
##   deliber          0         0         0         0
##   delight          0         0         0         0
##   deliv            0         0         0         0
##   deliver          4         0         0         0
##   deliveri         1         0         0         0
##   delv             0         0         0         0
##   demand           0         0         0         0

#what kind of object is the tdm?
class(tdm)

## [1] "TermDocumentMatrix"    "simple_triplet_matrix"

#note: a simple triplet matrix (STM) is an efficient format to store
#sparse matrices
#need to convert STM to regular matrix
#convert to regular matrix
tdm.matrix <- as.matrix(tdm)
#check class
class(tdm.matrix)

## [1] "matrix"

dim(tdm.matrix)

## [1] 4492   41

#weight terms and docs

#Note: We weight the TDM before calculating the latent semantic space.
#This is to better reflect the relative importance of each term/doc 
#in relation to the entire corpus (much like tf-idf weighting).  
#It is convenient to express the transformation as a product of 
#two numbers - local and global weight functions, like so:
#a (i,j) = L(i,j)*G(i).
#The local weight function L(i,j) presents the weight of term i 
#in document j. The global weight function G(i) is used to express 
#the weight of the term iacross the entire document set. 

#We'll use the equivalent of tf-idf (local weight - tf, global -idf)
#check out other weighting schemes in the documentation (link above)
tdm.matrix.lsa <- lw_tf(tdm.matrix) * gw_idf(tdm.matrix)
dim(tdm.matrix.lsa)

## [1] 4492   41

#compute the Latent semantic space
lsaSpace <- lsa(tdm.matrix.lsa, dimcalc_share()) # create LSA space
#examine output
names(lsaSpace)

## [1] "tk" "dk" "sk"

#Original Matrix is decomposed as: 
#tk(nterms,lsadim).Sk(lsadim).dk*(lsadim,ndocs)
#where 
#nterms=number of terms in TDM
#ndocs=number of docs in TDM
#lsadim=dimensionality of Latent Semantic Space (length of Sk)
LSAMat <- as.textmatrix(lsaSpace)
#1)
#Examine a term in LS space
LSAMat["social",1:10]

## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt 
## 11.198411  7.246920  4.012227  2.235230 10.588707 14.475969  6.947769 
## Doc08.txt Doc09.txt Doc10.txt 
## 22.950848 17.895179  7.801364

#compare to Term-frequency space
tdm.matrix.lsa["social",1:10]

## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt 
##  3.187627  6.375254  0.000000  0.000000  0.000000 92.441183  0.000000 
## Doc08.txt Doc09.txt Doc10.txt 
##  0.000000  0.000000  0.000000

#try other words
#What does this tell you about the term vectors in the two spaces?

EDA on AT3: Text Mining

Alex Brooks

10/14/2018

EDA on docs

Write a standard function to clean the documents

A bit of basic analysis of documents with this function

Insights on the corpus

Including Plots

TF-IDF

Heirarchical clustering

Cosine distance clusters

KMeans clustering

Igraph

build that igraph

fast.greedy

Louvain

another way with Network Graphs