Looking at the document corpus and analysing its themes.
#Let's go. First load the required libraries
library(tm)
library(SnowballC)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(cluster)
#Load the corpus
docs <- VCorpus(DirSource("../raw_data/doc_corpus"))
num.docs <- length(docs)
clean_docs <- function(docs) {
output = docs
#Remove punctuation - replace punctuation marks with " "
output = tm_map(output, removePunctuation)
#Transform to lower case
output = tm_map(output,content_transformer(tolower))
#Strip digits
output = tm_map(output, removeNumbers)
#Remove stopwords from standard stopword list
output = tm_map(output, removeWords, stopwords("english"))
#Create a custom set of stopwords that weren't removed by the standard stopwords list
myStopwords <- c("can",
"also",
"get",
"see",
"may",
"much",
"now",
"said",
"will",
"way",
"well",
"howev",
"say",
"one",
"use")
#Remove the custom stopwords
output = tm_map(output, removeWords, myStopwords)
#Strip whitespace (cosmetic?)
output = tm_map(output, stripWhitespace)
return(output)
}
clean.docs <- clean_docs(docs)
document_info <- function(documents, clean.documents) {
num.documents = length(clean.documents)
for(i in 1:num.documents) {
document.meta = clean.documents[[i]]$meta
document.content = clean.documents[[i]]$content
raw.document.content = documents[[i]]$content
document.temp = data.frame(id = document.meta$id,
language = document.meta$language,
datetimestamp = document.meta$datetimestamp,
raw.characters = nchar(raw.document.content),
clean.characters = nchar(document.content))
if (i == 1) {
document.summary = document.temp
} else {
document.summary = rbind(document.summary, document.temp)
}
}
return(document.summary)
}
#running the function on raw docs and clean docs
document_info(docs, clean.docs)
## id language datetimestamp raw.characters clean.characters
## 1 Doc01.txt en 2018-10-23 05:57:07 15842 11001
## 2 Doc02.txt en 2018-10-23 05:57:07 12900 9199
## 3 Doc03.txt en 2018-10-23 05:57:07 5119 3364
## 4 Doc04.txt en 2018-10-23 05:57:07 2999 1774
## 5 Doc05.txt en 2018-10-23 05:57:07 10122 7020
## 6 Doc06.txt en 2018-10-23 05:57:07 11949 8282
## 7 Doc07.txt en 2018-10-23 05:57:07 5072 3404
## 8 Doc08.txt en 2018-10-23 05:57:07 19088 13569
## 9 Doc09.txt en 2018-10-23 05:57:07 13908 9261
## 10 Doc10.txt en 2018-10-23 05:57:07 7310 4941
## 11 Doc11.txt en 2018-10-23 05:57:07 26707 17968
## 12 Doc12.txt en 2018-10-23 05:57:07 9770 6511
## 13 Doc13.txt en 2018-10-23 05:57:07 6253 3170
## 14 Doc14.txt en 2018-10-23 05:57:07 12612 7553
## 15 Doc15.txt en 2018-10-23 05:57:07 8602 5994
## 16 Doc16.txt en 2018-10-23 05:57:07 9145 6275
## 17 Doc17.txt en 2018-10-23 05:57:07 9381 6056
## 18 Doc18.txt en 2018-10-23 05:57:07 13956 9362
## 19 Doc19.txt en 2018-10-23 05:57:07 34921 22566
## 20 Doc20.txt en 2018-10-23 05:57:07 25250 16499
## 21 Doc21.txt en 2018-10-23 05:57:07 20814 13589
## 22 Doc22.txt en 2018-10-23 05:57:07 19994 13431
## 23 Doc23.txt en 2018-10-23 05:57:07 9338 6217
## 24 Doc24.txt en 2018-10-23 05:57:07 7275 5026
## 25 Doc25.txt en 2018-10-23 05:57:07 13068 8860
## 26 Doc26.txt en 2018-10-23 05:57:07 13548 9296
## 27 Doc27.txt en 2018-10-23 05:57:07 10144 6897
## 28 Doc28.txt en 2018-10-23 05:57:07 5559 3970
## 29 Doc29.txt en 2018-10-23 05:57:07 20603 14742
## 30 Doc30.txt en 2018-10-23 05:57:07 17404 12241
## 31 Doc31.txt en 2018-10-23 05:57:07 7042 4644
## 32 Doc32.txt en 2018-10-23 05:57:07 11917 8341
## 33 Doc33.txt en 2018-10-23 05:57:07 12876 8604
## 34 Doc34.txt en 2018-10-23 05:57:07 13419 8668
## 35 Doc35.txt en 2018-10-23 05:57:07 27493 16897
## 36 Doc36.txt en 2018-10-23 05:57:07 6037 3957
## 37 Doc37.txt en 2018-10-23 05:57:07 6871 4418
## 38 Doc38.txt en 2018-10-23 05:57:07 10482 6619
## 39 Doc39.txt en 2018-10-23 05:57:07 16881 11082
## 40 Doc40.txt en 2018-10-23 05:57:07 18455 11735
## 41 Doc41.txt en 2018-10-23 05:57:07 14453 9365
clean.docs <- tm_map(clean.docs,stemDocument)
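#note: stopwords were removed before stemming, so stemmed variants such as "use"
#and "one" can reappear in the frequency lists further down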
#inspect
writeLines(as.character(clean.docs[[num.docs]]))
## essay uncertainti project task estim describ task estim correspond probabl distribut put simpli task estim actual rang possibl complet time probabl occurr specifi distribut know distribut possibl answer question probabl task complet within x day reliabl predict depend faith distribut captur actual spread task durat – therein lie least coupl problem first probabl distribut task durat general hard characteris lack reliabl data estim good estim histor data usual avail second mani realist distribut complic mathemat form hard characteris manipul problem compound fact project consist sever task durat estim possibl complic distribut first issu usual address fit distribut point estim optimist pessimist like time pert refin estim distribut gain experi second issu tackl mont carlo techniqu involv simul task number time use appropri distribut calcul expect complet time base result aim post present examplebas introduct mont carlo simul project task durat although aim keep thing reason simpl beyond highschool math basic understand probabl ill cover fair bit ground given id best start brief descript approach reader know come mont carlo simul umbrella term cover rang approach random sampl simul event describ known probabl distribut first task specifi probabl distribut howev mention earlier general unknown task durat simplic ill assum task durat uncertainti describ accur use triangular probabl distribut – distribut particular easi handl mathemat point view advantag use triangular distribut simul result valid easili use triangular distribut isnt limit method describ appli arbitrarili shape distribut import techniqu use simul happen multipl task strung togeth project schedul ill cover futur post final ill demonstr mont carlo simul method appli singl task describ triangular distribut although simul overkil case question regard durat answer exact without use simul exampl serv illustr step involv simul complex case – compris task andor involv complic distribut without ado let begin journey describ triangular distribut triangular distribut let assum there project task need person go reckon take hour complet like complet time hour estim come number isnt import stage – mayb there guesswork mayb pad mayb realli base experi what import three number correspond minimum like maximum time keep discuss general call tmin tml tmax respect back estim specif number later probabl associ time sinc tmin tmax correspond minimum maximum time probabl associ zero wasnt zero nonzero probabl complet time less tmin greater tmax – isnt possibl note consequ assumpt probabl vari continu – take nonzero valu p tmin must take valu slight less p – greater – t slight smaller tmin far like time tml concern definit probabl attain highest valu time tml assum probabl describ triangular function distribut must form shown figur figur figur triangular distribut simul need know equat describ distribut although wikipedia tell us answer mouseclick instruct figur first note area triangl must equal task must finish time tmin tmax consequ fractimesbasetimesaltitudefractimestmaxtmintimesptmlldotsldot ptml probabl correspond time tml bit rearrang ptmlfractmaxtminldotsldot deriv probabl time t lie tmax tml note fracttminptfractmltminptmlldotsldot consequ fact ratio either side equat equal slope line join point tmin tml ptml figur figur substitut simplifi bit obtain ptfracttmintmltmintmaxtmindotsldot tminleq t leq tml similar fashion show probabl time lie tml tmax given ptfractmaxttmaxtmltmaxtmindotsldot tmlleq t leq tmax equat togeth describ probabl 
distribut function pdf time tmin tmax anoth quantiti interest cumul distribut function cdf probabl p task complet time t reiter pdf pt probabl task finish time t wherea cdf pt probabl task complet time t cdf pt essenti sum probabl tmin t t tmin area triangl apex tmin t t pt use formula area triangl base time height equat ptfracttmintmltmintmaxtminldotsldot tminleq t leq tml note t geq tml area curv equal total area minus area enclos triangl base t tmax pt fractmaxttmaxtmltmaxtminldotsldot tmlleq t leq tmax expect pt start valu tmin increas monoton attain valu tmax end section let plug number quot estim start section tmin tml tmax result pdf cdf shown figur figur triangular pdf tmin tml tmax figur triangular cdf tmin tml tmax figur – triangular cdf tmin tml tmax mont carlo simul singl task ok busi end essay – simul ill first outlin simul procedur discuss result case task describ previous section triangular distribut tmin tml tmax note use tk solver – mathemat packag creat univers technic system – simul tk solver builtin backsolv capabl extrem help solv equat come simul calcul excel spreadsheet skill here simul procedur generat random number treat number cumul probabl pt simul run technic note use random number generat come tk solver packag algorithm use generat describ excel random number generat even better find time t correspond pt solv equat t result valu t time taken complet task technic note solv equat t isnt straightforward t appear sever place equat two option solv t numer techniqu bisect newtonraphson method b backsolv goal seek function excel mathemat packag use backsolv capabl tk solver obtain t random valu p generat tk solver backsolv equat automat – fiddl around numer method – make attract option kind calcul repeat step n time n suffici larg number – calcul n use triangular distribut paramet tmin tml tmax gave valu pt t exampl simul run proceed here data first simul run random number generat return call valu pt time correspond cumul probabl obtain solv equat numer t gave t call shown figur complet time first run figur figur complet simul run sort bin correspond time interv hrs start thrs t hrs result histogram shown figur bar correspond number simul run fall within time interv figur distribut simul run figur distribut simul run might expect look like triangular distribut shown figur differ though figur plot probabl continu function time wherea figur plot number simul run step function time convinc two realli let look cumul probabl tml – ie probabl task complet within hrs equat ptml correspond number simul simpli number simul run complet time less equal hrs divid total number simul run simul come agreement perfect convinc enough just sure perform simul number time – generat sever set random number – took averag predict ptml agreement theori simul improv expect wrap limit triangular distribut impos upper cutoff tmax longtail distribut therefor realist end though form distribut neither techniqu appli distribut real question obtain reliabl distribut estim there easi answer start three point estim pert fit triangular complic distribut although best histor data absenc alway start reason guess point refin experi anoth point worth mention simul done level higher indivdu task brilliant book – waltz bear manag risk softwar project – de marco lister demonstr mont carlo method simul various aspect project – veloc time cost etc – project level oppos task level believ better perform simul lowest possibl level although lot work – main reason easier less errorpron estim individu task entir 
project nevertheless high level simul use reliabl data base remiss didnt mention various mont carlo packag avail market ive never use account theyr pretti good commerci packag exampl product random number generat sampl techniqu far sophist simpl one ive use exampl final admit exampl describ post complic demonstr obvious – start triangular distribut got back triangular distribut via simul point howev illustr method show yield expect result situat answer known futur post ill appli method complex situat exampl multipl task seri parallel depend rule thrown good measur although ill triangular distribut individu task result far obvious simul method realli start shine complex increas wait later hope exampl help illustr mont carlo method use simul project task
#end of preprocessing
#Create document-term matrix
dtm <- DocumentTermMatrix(clean.docs)
#summary
dtm
## <<DocumentTermMatrix (documents: 41, terms: 4492)>>
## Non-/sparse entries: 18623/165549
## Sparsity : 90%
## Maximal term length: 114
## Weighting : term frequency (tf)
#4492 terms in the 41 docs
#inspect segment of document term matrix
inspect(dtm[1:10,995:1006])
## <<DocumentTermMatrix (documents: 10, terms: 12)>>
## Non-/sparse entries: 13/107
## Sparsity : 89%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs defocus degre delay deleg delib deliber deliv deliver deliveri
## Doc01.txt 0 0 0 0 0 0 0 0 0
## Doc02.txt 1 0 0 0 1 0 0 0 0
## Doc03.txt 0 0 0 0 0 0 1 0 1
## Doc04.txt 0 0 0 0 0 0 0 0 0
## Doc05.txt 0 1 1 0 0 0 0 2 0
## Doc06.txt 0 0 0 0 0 1 1 0 0
## Doc07.txt 0 0 0 0 0 0 0 4 1
## Doc08.txt 0 1 0 0 0 0 0 0 0
## Doc09.txt 0 0 0 0 0 0 0 0 0
## Doc10.txt 0 0 0 0 0 0 0 0 0
## Terms
## Docs demand
## Doc01.txt 0
## Doc02.txt 3
## Doc03.txt 0
## Doc04.txt 0
## Doc05.txt 0
## Doc06.txt 0
## Doc07.txt 0
## Doc08.txt 0
## Doc09.txt 0
## Doc10.txt 0
#collapse matrix by summing over columns - this gets total counts (over all docs) for each term
freq <- colSums(as.matrix(dtm))
#length should be total number of terms
length(freq)
## [1] 4492
#create sort order (descending)
ord <- order(freq,decreasing=TRUE)
#inspect most frequently occurring terms
freq[head(ord)]
## project risk manag figur time use
## 579 538 507 335 333 299
#write to disk and inspect file
write.csv(file="../clean_data/freq.csv",freq[ord])
#inspect least frequently occurring terms
freq[tail(ord)]
## ysqrtsum yyy zero… zip
## 1 1 1 1
## zoom zsqrtxyzsqrtxyz
## 1 1
#list most frequent terms. Lower bound specified as second argument
findFreqTerms(dtm,lowfreq=80)
## [1] "algorithm" "approach" "argument" "articl" "author"
## [6] "base" "best" "case" "chang" "cluster"
## [11] "complet" "correl" "data" "day" "decis"
## [16] "describ" "develop" "differ" "discuss" "distribut"
## [21] "doc" "document" "estim" "exampl" "figur"
## [26] "first" "follow" "function" "good" "group"
## [31] "howev" "ibi" "idea" "ill" "import"
## [36] "inform" "interest" "issu" "knowledg" "like"
## [41] "look" "make" "manag" "mani" "map"
## [46] "mean" "method" "model" "need" "note"
## [51] "number" "often" "organ" "organis" "paper"
## [56] "point" "possibl" "post" "practic" "probabl"
## [61] "problem" "process" "project" "question" "reason"
## [66] "result" "risk" "set" "simul" "system"
## [71] "take" "task" "techniqu" "term" "thing"
## [76] "time" "topic" "two" "understand" "use"
## [81] "valu" "word" "work"
#correlations
findAssocs(dtm,"algorithm",0.9)
## $algorithm
## out nstart purist rownam
## 0.97 0.90 0.90 0.90
findAssocs(dtm,"argument",0.75)
## $argument
## map issu respond applic question compendium
## 0.94 0.90 0.80 0.79 0.79 0.78
## issuebas ibi
## 0.77 0.76
findAssocs(dtm,"cluster",0.9)
## $cluster
## agglom already…
## 1.00 1.00
## annoy asmatrixdtm
## 1.00 1.00
## bolt branch
## 1.00 1.00
## britishaussi categor
## 1.00 1.00
## centroid chug
## 1.00 1.00
## clearest closest
## 1.00 1.00
## clusplot clusplotasmatrixd
## 1.00 1.00
## clustersylabwithin colort
## 1.00 1.00
## concaten configur
## 1.00 1.00
## contenttransformergsubpattern cuserskailashdocu
## 1.00 1.00
## dab dabi
## 1.00 1.00
## dendogram dendrogram
## 1.00 1.00
## denomin dens
## 1.00 1.00
## dist distancebas
## 1.00 1.00
## distancerel distancewis
## 1.00 1.00
## distm dxi
## 1.00 1.00
## educat elbow
## 1.00 1.00
## euclidean excus
## 1.00 1.00
## flatten geometr
## 1.00 1.00
## hclust hclustdmethodwardd
## 1.00 1.00
## hood insofar
## 1.00 1.00
## intraclust kfit
## 1.00 1.00
## kfitclust kmean
## 1.00 1.00
## kmeansd libraryclust
## 1.00 1.00
## longest means…
## 1.00 1.00
## miningspeak ncharrownamesmncharrownamesm
## 1.00 1.00
## nut onus
## 1.00 1.00
## optimis optimum
## 1.00 1.00
## pastesubstringrownamesmrepnrowm plotgroup
## 1.00 1.00
## postlookrightnowthinkv potentiallyy
## 1.00 1.00
## rambl recthclust
## 1.00 1.00
## recthclustgroup recurs
## 1.00 1.00
## reduct rownamesm
## 1.00 1.00
## scan shadet
## 1.00 1.00
## sharpli shorten
## 1.00 1.00
## showstopp silent
## 1.00 1.00
## split sqrt
## 1.00 1.00
## sqrtx steadfast
## 1.00 1.00
## straightforward– substringrownamesm
## 1.00 1.00
## subtre sumkmeansdcentersinstartwithinss
## 1.00 1.00
## territori toolsinstal
## 1.00 1.00
## truncat twodocu
## 1.00 1.00
## typeb ward
## 1.00 1.00
## welldefin withinclust
## 1.00 1.00
## withinss wss
## 1.00 1.00
## wssi xlabnumb
## 1.00 1.00
## xxx yyy
## 1.00 1.00
## princip hierarch
## 0.99 0.98
## distanc merg
## 0.97 0.97
## minimis
## 0.95
findAssocs(dtm,"project",0.69)
## $project
## lose dollar win
## 0.71 0.70 0.70
findAssocs(dtm,"discuss",0.75)
## $discuss
## werner
## 0.79
findAssocs(dtm,"model",0.75)
## $model
## overst test engin field
## 0.86 0.85 0.81 0.76
findAssocs(dtm,"risk",0.7)
## $risk
## numeric(0)
findAssocs(dtm,"question",0.7)
## $question
## argument map respond con idea issu pros issuebas
## 0.79 0.77 0.77 0.76 0.74 0.73 0.72 0.71
findAssocs(dtm,"document",0.9)
## $document
## contenttransformerfunctionx gsubpattern
## 0.97 0.97
## inenterpri librarytm
## 0.97 0.97
## stopwordsenglish tmmapdoc
## 0.97 0.97
## tmmapdocscontenttransformertolow tmmapdocsstemdocu
## 0.97 0.97
## contenttransformergsub doc
## 0.96 0.95
## replac writelinesascharacterdoc
## 0.95 0.95
## removeword tospac
## 0.94 0.94
## pattern
## 0.91
#histogram
wf = data.frame(term=names(freq), occurrences=freq)
ggplot(subset(wf, occurrences>200), aes(term, occurrences)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1))
#order by frequency
ggplot(subset(wf, occurrences>200), aes(reorder(term,occurrences), occurrences)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1))
#wordcloud
#setting the same seed each time ensures consistent look across clouds
set.seed(42)
#limit the number of words shown by specifying max.words
wordcloud(names(freq),freq, max.words=40)
#...add color
wordcloud(names(freq),freq,max.words=40,colors=brewer.pal(6,"Dark2"))
#Making bigrams of words in the corpus
#to see what ngrams does, try running ngrams(words(docs[[1]]$content),2), which
#returns bigrams for the first document in the corpus
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
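#quick illustration (a sketch): the first few bigrams produced for the first cleaned document
head(BigramTokenizer(clean.docs[[1]]), 5)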
#create DTM
dtmbi <- DocumentTermMatrix(clean.docs, control = list(tokenize = BigramTokenizer))
freqbi <- colSums(as.matrix(dtmbi))
#length should be total number of terms
length(freqbi)
## [1] 34967
#create sort order (descending)
ordbi <- order(freqbi,decreasing=TRUE)
#inspect most frequently occurring terms
freqbi[head(ordbi)]
## project manag risk manag complet time doc tmmapdoc figur figur
## 138 132 105 64 61
## mont carlo
## 61
dtm_tfidf <- DocumentTermMatrix(clean.docs, control = list(weighting = weightTfIdf))
#note that the weighting is normalised by default (that is, the term frequencies in a
#document are normalised by the number of terms in the document)
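#a quick manual check (a sketch, assuming tm's normalised tf-idf: (tf/row total) * log2(ndocs/docfreq)):
m_tf <- as.matrix(dtm)
idf <- log2(nrow(m_tf)/colSums(m_tf > 0))
tfidf_manual <- (m_tf/rowSums(m_tf)) * matrix(idf, nrow=nrow(m_tf), ncol=ncol(m_tf), byrow=TRUE)
#compare a few entries of tfidf_manual with the inspect() output of dtm_tfidf below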
#summary
dtm_tfidf
## <<DocumentTermMatrix (documents: 41, terms: 4492)>>
## Non-/sparse entries: 18623/165549
## Sparsity : 90%
## Maximal term length: 114
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
#inspect segment of document term matrix
inspect(dtm_tfidf[1:10,1000:1006])
## <<DocumentTermMatrix (documents: 10, terms: 7)>>
## Non-/sparse entries: 8/62
## Sparsity : 89%
## Maximal term length: 8
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs deliber delight deliv deliver deliveri delv
## Doc01.txt 0.00000000 0 0.000000000 0.000000000 0.000000000 0
## Doc02.txt 0.00000000 0 0.000000000 0.000000000 0.000000000 0
## Doc03.txt 0.00000000 0 0.007332425 0.000000000 0.008110029 0
## Doc04.txt 0.00000000 0 0.000000000 0.000000000 0.000000000 0
## Doc05.txt 0.00000000 0 0.000000000 0.008712678 0.000000000 0
## Doc06.txt 0.00343308 0 0.003103910 0.000000000 0.000000000 0
## Doc07.txt 0.00000000 0 0.000000000 0.036015174 0.008013251 0
## Doc08.txt 0.00000000 0 0.000000000 0.000000000 0.000000000 0
## Doc09.txt 0.00000000 0 0.000000000 0.000000000 0.000000000 0
## Doc10.txt 0.00000000 0 0.000000000 0.000000000 0.000000000 0
## Terms
## Docs demand
## Doc01.txt 0.00000000
## Doc02.txt 0.01221744
## Doc03.txt 0.00000000
## Doc04.txt 0.00000000
## Doc05.txt 0.00000000
## Doc06.txt 0.00000000
## Doc07.txt 0.00000000
## Doc08.txt 0.00000000
## Doc09.txt 0.00000000
## Doc10.txt 0.00000000
#collapse matrix by summing over columns - this gets total weights (over all docs) for each term
wt_tot_tfidf <- colSums(as.matrix(dtm_tfidf))
#length should be total number of terms
length(wt_tot_tfidf )
## [1] 4492
#create sort order (descending)
ord_tfidf <- order(wt_tot_tfidf,decreasing=TRUE)
#inspect the highest weighted terms
wt_tot_tfidf[head(ord_tfidf)]
## risk distribut simul map ibi eleph
## 0.6829182 0.3715409 0.3447371 0.3124370 0.2917337 0.2783366
#write to disk and inspect file
write.csv(file="../clean_data/wt_tot_tfidf.csv",wt_tot_tfidf[ord_tfidf])
#inspect least weighted terms
wt_tot_tfidf[tail(ord_tfidf)]
## wickham
## 0.001895137
## wordcloudnamesfreqrfreqr
## 0.001895137
## wordcloudnamesfreqrfreqrminfreqcolorsbrewerpaldark″
## 0.001895137
## writelin
## 0.001895137
## youtub
## 0.001895137
## zip
## 0.001895137
#correlations - compare these with the correlations from the term-frequency dtm above
#"risk" at a correlation threshold of 0.5
findAssocs(dtm_tfidf,"risk",0.5)
## $risk
## affect methodolog threat unambigu awar guidanc
## 0.60 0.58 0.55 0.55 0.54 0.54
## impact prior sale seven attent expect
## 0.54 0.53 0.53 0.53 0.52 0.52
## handl huge analysi
## 0.52 0.51 0.50
findAssocs(dtm_tfidf,"distribution",0.8)
## $distribution
## numeric(0)
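#numeric(0) here is likely because the corpus is stemmed, so the vocabulary contains
#"distribut" rather than "distribution"; try findAssocs(dtm_tfidf,"distribut",0.8) instead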
findAssocs(dtm_tfidf,"eleph",0.8)
## $eleph
## amok clout dead deni disappear
## 1.00 1.00 1.00 1.00 1.00
## fantasi havoc inconveni invis mahout
## 1.00 1.00 1.00 1.00 1.00
## moment pachyderm patholog pray rampag
## 1.00 1.00 1.00 1.00 1.00
## unforeseeable– wreak exemplifi imposit room
## 1.00 1.00 0.99 0.99 0.99
## appar metaphor mistaken murphi reluct
## 0.98 0.98 0.98 0.98 0.98
## wake blind danger fast game
## 0.98 0.97 0.97 0.97 0.97
## foreseen magic ignor assur situat
## 0.96 0.96 0.95 0.94 0.94
## strang affair eventu wish fraught
## 0.94 0.93 0.93 0.93 0.92
## friend hope werent wil hand
## 0.92 0.92 0.92 0.92 0.90
## walk whatsoev pass anticip bad
## 0.90 0.90 0.89 0.88 0.88
## fear quit resolv occurr handl
## 0.87 0.87 0.86 0.84 0.81
findAssocs(dtm_tfidf,"wickham",0.8)
## $wickham
## …add
## 1.00
## …output
## 1.00
## addon
## 1.00
## adject
## 1.00
## aesterm
## 1.00
## aesthet
## 1.00
## alphabet
## 1.00
## anali
## 1.00
## anytim
## 1.00
## appeal
## 1.00
## architectarchitectur
## 1.00
## asc
## 1.00
## asmatrix
## 1.00
## assort
## 1.00
## augment
## 1.00
## backslash
## 1.00
## backyard
## 1.00
## bag
## 1.00
## banana
## 1.00
## bigram
## 1.00
## brewer
## 1.00
## casesensit
## 1.00
## chop
## 1.00
## cleans
## 1.00
## cleanup
## 1.00
## colon
## 1.00
## color
## 1.00
## colsumsasmatrixdtmr
## 1.00
## commandsscript
## 1.00
## compo
## 1.00
## conjunct
## 1.00
## consol
## 1.00
## contigu
## 1.00
## controllistwordlengthsc
## 1.00
## cooccur
## 1.00
## cooccurr
## 1.00
## copynpast
## 1.00
## cosmet
## 1.00
## crimin
## 1.00
## crinkl
## 1.00
## crude
## 1.00
## ctrlenter
## 1.00
## cur
## 1.00
## cuser
## 1.00
## cusersdocu
## 1.00
## cutnpast
## 1.00
## defacto
## 1.00
## descriptor
## 1.00
## dirti
## 1.00
## downsid
## 1.00
## dtm–
## 1.00
## dtmr
## 1.00
## dtms
## 1.00
## eas
## 1.00
## escap
## 1.00
## exerci
## 1.00
## expo
## 1.00
## faq
## 1.00
## feiner
## 1.00
## files…
## 1.00
## findassoc
## 1.00
## findassocsdtmrenterpri
## 1.00
## findassocsdtmrproject
## 1.00
## findassocsdtmrsystem
## 1.00
## findfreqterm
## 1.00
## findfreqtermsdtmrlowfreq
## 1.00
## flash
## 1.00
## freqheadord
## 1.00
## freqr
## 1.00
## freqrheadordr
## 1.00
## freqrtailordr
## 1.00
## freqtailord
## 1.00
## geombar
## 1.00
## geombarstatident
## 1.00
## gettransform
## 1.00
## getwdto
## 1.00
## ggplot
## 1.00
## ggplotsubsetwf
## 1.00
## grammat
## 1.00
## grappl
## 1.00
## gsub
## 1.00
## hadley
## 1.00
## heritag
## 1.00
## hjust
## 1.00
## horizont
## 1.00
## hypen
## 1.00
## hyphen
## 1.00
## ide
## 1.00
## increa
## 1.00
## index
## 1.00
## infam
## 1.00
## ingo
## 1.00
## inspectdtm
## 1.00
## intenttopurchas
## 1.00
## invok
## 1.00
## italicis
## 1.00
## laptop
## 1.00
## lemma
## 1.00
## lemmat
## 1.00
## lengthfreqr
## 1.00
## libraryggplot
## 1.00
## librarysnowballc
## 1.00
## librarywordcloud
## 1.00
## licens
## 1.00
## limb
## 1.00
## listglob
## 1.00
## log
## 1.00
## mate
## 1.00
## minfreq
## 1.00
## months…
## 1.00
## movement
## 1.00
## multiword
## 1.00
## noun
## 1.00
## onlin
## 1.00
## orderfreqrdecreasingtru
## 1.00
## ordr
## 1.00
## painless
## 1.00
## paper…
## 1.00
## permiss
## 1.00
## permit
## 1.00
## pos
## 1.00
## postcod
## 1.00
## preambl
## 1.00
## proxim
## 1.00
## rcolorbrew
## 1.00
## reinspect
## 1.00
## rollsroyc
## 1.00
## rscript
## 1.00
## rudimentari
## 1.00
## script
## 1.00
## sensit
## 1.00
## setse
## 1.00
## setwd
## 1.00
## setwdcus
## 1.00
## shown…
## 1.00
## simple…
## 1.00
## slash
## 1.00
## snowballc
## 1.00
## socal
## 1.00
## spacehyphen
## 1.00
## start…
## 1.00
## statident
## 1.00
## std
## 1.00
## stemdocu
## 1.00
## street
## 1.00
## subcategori
## 1.00
## sydney
## 1.00
## tagger
## 1.00
## tdm
## 1.00
## tdms
## 1.00
## tedious
## 1.00
## tedium
## 1.00
## termoccurr
## 1.00
## textmin
## 1.00
## themeaxistextxelementtextangl
## 1.00
## ticket
## 1.00
## tip
## 1.00
## token
## 1.00
## tolow
## 1.00
## transpos
## 1.00
## trip
## 1.00
## twoword
## 1.00
## unix
## 1.00
## unnam
## 1.00
## unzip
## 1.00
## variant
## 1.00
## vast
## 1.00
## vcorpus
## 1.00
## verb
## 1.00
## wfdataframetermnamesfreqroccurrencesfreqr
## 1.00
## wordcloud
## 1.00
## wordcloudnamesfreqrfreqr
## 1.00
## wordcloudnamesfreqrfreqrminfreqcolorsbrewerpaldark″
## 1.00
## writelin
## 1.00
## youtub
## 1.00
## zeno
## 1.00
## zip
## 1.00
## window
## 0.98
## contenttransform
## 0.97
## transform
## 0.95
## getwd
## 0.94
## freq
## 0.93
## folder
## 0.92
## stem
## 0.91
## inspect
## 0.90
## maxim
## 0.90
## nonspars
## 0.90
## sparsiti
## 0.90
## version
## 0.90
## text
## 0.88
## rstudio
## 0.87
## frequent
## 0.84
## snippet
## 0.84
## zulli
## 0.84
## behold
## 0.81
## deconstruct
## 0.81
## nlp
## 0.81
## redeploy
## 0.81
## synergi
## 0.81
## enterpri
## 0.80
## organi
## 0.80
## removepunctu
## 0.80
findAssocs(dtm_tfidf,"hubbard",0.8)
## $hubbard
## –techniqu academia acid advoc
## 0.84 0.84 0.84 0.84
## aon arbit arena bell
## 0.84 0.84 0.84 0.84
## blackschol broke calibr combinatori
## 0.84 0.84 0.84 0.84
## commonmod damag debri dictionari
## 0.84 0.84 0.84 0.84
## downright dunningkrug empiricist extrapol
## 0.84 0.84 0.84 0.84
## eyebrow fattail firstperson fissil
## 0.84 0.84 0.84 0.84
## flesh flight followup futuredirect
## 0.84 0.84 0.84 0.84
## gaussian gut hackl heavylift
## 0.84 0.84 0.84 0.84
## herring hydraul incomplet insur
## 0.84 0.84 0.84 0.84
## intang invalid languish lobbi
## 0.84 0.84 0.84 0.84
## metropoli neutron nonprobabilist oneinbillion
## 0.84 0.84 0.84 0.84
## overconfid pcbase pension penultim
## 0.84 0.84 0.84 0.84
## piecem plane plethora portfolio
## 0.84 0.84 0.84 0.84
## programportfolio protiv quant radar
## 0.84 0.84 0.84 0.84
## raw rear selfassess simultan
## 0.84 0.84 0.84 0.84
## spectacular stanislaw stock stockmarket
## 0.84 0.84 0.84 0.84
## stratif tese trait trivia
## 0.84 0.84 0.84 0.84
## turbin ulam unanticip unfold
## 0.84 0.84 0.84 0.84
## usag useless crash incorrect
## 0.84 0.84 0.83 0.83
## analyst
## 0.81
findAssocs(dtm_tfidf,"scapegoat",0.8)
## $scapegoat
## …activ
## 1.00
## alarm
## 1.00
## bake
## 1.00
## breed
## 1.00
## causat
## 1.00
## centralis
## 1.00
## chees
## 1.00
## chernobyl
## 1.00
## colloqui
## 1.00
## combat
## 1.00
## convenient…
## 1.00
## dormant
## 1.00
## drain
## 1.00
## emot
## 1.00
## equip
## 1.00
## erstwhil
## 1.00
## evidenc
## 1.00
## frontlin
## 1.00
## fund
## 1.00
## httppatientsafetyedduhsdukeedumoduleeswisscheesehtml
## 1.00
## hunt
## 1.00
## illthoughtout
## 1.00
## layer
## 1.00
## medic
## 1.00
## medicin
## 1.00
## momentari
## 1.00
## mosquito
## 1.00
## nuclear
## 1.00
## occurs…
## 1.00
## overwork
## 1.00
## pathogen
## 1.00
## recur
## 1.00
## remedi
## 1.00
## slice
## 1.00
## soviet
## 1.00
## swamp
## 1.00
## swat
## 1.00
## swiss
## 1.00
## temporarili
## 1.00
## traffic
## 1.00
## uncoupl
## 1.00
## union
## 1.00
## unsaf
## 1.00
## violat
## 1.00
## error
## 0.99
## hole
## 0.99
## defens
## 0.97
## safe
## 0.97
## laps
## 0.96
## staff
## 0.96
## standardis
## 0.96
## latent
## 0.94
## rise
## 0.94
## target
## 0.91
## caus
## 0.90
## condit
## 0.90
## mishap
## 0.89
## poor
## 0.87
## trace
## 0.87
## agent
## 0.86
## cite
## 0.86
## faulti
## 0.86
## plant
## 0.86
## under
## 0.85
## prevent
## 0.84
## unaddress
## 0.84
## learnt
## 0.83
## unrealist
## 0.83
## approach
## 0.82
## procedur
## 0.82
#notice the difference!
#histogram
wf=data.frame(term=names(wt_tot_tfidf),weights=wt_tot_tfidf)
#library(ggplot2)
ggplot(subset(wf, weights>.1), aes(reorder(term,weights), weights)) + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=45, hjust=1))
#Wordcloud of TF-IDF - to see weighted words
#wordcloud
#library(wordcloud)
#setting the same seed each time ensures consistent look across clouds
set.seed(42)
#limit the number of words shown by specifying max.words
wordcloud(names(wt_tot_tfidf),wt_tot_tfidf, max.words=100)
#...add color
wordcloud(names(wt_tot_tfidf),wt_tot_tfidf,max.words=100,colors=brewer.pal(6,"Dark2"))
#play with different values of max.words
#try specifying min.freq instead of max.words
#Create document-term matrix
cluster <- DocumentTermMatrix(clean.docs)
## start clustering specific code
#convert dtm to matrix (what format is the dtm stored in?)
cluster.matrix <- as.matrix(cluster)
#write as csv file
write.csv(cluster.matrix,file="../clean_data/ClusterAsMatrix.csv")
#shorten rownames for display purposes (optional)
#rownames(cluster.matrix) <- paste(substring(rownames(cluster.matrix),1,3), rep("..",nrow(cluster.matrix)),
#substring(rownames(cluster.matrix), nchar(rownames(cluster.matrix))-12, nchar(rownames(cluster.matrix))-4))
#compute distance between document vectors
hcluster.distance <- dist(cluster.matrix, method="euclidean")
#run hierarchical clustering using Ward's method (explore other options later)
hcluster.groups <- hclust(hcluster.distance,method="ward.D")
#plot, use hang to ensure that labels fall below tree
plot(hcluster.groups, hang=-1)
#cut into 2 subtrees. Try 3,4,5,6 cuts; comment on your results
rect.hclust(hcluster.groups,2)
hcluster.hclusters <- cutree(hcluster.groups,2)
write.csv(hcluster.hclusters,"../clean_data/hclusters.csv")
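#quick look (a sketch): number of documents in each of the two clusters
table(hcluster.hclusters)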
##try another distance measure
#cosine similarity between document vectors: (x %*% t(x)) divided element-wise by the outer product of the row norms
cosineSim <- function(x){
as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cosine <- cosineSim(cluster.matrix)
#cosineSim returns a similarity, not a distance, so convert it to a dissimilarity before clustering
cosine.distance <- as.dist(1 - as.matrix(cosine))
#run hierarchical clustering using cosine distance
cosine.groups <- hclust(cosine.distance, method="ward.D")
#plot, use hang to ensure that labels fall below tree
plot(cosine.groups, hang=-1)
#cut into 2 subtrees.
rect.hclust(cosine.groups, 2)
cosine.hclusters <- cutree(cosine.groups, 2)
write.csv(cosine.hclusters, "../clean_data/hclusters_cosine.csv")
#kmeans clustering
#kmeans - run with nstart=100 and k=2,3,5 to compare results with hclust
kfit <- kmeans(hcluster.distance, centers=3, nstart=100)
#plot - need library cluster
clusplot(as.matrix(hcluster.distance), kfit$cluster, color=T, shade=T, labels=2, lines=0)
#print contents of kfit
print(kfit)
## K-means clustering with 3 clusters of sizes 4, 12, 25
##
## Cluster means:
## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt
## 1 188.6253 192.3036 167.62497 170.8201 181.28787 188.8120 175.09968
## 2 115.5392 118.3771 111.57906 115.9203 124.96098 115.2198 112.95505
## 3 109.6623 112.5678 68.79821 67.9816 96.08796 107.8372 77.87122
## Doc08.txt Doc09.txt Doc10.txt Doc11.txt Doc12.txt Doc13.txt Doc14.txt
## 1 193.8233 206.3925 174.51699 196.9089 171.25725 169.29243 174.8946
## 2 127.6042 129.8400 117.87510 149.3241 121.89970 123.75103 135.6713
## 3 130.6559 136.5729 81.37006 136.6676 77.71224 74.68835 95.6443
## Doc15.txt Doc16.txt Doc17.txt Doc18.txt Doc19.txt Doc20.txt Doc21.txt
## 1 174.26716 170.61832 172.97231 174.9302 148.4990 149.9209 177.1190
## 2 124.44654 122.62525 129.59150 137.9235 173.5920 181.7467 143.7394
## 3 86.84732 78.28215 85.33809 100.8456 161.3057 165.5252 132.7690
## Doc22.txt Doc23.txt Doc24.txt Doc25.txt Doc26.txt Doc27.txt Doc28.txt
## 1 171.4038 169.05986 173.44495 173.05071 177.67272 175.77795 171.5950
## 2 141.7953 123.73378 124.14625 118.61670 131.41773 128.48509 118.7266
## 3 127.6210 78.78967 79.71905 80.78633 90.93077 86.37011 72.9920
## Doc29.txt Doc30.txt Doc31.txt Doc32.txt Doc33.txt Doc34.txt Doc35.txt
## 1 199.1312 190.7290 177.19832 182.91260 184.7659 174.80883 144.3683
## 2 138.0648 132.4550 128.97569 131.46912 137.3207 127.35684 206.7825
## 3 133.9439 117.9196 86.22049 97.33003 101.1966 86.52654 198.5078
## Doc36.txt Doc37.txt Doc38.txt Doc39.txt Doc40.txt Doc41.txt
## 1 164.70797 163.4464 152.3760 173.4409 133.4520 147.0604
## 2 122.61888 122.9342 126.4871 147.7589 179.7954 134.4065
## 3 76.19227 78.1793 100.6445 136.4362 165.6052 115.6609
##
## Clustering vector:
## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt
## 2 2 3 3 3 2 3
## Doc08.txt Doc09.txt Doc10.txt Doc11.txt Doc12.txt Doc13.txt Doc14.txt
## 2 2 3 2 3 3 3
## Doc15.txt Doc16.txt Doc17.txt Doc18.txt Doc19.txt Doc20.txt Doc21.txt
## 3 3 3 3 1 1 2
## Doc22.txt Doc23.txt Doc24.txt Doc25.txt Doc26.txt Doc27.txt Doc28.txt
## 2 3 3 3 3 3 3
## Doc29.txt Doc30.txt Doc31.txt Doc32.txt Doc33.txt Doc34.txt Doc35.txt
## 2 2 3 3 3 3 1
## Doc36.txt Doc37.txt Doc38.txt Doc39.txt Doc40.txt Doc41.txt
## 3 3 3 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 169865.3 367619.1 321561.2
## (between_SS / total_SS = 55.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
#print cluster sizes
kfit$size
## [1] 4 12 25
#print clusters (members)
kfit$cluster
## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt
## 2 2 3 3 3 2 3
## Doc08.txt Doc09.txt Doc10.txt Doc11.txt Doc12.txt Doc13.txt Doc14.txt
## 2 2 3 2 3 3 3
## Doc15.txt Doc16.txt Doc17.txt Doc18.txt Doc19.txt Doc20.txt Doc21.txt
## 3 3 3 3 1 1 2
## Doc22.txt Doc23.txt Doc24.txt Doc25.txt Doc26.txt Doc27.txt Doc28.txt
## 2 3 3 3 3 3 3
## Doc29.txt Doc30.txt Doc31.txt Doc32.txt Doc33.txt Doc34.txt Doc35.txt
## 2 2 3 3 3 3 1
## Doc36.txt Doc37.txt Doc38.txt Doc39.txt Doc40.txt Doc41.txt
## 3 3 3 2 1 2
#write clusters to csv file
write.csv(kfit$cluster,file="../clean_data/KMClustGroups2.csv")
#sum of squared distance between cluster centers
kfit$betweenss
## [1] 1053721
#sum of squared distances within each cluster (these are the quantities that the algorithm
#attempts to minimise)
kfit$withinss
## [1] 169865.3 367619.1 321561.2
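#sanity check (a sketch): the between_SS / total_SS ratio reported in the summary above
kfit$betweenss / kfit$totss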
#kmeans - how to determine optimal number of clusters?
#One approach: look for "elbow" in plot of summed intra-cluster distances (withinss) as fn of k
k.max <- length(clean.docs)-1
wss <- numeric(k.max)
for (i in 2:k.max) {
wss[i] <- sum(kmeans(hcluster.distance,centers=i,nstart=25)$withinss)
}
plot(2:k.max, wss[2:k.max], type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
m<-as.matrix(dtm)
#Map filenames to matrix row numbers
#these numbers will be used to reference files in the network graph
filekey <- cbind(1:length(docs),rownames(m))
write.csv(filekey,"filekey.csv",row.names = FALSE)
#have a look at file
rownames(m) <- 1:length(docs)
#compute cosine similarity between document vectors
#converting to distance matrix sets diagonal elements to 0
cosineSim <- function(x){
as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cs <- cosineSim(m)
#adjacency matrix: set entries below a certain threshold to 0.
#We choose half the magnitude of the largest element of the matrix
#as the cutoff. This is an arbitrary choice
cs[cs < max(cs)/2] <- 0
cs <- round(cs,3)
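#quick check (a sketch): number of document pairs that remain connected after thresholding
sum(as.matrix(cs) > 0)/2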
#write to disk
write.csv(as.matrix(cs),file="../clean_data/AdjacencyMatrix.csv")
#open it and have a look
# build a graph from the above matrix
#mode is undirected because similarity is a bidirectional relationship
igraph <- graph.adjacency(as.matrix(cs), weighted=T, mode = "undirected")
#Plot a Graph
# set seed to make the layout reproducible
set.seed(42)
#one of many possible layouts, see igraph docs
layout1 <- layout.fruchterman.reingold(igraph)
#basic plot with no vertex/edge weighting, using the Fruchterman-Reingold layout
plot(igraph, layout=layout1)
#another layout
plot(igraph, layout=layout.kamada.kawai)
# 19,20,21,22 all related
#24,23,18,27,31,34 all sit out on their own
#33 and 26 relate back to 25, which relates back to 9 and 29
#36 and 37 relate back to 41, which has other relations
#Community detection - Fast/Greedy
comm_fg <- fastgreedy.community(igraph)
comm_fg$membership
## [1] 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 6 2 2 2 2 7
## [24] 8 1 1 9 4 4 4 10 4 1 11 3 3 3 3 3 3 3
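#number of documents in each fast/greedy community (a sketch)
sizes(comm_fg)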
V(igraph)$color <- comm_fg$membership
plot(igraph, layout=layout.kamada.kawai)
community_mapping <- cbind(as.data.frame(filekey, row.names = F),comm_fg$membership)
community_mapping
## V1 V2 comm_fg$membership
## 1 1 Doc01.txt 4
## 2 2 Doc02.txt 4
## 3 3 Doc03.txt 4
## 4 4 Doc04.txt 4
## 5 5 Doc05.txt 4
## 6 6 Doc06.txt 4
## 7 7 Doc07.txt 4
## 8 8 Doc08.txt 4
## 9 9 Doc09.txt 4
## 10 10 Doc10.txt 4
## 11 11 Doc11.txt 5
## 12 12 Doc12.txt 5
## 13 13 Doc13.txt 5
## 14 14 Doc14.txt 5
## 15 15 Doc15.txt 5
## 16 16 Doc16.txt 5
## 17 17 Doc17.txt 5
## 18 18 Doc18.txt 6
## 19 19 Doc19.txt 2
## 20 20 Doc20.txt 2
## 21 21 Doc21.txt 2
## 22 22 Doc22.txt 2
## 23 23 Doc23.txt 7
## 24 24 Doc24.txt 8
## 25 25 Doc25.txt 1
## 26 26 Doc26.txt 1
## 27 27 Doc27.txt 9
## 28 28 Doc28.txt 4
## 29 29 Doc29.txt 4
## 30 30 Doc30.txt 4
## 31 31 Doc31.txt 10
## 32 32 Doc32.txt 4
## 33 33 Doc33.txt 1
## 34 34 Doc34.txt 11
## 35 35 Doc35.txt 3
## 36 36 Doc36.txt 3
## 37 37 Doc37.txt 3
## 38 38 Doc38.txt 3
## 39 39 Doc39.txt 3
## 40 40 Doc40.txt 3
## 41 41 Doc41.txt 3
#24, 18 and 31 are their own community
#34 is similar to 38 and the other greens
#Community detection - Louvain
comm_lv <- cluster_louvain(igraph)
comm_lv$membership
## [1] 1 1 1 1 9 1 1 1 1 1 2 2 2 2 2 2 2 3 4 4 4 4 5
## [24] 6 9 9 7 9 9 9 8 9 9 10 11 11 11 11 11 11 11
V(igraph)$color <- comm_lv$membership
plot(igraph, layout=layout.kamada.kawai)
community_mapping <- cbind(community_mapping,comm_lv$membership)
community_mapping
## V1 V2 comm_fg$membership comm_lv$membership
## 1 1 Doc01.txt 4 1
## 2 2 Doc02.txt 4 1
## 3 3 Doc03.txt 4 1
## 4 4 Doc04.txt 4 1
## 5 5 Doc05.txt 4 9
## 6 6 Doc06.txt 4 1
## 7 7 Doc07.txt 4 1
## 8 8 Doc08.txt 4 1
## 9 9 Doc09.txt 4 1
## 10 10 Doc10.txt 4 1
## 11 11 Doc11.txt 5 2
## 12 12 Doc12.txt 5 2
## 13 13 Doc13.txt 5 2
## 14 14 Doc14.txt 5 2
## 15 15 Doc15.txt 5 2
## 16 16 Doc16.txt 5 2
## 17 17 Doc17.txt 5 2
## 18 18 Doc18.txt 6 3
## 19 19 Doc19.txt 2 4
## 20 20 Doc20.txt 2 4
## 21 21 Doc21.txt 2 4
## 22 22 Doc22.txt 2 4
## 23 23 Doc23.txt 7 5
## 24 24 Doc24.txt 8 6
## 25 25 Doc25.txt 1 9
## 26 26 Doc26.txt 1 9
## 27 27 Doc27.txt 9 7
## 28 28 Doc28.txt 4 9
## 29 29 Doc29.txt 4 9
## 30 30 Doc30.txt 4 9
## 31 31 Doc31.txt 10 8
## 32 32 Doc32.txt 4 9
## 33 33 Doc33.txt 1 9
## 34 34 Doc34.txt 11 10
## 35 35 Doc35.txt 3 11
## 36 36 Doc36.txt 3 11
## 37 37 Doc37.txt 3 11
## 38 38 Doc38.txt 3 11
## 39 39 Doc39.txt 3 11
## 40 40 Doc40.txt 3 11
## 41 41 Doc41.txt 3 11
#it sees the previous yellow as the same as the orange, but sees 34 as relating to 14, 17 etc
#slightly different communities
#let's weight the nodes and edges
#set label (not really necessary)
#V=vertex, E=edge
V(igraph)$label <- V(igraph)$name
#Vertex size proportional to number of connections
V(igraph)$size <- degree(igraph)*.6
#Vertex label size proportional to number of connections
V(igraph)$label.cex <- degree(igraph) / max(degree(igraph))+ .8
#label colour default black
V(igraph)$label.color <- "black"
#Vertex color orange
V(igraph)$color <- "orange"
#edge color grey
E(igraph)$color <- "grey"
#edge width proportional to similarity (weight)
E(igraph)$width <- E(igraph)$weight*7
# plot the graph in layout1 (Fruchterman-Reingold)
plot(igraph, layout=layout1)
#output is quite ugly. Explore igraph to see how you
#can fix it
#9 is important, so is 6 and 7
#let's weight the nodes and edges
#set label (not really necessary)
#V=vertex, E=edge
V(igraph)$label <- V(igraph)$name
#Vertex size proportional to number of connections
V(igraph)$size <- degree(igraph)*.6
#Vertex label size proportional to number of connections
V(igraph)$label.cex <- degree(igraph) / max(degree(igraph))+ .6
#label colour default black
V(igraph)$label.color <- "black"
#Vertex color orange
V(igraph)$color <- "orange"
#edge color grey
E(igraph)$color <- "grey"
#edge width proportional to similarity (weight)
E(igraph)$width <- E(igraph)$weight*5
# plot the graph using an automatic layout
plot(igraph, layout=layout.auto)
#output is quite ugly. Explore igraph to see how you
#can fix it
plot(igraph, layout=layout1)
#LSA
library(lsa)
#Create term-document matrix (lsa expects a TDM rather than a DTM)
tdm <- TermDocumentMatrix(clean.docs)
#summary
tdm
## <<TermDocumentMatrix (terms: 4492, documents: 41)>>
## Non-/sparse entries: 18623/165549
## Sparsity : 90%
## Maximal term length: 114
## Weighting : term frequency (tf)
#inspect segment of document term matrix
inspect(tdm[1000:1006,1:10])
## <<TermDocumentMatrix (terms: 7, documents: 10)>>
## Non-/sparse entries: 8/62
## Sparsity : 89%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt
## deliber 0 0 0 0 0 1
## delight 0 0 0 0 0 0
## deliv 0 0 1 0 0 1
## deliver 0 0 0 0 2 0
## deliveri 0 0 1 0 0 0
## delv 0 0 0 0 0 0
## demand 0 3 0 0 0 0
## Docs
## Terms Doc07.txt Doc08.txt Doc09.txt Doc10.txt
## deliber 0 0 0 0
## delight 0 0 0 0
## deliv 0 0 0 0
## deliver 4 0 0 0
## deliveri 1 0 0 0
## delv 0 0 0 0
## demand 0 0 0 0
#what kind of object is the tdm?
class(tdm)
## [1] "TermDocumentMatrix" "simple_triplet_matrix"
#note: a simple triplet matrix (STM) is an efficient format to store
#sparse matrices
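#a simple_triplet_matrix stores only the non-zero entries as three parallel vectors
#(a quick look, assuming the slam representation that tm uses):
str(tdm$i); str(tdm$j); str(tdm$v)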
#need to convert the STM to a regular matrix
tdm.matrix <- as.matrix(tdm)
#check class
class(tdm.matrix)
## [1] "matrix"
dim(tdm.matrix)
## [1] 4492 41
#weight terms and docs
#Note: We weight the TDM before calculating the latent semantic space.
#This is to better reflect the relative importance of each term/doc
#in relation to the entire corpus (much like tf-idf weighting).
#It is convenient to express the transformation as a product of
#two numbers - local and global weight functions, like so:
#a (i,j) = L(i,j)*G(i).
#The local weight function L(i,j) represents the weight of term i
#in document j. The global weight function G(i) expresses
#the weight of term i across the entire document set.
#We'll use the equivalent of tf-idf (local weight: tf, global weight: idf)
#check out other weighting schemes in the lsa package documentation
tdm.matrix.lsa <- lw_tf(tdm.matrix) * gw_idf(tdm.matrix)
dim(tdm.matrix.lsa)
## [1] 4492 41
#compute the Latent semantic space
lsaSpace <- lsa(tdm.matrix.lsa, dimcalc_share()) # create LSA space
#examine output
names(lsaSpace)
## [1] "tk" "dk" "sk"
#The original matrix is decomposed as:
#tk (nterms x lsadim) %*% diag(sk) (lsadim x lsadim) %*% t(dk) (lsadim x ndocs)
#where
#nterms = number of terms in the TDM
#ndocs = number of docs in the TDM
#lsadim = dimensionality of the latent semantic space (length of sk)
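#sanity check (a sketch): reconstruct the weighted TDM from the three factors -
#this is what as.textmatrix() below does
recon <- lsaSpace$tk %*% diag(lsaSpace$sk) %*% t(lsaSpace$dk)
dim(recon) #should equal dim(tdm.matrix.lsa)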
LSAMat <- as.textmatrix(lsaSpace)
#1)
#Examine a term in LS space
LSAMat["social",1:10]
## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt
## 11.198411 7.246920 4.012227 2.235230 10.588707 14.475969 6.947769
## Doc08.txt Doc09.txt Doc10.txt
## 22.950848 17.895179 7.801364
#compare to the same term's vector in the weighted (pre-LSA) term-document space
tdm.matrix.lsa["social",1:10]
## Doc01.txt Doc02.txt Doc03.txt Doc04.txt Doc05.txt Doc06.txt Doc07.txt
## 3.187627 6.375254 0.000000 0.000000 0.000000 92.441183 0.000000
## Doc08.txt Doc09.txt Doc10.txt
## 0.000000 0.000000 0.000000
#try other words
#What does this tell you about the term vectors in the two spaces?