Sharon Cabrera & Marc Ribas
29/02/16
source("supportFunctions.R")
whichTextF(c(39417511,48168000))#txt-->um826k
## [1] "um826K.txt"
getMySeed(c(39417511,48168000))#sed-->25111 set.seed()
## [1] 25111
set.seed(25111)
whichCharacterOption(c(39417511,48168000))
We store all the required packages in a variable
Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust",
"cluster", "igraph", "fpc")
Read the text file
mytext <- read.table("um826K.txt",stringsAsFactors = FALSE,sep="\n")
## Warning in scan(file, what, nmax, sep, dec, quote, skip, nlines,
## na.strings, : EOF within quoted string
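The warning appears because read.table treats quote characters inside the text as string delimiters. A hedged alternative sketch (mytextNoQuote is our own name and is not used below):
# quote = "" disables quote handling, which avoids the "EOF within quoted string" warning
mytextNoQuote <- read.table("um826K.txt", stringsAsFactors = FALSE, sep = "\n", quote = "")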
Collapse the lines into a single string
fullt<-paste(mytext, collapse = '')
str(fullt)
## chr "c(\"It wasnt all romantic. I didnt have a dorm room, so I slept on the floor in friends rooms, I returned Coke bottles for the "| __truncated__
x<-strsplit(fullt, "")
table(x)
## x
## ' - —  \n " $ ( ) , . : ; ?
## 16 1 9 1828 5 5 11 1 1 1 94 117 7 1 2
## \\ ¢ 0 1 2 3 4 5 6 7 9 a A b B
## 20 1 13 7 2 6 1 2 1 4 2 621 28 98 10
## c C d D e E f F g G h H i I j
## 182 4 316 9 869 4 169 3 159 2 358 6 438 94 8
## k K l L m M n N o O p P q r R
## 56 1 321 5 166 9 510 8 624 2 145 5 2 407 4
## s S t T u v w W x X y Y z
## 404 15 732 16 235 100 201 11 9 3 211 5 4
ldifc <- length(table(x)) # number of distinct characters
Count the words
wordList<-strsplit(fullt, "\\W+") # split the text into words at non-word characters
# The regular expression \\W matches non-word characters; + means one or more in a row
str(wordList)
## List of 1
## $ : chr [1:1863] "c" "It" "wasnt" "all" ...
wordVect<-unlist(wordList) # convert the list to a vector
wordFreq <- table(wordVect)
wordVectDecr<-sort(wordFreq, decreasing=TRUE)
head(wordVectDecr)
## wordVect
## the I to and was a
## 76 67 60 42 40 37
Write the result out as a table
wordVectDecrTabl<-paste(names(wordVectDecr), wordVectDecr, sep="\t")
cat("Word\tFREQ", wordVectDecrTabl[1:20], sep="\n")
## Word FREQ
## the 76
## I 67
## to 60
## and 42
## was 40
## a 37
## of 34
## it 31
## in 29
## that 28
## is 27
## you 27
## had 18
## with 17
## And 16
## have 16
## It 16
## my 16
## for 14
## me 14
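The table counts capitalised and lower-case forms separately (e.g. And/and and It/it above). A hedged sketch that folds case before counting (wordFreqLower is our own name):
# Convert to lower case so "And"/"and" and "It"/"it" are merged before counting
wordFreqLower <- sort(table(tolower(wordVect)), decreasing = TRUE)
head(wordFreqLower)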
Compute the length of each word
wordLength<-nchar(names(wordVectDecr))
# Number of words
vapply(strsplit(fullt, "\\W+"), length, integer(1))
## [1] 1863
sapply(gregexpr("\\W+", fullt), length)
## [1] 1863
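The word lengths computed above can also be summarised; a hedged sketch, not part of the original output:
# Distribution of lengths over the distinct word forms
summary(wordLength)
barplot(table(wordLength), main = "Word length distribution",
        xlab = "Characters per word", ylab = "Number of distinct words")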
WordCloud
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggplot2)
dfw<-data.frame(word=names(wordVectDecr),freq=as.numeric(wordVectDecr)) # as.numeric keeps freq as a plain numeric column
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T)
#Or add Color
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,
rot.per=0.35,colors=brewer.pal(8, "Dark2"))
pal2 <- brewer.pal(11,"Spectral")# brewer.pal(9,"Set1")
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,colors=pal2)
Bar Diagram
barplot(dfw[1:10,]$freq, las = 2, names.arg = dfw[1:10,]$word,
col ="lightblue", main ="Most frequent words",
ylab = "Word frequencies")
# or
dfw1<-dfw[1:10,]
ggplot(data=dfw1,aes(y=freq,x=word)) + geom_bar(stat="identity") + coord_flip()
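By default ggplot orders the bars alphabetically; a hedged sketch that orders them by frequency instead:
# reorder() sorts the words by their frequency before plotting
ggplot(data=dfw1,aes(y=freq,x=reorder(word,freq))) + geom_bar(stat="identity") + coord_flip() + xlab("word")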
TEXTFILE = "um826K.txt"
Three ways to count the characters
length(x[[1]]) # number of single-character strings produced by strsplit
## [1] 9747
nchar(mytext)
## V1
## 9747
nchar(fullt)
## [1] 9747
#install.packages("NLP")
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm)
um = readLines(TEXTFILE)
doc.vec <- VectorSource(um)
doc.corpus <- Corpus(doc.vec)
summary(doc.corpus)
## Length Class Mode
## 1 2 PlainTextDocument list
## 2 2 PlainTextDocument list
## 3 2 PlainTextDocument list
## 4 2 PlainTextDocument list
## 5 2 PlainTextDocument list
## 6 2 PlainTextDocument list
## 7 2 PlainTextDocument list
## 8 2 PlainTextDocument list
## 9 2 PlainTextDocument list
## 10 2 PlainTextDocument list
## 11 2 PlainTextDocument list
## 12 2 PlainTextDocument list
## 13 2 PlainTextDocument list
## 14 2 PlainTextDocument list
## 15 2 PlainTextDocument list
## 16 2 PlainTextDocument list
## 17 2 PlainTextDocument list
## 18 2 PlainTextDocument list
## 19 2 PlainTextDocument list
## 20 2 PlainTextDocument list
## 21 2 PlainTextDocument list
## 22 2 PlainTextDocument list
## 23 2 PlainTextDocument list
## 24 2 PlainTextDocument list
## 25 2 PlainTextDocument list
## 26 2 PlainTextDocument list
## 27 2 PlainTextDocument list
## 28 2 PlainTextDocument list
doc.corpus <- tm_map(doc.corpus, tolower)                            # convert to lower case
doc.corpus <- tm_map(doc.corpus, removePunctuation)                  # remove punctuation
doc.corpus <- tm_map(doc.corpus, removeNumbers)                      # remove numbers
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))  # remove English stopwords
doc.corpus <- tm_map(doc.corpus, PlainTextDocument)                  # back to plain text documents
library(SnowballC)
doc.corpus <- tm_map(doc.corpus, stemDocument)                       # stem the words
#now we remove whitespaces
doc.corpus <- tm_map(doc.corpus, stripWhitespace)
#inspect(doc.corpus[8])
TDM<-TermDocumentMatrix(doc.corpus)
TDM
## <<TermDocumentMatrix (terms: 526, documents: 28)>>
## Non-/sparse entries: 858/13870
## Sparsity : 94%
## Maximal term length: 12
## Weighting : term frequency (tf)
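A hedged sketch of how the most frequent stemmed terms can be read off the term-document matrix (termFreq is our own name):
# Row sums give the total frequency of each term across the 28 documents
termFreq <- sort(rowSums(as.matrix(TDM)), decreasing = TRUE)
head(termFreq, 10)
# tm also provides a helper that returns terms above a frequency threshold
findFreqTerms(TDM, lowfreq = 10)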
To sum up, we can load the file and extract all the information we need by reading it line by line. We need the libraries to make the diagrams and to use their functions.
Second part
length(um)#---->28
## [1] 28
length(wordVect)#-->1863
## [1] 1863
print(sapply(strsplit(readLines(TEXTFILE), " "), length)) # words per line (split on single spaces)
## [1] 58 8 34 156 148 84 96 143 59 8 172 113 58 89 170 6 96
## [18] 91 137 93 48 102 67 97 108 4 5 0
print(sapply(strsplit(readLines(TEXTFILE), "."), length)) # "." is a regex matching any character, so this effectively counts characters per line
## [1] 310 45 164 835 746 422 600 800 325 39 836 555 310 512 842 30 460
## [18] 504 679 487 244 494 388 554 580 26 24 0
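Because "." above is interpreted as a regular expression, a hedged sketch of splitting on literal periods instead (closer to counting sentence-like pieces per line):
# fixed = TRUE treats "." as a literal character rather than a regex
sapply(strsplit(readLines(TEXTFILE), ".", fixed = TRUE), length)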
With length(), we can obtain the number of elements of a vector, and hence the number of words or lines in a text.
Text mining
Our conclusion from the analysis is that by plotting the word frequencies we can appreciate how often each word appears in the text. We also have the option of using the wordcloud library, a nicer way to visualise the most frequent words.