source("supportFunctions.R")
whichTextF(c(39417511,48168000))#txt-->um826k
## [1] "um826K.txt"
getMySeed(c(39417511,48168000))#sed-->25111 set.seed()
## [1] 25111
set.seed(25111)
whichCharacterOption(c(39417511,48168000))
#MarcRibas and Sharon Cabrera
# Keep all the required packages in one variable
Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust",
            "cluster", "igraph", "fpc")
# Read the text file (one line per row). The apostrophes/quotation marks in
# the text trigger the "EOF within quoted string" warning below; passing
# quote = "" to read.table() would avoid it.
mytext <- read.table("um826K.txt",stringsAsFactors = FALSE,sep="\n")
## Warning in scan(file, what, nmax, sep, dec, quote, skip, nlines,
## na.strings, : EOF within quoted string
### Read the text line by line
#1. Build and show a table with the characters you analyse (account for absolute and relative frequencies)
### Account for the Character Frequency
# Caveat: paste() on the whole data frame deparses it, which is why the string
# below starts with a literal 'c("' and the first "word" found later is "c";
# paste(mytext$V1, collapse = " ") would avoid that artifact.
fullt<-paste(mytext, collapse = '')
str(fullt)
## chr "c(\"It wasnt all romantic. I didnt have a dorm room, so I slept on the floor in friends rooms, I returned Coke bottles for the "| __truncated__
x<-strsplit(fullt, "")
table(x)
## x
## ' - <U+0097> Â \n " $ ( ) , . : ; ?
## 16 1 9 1828 5 5 11 1 1 1 94 117 7 1 2
## \\ ¢ 0 1 2 3 4 5 6 7 9 a A b B
## 20 1 13 7 2 6 1 2 1 4 2 621 28 98 10
## c C d D e E f F g G h H i I j
## 182 4 316 9 869 4 169 3 159 2 358 6 438 94 8
## k K l L m M n N o O p P q r R
## 56 1 321 5 166 9 510 8 624 2 145 5 2 407 4
## s S t T u v w W x X y Y z
## 404 15 732 16 235 100 201 11 9 3 211 5 4
ldifc <- length(table(x)) # number of distinct characters
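# Question 1 also asks for relative frequencies, which are never computed
# above; a sketch using prop.table() to turn the absolute counts into
# proportions (unlist() because x is a one-element list):
charFreq <- table(unlist(x))
round(prop.table(charFreq), 4)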
### Account for words
wordList<-strsplit(fullt, "\\W+") # split the text into words
# The regular expression \\W matches any non-word character; the + quantifier
# matches one or more in a row, so runs of spaces and punctuation count as a
# single gap between words
str(wordList)
## List of 1
## $ : chr [1:1863] "c" "It" "wasnt" "all" ...
wordVect<-unlist(wordList) # convert the list to a vector
wordFreq <- table(wordVect)
wordVectDecr<-sort(wordFreq, decreasing=TRUE)
head(wordVectDecr)
## wordVect
## the I to and was a
## 76 67 60 42 40 37
#### Print the result as a two-column table
wordVectDecrTabl<-paste(names(wordVectDecr), wordVectDecr, sep="\t")
cat("Word\tFREQ", wordVectDecrTabl[1:20], sep="\n")
## Word FREQ
## the 76
## I 67
## to 60
## and 42
## was 40
## a 37
## of 34
## it 31
## in 29
## that 28
## is 27
## you 27
## had 18
## with 17
## And 16
## have 16
## It 16
## my 16
## for 14
## me 14
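# Similarly, the top words as relative frequencies, i.e. as a share of all
# 1863 tokens (a sketch, not in the original):
round(head(wordVectDecr, 20) / length(wordVect), 4)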
#### Compute the length of each distinct word
wordLength <- nchar(names(wordVectDecr))
# Number of words (two equivalent counts)
vapply(strsplit(fullt, "\\W+"), length, integer(1))
## [1] 1863
sapply(gregexpr("\\W+", fullt), length) # counts the non-word gaps, one per word here
## [1] 1863
### WordCloud
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggplot2)
# as.numeric() keeps freq a plain numeric column; in recent R, passing the
# table straight to data.frame() would expand it into two columns.
dfw<-data.frame(word=names(wordVectDecr),freq=as.numeric(wordVectDecr))
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T)

# Or add color
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,
          rot.per=0.35,colors=brewer.pal(8, "Dark2"))

pal2 <- brewer.pal(11,"Spectral")# brewer.pal(9,"Set1")
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,colors=pal2)

### Bar Diagram
barplot(dfw[1:10,]$freq, las = 2, names.arg = dfw[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies")

# or
dfw1<-dfw[1:10,]
ggplot(data=dfw1,aes(y=freq,x=word)) + geom_bar(stat="identity") + coord_flip()
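# The ggplot bars come out in alphabetical order; reorder() sorts them by
# frequency instead (a sketch, not in the original):
ggplot(data=dfw1, aes(y=freq, x=reorder(word, freq))) +
  geom_bar(stat="identity") + coord_flip() + xlab("word")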

TEXTFILE <- "um826K.txt"
#FIRST PART
#3. How many characters does your initial file have?
# Three ways to count the characters
length(x[[1]]) # length of the per-character split; the original's
               # length.POSIXlt(x) computed the same thing by calling a
               # POSIXlt method directly on a plain list
## [1] 9747
nchar(mytext) # worked here by coercing the data frame via as.character()
## V1
## 9747
nchar(fullt)
## [1] 9747
#4. How many characters have you removed?
#install.packages("NLP")
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm)
um = readLines(TEXTFILE)
doc.vec <- VectorSource(um)
doc.corpus <- Corpus(doc.vec)
summary(doc.corpus)
## Length Class Mode
## 1 2 PlainTextDocument list
## 2 2 PlainTextDocument list
## 3 2 PlainTextDocument list
## 4 2 PlainTextDocument list
## 5 2 PlainTextDocument list
## 6 2 PlainTextDocument list
## 7 2 PlainTextDocument list
## 8 2 PlainTextDocument list
## 9 2 PlainTextDocument list
## 10 2 PlainTextDocument list
## 11 2 PlainTextDocument list
## 12 2 PlainTextDocument list
## 13 2 PlainTextDocument list
## 14 2 PlainTextDocument list
## 15 2 PlainTextDocument list
## 16 2 PlainTextDocument list
## 17 2 PlainTextDocument list
## 18 2 PlainTextDocument list
## 19 2 PlainTextDocument list
## 20 2 PlainTextDocument list
## 21 2 PlainTextDocument list
## 22 2 PlainTextDocument list
## 23 2 PlainTextDocument list
## 24 2 PlainTextDocument list
## 25 2 PlainTextDocument list
## 26 2 PlainTextDocument list
## 27 2 PlainTextDocument list
## 28 2 PlainTextDocument list
# Note: recent versions of tm require base functions such as tolower to be
# wrapped, i.e. tm_map(doc.corpus, content_transformer(tolower))
doc.corpus <- tm_map(doc.corpus, tolower)
doc.corpus <- tm_map(doc.corpus, removePunctuation)
doc.corpus <- tm_map(doc.corpus, removeNumbers)
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
doc.corpus <- tm_map(doc.corpus, PlainTextDocument) # restore PlainTextDocument objects
library(SnowballC)
doc.corpus <- tm_map(doc.corpus, stemDocument)
# Now collapse runs of whitespace into single spaces
doc.corpus <- tm_map(doc.corpus, stripWhitespace)
#inspect(doc.corpus[8])
TDM<-TermDocumentMatrix(doc.corpus)
TDM
## <<TermDocumentMatrix (terms: 526, documents: 28)>>
## Non-/sparse entries: 858/13870
## Sparsity : 94%
## Maximal term length: 12
## Weighting : term frequency (tf)
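# The term-document matrix is built but never inspected; findFreqTerms() lists
# the stems that occur at least lowfreq times:
findFreqTerms(TDM, lowfreq = 10)
# Question 4 ("how many characters have you removed?") is also never answered
# above. A minimal sketch, assuming content() returns each cleaned document's
# text: count the characters that survive cleaning and subtract from the
# original total.
cleaned <- vapply(seq_along(doc.corpus),
                  function(i) paste(content(doc.corpus[[i]]), collapse = " "),
                  character(1))
nchar(fullt) - sum(nchar(cleaned))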
#5. What are the conclusions of your analysis?
# To sum up, we can load the file and extract all the information we need by
# reading it line by line. The libraries loaded above are needed to draw the
# diagrams and to use their text-processing functions.
#Second part
#1. How many lines are in the document?
length(um)#---->28
## [1] 28
#2. How many words are in the document?
length(wordVect)#-->1863
## [1] 1863
#3. How many words per line?
# The original wrapped this call in a for loop over the lines, which only
# printed the same 28-element vector once per line; a single call suffices.
sapply(strsplit(readLines(TEXTFILE), " "), length)
## [1] 58 8 34 156 148 84 96 143 59 8 172 113 58 89 170 6 96
## [18] 91 137 93 48 102 67 97 108 4 5 0
#4. How many words per paragraph?
# As above, the redundant for loop has been dropped. Caveat: strsplit() treats
# "." as a regex wildcard that matches every character, so the numbers below
# are (roughly) character counts per line, not words per paragraph; split on
# "\\." (or use fixed = TRUE) for literal periods. Each line is also treated
# as a "paragraph" here; see the blank-line sketch after the output.
sapply(strsplit(readLines(TEXTFILE), "."), length)
## [1] 310 45 164 835 746 422 600 800 325 39 836 555 310 512 842 30 460
## [18] 504 679 487 244 494 388 554 580 26 24 0
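# A sketch for genuine paragraphs, assuming blank lines separate them in the
# file: collapse each paragraph's lines, then count words per paragraph.
allLines <- readLines(TEXTFILE)
paraId <- cumsum(allLines == "") # each blank line starts a new paragraph
paras <- tapply(allLines, paraId, paste, collapse = " ")
sapply(strsplit(paras, "\\W+"), length)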
#5. What can you tell about word length?
# nchar() gives each word's length; wordLength (computed above) holds the
# length of every distinct word, so we can summarise its distribution:
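# A short sketch (not in the original):
table(wordLength) # how many distinct words have each length
weighted.mean(wordLength, as.numeric(wordVectDecr)) # mean length weighted by frequency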
### Text mining
#5. What are the conclusions of your analysis?
# Our conclusion is that a bar plot of word frequencies lets us appreciate how
# often each word appears in the text. The wordcloud package then offers a
# nicer way to display the most frequent words.