Sharon Cabrera & Marc Ribas

29/02/16

source("supportFunctions.R")
whichTextF(c(39417511,48168000))#txt-->um826k
## [1] "um826K.txt"
getMySeed(c(39417511,48168000))#seed-->25111, used in set.seed()
## [1] 25111
set.seed(25111)
whichCharacterOption(c(39417511,48168000))

We keep the names of all the packages we need in a variable

Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust", 
            "cluster", "igraph", "fpc")

Read the text file

# The warning below appears because apostrophes in the text are treated as
# quote characters; adding quote = "" to the call would suppress it
mytext <- read.table("um826K.txt",stringsAsFactors = FALSE,sep="\n")
## Warning in scan(file, what, nmax, sep, dec, quote, skip, nlines,
## na.strings, : EOF within quoted string

Read the text line by line

  1. Build and show a table with the characters you analyse (account for absolute and relative frequencies)

Account for the character frequency
# Note: pasting the data frame directly deparses it, which prepends a spurious
# 'c("' to the string; paste(mytext$V1, collapse = ' ') would avoid this
fullt<-paste(mytext, collapse = '')
str(fullt)
##  chr "c(\"It wasnt all romantic. I didnt have a dorm room, so I slept on the floor in friends rooms, I returned Coke bottles for the "| __truncated__
x<-strsplit(fullt, "")
table(x)
## x
##    '    -    —             \n    "    $    (    )    ,    .    :    ;    ? 
##   16    1    9 1828    5    5   11    1    1    1   94  117    7    1    2 
##   \\    ¢    0    1    2    3    4    5    6    7    9    a    A    b    B 
##   20    1   13    7    2    6    1    2    1    4    2  621   28   98   10 
##    c    C    d    D    e    E    f    F    g    G    h    H    i    I    j 
##  182    4  316    9  869    4  169    3  159    2  358    6  438   94    8 
##    k    K    l    L    m    M    n    N    o    O    p    P    q    r    R 
##   56    1  321    5  166    9  510    8  624    2  145    5    2  407    4 
##    s    S    t    T    u    v    w    W    x    X    y    Y    z 
##  404   15  732   16  235  100  201   11    9    3  211    5    4
ldifc <- length(table(x))
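The table above gives absolute counts; the exercise also asks for relative frequencies, which can be derived from the same table (a sketch):

charFreq <- table(x)
relFreq <- prop.table(charFreq) # proportions summing to 1
head(sort(round(relFreq, 4), decreasing = TRUE))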

Account for words

wordList<-strsplit(fullt, "\\W+") # Identify words by splitting on non-word characters
# The regular expression \\W matches any non-word character; + means one or more in a row
str(wordList)
## List of 1
##  $ : chr [1:1863] "c" "It" "wasnt" "all" ...
wordVect<-unlist(wordList) # convert the list to a vector
wordFreq <- table(wordVect)

wordVectDecr<-sort(wordFreq, decreasing=TRUE)
head(wordVectDecr)
## wordVect
## the   I  to and was   a 
##  76  67  60  42  40  37

Write the result out as a table

wordVectDecrTabl<-paste(names(wordVectDecr), wordVectDecr, sep="\t")
cat("Word\tFREQ", wordVectDecrTabl[1:20], sep="\n")
## Word FREQ
## the  76
## I    67
## to   60
## and  42
## was  40
## a    37
## of   34
## it   31
## in   29
## that 28
## is   27
## you  27
## had  18
## with 17
## And  16
## have 16
## It   16
## my   16
## for  14
## me   14

Compute the length of the words

wordLength<-nchar(names(wordVectDecr))
# Number of words: count the tokens produced by splitting on non-word characters
vapply(strsplit(fullt, "\\W+"), length, integer(1))
## [1] 1863
# Alternatively, count the runs of non-word separators; the totals happen to match here
sapply(gregexpr("\\W+", fullt), length)
## [1] 1863

WordCloud

library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggplot2)

# as.numeric() keeps freq as a plain numeric column; passing the table object
# directly can expand it into two columns in some R versions
dfw<-data.frame(word=names(wordVectDecr),freq=as.numeric(wordVectDecr))
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T)

#Or add Color
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,
          rot.per=0.35,colors=brewer.pal(8, "Dark2"))

pal2 <- brewer.pal(11,"Spectral")# brewer.pal(9,"Set1")
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,colors=pal2)

Bar Diagram

barplot(dfw[1:10,]$freq, las = 2, names.arg = dfw[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies") 

# or        
dfw1<-dfw[1:10,]
ggplot(data=dfw1,aes(y=freq,x=word)) + geom_bar(stat="identity") + coord_flip()
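ggplot2 orders the bars alphabetically by default; reordering the factor by frequency keeps them sorted (a sketch):

ggplot(data=dfw1,aes(y=freq,x=reorder(word,freq))) + geom_bar(stat="identity") + coord_flip()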

TEXTFILE <- "um826K.txt"
  1. How many characters does your initial file have?

Three ways to count the characters

length(x[[1]]) # number of single characters produced by strsplit()
## [1] 9747
nchar(mytext)
##   V1 
## 9747
nchar(fullt)
## [1] 9747
  1. How many characters have you removed?
#install.packages("NLP")
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(tm)

um = readLines(TEXTFILE)

doc.vec <- VectorSource(um)
doc.corpus <- Corpus(doc.vec)
summary(doc.corpus)
##    Length Class             Mode
## 1  2      PlainTextDocument list
## 2  2      PlainTextDocument list
## 3  2      PlainTextDocument list
## 4  2      PlainTextDocument list
## 5  2      PlainTextDocument list
## 6  2      PlainTextDocument list
## 7  2      PlainTextDocument list
## 8  2      PlainTextDocument list
## 9  2      PlainTextDocument list
## 10 2      PlainTextDocument list
## 11 2      PlainTextDocument list
## 12 2      PlainTextDocument list
## 13 2      PlainTextDocument list
## 14 2      PlainTextDocument list
## 15 2      PlainTextDocument list
## 16 2      PlainTextDocument list
## 17 2      PlainTextDocument list
## 18 2      PlainTextDocument list
## 19 2      PlainTextDocument list
## 20 2      PlainTextDocument list
## 21 2      PlainTextDocument list
## 22 2      PlainTextDocument list
## 23 2      PlainTextDocument list
## 24 2      PlainTextDocument list
## 25 2      PlainTextDocument list
## 26 2      PlainTextDocument list
## 27 2      PlainTextDocument list
## 28 2      PlainTextDocument list
# Note: in newer versions of tm, base functions such as tolower must be wrapped
# in content_transformer(), e.g. tm_map(doc.corpus, content_transformer(tolower))
doc.corpus <- tm_map(doc.corpus, tolower)
doc.corpus <- tm_map(doc.corpus, removePunctuation)
doc.corpus <- tm_map(doc.corpus, removeNumbers)
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
doc.corpus <- tm_map(doc.corpus, PlainTextDocument)

library(SnowballC)

doc.corpus <- tm_map(doc.corpus, stemDocument)
#now we remove whitespaces

doc.corpus <- tm_map(doc.corpus, stripWhitespace)
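To answer the question above, we can compare the character counts before and after cleaning (a sketch; how the content is extracted may vary between tm versions):

cleaned <- unlist(lapply(doc.corpus, as.character))
nchar(fullt) - sum(nchar(cleaned)) # characters removed by the cleaning steps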
#inspect(doc.corpus[8])

TDM<-TermDocumentMatrix(doc.corpus)
TDM
## <<TermDocumentMatrix (terms: 526, documents: 28)>>
## Non-/sparse entries: 858/13870
## Sparsity           : 94%
## Maximal term length: 12
## Weighting          : term frequency (tf)
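From this matrix we can, for instance, list the stems that appear at least ten times (a sketch using tm's findFreqTerms):

findFreqTerms(TDM, lowfreq = 10)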
  1. What are the conclusions of your analysis

To sum up, we can load the file and extract all the information we need by reading it line by line. We need the libraries loaded above to draw the diagrams and to apply the text-processing functions.

Second part

  1. How many lines are in the document?
length(um)#---->28
## [1] 28
  1. How many words are in the document?
length(wordVect)#-->1863
## [1] 1863
  1. How many words per line?
print(sapply(strsplit(readLines(TEXTFILE), " "), length)) # tokens per line
##  [1]  58   8  34 156 148  84  96 143  59   8 172 113  58  89 170   6  96
## [18]  91 137  93  48 102  67  97 108   4   5   0
  1. How many words per paragraph?
# Careful: an unescaped "." in strsplit() is a regular expression matching ANY
# character, so the numbers below are character counts per line, not word
# counts; splitting on a literal period needs "\\." or fixed = TRUE
print(sapply(strsplit(readLines(TEXTFILE), "."), length))
##  [1] 310  45 164 835 746 422 600 800 325  39 836 555 310 512 842  30 460
## [18] 504 679 487 244 494 388 554 580  26  24   0
  1. What can you tell about word length?

With nchar() applied to the distinct words (the wordLength vector computed above) we obtain the length of every word, so we can summarise how word lengths are distributed across the text.
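For instance, a quick sketch of such a summary:

summary(wordLength)
hist(wordLength, main="Word length distribution", xlab="Characters per word")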

Text mining

  1. What are the conclusions of your analysis

Our conclusion is that a bar plot of the frequencies lets us appreciate how often each word appears in the text. The wordcloud library then offers a nicer way to see the most frequent words at a glance.