Sharon Cabrera & Marc Ribas

29/02/16

source("supportFunctions.R")
whichTextF(c(39417511,48168000))#txt-->um826k
## [1] "um826K.txt"
getMySeed(c(39417511,48168000))#seed-->25111, used in set.seed()
## [1] 25111
set.seed(25111)
whichCharacterOption(c(39417511,48168000))

We keep the names of all the packages we need in a variable

Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust", 
            "cluster", "igraph", "fpc")

Read the text file

# The warning below appears because apostrophes in the text are treated as
# quote characters; adding quote = "" to the call would suppress it
mytext <- read.table("um826K.txt",stringsAsFactors = FALSE,sep="\n")
## Warning in scan(file, what, nmax, sep, dec, quote, skip, nlines,
## na.strings, : EOF within quoted string

Read the text line by line

  1. Build and show a table with the characters you analyse (account for absolute and relative frequencies)

Account for the character frequency
# Note: pasting the data frame directly deparses it, which prepends a spurious
# 'c("' to the string; paste(mytext$V1, collapse = ' ') would avoid this
fullt<-paste(mytext, collapse = '')
str(fullt)
##  chr "c(\"It wasnt all romantic. I didnt have a dorm room, so I slept on the floor in friends rooms, I returned Coke bottles for the "| __truncated__
x<-strsplit(fullt, "")
table(x)
## x
##    '    -    —             \n    "    $    (    )    ,    .    :    ;    ? 
##   16    1    9 1828    5    5   11    1    1    1   94  117    7    1    2 
##   \\    ¢    0    1    2    3    4    5    6    7    9    a    A    b    B 
##   20    1   13    7    2    6    1    2    1    4    2  621   28   98   10 
##    c    C    d    D    e    E    f    F    g    G    h    H    i    I    j 
##  182    4  316    9  869    4  169    3  159    2  358    6  438   94    8 
##    k    K    l    L    m    M    n    N    o    O    p    P    q    r    R 
##   56    1  321    5  166    9  510    8  624    2  145    5    2  407    4 
##    s    S    t    T    u    v    w    W    x    X    y    Y    z 
##  404   15  732   16  235  100  201   11    9    3  211    5    4
ldifc <- length(table(x))
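The table above gives absolute counts; the exercise also asks for relative frequencies, which can be derived from the same table (a sketch):

charFreq <- table(x)
relFreq <- prop.table(charFreq) # proportions summing to 1
head(sort(round(relFreq, 4), decreasing = TRUE))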

Account for words

wordList<-strsplit(fullt, "\\W+") # Identify words by splitting on non-word characters
# The regular expression \\W matches any non-word character; + means one or more in a row
str(wordList)
## List of 1
##  $ : chr [1:1863] "c" "It" "wasnt" "all" ...
wordVect<-unlist(wordList) # convert the list to a vector
wordFreq <- table(wordVect)

wordVectDecr<-sort(wordFreq, decreasing=TRUE)
head(wordVectDecr)
## wordVect
## the   I  to and was   a 
##  76  67  60  42  40  37

Write the result out as a table

wordVectDecrTabl<-paste(names(wordVectDecr), wordVectDecr, sep="\t")
cat("Word\tFREQ", wordVectDecrTabl[1:20], sep="\n")
## Word FREQ
## the  76
## I    67
## to   60
## and  42
## was  40
## a    37
## of   34
## it   31
## in   29
## that 28
## is   27
## you  27
## had  18
## with 17
## And  16
## have 16
## It   16
## my   16
## for  14
## me   14

Compute the length of the words

wordLength<-nchar(names(wordVectDecr))
# Number of words: count the tokens produced by splitting on non-word characters
vapply(strsplit(fullt, "\\W+"), length, integer(1))
## [1] 1863
# Alternatively, count the runs of non-word separators; the totals happen to match here
sapply(gregexpr("\\W+", fullt), length)
## [1] 1863

WordCloud

library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggplot2)

# as.numeric() keeps freq as a plain numeric column; passing the table object
# directly can expand it into two columns in some R versions
dfw<-data.frame(word=names(wordVectDecr),freq=as.numeric(wordVectDecr))
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T)

#Or add Color
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,
          rot.per=0.35,colors=brewer.pal(8, "Dark2"))

pal2 <- brewer.pal(11,"Spectral")# brewer.pal(9,"Set1")
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,colors=pal2)

Bar Diagram

barplot(dfw[1:10,]$freq, las = 2, names.arg = dfw[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies") 

# or        
dfw1<-dfw[1:10,]
ggplot(data=dfw1,aes(y=freq,x=word)) + geom_bar(stat="identity") + coord_flip()
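ggplot2 orders the bars alphabetically by default; reordering the factor by frequency keeps them sorted (a sketch):

ggplot(data=dfw1,aes(y=freq,x=reorder(word,freq))) + geom_bar(stat="identity") + coord_flip()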

TEXTFILE <- "um826K.txt"
  1. How many characters does your initial file have?

Three ways to count the characters

length(x[[1]]) # number of single characters produced by strsplit()
## [1] 9747
nchar(mytext)
##   V1 
## 9747
nchar(fullt)
## [1] 9747
  1. How many characters have you removed?
#install.packages("NLP")
library(NLP)
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(tm)

um = readLines(TEXTFILE)

doc.vec <- VectorSource(um)
doc.corpus <- Corpus(doc.vec)
summary(doc.corpus)
##    Length Class             Mode
## 1  2      PlainTextDocument list
## 2  2      PlainTextDocument list
## 3  2      PlainTextDocument list
## 4  2      PlainTextDocument list
## 5  2      PlainTextDocument list
## 6  2      PlainTextDocument list
## 7  2      PlainTextDocument list
## 8  2      PlainTextDocument list
## 9  2      PlainTextDocument list
## 10 2      PlainTextDocument list
## 11 2      PlainTextDocument list
## 12 2      PlainTextDocument list
## 13 2      PlainTextDocument list
## 14 2      PlainTextDocument list
## 15 2      PlainTextDocument list
## 16 2      PlainTextDocument list
## 17 2      PlainTextDocument list
## 18 2      PlainTextDocument list
## 19 2      PlainTextDocument list
## 20 2      PlainTextDocument list
## 21 2      PlainTextDocument list
## 22 2      PlainTextDocument list
## 23 2      PlainTextDocument list
## 24 2      PlainTextDocument list
## 25 2      PlainTextDocument list
## 26 2      PlainTextDocument list
## 27 2      PlainTextDocument list
## 28 2      PlainTextDocument list
# Note: in newer versions of tm, base functions such as tolower must be wrapped
# in content_transformer(), e.g. tm_map(doc.corpus, content_transformer(tolower))
doc.corpus <- tm_map(doc.corpus, tolower)
doc.corpus <- tm_map(doc.corpus, removePunctuation)
doc.corpus <- tm_map(doc.corpus, removeNumbers)
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
doc.corpus <- tm_map(doc.corpus, PlainTextDocument)

library(SnowballC)

doc.corpus <- tm_map(doc.corpus, stemDocument)
#now we remove whitespaces

doc.corpus <- tm_map(doc.corpus, stripWhitespace)
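To answer the question above, we can compare the character counts before and after cleaning (a sketch; how the content is extracted may vary between tm versions):

cleaned <- unlist(lapply(doc.corpus, as.character))
nchar(fullt) - sum(nchar(cleaned)) # characters removed by the cleaning steps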
#inspect(doc.corpus[8])

TDM<-TermDocumentMatrix(doc.corpus)
TDM
## <<TermDocumentMatrix (terms: 526, documents: 28)>>
## Non-/sparse entries: 858/13870
## Sparsity           : 94%
## Maximal term length: 12
## Weighting          : term frequency (tf)
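From this matrix we can, for instance, list the stems that appear at least ten times (a sketch using tm's findFreqTerms):

findFreqTerms(TDM, lowfreq = 10)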
  1. What are the conclusions of your analysis

To sum up, we can load the file and extract all the information we need by reading it line by line. We need the libraries loaded above to draw the diagrams and to apply the text-processing functions.

Second part

  1. How many lines are in the document?
length(um)#---->28
## [1] 28
  1. How many words are in the document?
length(wordVect)#-->1863
## [1] 1863
  1. How many words per line?
print(sapply(strsplit(readLines(TEXTFILE), " "), length)) # tokens per line
##  [1]  58   8  34 156 148  84  96 143  59   8 172 113  58  89 170   6  96
## [18]  91 137  93  48 102  67  97 108   4   5   0
  1. How many words per paragraph?
# Careful: an unescaped "." in strsplit() is a regular expression matching ANY
# character, so the numbers below are character counts per line, not word
# counts; splitting on a literal period needs "\\." or fixed = TRUE
print(sapply(strsplit(readLines(TEXTFILE), "."), length))
##  [1] 310  45 164 835 746 422 600 800 325  39 836 555 310 512 842  30 460
## [18] 504 679 487 244 494 388 554 580  26  24   0
  1. What can you tell about word length?

With nchar() applied to the distinct words (the wordLength vector computed above) we obtain the length of every word, so we can summarise how word lengths are distributed across the text.
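For instance, a quick sketch of such a summary:

summary(wordLength)
hist(wordLength, main="Word length distribution", xlab="Characters per word")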

Text mining

  1. What are the conclusions of your analysis

Our conclusion is that a bar plot of the frequencies lets us appreciate how often each word appears in the text. The wordcloud library then offers a nicer way to see the most frequent words at a glance.