source("supportFunctions.R")
whichTextF(c(39417511,48168000))#txt-->um826k
## [1] "um826K.txt"
getMySeed(c(39417511,48168000))#sed-->25111 set.seed()
## [1] 25111
set.seed(25111)
whichCharacterOption(c(39417511,48168000))
#MarcRibas and Sharon Cabrera
# Keep all the required packages in one variable
Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust",
            "cluster", "igraph", "fpc")
# Read the text file (one line per row). The apostrophes/quotation marks in
# the text trigger the "EOF within quoted string" warning below; passing
# quote = "" to read.table() would avoid it.
mytext <- read.table("um826K.txt",stringsAsFactors = FALSE,sep="\n")
## Warning in scan(file, what, nmax, sep, dec, quote, skip, nlines,
## na.strings, : EOF within quoted string
### Read the text line by line
#1. Build and show a table with the characters you analyse (account for absolute and relative frequencies)
### Account for the Character Frequency
# Caveat: paste() on the whole data frame deparses it, which is why the string
# below starts with a literal 'c("' and the first "word" found later is "c";
# paste(mytext$V1, collapse = " ") would avoid that artifact.
fullt<-paste(mytext, collapse = '')
str(fullt)
## chr "c(\"It wasnt all romantic. I didnt have a dorm room, so I slept on the floor in friends rooms, I returned Coke bottles for the "| __truncated__
x<-strsplit(fullt, "")
table(x)
## x
## ' - <U+0097> Â \n " $ ( ) , . : ; ?
## 16 1 9 1828 5 5 11 1 1 1 94 117 7 1 2
## \\ ¢ 0 1 2 3 4 5 6 7 9 a A b B
## 20 1 13 7 2 6 1 2 1 4 2 621 28 98 10
## c C d D e E f F g G h H i I j
## 182 4 316 9 869 4 169 3 159 2 358 6 438 94 8
## k K l L m M n N o O p P q r R
## 56 1 321 5 166 9 510 8 624 2 145 5 2 407 4
## s S t T u v w W x X y Y z
## 404 15 732 16 235 100 201 11 9 3 211 5 4
ldifc <- length(table(x)) # number of distinct characters
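# Question 1 also asks for relative frequencies, which are never computed
# above; a sketch using prop.table() to turn the absolute counts into
# proportions (unlist() because x is a one-element list):
charFreq <- table(unlist(x))
round(prop.table(charFreq), 4)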
### Account for words
wordList<-strsplit(fullt, "\\W+") # split the text into words
# The regular expression \\W matches any non-word character; the + quantifier
# matches one or more in a row, so runs of spaces and punctuation count as a
# single gap between words
str(wordList)
## List of 1
## $ : chr [1:1863] "c" "It" "wasnt" "all" ...
wordVect<-unlist(wordList) # convert the list to a vector
wordFreq <- table(wordVect)
wordVectDecr<-sort(wordFreq, decreasing=TRUE)
head(wordVectDecr)
## wordVect
## the I to and was a
## 76 67 60 42 40 37
#### Print the result as a two-column table
wordVectDecrTabl<-paste(names(wordVectDecr), wordVectDecr, sep="\t")
cat("Word\tFREQ", wordVectDecrTabl[1:20], sep="\n")
## Word FREQ
## the 76
## I 67
## to 60
## and 42
## was 40
## a 37
## of 34
## it 31
## in 29
## that 28
## is 27
## you 27
## had 18
## with 17
## And 16
## have 16
## It 16
## my 16
## for 14
## me 14
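# Similarly, the top words as relative frequencies, i.e. as a share of all
# 1863 tokens (a sketch, not in the original):
round(head(wordVectDecr, 20) / length(wordVect), 4)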
#### Compute the length of each distinct word
wordLength <- nchar(names(wordVectDecr))
# Number of words (two equivalent counts)
vapply(strsplit(fullt, "\\W+"), length, integer(1))
## [1] 1863
sapply(gregexpr("\\W+", fullt), length) # counts the non-word gaps, one per word here
## [1] 1863
### WordCloud
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggplot2)
# as.numeric() keeps freq a plain numeric column; in recent R, passing the
# table straight to data.frame() would expand it into two columns.
dfw<-data.frame(word=names(wordVectDecr),freq=as.numeric(wordVectDecr))
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T)

# Or add color
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,
          rot.per=0.35,colors=brewer.pal(8, "Dark2"))

pal2 <- brewer.pal(11,"Spectral")# brewer.pal(9,"Set1")
wordcloud(dfw$word,dfw$freq,min.freq=2,max.words=100, random.order=T,colors=pal2)

### Bar Diagram
barplot(dfw[1:10,]$freq, las = 2, names.arg = dfw[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies")

# or
dfw1<-dfw[1:10,]
ggplot(data=dfw1,aes(y=freq,x=word)) + geom_bar(stat="identity") + coord_flip()
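# The ggplot bars come out in alphabetical order; reorder() sorts them by
# frequency instead (a sketch, not in the original):
ggplot(data=dfw1, aes(y=freq, x=reorder(word, freq))) +
  geom_bar(stat="identity") + coord_flip() + xlab("word")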

TEXTFILE <- "um826K.txt"
#FIRST PART
#3. How many characters does your initial file have?
# Three ways to count the characters
length(x[[1]]) # length of the per-character split; the original's
               # length.POSIXlt(x) computed the same thing by calling a
               # POSIXlt method directly on a plain list
## [1] 9747
nchar(mytext) # worked here by coercing the data frame via as.character()
## V1
## 9747
nchar(fullt)
## [1] 9747
#4. How many characters have you removed?
#install.packages("NLP")
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm)
um = readLines(TEXTFILE)
doc.vec <- VectorSource(um)
doc.corpus <- Corpus(doc.vec)
summary(doc.corpus)
## Length Class Mode
## 1 2 PlainTextDocument list
## 2 2 PlainTextDocument list
## 3 2 PlainTextDocument list
## 4 2 PlainTextDocument list
## 5 2 PlainTextDocument list
## 6 2 PlainTextDocument list
## 7 2 PlainTextDocument list
## 8 2 PlainTextDocument list
## 9 2 PlainTextDocument list
## 10 2 PlainTextDocument list
## 11 2 PlainTextDocument list
## 12 2 PlainTextDocument list
## 13 2 PlainTextDocument list
## 14 2 PlainTextDocument list
## 15 2 PlainTextDocument list
## 16 2 PlainTextDocument list
## 17 2 PlainTextDocument list
## 18 2 PlainTextDocument list
## 19 2 PlainTextDocument list
## 20 2 PlainTextDocument list
## 21 2 PlainTextDocument list
## 22 2 PlainTextDocument list
## 23 2 PlainTextDocument list
## 24 2 PlainTextDocument list
## 25 2 PlainTextDocument list
## 26 2 PlainTextDocument list
## 27 2 PlainTextDocument list
## 28 2 PlainTextDocument list
# Note: recent versions of tm require base functions such as tolower to be
# wrapped, i.e. tm_map(doc.corpus, content_transformer(tolower))
doc.corpus <- tm_map(doc.corpus, tolower)
doc.corpus <- tm_map(doc.corpus, removePunctuation)
doc.corpus <- tm_map(doc.corpus, removeNumbers)
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
doc.corpus <- tm_map(doc.corpus, PlainTextDocument) # restore PlainTextDocument objects
library(SnowballC)
doc.corpus <- tm_map(doc.corpus, stemDocument)
# Now collapse runs of whitespace into single spaces
doc.corpus <- tm_map(doc.corpus, stripWhitespace)
#inspect(doc.corpus[8])
TDM<-TermDocumentMatrix(doc.corpus)
TDM
## <<TermDocumentMatrix (terms: 526, documents: 28)>>
## Non-/sparse entries: 858/13870
## Sparsity : 94%
## Maximal term length: 12
## Weighting : term frequency (tf)
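# The term-document matrix is built but never inspected; findFreqTerms() lists
# the stems that occur at least lowfreq times:
findFreqTerms(TDM, lowfreq = 10)
# Question 4 ("how many characters have you removed?") is also never answered
# above. A minimal sketch, assuming content() returns each cleaned document's
# text: count the characters that survive cleaning and subtract from the
# original total.
cleaned <- vapply(seq_along(doc.corpus),
                  function(i) paste(content(doc.corpus[[i]]), collapse = " "),
                  character(1))
nchar(fullt) - sum(nchar(cleaned))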
#5. What are the conclusions of your analysis?
# To sum up, we can load the file and extract all the information we need by
# reading it line by line. The libraries loaded above are needed to draw the
# diagrams and to use their text-processing functions.
#Second part
#1. How many lines are in the document?
length(um)#---->28
## [1] 28
#2. How many words are in the document?
length(wordVect)#-->1863
## [1] 1863
#3. How many words per line?
# The original wrapped this call in a for loop over the lines, which only
# printed the same 28-element vector once per line; a single call suffices.
sapply(strsplit(readLines(TEXTFILE), " "), length)
## [1] 58 8 34 156 148 84 96 143 59 8 172 113 58 89 170 6 96
## [18] 91 137 93 48 102 67 97 108 4 5 0
#4. How many words per paragraph?
# As above, the redundant for loop has been dropped. Caveat: strsplit() treats
# "." as a regex wildcard that matches every character, so the numbers below
# are (roughly) character counts per line, not words per paragraph; split on
# "\\." (or use fixed = TRUE) for literal periods. Each line is also treated
# as a "paragraph" here; see the blank-line sketch after the output.
sapply(strsplit(readLines(TEXTFILE), "."), length)
## [1] 310 45 164 835 746 422 600 800 325 39 836 555 310 512 842 30 460
## [18] 504 679 487 244 494 388 554 580 26 24 0
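# A sketch for genuine paragraphs, assuming blank lines separate them in the
# file: collapse each paragraph's lines, then count words per paragraph.
allLines <- readLines(TEXTFILE)
paraId <- cumsum(allLines == "") # each blank line starts a new paragraph
paras <- tapply(allLines, paraId, paste, collapse = " ")
sapply(strsplit(paras, "\\W+"), length)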
#5. What can you tell about word length?
# nchar() gives each word's length; wordLength (computed above) holds the
# length of every distinct word, so we can summarise its distribution:
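# A short sketch (not in the original):
table(wordLength) # how many distinct words have each length
weighted.mean(wordLength, as.numeric(wordVectDecr)) # mean length weighted by frequency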
### Text mining
#5. What are the conclusions of your analysis?
# Our conclusion is that a bar plot of word frequencies lets us appreciate how
# often each word appears in the text. The wordcloud package then offers a
# nicer way to display the most frequent words.