library(ggplot2)
library(magrittr)
library(rmarkdown)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(reshape2)
library(ggthemes)
library(MASS)
library(viridis)
library(GSIF)
library(ggtern)
library(geomnet)
library(ggmap)
library(ggfortify)
library(vars)
library(maps)
library(rgdal)
library(animation)
library(class)
library(combinat)
library(grDevices)
library(stringr)
library(readr)
library(rJava)
library(qdap)
library(tm)
library(SnowballC)
library(wordcloud)
library(plotrix)
library(dendextend)
library(RWeka)
Text mining is the process of extracting actionable information from text.
It also involves framing a question and then moving from an unstructured state to a structured one. The bag-of-words approach takes no account of word type or word order (adjective, pronoun, noun, verb, ...), unlike semantic parsing.
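As a quick illustration (a minimal sketch, not part of the course code), two sentences built from the same words in a different order yield identical term counts under a bag-of-words view:
# Bag of words keeps only term counts, so word order is lost
s1="the barista serves the coffee"
s2="the coffee serves the barista"
table(strsplit(s1," ")[[1]])   # barista 1, coffee 1, serves 1, the 2
table(strsplit(s2," ")[[1]])   # identical counts, despite the different order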
new_text="DataCamp is the first online learning platform that
focuses on building the best learning experience specifically for
Data Science. We have offices in Boston and Belgium and to date, we
trained over 250,000 (aspiring) data scientists in over 150
countries. These data science enthusiasts completed more than 9
million exercises. You can take free beginner courses, or subscribe
for $25/month to get access to all premium courses."
# Print new_text to the console
print(new_text)
## [1] "DataCamp is the first online learning platform that\nfocuses on building the best learning experience specifically for\nData Science. We have offices in Boston and Belgium and to date, we\ntrained over 250,000 (aspiring) data scientists in over 150\ncountries. These data science enthusiasts completed more than 9\nmillion exercises. You can take free beginner courses, or subscribe\nfor $25/month to get access to all premium courses."
# Find the 10 most frequent terms: term_count
term_count=freq_terms(new_text,10)
# Plot term_count
plot(term_count)
tweets=read_csv("https://assets.datacamp.com/production/course_935/datasets/coffee.csv")
# Print out the number of rows in tweets
nrow(tweets)
## [1] 1000
# Isolate text from tweets: coffee_tweets
coffee_tweets=tweets$text
class(coffee_tweets)
## [1] "character"
# Make a vector source: coffee_source
coffee_source=VectorSource(coffee_tweets)
class(coffee_source)
## [1] "VectorSource" "SimpleSource" "Source"
# Make a volatile corpus: coffee_corpus
coffee_corpus=VCorpus(coffee_source)
# Print out coffee_corpus
print(coffee_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1000
# Print data on the 15th tweet in coffee_corpus
coffee_corpus[[15]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 111
# Print the content of the 15th tweet in coffee_corpus
coffee_corpus[[15]][1]
## $content
## [1] "@HeatherWhaley I was about 2 joke it takes 2 hands to hold hot coffee...then I read headline! #Don'tDrinkNShoot"
doc_id=c(1,2,3)
text=c("Text mining is a great time.",
"Text analysis provides insights","qdap and tm are used in text mining")
Author=c("Author1","Author2","Author3")
date=c(1514953399,1514866998,1514780598)
example_text=as.data.frame(cbind(doc_id,text,Author,date))
# Print example_text to the console
example_text
# Create a DataframeSource: df_source
df_source=DataframeSource(example_text)
# Convert df_source to a corpus: df_corpus
df_corpus=VCorpus(df_source)
# Examine df_corpus
df_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 2
## Content: documents: 3
# Examine the first df_corpus document
df_corpus[1]
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 2
## Content: documents: 1
# Create a VectorSource on the text column (column 2): vec_source
vec_source=VectorSource(example_text[,2])
# Convert vec_source to a corpus: vec_corpus
vec_corpus=VCorpus(vec_source)
# Examine vec_corpus
vec_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
# Examine the first vec_corpus document
vec_corpus[[1]][1]
## $content
## [1] "Text mining is a great time."
# Pre-processing with tm
#tolower(): Make all characters lowercase
#removePunctuation(): Remove all punctuation marks
#removeNumbers(): Remove numbers
#stripWhitespace(): Remove excess whitespace
# Create the object: text
text="<b>She</b> woke up at 6 A.M. It\'s so early!
She was only 10% awake and began drinking coffee in front of her computer."
# All lowercase
text=tolower(text)
text
## [1] "<b>she</b> woke up at 6 a.m. it's so early! \n she was only 10% awake and began drinking coffee in front of her computer."
# Remove punctuation
text=removePunctuation(text)
text
## [1] "bsheb woke up at 6 am its so early \n she was only 10 awake and began drinking coffee in front of her computer"
# Remove numbers
text=removeNumbers(text)
text
## [1] "bsheb woke up at am its so early \n she was only awake and began drinking coffee in front of her computer"
# Remove whitespace
text=stripWhitespace(text)
text
## [1] "bsheb woke up at am its so early she was only awake and began drinking coffee in front of her computer"
# Pre-processing with qdap
#bracketX(): Remove all text within brackets (e.g. "It's (so) cool" becomes "It's cool")
#replace_number(): Replace numbers with their word equivalents (e.g. "2" becomes "two")
#replace_abbreviation(): Replace abbreviations with their full text equivalents
#(e.g. "Sr" becomes "Senior")
#replace_contraction(): Convert contractions back to their base words (e.g. "shouldn't" becomes "should not")
# replace_symbol() Replace common symbols with their word equivalents (e.g. "$" becomes "dollar")
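Note that text has already had its punctuation, numbers and symbols stripped by the tm steps above, so the qdap calls below leave it essentially unchanged. A minimal sketch on a fresh string (an invented example, not from the exercise data) shows what each function actually does:
# Hypothetical raw string used only to demonstrate the qdap cleaners
raw_text="Mr. B woke up at 6 A.M. (way too early) and paid $2 for coffee, but it's worth it."
bracketX(raw_text)                       # drops the text inside the parentheses
replace_number("I had 2 cups")           # digits become words: "two"
replace_abbreviation("Mr. B had coffee") # e.g. "Mr." becomes "Mister"
replace_contraction("it's worth it")     # "it's" becomes "it is"
replace_symbol("$2 coffee")              # "$" becomes "dollar"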
# Remove text within brackets
text=bracketX(text)
text
## [1] "bsheb woke up at am its so early she was only awake and began drinking coffee in front of her computer"
# Replace numbers with words
text=replace_number(text)
text
## [1] "bsheb woke up at am its so early she was only awake and began drinking coffee in front of her computer"
# Replace abbreviations
text=replace_abbreviation(text)
text
## [1] "bsheb woke up at am its so early she was only awake and began drinking coffee in front of her computer"
# Replace contractions
text=replace_contraction(text)
text
## [1] "Bsheb woke up at am its so early she was only awake and began drinking coffee in front of her computer"
# Replace symbols with words
text=replace_symbol(text)
text
## [1] "Bsheb woke up at am its so early she was only awake and began drinking coffee in front of her computer"
# List standard English stop words
stopwords("en")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
# Print text without standard stop words
removeWords(text,stopwords("en"))
## [1] "Bsheb woke early awake began drinking coffee front computer"
# Add "coffee" and "bean" to the list: new_stops
new_stops=c("coffee","bean",stopwords("en"))
# Remove stop words from text
removeWords(text,new_stops)
## [1] "Bsheb woke early awake began drinking front computer"
# Create complicate
complicate=c("complicated","complication","complicatedly")
# Perform word stemming: stem_doc
stem_doc=stemDocument(complicate)
stem_doc
## [1] "complic" "complic" "complic"
# Create the completion dictionary: comp_dict
comp_dict="complicate"
# Perform stem completion: complete_text
complete_text=stemCompletion(stem_doc,comp_dict)
# Print complete_text
complete_text
## complic complic complic
## "complicate" "complicate" "complicate"
text_data= "In a complicated haste, Tom rushed to fix a new complication, too complicatedly."
# Remove punctuation: rm_punc
rm_punc <- removePunctuation(text_data)
rm_punc
## [1] "In a complicated haste Tom rushed to fix a new complication too complicatedly"
# Create character vector: n_char_vec
n_char_vec <- unlist(strsplit(rm_punc, split = ' '))
n_char_vec
## [1] "In" "a" "complicated" "haste"
## [5] "Tom" "rushed" "to" "fix"
## [9] "a" "new" "complication" "too"
## [13] "complicatedly"
# Perform word stemming: stem_doc
stem_doc <-stemDocument(n_char_vec)
# Print stem_doc
stem_doc
## [1] "In" "a" "complic" "hast" "Tom" "rush" "to"
## [8] "fix" "a" "new" "complic" "too" "complic"
# Re-complete stemmed document: complete_doc
complete_doc <-stemCompletion(stem_doc,comp_dict)
# Print complete_doc
complete_doc
## In a complic hast Tom
## "" "" "complicate" "" ""
## rush to fix a new
## "" "" "" "" ""
## complic too complic
## "complicate" "" "complicate"
# Define a customized cleaning function
clean_corpus <- function(corpus){
corpus <- tm_map(corpus,content_transformer(stripWhitespace))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "coffee","mug"))
return(corpus)
}
# Apply the customized function to coffee_corpus: clean_corp
clean_corp=clean_corpus(coffee_corpus)
# Print out a cleaned up tweet
clean_corp[[227]][1]
## $content
## [1] "also dogs arent smart enough dip donut eat part thats dipped ladyandthetramp"
# Print out the same tweet in original form
tweets$text[227]
## [1] "Also, dogs aren't smart enough to dip the donut in the coffee and then eat the part that's been dipped. #ladyandthetramp"
# Create the dtm from the corpus: coffee_dtm
coffee_dtm=DocumentTermMatrix(clean_corp)
# Print out coffee_dtm data
coffee_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 3075)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_dtm to a matrix: coffee_m
coffee_m=as.matrix(coffee_dtm)
# Print the dimensions of coffee_m
dim(coffee_m)
## [1] 1000 3075
# Review a portion of the matrix
coffee_m[148:150,2587:2590]
## Terms
## Docs stampedeblue stand star starbucks
## 148 0 0 0 0
## 149 0 0 0 0
## 150 0 0 0 0
# Create a TDM from clean_corp: coffee_tdm
coffee_tdm=TermDocumentMatrix(clean_corp)
# Print coffee_tdm data
coffee_tdm
## <<TermDocumentMatrix (terms: 3075, documents: 1000)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_tdm to a matrix: coffee_m
coffee_m=as.matrix(coffee_tdm)
# Print the dimensions of the matrix
dim(coffee_m)
## [1] 3075 1000
# Review a portion of the matrix
coffee_m[2587:2590,148:150]
## Docs
## Terms 148 149 150
## stampedeblue 0 0 0
## stand 0 0 0
## star 0 0 0
## starbucks 0 0 0
Visualization is very important in text mining, particularly to support decision making.
# Create a matrix: coffee_m
coffee_m=as.matrix(coffee_tdm)
# Calculate the rowSums: term_frequency
term_frequency=rowSums(coffee_m)
# Sort term_frequency in descending order
term_frequency=sort(term_frequency,decreasing=T)
# View the top 10 most common words
term_frequency[1:10]
## like cup shop just get morning want drinking
## 111 103 69 66 62 57 49 47
## can looks
## 45 45
# Plot a barchart of the 10 most common words
barplot(term_frequency[1:10],col="tan",las=2)
# Create frequency
frequency=freq_terms(tweets$text,top=10,
at.least=3,stopwords="Top200Words")
# Make a frequency barchart
plot(frequency)
# Create frequency2
frequency2=freq_terms(tweets$text,top=10,
at.least=3,stopwords=tm::stopwords("en"))
# Make a frequency2 barchart
plot(frequency2)
# Print the first 10 entries in term_frequency
term_frequency[1:10]
## like cup shop just get morning want drinking
## 111 103 69 66 62 57 49 47
## can looks
## 45 45
# Create word_freqs
word_freqs=data.frame(term=names(term_frequency),num=term_frequency)
# Create a wordcloud for the values in word_freqs
wordcloud(word_freqs$term,word_freqs$num,max.words=100,colors="red")
wordcloud(word_freqs$term,word_freqs$num,max.words=100,col=c("grey80","darkgoldenrod1","tomato"))
# List the available colors
display.brewer.all()
# Create purple_orange
purple_orange=brewer.pal(10,"PuOr")
# Drop 2 faintest colors
purple_orange=purple_orange[-(1:2)]
# Create a wordcloud with purple_orange palette
wordcloud(word_freqs$term,word_freqs$num,col=purple_orange,max.words=100)
# Import the chardonnay tweets (this overwrites the earlier tweets object)
tweets=read_csv("https://assets.datacamp.com/production/course_935/datasets/chardonnay.csv")
chardonnay_tweets=tweets$text
# Create all_coffee
all_coffee=paste(coffee_tweets,collapse=" ")
# Create all_chardonnay
all_chardonnay=paste(chardonnay_tweets,collapse=" ")
# Create all_tweets
all_tweets=c(all_coffee,all_chardonnay)
# Convert to a vector source
all_tweets=VectorSource(all_tweets)
# Create all_corpus
all_corpus=VCorpus(all_tweets)
clean_corpus=function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "amp", "glass",
"chardonnay",
"coffee"))
return(corpus)
}
# Clean the corpus
all_clean=clean_corpus(all_corpus)
# Create all_tdm
all_tdm=TermDocumentMatrix(all_clean)
# Create all_m
all_m=as.matrix(all_tdm)
# Print a commonality cloud
commonality.cloud(all_m,colors="steelblue1",max.words=100)
# Create all_tdm
all_tdm=TermDocumentMatrix(all_clean)
# Give the columns distinct names
colnames(all_tdm)=c("coffee","chardonnay")
# Create all_m
all_m=as.matrix(all_tdm)
# Create comparison cloud
comparison.cloud(all_m,colors=c("orange","blue"),max.words=50)
# Create common_words
common_words <- subset(all_m, all_m[, 1] > 0 & all_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
# Combine common_words and difference
common_words <- cbind(common_words,difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = TRUE), ]
# Create top25_df
top25_df <- data.frame(x = common_words[1:25, 1],
y =common_words[1:25, 2],
labels = rownames(common_words[1:25, ]))
# Create the pyramid plot
pyramid.plot(top25_df$x, top25_df$y,
labels = top25_df$labels, gap = 8,
top.labels = c("Coffee", "Words", "Chardonnay"),
main = "Words in Common", laxlab = NULL,
raxlab = NULL, unit = NULL)
## [1] 5.1 4.1 4.1 2.1
# Word association
word_associate(coffee_tweets, match.string = c("barista"),
stopwords = c(Top200Words, "coffee", "amp"),
network.plot = TRUE, cloud.colors = c("gray85", "darkred"))
## row group unit text
## 1 544 all 544 RT @Barista_kyo: #coffee #latte #soylatte #thinkcoffee # # # # @ think coffee http://t.co/Hmy9RPRWTZ
## 2 569 all 569 RT @ReversoSmith: What a beautiful mess! #portafilter #coffee #espresso #coffeemachine #barista #baristalife? http://t.co/ZODcTfP22Z
## 3 658 all 658 The moment you realize your Starbucks barista gave you a regular iced Coffee when u asked 4 decaf. Shitty. Late night not planned.
## 4 931 all 931 Barista made my coffee wrong and still gave me both anyway #Starbucks #coffee #caffeine #upallnight http://t.co/iKCNwO8F6t
## 5 951 all 951 RT @FrankIero: hahaha @jamiasan :*gives Barista our Starbucks order* Barista: coffee? @jamiasan : yes, isn't this is a coffee store?
# Add title
title(main = "Barista Coffee Tweet Associations")
# Build a small rainfall data frame for a toy clustering example
city=c("Cleveland","Portland","Boston","New Orleans")
rainfall=c(39.14,39.14,43.77,62.45)
rain=data.frame(city,rainfall)
# Create dist_rain
dist_rain=dist(rain[,2])
# View the distance matrix
dist_rain
## 1 2 3
## 2 0.00
## 3 4.63 4.63
## 4 23.31 23.31 18.68
# Create hc
hc=hclust(dist_rain)
# Plot hc
plot(hc,labels=rain$city)
# Print the dimensions of all_tdm
dim(all_tdm)
## [1] 5413 2
# Create tdm1
tdm1=removeSparseTerms(all_tdm,sparse=0.95)
# Create tdm2
tdm2=removeSparseTerms(all_tdm,sparse=0.975)
# Print tdm1
tdm1
## <<TermDocumentMatrix (terms: 5413, documents: 2)>>
## Non-/sparse entries: 6099/4727
## Sparsity : 44%
## Maximal term length: 266
## Weighting : term frequency (tf)
# Print tdm2
tdm2
## <<TermDocumentMatrix (terms: 5413, documents: 2)>>
## Non-/sparse entries: 6099/4727
## Sparsity : 44%
## Maximal term length: 266
## Weighting : term frequency (tf)
# Create tweets_tdm2
tweets_tdm2=removeSparseTerms(coffee_dtm,sparse=0.975)
# Create tdm_m
tdm_m=as.matrix(tweets_tdm2)
# Create tdm_df
tdm_df=as.data.frame(tdm_m)
# Create tweets_dist
tweets_dist=dist(tdm_df)
# Create hc
hc=hclust(tweets_dist)
# Plot the dendrogram
plot(hc)
# Create hc
hc=hclust(tweets_dist)
# Create hcd
hcd=as.dendrogram(hc)
# Store the labels of hcd
l=labels(hcd)
# Change the branch color to red for "marvin" and "gaye"
branches_attr_by_labels(hcd,c("marvin","gaye"),col="red")
## Warning in branches_attr_by_labels(hcd, c("marvin", "gaye"), col = red): Not all of the labels you provided are included in the dendrogram.
## The following labels were omitted:marvingaye
## 'dendrogram' with 2 branches and 1000 members total, at height 6.324555
# Plot hcd
plot(hcd,main="Better Dendrogram")
# Add cluster rectangles
rect.dendrogram(hcd,k=2,border="grey50")
# Create associations
associations=findAssocs(tdm2,"venti",0.98)
# Create associations_df
associations_df=list_vect2df(associations)[, 2:3]
associations_df=associations_df[1:35,]
# View the venti associations
head(associations_df)
# Plot the associations_df values
ggplot(associations_df, aes(y = associations_df[, 1])) +
geom_point(aes(x = associations_df[, 2]),
data = associations_df, size = 3) +
theme_gdocs()
# Make tokenizer function
tokenizer <- function(x)
NGramTokenizer(x, Weka_control(min = 2, max = 2))
# Create unigram_dtm
unigram_dtm=DocumentTermMatrix(coffee_corpus)
# Create bigram_dtm
bigram_dtm=DocumentTermMatrix(coffee_corpus,control =
list(tokenize = tokenizer))
# Examine unigram_dtm
unigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 3857)>>
## Non-/sparse entries: 10914/3846086
## Sparsity : 100%
## Maximal term length: 33
## Weighting : term frequency (tf)
# Examine bigram_dtm
bigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 8252)>>
## Non-/sparse entries: 14108/8237892
## Sparsity : 100%
## Maximal term length: 44
## Weighting : term frequency (tf)
# Create bigram_dtm_m
bigram_dtm_m=as.matrix(bigram_dtm)
# Create freq
freq=colSums(bigram_dtm_m)
# Create bi_words
bi_words=names(freq)
# Examine part of bi_words
bi_words[2577:2587]
## [1] "coffee promotion" "coffee rather" "coffee ready"
## [4] "coffee realness" "coffee recipe" "coffee redbull"
## [7] "coffee reli" "coffee right" "coffee rn"
## [10] "coffee rocks" "coffee romance"
# Plot a wordcloud
wordcloud(bi_words,freq,max.words=15)
text_corp=coffee_corpus
# Create tf_tdm
tf_tdm=TermDocumentMatrix(text_corp)
# Create tfidf_tdm
tfidf_tdm=TermDocumentMatrix(text_corp,
control = list(weighting = weightTfIdf))
# Create tf_tdm_m
tf_tdm_m=as.matrix(tf_tdm)
# Create tfidf_tdm_m
tfidf_tdm_m =as.matrix(tfidf_tdm)
# Examine part of tf_tdm_m
tf_tdm_m[508:509,5:10 ]
## Docs
## Terms 5 6 7 8 9 10
## @calliesimon: 0 0 0 0 0 0
## @cassieverslype 0 0 0 0 0 0
# Examine part of tfidf_tdm_m
tfidf_tdm_m[508:509,5:10 ]
## Docs
## Terms 5 6 7 8 9 10
## @calliesimon: 0 0 0 0 0 0
## @cassieverslype 0 0 0 0 0 0
# Select specific columns
tweets <- tweets[, c("id", "text", "screenName", "created")]
# Rename columns
names(tweets)[1] <- "doc_id"
# Set the data frame source schema: docs
docs <- DataframeSource(tweets)
# Make a clean volatile corpus: text_corpus
text_corpus <- clean_corpus(VCorpus(docs))
# Print the first doc's doc_id
text_corpus[[1]]$meta$id
## [1] "365439320250793984"
# Examine the first doc content
text_corpus[[1]]$content
## [1] "rt oceanclub eilisohanlon stonyjim vonprond eilis im pearse st even can tell smells like cats pee "
# Access the first doc metadata
meta(text_corpus[1])
# Import the Amazon and Google employee reviews (pros and cons)
amzn=read_csv(
"https://assets.datacamp.com/production/course_935/datasets/500_amzn.csv")
goog=read_csv(
"https://assets.datacamp.com/production/course_935/datasets/500_goog.csv")
# Create amzn_pros
amzn_pros=amzn$pros
# Create amzn_cons
amzn_cons=amzn$cons
# Create goog_pros
goog_pros=goog$pros
# Create goog_cons
goog_cons=goog$cons
# qdap cleaning function
qdap_clean=function(x){
x <- replace_abbreviation(x)
x <- replace_contraction(x)
x <- replace_number(x)
x <- replace_ordinal(x)
x <- replace_symbol(x)
x <- tolower(x)
return(x)
}
# tm cleaning function (terms are lowercase because qdap_clean applies tolower first)
tm_clean=function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords,
c(stopwords("en"), "google", "amazon", "company"))
return(corpus)
}
# Alter amzn_pros
amzn_pros=qdap_clean(amzn_pros)
# Alter amzn_cons
amzn_cons=qdap_clean(amzn_cons)
# Create az_p_corp
az_p_corp=VCorpus(VectorSource(amzn_pros))
# Create az_c_corp
az_c_corp=VCorpus(VectorSource(amzn_cons))
# Create amzn_pros_corp
amzn_pros_corp=tm_clean(az_p_corp)
# Create amzn_cons_corp
amzn_cons_corp=tm_clean(az_c_corp)
# Apply qdap_clean to goog_pros
goog_pros=qdap_clean(goog_pros)
# Apply qdap_clean to goog_cons
goog_cons=qdap_clean(goog_cons)
# Create goog_p_corp
goog_p_corp=VCorpus(VectorSource(goog_pros))
# Create goog_c_corp
goog_c_corp=VCorpus(VectorSource(goog_cons))
# Create goog_pros_corp
goog_pros_corp=tm_clean(goog_p_corp)
# Create goog_cons_corp
goog_cons_corp=tm_clean(goog_c_corp)
# Create amzn_p_tdm
amzn_p_tdm=TermDocumentMatrix(amzn_pros_corp,
control=list(tokenize=tokenizer))
# Create amzn_p_tdm_m
amzn_p_tdm_m=as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq=rowSums(amzn_p_tdm_m)
# Plot a wordcloud using amzn_p_freq values
wordcloud(names(amzn_p_freq),amzn_p_freq,max.words=25,col="blue")
# Create amzn_c_tdm
amzn_c_tdm=TermDocumentMatrix(amzn_cons_corp,
control = list(tokenize = tokenizer))
# Create amzn_c_tdm_m
amzn_c_tdm_m=as.matrix(amzn_c_tdm)
# Create amzn_c_freq
amzn_c_freq=rowSums(amzn_c_tdm_m)
# Plot a wordcloud of negative Amazon bigrams
wordcloud(names(amzn_c_freq),amzn_c_freq,max.words=25,color="red")
# Create amzn_c_tdm
amzn_c_tdm=TermDocumentMatrix(amzn_cons_corp,
control=list(tokenize=tokenizer))
# Print amzn_c_tdm to the console
amzn_c_tdm
# Create amzn_c_tdm2 by removing sparse terms
amzn_c_tdm2=removeSparseTerms(amzn_c_tdm,sparse=0.993)
# Create hc as a cluster of distance values
hc=hclust(dist(amzn_c_tdm2,method="euclidean"),
method="complete")
# Produce a plot of hc
plot(hc)
# Create amzn_p_tdm
amzn_p_tdm=TermDocumentMatrix(amzn_pros_corp,
control=list(tokenize=tokenizer))
# Create amzn_p_m
amzn_p_m=as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq=rowSums(amzn_p_m)
# Create term_frequency
term_frequency=sort(amzn_p_freq,decreasing=T)
# Print the 5 most common terms
term_frequency[1:5]