# Text-Mining_tm_package.R

library(qdap)

## Warning: package 'qdap' was built under R version 4.0.5

## Loading required package: qdapDictionaries

## Loading required package: qdapRegex

## Warning: package 'qdapRegex' was built under R version 4.0.5

## Loading required package: qdapTools

## Warning: package 'qdapTools' was built under R version 4.0.5

## Loading required package: RColorBrewer

## 
## Attaching package: 'qdap'

## The following objects are masked from 'package:base':
## 
##     Filter, proportions

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:qdapTools':
## 
##     id

## The following object is masked from 'package:qdapRegex':
## 
##     explain

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tm)

## Warning: package 'tm' was built under R version 4.0.5

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:qdap':
## 
##     ngrams

## 
## Attaching package: 'tm'

## The following objects are masked from 'package:qdap':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.0.5

library(plotrix)
library(dendextend)

## 
## ---------------------
## Welcome to dendextend version 1.14.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

## The following object is masked from 'package:qdapRegex':
## 
##     %+%

library(ggthemes)
library(RWeka)

## Warning: package 'RWeka' was built under R version 4.0.5

###################################
# Define new_text: the sample paragraph used as raw input for the cleaning
# steps in this script. The literal spans several lines, so it contains
# embedded newlines, along with digits (250,000, 150, 9), "$" symbols and
# punctuation. This statement only assigns; evaluate `new_text` to print it.
new_text <- "DataCamp is the first online learning platform 
that focuses on building the best learning experience specifically 
for Data Science. We have offices in Boston and Belgium and to date,
we trained over 250,000 (aspiring) data scientists in over 150 countries.
These data science enthusiasts completed more than 9 million exercises. 
You can take free beginner courses, or subscribe for $25/month to 
get access to all premium courses $$$$$."
################################################
# Clean with qdap
#
# Runs the text through qdap's replace_* helpers in a fixed order --
# abbreviations, contractions, numbers, ordinals, symbols -- and lower-cases
# the result as the final step.
#
# x: the text to clean; it is handed to each step in turn.
# Returns the output of the last step (the lower-cased text).
qdap_clean <- function(x) {
  # Ordered pipeline: each entry receives the previous entry's output.
  steps <- list(
    replace_abbreviation,
    replace_contraction,
    replace_number,
    replace_ordinal,
    replace_symbol,
    tolower
  )
  Reduce(function(text, step) step(text), steps, x)
}

# Clean with tm
#
# Lower-cases a tm corpus, strips punctuation, removes English stop words plus
# a few custom words, and finally collapses the whitespace left behind.
#
# corpus: a tm corpus (this script passes a VCorpus).
# Returns the transformed corpus.
tm_clean <- function(corpus) {
  # removeWords() matches case-sensitively and stopwords("en") is lower case,
  # so fold the text to lower case first. content_transformer() keeps each
  # document a tm document instead of a bare character vector.
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  # Custom stop words are written in lower case so they match the folded text.
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "google", "amazon", "company"))
  # Strip whitespace last: removing words leaves runs of spaces behind.
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}
######################################################
# Run the qdap cleaning step on the raw paragraph. The result is still plain
# text, as the class() check below confirms.
data_1 <- qdap_clean(new_text)
data_1

## [1] "datacamp is the first online learning platform that focuses on building the best learning experience specifically for data science. we have offices in boston and belgium and to date, we trained over two hundred fifty thousand (aspiring) data scientists in over one hundred fifty countries. these data science enthusiasts completed more than nine million exercises. you can take free beginner courses, or subscribe for dollar twenty five/month to get access to all premium courses dollar dollar dollar dollar dollar."

class(data_1)

## [1] "character"

# Wrap the cleaned text in a VCorpus so the tm transformations can be applied.
# Note: data_1 is rebound here from a character vector to a corpus.
data_1 <- VCorpus(VectorSource(data_1))
data_1

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1

# Apply the tm cleaning step; data_2 is the corpus used to build the
# term-document matrix.
data_2 <- tm_clean(data_1)
data_2

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1

# Create tdm and matrix with normal weighting
# (no control list is passed, and the printed summary reports
# term-frequency (tf) weighting).
tf_tdm <- TermDocumentMatrix(data_2)
tf_tdm

## <<TermDocumentMatrix (terms: 42, documents: 1)>>
## Non-/sparse entries: 42/0
## Sparsity           : 0%
## Maximal term length: 12
## Weighting          : term frequency (tf)

# Convert to an ordinary matrix with one row per term and one column per
# document, so that rowSums() can be applied to it.
tf_tdm_m <- as.matrix(tf_tdm)
tf_tdm_m

##               Docs
## Terms          1
##   access       1
##   aspiring     1
##   beginner     1
##   belgium      1
##   best         1
##   boston       1
##   building     1
##   can          1
##   completed    1
##   countries    1
##   courses      2
##   data         3
##   datacamp     1
##   date         1
##   dollar       6
##   enthusiasts  1
##   exercises    1
##   experience   1
##   fifty        2
##   first        1
##   fivemonth    1
##   focuses      1
##   free         1
##   get          1
##   hundred      2
##   learning     2
##   million      1
##   nine         1
##   offices      1
##   one          1
##   online       1
##   platform     1
##   premium      1
##   science      2
##   scientists   1
##   specifically 1
##   subscribe    1
##   take         1
##   thousand     1
##   trained      1
##   twenty       1
##   two          1

# Calculate the rowSums: term_frequency
# (total count of each term across all documents in the matrix)
term_frequency <- rowSums(tf_tdm_m)
# Sort term_frequency in descending order
# (TRUE is spelled out: T is an ordinary variable that can be reassigned)
term_frequency <- sort(term_frequency, decreasing = TRUE)
###############
# Plot a barchart of the (up to) 10 most common words.
# head() replaces [1:10] so that a vocabulary with fewer than ten terms is
# plotted as-is instead of being padded with NA entries.
barplot(head(term_frequency, 10), col = "tan", las = 2)

#######################
# Sum rows and frequency data frame
# NOTE(review): the "chardonnay_" prefix looks like a leftover from another
# exercise -- these objects are computed from the same tf_tdm_m used for
# term_frequency. Confirm before renaming.
chardonnay_term_freq <- rowSums(tf_tdm_m)
head(chardonnay_term_freq)

##   access aspiring beginner  belgium     best   boston 
##        1        1        1        1        1        1

# Build the term/count data frame that is passed to wordcloud():
# `term` holds the term names and `num` the matching row sums.
chardonnay_word_freqs <- data.frame(
  term = names(chardonnay_term_freq),
  num = chardonnay_term_freq)

head(chardonnay_word_freqs)

##              term num
## access     access   1
## aspiring aspiring   1
## beginner beginner   1
## belgium   belgium   1
## best         best   1
## boston     boston   1

# Draw the word cloud from the term/count data frame. min.freq = 1 keeps
# every term eligible, max.words = 100 caps how many are plotted, and the
# three colours are supplied as the palette.
wordcloud(
  words = chardonnay_word_freqs[["term"]],
  freq = chardonnay_word_freqs[["num"]],
  min.freq = 1,
  max.words = 100,
  colors = c("grey80", "darkgoldenrod1", "tomato")
)

# Show the first 50 of R's built-in colour names
colors()[seq_len(50L)]

##  [1] "white"          "aliceblue"      "antiquewhite"   "antiquewhite1" 
##  [5] "antiquewhite2"  "antiquewhite3"  "antiquewhite4"  "aquamarine"    
##  [9] "aquamarine1"    "aquamarine2"    "aquamarine3"    "aquamarine4"   
## [13] "azure"          "azure1"         "azure2"         "azure3"        
## [17] "azure4"         "beige"          "bisque"         "bisque1"       
## [21] "bisque2"        "bisque3"        "bisque4"        "black"         
## [25] "blanchedalmond" "blue"           "blue1"          "blue2"         
## [29] "blue3"          "blue4"          "blueviolet"     "brown"         
## [33] "brown1"         "brown2"         "brown3"         "brown4"        
## [37] "burlywood"      "burlywood1"     "burlywood2"     "burlywood3"    
## [41] "burlywood4"     "cadetblue"      "cadetblue1"     "cadetblue2"    
## [45] "cadetblue3"     "cadetblue4"     "chartreuse"     "chartreuse1"   
## [49] "chartreuse2"    "chartreuse3"

# Text-Mining_tm_package.R
#
# liyix
#
# 2021-05-02