In this graded assignment, you will 1) pre-process and tokenize the text of any literary work available from Project Gutenberg (e.g., The Adventures of Sherlock Holmes by Arthur Conan Doyle), 2) create a table showing the 50 most frequently used words in the selected work, and 3) generate a word cloud that visualizes the frequency of the words used in the work. First, choose which literary work you will take on and leave a note of your choice on our Blackboard course website. Then, by 11:59 PM on Friday, April 27, upload the following items to the “과제” (Assignments) section in Blackboard.
# Install "gutenbergr" only when it is missing, so that re-running the
# script does not reinstall the package every time
if (!requireNamespace("gutenbergr", quietly = TRUE)) {
  install.packages("gutenbergr")
}
library(gutenbergr) # Load the package "gutenbergr"
library(stringr) # Load the package "stringr" (str_* functions used below)
gutenberg_works() # This function returns all information we need to retrieve text data from the eBook
## # A tibble: 40,737 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~
## <int> <chr> <chr> <int> <chr> <chr>
## 1 0 <NA> <NA> NA en <NA>
## 2 1 The De~ Jeffer~ 1638 en United States L~
## 3 2 "The U~ United~ 1 en American Revolu~
## 4 3 John F~ Kenned~ 1666 en <NA>
## 5 4 "Linco~ Lincol~ 3 en US Civil War
## 6 5 The Un~ United~ 1 en American Revolu~
## 7 6 Give M~ Henry,~ 4 en American Revolu~
## 8 7 The Ma~ <NA> NA en <NA>
## 9 8 Abraha~ Lincol~ 3 en US Civil War
## 10 9 Abraha~ Lincol~ 3 en US Civil War
## # ... with 40,727 more rows, and 2 more variables: rights <chr>,
## # has_text <lgl>
# Keep the catalogue of works in `gt` so that it can be searched
gt <- gutenberg_works()
# Show the rows of `gt` whose author contains "Conan" and whose title
# contains "Adventures of"; which() keeps only the rows where the combined
# test is TRUE, so rows with a missing author or title are dropped
gt[
  which(
    str_detect(gt$author, "Conan") &
      str_detect(gt$title, "Adventures of")
  ),
]
## # A tibble: 3 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_bookshe~
## <int> <chr> <chr> <int> <chr> <chr>
## 1 1644 The Ad~ Doyle~ 69 en Historical Fiction
## 2 1661 The Ad~ Doyle~ 69 en Banned Books from~
## 3 48320 "Adven~ Doyle~ 69 en <NA>
## # ... with 2 more variables: rights <chr>, has_text <lgl>
Let’s download the text by its ID, using gutenberg_download(ID#).
# Download the work whose gutenberg_id is 1661
holmes <- gutenberg_download(gutenberg_id = 1661)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Print the result: a data frame with the two variables "gutenberg_id" and "text"
holmes
## # A tibble: 12,648 x 2
## gutenberg_id text
## <int> <chr>
## 1 1661 THE ADVENTURES OF SHERLOCK HOLMES
## 2 1661 ""
## 3 1661 by
## 4 1661 ""
## 5 1661 SIR ARTHUR CONAN DOYLE
## 6 1661 ""
## 7 1661 ""
## 8 1661 ""
## 9 1661 " I. A Scandal in Bohemia"
## 10 1661 " II. The Red-headed League"
## # ... with 12,638 more rows
# Join all rows of the "text" column into one string, separated by spaces
holmes_string <- str_c(holmes[["text"]], collapse = " ")
Now we are ready to pre-process the string object and then tokenize it into words
# Load the saved R objects; `bts_text` used below is expected to come from
# this file
load(file = "bts_text.RData")
# Trim whitespace at both ends of every element, then join the elements into
# one string separated by spaces
bts_string <- paste(
  str_trim(bts_text, side = "both"),
  collapse = " "
)
# Start/end positions of every occurrence of "References" in the string
str_locate_all(string = bts_string, pattern = "References")
## [[1]]
## start end
## [1,] 6878 6887
## [2,] 28503 28512
# First attempt: keep only the text before the second "References", which
# starts at character 28503 according to str_locate_all() above
bts_trunc <- str_trunc(bts_string, 28502, "right")
# str_trunc is not working…
# NOTE(review): presumably because str_trunc() ends the result with its
# default ellipsis "..." instead of cutting cleanly; confirm
# Alternative: split the string into lines and keep the lines before the
# reference list
bts_string_line <- unlist(str_split(bts_string, "\n"))
# Indices of the lines that contain "References"
str_which(bts_string_line, "References")
# NOTE(review): 306 is hard-coded; presumably it is the line just before the
# last index reported by str_which() above; confirm when the data changes
bts_trunc <- str_c(bts_string_line[1:306], collapse=" ")
# First 10 runs of non-ASCII characters after sorting in decreasing order;
# head() returns fewer elements instead of padding with NA when fewer than
# 10 runs exist
head(
  sort(unlist(str_extract_all(bts_trunc, "[^[:ascii:]]+")), decreasing = TRUE),
  10
)
## [1] "호르몬" "하루만" "쩔어" "진격의" "진" "지민" "제이홉"
## [8] "정호석" "정국" "전정국"
# 10 most frequent runs of non-ASCII characters; head() does not pad the
# result with NA when fewer than 10 distinct runs exist
head(
  sort(
    table(unlist(str_extract_all(bts_trunc, "[^[:ascii:]]+"))),
    decreasing = TRUE
  ),
  10
)
##
## <U+2013> · \\ <U+014D> ’ ”
## 36 11 5 4 2 2
## o 防<U+5F3E>少年<U+56E3> 방탄소년단 防彈少年團
## 2 2 2 2
# Replace every run of non-ASCII characters with a single space.
# Why replace them with " " instead of removing them? Removing a run would
# glue the text on its left and right together into one token.
bts_eng <- str_replace_all(
  string = bts_trunc,
  pattern = "[^[:ascii:]]+",
  replacement = " "
)
# Count every token that contains at least one digit
table(
  unlist(
    str_extract_all(
      string = bts_eng,
      pattern = "[[:word:]]*[[:digit:]]+[[:word:]]*"
    )
  )
)
##
## 000 1 10 100 101 102
## 24 17 4 12 1 1
## 103 104 105 106 107 108
## 1 1 2 2 1 1
## 109 10th 11 110 111 112
## 2 1 2 1 1 1
## 113 114 115 116 117 118
## 1 1 1 1 1 1
## 119 12 120 121 122 123
## 1 5 2 1 1 1
## 124 125 126 127 128 129
## 1 1 1 1 1 1
## 13 130 131 132 133 134
## 5 1 2 1 2 1
## 135 136 137 138 139 14
## 1 1 1 1 1 1
## 140 141 142 143 144 145
## 1 1 1 1 1 1
## 146 147 148 149 14th 15
## 1 1 1 1 1 3
## 150 151 152 153 154 155
## 1 1 1 1 1 1
## 156 157 158 159 16 160
## 1 1 1 1 3 1
## 161 162 163 164 165 166
## 1 1 1 1 1 1
## 167 168 169 17 170 171
## 1 1 1 1 1 2
## 172 173 174 175 176 177
## 7 1 1 1 1 1
## 178 179 17th 18 180 181
## 1 1 2 2 1 1
## 182 187 19 1992 1993 1994
## 1 1 1 1 1 2
## 1995 1997 1have 1st 2 20
## 2 1 1 1 14 4
## 200 2001 2010 2011 2012 2013
## 6 1 3 1 1 7
## 2013Melon 2014 2015 2016 2017 2018
## 1 16 15 22 23 9
## 21 22 23 24 25 25th
## 2 2 3 5 3 1
## 26 27 28 29 2nd 3
## 2 4 5 2 1 3
## 30 300 31 32 32nd 33
## 4 2 1 1 1 1
## 34 35 36 37 38 39
## 1 1 1 1 1 2
## 4 40 41 42 43 44
## 9 3 1 2 1 3
## 448 45 46 47 48 49
## 1 2 1 1 1 1
## 5 50 500 502 51 52
## 6 4 3 2 1 1
## 53 54 55 56 57 58
## 1 1 1 2 1 1
## 59 6 60 606 61 62
## 1 3 1 1 1 1
## 63 64 65 66 67 68
## 1 2 1 2 3 1
## 69 6th 7 70 700 71
## 1 1 4 1 1 1
## 72 73 74 75 76 77
## 1 1 1 1 2 1
## 78 79 8 80 81 82
## 1 1 6 1 1 1
## 83 84 85 86 87 88
## 1 1 4 1 1 1
## 89 9 90 91 92 93
## 1 6 1 1 1 1
## 94 95 96 97 98 99
## 1 1 1 1 1 1
## RUL8 Shibuya109 their2015
## 2 1 1
# 20 most frequent numbers, including one optional punctuation mark in front
# of the digits and one after (or inside) them; head() does not pad the
# result with NA when fewer than 20 distinct matches exist
head(
  sort(
    table(unlist(str_extract_all(bts_eng, "[[:punct:]]?[[:digit:]]+[[:punct:]]?[[:digit:]]*"))),
    decreasing = TRUE
  ),
  20
)
##
## 1 2017, 2 2017 (2014) (2016) [172] 2015 100 2014
## 12 10 9 9 7 7 7 7 6 6
## 2016 (2015) 2016, 2018 2013 8 1, 12, 2010 2015,
## 6 5 5 5 4 4 3 3 3 3
# Replace digits that are directly followed by letters (e.g. "10th",
# "2013Melon") with a space
bts_eng <- str_replace_all(bts_eng, "[[:digit:]]+[[:alpha:]]+", " ")
# Replace the remaining numbers, together with one optional punctuation mark
# before and after/inside them (e.g. "(2014)", "2017,", "[172]"), with a space
bts_eng_nonum <- str_replace_all(bts_eng, "[[:punct:]]?[[:digit:]]+[[:punct:]]?[[:digit:]]*", " ")
# Check that no digits are left. The 20-entry limit is applied to the sorted
# table (as in the other frequency checks), not to the raw matches before
# they are counted.
head(
  sort(
    table(unlist(str_extract_all(bts_eng_nonum, "[[:digit:]]+"))),
    decreasing = TRUE
  ),
  20
)
## integer(0)
# List every word followed by "'s " or "'S "; [sS] is used because [s|S]
# would also accept a literal "|" after the apostrophe
str_extract_all(bts_eng_nonum, "[[:alpha:]]+'[sS] ")
## [[1]]
## [1] "group's " "album's " "Korea's " "d's "
## [5] "YouTube's " "Korea's " "world's " "group's "
## [9] "group's " "MTV's " "onBillboard's " "inFuse's "
## [13] "MTV's " "Billboard's " "Oricon's " "group's "
## [17] "group's " "Melon's " "Billboard's " "Clark's "
## [21] "Year's " "show's " "group's " "V's "
## [25] "Hope's " "Suga's " "RM's " "group's "
## [29] "ALLETS's " "Let's " "school's " "Puma's "
## [33] "BTS's "
# Strip every "'s " / "'S " (apostrophe, s, space), leaving a single space;
# [sS] is used because [s|S] would also accept a literal "|"
bts_eng_nonum_noabbre <- str_replace_all(bts_eng_nonum, "'[sS] ", " ")
# Check: no word followed by "'s " should remain
str_extract_all(bts_eng_nonum_noabbre, "[[:alpha:]]+'[sS] ")
## [[1]]
## character(0)
# 50 most frequent pieces of punctuation, together with any word characters
# attached on either side; head() does not pad the result with NA when fewer
# than 50 distinct matches exist
head(
  sort(
    table(unlist(str_extract_all(bts_eng_nonum_noabbre, "[[:word:]]*[[:punct:]]+[[:word:]]*"))),
    decreasing = TRUE
  ),
  50
)
##
## , ) ( " K-pop : ;
## 24 16 15 13 13 9 8
## (Korean ), album, Awards, "DNA "Mic BTS'
## 7 7 7 7 6 6 6
## J-Hope RR: "No . Awards. copies. Life,
## 6 6 5 5 5 5 5
## month, single, year, - ", "school &
## 5 5 5 4 4 4 4
## ] chart. Dream" RM, Yourself: BTS-based BTS:
## 4 4 4 4 4 3 3
## chart, Chart, Chart. dancer, Day" December, II:
## 3 3 3 3 3 3 3
## Jimin, Jin, Korea, Life: show, Suga, tour,
## 3 3 3 3 3 3 3
## trilogy"
## 3
# Find " U.S" / " U.S." style abbreviations (either case, any punctuation
# mark in the middle, optional final dot); [uU] and [sS] are used because
# [u|U] and [s|S] would also accept a literal "|"
str_extract_all(bts_eng_nonum_noabbre, " [uU][[:punct:]][sS]\\.?")
## [[1]]
## [1] " U.S." " U.S."
# Find " R&B " in either case, surrounded by spaces; [rR] and [bB] are used
# because [r|R] and [b|B] would also accept a literal "|"
str_extract_all(bts_eng_nonum_noabbre, " [rR]&[bB] ")
## [[1]]
## [1] " R&B "
# Find " J-Hope " in any mix of upper and lower case, surrounded by spaces;
# classes such as [jJ] are used because [j|J] would also accept a literal "|"
str_extract_all(bts_eng_nonum_noabbre, " [jJ]-[hH][oO][pP][eE] ")
## [[1]]
## [1] " J-Hope " " J-Hope "
There are some exceptions we need to pre-process carefully
# Rewrite the terms that contain punctuation into plain words before all
# punctuation is removed. Each step must work on the result of the previous
# step: restarting from bts_eng_nonum_noabbre every time would discard the
# earlier replacements and keep only the last one.
# Character classes are written as [uU] rather than [u|U], which would also
# accept a literal "|".
bts_eng_nonum_noabbre_nopunct <- str_replace_all(bts_eng_nonum_noabbre, " [uU][[:punct:]][sS]\\.?", " usa")
bts_eng_nonum_noabbre_nopunct <- str_replace_all(bts_eng_nonum_noabbre_nopunct, " [rR]&[bB] ", " rnb ")
bts_eng_nonum_noabbre_nopunct <- str_replace_all(bts_eng_nonum_noabbre_nopunct, " [jJ]-[hH][oO][pP][eE] ", " jhope ")
Now we are ready to remove all punctuation…
# Count every distinct run of punctuation characters that is still present
table(
  unlist(
    str_extract_all(
      string = bts_eng_nonum_noabbre_nopunct,
      pattern = "[[:punct:]]+"
    )
  )
)
##
## ' - ! " ", ". & ( ) )" )", ),
## 24 65 6 120 14 8 6 35 23 2 1 8
## ). , . ." ... ...", / : ; ? ?, [
## 3 200 116 4 1 1 5 43 8 1 1 2
## ]
## 4
# Replace every run of punctuation with a single space
bts_eng_nonum_noabbre_nopunct <- str_replace_all(bts_eng_nonum_noabbre_nopunct, "[[:punct:]]+", " ")
# 10 most frequent runs of two or more whitespace characters. The matches
# are unlist()ed before counting, like in the other frequency checks, and
# head() does not pad the result with NA when fewer than 10 runs exist.
head(
  sort(
    table(unlist(str_extract_all(bts_eng_nonum_noabbre_nopunct, "[[:space:]]{2,}"))),
    decreasing = TRUE
  ),
  10
)
##
##
## 348
##
## 222
##
## 46
##
## 42
## \n
## 22
## \n
## 15
## \n
## 13
##
## 9
## \n
## 9
## \n
## 8
# Collapse every run of whitespace (spaces, newlines, ...) into one space
bts_eng_nonum_noabbre_nopunct_nospace <- str_replace_all(bts_eng_nonum_noabbre_nopunct, "[[:space:]]+", " ")
# 50 most frequent tokens that start with an upper-case letter followed by
# at least one more letter; head() does not pad the result with NA when
# fewer than 50 distinct tokens exist
head(
  sort(
    table(unlist(str_extract_all(bts_eng_nonum_noabbre_nopunct_nospace, "[[:upper:]][[:alpha:]]+"))),
    decreasing = TRUE
  ),
  50
)
##
## BTS The Korean Awards Billboard Music Episode
## 66 47 29 23 20 20 16
## Chart In Japanese Korea Bangtan Gaon Japan
## 14 14 14 13 12 11 11
## Life They Year Beautiful December Moment Most
## 11 11 11 10 10 10 10
## Wings Artist On Seoul South That DNA
## 10 9 9 9 9 9 8
## June Live Love Mnet Suga World Big
## 8 8 8 8 8 8 7
## March October Trilogy Yourself YouTube Album Albums
## 7 7 7 7 7 6 6
## Asian Boy Drop Hot Jimin May Mic
## 6 6 6 6 6 6 6
## Part
## 6
# 20 most frequent tokens that start with an upper-case letter followed by
# at least one more letter; head() does not pad the result with NA when
# fewer than 20 distinct tokens exist
head(
  sort(
    table(unlist(str_extract_all(bts_eng_nonum_noabbre_nopunct_nospace, "[[:upper:]][[:alpha:]]+"))),
    decreasing = TRUE
  ),
  20
)
##
## BTS The Korean Awards Billboard Music Episode
## 66 47 29 23 20 20 16
## Chart In Japanese Korea Bangtan Gaon Japan
## 14 14 14 13 12 11 11
## Life They Year Beautiful December Moment
## 11 11 11 10 10 10
# 50 most frequent tokens that start with an upper-case letter followed by
# at least one more letter; head() does not pad the result with NA when
# fewer than 50 distinct tokens exist
head(
  sort(
    table(unlist(str_extract_all(bts_eng_nonum_noabbre_nopunct_nospace, "[[:upper:]][[:alpha:]]+"))),
    decreasing = TRUE
  ),
  50
)
##
## BTS The Korean Awards Billboard Music Episode
## 66 47 29 23 20 20 16
## Chart In Japanese Korea Bangtan Gaon Japan
## 14 14 14 13 12 11 11
## Life They Year Beautiful December Moment Most
## 11 11 11 10 10 10 10
## Wings Artist On Seoul South That DNA
## 10 9 9 9 9 9 8
## June Live Love Mnet Suga World Big
## 8 8 8 8 8 8 7
## March October Trilogy Yourself YouTube Album Albums
## 7 7 7 7 7 6 6
## Asian Boy Drop Hot Jimin May Mic
## 6 6 6 6 6 6 6
## Part
## 6
It seems to me there is no special upper-case letter issue here…
# Lower-case the whole text so that "The" and "the" count as the same word
bts_eng_nonum_noabbre_nopunct_nospace_tolower <- str_to_lower(bts_eng_nonum_noabbre_nopunct_nospace)
# Tokenize by splitting on single spaces
bts_words <- unlist(str_split(bts_eng_nonum_noabbre_nopunct_nospace_tolower, " "))
# A space at the very start or end of the text makes str_split() return an
# empty string there; drop such empty tokens so they are not counted as words
bts_words <- bts_words[nzchar(bts_words)]
# Alternative tokenization using stringr's word boundaries
bts_words_b <- unlist(str_extract_all(bts_eng_nonum_noabbre_nopunct_nospace_tolower, boundary("word")))
# 50 most frequent words; head() does not pad the result with NA when fewer
# than 50 distinct words exist
head(sort(table(bts_words), decreasing = TRUE), 50)
## bts_words
## the and in on bts of to
## 262 100 86 82 67 63 62
## for their at a first group that
## 54 48 42 40 38 36 34
## as music album korean chart they with
## 33 32 30 29 28 28 27
## awards number was by year over also
## 26 26 26 22 22 20 19
## artist released billboard most single episode pop
## 18 18 17 17 17 16 16
## an k were copies japanese albums korea
## 15 15 15 14 14 13 13
## later bangtan from song gaon its life
## 13 12 12 12 11 11 11
## million
## 11
# Install "tm" only when it is missing, so that re-running the script does
# not reinstall the package every time
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm) # Provides stopwords() and stemDocument() used below
## Loading required package: NLP
# "en" dictionary
head(stopwords(kind = "en"), n = 10) # First 10 entries of the "en" list
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
length(stopwords(kind = "en")) # Number of entries in the "en" list
## [1] 174
# "SMART" dictionary
head(stopwords(kind = "SMART"), n = 10) # First 10 entries of the "SMART" list
## [1] "a" "a's" "able" "about" "above"
## [6] "according" "accordingly" "across" "actually" "after"
length(stopwords(kind = "SMART")) # Number of entries in the "SMART" list
## [1] 571
Let’s use the “SMART” stopwords dictionary
# Keep only the words that are not in the "SMART" stop-word list
bts_words_nostop <- bts_words[!(bts_words %in% stopwords("SMART"))]
# 50 most frequent remaining words; head() does not pad the result with NA
# when fewer than 50 distinct words exist
head(sort(table(bts_words_nostop), decreasing = TRUE), 50)
## bts_words_nostop
## bts group music album korean
## 67 36 32 30 29
## chart awards number year artist
## 28 26 26 22 18
## released billboard single episode pop
## 18 17 17 16 16
## copies japanese albums korea bangtan
## 14 14 13 13 12
## song gaon life million peaked
## 12 11 11 11 11
## top trilogy world beautiful december
## 11 11 11 10 10
## hit japan live moment part
## 10 10 10 10 10
## social wings boy lead members
## 10 10 9 9 9
## school seoul south tour debut
## 9 9 9 9 8
## dna june love representing selling
## 8 8 8 8 8
# Toy "stemmer": replace the be-verb forms am/are/is/was/were/be with "be".
#
# Args:
#   text:    character vector to normalise.
#   stemmer: unused; kept (now optional) so that existing calls stay valid.
#   stemmed: unused; kept (now optional) so that existing calls stay valid.
#
# Returns:
#   A character vector of the same length as `text` in which every
#   lower-case be-verb form that stands as a whole word is replaced by "be".
stemmer.func <- function(text, stemmer = NULL, stemmed = NULL) {
  # \\b on both sides matches whole words only. Unlike a pattern that
  # requires a trailing space, this also catches a verb that is followed by
  # punctuation or ends the string ("Here it is."), while words that merely
  # contain one of the forms ("this", "area") are left alone.
  gsub("\\b(?:am|are|is|was|were|be)\\b", "be", text, perl = TRUE)
}
# Demo sentence for stemmer.func()
text <- "I was a boy. Now I am a man. She was a girl. We were friends."
# Only `text` is passed; the other two parameters are not supplied
text.stem <- stemmer.func(text = text)
text # The original sentence
## [1] "I was a boy. Now I am a man. She was a girl. We were friends."
text.stem # The sentence returned by stemmer.func()
## [1] "I be a boy. Now I be a man. She be a girl. We be friends."
Let’s take a look at our word vector:
# All words that contain "release", selected with a logical index
bts_words_nostop[
  str_detect(string = bts_words_nostop, pattern = "release")
]
## [1] "release" "released" "released" "released" "released" "released"
## [7] "released" "released" "released" "released" "release" "released"
## [13] "released" "released" "released" "released" "release" "released"
## [19] "released" "released" "released"
# The same selection, this time with the integer positions from str_which()
bts_words_nostop[
  str_which(string = bts_words_nostop, pattern = "release")
]
## [1] "release" "released" "released" "released" "released" "released"
## [7] "released" "released" "released" "released" "release" "released"
## [13] "released" "released" "released" "released" "release" "released"
## [19] "released" "released" "released"
# Stem every word in the stop-word-free vector
bts_words_nostop_stemmed <- stemDocument(bts_words_nostop)
# Look for stemmed words that still contain the full string "release"
bts_words_nostop_stemmed[
  str_which(string = bts_words_nostop_stemmed, pattern = "release")
]
## character(0)
# Stemmed words that contain "releas"
bts_words_nostop_stemmed[
  str_which(string = bts_words_nostop_stemmed, pattern = "releas")
]
## [1] "releas" "releas" "releas" "releas" "releas" "releas" "releas"
## [8] "releas" "releas" "releas" "releas" "releas" "releas" "releas"
## [15] "releas" "releas" "releas" "releas" "releas" "releas" "releas"
# 50 most frequent stemmed words; head() does not pad the result with NA
# when fewer than 50 distinct stems exist
head(sort(table(bts_words_nostop_stemmed), decreasing = TRUE), 50)
## bts_words_nostop_stemmed
## bts album group chart award music korean
## 67 43 37 34 33 33 29
## number year singl artist releas billboard episod
## 26 26 24 23 21 17 17
## top pop song copi japanes korea peak
## 17 16 15 14 14 13 13
## tour bangtan boy show gaon hit life
## 13 12 12 12 11 11 11
## member million trilog world beauti debut decemb
## 11 11 11 11 10 10 10
## donat japan live moment part sell social
## 10 10 10 10 10 10 10
## week wing lead month record repres school
## 10 10 9 9 9 9 9
## seoul
## 9
library(wordcloud)
## Loading required package: RColorBrewer
# Frequency tables, most frequent word first, for the three token vectors:
# all words, words without stop words, and stemmed words without stop words
bts_words_freq <- sort(table(bts_words), decreasing = TRUE)
bts_words_nostop_freq <- sort(table(bts_words_nostop), decreasing = TRUE)
bts_words_nostop_stemmed_freq <- sort(table(bts_words_nostop_stemmed), decreasing = TRUE)
# "Dark2" has at most 8 colors; asking for 10 only triggers a warning and
# still returns the 8-color palette, so request 8 directly
pal <- brewer.pal(8, "Dark2")
## Warning in brewer.pal(10, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
# Draw one word cloud with the settings shared by all three plots.
#
# Args:
#   word_freq:    named frequency table (the names are the words), e.g. the
#                 result of sort(table(...)).
#   cloud_colors: vector of colors used for the words.
#
# Returns:
#   Whatever wordcloud() returns; the function is called for its plot.
plot_word_cloud <- function(word_freq, cloud_colors) {
  wordcloud(
    words = names(word_freq), # Words to plot
    freq = word_freq, # Frequency of words
    min.freq = 5, # Minimum frequency of words plotted
    max.words = 1000, # 1000 words in frequency order plotted
    random.order = FALSE, # Highly frequent words placed in the middle
    rot.per = 0.1, # Rate of words rotated in plot
    scale = c(4, 0.3), # Range of words in size
    colors = cloud_colors # Word colors
  )
}

# Seed the random number generator once, before the first plot
set.seed(412)
# NOTE(review): the assigned values are kept for compatibility; wordcloud()
# presumably returns nothing useful (it is called for its plot). Confirm
# before relying on these variables.
bts_word_cloud <- plot_word_cloud(bts_words_freq, pal)
bts_word_nostop_cloud <- plot_word_cloud(bts_words_nostop_freq, pal)
bts_word_nostop_stemmed_cloud <- plot_word_cloud(bts_words_nostop_stemmed_freq, pal)