library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Input locations: the Bible text is addressed by an HTTPS URL, the hip-hop
# text by an absolute path on the local disk.
# NOTE(review): `url` shares its name with base::url(); function calls still
# resolve to the function, but the name is easy to misread.
url <- "https://www.o-bible.com/download/kjv.txt"
path <- "C:/Users/chosun/Downloads/hiphop.txt"

# Read each source into a character vector with one element per line.
bible <- readLines(con = url)
hiphop <- readLines(con = path)

# Show the first six lines of the Bible text.
head(bible, n = 6L)
## [1] "Holy Bible, Authorized (King James) Version, Textfile 930105."
## [2] "Ge1:1 In the beginning God created the heaven and the earth."
## [3] "Ge1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters."
## [4] "Ge1:3 And God said, Let there be light: and there was light."
## [5] "Ge1:4 And God saw the light, that it was good: and God divided the light from the darkness."
## [6] "Ge1:5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day."
# Keep only the lines of `bible` that start with "Ge" (their references look
# like "Ge1:1" in the output below), then show the first six of them.
genesis <- bible[grepl("^Ge", bible)]
head(genesis, n = 6L)
## [1] "Ge1:1 In the beginning God created the heaven and the earth."
## [2] "Ge1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters."
## [3] "Ge1:3 And God said, Let there be light: and there was light."
## [4] "Ge1:4 And God saw the light, that it was good: and God divided the light from the darkness."
## [5] "Ge1:5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day."
## [6] "Ge1:6 And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters."
# Word-frequency table for the Genesis lines, using base R only.
#
# Every element of `genesis` begins with a verse reference such as "Ge1:1".
# Remove it first: once punctuation is stripped the reference would become a
# pseudo-word ("ge11") and be counted alongside the real vocabulary. Lines
# that do not match the reference pattern are left unchanged by sub().
genesis_text <- sub("^Ge[0-9]+:[0-9]+[[:space:]]+", "", genesis)

# Normalise: join the verses into one lower-case string without punctuation.
genesis_text <- paste(genesis_text, collapse = " ")
genesis_text <- tolower(genesis_text)
genesis_text <- gsub("[[:punct:]]", "", genesis_text)

# Tokenise on runs of whitespace and discard any empty tokens.
words <- unlist(strsplit(genesis_text, "\\s+"))
words <- words[nzchar(words)]

# Count each distinct word and order the counts from most to least frequent.
word_freq <- table(words)
sorted_word_freq <- sort(word_freq, decreasing = TRUE)

# head() returns at most 20 entries, whereas `[1:20]` pads the result with NA
# when there are fewer than 20 distinct words.
print(head(sorted_word_freq, 20))
## words
## and the of his he to in unto that i said him my a for was
## 3678 2458 1365 653 652 612 600 598 521 484 478 402 343 341 326 317
## it with me thou
## 306 293 292 284
library(tm)
## Warning: 패키지 'tm'는 R 버전 4.3.2에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: NLP
# Word-frequency ranking for the Genesis lines using tm, with English stop
# words removed.
#
# The corpus holds one document per verse. Building it from the individual
# word tokens would create one document per word, and the as.matrix() call
# below would then materialise a dense (number of words) x (number of terms)
# matrix only to sum its columns. Per-term totals are sums over documents, so
# grouping the words by verse does not change them.
# NOTE(review): the order of terms with equal counts may differ from the
# per-word corpus; re-run and compare with the recorded output to confirm.
#
# The leading verse reference (e.g. "Ge1:1") is not part of the verse text, so
# it is stripped before the text enters the corpus.
#
# `genesis_text` and `words` from the base-R section above are left untouched;
# this section no longer needs them.
corpus <- Corpus(
  VectorSource(sub("^Ge[0-9]+:[0-9]+[[:space:]]+", "", genesis))
)

# Normalisation is done by tm: lower-case, strip punctuation and digits, then
# drop English stop words.
# NOTE(review): the recorded run printed a "transformation drops documents"
# warning after each tm_map() call; confirm whether it still appears and
# whether it is benign for this corpus.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))

# Rows are documents (verses) and columns are terms, so the column sums are
# the per-term totals.
dtm <- DocumentTermMatrix(corpus)
matrix <- as.matrix(dtm)
word_freq <- colSums(matrix)
sorted_word_freq <- sort(word_freq, decreasing = TRUE)

# head() returns at most 20 entries, whereas `[1:20]` pads the result with NA
# when there are fewer than 20 distinct terms.
print(head(sorted_word_freq, 20))
## unto said thou thy thee shall god lord will land
## 598 478 284 279 268 259 230 206 195 187
## came father jacob sons son upon joseph earth abraham behold
## 176 169 166 158 148 141 138 121 121 118