창세기 부분 불러오기 및 stop word 제거

# Location of the plain-text King James Bible
url <- "https://www.o-bible.com/download/kjv.txt"

# Download the file; each line becomes one element of a character vector
bible <- readLines(con = url)

# Preview the first six lines
head(bible, n = 6L)
## [1] "Holy Bible, Authorized (King James) Version, Textfile 930105."                                                                                       
## [2] "Ge1:1 In the beginning God created the heaven and the earth."                                                                                        
## [3] "Ge1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters."
## [4] "Ge1:3 And God said, Let there be light: and there was light."                                                                                        
## [5] "Ge1:4 And God saw the light, that it was good: and God divided the light from the darkness."                                                         
## [6] "Ge1:5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day."
# Keep only the lines whose text begins with "Ge" (the Genesis verses)
genesis <- bible[grepl("^Ge", bible)]

# Preview the first six selected lines
head(genesis, n = 6L)
## [1] "Ge1:1 In the beginning God created the heaven and the earth."                                                                                        
## [2] "Ge1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters."
## [3] "Ge1:3 And God said, Let there be light: and there was light."                                                                                        
## [4] "Ge1:4 And God saw the light, that it was good: and God divided the light from the darkness."                                                         
## [5] "Ge1:5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day."                           
## [6] "Ge1:6 And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters."

Let’s find the 20 most frequent words in Genesis.

# Strip the leading verse reference (e.g. "Ge1:1 ") from every line, then
# combine the verses into a single string. Without this step the punctuation
# removal below would turn each reference into a bogus token such as "ge11".
genesis_text <- paste(
  sub("^Ge[0-9]+:[0-9]+[[:space:]]*", "", genesis),
  collapse = " "
)

# Convert to lowercase for case-insensitive comparison
genesis_text <- tolower(genesis_text)

# Remove punctuation
genesis_text <- gsub("[[:punct:]]", "", genesis_text)

# Tokenize on runs of whitespace; strsplit() returns a list, so flatten it
words <- unlist(strsplit(genesis_text, "\\s+"))

# Remove empty strings
words <- words[nzchar(words)]

# English stop words to exclude from the word counts, laid out one initial
# letter per group, followed by a few archaic KJV words.
stop <- c(
  # a
  "a", "about", "above", "across", "after", "again", "against", "all",
  "almost", "alone", "along", "already", "also", "although", "always", "am",
  "among", "an", "and", "another", "any", "anybody", "anyone", "anything",
  "anywhere", "are", "area", "areas", "aren't", "around", "as", "ask",
  "asked", "asking", "asks", "at", "away",
  # b
  "b", "back", "backed", "backing", "backs", "be", "became", "because",
  "become", "becomes", "been", "before", "began", "behind", "being",
  "beings", "below", "best", "better", "between", "big", "both", "but", "by",
  # c
  "c", "came", "can", "cannot", "can't", "case", "cases", "certain",
  "certainly", "clear", "clearly", "come", "could", "couldn't",
  # d
  "d", "did", "didn't", "differ", "different", "differently", "do", "does",
  "doesn't", "doing", "done", "don't", "down", "downed", "downing", "downs",
  "during",
  # e
  "e", "each", "early", "either", "end", "ended", "ending", "ends", "enough",
  "even", "evenly", "ever", "every", "everybody", "everyone", "everything",
  "everywhere",
  # f
  "f", "face", "faces", "fact", "facts", "far", "felt", "few", "find",
  "finds", "first", "for", "four", "from", "full", "fully", "further",
  "furthered", "furthering", "furthers",
  # g
  "g", "gave", "general", "generally", "get", "gets", "give", "given",
  "gives", "go", "going", "good", "goods", "got", "great", "greater",
  "greatest", "group", "grouped", "grouping", "groups",
  # h
  "h", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he",
  "he'd", "he'll", "her", "here", "here's", "hers", "herself", "he's",
  "high", "higher", "highest", "him", "himself", "his", "how", "however",
  "how's",
  # i
  "i", "i'd", "if", "i'll", "i'm", "important", "in", "interest",
  "interested", "interesting", "interests", "into", "is", "isn't", "it",
  "its", "it's", "itself", "i've",
  # j
  "j", "just",
  # k
  "k", "keep", "keeps", "kind", "knew", "know", "known", "knows",
  # l
  "l", "large", "largely", "last", "later", "latest", "least", "less", "let",
  "lets", "let's", "like", "likely", "long", "longer", "longest",
  # m
  "m", "made", "make", "making", "man", "many", "may", "me", "member",
  "members", "men", "might", "more", "most", "mostly", "mr", "mrs", "much",
  "must", "mustn't", "my", "myself",
  # n
  "n", "necessary", "need", "needed", "needing", "needs", "never", "new",
  "newer", "newest", "next", "no", "nobody", "non", "noone", "nor", "not",
  "nothing", "now", "nowhere", "number", "numbers",
  # o
  "o", "of", "off", "often", "old", "older", "oldest", "on", "once", "one",
  "only", "open", "opened", "opening", "opens", "or", "order", "ordered",
  "ordering", "orders", "other", "others", "ought", "our", "ours",
  "ourselves", "out", "over", "own",
  # p
  "p", "part", "parted", "parting", "parts", "per", "perhaps", "place",
  "places", "point", "pointed", "pointing", "points", "possible", "present",
  "presented", "presenting", "presents", "problem", "problems", "put", "puts",
  # q
  "q", "quite",
  # r
  "r", "rather", "really", "right", "room", "rooms",
  # s
  "s", "said", "same", "saw", "say", "says", "second", "seconds", "see",
  "seem", "seemed", "seeming", "seems", "sees", "several", "shall", "shan't",
  "she", "she'd", "she'll", "she's", "should", "shouldn't", "show", "showed",
  "showing", "shows", "side", "sides", "since", "small", "smaller",
  "smallest", "so", "some", "somebody", "someone", "something", "somewhere",
  "state", "states", "still", "such", "sure",
  # t
  "t", "take", "taken", "than", "that", "that's", "the", "their", "theirs",
  "them", "themselves", "then", "there", "therefore", "there's", "these",
  "they", "they'd", "they'll", "they're", "they've", "thing", "things",
  "think", "thinks", "this", "those", "though", "thought", "thoughts",
  "three", "through", "thus", "to", "today", "together", "too", "took",
  "toward", "turn", "turned", "turning", "turns", "two",
  # u
  "u", "under", "until", "up", "upon", "us", "use", "used", "uses",
  # v
  "v", "very",
  # w
  "w", "want", "wanted", "wanting", "wants", "was", "wasn't", "way", "ways",
  "we", "we'd", "well", "we'll", "wells", "went", "were", "we're", "weren't",
  "we've", "what", "what's", "when", "when's", "where", "where's", "whether",
  "which", "while", "who", "whole", "whom", "who's", "whose", "why", "why's",
  "will", "with", "within", "without", "won't", "work", "worked", "working",
  "works", "would", "wouldn't",
  # x
  "x",
  # y
  "y", "year", "years", "yes", "yet", "you", "you'd", "you'll", "young",
  "younger", "youngest", "your", "you're", "yours", "yourself", "yourselves",
  "you've",
  # z
  "z",
  # archaic words
  "unto", "thou", "thy", "thee"
)

stop word를 제거한 벡터를 만든 후 계속 진행하자.

# Remove stop words from words
words <- words[!(words %in% stop)]

# Count how often each remaining word occurs
word_freq <- table(words)

# Sort the frequency table in descending order
sorted_word_freq <- sort(word_freq, decreasing = TRUE)

# Display the most frequent 20 words. The index is capped at the table length
# so that no NA entries are printed when fewer than 20 distinct words remain.
print(sorted_word_freq[seq_len(min(20L, length(sorted_word_freq)))])
## words
##      god     lord     land   father    jacob     sons      son   joseph 
##      230      206      187      169      166      158      148      138 
##  abraham    earth   behold     name     wife   called       ye     hand 
##      121      121      118      101      101       98       96       88 
##    house     pass  brother brethren 
##       86       82       81       80

빈도가 높은 단어 20개 추출 및 데이터 프레임으로 변환

# The 20 most frequent words, in descending order of frequency
word <- c(
  "god", "lord", "land", "father", "jacob",
  "sons", "son", "joseph", "abraham", "earth",
  "behold", "name", "wife", "called", "ye",
  "hand", "house", "pass", "brother", "brethren"
)

# Matching occurrence counts, in the same order as `word`
Freq <- c(
  230, 206, 187, 169, 166,
  158, 148, 138, 121, 121,
  118, 101, 101, 98, 96,
  88, 86, 82, 81, 80
)

# One row per word; column names are taken from the variable names
top20 <- data.frame(word, Freq)

빈도 막대 그래프

# Words sorted by ascending frequency; used as the axis order so that, after
# coord_flip(), the most frequent word is drawn at the top of the chart
order <- arrange(top20, Freq)$word

# Horizontal bar chart of the top-20 frequencies with the count printed on
# each bar. `limits` is spelled out in full instead of relying on partial
# argument matching of `limit`.
ggplot(data = top20, aes(x = word, y = Freq)) +
  ylim(0, 250) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order) +
  geom_text(aes(label = Freq), hjust = 0.1)

워드클라우드 생성

# Convert the sorted frequency table to a data frame; the columns used below
# are `words` and `Freq`
df_words <- as.data.frame(sorted_word_freq)

# Color list: shades 5 to 9 of the 9-class "Blues" palette
pal <- brewer.pal(9, "Blues")[5:9]

# Fix the random seed so the word layout is reproducible
set.seed(1234)

# Draw the word cloud. `FALSE` is written out because `F` is an ordinary
# variable that can be reassigned.
wordcloud(
  words = df_words$words,
  freq = df_words$Freq,
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0,
  scale = c(5, 0.3),
  colors = pal
)
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : commanded could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : generations could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : answered could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : firstborn could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : country could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : ishmael could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : peradventure could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : sarai could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : asses could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : circumcised could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : destroy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : gathered could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : shechem could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : benjamin could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : dreamed could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : egyptians could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : forty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : garden could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : nahor could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : pharaohs could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : thereof could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : abrahams could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : blessing could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : buried could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : heth could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : neither could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : maid could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df_words$words, freq = df_words$Freq, min.freq =
## 10, : sacks could not be fit on page. It will not be plotted.