Loading required package: qdapDictionaries
Loading required package: qdapRegex
Attaching package: 'qdapRegex'
The following object is masked from 'package:dplyr':
explain
The following object is masked from 'package:ggplot2':
%+%
Loading required package: qdapTools
Attaching package: 'qdapTools'
The following object is masked from 'package:dplyr':
id
Loading required package: RColorBrewer
Attaching package: 'qdap'
The following objects are masked from 'package:base':
Filter, proportions
# Load the KJV of the Holy Bible from the local HTML file.
# The path is handed to read_lines() directly; the first 357 lines are
# skipped and empty rows are dropped.
file_html <- "10-h.htm"
holy_bible <- read_lines(file_html, skip = 357, skip_empty_rows = TRUE)
# Convert the character encoding to ASCII, then remove all bracketed spans
# (which takes the markup tags with it).
# NOTE(review): without a `sub` argument iconv() returns NA for any line that
# holds a non-convertible byte -- confirm no lines are lost this way.
holy_bible <- holy_bible %>%
  iconv("UTF-8", "ASCII") %>%
  bracketX(bracket = "all")
# Remove digits representing chapters and verses
holy_bible <- gsub("[[:digit:]]", "", holy_bible)
# Remove white spaces
holy_bible <- rm_white(holy_bible)
# Extract the book of Genesis: every line strictly between the Genesis title
# and the Exodus title. match() yields the first occurrence only, so a title
# that appears more than once cannot feed a length-2 vector into `:`.
genesis_start <- match("The First Book of Moses: Called Genesis", holy_bible)
genesis_end <- match("The Second Book of Moses: Called Exodus", holy_bible)
# Fail early if a title is missing or the two titles are adjacent/reversed.
stopifnot(
  !is.na(genesis_start),
  !is.na(genesis_end),
  genesis_end > genesis_start + 1
)
genesis <- holy_bible[(genesis_start + 1):(genesis_end - 1)]
# Create a tibble with one row per line of text
genesis_df <- tibble(line = seq_along(genesis), text = genesis)
# Tokenization: one word per row
genesis_tidy <- genesis_df %>% unnest_tokens(word, text)
# Stop words removal; the join key is named explicitly, as in the
# archaic-word removal further down.
data("stop_words")
genesis_tidy <- genesis_tidy %>% anti_join(stop_words, by = "word")
# Frequency of each remaining word, most frequent first
count(genesis_tidy, word, sort = TRUE)
## # A tibble: 2,117 x 2
## word n
## <chr> <int>
## 1 thou 284
## 2 thy 278
## 3 thee 268
## 4 rsquo 264
## 5 god 233
## 6 lord 211
## 7 father 201
## 8 land 187
## 9 jacob 181
## 10 son 160
## # ... with 2,107 more rows
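Archaic words such as thou, thy and thee should also be removed: they function as stop words, but the standard stop-word list does not contain them. The token rsquo, which is not a word at all, is removed along with them.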
# Custom stop-word table: archaic forms that survive the standard stop-word
# removal, plus the stray token "rsquo" (presumably residue of the HTML
# entity &rsquo; -- TODO confirm).
archaic_words <- tibble(
  word = c(
    "ye", "thee", "hast", "hath", "thou", "thy",
    "rsquo", "shalt", "art", "thine", "wilt",
    "didst", "saidst", "goest", "camest",
    "comest", "doest", "doth", "seest", "dost",
    "hadst", "cometh", "saith", "shew", "goeth",
    "thyself", "wast", "yea", "nay", "wouldest",
    "mayest", "doeth", "shouldest", "thereof",
    "whatsoever", "wherefore", "canst", "forthwith",
    "putteth", "sayest", "wheresoever", "whosoever",
    "thereon", "wherewith", "whomsoever", "couldest"
  )
)
# Drop every token that appears in the custom table
genesis_tidy <- genesis_tidy %>% anti_join(archaic_words, by = "word")
# Words frequency after the archaic words have been removed.
# The same call used to be issued twice in a row with identical output;
# it is evaluated once here.
genesis_tidy %>% count(word, sort = TRUE)
## # A tibble: 2,076 x 2
## word n
## <chr> <int>
## 1 god 233
## 2 lord 211
## 3 father 201
## 4 land 187
## 5 jacob 181
## 6 son 160
## 7 joseph 157
## 8 sons 146
## 9 abraham 134
## 10 earth 121
## # ... with 2,066 more rows
# Visualization: horizontal bar chart of the words that occur more than
# 100 times. reorder() orders the `word` levels by `n`, and coord_flip()
# puts the words on the vertical axis.
genesis_tidy %>%
  count(word, sort = TRUE) %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "green", show.legend = FALSE) +
  xlab(NULL) +
  coord_flip()
# The message below was fused onto the coord_flip() line, which made the
# statement unparseable. It is console output, not code; the call that
# produced it is not shown in this part of the file.
## Joining, by = "word"
# A tibble: 1,684 x 2
word n
<chr> <int>
1 lord 406
2 moses 290
3 people 174
4 israel 169
5 children 139
6 god 123
7 egypt 122
8 land 122
9 aaron 116
10 pharaoh 115
# ... with 1,674 more rows
Joining, by = "word"
# A tibble: 1,138 x 2
word n
<chr> <int>
1 lord 313
2 offering 249
3 priest 185
4 unclean 110
5 holy 94
6 sin 89
7 blood 88
8 altar 87
9 moses 86
10 burnt 84
# ... with 1,128 more rows
Joining, by = "word"
# A tibble: 1,727 x 2
word n
<chr> <int>
1 lord 403
2 children 263
3 israel 236
4 moses 234
5 offering 234
6 son 148
7 congregation 134
8 land 115
9 tabernacle 107
10 aaron 101
# ... with 1,717 more rows
Joining, by = "word"
# A tibble: 1,697 x 2
word n
<chr> <int>
1 lord 553
2 god 357
3 land 187
4 day 103
5 people 99
6 israel 72
7 children 71
8 hand 68
9 eat 65
10 possess 52
# ... with 1,687 more rows
Joining, by = "word"
# A tibble: 1,325 x 2
word n
<chr> <int>
1 lord 228
2 children 192
3 joshua 168
4 israel 156
5 cities 91
6 land 87
7 king 84
8 people 76
9 god 73
10 tribe 72
# ... with 1,315 more rows
Joining, by = "word"
# A tibble: 1,345 x 2
word n
<chr> <int>
1 israel 184
2 lord 181
3 children 131
4 house 71
5 people 69
6 hand 65
7 god 63
8 son 53
9 father 49
10 city 48
# ... with 1,335 more rows
Joining, by = "word"
# A tibble: 303 x 2
word n
<chr> <int>
1 naomi 21
2 boaz 20
3 law 19
4 lord 19
5 kinsman 13
6 daughter 12
7 mother 12
8 ruth 12
9 people 10
10 begat 9
# ... with 293 more rows
Joining, by = "word"
# A tibble: 1,556 x 2
word n
<chr> <int>
1 lord 350
2 saul 297
3 david 292
4 israel 146
5 samuel 131
6 philistines 120
7 people 111
8 hand 102
9 day 97
10 god 96
# ... with 1,546 more rows
Warning in (which(holy_bible == "The Second Book of Samuel") + 1):
(which(holy_bible == : numerical expression has 2 elements: only the first used
Joining, by = "word"
# A tibble: 1,556 x 2
word n
<chr> <int>
1 lord 350
2 saul 297
3 david 292
4 israel 146
5 samuel 132
6 philistines 120
7 people 111
8 hand 102
9 day 97
10 god 96
# ... with 1,546 more rows
Warning in (which(holy_bible == "The First Book of the Kings") + 1):
(which(holy_bible == : numerical expression has 2 elements: only the first used
Warning in (which(holy_bible == "The First Book of the Kings") + 1):
(which(holy_bible == : numerical expression has 2 elements: only the first used
Joining, by = "word"
# A tibble: 1,556 x 2
word n
<chr> <int>
1 lord 350
2 saul 297
3 david 292
4 israel 146
5 samuel 132
6 philistines 120
7 people 111
8 hand 102
9 day 97
10 god 96
# ... with 1,546 more rows
Warning in (which(holy_bible == "The Second Book of the Kings") + 1):
(which(holy_bible == : numerical expression has 2 elements: only the first used
Joining, by = "word"
# A tibble: 2,915 x 2
word n
<chr> <int>
1 king 903
2 lord 783
3 israel 480
4 son 412
5 house 411
6 david 382
7 god 249
8 people 242
9 judah 167
10 solomon 165
# ... with 2,905 more rows
Joining, by = "word"
# A tibble: 2,188 x 2
word n
<chr> <int>
1 son 324
2 sons 308
3 david 190
4 lord 178
5 god 114
6 israel 114
7 house 106
8 begat 86
9 brethren 77
10 thousand 75
# ... with 2,178 more rows
Joining, by = "word"
# A tibble: 1,690 x 2
word n
<chr> <int>
1 lord 390
2 king 258
3 house 202
4 god 188
5 israel 187
6 judah 161
7 jerusalem 127
8 people 111
9 son 104
10 solomon 85
# ... with 1,680 more rows
Joining, by = "word"
# A tibble: 842 x 2
word n
<chr> <int>
1 children 113
2 god 96
3 king 68
4 house 65
5 hundred 60
6 jerusalem 48
7 sons 48
8 son 44
9 israel 40
10 lord 38
# ... with 832 more rows
Joining, by = "word"
# A tibble: 1,184 x 2
word n
<chr> <int>
1 son 117
2 children 114
3 god 75
4 people 50
5 hundred 49
6 house 45
7 levites 43
8 jerusalem 38
9 king 38
10 day 36
# ... with 1,174 more rows
Joining, by = "word"
# A tibble: 552 x 2
word n
<chr> <int>
1 king 195
2 mordecai 58
3 esther 56
4 haman 53
5 jews 45
6 day 37
7 ahasuerus 29
8 people 27
9 queen 27
10 house 26
# ... with 542 more rows
Joining, by = "word"
# A tibble: 1,890 x 2
word n
<chr> <int>
1 god 117
2 job 56
3 earth 50
4 mine 50
5 behold 48
6 hand 45
7 words 39
8 wicked 37
9 mouth 36
10 days 33
# ... with 1,880 more rows
Joining, by = "word"
# A tibble: 2,535 x 2
word n
<chr> <int>
1 lord 787
2 god 439
3 praise 158
4 earth 141
5 mine 135
6 soul 132
7 people 130
8 heart 125
9 mercy 100
10 hand 97
# ... with 2,525 more rows
Joining, by = "word"
# A tibble: 1,548 x 2
word n
<chr> <int>
1 wicked 89
2 lord 87
3 heart 81
4 wise 66
5 righteous 54
6 understanding 54
7 wisdom 54
8 mouth 51
9 evil 50
10 son 45
# ... with 1,538 more rows
Joining, by = "word"
# A tibble: 634 x 2
word n
<chr> <int>
1 god 41
2 heart 40
3 time 39
4 sun 35
5 vanity 33
6 wisdom 28
7 wise 25
8 labour 23
9 evil 22
10 days 20
# ... with 624 more rows
Joining, by = "word"
# A tibble: 470 x 2
word n
<chr> <int>
1 beloved 34
2 love 26
3 fair 11
4 daughters 10
5 behold 9
6 breasts 8
7 eyes 8
8 jerusalem 8
9 myrrh 8
10 lebanon 7
# ... with 460 more rows
Joining, by = "word"
# A tibble: 2,745 x 2
word n
<chr> <int>
1 lord 483
2 people 138
3 god 136
4 behold 104
5 earth 102
6 day 98
7 israel 92
8 hand 82
9 land 81
10 hosts 62
# ... with 2,735 more rows
Joining, by = "word"
# A tibble: 2,234 x 2
word n
<chr> <int>
1 lord 730
2 king 230
3 land 202
4 judah 183
5 babylon 169
6 people 168
7 son 148
8 behold 132
9 jeremiah 131
10 god 129
# ... with 2,224 more rows
Joining, by = "word"
# A tibble: 596 x 2
word n
<chr> <int>
1 lord 46
2 daughter 20
3 zion 15
4 day 14
5 mine 14
6 anger 11
7 enemy 11
8 hand 11
9 people 11
10 children 9
# ... with 586 more rows
Joining, by = "word"
# A tibble: 1,972 x 2
word n
<chr> <int>
1 lord 439
2 god 257
3 israel 186
4 house 162
5 land 153
6 son 108
7 behold 104
8 people 98
9 hand 96
10 midst 94
# ... with 1,962 more rows
Joining, by = "word"
# A tibble: 989 x 2
word n
<chr> <int>
1 king 174
2 daniel 75
3 god 66
4 kingdom 56
5 heaven 32
6 nebuchadnezzar 32
7 time 31
8 interpretation 30
9 lord 30
10 set 28
# ... with 979 more rows
Joining, by = "word"
# A tibble: 745 x 2
word n
<chr> <int>
1 lord 47
2 israel 44
3 ephraim 37
4 god 30
5 children 18
6 king 17
7 people 17
8 judah 15
9 land 15
10 return 15
# ... with 735 more rows
Joining, by = "word"
# A tibble: 369 x 2
word n
<chr> <int>
1 lord 33
2 people 14
3 children 11
4 god 11
5 land 11
6 day 9
7 tree 7
8 wine 7
9 zion 7
10 field 6
# ... with 359 more rows
Joining, by = "word"
# A tibble: 622 x 2
word n
<chr> <int>
1 lord 85
2 god 35
3 israel 30
4 house 24
5 land 20
6 day 15
7 behold 12
8 earth 12
9 palaces 12
10 transgressions 10
# ... with 612 more rows
Joining, by = "word"
# A tibble: 136 x 2
word n
<chr> <int>
1 day 12
2 esau 7
3 lord 7
4 mount 6
5 possess 6
6 house 5
7 cut 4
8 heathen 4
9 calamity 3
10 jacob 3
# ... with 126 more rows
Joining, by = "word"
# A tibble: 207 x 2
word n
<chr> <int>
1 lord 26
2 jonah 18
3 god 16
4 sea 11
5 nineveh 9
6 city 8
7 cast 7
8 evil 6
9 cried 5
10 gourd 5
# ... with 197 more rows
Joining, by = "word"
# A tibble: 545 x 2
word n
<chr> <int>
1 lord 42
2 people 19
3 house 14
4 god 13
5 israel 12
6 jacob 11
7 hear 9
8 zion 9
9 daughter 8
10 day 8
# ... with 535 more rows
Joining, by = "word"
# A tibble: 329 x 2
word n
<chr> <int>
1 lord 13
2 cut 5
3 strong 5
4 behold 4
5 chariots 4
6 lion 4
7 cankerworm 3
8 day 3
9 devour 3
10 enemies 3
# ... with 319 more rows
Joining, by = "word"
# A tibble: 353 x 2
word n
<chr> <int>
1 lord 13
2 people 7
3 behold 6
4 god 6
5 violence 6
6 earth 5
7 land 5
8 woe 5
9 glory 4
10 judgment 4
# ... with 343 more rows
Joining, by = "word"
# A tibble: 325 x 2
word n
<chr> <int>
1 lord 34
2 day 21
3 people 8
4 desolation 7
5 cut 6
6 gather 6
7 land 6
8 midst 6
9 bring 5
10 god 5
# ... with 315 more rows
Joining, by = "word"
# A tibble: 163 x 2
word n
<chr> <int>
1 lord 35
2 hosts 14
3 day 11
4 son 10
5 haggai 9
6 house 9
7 month 8
8 people 8
9 zerubbabel 7
10 word 6
# ... with 153 more rows
Joining, by = "word"
# A tibble: 708 x 2
word n
<chr> <int>
1 lord 141
2 hosts 53
3 jerusalem 41
4 day 32
5 house 31
6 behold 22
7 judah 22
8 angel 20
9 land 20
10 people 20
# ... with 698 more rows
Joining, by = "word"
# A tibble: 293 x 2
word n
<chr> <int>
1 lord 49
2 hosts 24
3 god 9
4 offering 7
5 behold 6
6 covenant 6
7 day 6
8 mine 6
9 israel 5
10 law 5
# ... with 283 more rows
Joining, by = "word"
# A tibble: 1,740 x 2
word n
<chr> <int>
1 jesus 172
2 lord 78
3 son 78
4 heaven 77
5 disciples 71
6 father 61
7 behold 60
8 god 57
9 kingdom 56
10 answered 48
# ... with 1,730 more rows
Joining, by = "word"
# A tibble: 1,322 x 2
word n
<chr> <int>
1 jesus 97
2 god 52
3 disciples 46
4 son 38
5 answered 30
6 house 30
7 cast 28
8 eat 26
9 john 26
10 heard 25
# ... with 1,312 more rows
Joining, by = "word"
# A tibble: 1,989 x 2
word n
<chr> <int>
1 son 145
2 god 126
3 lord 105
4 jesus 100
5 day 64
6 pass 58
7 people 58
8 house 57
9 behold 52
10 father 48
# ... with 1,979 more rows
Joining, by = "word"
# A tibble: 1,038 x 2
word n
<chr> <int>
1 jesus 256
2 father 134
3 god 83
4 world 80
5 answered 77
6 jews 67
7 disciples 64
8 son 63
9 verily 50
10 lord 46
# ... with 1,028 more rows
Joining, by = "word"
# A tibble: 1,904 x 2
word n
<chr> <int>
1 god 175
2 paul 131
3 lord 109
4 jews 71
5 people 69
6 jesus 68
7 day 61
8 jerusalem 60
9 peter 58
10 heard 54
# ... with 1,894 more rows
Joining, by = "word"
# A tibble: 1,027 x 2
word n
<chr> <int>
1 god 166
2 law 77
3 christ 67
4 lord 45
5 sin 45
6 faith 39
7 righteousness 39
8 jesus 38
9 spirit 29
10 flesh 25
# ... with 1,017 more rows
Joining, by = "word"
# A tibble: 939 x 2
word n
<chr> <int>
1 god 107
2 christ 69
3 lord 69
4 body 44
5 spirit 35
6 world 29
7 brethren 28
8 jesus 27
9 speak 25
10 eat 23
# ... with 929 more rows
Joining, by = "word"
# A tibble: 764 x 2
word n
<chr> <int>
1 god 75
2 christ 49
3 lord 30
4 glory 28
5 jesus 20
6 spirit 15
7 grace 13
8 flesh 11
9 love 11
10 body 9
# ... with 754 more rows
Joining, by = "word"
# A tibble: 427 x 2
word n
<chr> <int>
1 christ 41
2 god 33
3 law 32
4 faith 22
5 flesh 18
6 spirit 18
7 jesus 17
8 gospel 12
9 brethren 11
10 abraham 9
# ... with 417 more rows
Joining, by = "word"
# A tibble: 451 x 2
word n
<chr> <int>
1 christ 46
2 god 33
3 lord 25
4 jesus 21
5 spirit 15
6 love 14
7 grace 11
8 father 10
9 flesh 10
10 body 9
# ... with 441 more rows
Joining, by = "word"
# A tibble: 351 x 2
word n
<chr> <int>
1 christ 38
2 god 23
3 jesus 22
4 lord 15
5 rejoice 10
6 gospel 9
7 brethren 8
8 mind 7
9 death 6
10 joy 6
# ... with 341 more rows
Joining, by = "word"
# A tibble: 362 x 2
word n
<chr> <int>
1 christ 26
2 god 22
3 lord 13
4 body 8
5 flesh 8
6 jesus 8
7 father 6
8 wisdom 6
9 dead 5
10 faith 5
# ... with 352 more rows
Joining, by = "word"
# A tibble: 292 x 2
word n
<chr> <int>
1 god 39
2 lord 25
3 brethren 17
4 jesus 17
5 christ 14
6 faith 8
7 word 7
8 day 6
9 father 6
10 gospel 6
# ... with 282 more rows
Joining, by = "word"
# A tibble: 180 x 2
word n
<chr> <int>
1 lord 21
2 god 19
3 christ 13
4 jesus 12
5 brethren 7
6 word 5
7 day 4
8 faith 4
9 grace 4
10 power 4
# ... with 170 more rows
Joining, by = "word"
# A tibble: 480 x 2
word n
<chr> <int>
1 god 22
2 faith 19
3 christ 16
4 jesus 14
5 godliness 9
6 doctrine 8
7 lord 8
8 house 7
9 charge 6
10 teach 6
# ... with 470 more rows
Joining, by = "word"
# A tibble: 382 x 2
word n
<chr> <int>
1 lord 17
2 christ 15
3 god 15
4 jesus 14
5 faith 8
6 truth 6
7 ashamed 4
8 day 4
9 doctrine 4
10 endure 4
# ... with 372 more rows
Joining, by = "word"
# A tibble: 235 x 2
word n
<chr> <int>
1 god 13
2 saviour 6
3 faith 5
4 sound 5
5 christ 4
6 doctrine 4
7 exhort 4
8 grace 4
9 jesus 4
10 love 4
# ... with 225 more rows
Joining, by = "word"
# A tibble: 91 x 2
word n
<chr> <int>
1 christ 7
2 jesus 7
3 lord 6
4 brother 4
5 beloved 3
6 bowels 3
7 love 3
8 mine 3
9 paul 3
10 receive 3
# ... with 81 more rows
Joining, by = "word"
# A tibble: 915 x 2
word n
<chr> <int>
1 god 72
2 faith 32
3 priest 26
4 blood 22
5 son 18
6 lord 17
7 day 15
8 covenant 14
9 jesus 14
10 sins 14
# ... with 905 more rows
Joining, by = "word"
# A tibble: 449 x 2
word n
<chr> <int>
1 god 17
2 faith 16
3 brethren 15
4 lord 15
5 law 10
6 evil 8
7 behold 7
8 perfect 6
9 receive 6
10 body 5
# ... with 439 more rows
Joining, by = "word"
# A tibble: 485 x 2
word n
<chr> <int>
1 god 40
2 christ 21
3 glory 12
4 jesus 11
5 evil 10
6 time 9
7 grace 8
8 holy 8
9 lord 8
10 flesh 7
# ... with 475 more rows
Joining, by = "word"
# A tibble: 332 x 2
word n
<chr> <int>
1 lord 15
2 day 11
3 jesus 9
4 christ 8
5 god 7
6 knowledge 7
7 beloved 6
8 holy 6
9 heavens 5
10 saviour 5
# ... with 322 more rows
Joining, by = "word"
# A tibble: 197 x 2
word n
<chr> <int>
1 god 64
2 love 33
3 son 23
4 world 23
5 sin 16
6 life 15
7 brother 13
8 father 13
9 children 12
10 jesus 12
# ... with 187 more rows
Joining, by = "word"
# A tibble: 332 x 2
word n
<chr> <int>
1 lord 15
2 day 11
3 jesus 9
4 christ 8
5 god 7
6 knowledge 7
7 beloved 6
8 holy 6
9 heavens 5
10 saviour 5
# ... with 322 more rows
Joining, by = "word"
# A tibble: 67 x 2
word n
<chr> <int>
1 truth 6
2 beloved 3
3 brethren 3
4 church 3
5 evil 2
6 friends 2
7 god 2
8 receive 2
9 record 2
10 write 2
# ... with 57 more rows
Joining, by = "word"
# A tibble: 184 x 2
word n
<chr> <int>
1 lord 7
2 ungodly 6
3 christ 5
4 god 5
5 jesus 5
6 beloved 3
7 flesh 3
8 darkness 2
9 dominion 2
10 eternal 2
# ... with 174 more rows
Joining, by = "word"
# A tibble: 1,004 x 2
word n
<chr> <int>
1 god 99
2 earth 81
3 heaven 56
4 angel 53
5 beast 44
6 voice 42
7 throne 39
8 heard 33
9 power 32
10 thousand 30
# ... with 994 more rows