PHÂN TÍCH ĐỊNH TÍNH KINH THÁNH - ĐÁM MÂY TỪ

Các sách Kinh thánh có trong gói sacred (https://github.com/JohnCoene/sacred). Cách cài đặt gói sacred:
Dán dòng lệnh sau vào console: devtools::install_github("JohnCoene/sacred")

Các gói được sử dụng bao gồm: tm, RColorBrewer, tidyverse, magrittr, ggwordcloud, sacred, haven

# Load the King James Version data set shipped with the sacred package.
scripture <- sacred::king_james_version

Kinh thánh là tập hợp nhiều sách được viết trong một thời gian dài; phiên bản sử dụng ở đây là phiên bản tiếng Anh: King James Version (KJV). Để phân tích một cuốn sách Kinh thánh, chẳng hạn sách Malachi, chúng ta dùng hàm filter() trong gói tidyverse như sau: filter(book == "mal"). Ta cũng có thể chọn cuốn khác, như sách Genesis (Sáng thế ký), bằng filter(book == "gen").

# Keep only the rows whose book code is "mal" (Malachi), then keep just the
# `text` column.
bi_book <- select(
  filter(scripture, book == "mal"),
  text
)

Phân tích chữ: chuyển document sang dạng corpus (hiểu nôm na corpus là dataset cho văn bản).

# Wrap the selected text in a tm vector source and build a VCorpus from it.
bi_book.corpus <- tm::VCorpus(
  tm::VectorSource(bi_book)
)

Sau khi tạo corpus, dữ liệu tiếp tục được làm sạch bằng cách thay thế các ký tự đặc biệt bằng khoảng trắng với hàm tm_map(), ví dụ, bạn có thể thay thế bất kỳ ký tự đặc biệt nào trong văn bản như “/”, “@” và “|” bằng khoảng trắng:

Hàm để chuyển đổi sang ký tự trống

# Corpus transformation: replace every match of the regular expression
# `pattern` in the document content `x` with a single space.
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern = pattern, replacement = " ", x = x)
  }
)

Chuyển đổi các ký tự đặc biệt (/, @, |) sang khoảng trắng

# Replace the special characters "/", "@" and "|" with a space so that the
# words they join become separate tokens.
# The second pattern was previously " ", which swapped a space for a space
# (a no-op) and left "@" in the text.
bi_book.corpus <- bi_book.corpus %>%
  tm_map(toSpace, "/") %>%
  tm_map(toSpace, "@") %>%
  tm_map(toSpace, "\\|") # "|" means alternation in a regex, so escape it

Làm sạch dữ liệu bằng cách xóa các từ dừng (stop words như in, for, at …) và đổi tất cả các ký tự hoa thành ký tự thường.

# Normalise the corpus text before counting words. The order matters: the
# text is lowercased first, so every word list below must be lowercase too.
bi_book.corpus <- bi_book.corpus %>%
  tm_map(FUN = content_transformer(tolower)) %>% # Convert the text to lower case
  tm_map(FUN = removeNumbers) %>% # Remove numbers
  tm_map(removeWords, stopwords("english")) %>% # Remove common English stop words
  # Remove archaic / uninformative words. "o" is written in lowercase because
  # the text has already been lowercased; an uppercase "O" would never match.
  tm_map(removeWords, c("ye", "o", "unto", "yet", "thee", "wherein", "neither", "shall",
                        "saith", "host", "will", "offer", "say")) %>%
  tm_map(removePunctuation) %>% # Remove punctuation
  tm_map(stripWhitespace) # Collapse the runs of spaces left by the removals

Khi tài liệu đã sạch, tính tần suất của từng từ bằng hàm TermDocumentMatrix() trong gói tm. Sau đó, tài liệu được chuyển từ định dạng corpus sang matrix và data.frame.

# Build the term-document matrix and flatten it into a word/frequency table,
# sorted with the most frequent words first.
# Frequencies are summed across all document columns (rowSums) instead of
# being read from the first column only, so the counts stay correct even when
# the corpus holds more than one document. For a single-document corpus the
# counts are the same as before.
bi_book.corpus.tb <- bi_book.corpus %>%
  tm::TermDocumentMatrix(
    control = list(
      removeNumbers = TRUE,
      stopwords = TRUE,
      stemming = TRUE
    )
  ) %>%
  as.matrix() %>%
  rowSums() %>% # total count per term over all documents
  tibble::enframe(name = "word", value = "freq") %>% # term names -> `word`
  dplyr::arrange(desc(freq))
# Word cloud of the frequency table: label size follows `freq`, and each
# distinct frequency value (as a string) is mapped to its own colour.
ggplot(
  data = bi_book.corpus.tb,
  mapping = aes(label = word, size = freq, colour = as.character(freq))
) +
  geom_text_wordcloud(
    rm_outside = TRUE,
    max_steps = 1,
    grid_size = 1,
    eccentricity = 0.9
  ) +
  scale_size_area(max_size = 20) +
  scale_colour_brewer(palette = "Paired", direction = -1) +
  theme_void()

Để cho mây từ đẹp hơn, ta xoay từ bằng cách chỉnh góc (angle) trong aes(); ở đây sẽ xoay 90 độ cho một tập hợp con ngẫu nhiên gồm khoảng 40% các từ.

 # Add an `angle` column: each word is independently given 0 or 90 degrees,
 # with sampling weights 60 : 40 in favour of 0.
 bi_book.corpus.tb <- mutate(
   bi_book.corpus.tb,
   angle = 90 * sample(x = c(0, 1), size = n(), replace = TRUE, prob = c(60, 40))
 )

Như vậy wordcloud của sách Malachi với một số từ được xoay 90 độ.

# Same word cloud as before, but each label is drawn at the rotation stored
# in the `angle` column.
ggplot(
  data = bi_book.corpus.tb,
  mapping = aes(
    label = word,
    size = freq,
    angle = angle,
    colour = as.character(freq)
  )
) +
  geom_text_wordcloud(
    rm_outside = TRUE,
    max_steps = 1,
    grid_size = 1,
    eccentricity = 0.9
  ) +
  scale_size_area(max_size = 20) +
  scale_colour_brewer(palette = "Paired", direction = -1) +
  theme_void()

# Export the word-frequency table and the full scripture data set as .sav
# files in the current working directory.
write_sav(data = bi_book.corpus.tb, path = "bi_book.corpus.tb.sav")
write_sav(data = scripture, path = "scripture.sav")
LS0tDQp0aXRsZTogIkJpYmxlIHRleHQgYW5hbHlzaXMiDQphdXRob3I6ICJIZW5yeSBEbywgIHphbG8vxJBUOiAwOTg1NjYxOTA4Ig0KZGF0ZTogImByIFN5cy5EYXRlKClgIg0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIGNvZGVfZG93bmxvYWQ6IHRydWUNCiAgICBjb2RlX2ZvbGRpbmc6IGhpZGUNCiAgICB0b2M6IHRydWUNCiAgICB0b2NfZmxvYXQ6IGZhbHNlDQogIHdvcmRfZG9jdW1lbnQ6DQogICAgdG9jOiB0cnVlDQogIHBkZl9kb2N1bWVudDoNCiAgICB0b2M6IHRydWUNCiAgd29yZF9kb2N1bWVudHM6DQogICAgdG9jOiB0cnVlDQphbHdheXNfYWxsb3dfaHRtbDogdHJ1ZQ0KZWRpdG9yX29wdGlvbnM6IA0KICBjaHVua19vdXRwdXRfdHlwZTogY29uc29sZQ0KLS0tDQoNCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQ0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQ0KYGBgDQoNCg0KDQo8cCBzdHlsZT0idGV4dC1hbGlnbjogY2VudGVyOyBiYWNrZ3JvdW5kLWNvbG9yOnBvd2RlcmJsdWU7IGNvbG9yOlJlZDsiPioqUEjDgk4gVMONQ0ggxJDhu4pOSCBUw41OSCBLSU5IIFRIw4FOSCAtIMSQw4FNIE3DglkgVOG7qioqPC9wPg0KDQpDw6FjIHPDoWNoIEtpbmggdGjDoW5oIGPDsyB0cm9uZyBnw7NpIHNhY3JlZCAgKGh0dHBzOi8vZ2l0aHViLmNvbS9Kb2huQ29lbmUvc2FjcmVkKQ0KQ8OhY2ggY8OgaSDEkeG6t3QgZ8OzaSBgc2FjcmVkYDogPGJyIC8+IA0KRMOhbiBkw7JuZyBs4buHbmggc2F1IHbDoG8gY29uc29sZTogYGRldnRvb2xzOjppbnN0YWxsX2dpdGh1YigiSm9obkNvZW5lL3NhY3JlZCIpYA0KDQoNCkPDoWMgZ8OzaSDEkcaw4bujYyBz4butIGThu6VuZyBiYW8gZ+G7k206IHRtLFJDb2xvckJyZXdlciwgdGlkeXZlcnNlLCBtYWdyaXR0ciwgZ2d3b3JkY2xvdWQsIHNhY3JlZA0KDQpgYGB7ciBlY2hvPUZBTFNFLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFLCBwYWdlZC5wcmludD1GQUxTRX0NCiNsaWJyYXJ5KHRtKQ0KI2xpYnJhcnkoUkNvbG9yQnJld2VyKQ0KI2xpYnJhcnkodGlkeXZlcnNlKQ0KI2xpYnJhcnkobWFncml0dHIpDQojbGlicmFyeShnZ3dvcmRjbG91ZCkNCiNsaWJyYXJ5KHNhY3JlZCkNCiNsaWJyYXJ5KGhhdmVuKQ0KDQojIFBhY2thZ2UgbmFtZXMNCnBhY2thZ2VzIDwtIGMoInRtIiwgIlJDb2xvckJyZXdlciIsICJ0aWR5dmVyc2UiLCAibWFncml0dHIiLCAiZ2d3b3JkY2xvdWQiLCAic2FjcmVkIiwgImhhdmVuIikNCg0KIyBJbnN0YWxsIHBhY2thZ2VzIG5vdCB5ZXQgaW5zdGFsbGVkDQppbnN0YWxsZWRfcGFja2FnZXMgPC0gcGFja2FnZXMgJWluJSByb3duYW1lcyhpbnN0YWxsZWQucGFja2FnZXMoKSkNCmlmIChhbnkoaW5zdGFsbGVkX3BhY2thZ2VzID09IEZBTFNFKSkgew0KICBpbnN0YWxsLnBhY2thZ2VzKHBhY2thZ2VzWyFpbnN0YWxsZWRfcGFja2FnZXNdKQ0KfQ0KDQojIFBhY2thZ2VzIGxvYWRpbmcNCmludmlzaWJsZShsYXBwbHkocGFja2FnZXMsIGxp
YnJhcnksIGNoYXJhY3Rlci5vbmx5ID0gVFJVRSkpDQoNCg0KDQoNCmBgYA0KDQoNCmBgYHtyIGVjaG89VFJVRX0NCnNjcmlwdHVyZSA8LSBzYWNyZWQ6OmtpbmdfamFtZXNfdmVyc2lvbg0KYGBgDQoNCktpbmggdGjDoW5oIGzDoCB04bqtcCBo4bujcCBuaGnhu4F1IHPDoWNoIMSRxrDhu6NjIHZp4bq/dCB0cm9uZyBt4buZdCB0aOG7nWkgZ2lhbiBkw6BpLCBwaGnDqm4gYuG6o24gc+G7rSBk4bulbmcg4bufIMSRw6J5IGzDoCBwaGnDqm4gYuG6o24gdGnhur9uZyBBbmg6IEtpbmcgSmFtZXMgVmVyc2lvbiAoS0pWKS4gxJDhu4MgcGjDom4gdMOtY2ggMSBjdeG7kW4gc8OhY2gga2luaCB0aMOhbmggY2jhurNuZyBo4bqhbiBzw6FjaCBNYWxhY2hpLCBjaMO6bmcgdGEgZMO5bmcgaMOgbSBkw7luZyBow6BtIGBmaWx0ZXIoKWAgdHJvbmcgZ8OzaSBgdGlkeXZlcnNlYCBuaMawIHNhdSAgYGZpbHRlcihib29rID09ICJtYWwiYCwgdGEgY8WpbmcgY8OzIHRo4buDIGNo4buNbiBjdeG7kW4ga2jDoWMgbmjGsCBzw6FjaCBnZW5lc2lzIC0gc8OhbmcgdGjhur8ga8O9IGLhurFuZyBow6BtIGBmaWx0ZXIoYm9vayA9PSAiZ2VuImANCg0KDQpgYGB7ciBlY2hvPVRSVUV9DQpiaV9ib29rID0gc2NyaXB0dXJlICU+JSANCiAgZmlsdGVyKGJvb2sgPT0gIm1hbCIpJT4lIA0KICBzZWxlY3QodGV4dCkNCmBgYA0KDQoNClBow6JuIHTDrWNoIGNo4buvLGNodXnhu4NuIGRvY3VtZW50IHNhbmcgZGFudDUgY29ycHVzICggaGnhu4N1IG7DtG0gbmEgY29ycHVzIGzDoCBkYXRhc2V0IGNobyB2xINuIGLhuqNuKQ0KYGBge3IgZWNobz1UUlVFfQ0KYmlfYm9vay5jb3JwdXMgPSBiaV9ib29rICU+JSANCiAgdG06OlZlY3RvclNvdXJjZSgpICU+JSANCiAgdG06OlZDb3JwdXMoKQ0KYGBgDQoNCg0KU2F1IGtoaSB04bqhbyBjb3JwdXMsIGThu68gbGnhu4d1IHRp4bq/cCB04bulYyDEkcaw4bujYyBsw6BtIHPhuqFjaCAgYuG6sW5nIGPDoWNoIHRoYXkgdGjhur8gY8OhYyBrw70gdOG7sSDEkeG6t2MgYmnhu4d0IGLhurFuZyBraG/huqNuZyB0cuG6r25nIHbhu5tpIGjDoG0gYHRtX21hcCgpYCwgdsOtIGThu6UsIGLhuqFuIGPDsyB0aOG7gyB0aGF5IHRo4bq/IGLhuqV0IGvhu7Mga8O9IHThu7EgxJHhurdjIGJp4buHdCBuw6BvIHRyb25nIHbEg24gYuG6o24gbmjGsCDigJwv4oCdLCDigJxA4oCdIHbDoCDigJx84oCdIGLhurFuZyBraG/huqNuZyB0cuG6r25nOg0KDQoNCkjDoG0gxJHhu4MgY2h1eeG7g24gxJHhu5VpIHNhbmcga8O9IHThu7EgdHLhu5FuZw0KDQpgYGB7ciBlY2hvPVRSVUV9DQp0b1NwYWNlIDwtIGNvbnRlbnRfdHJhbnNmb3JtZXIoZnVuY3Rpb24gKHggLCBwYXR0ZXJuICkgDQogIGdzdWIocGF0dGVybiwgIiAiLCB4KSkNCmBgYA0KDQpDaHV54buDbiDEkeG7lWkga8OtIHThu7EgxJHhurdjIGJp4buHdCgvIFxcfCkgc2FuZyBraG/huqNuZyB0cuG7kW5nDQoNCg0KYGBge3IgZWNobz1UUlVFfQ0KDQoNCmJpX2Jvb2suY29ycHVzIDwtIGJpX2Jvb2suY29y
cHVzICU+JSANCiAgdG1fbWFwKHRvU3BhY2UsICIvIikgJT4lDQogIHRtX21hcCh0b1NwYWNlLCAiICIpICU+JQ0KICB0bV9tYXAodG9TcGFjZSwgIlxcfCIpDQpgYGANCg0KTMOgbSBz4bqhY2ggZOG7ryBsaeG7h3UgYuG6sW5nIGPDoWNoIHjDs2EgY8OhYyB04burIGThu6tuZyAoc3RvcCB3b3JkcyBuaMawIGluLCBmb3IsIGF0IC4uLikgdsOgIMSR4buVaSB04bqldCBj4bqjIGPDoWMga8OtIHThu7EgaG9hIHThu6sgdGjDoG5oIGvDvSB04buxIHRoxrDhu51uZy4NCg0KDQpgYGB7ciBlY2hvPVRSVUV9DQpiaV9ib29rLmNvcnB1cyA9ICBiaV9ib29rLmNvcnB1cyAlPiUgDQogIHRtX21hcChGVU4gPSBjb250ZW50X3RyYW5zZm9ybWVyKHRvbG93ZXIpKSAlPiUgIyBDb252ZXJ0IHRoZSB0ZXh0IHRvIGxvd2VyIGNhc2UNCiAgdG1fbWFwKEZVTiA9IHJlbW92ZU51bWJlcnMpICU+JSAjIFJlbW92ZSBudW1iZXJzDQogIHRtX21hcChyZW1vdmVXb3Jkcywgc3RvcHdvcmRzKCJlbmdsaXNoIikpICU+JSAjIFJlbW92ZSBlbmdsaXNoIGNvbW1vbiBzdG9wd29yZHMNCiAgdG1fbWFwKHJlbW92ZVdvcmRzLCBjKCJ5ZSIsICJPIiwgInVudG8iLCAieWV0IiwgInRoZWUiLCAid2hlcmVpbiIsICJuZWl0aGVyIiwgInNoYWxsIiwgDQogICAgICAgICAgICAgICAgICAgICAgICAic2FpdGgiLCAiaG9zdCIsICJ3aWxsIiwgIm9mZmVyIiwgInNheSIpKSAlPiUgICAjIFJlbW92ZSB3b3Jkcw0KICB0bV9tYXAocmVtb3ZlUHVuY3R1YXRpb24pICU+JSAgICMgUmVtb3ZlIHB1bmN0dWF0aW9ucw0KICB0bV9tYXAoc3RyaXBXaGl0ZXNwYWNlKSAgICMNCmBgYA0KDQpLaGkgdMOgaSBsaeG7h3UgxJHDoyBz4bqhY2gsIHTDrW5oIHThuqduIHN14bqldCBj4bunYSB04burbmcgdOG7qy4gZMO5bmcgIGjDoG0gVGVybURvY3VtZW50TWF0cml4KCkgdHJvbmcgZ8OzaSB0bS4gU2F1IMSRw7MsIHTDoGkgbGnhu4d1IMSRxrDhu6NjIHRoYXkgxJHhu5VpIHThu6sgxJHhu4tuaCBk4bqhbmcgY29ycHVzIHNhbmcgbWF0cml4IHbDoCBkYXRhLmZyYW1lLiANCg0KYGBge3IgZWNobz1UUlVFfQ0KYmlfYm9vay5jb3JwdXMudGIgPC0gIGJpX2Jvb2suY29ycHVzICU+JSANCiAgdG06OlRlcm1Eb2N1bWVudE1hdHJpeChjb250cm9sID0gbGlzdChyZW1vdmVOdW1iZXJzID0gVFJVRSwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBzdG9wd29yZHMgPSBUUlVFLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHN0ZW1taW5nID0gVFJVRSkpICU+JSANCiAgYXMubWF0cml4KCkgJT4lIGFzLmRhdGEuZnJhbWUoKSAlPiUgDQogIHRpYmJsZTo6cm93bmFtZXNfdG9fY29sdW1uKCkgJT4lDQogIGRwbHlyOjpyZW5hbWUod29yZCA9IDEsIGZyZXEgPSAyKSAlPiUNCiAgZHBseXI6OmFycmFuZ2UoZGVzYyhmcmVxKSkNCmBgYA0KDQoNCi0gTcOieSB04burIGPhu6dhIHRvw6BuIGLhu5kgU8OhY2ggTWFsYWNoaSDEkcaw4bujYyB04bqhbyBiLiBM
xrB1IMO9IHLhurFuZyAgdOG7qyBu4bqxbSBuZ2FuZy4NCg0KYGBge3IgZWNobz1UUlVFfQ0KZ2dwbG90KGRhdGEgPSBiaV9ib29rLmNvcnB1cy50YiwgDQogICAgICAgYWVzKGxhYmVsID0gd29yZCwgc2l6ZSA9IGZyZXEsIGNvbCA9IGFzLmNoYXJhY3RlcihmcmVxKSkpICsgDQogIGdlb21fdGV4dF93b3JkY2xvdWQocm1fb3V0c2lkZSA9IFRSVUUsIG1heF9zdGVwcyA9IDEsDQogICAgICAgICAgICAgICAgICAgICAgZ3JpZF9zaXplID0gMSwgZWNjZW50cmljaXR5ID0gLjkpKw0KICBzY2FsZV9zaXplX2FyZWEobWF4X3NpemUgPSAyMCkrDQogIHNjYWxlX2NvbG9yX2JyZXdlcihwYWxldHRlID0gIlBhaXJlZCIsIGRpcmVjdGlvbiA9IC0xKSsNCiAgdGhlbWVfdm9pZCgpDQpgYGANCg0KDQrEkOG7gyBjaG8gbcOieSB04burIMSR4bq5cCBoxqFuLCB4b2F5IHThu6sgLCBjaOG7iW5oIGfDs2MgdHJvbmcgYWVzKCwg4bufIMSRw6J5IHPhur0geG9heSA5MCDEkeG7mSBjaG8gbeG7mXQgdOG6rXAgaOG7o3AgY29uIG5n4bqrdSBuaGnDqm4gZ+G7k20gNDAlIGPDoWMgdOG7qy4gDQoNCmBgYHtyIGVjaG89VFJVRX0NCiBiaV9ib29rLmNvcnB1cy50YiA9IGJpX2Jvb2suY29ycHVzLnRiICU+JQ0KICAgbXV0YXRlKGFuZ2xlID0gOTAgKiBzYW1wbGUoYygwLCAxKSwgbigpLCByZXBsYWNlID0gVFJVRSwgcHJvYiA9IGMoNjAsIDQwKSkpDQpgYGANCk5oxrAgduG6rXkgd29yZGNsb3VkIGPhu6dhIHPDoWNoIE1hbGFjaGkgduG7m2kgbeG7mXQgc+G7kSB04burIMSRxrDhu6NjIHhvYXkgOTAgxJHhu5kuDQoNCmBgYHtyIGVjaG89VFJVRX0NCg0KZ2dwbG90KGRhdGEgPSBiaV9ib29rLmNvcnB1cy50YiwgDQogICAgICAgYWVzKGxhYmVsID0gd29yZCwgc2l6ZSA9IGZyZXEsIGFuZ2xlID0gYW5nbGUsIGNvbCA9IGFzLmNoYXJhY3RlcihmcmVxKSkpICsgDQogIGdlb21fdGV4dF93b3JkY2xvdWQocm1fb3V0c2lkZSA9IFRSVUUsIG1heF9zdGVwcyA9IDEsDQogICAgICAgICAgICAgICAgICAgICAgZ3JpZF9zaXplID0gMSwgZWNjZW50cmljaXR5ID0gLjkpKw0KICBzY2FsZV9zaXplX2FyZWEobWF4X3NpemUgPSAyMCkrDQogIHNjYWxlX2NvbG9yX2JyZXdlcihwYWxldHRlID0gIlBhaXJlZCIsIGRpcmVjdGlvbiA9IC0xKSsNCiAgdGhlbWVfdm9pZCgpDQpgYGANCg0KDQpgYGB7cn0NCndyaXRlX3NhdihiaV9ib29rLmNvcnB1cy50YiwgImJpX2Jvb2suY29ycHVzLnRiLnNhdiIpDQoNCndyaXRlX3NhdihzY3JpcHR1cmUsICJzY3JpcHR1cmUuc2F2IikNCg0KYGBgDQoNCg==