Hidden Markov Model Lab - Distinguishing vowels and consonants

This lab is inspired by Mark Stamp (https://www.cs.sjsu.edu/~stamp/RUA/HMM.pdf).

A Hidden Markov Model can be used to distinguish the differences within the letters of certain language. Let’s consider the case where Marvin the Martian, who has no knowledge of English, tries to apply HMM to understand some structure of the language, particularly its writing system.

Marvin first finds there are 26 letters in English alphabet and he decides to count ‘space’ as a letter as well. And he decides to test the model with 2 hidden states, naming one as “vowels” and “consonants” for another.

library(HMM)
library(readr)

set.seed(20181129)

states <- c("vowels", "consonants")
letters_space <- c(letters, " ")

eps <- rnorm(54, sd = 0.001)

emit_vec <- 1/27 + c(eps[1:26], -sum(eps[1:26]), eps[27:52], -sum(eps[27:52]))

emit_mat <- matrix(emit_vec, byrow = TRUE, nrow = 2)

trans_mat <- matrix(0.5 + c(eps[53], -eps[53], eps[54], -eps[54]), byrow = TRUE, nrow = 2 )

start_vec <- 0.5 + rnorm(1, sd = 0.01) * c(1, -1)

HMM1 <- initHMM(states, letters_space, startProbs = start_vec, transProbs = trans_mat, emissionProbs = emit_mat)

What we want to do is train this HMM - use baumWelch.

baumWelch requires some data to train the model. Let’s say Marvin found Project Gutenberg on Google and decided to use Shakespeare’s tragedy Romeo and Juliet.

romeo <- read_table("https://www.gutenberg.org/files/1112/1112.txt")

## Parsed with column specification:
## cols(
##   `The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare` = col_character()
## )

names(romeo) <- NULL

head(romeo)

romeo <- unlist(romeo)

head(romeo)

## [1] NA                                                                    
## [2] "This eBook is for the use of anyone anywhere at no cost and with"    
## [3] "almost no restrictions whatsoever.  You may copy it, give it away or"
## [4] "re-use it under the terms of the Project Gutenberg License included" 
## [5] "with this eBook or online at www.gutenberg.org/license"              
## [6] NA

strsplit(c("Hello, world!"), " ")

## [[1]]
## [1] "Hello," "world!"

romeo_split <- unlist(strsplit(romeo, character(0)))

head(romeo_split)

## [1] NA  "T" "h" "i" "s" " "

unique(romeo_split)

##  [1] NA   "T"  "h"  "i"  "s"  " "  "e"  "B"  "o"  "k"  "f"  "r"  "t"  "u" 
## [15] "a"  "n"  "y"  "w"  "c"  "d"  "l"  "m"  "v"  "."  "Y"  "p"  ","  "g" 
## [29] "-"  "P"  "j"  "G"  "b"  "L"  "/"  ":"  "R"  "J"  "A"  "W"  "S"  "D" 
## [43] "M"  "2"  "5"  "0"  "1"  "["  "E"  "#"  "]"  "N"  "9"  "7"  "x"  "C" 
## [57] "I"  "*"  "O"  "F"  "H"  "U"  "K"  "V"  "!"  "("  ")"  "<"  "3"  "X" 
## [71] ">"  "'"  "z"  ";"  "q"  "?"  "Q"  "\"" "Z"  "8"  "4"  "6"  "%"  "@" 
## [85] "$"

romeo_split <- tolower(romeo_split)

unique(romeo_split)

##  [1] NA   "t"  "h"  "i"  "s"  " "  "e"  "b"  "o"  "k"  "f"  "r"  "u"  "a" 
## [15] "n"  "y"  "w"  "c"  "d"  "l"  "m"  "v"  "."  "p"  ","  "g"  "-"  "j" 
## [29] "/"  ":"  "2"  "5"  "0"  "1"  "["  "#"  "]"  "9"  "7"  "x"  "*"  "!" 
## [43] "("  ")"  "<"  "3"  ">"  "'"  "z"  ";"  "q"  "?"  "\"" "8"  "4"  "6" 
## [57] "%"  "@"  "$"

romeo_split <- romeo_split[romeo_split %in% letters_space]

unique(romeo_split)

##  [1] "t" "h" "i" "s" " " "e" "b" "o" "k" "f" "r" "u" "a" "n" "y" "w" "c"
## [18] "d" "l" "m" "v" "p" "g" "j" "x" "z" "q"

baumWelch(HMM1, romeo_split[1:10000])

## $hmm
## $hmm$States
## [1] "vowels"     "consonants"
## 
## $hmm$Symbols
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z" " "
## 
## $hmm$startProbs
##     vowels consonants 
##  0.4866689  0.5133311 
## 
## $hmm$transProbs
##             to
## from            vowels consonants
##   vowels     0.4992591  0.5007409
##   consonants 0.5024649  0.4975351
## 
## $hmm$emissionProbs
##             symbols
## states                a          b          c          d         e
##   vowels     0.06438103 0.01641618 0.02096095 0.02755861 0.0961156
##   consonants 0.06742419 0.01558238 0.02103919 0.02784188 0.1020947
##             symbols
## states                f          g          h          i           j
##   vowels     0.01836522 0.01748199 0.04480569 0.05024195 0.001506397
##   consonants 0.01883558 0.01711739 0.04519498 0.05156031 0.001493581
##             symbols
## states                 k          l          m          n          o
##   vowels     0.006423675 0.03261686 0.02822211 0.05028448 0.06893565
##   consonants 0.006376244 0.03057964 0.02757678 0.05171798 0.06866388
##             symbols
## states                p            q          r          s          t
##   vowels     0.01580517 0.0009033969 0.05737972 0.05378452 0.07306207
##   consonants 0.01599515 0.0008965914 0.05601795 0.05281381 0.07013290
##             symbols
## states                u           v          w            x          y
##   vowels     0.03090488 0.008241290 0.02215162 0.0005002193 0.01783108
##   consonants 0.03029408 0.008158568 0.02345061 0.0004997799 0.01897087
##             symbols
## states                  z          
##   vowels     0.0004010282 0.1747186
##   consonants 0.0003989682 0.1692720
## 
## 
## $difference
##   [1] 2.673345e-01 2.699323e-05 2.702915e-05 2.706671e-05 2.710581e-05
##   [6] 2.714643e-05 2.718868e-05 2.723253e-05 2.727802e-05 2.732495e-05
##  [11] 2.737350e-05 2.742374e-05 2.747545e-05 2.752874e-05 2.758372e-05
##  [16] 2.764033e-05 2.769846e-05 2.775823e-05 2.781960e-05 2.788257e-05
##  [21] 2.794721e-05 2.801339e-05 2.808129e-05 2.815083e-05 2.822185e-05
##  [26] 2.829471e-05 2.836913e-05 2.844527e-05 2.852298e-05 2.860239e-05
##  [31] 2.868344e-05 2.876624e-05 2.885072e-05 2.893691e-05 2.902474e-05
##  [36] 2.911432e-05 2.920550e-05 2.929853e-05 2.939324e-05 2.948967e-05
##  [41] 2.958791e-05 2.968788e-05 2.978963e-05 2.989312e-05 2.999845e-05
##  [46] 3.010554e-05 3.021437e-05 3.032510e-05 3.043766e-05 3.055207e-05
##  [51] 3.066824e-05 3.078635e-05 3.090629e-05 3.102817e-05 3.115192e-05
##  [56] 3.127755e-05 3.140521e-05 3.153463e-05 3.166619e-05 3.179970e-05
##  [61] 3.193512e-05 3.207256e-05 3.221203e-05 3.235344e-05 3.249697e-05
##  [66] 3.264264e-05 3.279030e-05 3.294014e-05 3.309203e-05 3.324597e-05
##  [71] 3.340227e-05 3.356058e-05 3.372119e-05 3.388386e-05 3.404889e-05
##  [76] 3.421630e-05 3.438582e-05 3.455764e-05 3.473179e-05 3.490837e-05
##  [81] 3.508724e-05 3.526843e-05 3.545218e-05 3.563831e-05 3.582701e-05
##  [86] 3.601815e-05 3.621177e-05 3.640792e-05 3.660668e-05 3.680808e-05
##  [91] 3.701206e-05 3.721877e-05 3.742813e-05 3.764024e-05 3.785509e-05
##  [96] 3.807272e-05 3.829321e-05 3.851659e-05 3.874280e-05 3.897196e-05

Hidden Markov Model Lab - Distinguishing vowels and consonants

Kyu Yeon Cho