
The three topics selected for the following analysis are Politics, Laptop and Food. The text for each topic was retrieved from the top 50 websites returned by a Google search.
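For reference, the per-topic CSVs read in later could have been assembled with a simple scraper; the sketch below is purely illustrative (the rvest calls are real, but the URLs and the assumption that paragraph tags hold the article text are mine):

library(rvest)

# placeholder URLs -- in the actual run these would be the top Google results for one topic
urls = c("http://example.com/page1", "http://example.com/page2")

# pull the paragraph text of each page into one string per page
pages = sapply(urls, function(u) {
  page = tryCatch(read_html(u), error = function(e) NULL)
  if (is.null(page)) return(NA_character_)
  paste(html_text(html_nodes(page, "p")), collapse = " ")
})

# save in the shape the analysis below expects: a CSV with a 'text' column
write.csv(data.frame(text = pages, stringsAsFactors = FALSE),
          "searchfoodText.csv", row.names = FALSE)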

The text content for each topic is stored in its own document and then combined into a single corpus. Let's start the analysis with the word cloud and the co-occurrence graph (COG).

library(text2vec)
library(data.table)
library(stringr)
library(tm)
## Loading required package: NLP
library(RWeka)
library(tokenizers)
## 
## Attaching package: 'tokenizers'
## The following object is masked from 'package:tm':
## 
##     stopwords
library(slam)
library(wordcloud)
## Loading required package: RColorBrewer
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following objects are masked from 'package:text2vec':
## 
##     %>%, normalize
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(maptpx)
## 
## Attaching package: 'maptpx'
## The following object is masked from 'package:igraph':
## 
##     normalize
## The following object is masked from 'package:text2vec':
## 
##     normalize
text.clean = function(x)                    # text data
{ require("tm")
  x  =  gsub("<.*?>", " ", x)               # regex for removing HTML tags
  x  =  iconv(x, "latin1", "ASCII", sub="") # Keep only ASCII characters
  x  =  gsub("[^[:alnum:]]", " ", x)        # keep only alpha numeric 
  x  =  tolower(x)                          # convert to lower case characters
  x  =  removeNumbers(x)                    # removing numbers
  x  =  stripWhitespace(x)                  # removing white space
  x  =  gsub("^\\s+|\\s+$", "", x)          # remove leading and trailing white space
  return(x)
}
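A quick sanity check of text.clean() on a made-up string (the input here is illustrative only):

text.clean("<p>Best Laptops of 2016!!</p>  Visit  http://example.com  ")
# expected output, roughly: "best laptops of visit http example com"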

distill.cog = function(mat1, # input TCM ADJ MAT
                       title, # title for the graph
                       s,    # no. of central nodes
                       k1){  # max no. of connections  
  
  a = colSums(mat1) # collect colsums into a vector obj a
  b = order(-a)     # nice syntax for ordering vector in decr order  
  
  mat2 = mat1[b,b]  # reorder rows and columns by decreasing connectedness
  
  diag(mat2) =  0
  
  ## +++ go row by row and find top k adjacencies +++ ##
  
  wc = NULL
  
  for (i1 in 1:s){ 
    thresh1 = mat2[i1,][order(-mat2[i1, ])[k1]]
    mat2[i1, mat2[i1,] < thresh1] = 0   # zero out connections below the top-k1 threshold
    mat2[i1, mat2[i1,] > 0 ] = 1
    word = names(mat2[i1, mat2[i1,] > 0])
    mat2[(i1+1):nrow(mat2), match(word,colnames(mat2))] = 0
    wc = c(wc,word)
  } # i1 loop ends
  
  
  mat3 = mat2[match(wc, colnames(mat2)), match(wc, colnames(mat2))]
  ord = colnames(mat2)[which(!is.na(match(colnames(mat2), colnames(mat3))))]  # removed any NAs from the list
  mat4 = mat3[match(ord, colnames(mat3)), match(ord, colnames(mat3))]
  graph <- graph.adjacency(mat4, mode = "undirected", weighted=T)    # Create Network object
  graph = simplify(graph) 
  V(graph)$color[1:s] = "green"
  V(graph)$color[(s+1):length(V(graph))] = "pink"
  
  graph = delete.vertices(graph, V(graph)[ degree(graph) == 0 ])
  
  plot(graph, 
       layout = layout.kamada.kawai, 
       main = title)

} # func ends
file.cr = read.csv("/Users/bhargav/searchfoodText.csv")
file.mi = read.csv("/Users/bhargav/searchlaptopText.csv")
file.lin = read.csv("/Users/bhargav/searchpoliticsText.csv")

file.cr = file.cr[!is.na(file.cr$text) & file.cr$text != '',]    # drop rows with missing or empty text

file.mi = file.mi[!is.na(file.mi$text) & file.mi$text != '',]

file.lin = file.lin[!is.na(file.lin$text) & file.lin$text != '',]

n = min(nrow(file.cr),nrow(file.mi),nrow(file.lin))

data = data.frame(id = 1:n, text1 = file.cr$text[1:n],
                      text2 = file.mi$text[1:n],
                      text3 = file.lin$text[1:n],
                      stringsAsFactors = F)
data$text = paste(data$text1,data$text2,data$text3)

dim(data)
## [1] 56  5
# Read Stopwords list
stpw1 = readLines("stopwords.txt")# stopwords list from git
stpw2 = tm::stopwords('english')               # tm package stop word list; tokenizer package has the same name function

comn  = unique(c(stpw1, stpw2))         # union of the two lists
stopwords = unique(gsub("'"," ",comn))  # final stop word list after removing punctuation

x  = text.clean(data$text)             # pre-process text corpus
x  =  removeWords(x,stopwords)            # removing stopwords created above
x  =  stripWhitespace(x)                  # removing white space
# x  =  stemDocument(x)

t1 = Sys.time()

tok_fun = word_tokenizer

it_0 = itoken( x,
               #preprocessor = text.clean,
               tokenizer = tok_fun,
               ids = data$id,
               progressbar = F)

vocab = create_vocabulary(it_0,
                          ngram = c(2L, 2L)
                          #stopwords = stopwords
)

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 10)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)

vectorizer = vocab_vectorizer(pruned_vocab)

dtm_0  = create_dtm(it_0, vectorizer)

# Sort bi-gram with decreasing order of freq
tsum = as.matrix(t(rollup(dtm_0, 1, na.rm=TRUE, FUN = sum))) # find sum of freq for each term
tsum = tsum[order(tsum, decreasing = T),]       #terms in decreasing order of freq
if (length(tsum) > 1000) {n = 1000} else {n = length(tsum)}
tsum = tsum[1:n]
text2 = x
text2 = paste("",text2,"")   # pad with spaces so whole-phrase matching works at string boundaries

pb <- txtProgressBar(min = 1, max = (length(tsum)), style = 3) ; i = 0

for (term in names(tsum)){
  i = i + 1
  focal.term = gsub("_", " ",term)        # underscore back to space to recover the original phrase
  replacement.term = term
  text2 = gsub(paste("",focal.term,""),paste("",replacement.term,""), text2)
  #setTxtProgressBar(pb, i)
}


it_m = itoken(text2,
              # preprocessor = text.clean,
              tokenizer = tok_fun,
              ids = data$id,
              progressbar = F)

vocab = create_vocabulary(it_m
                          # ngram = c(2L, 2L),
                          #stopwords = stopwords
)

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 1)
# doc_proportion_max = 0.5,
# doc_proportion_min = 0.001)

vectorizer = vocab_vectorizer(pruned_vocab)

dtm_m  = create_dtm(it_m, vectorizer)
dim(dtm_m)
## [1]    56 24455
dtm = as.DocumentTermMatrix(dtm_m, weighting = weightTf)

print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 41.31668 secs
# some basic clean-up ops
dim(dtm)
## [1]    56 24455
a0 = apply(dtm, 1, sum)   # apply sum operation to dtm's rows. i.e. get rowSum
  dtm = dtm[(a0 > 5),]    # retain only those rows with token rowSum >5, i.e. delete empty rows
  dim(dtm); rm(a0)        # delete a0 object
## [1]    56 24455
a0 = apply(dtm, 2, sum)   # use apply() to find colSUms this time
  dtm = dtm[, (a0 > 4)]     # retain only those terms that occurred > 4 times in the corpus
  dim(dtm); rm(a0)
## [1]   56 6663
# view summary wordcloud
a0 = apply(dtm, 2, sum)     # colSum vector of dtm
  a0[1:5]                   # view what a0 obj is like
##       snowman       sliders  freeshipping inches_inches       adblock 
##             5            26            10             8             7
  a1 = order(as.vector(a0), decreasing = TRUE)     # vector of token locations
  a0 = a0[a1]     # a0 ordered as per token locations
  a0[1:5]         # view a0 now
##       food     laptop    laptops   politics intel_core 
##       1110        843        590        530        464
wordcloud(names(a0), a0,     # invoke wordcloud() func. Use ?wordcloud for more info
          scale=c(4,1), 
          3, # min.freq 
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))
title(sub = "Quick Summary Wordcloud")

The word cloud above for the combined corpus clearly shows the highest-frequency terms, so the three topics stand out clearly from the rest of the document content.

pruned_vocab = prune_vocabulary(vocab,
                                term_count_min = 5)

vectorizer = vocab_vectorizer(pruned_vocab, grow_dtm = FALSE, skip_grams_window = 3L)
tcm = create_tcm(it_m, vectorizer,verbose=FALSE)

tcm.mat = as.matrix(tcm)
adj.mat = tcm.mat + t(tcm.mat)

# how about a quick view of the distilled COG as well, now that we're here?
diag(adj.mat) = 0     # set diagonals of the adj matrix to zero --> a node isn't its own neighbor
a0 = order(apply(adj.mat, 2, sum), decreasing = T)
adj.mat = as.matrix(adj.mat[a0[1:50], a0[1:50]])
#quartz()
distill.cog(adj.mat, 'Distilled COG for full corpus',  10,  10)

The distilled COG above shows how the most central terms of the three topics interlink and gives a summary view of the overall corpus.

Now, let's compute the lift and ETA proportion values, which help estimate the probability of each topic in each document.
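As computed below, the lift of a term in a topic is its within-topic probability (a theta entry) divided by its marginal probability in the corpus; a toy illustration with made-up numbers:

theta.term.topic = 0.004      # P(term | topic), i.e. one theta entry (hypothetical)
p.term           = 0.001      # marginal P(term) across the whole corpus (hypothetical)
theta.term.topic / p.term     # lift = 4: the term is 4x over-represented in that topic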

K = 3     # overriding model fit criterion

# -- run topic model for selected K -- #
summary( simfit <- topics(dtm,  K=K, verb=2), nwrd = 12 )
## 
## Estimating on a 56 document collection.
## Fitting the 3 topic model.
## log posterior increase: 2308.2, 638.2, 882.9, 547.5, done. (L = -1181739.6)
## 
## Top 12 phrases by topic-over-null term lift (and usage %):
## 
## [1] 'rousseff', 'resealable', 'lockers', 'derwent', 'flipcharts', 'glues', 'compasses', 'protractors', 'scrapbooks', 'gel', 'multiroom', 'endive' (61.3) 
## [2] 'polka', 'burnham', 'divorce', 'migrants', 'migration', 'andromeda', 'cdma', 'compareraja', 'stores_check', 'nom', 'knocks', 'sturgeon' (28.5) 
## [3] 'godfrey', 'snowman', 'rasamalaysia', 'souffle', 'princess', 'reinvented', 'expressive', 'troubleshooting', 'housewife', 'berman', 'fpo', 'buckeye' (10.2) 
## 
## Dispersion = 3.18
rownames1 = gsub(" ", ".", rownames(simfit$theta));  rownames(simfit$theta) = rownames1;  


## what are the factor components of the factorized DTM?

dim(dtm)     # size of the orig input matrix
## [1]   56 6663
str(simfit)     # structure of the output obj
## List of 6
##  $ K    : int 3
##  $ theta: num [1:6663, 1:3] 5.08e-10 5.08e-10 2.13e-09 1.78e-09 1.60e-09 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ phrase: chr [1:6663] "snowman" "sliders" "freeshipping" "inches_inches" ...
##   .. ..$ topic : chr [1:3] "1" "2" "3"
##  $ omega: num [1:56, 1:3] 0.000559 0.000292 0.00016 0.512922 0.808919 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ document: chr [1:56] "1" "2" "3" "4" ...
##   .. ..$ topic   : chr [1:3] "1" "2" "3"
##  $ BF   : NULL
##  $ D    :List of 3
##   ..$ dispersion: num 3.18
##   ..$ pvalue    : num 0
##   ..$ df        : num 322131
##  $ X    :List of 6
##   ..$ i       : int [1:55288] 56 56 54 54 54 53 51 51 51 51 ...
##   ..$ j       : int [1:55288] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ v       : num [1:55288] 5 26 10 8 7 36 14 14 24 24 ...
##   ..$ nrow    : int 56
##   ..$ ncol    : int 6663
##   ..$ dimnames:List of 2
##   .. ..$ Docs : chr [1:56] "1" "2" "3" "4" ...
##   .. ..$ Terms: chr [1:6663] "snowman" "sliders" "freeshipping" "inches_inches" ...
##   ..- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
##   ..- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
##  - attr(*, "class")= chr "topics"
dim(simfit$theta)   # analogous to factor loadings
## [1] 6663    3
dim(simfit$omega)   # analogous to factor scores 
## [1] 56  3

Theta Values:

Below are the theta values (term-by-topic probabilities) for the first few tokens:

simfit$theta[1:5,]
##                topic
## phrase                     1            2            3
##   snowman       5.075749e-10 1.845461e-09 2.920818e-04
##   sliders       5.075751e-10 1.845461e-09 1.518814e-03
##   freeshipping  2.126554e-09 3.513155e-04 2.933013e-09
##   inches_inches 1.776377e-09 2.810529e-04 2.933013e-09
##   adblock       1.603402e-09 2.459215e-04 2.933013e-09
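Since theta holds term-by-topic probabilities, the most probable terms for any single topic can be read off directly; a minimal sketch for topic 1:

head(sort(simfit$theta[, 1], decreasing = TRUE), 10)   # top 10 terms for topic 1 by theta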

Omega Values:

Below are the omega values (document-by-topic proportions) for the first few documents:

simfit$omega[1:5,]
##         topic
## document            1            2            3
##        1 0.0005590992 0.9989189550 0.0005219458
##        2 0.0002920402 0.9991193911 0.0005885687
##        3 0.0001596683 0.0003159075 0.9995244242
##        4 0.5129219500 0.4853479812 0.0017300689
##        5 0.8089190860 0.1887835890 0.0022973250
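Because each row of omega holds one document's topic proportions, a hard assignment of documents to topics takes one line; a minimal sketch:

doc.topic = apply(simfit$omega, 1, which.max)   # dominant topic for each document
table(doc.topic)                                # number of documents per dominant topic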
tst = round(ncol(dtm)/100)   # chunk size: roughly 1% of the columns
a = rep(tst,99)
b = cumsum(a);rm(a)
b = c(0,b,ncol(dtm))         # column break points for the chunks

ss.col = NULL
for (i in 1:(length(b)-1)) {
  tempdtm = dtm[,(b[i]+1):(b[i+1])]    # take one chunk of columns at a time
  s = colSums(as.matrix(tempdtm))      # term frequencies within the chunk
  ss.col = c(ss.col,s)
  #print(i)
}
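Since dtm is stored as a sparse simple_triplet_matrix, the chunked loop above can also be replaced by slam's sparse column sums (the slam package is already loaded); a short equivalent:

ss.col.alt = slam::col_sums(dtm)   # sparse column sums, no dense as.matrix() conversions
# all.equal(as.numeric(ss.col.alt), as.numeric(ss.col))   # should be TRUE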

theta = simfit$theta
lift = theta*0;       # lift will have same dimn as the theta matrix

sum1 = sum(dtm)
pterms = ss.col/sum1     # each column's marginal occurrence probability

for (i in 1:nrow(theta)){  
  for (j in 1:ncol(theta)){
    ptermtopic = 0; pterm = 0;
    ptermtopic = theta[i, j]
    pterm = pterms[i]
    lift[i, j] = ptermtopic/pterm     # divide each theta cell by the term's marginal occurrence probability
  }
}   
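The same lift matrix can be computed without explicit loops, assuming pterms follows the row order of theta (which the loop above also assumes); a minimal sketch:

lift.vec = sweep(theta, 1, pterms, "/")   # divide every row of theta by that term's marginal probability
# all.equal(as.numeric(lift.vec), as.numeric(lift))   # should be TRUE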

dim(lift); head(lift, 15)
## [1] 6663    3
##                    topic
## phrase                         1            2            3
##   snowman           1.497742e-05 5.445549e-05 8.618693e+00
##   sliders           2.880274e-06 1.047221e-05 8.618629e+00
##   freeshipping      3.137497e-05 5.183274e+00 4.327337e-05
##   inches_inches     3.276061e-05 5.183282e+00 5.409172e-05
##   adblock           3.379490e-05 5.183288e+00 6.181910e-05
##   emi_rs            2.629482e-04 5.182407e+00 1.199793e-05
##   mumbai_rajkot     1.446314e+00 3.081649e-05 4.525178e-03
##   gurgaon_delhi     1.446314e+00 3.081649e-05 4.525178e-03
##   pincode_locations 1.446627e+00 1.800933e-05 2.647700e-03
##   product_delivered 1.446627e+00 1.800933e-05 2.647700e-03
##   nasik             1.446015e+00 4.311127e-05 6.326799e-03
##   stoves            1.444967e+00 8.614228e-05 1.262663e-02
##   photoframes       1.444967e+00 8.614228e-05 1.262663e-02
##   conservation      1.444967e+00 8.614228e-05 1.262663e-02
##   merson            1.445316e+00 7.179873e-05 1.052770e-02

Lift Values:

A further slice of the lift matrix, for tokens 25 to 35, is shown below:

lift[25:35,]
##                 topic
## phrase                      1            2            3
##   adageunicorns  1.750409e-05 5.183362e+00 1.087037e-04
##   entrepreneur   2.786827e-03 5.173378e+00 1.731776e-04
##   charles_pierce 8.064532e-06 7.466233e-06 8.618585e+00
##   staggers       8.105816e-04 2.850908e-04 8.613571e+00
##   smartcooky     7.791184e-06 1.800563e-05 8.618614e+00
##   statesman      1.578546e-01 1.249039e-02 7.657820e+00
##   foodnavigator  1.441350e+00 2.057864e-02 9.240647e-05
##   confectionery  1.445557e+00 5.484517e-03 9.209458e-05
##   exhaustive     1.442577e+00 1.615626e-02 6.600552e-05
##   ratings_based  4.160594e-06 5.183346e+00 2.159149e-05
##   prices_ratings 4.160594e-06 5.183346e+00 2.159149e-05
# Generate A censored Lift matrix
censored.lift = lift
for (i in 1:nrow(lift)){
  censored.lift[i,][censored.lift[i,] < max(censored.lift[i,])] = 0   # hard assigning tokens to topics
} 

The censored lift values below hard-assign each token to the topic in which its lift is highest; its lift in all other topics is set to zero.

head(censored.lift, 10) 
##                    topic
## phrase                     1        2        3
##   snowman           0.000000 0.000000 8.618693
##   sliders           0.000000 0.000000 8.618629
##   freeshipping      0.000000 5.183274 0.000000
##   inches_inches     0.000000 5.183282 0.000000
##   adblock           0.000000 5.183288 0.000000
##   emi_rs            0.000000 5.182407 0.000000
##   mumbai_rajkot     1.446314 0.000000 0.000000
##   gurgaon_delhi     1.446314 0.000000 0.000000
##   pincode_locations 1.446627 0.000000 0.000000
##   product_delivered 1.446627 0.000000 0.000000
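A quick way to see how many tokens the censoring step hard-assigned to each topic:

colSums(censored.lift > 0)   # number of tokens assigned to each of the K topics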
t = Sys.time()

if(nrow(dtm) < 100) {k1 = 10} else {k1= 100}   # use fewer chunks when the corpus is small

tst = ceiling(nrow(dtm)/k1)  # chunk size: roughly nrow(dtm)/k1 rows at a time
a = rep(tst, (k1 - 1))
b = cumsum(a);rm(a)    # cumsum() is cumulative sum.
b = c(0, b, nrow(dtm))  # row break points: the dtm is processed in k1 chunks of rows
  a0 = which(b > nrow(dtm));    # sometimes, rounding errors cause out of bound errors
  if (length(a0) > 0) {b = b[-a0]}

eta.new = NULL
for (i1 in 1:K){
  
  a2 = c(NULL)
  for (i in 1:(length(b)-3)) {   # note: stopping at length(b)-3 covers only the first chunks of rows, so eta.new below has fewer rows (48) than dtm (56)
    tempdtm = dtm[(b[i]+1):(b[i+1]),]
    a = matrix(rep(lift[, i1], nrow(tempdtm)), nrow(tempdtm), ncol(tempdtm), byrow = TRUE)
    a1 = rowSums(as.matrix(tempdtm * a))
    a2 = c(a2, a1); rm(a, a1, tempdtm)
      } # i ends
  
  eta.new = cbind(eta.new, a2); rm(a2)
  
  } # i1 ends

Sys.time() - t  # timing; slower than the lift computation because ncol(dtm) is very large
## Time difference of 0.145869 secs
#rownames(eta.new) = rownames(simfit$omega)
colnames(eta.new) = colnames(simfit$theta)

# so what does eta.new look like? what does it mean?
dim(eta.new)
## [1] 48  3

ETA Proportion Values and Word Cloud for Each Latent Topic:

The ETA proportion values for each topic are calculated below. Each row corresponds to a document and each column to a latent topic; the entries in a row are that document's topic proportions and sum to one, so a document dominated by a single topic has values close to zero for the other topics.

#head(eta.new)

# eta.new = simfit$theta     # if error is happening, worst case

eta.propn = eta.new / rowSums(eta.new)   # calc topic proportions for each document
  eta.propn [1:5,]
##           1         2         3
## 1 0.1780223 0.6251038 0.1968739
## 2 0.1251950 0.7039112 0.1708938
## 3 0.1032872 0.1628087 0.7339040
## 4 0.2713160 0.5080330 0.2206510
## 5 0.4042983 0.3690693 0.2266324
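Each row of eta.propn should sum to one by construction; a quick check, along with the dominant topic per document for comparison with the omega-based view earlier:

summary(rowSums(eta.propn))             # every row should sum to exactly 1
table(apply(eta.propn, 1, which.max))   # dominant topic per document under eta.propn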
df.top.terms = data.frame(NULL)    # can't fit ALL terms in plot, so choose top ones with max loading

for (i in 1:K){       # For each topic 
  a0 = which(censored.lift[,i] > 1) # terms with lift greater than 1 for topic i
  freq = theta[a0, i] # theta values for those terms
  freq = sort(freq, decreasing = T) # terms with higher probabilities for topic i first
  
  # Auto correction - sometimes fewer than 100 terms in a topic have lift above 1, so cap n accordingly
  n = ifelse(length(freq) >= 100, 100, length(freq))
  top_word = as.matrix(freq[1:n])
  
  top.terms = row.names(top_word)
  df.top.terms.t = data.frame(topic = i, top.terms =top.terms, stringsAsFactors = F )
  df.top.terms = rbind(df.top.terms, df.top.terms.t  )
  
} # i loop ends

 
 # pdf(file = paste0(K,' Topic Model results.pdf')) # use pdf() func to save plot directly as PDFs in your getwd()

for (i in 1:K){       # For each topic 
  
  a0 = which(censored.lift[,i] > 1) # terms with lift greater than 1 for topic i
  freq = theta[a0,i] # theta values for those terms
  freq = sort(freq, decreasing = T) # terms with higher probabilities for topic i first
  
  # Auto correction - sometimes fewer than 100 terms in a topic have lift above 1, so cap n accordingly
  n = ifelse(length(freq) >= 100, 100, length(freq))
  top_word = as.matrix(freq[1:n])
  
  # SUB TCM
  sub.tcm = adj.mat[colnames(adj.mat) %in% names(a0),colnames(adj.mat) %in% names(a0)]
  
  #   Plot wordcloud
  #windows()
  wordcloud(rownames(top_word), top_word,  scale=c(4,.2), 1,
            random.order=FALSE, random.color=FALSE, 
            colors=brewer.pal(8, "Dark2"))
  mtext(paste("Latent Topic",i,"-Word Cloud"), side = 2, line = 0, cex=2)

  # PLot TCM
  #windows()
  distill.cog(sub.tcm, '',  5,  5)
  mtext(paste("Latent Topic",i,"-cog"), side = 2, line = 0, cex=2)
  #mtext(paste("Term co-occurrence - Topic",i), side = 3, line = 2, cex=2)
    
  } # i loop ends

show.top.loading.rows = function(eta.obj, number.of.units){
  
  K = ncol(eta.obj)    # no. of topic factors
  n = number.of.units
  top.loaders = NULL
  for (i in 1:K){
    a0 = order(eta.obj[,i], decreasing = TRUE)
    a1 = rownames(eta.obj[a0[1:n],])
    top.loaders = cbind(a1, top.loaders)
  } # i loop ends

  return(top.loaders)
  
} # func ends

The latent-topic word clouds and COGs above show that the model classifies each subject correctly. The tables below list the documents that load most heavily on each topic.

show.top.loading.rows(eta.propn, 20)
##       a1   a1   a1  
##  [1,] "3"  "2"  "24"
##  [2,] "20" "29" "27"
##  [3,] "13" "1"  "31"
##  [4,] "43" "38" "15"
##  [5,] "6"  "41" "10"
##  [6,] "48" "47" "32"
##  [7,] "18" "25" "34"
##  [8,] "36" "40" "26"
##  [9,] "39" "22" "16"
## [10,] "26" "8"  "28"
## [11,] "28" "4"  "23"
## [12,] "21" "45" "17"
## [13,] "14" "12" "30"
## [14,] "45" "33" "9" 
## [15,] "19" "35" "39"
## [16,] "22" "7"  "11"
## [17,] "11" "37" "44"
## [18,] "30" "21" "46"
## [19,] "32" "46" "5" 
## [20,] "25" "42" "36"
show.top.loading.rows(eta.new, 10)
##       a1   a1   a1  
##  [1,] "3"  "2"  "24"
##  [2,] "13" "41" "26"
##  [3,] "20" "13" "28"
##  [4,] "26" "28" "31"
##  [5,] "28" "26" "15"
##  [6,] "48" "12" "30"
##  [7,] "15" "1"  "13"
##  [8,] "24" "22" "34"
##  [9,] "31" "14" "9" 
## [10,] "43" "31" "32"
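The IDs above are document row names, so any of them can be matched back to the combined text to spot-check an assignment; a minimal sketch (the choice of document 3 is just illustrative, picked from the first table above):

doc.id = 3                                     # one of the top-loading documents listed above
substr(data$text[data$id == doc.id], 1, 300)   # peek at its first 300 characters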

Final Observations:

QNo.1. Is the topic model able to separate each subject from the other subjects? To what extent is it able to do so?

The topic model is able to separate each subject from the others. However, some documents contain content on both Food and Politics, so a few words appear in both of those latent topics.

QNo.2. Are there mixed tokens (with high lift in more than one topic)? Are the highest LIFT tokens and the document topic proportions (ETA scores) clear and able to identify each topic?

There are only a few mixed tokens (tokens with high lift in more than one topic). The ETA proportion values are clear and are able to identify each topic individually.

QNo.3. What are the learnings from this exercise?

Below are the learnings from this exercise:

1. Even though the three topics were combined into a single corpus, the empirical topic model successfully identified each topic.

2. The model classifies the tokens, and the theta and omega values clearly indicate which topic a token belongs to, suggesting the model is highly effective.

3. The latent topics obtained from the model are as expected. If the value of K is increased, the token classification should move even closer to a perfect model.
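On the last point, K need not be fixed by hand: maptpx's topics() accepts a vector of candidate K values and selects among them by Bayes factor. A minimal sketch (the candidate range 2:6 is an arbitrary choice here):

simselect = topics(dtm, K = 2:6, verb = 1)   # fit several K and keep the best by Bayes factor
simselect$K                                  # the number of topics selected
summary(simselect, nwrd = 12)                # top phrases under the selected K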