review <- read.csv2("review.csv", encoding = "UTF-8")
library(rsample)
## Warning: package 'rsample' was built under R version 3.5.3
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.5.3
library(tidytext)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v readr 1.3.1 v stringr 1.4.0
## v ggplot2 3.2.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.5.2
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(fastNaiveBayes)
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.2
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
library(textclean)
## Warning: package 'textclean' was built under R version 3.5.3
# library("CHAID")
library(Rfast)
## Warning: package 'Rfast' was built under R version 3.5.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.5.3
##
## Attaching package: 'Rcpp'
## The following object is masked from 'package:rsample':
##
## populate
## Loading required package: RcppZiggurat
## Warning: package 'RcppZiggurat' was built under R version 3.5.3
##
## Attaching package: 'Rfast'
## The following object is masked from 'package:dplyr':
##
## nth
## The following objects are masked from 'package:purrr':
##
## is_integer, transpose
library(Boruta)
## Warning: package 'Boruta' was built under R version 3.5.3
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.5.3
library(slam)
## Warning: package 'slam' was built under R version 3.5.3
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.5.3
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
table(review$Sex)
##
## N/A Ж Ж М
## 124 2 1963 303
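# The duplicated "Ж" level above comes from stray whitespace in the raw Sex
# column; a quick check (sketch, not run here) would be:
# table(trimws(review$Sex))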
review <- read_csv2("review.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
## Author = col_character(),
## Title = col_character(),
## Nickname = col_character(),
## Sex = col_character(),
## Date = col_character(),
## Review = col_character()
## )
review <- review %>%
mutate(Sex = trimws(Sex)) %>%
filter(Sex %in% c("М", "Ж")) %>%
mutate(Review = replace_emoticon(Review, emoticon_dt = lexicon::hash_emoticons)) %>%
mutate(Review = replace_number(Review, remove = TRUE)) %>%
mutate(Review = gsub("){2,}", " multibrackethappy ", Review)) %>%
mutate(Review = gsub("\\({2,}", " multibracketsad ", Review)) %>%
mutate(Review = mgsub(Review, c("«", "»", "[[:digit:]]"), "")) %>%
mutate(Review = gsub("...", " multipdots ", Review, fixed = TRUE)) %>%
mutate(Review = gsub("[!]{2,}", " multiexlam ", Review)) %>%
mutate(Review = gsub("[?]{2,}", " multiquest ", Review)) %>%
select(Sex, Review)
set.seed(123)
split_review <- initial_split(review, strata = Sex)
review.train <- training(split_review)
review.test <- testing(split_review)
blanket.review <- initial_split(review.train, strata = Sex)
review.train2 <- training(blanket.review)
review.eval <- testing(blanket.review)
rm(blanket.review)
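# Sanity check (sketch, not run here): the stratified splits should keep
# roughly the same class balance as the full data set.
# prop.table(table(review.train$Sex))
# prop.table(table(review.test$Sex))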
review.feature <- review.train2 %>%
unnest_tokens(word, Review) %>%
filter(! word %in% stopwords("ru")) %>%
count(word, Sex) %>%
spread(Sex, n, fill = 0)
names <- review.feature %>%
mutate(n = Ж + М) %>%
filter(n >= 5) %>%
select(word) %>%
c()
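# `names` now holds the vocabulary of words seen at least 5 times in the inner
# training set (note that it shadows base::names). A quick size check (sketch):
# length(names[["word"]])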
train_corp <- VCorpus(VectorSource(review.train$Review))
test_corp <- VCorpus(VectorSource(review.test$Review))
# DTM TRAIN
train_sw <- DocumentTermMatrix(
train_corp,
control = list(
tolower = TRUE,
language = "ru",
removeNumbers = TRUE,
removePunctuation = FALSE,
stemming = FALSE,
# remove stop words here via the DTM control list
stopwords = stopwords("ru")
)
)
# Optionally remove sparse terms (left commented out)
# train_sw_sparsed <- removeSparseTerms(train_sw, 0.95)
# DTM TEST
test_sw <- DocumentTermMatrix(
test_corp,
control = list(
tolower = TRUE,
removeNumbers = TRUE,
removePunctuation = FALSE,
stemming = FALSE,
stopwords = stopwords("ru")
)
)
# test_sw_sparsed <- removeSparseTerms(test_sw, 0.95)
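# findFreqTerms(train_sw, 5) keeps only terms occurring at least 5 times in the
# training DTM; the test DTM keeps everything (threshold 1) and is restricted
# to the training vocabulary later, right before prediction.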
freq_train_sw <- train_sw %>%
findFreqTerms(5) %>%
train_sw[ , .]
freq_test_sw <- test_sw %>%
findFreqTerms(1) %>%
test_sw[ , .]
# freq_eval_sw <- eval_sw %>%
# findFreqTerms(1) %>%
# test_sw[ , .]
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
}
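# convert_counts() turns raw term counts into presence/absence indicators,
# which is the input the Bernoulli model expects, e.g. convert_counts(c(0, 2, 5))
# returns 0 1 1.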
train_sw_bi <- freq_train_sw %>%
apply(MARGIN = 2, convert_counts)
test_sw_bi <- freq_test_sw %>%
apply(MARGIN = 2, convert_counts)
####### Testing naivebayes package
class_sw_nb <- bernoulli_naive_bayes(as.matrix(train_sw_bi), as.factor(review.train$Sex), laplace = 1)
test_sw_bi_c <- test_sw_bi[,colnames(test_sw_bi) %in% colnames(train_sw_bi)]
sw_bi_pred_nb <- predict(class_sw_nb, as.matrix(test_sw_bi_c))
## Warning: predict.bernoulli_naive_bayes(): Only 6364 feature(s) out of 7498 defined in the naive_bayes object "class_sw_nb" are used for prediction
#######
####### Continue testing the naivebayes package: the multinomial model does not look as good
class_sw_2_nb <- multinomial_naive_bayes(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
freq_test_sw_c <- freq_test_sw[,colnames(freq_test_sw) %in% colnames(freq_train_sw)]
sw_mn_pred_nb <- predict(class_sw_2_nb, as.matrix(freq_test_sw_c))
## Warning: predict.multinomial_naive_bayes(): Only 6364 feature(s) out of 7498 defined in the naive_bayes object "class_sw_2_nb" are used for prediction
#######
# Bernoulli
CrossTable(sw_bi_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
prop.t = FALSE,
dnn = c("Predicted", "Actual"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 567
##
##
## | Actual
## Predicted | Ж | М | Row Total |
## -------------|-----------|-----------|-----------|
## Ж | 439 | 56 | 495 |
## | 0.887 | 0.113 | 0.873 |
## | 0.889 | 0.767 | |
## -------------|-----------|-----------|-----------|
## М | 55 | 17 | 72 |
## | 0.764 | 0.236 | 0.127 |
## | 0.111 | 0.233 | |
## -------------|-----------|-----------|-----------|
## Column Total | 494 | 73 | 567 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
# Multinomial
CrossTable(sw_mn_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
prop.t = FALSE,
dnn = c("Predicted", "Actual"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 567
##
##
## | Actual
## Predicted | Ж | М | Row Total |
## -------------|-----------|-----------|-----------|
## Ж | 483 | 63 | 546 |
## | 0.885 | 0.115 | 0.963 |
## | 0.978 | 0.863 | |
## -------------|-----------|-----------|-----------|
## М | 11 | 10 | 21 |
## | 0.524 | 0.476 | 0.037 |
## | 0.022 | 0.137 | |
## -------------|-----------|-----------|-----------|
## Column Total | 494 | 73 | 567 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
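# Overall accuracy for both models (sketch, not run here; the same counts are
# in the tables above):
# mean(sw_bi_pred_nb == review.test$Sex)   # Bernoulli
# mean(sw_mn_pred_nb == review.test$Sex)   # multinomial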
# fit <- caret::train(
# as.matrix(train_sw_bi), as.factor(review.train$Sex), "nb",
# trControl = trainControl(method = "cv", number = 10))
#
# kfoldbi <- predict(fit$finalModel, as.matrix(test_sw_bi_c))
review.train1_tfidf2 <- review.train %>%
unnest_tokens(word, Review) %>%
filter(! word %in% stopwords("ru")) %>%
filter(word %in% names[["word"]]) %>%
count(word, Sex) %>%
# spread(Sex, n, fill = 0) %>%
bind_tf_idf(word, Sex, n) %>%
# filter(tf_idf >0) %>%
cast_dtm(Sex, word , tf_idf)
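# cast_dtm(Sex, word, tf_idf) collapses the training data into a two-row
# document-term matrix, one "document" per class, weighted by tf-idf. The
# labels passed to multinomial_naive_bayes() below assume the rows come out in
# the order c("Ж", "М"); a safer sketch would read them off the matrix itself:
# multinomial_naive_bayes(as.matrix(review.train1_tfidf2),
#                         as.factor(rownames(review.train1_tfidf2)))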
# FINAL TESTING DATA
MNV_TRAIN2 <- multinomial_naive_bayes(as.matrix(review.train1_tfidf2), as.factor(c("Ж", "М")))
mnv_res2 <- predict(MNV_TRAIN2, as.matrix(test_sw_bi_c))
## Warning: predict.multinomial_naive_bayes(): Only 4003 feature(s) out of 5892 defined in the naive_bayes object "MNV_TRAIN2" are used for prediction
## Warning: predict.multinomial_naive_bayes(): More features in the newdata
## are provided as there are parameter estimates in the object. Calculation is
## performed based on features to be found in the object.
# Multinomial TESTING
CrossTable(mnv_res2, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
prop.t = FALSE,
dnn = c("Predicted", "Actual"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 567
##
##
## | Actual
## Predicted | Ж | М | Row Total |
## -------------|-----------|-----------|-----------|
## Ж | 132 | 2 | 134 |
## | 0.985 | 0.015 | 0.236 |
## | 0.267 | 0.027 | |
## -------------|-----------|-----------|-----------|
## М | 362 | 71 | 433 |
## | 0.836 | 0.164 | 0.764 |
## | 0.733 | 0.973 | |
## -------------|-----------|-----------|-----------|
## Column Total | 494 | 73 | 567 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
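# From the table above, the tf-idf model assigns most reviews to "М" even though
# about 87% of the test set is "Ж", so its raw accuracy is well below the
# Bernoulli model's. A quick check (sketch, not run here):
# mean(mnv_res2 == review.test$Sex)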
# THE LITTLE GRAVEYARD STARTS HERE
#
# ####CURSED FUNCTIONS####
# # gscore <- g2Test_univariate(as.matrix(review.feature[,-1]), 2)
# # # get row and colnames in order
# # # review.train2 <- review.train2[,-4]
# # colnames(review.train_test) <- rownames(review.train2)
# # rownames(review.train_test) <- colnames(review.train2)
# #
# # colSums(as.matrix(review.train_test[,-1]))
#
# # review.train_test <- CHAID::chaid(rowname ~ ., review.train_test)
#
#
# # review.train_test <- review.train_test %>%
# # select(names[["word"]])
#
# review.eval <- review.eval %>%
# unnest_tokens(word, Review) %>%
# filter(! word %in% stopwords("ru")) %>%
# count(word, Sex) %>%
# spread(Sex, n, fill = 0)
#
# # unnest_tokens(word, Review) %>%
# # filter(! word %in% stopwords("ru")) %>%
# # count(word, Sex) %>%
# # spread(Sex, n, fill = 0)
#
#
#
# train_corp <- VCorpus(VectorSource(review.train$Review))
# test_corp <- VCorpus(VectorSource(review.test$Review))
#
# # DTM TRAIN
# train_sw <- DocumentTermMatrix(
# train_corp,
# control = list(
# tolower = TRUE,
# language = "ru",
# removeNumbers = TRUE,
# removePunctuation = FALSE,
# stemming = FALSE,
# # updating for remove sw
# stopwords = stopwords("ru")
# )
# )
# # Remove sparcity for some reason
#
# train_sw_sparsed <- removeSparseTerms(train_sw, 0.95)
#
# # DTM TEST
# test_sw <- DocumentTermMatrix(
# test_corp,
# control = list(
# tolower = TRUE,
# removeNumbers = TRUE,
# removePunctuation = FALSE,
# stemming = FALSE,
# stopwords = stopwords("ru")
# )
# )
#
#
#
# test_sw_sparsed <- removeSparseTerms(test_sw, 0.95)
#
# freq_train_sw <- train_sw %>%
# findFreqTerms(5) %>%
# train_sw[ , .]
# freq_test_sw <- test_sw %>%
# findFreqTerms(1) %>%
# test_sw[ , .]
#
# convert_counts <- function(x) {
# x <- ifelse(x > 0, 1, 0)
# }
#
# # Trying remove step with freq !!!BAD IDEA!!!
# # train_sw_bi <- train_sw %>%
# # apply(MARGIN = 2, convert_counts)
# # test_sw_bi <- test_sw %>%
# # apply(MARGIN = 2, convert_counts) !!!BAD IDEA!!!
# # TRYING TRYING
#
# train_sw_bi <- freq_train_sw %>%
# apply(MARGIN = 2, convert_counts)
#
# test_sw_bi <- freq_test_sw %>%
# apply(MARGIN = 2, convert_counts)
#
# class_sw <- fnb.bernoulli(as.matrix(train_sw_bi), as.factor(review.train$Sex), laplace = 1)
#
# ####### Testing naivebayes package
#
# class_sw_nb <- bernoulli_naive_bayes(as.matrix(train_sw_bi), as.factor(review.train$Sex), laplace = 1)
# test_sw_bi_c <- test_sw_bi[,colnames(test_sw_bi) %in% colnames(train_sw_bi)]
# sw_bi_pred_nb <- predict(class_sw_nb, as.matrix(test_sw_bi_c))
# #######
#
# ####### Continue testing nb package multinomial doesn't look that good
# class_sw_2_nb <- multinomial_naive_bayes(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
# freq_test_sw_c<- freq_test_sw[,colnames(freq_test_sw) %in% colnames(freq_train_sw)]
# sw_mn_pred_nb <- predict(class_sw_2_nb, as.matrix(freq_test_sw_c))
#
# #######
#
# ####### Continue testing nb package Poisson doesn't look that good
# class_sw_3_nb <- poisson_naive_bayes(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
# sw_po_pred_nb <- predict(class_sw_2_nb, as.matrix(freq_test_sw))
#
# #######
#
# class_sw_2 <- fnb.multinomial(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
#
#
#
# freq_test_sw_c<- freq_test_sw[,colnames(freq_test_sw) %in% colnames(freq_train_sw)]
#
# test_sw_bi_c <- test_sw_bi[,colnames(test_sw_bi) %in% colnames(train_sw_bi)]
# # left-join columns?
# sw_bi_pred <- predict(class_sw, as.matrix(test_sw_bi_c))
#
# sw_mn_pred <- predict(class_sw_2, freq_test_sw_c)
#
# # Bernoulli
# CrossTable(sw_bi_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
# prop.t = FALSE,
# dnn = c("Predicted", "Actual"))
#
# # Multinomial
# CrossTable(sw_mn_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
# prop.t = FALSE,
# dnn = c("Predicted", "Actual"))
#
# # Poisson
# CrossTable(sw_po_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
# prop.t = FALSE,
# dnn = c("Predicted", "Actual"))
#
#
# # WOW MORE SUPER COOL TESTS
#
# # names <- intersect(class_sw$names, colnames(test_sw_bi_c))
# # newdata <- test_sw_bi_c[, names]
# # if (length(names) == 1) {
# # newdata <- as.matrix(newdata)
# # colnames(newdata) <- names
# # }
# #
# # if(length(object$names)!=length(names)){
# # if(!silent){
# # warning('Columns in test and train set not equal! Newdata is padded with zeros')
# # }
#
# test <- freq_train_sw %>%
# apply(MARGIN = 2, sum)
#
# ##### Here is the dead space
#
# # train_corp_q <- quanteda::corpus(as.character(review.train$Review))
# # test_corp_q <- quanteda::corpus(as.character(review.test$Review))
# #
# # train_matrix <- quanteda::dfm(train_corp_q,
# # tolower = T,
# # stem = F,
# # remove = stopwords('russian'))
# # test_matrix <- quanteda::dfm(test_corp_q,
# # tolower = T,
# # stem = F,
# # remove = stopwords('russian')
# # )
# # train_matrix@x[train_matrix@x > 1] <- 1
# # test_matrix@Dimnames$features
# # str(test_matrix)
# # test_matrix@x[test_matrix@x > 1] <- 1
# # train_matrix <- train_matrix[,
# # (quanteda::featnames(train_matrix) %in% quanteda::featnames(test_matrix))]
# # train_matrix <- train_matrix[,
# # (colnames(train_matrix) %in% colnames(test_matrix))]
# # test_matrix <- test_matrix[,
# # (quanteda::featnames(test_matrix) %in% quanteda::featnames(train_matrix))]
# # test_matrix <- test_matrix[,
# # (colnames(test_matrix) %in% colnames(train_matrix))]
# # str(test_matrix)
# convert_sex <- function(x) {
# x <- ifelse(x == "М", 1, 0)
# }
# review.train$Sex <- convert_sex(review.train$Sex)
# review.test$Sex <- convert_sex(review.test$Sex)
# # est <- trainNB(review.train$Sex, train_matrix, smoothing = 'normalized')
# # out <- classifyNB_2(est, test_matrix, data.frame(review.test$Review))
# #
# #
# # str(est[["w_jc"]])
#
# # OPTIONAL: remove stop words here because we failed to do it in the DTM step
# # Clean the text first, then remove the stop words
# # train_corp <- tm_map(train_corp, removeWords, stopwords(kind = "ru"))
# # test_corp <- tm_map(test_corp, removeWords, stopwords(kind = "ru"))
#
# # Some miscellaneous testing
#
# install.packages("remotes")
# remotes::install_github("mattwloftis/agendacodeR")
#
# library(agendacodeR)
#
# train_corpus <- quanteda::corpus(x = training_agendas$text)
# train_matrix <- quanteda::dfm(train_corpus,
# language = "danish",
# stem = TRUE,
# removeNumbers = FALSE)
#
# test_corpus <- quanteda::corpus(x = test_agendas$text)
# test_matrix <- quanteda::dfm(test_corpus,
# language = "danish",
# stem = TRUE,
# removeNumbers = FALSE)
# str(train_corpus)
#
# ## Convert matrix of frequencies to matrix of indicators
# train_matrix@x[train_matrix@x > 1] <- 1
# test_matrix@x[test_matrix@x > 1] <- 1
#
# ## Dropping training features not in the test set
# train_matrix <- train_matrix[, (colnames(train_matrix) %in% colnames(test_matrix))]
#
#
#
# ## Convert matrix of frequencies to matrix of indicators
# train_matrix@x[train_matrix@x > 1] <- 1
# test_matrix@x[test_matrix@x > 1] <- 1
#
# ## Dropping training features not in the test set
# head(test_agendas)
#
#
# est <- trainNB(training_agendas$coding, train_matrix)
#
# test_matrix <- test_matrix[, (colnames(test_matrix) %in% names(est[[2]]))]
#
# out <- classifyNB(est, test_matrix, test_agendas)
#
# acc <- catAccuracy(true = out$coding, predicted = out$ratio_match)
#
# topWords(est)
#
# est$
#
# training_agendas %>%
# filter(id %in% test_agendas$id) %>%
# select(coding)
#
# ## Load data and create document-feature matrices
# # train_corpus <- quanteda::corpus(x = as.character(review.train$Review))
# # train_matrix <- quanteda::dfm(train_corpus,
# # language = "ru",
# # stem = TRUE,
# # removeNumbers = TRUE)
# #
# #
# # test_corpus <- quanteda::corpus(x = as.character(review.test$Review))
# # test_matrix <- quanteda::dfm(test_corpus,
# # language = "ru",
# # removeNumbers = TRUE)
#
#
# # ## Convert matrix of frequencies to matrix of indicators
# # train_matrix@x[train_matrix@x > 1] <- 1
# # test_matrix@x[test_matrix@x > 1] <- 1
#
# # Convert Sexes for this package
# convert_sex <- function(x) {
# x <- ifelse(x == "М", 1, 0)
# }
#
# # One more idk correction
# test_sw <- test_sw[, (colnames(test_sw) %in% colnames(test_matrix))]
#
# # Another one convertion!
#
# test_sw_bi <- test_sw_bi[,(featnames(as.dfm(test_sw_bi)) %in% featnames(as.dfm(train_sw_bi)))]
#
#
# test <- as.dfm(train_sw_bi)
# str(test)
# str(train_sw)
#
#
#
# ## Dropping training features not in the test set
# test_sw_bi_upd <- as.dfm(test_sw_bi)
# train_sw_bi_upd <- train_sw_bi %>% as.dfm %>% .[, (colnames(train_sw_bi) %in% colnames(test_sw_bi_upd))]
# # test_sw_bi_upd <- test_sw_bi_upd %>% .[, (colnames(test_sw_bi_upd) %in% colnames(train_sw_bi_upd))]
#
# est <- trainNB(review.train$Sex, train_sw_bi_upd)
#
# # test_sw_bi <- test_sw_bi[, featnames(as.dfm(test_sw_bi)) %in% names(est[["w_jc"]])]
# #
# # typo <- est[["w_jc"]]
# #
# # typo_2 <- typo[intersect(names(typo), colnames(test_sw_bi))]
# #
# # est[["w_jc"]] <- typo_2
# out <- classifyNB(est, test_sw_bi_upd, as.data.frame(review.test$Review))
# acc <- catAccuracy(true = out$coding, predicted = out$ratio_match)
#
# sum(as.numeric(acc$frequency))
#
# colnames(train_sw_bi)
#
# # LAST THING I WILL TRY WITH YOU
#
#
# test <- est[["w_0c"]]
# test <- data.frame(t(test))
# test[2,]
# test <- test[intersect(names(test), colnames(test_matrix))]
# str(test)
# head(test_matrix) %*% head(t(test_v))
# str(test_matrix)
#
# test_v <- as.numeric(as.matrix(test)); names(test_v) <- names(test)
# head(test_v)
#
# classifyNB_2 <- function(est, test_matrix, test) {
# w_0c <- est[[1]]
# w_jc <- est[[2]]
# # w_jc1 <- data.frame(t(w_jc))
# nc <- est[[3]]
# pc <- est[[4]]
#
# # test_matrix <- test_matrix[,(colnames(test_matrix) %in% names(w_jc1))]
# # test_matrix <- test_matrix[,(quanteda::featnames(test_matrix) %in% names(w_jc1))]
# # w_jc1 <- w_jc1[, colnames(test_matrix)]
# # w_jc1 <- w_jc1[, quanteda::featnames(test_matrix)]
# # w_0c <- w_0c[intersect(names(w_0c), colnames(test_matrix))]
# # w_0c <- w_0c[intersect(names(w_0c), quanteda::featnames(test_matrix))]
#
# # w_jc <- as.numeric(as.matrix(w_jc1)); names(w_jc) <- names(w_jc1)
#
# ## GETTING POSTERIOR CLASS PROBABILITIES FOR TEST SET
# term.appearance <- test_matrix %*% t( w_jc ) #Calculate w_jc * x_i (n x c)
# log_odds <- t( term.appearance ) + (w_0c) #full log-odds for c-1 non-reference categories
# odds <- cbind(exp( t( log_odds ) ), rep(1, ncol(log_odds))) #get odds and add column of 1s for reference category (n x c)
# denominator <- rowSums(odds)
# probs <- odds/denominator
# colnames(probs) <- names(nc) #make col names of results matrix the categories
#
# ## NOTE MATCHES AND PROBABILITY RATIOS
# unconditional_test <- t(probs)>pc # (c x n)
# identify_max <- probs==apply(probs,1,max) # (n x c)
# ratios_to_unconditional <- t(t(probs)/pc)
# max_ratios <- ratios_to_unconditional==apply(ratios_to_unconditional,1,max)
# if (all(rowSums(identify_max)==1)){ #MAXIMUM PROBABILITIES -- WITH ERROR CATCHING
# test$max_posterior <- as.vector(t(probs))[as.vector(t(identify_max))]
# } else {
# test$max_posterior <- NA
# row.picker <- rowSums(identify_max)==1
# test$max_posterior[row.picker] <- as.vector(t(probs[row.picker,]))[as.vector(t(identify_max[row.picker,]))]
#
# for (j in 1:length(test$max_posterior[row.picker==FALSE])){
# test$max_posterior[row.picker==FALSE][j] <- max(probs[row.picker==FALSE,][j,])
# }
# }
# if (all(rowSums(max_ratios)==1)){ #fill in maximum ratios with error-catching
# test$max_ratios <- as.vector(t(ratios_to_unconditional))[as.vector(t(max_ratios))]
#
# } else if (sum((apply(max_ratios,1,sum)==1)==FALSE)==1) {
# test$max_ratios[apply(max_ratios,1,sum)==1] <- as.vector(t(ratios_to_unconditional[apply(max_ratios,1,sum)==1]))[as.vector(t(max_ratios[apply(max_ratios,1,sum)==1]))]
# test$max_ratios[(apply(max_ratios,1,sum)==1)==FALSE] <- max(ratios_to_unconditional[(apply(max_ratios,1,sum)==1)==FALSE,])
#
# } else {
# test$max_ratios <- NA
# row.picker <- apply(max_ratios,1,sum)==1
# test$max_ratios[row.picker] <- as.vector(t(ratios_to_unconditional[row.picker,]))[as.vector(t(max_ratios[row.picker,]))]
#
# for (j in 1:length(test$max_ratios[row.picker==FALSE])){
# test$max_ratios[row.picker==FALSE][j] <- mean(ratios_to_unconditional[row.picker==FALSE,][j,])
#
# }
# }
# name_matrix <- matrix(rep(colnames(probs),nrow(probs)),nrow=nrow(probs),ncol=ncol(probs),byrow=T)
# if (all(rowSums(identify_max)==1)){ #fill in max prob category names with error-catching
# test$max_match <- as.vector(t(name_matrix))[as.vector(t(as.matrix(identify_max)))]
# } else {
# test$max_match <- NA
# row.picker <- apply(identify_max,1,sum)==1
# test$max_match[row.picker] <- as.vector(t(name_matrix[row.picker,]))[as.vector(t(identify_max[row.picker,]))]
#
# for (j in 1:length(test$max_match[row.picker==FALSE])){
# boolian.vector <- max_ratios[row.picker==FALSE,][j,]
# # test$max_match[row.picker==FALSE][j] <- paste(colnames(probs)[boolian.vector],collapse="; ")
# class.probs <- pc[colnames(probs)[boolian.vector]]
# test$max_match[row.picker==FALSE][j] <- names(class.probs)[which(class.probs==min(class.probs))]
#
# }
# }
# if (all(rowSums(max_ratios)==1)){ #fill in max ratio category names with error-catching
# test$ratio_match <- as.vector(t(name_matrix))[as.vector(t(as.matrix(max_ratios)))]
#
# } else if (sum((rowSums(max_ratios)==1)==FALSE)==1) {
# test$ratio_match <- NA
# row.picker <- apply(max_ratios,1,sum)==1
# test$ratio_match[row.picker] <- as.vector(t(name_matrix[row.picker,]))[as.vector(t(max_ratios[row.picker,]))]
# boolian.vector <- max_ratios[(row.picker)==FALSE,]
# class.probs <- pc[colnames(probs)[boolian.vector]]
# test$ratio_match[row.picker==FALSE] <- names(class.probs)[which(class.probs==min(class.probs))]
#
# } else {
# test$ratio_match <- NA
# row.picker <- rowSums(max_ratios)==1
# test$ratio_match[row.picker] <- as.vector(t(name_matrix[row.picker,]))[as.vector(t(max_ratios[row.picker,]))]
#
# for (j in 1:length(test$ratio_match[row.picker==FALSE])){
# boolian.vector <- max_ratios[row.picker==FALSE,][j,]
# class.probs <- pc[colnames(probs)[boolian.vector]]
# if(all((sum(class.probs)/length(class.probs))==class.probs)){
# test$ratio_match[row.picker==FALSE][j] <- sample(names(class.probs), 1)
# } else {
# test$ratio_match[row.picker==FALSE][j] <- names(class.probs)[which(class.probs==min(class.probs))]
#
# }
# }
# }
# return(test)
# }