review <- read.csv2("review.csv", encoding = "UTF-8")
library(rsample)
## Warning: package 'rsample' was built under R version 3.5.3
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.5.3
library(tidytext)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v readr 1.3.1 v stringr 1.4.0
## v ggplot2 3.2.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.5.2
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(fastNaiveBayes)
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.2
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
library(textclean)
## Warning: package 'textclean' was built under R version 3.5.3
# library("CHAID")
library(Rfast)
## Warning: package 'Rfast' was built under R version 3.5.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.5.3
##
## Attaching package: 'Rcpp'
## The following object is masked from 'package:rsample':
##
## populate
## Loading required package: RcppZiggurat
## Warning: package 'RcppZiggurat' was built under R version 3.5.3
##
## Attaching package: 'Rfast'
## The following object is masked from 'package:dplyr':
##
## nth
## The following objects are masked from 'package:purrr':
##
## is_integer, transpose
library(Boruta)
## Warning: package 'Boruta' was built under R version 3.5.3
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.5.3
library(slam)
## Warning: package 'slam' was built under R version 3.5.3
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.5.3
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
table(review$Sex)
##
## N/A Ж Ж М
## 124 2 1963 303
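# The duplicated "Ж" level above comes from stray whitespace in the raw Sex
# column; a quick check (sketch, not run here) would be:
# table(trimws(review$Sex))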
review <- read_csv2("review.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
## Author = col_character(),
## Title = col_character(),
## Nickname = col_character(),
## Sex = col_character(),
## Date = col_character(),
## Review = col_character()
## )
review <- review %>%
mutate(Sex = trimws(Sex)) %>%
filter(Sex %in% c("М", "Ж")) %>%
mutate(Review = replace_emoticon(Review, emoticon_dt = lexicon::hash_emoticons)) %>%
mutate(Review = replace_number(Review, remove = TRUE)) %>%
mutate(Review = gsub("){2,}", " multibrackethappy ", Review)) %>%
mutate(Review = gsub("\\({2,}", " multibracketsad ", Review)) %>%
mutate(Review = mgsub(Review, c("«", "»", "[[:digit:]]"), "")) %>%
mutate(Review = gsub("...", " multipdots ", Review, fixed = TRUE)) %>%
mutate(Review = gsub("[!]{2,}", " multiexlam ", Review)) %>%
mutate(Review = gsub("[?]{2,}", " multiquest ", Review)) %>%
select(Sex, Review)
set.seed(123)
split_review <- initial_split(review, strata = Sex)
review.train <- training(split_review)
review.test <- testing(split_review)
blanket.review <- initial_split(review.train, strata = Sex)
review.train2 <- training(blanket.review)
review.eval <- testing(blanket.review)
rm(blanket.review)
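# Sanity check (sketch, not run here): the stratified splits should keep
# roughly the same class balance as the full data set.
# prop.table(table(review.train$Sex))
# prop.table(table(review.test$Sex))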
review.feature <- review.train2 %>%
unnest_tokens(word, Review) %>%
filter(! word %in% stopwords("ru")) %>%
count(word, Sex) %>%
spread(Sex, n, fill = 0)
names <- review.feature %>%
mutate(n = Ж + М) %>%
filter(n >= 5) %>%
select(word) %>%
c()
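# `names` now holds the vocabulary of words seen at least 5 times in the inner
# training set (note that it shadows base::names). A quick size check (sketch):
# length(names[["word"]])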
train_corp <- VCorpus(VectorSource(review.train$Review))
test_corp <- VCorpus(VectorSource(review.test$Review))
# DTM TRAIN
train_sw <- DocumentTermMatrix(
train_corp,
control = list(
tolower = TRUE,
language = "ru",
removeNumbers = TRUE,
removePunctuation = FALSE,
stemming = FALSE,
# remove stop words here via the DTM control list
stopwords = stopwords("ru")
)
)
# Optionally remove sparse terms (left commented out)
# train_sw_sparsed <- removeSparseTerms(train_sw, 0.95)
# DTM TEST
test_sw <- DocumentTermMatrix(
test_corp,
control = list(
tolower = TRUE,
removeNumbers = TRUE,
removePunctuation = FALSE,
stemming = FALSE,
stopwords = stopwords("ru")
)
)
# test_sw_sparsed <- removeSparseTerms(test_sw, 0.95)
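# findFreqTerms(train_sw, 5) keeps only terms occurring at least 5 times in the
# training DTM; the test DTM keeps everything (threshold 1) and is restricted
# to the training vocabulary later, right before prediction.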
freq_train_sw <- train_sw %>%
findFreqTerms(5) %>%
train_sw[ , .]
freq_test_sw <- test_sw %>%
findFreqTerms(1) %>%
test_sw[ , .]
# freq_eval_sw <- eval_sw %>%
# findFreqTerms(1) %>%
# test_sw[ , .]
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
}
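# convert_counts() turns raw term counts into presence/absence indicators,
# which is the input the Bernoulli model expects, e.g. convert_counts(c(0, 2, 5))
# returns 0 1 1.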
train_sw_bi <- freq_train_sw %>%
apply(MARGIN = 2, convert_counts)
test_sw_bi <- freq_test_sw %>%
apply(MARGIN = 2, convert_counts)
####### Testing naivebayes package
class_sw_nb <- bernoulli_naive_bayes(as.matrix(train_sw_bi), as.factor(review.train$Sex), laplace = 1)
test_sw_bi_c <- test_sw_bi[,colnames(test_sw_bi) %in% colnames(train_sw_bi)]
sw_bi_pred_nb <- predict(class_sw_nb, as.matrix(test_sw_bi_c))
## Warning: predict.bernoulli_naive_bayes(): Only 6364 feature(s) out of 7498 defined in the naive_bayes object "class_sw_nb" are used for prediction
#######
####### Continue testing the naivebayes package: the multinomial model does not look as good
class_sw_2_nb <- multinomial_naive_bayes(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
freq_test_sw_c <- freq_test_sw[,colnames(freq_test_sw) %in% colnames(freq_train_sw)]
sw_mn_pred_nb <- predict(class_sw_2_nb, as.matrix(freq_test_sw_c))
## Warning: predict.multinomial_naive_bayes(): Only 6364 feature(s) out of 7498 defined in the naive_bayes object "class_sw_2_nb" are used for prediction
#######
# Bernoulli
CrossTable(sw_bi_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
prop.t = FALSE,
dnn = c("Predicted", "Actual"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 567
##
##
## | Actual
## Predicted | Ж | М | Row Total |
## -------------|-----------|-----------|-----------|
## Ж | 439 | 56 | 495 |
## | 0.887 | 0.113 | 0.873 |
## | 0.889 | 0.767 | |
## -------------|-----------|-----------|-----------|
## М | 55 | 17 | 72 |
## | 0.764 | 0.236 | 0.127 |
## | 0.111 | 0.233 | |
## -------------|-----------|-----------|-----------|
## Column Total | 494 | 73 | 567 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
# Multinomial
CrossTable(sw_mn_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
prop.t = FALSE,
dnn = c("Predicted", "Actual"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 567
##
##
## | Actual
## Predicted | Ж | М | Row Total |
## -------------|-----------|-----------|-----------|
## Ж | 483 | 63 | 546 |
## | 0.885 | 0.115 | 0.963 |
## | 0.978 | 0.863 | |
## -------------|-----------|-----------|-----------|
## М | 11 | 10 | 21 |
## | 0.524 | 0.476 | 0.037 |
## | 0.022 | 0.137 | |
## -------------|-----------|-----------|-----------|
## Column Total | 494 | 73 | 567 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
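# Overall accuracy for both models (sketch, not run here; the same counts are
# in the tables above):
# mean(sw_bi_pred_nb == review.test$Sex)   # Bernoulli
# mean(sw_mn_pred_nb == review.test$Sex)   # multinomial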
# fit <- caret::train(
# as.matrix(train_sw_bi), as.factor(review.train$Sex), "nb",
# trControl = trainControl(method = "cv", number = 10))
#
# kfoldbi <- predict(fit$finalModel, as.matrix(test_sw_bi_c))
review.train1_tfidf2 <- review.train %>%
unnest_tokens(word, Review) %>%
filter(! word %in% stopwords("ru")) %>%
filter(word %in% names[["word"]]) %>%
count(word, Sex) %>%
# spread(Sex, n, fill = 0) %>%
bind_tf_idf(word, Sex, n) %>%
# filter(tf_idf >0) %>%
cast_dtm(Sex, word , tf_idf)
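# cast_dtm(Sex, word, tf_idf) collapses the training data into a two-row
# document-term matrix, one "document" per class, weighted by tf-idf. The
# labels passed to multinomial_naive_bayes() below assume the rows come out in
# the order c("Ж", "М"); a safer sketch would read them off the matrix itself:
# multinomial_naive_bayes(as.matrix(review.train1_tfidf2),
#                         as.factor(rownames(review.train1_tfidf2)))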
# FINAL TESTING DATA
MNV_TRAIN2 <- multinomial_naive_bayes(as.matrix(review.train1_tfidf2), as.factor(c("Ж", "М")))
mnv_res2 <- predict(MNV_TRAIN2, as.matrix(test_sw_bi_c))
## Warning: predict.multinomial_naive_bayes(): Only 4003 feature(s) out of 5892 defined in the naive_bayes object "MNV_TRAIN2" are used for prediction
## Warning: predict.multinomial_naive_bayes(): More features in the newdata
## are provided as there are parameter estimates in the object. Calculation is
## performed based on features to be found in the object.
# Multinomial TESTING
CrossTable(mnv_res2, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
prop.t = FALSE,
dnn = c("Predicted", "Actual"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 567
##
##
## | Actual
## Predicted | Ж | М | Row Total |
## -------------|-----------|-----------|-----------|
## Ж | 132 | 2 | 134 |
## | 0.985 | 0.015 | 0.236 |
## | 0.267 | 0.027 | |
## -------------|-----------|-----------|-----------|
## М | 362 | 71 | 433 |
## | 0.836 | 0.164 | 0.764 |
## | 0.733 | 0.973 | |
## -------------|-----------|-----------|-----------|
## Column Total | 494 | 73 | 567 |
## | 0.871 | 0.129 | |
## -------------|-----------|-----------|-----------|
##
##
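# From the table above, the tf-idf model assigns most reviews to "М" even though
# about 87% of the test set is "Ж", so its raw accuracy is well below the
# Bernoulli model's. A quick check (sketch, not run here):
# mean(mnv_res2 == review.test$Sex)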
# THE LITTLE GRAVEYARD STARTS HERE
#
# ####CURSED FUNCTIONS####
# # gscore <- g2Test_univariate(as.matrix(review.feature[,-1]), 2)
# # # get row and colnames in order
# # # review.train2 <- review.train2[,-4]
# # colnames(review.train_test) <- rownames(review.train2)
# # rownames(review.train_test) <- colnames(review.train2)
# #
# # colSums(as.matrix(review.train_test[,-1]))
#
# # review.train_test <- CHAID::chaid(rowname ~ ., review.train_test)
#
#
# # review.train_test <- review.train_test %>%
# # select(names[["word"]])
#
# review.eval <- review.eval %>%
# unnest_tokens(word, Review) %>%
# filter(! word %in% stopwords("ru")) %>%
# count(word, Sex) %>%
# spread(Sex, n, fill = 0)
#
# # unnest_tokens(word, Review) %>%
# # filter(! word %in% stopwords("ru")) %>%
# # count(word, Sex) %>%
# # spread(Sex, n, fill = 0)
#
#
#
# train_corp <- VCorpus(VectorSource(review.train$Review))
# test_corp <- VCorpus(VectorSource(review.test$Review))
#
# # DTM TRAIN
# train_sw <- DocumentTermMatrix(
# train_corp,
# control = list(
# tolower = TRUE,
# language = "ru",
# removeNumbers = TRUE,
# removePunctuation = FALSE,
# stemming = FALSE,
# # updating for remove sw
# stopwords = stopwords("ru")
# )
# )
# # Remove sparcity for some reason
#
# train_sw_sparsed <- removeSparseTerms(train_sw, 0.95)
#
# # DTM TEST
# test_sw <- DocumentTermMatrix(
# test_corp,
# control = list(
# tolower = TRUE,
# removeNumbers = TRUE,
# removePunctuation = FALSE,
# stemming = FALSE,
# stopwords = stopwords("ru")
# )
# )
#
#
#
# test_sw_sparsed <- removeSparseTerms(test_sw, 0.95)
#
# freq_train_sw <- train_sw %>%
# findFreqTerms(5) %>%
# train_sw[ , .]
# freq_test_sw <- test_sw %>%
# findFreqTerms(1) %>%
# test_sw[ , .]
#
# convert_counts <- function(x) {
# x <- ifelse(x > 0, 1, 0)
# }
#
# # Trying remove step with freq !!!BAD IDEA!!!
# # train_sw_bi <- train_sw %>%
# # apply(MARGIN = 2, convert_counts)
# # test_sw_bi <- test_sw %>%
# # apply(MARGIN = 2, convert_counts) !!!BAD IDEA!!!
# # TRYING TRYING
#
# train_sw_bi <- freq_train_sw %>%
# apply(MARGIN = 2, convert_counts)
#
# test_sw_bi <- freq_test_sw %>%
# apply(MARGIN = 2, convert_counts)
#
# class_sw <- fnb.bernoulli(as.matrix(train_sw_bi), as.factor(review.train$Sex), laplace = 1)
#
# ####### Testing naivebayes package
#
# class_sw_nb <- bernoulli_naive_bayes(as.matrix(train_sw_bi), as.factor(review.train$Sex), laplace = 1)
# test_sw_bi_c <- test_sw_bi[,colnames(test_sw_bi) %in% colnames(train_sw_bi)]
# sw_bi_pred_nb <- predict(class_sw_nb, as.matrix(test_sw_bi_c))
# #######
#
# ####### Continue testing nb package multinomial doesn't look that good
# class_sw_2_nb <- multinomial_naive_bayes(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
# freq_test_sw_c<- freq_test_sw[,colnames(freq_test_sw) %in% colnames(freq_train_sw)]
# sw_mn_pred_nb <- predict(class_sw_2_nb, as.matrix(freq_test_sw_c))
#
# #######
#
# ####### Continue testing nb package Poisson doesn't look that good
# class_sw_3_nb <- poisson_naive_bayes(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
# sw_po_pred_nb <- predict(class_sw_2_nb, as.matrix(freq_test_sw))
#
# #######
#
# class_sw_2 <- fnb.multinomial(as.matrix(freq_train_sw), as.factor(review.train$Sex), laplace = 1)
#
#
#
# freq_test_sw_c<- freq_test_sw[,colnames(freq_test_sw) %in% colnames(freq_train_sw)]
#
# test_sw_bi_c <- test_sw_bi[,colnames(test_sw_bi) %in% colnames(train_sw_bi)]
# # left-join columns?
# sw_bi_pred <- predict(class_sw, as.matrix(test_sw_bi_c))
#
# sw_mn_pred <- predict(class_sw_2, freq_test_sw_c)
#
# # Bernoulli
# CrossTable(sw_bi_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
# prop.t = FALSE,
# dnn = c("Predicted", "Actual"))
#
# # Multinomial
# CrossTable(sw_mn_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
# prop.t = FALSE,
# dnn = c("Predicted", "Actual"))
#
# # Poisson
# CrossTable(sw_po_pred_nb, review.test$Sex, prop.chisq = FALSE, chisq = FALSE,
# prop.t = FALSE,
# dnn = c("Predicted", "Actual"))
#
#
# # WOW MORE SUPER COOL TESTS
#
# # names <- intersect(class_sw$names, colnames(test_sw_bi_c))
# # newdata <- test_sw_bi_c[, names]
# # if (length(names) == 1) {
# # newdata <- as.matrix(newdata)
# # colnames(newdata) <- names
# # }
# #
# # if(length(object$names)!=length(names)){
# # if(!silent){
# # warning('Columns in test and train set not equal! Newdata is padded with zeros')
# # }
#
# test <- freq_train_sw %>%
# apply(MARGIN = 2, sum)
#
# ##### Here is the dead space
#
# # train_corp_q <- quanteda::corpus(as.character(review.train$Review))
# # test_corp_q <- quanteda::corpus(as.character(review.test$Review))
# #
# # train_matrix <- quanteda::dfm(train_corp_q,
# # tolower = T,
# # stem = F,
# # remove = stopwords('russian'))
# # test_matrix <- quanteda::dfm(test_corp_q,
# # tolower = T,
# # stem = F,
# # remove = stopwords('russian')
# # )
# # train_matrix@x[train_matrix@x > 1] <- 1
# # test_matrix@Dimnames$features
# # str(test_matrix)
# # test_matrix@x[test_matrix@x > 1] <- 1
# # train_matrix <- train_matrix[,
# # (quanteda::featnames(train_matrix) %in% quanteda::featnames(test_matrix))]
# # train_matrix <- train_matrix[,
# # (colnames(train_matrix) %in% colnames(test_matrix))]
# # test_matrix <- test_matrix[,
# # (quanteda::featnames(test_matrix) %in% quanteda::featnames(train_matrix))]
# # test_matrix <- test_matrix[,
# # (colnames(test_matrix) %in% colnames(train_matrix))]
# # str(test_matrix)
# convert_sex <- function(x) {
# x <- ifelse(x == "М", 1, 0)
# }
# review.train$Sex <- convert_sex(review.train$Sex)
# review.test$Sex <- convert_sex(review.test$Sex)
# # est <- trainNB(review.train$Sex, train_matrix, smoothing = 'normalized')
# # out <- classifyNB_2(est, test_matrix, data.frame(review.test$Review))
# #
# #
# # str(est[["w_jc"]])
#
# # OPTIONAL: remove stop words here because we failed to do it in the DTM step
# # Clean the text first, then remove the stop words
# # train_corp <- tm_map(train_corp, removeWords, stopwords(kind = "ru"))
# # test_corp <- tm_map(test_corp, removeWords, stopwords(kind = "ru"))
#
# # Some miscellaneous testing
#
# install.packages("remotes")
# remotes::install_github("mattwloftis/agendacodeR")
#
# library(agendacodeR)
#
# train_corpus <- quanteda::corpus(x = training_agendas$text)
# train_matrix <- quanteda::dfm(train_corpus,
# language = "danish",
# stem = TRUE,
# removeNumbers = FALSE)
#
# test_corpus <- quanteda::corpus(x = test_agendas$text)
# test_matrix <- quanteda::dfm(test_corpus,
# language = "danish",
# stem = TRUE,
# removeNumbers = FALSE)
# str(train_corpus)
#
# ## Convert matrix of frequencies to matrix of indicators
# train_matrix@x[train_matrix@x > 1] <- 1
# test_matrix@x[test_matrix@x > 1] <- 1
#
# ## Dropping training features not in the test set
# train_matrix <- train_matrix[, (colnames(train_matrix) %in% colnames(test_matrix))]
#
#
#
# ## Convert matrix of frequencies to matrix of indicators
# train_matrix@x[train_matrix@x > 1] <- 1
# test_matrix@x[test_matrix@x > 1] <- 1
#
# ## Dropping training features not in the test set
# head(test_agendas)
#
#
# est <- trainNB(training_agendas$coding, train_matrix)
#
# test_matrix <- test_matrix[, (colnames(test_matrix) %in% names(est[[2]]))]
#
# out <- classifyNB(est, test_matrix, test_agendas)
#
# acc <- catAccuracy(true = out$coding, predicted = out$ratio_match)
#
# topWords(est)
#
# est$
#
# training_agendas %>%
# filter(id %in% test_agendas$id) %>%
# select(coding)
#
# ## Load data and create document-feature matrices
# # train_corpus <- quanteda::corpus(x = as.character(review.train$Review))
# # train_matrix <- quanteda::dfm(train_corpus,
# # language = "ru",
# # stem = TRUE,
# # removeNumbers = TRUE)
# #
# #
# # test_corpus <- quanteda::corpus(x = as.character(review.test$Review))
# # test_matrix <- quanteda::dfm(test_corpus,
# # language = "ru",
# # removeNumbers = TRUE)
#
#
# # ## Convert matrix of frequencies to matrix of indicators
# # train_matrix@x[train_matrix@x > 1] <- 1
# # test_matrix@x[test_matrix@x > 1] <- 1
#
# # Convert Sexes for this package
# convert_sex <- function(x) {
# x <- ifelse(x == "М", 1, 0)
# }
#
# # One more idk correction
# test_sw <- test_sw[, (colnames(test_sw) %in% colnames(test_matrix))]
#
# # Another one convertion!
#
# test_sw_bi <- test_sw_bi[,(featnames(as.dfm(test_sw_bi)) %in% featnames(as.dfm(train_sw_bi)))]
#
#
# test <- as.dfm(train_sw_bi)
# str(test)
# str(train_sw)
#
#
#
# ## Dropping training features not in the test set
# test_sw_bi_upd <- as.dfm(test_sw_bi)
# train_sw_bi_upd <- train_sw_bi %>% as.dfm %>% .[, (colnames(train_sw_bi) %in% colnames(test_sw_bi_upd))]
# # test_sw_bi_upd <- test_sw_bi_upd %>% .[, (colnames(test_sw_bi_upd) %in% colnames(train_sw_bi_upd))]
#
# est <- trainNB(review.train$Sex, train_sw_bi_upd)
#
# # test_sw_bi <- test_sw_bi[, featnames(as.dfm(test_sw_bi)) %in% names(est[["w_jc"]])]
# #
# # typo <- est[["w_jc"]]
# #
# # typo_2 <- typo[intersect(names(typo), colnames(test_sw_bi))]
# #
# # est[["w_jc"]] <- typo_2
# out <- classifyNB(est, test_sw_bi_upd, as.data.frame(review.test$Review))
# acc <- catAccuracy(true = out$coding, predicted = out$ratio_match)
#
# sum(as.numeric(acc$frequency))
#
# colnames(train_sw_bi)
#
# # LAST THING I WILL TRY WITH YOU
#
#
# test <- est[["w_0c"]]
# test <- data.frame(t(test))
# test[2,]
# test <- test[intersect(names(test), colnames(test_matrix))]
# str(test)
# head(test_matrix) %*% head(t(test_v))
# str(test_matrix)
#
# test_v <- as.numeric(as.matrix(test)); names(test_v) <- names(test)
# head(test_v)
#
# classifyNB_2 <- function(est, test_matrix, test) {
# w_0c <- est[[1]]
# w_jc <- est[[2]]
# # w_jc1 <- data.frame(t(w_jc))
# nc <- est[[3]]
# pc <- est[[4]]
#
# # test_matrix <- test_matrix[,(colnames(test_matrix) %in% names(w_jc1))]
# # test_matrix <- test_matrix[,(quanteda::featnames(test_matrix) %in% names(w_jc1))]
# # w_jc1 <- w_jc1[, colnames(test_matrix)]
# # w_jc1 <- w_jc1[, quanteda::featnames(test_matrix)]
# # w_0c <- w_0c[intersect(names(w_0c), colnames(test_matrix))]
# # w_0c <- w_0c[intersect(names(w_0c), quanteda::featnames(test_matrix))]
#
# # w_jc <- as.numeric(as.matrix(w_jc1)); names(w_jc) <- names(w_jc1)
#
# ## GETTING POSTERIOR CLASS PROBABILITIES FOR TEST SET
# term.appearance <- test_matrix %*% t( w_jc ) #Calculate w_jc * x_i (n x c)
# log_odds <- t( term.appearance ) + (w_0c) #full log-odds for c-1 non-reference categories
# odds <- cbind(exp( t( log_odds ) ), rep(1, ncol(log_odds))) #get odds and add column of 1s for reference category (n x c)
# denominator <- rowSums(odds)
# probs <- odds/denominator
# colnames(probs) <- names(nc) #make col names of results matrix the categories
#
# ## NOTE MATCHES AND PROBABILITY RATIOS
# unconditional_test <- t(probs)>pc # (c x n)
# identify_max <- probs==apply(probs,1,max) # (n x c)
# ratios_to_unconditional <- t(t(probs)/pc)
# max_ratios <- ratios_to_unconditional==apply(ratios_to_unconditional,1,max)
# if (all(rowSums(identify_max)==1)){ #MAXIMUM PROBABILITIES -- WITH ERROR CATCHING
# test$max_posterior <- as.vector(t(probs))[as.vector(t(identify_max))]
# } else {
# test$max_posterior <- NA
# row.picker <- rowSums(identify_max)==1
# test$max_posterior[row.picker] <- as.vector(t(probs[row.picker,]))[as.vector(t(identify_max[row.picker,]))]
#
# for (j in 1:length(test$max_posterior[row.picker==FALSE])){
# test$max_posterior[row.picker==FALSE][j] <- max(probs[row.picker==FALSE,][j,])
# }
# }
# if (all(rowSums(max_ratios)==1)){ #fill in maximum ratios with error-catching
# test$max_ratios <- as.vector(t(ratios_to_unconditional))[as.vector(t(max_ratios))]
#
# } else if (sum((apply(max_ratios,1,sum)==1)==FALSE)==1) {
# test$max_ratios[apply(max_ratios,1,sum)==1] <- as.vector(t(ratios_to_unconditional[apply(max_ratios,1,sum)==1]))[as.vector(t(max_ratios[apply(max_ratios,1,sum)==1]))]
# test$max_ratios[(apply(max_ratios,1,sum)==1)==FALSE] <- max(ratios_to_unconditional[(apply(max_ratios,1,sum)==1)==FALSE,])
#
# } else {
# test$max_ratios <- NA
# row.picker <- apply(max_ratios,1,sum)==1
# test$max_ratios[row.picker] <- as.vector(t(ratios_to_unconditional[row.picker,]))[as.vector(t(max_ratios[row.picker,]))]
#
# for (j in 1:length(test$max_ratios[row.picker==FALSE])){
# test$max_ratios[row.picker==FALSE][j] <- mean(ratios_to_unconditional[row.picker==FALSE,][j,])
#
# }
# }
# name_matrix <- matrix(rep(colnames(probs),nrow(probs)),nrow=nrow(probs),ncol=ncol(probs),byrow=T)
# if (all(rowSums(identify_max)==1)){ #fill in max prob category names with error-catching
# test$max_match <- as.vector(t(name_matrix))[as.vector(t(as.matrix(identify_max)))]
# } else {
# test$max_match <- NA
# row.picker <- apply(identify_max,1,sum)==1
# test$max_match[row.picker] <- as.vector(t(name_matrix[row.picker,]))[as.vector(t(identify_max[row.picker,]))]
#
# for (j in 1:length(test$max_match[row.picker==FALSE])){
# boolian.vector <- max_ratios[row.picker==FALSE,][j,]
# # test$max_match[row.picker==FALSE][j] <- paste(colnames(probs)[boolian.vector],collapse="; ")
# class.probs <- pc[colnames(probs)[boolian.vector]]
# test$max_match[row.picker==FALSE][j] <- names(class.probs)[which(class.probs==min(class.probs))]
#
# }
# }
# if (all(rowSums(max_ratios)==1)){ #fill in max ratio category names with error-catching
# test$ratio_match <- as.vector(t(name_matrix))[as.vector(t(as.matrix(max_ratios)))]
#
# } else if (sum((rowSums(max_ratios)==1)==FALSE)==1) {
# test$ratio_match <- NA
# row.picker <- apply(max_ratios,1,sum)==1
# test$ratio_match[row.picker] <- as.vector(t(name_matrix[row.picker,]))[as.vector(t(max_ratios[row.picker,]))]
# boolian.vector <- max_ratios[(row.picker)==FALSE,]
# class.probs <- pc[colnames(probs)[boolian.vector]]
# test$ratio_match[row.picker==FALSE] <- names(class.probs)[which(class.probs==min(class.probs))]
#
# } else {
# test$ratio_match <- NA
# row.picker <- rowSums(max_ratios)==1
# test$ratio_match[row.picker] <- as.vector(t(name_matrix[row.picker,]))[as.vector(t(max_ratios[row.picker,]))]
#
# for (j in 1:length(test$ratio_match[row.picker==FALSE])){
# boolian.vector <- max_ratios[row.picker==FALSE,][j,]
# class.probs <- pc[colnames(probs)[boolian.vector]]
# if(all((sum(class.probs)/length(class.probs))==class.probs)){
# test$ratio_match[row.picker==FALSE][j] <- sample(names(class.probs), 1)
# } else {
# test$ratio_match[row.picker==FALSE][j] <- names(class.probs)[which(class.probs==min(class.probs))]
#
# }
# }
# }
# return(test)
# }