#libraries 
library(tidyverse);
library(tidytext);
library(tidymodels)
library(tidymetrics)
library(widyr)
library(ggrepel)
library(scales)
library(ggthemes)
library(tidylo)
library(topicmodels)
library(textfeatures)
library(vip)
theme_set(theme_solarized())

findings_orig.lang<-read_rds('../findings_orig.lang.rds')

findings_orig.lang<-readRDS('../findings_orig.lang.rds')

findings_orig.lang%>%
  count(Country,sort = TRUE)%>%
  mutate(avg=n/sum(n))%>%
  mutate(Country=fct_reorder(Country,n))%>%
  ggplot(aes(avg,Country))+
  geom_col()+
  labs(title = 'Freq per Country by Documents',
       y='')

findings_orig.lang%>%
  ggplot(aes(Date.of.text.entry_min,fill=Country))+
  geom_histogram(bins = 10)

findings_orig.lang%>%
  ggplot(aes(OperNum,count.character_finding,color=Country))+
  geom_point()+
  facet_wrap(~Year.of.text.entry_max,scales = 'free')+
  theme(axis.text.x = element_blank())

countries<-findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  distinct(Country,.keep_all = TRUE)%>%
  top_n(10,count.character_finding)%>%
  select(Country)

findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  filter(!Country %in% countries$Country)%>%
  filter(Year.of.text.entry_min<2019)%>%
  ggplot(aes(OperNum,count.character_finding))+
  geom_line(group=1)+
  geom_point(aes(color=Country),show.legend = FALSE)+
  geom_text(aes(label=OperNum),vjust=1,hjust=1,check_overlap = TRUE,size=2)+
  theme(axis.text.x = element_blank())+
  facet_wrap(~Country,scales='free')

findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  filter(!Country %in% countries$Country)%>%
  filter(Year.of.text.entry_min<2019)%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  mutate(Country=fct_reorder(Country,ration_finding))%>%
  ggplot(aes(Country,ration_finding))+
  geom_boxplot()+
  coord_flip()

findings_per_document<-findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  filter(!Country %in% countries$Country)%>%
  filter(Year.of.text.entry_min<2019)%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  group_by(Country)%>%
  fill(ration_finding,.direction = 'downup')%>%
  ungroup()
findings_per_document

findings_per_document%>%
  filter(!is.na(finding))%>%
  group_by(Country)%>%
  summarize(correlation=cor(count.character_finding,count.word_finding))

library(lubridate)

findings_orig.lang%>%
  filter(!is.na(finding))%>%
  group_by(Country,Year.of.text.entry_min)%>%
  summarize(n=n(),
            avg_finding=round(mean(count.word_finding),0))%>%
  filter(avg_finding<400)%>%
  ggplot(aes(n,avg_finding))+
  geom_point()+
  geom_smooth(method = 'lm')

sentences<-findings_orig.lang%>%
  filter(Country=='CO',
         !is.na(finding))%>%
  mutate(finding=str_replace(finding,'others:',''))%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  unnest_tokens(token = 'sentences',output = 'sentences',input = finding)%>%
  select(OperNum,sentences,ration_finding)

## Numbers of sentences per document
sentences %>%
  count(OperNum)

library(stopwords)
library(tm)
bad_words<-stopwords('spanish')
bad_words2<-c('año','usd','i','ii','iii','co','así','fin','mill','vez','si','l','tener','oc','iv','nuevos','problema','santa','dio','p','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre','aproximadamente','varios','ende','tipo','aún','fase','costo','existe','logró','v','acerca','b','entrada','forma','gran','igualmente','misma','orden','oficina','output','podría','presentaron','propuesta','propuestas','referencia','reporte','san','tercer','térmico','zonas','caja','alrededor','asignados','decir','después','días','dicha','diseñado','esperaba','esperado','finalizar','importancia','llevó','maestro','mejorar','necesarios','niveles','particular','planteado','presente','principios','reflejado','registra','reportado','requiere','secado','solo','supuesto','with','working','actualización','alto','asociado','considerando','correspondiente','dichos','escasa','fa','figura','genera','justificación','km','llevado','medidas','menos','modo','municipio','necesarias','necesidades','parcialmente','partir','pese','personas','pmasis','presentar','previas','previstos','real','realizando','sentido','suscrito','tema','vía','acorde','acorde','actividad','adecuada','adelantando','ándres','atención','comprometer','consultores','corresponde','crítica','cuarto','efectos','ejecutores','entonces','especial','establecidos','crítica','ejecutores','especial','establecidos','existen','incluido','lograr','logro','luego','mas','más','mayores','mecanismo','mejor','notable','nuevas','objeto','oferta','original','pari','passu','posibilidad','posterior','propósito','','realidad','realizaron','red','satisfactoria','solicitado','últimos','varias','además','afectar','aplicación','aprobados','artículo','competencia','competencias','conjunto','consecución','creó','considerable','consolidación','convocatoria','cronogramas','cuatro','darle','definidos','desarrollado','efectiva','eficiente','ejecutados','especialista','estima','extensión','gestor','igual','indica','informa','interno','internos','largo','llega','lugar','manual','mismas','mll','múltiples','obtener','permitido','plazos','porcentaje','potenciales','presentación','presentó','prevé','primeras','providencia','proyecciones','puedan','realizada','reglamento','relacionada','renovación','restricciones','semanas','siguiente','solicitar','soluciones','suscribir','suscripción','tales','todas','todo','tomo','usuarios','vías','abstuvo','acumulado','adelantaron','afectó','apenas','apoya','c','buena','centros','compromisos','conjunta','considerablemente','consorcio','contenido','continua','cronograma','deben','deberán','dialogo','disponibles','dificultad','documentos','ejercicio','elegible','ello','esfuerzos','espacios','especiales','esquemas','etapas','formalizó','financiada','fuertes','funcionamiento','generado','generados','generar','generó','gerencial','gestiones','gestores','grupos','haber','hecho','inclusión','informó','inicios','manifestado','mencionados','opinión','partes','pendiente','permitieran','podía','podrá','presentan','presentando','proyectados','punto','puesta','realización','realizará','realizará','reporta','respuesta','revisaron','sigue','sólidos','sigue','sólo','superior','sustancial','torno','tramitar','tres','utilización','ve','vi','vio','acordado','actualizado','adecuado','agencia','ahora','ajustar','ajustó','alcanzado','alcanzarán','ambos','anteriores','aplicativo','aportes','apoyar','aprobó','asimismo','atender','calculado','cercano','cinco','continuar','corresponden','cumplido','da','dará','deberá','decidió','decididas','decidió','definido','definidas','denominada','determinar','determinación','detalles','detalle','diferencias','dinámica','dinamismo','directivas','disponibilidad','distintos','eb','ejecutadas','entendimiento','establecidas','evidente','explica')        
bad_words_def<-c(bad_words,bad_words2)
sentences_unnested<-sentences%>%
  unnest_tokens(word,sentences)%>%
  mutate(word=removeNumbers(word))%>%
  mutate(word=removePunctuation(word))%>%
  filter(!word %in% bad_words_def)%>%
  filter(word!='')%>%
  count(OperNum,word,sort=TRUE)

total_words<-sentences_unnested%>%
  group_by(OperNum)%>%
  summarize(total_words=sum(n))%>%
  ungroup()

odds_documents<-sentences_unnested%>%
  left_join(total_words,by='OperNum')%>%
  bind_log_odds(OperNum,word,n)

odds_documents

documents<-odds_documents%>%
  group_by(OperNum)%>%
  top_n(1,log_odds_weighted)%>%
  head(10)%>%
  select(OperNum)

odds_documents%>%
  filter(OperNum %in% documents$OperNum)%>%
  group_by(OperNum)%>%
  top_n(10,log_odds_weighted)%>%
  ungroup()%>%
  mutate(word=reorder_within(word,log_odds_weighted,OperNum))%>%
  ggplot(aes(log_odds_weighted,word))+
  geom_col()+
  scale_y_reordered()+
  facet_wrap(~OperNum, scales='free')

odds_documents%>%
  filter(OperNum=='CO-L1019',
         n>10)%>%
  ggplot(aes(n, log_odds_weighted, label = word)) +
  geom_hline(yintercept = 0, color = "gray50", lty = 2, size = 1.5) +
  geom_point(alpha = 0.8, color = "midnightblue") +
  geom_text_repel() +
  scale_x_log10()

documents_matrix<-odds_documents%>%
  count(OperNum,word)%>%
  mutate(word=removeNumbers(word))%>%
  mutate(word=removePunctuation(word))%>%
  filter(!word %in% bad_words_def)%>%
  filter(word!='')%>%
  cast_dtm(document = OperNum,term = word,
           value = n,weighting = tm::weightTf)
sample_size<-floor(.80*nrow(documents_matrix))
train_id<-sample(nrow(documents_matrix),size = sample_size)
train<-documents_matrix[train_id,]
test<-documents_matrix[-train_id,]

documents_LDA<-LDA(train,k = 10,method = 'Gibbs',control = list(seed=111))

documents_betas<-tidy(documents_LDA,matrix='beta')
documents_betas

documents_betas%>%
  filter(!term %in% c('actualizada','adecuan','años','nueva'))%>%
  group_by(topic)%>%
  top_n(10,beta)%>%
  arrange(topic,-beta)%>%
  mutate(term=reorder_within(term,beta,topic))%>%
  ggplot(aes(beta,term, fill=factor(topic)))+
  geom_col(show.legend = FALSE)+
  facet_wrap(~topic,scales = 'free_y')+
  scale_y_reordered()

documents_gamma<-tidy(documents_LDA,matrix='gamma')
documents_gamma%>%
  group_by(document)%>%
  top_n(1,gamma)

topics<-documents_betas%>%
  mutate(topic = case_when(
    str_detect(topic,'1')~'Adquisición de equipos',
    str_detect(topic,'2')~'Planes de implementación',
    str_detect(topic,'3')~'Presupuesto del Gobierno',
    str_detect(topic,'4')~'Operación Institucional',
    str_detect(topic,'5')~'Planificación',
    str_detect(topic,'6')~'Créditos',
    str_detect(topic,'7')~'Prestamos Financieros',
    str_detect(topic,'8')~'Contratación Estatal',
    str_detect(topic,'9')~'Supervisión Financiera por parte del Estado',
    str_detect(topic,'10')~'Cumplimiento a los planes de las organizaciones',
  ))

topics

gamma_score<-documents_gamma%>%
  group_by(document)%>%
  summarise(max=max(gamma))

documents_gamma%>%
  inner_join(gamma_score,by=c('document','gamma'='max'))

documents_tagging<-documents_gamma%>%
  group_by(document)%>%
  top_n(1,gamma)%>%
  mutate(topic = case_when(
    str_detect(topic,'1')~'Adquisición de equipos',
    str_detect(topic,'2')~'Planes de implementación',
    str_detect(topic,'3')~'Presupuesto del Gobierno',
    str_detect(topic,'4')~'Operación Institucional',
    str_detect(topic,'5')~'Planificación',
    str_detect(topic,'6')~'Créditos',
    str_detect(topic,'7')~'Prestamos Financieros',
    str_detect(topic,'8')~'Contratación Estatal',
    str_detect(topic,'9')~'Supervisión Financiera por parte del Estado',
    str_detect(topic,'10')~'Cumplimiento a los planes de las organizaciones',
  ))%>%
  distinct()%>%
  select(-gamma)

documents_tagging

Caso_Colombia<-findings_orig.lang%>%
  filter(Country=='CO')%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  left_join(documents_tagging, by=c('OperNum'='document'))%>%
  filter(!is.na(finding))%>%
  mutate(topic=if_else(is.na(topic),'Otro',topic))
Caso_Colombia

library(textrecipes)
set.seed(123)
Caso_Colombia$ration_finding%>%mean()

Caso_Colombia<-Caso_Colombia%>%
  mutate(rating=case_when(ration_finding>0.16~'Good',TRUE~'Bad'))
review_split <- initial_split(Caso_Colombia, strata = rating)
review_train <- training(review_split)
review_test <- testing(review_split)
review_rec <- recipe(rating ~ finding, data = review_train) %>%
  step_tokenize(finding) %>%
  step_stopwords(finding) %>%
  step_tokenfilter(finding, max_tokens = 500) %>%
  step_tfidf(finding)

review_prep <- prep(review_rec)

lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

lasso_wf <- workflow() %>%
  add_recipe(review_rec) %>%
  add_model(lasso_spec)

lambda_grid <- grid_regular(penalty(), levels = 10)
review_folds <- bootstraps(Caso_Colombia,strata = rating)
doParallel::registerDoParallel()

set.seed(2020)
lasso_grid <- tune_grid(
  lasso_wf,
  resamples = review_folds,
  grid = lambda_grid,
  metrics = metric_set(roc_auc, ppv, npv)
)

lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10()

best_auc <- lasso_grid %>%
  select_best("roc_auc")

final_lasso <- finalize_workflow(lasso_wf, best_auc)

final_lasso %>%
  fit(review_train) %>%
  pull_workflow_fit() %>%
  vi(lambda = best_auc$penalty) %>%
  group_by(Sign) %>%
  top_n(20, wt = abs(Importance)) %>%
  ungroup() %>%
  mutate(
    Importance = abs(Importance),
    Variable = str_remove(Variable, "tfidf_finding_"),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  filter(!Variable %in% bad_words_def)%>%
  filter(!Variable %in% c('años','progreso'))%>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Sign, scales = "free_y") +
  labs(y = NULL)

colombian_df<-Caso_Colombia%>%
  unnest_tokens(word,finding)%>%
  mutate(word=removeNumbers(word))%>%
  mutate(word=removePunctuation(word))%>%
  filter(!word %in% bad_words_def)%>%
  filter(word!='')%>%
  add_count(word)%>%
  filter(n>50)%>%
  select(OperNum,count.character_finding,count.word_finding,topic,word,n,ration_finding,rating)

colombian_df<-colombian_df%>%
  mutate(OperNum)%>%
  mutate(row=row_number())%>%
  pivot_wider(names_from=word,values_from=n,values_fill=0)%>%
  select(-row)%>%
  janitor::clean_names() %>%
  na.omit()
colombian_df

colombian_df$count_character_finding<-as.numeric(colombian_df$count_character_finding)
colombian_df$count_word_finding<-as.numeric(colombian_df$count_word_finding)
colombian_df<-colombian_df%>%
  mutate_if(is.integer,as.numeric)
pca_rec <- recipe(~., data = colombian_df) %>%
  update_role(oper_num, topic,rating, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)

pca_prep

tidied_pca <- tidy(pca_prep, 2)

tidied_pca %>%
  filter(!terms %in% bad_words_def)%>%
  filter(!terms %in% c('us','plan'))%>%
  filter(component %in% paste0("PC", 1:6)) %>%
  mutate(component = fct_inorder(component))%>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1,scales = 'free_x') +
  scale_y_reordered()+
  labs(y = NULL)

tidied_pca %>%
  filter(!terms %in% bad_words_def)%>%
  filter(!terms %in% c('us','plan','primer','ration_finding','project','parte'))%>%
  filter(component %in% paste0("PC", 1:6))%>%
  top_n(100,terms)%>%
  mutate(terms=reorder_within(terms,value,component))%>%
  ggplot(aes(value,terms,fill=value>0))+
  geom_col(show.legend = FALSE)+
  facet_wrap(~component,scales='free')+
  scale_y_reordered()

juice(pca_prep) %>%
  distinct(oper_num,.keep_all = TRUE)%>%
  ggplot(aes(PC1, PC2, label = topic)) +
  geom_point(aes(color = oper_num), alpha = 0.7, size = 2,show.legend = FALSE) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

library(embed)
umap_rec <- recipe(~., data = colombian_df) %>%
  update_role(oper_num, topic,rating, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

umap_prep <- prep(umap_rec)

umap_prep

juice(umap_prep) %>%
  distinct(oper_num,.keep_all = TRUE)%>%
  ggplot(aes(umap_1, umap_2, label = topic)) +
  geom_point(aes(color = oper_num), alpha = 0.7, size = 2,show.legend = FALSE) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

NLP and Text Mining Daniel Version

EDA

Words over time

NLP

document philosophy

Tagging

Topics per Document

Tagging per document

Feature Eng. – Colombian Case —

Text Model – Colombian Case –

PCA Problems within the document

PCa with embeddings