NLP and Text Mining – Daniel's Version

# libraries
library(tidyverse)
library(tidytext)
library(tidymodels)
library(tidymetrics)
library(widyr)
library(ggrepel)
library(scales)
library(ggthemes)
library(tidylo)
library(topicmodels)
library(textfeatures)
library(vip)
theme_set(theme_solarized())
findings_orig.lang<-read_rds('../findings_orig.lang.rds')

EDA

findings_orig.lang%>%
  count(Country,sort = TRUE)%>%
  mutate(prop=n/sum(n))%>% # share of documents, not an average
  mutate(Country=fct_reorder(Country,n))%>%
  ggplot(aes(prop,Country))+
  geom_col()+
  labs(title = 'Share of documents per country',
       y='')

Is this a sample, or the whole universe of documents?

findings_orig.lang%>%
  ggplot(aes(Date.of.text.entry_min,fill=Country))+
  geom_histogram(bins = 10)

Words over time

findings_orig.lang%>%
  ggplot(aes(OperNum,count.character_finding,color=Country))+
  geom_point()+
  facet_wrap(~Year.of.text.entry_max,scales = 'free')+
  theme(axis.text.x = element_blank())
## Warning: Removed 28 rows containing missing values (geom_point).

Based on this view, I think we can filter to years before 2019 when creating features.

# the ten countries whose first-listed finding has the most characters;
# these are excluded from the plots below
countries<-findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  distinct(Country,.keep_all = TRUE)%>%
  top_n(10,count.character_finding)%>%
  select(Country)
findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  filter(!Country %in% countries$Country)%>%
  filter(Year.of.text.entry_min<2019)%>%
  ggplot(aes(OperNum,count.character_finding))+
  geom_line(group=1)+
  geom_point(aes(color=Country),show.legend = FALSE)+
  geom_text(aes(label=OperNum),vjust=1,hjust=1,check_overlap = TRUE,size=2)+
  theme(axis.text.x = element_blank())+
  facet_wrap(~Country,scales='free')
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text).

This dynamic suggests some features we can create.

findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  filter(!Country %in% countries$Country)%>%
  filter(Year.of.text.entry_min<2019)%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  mutate(Country=fct_reorder(Country,ration_finding))%>%
  ggplot(aes(Country,ration_finding))+
  geom_boxplot()+
  coord_flip()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

With this feature we can develop PCAs over the NLP sentences.

findings_per_document<-findings_orig.lang%>%
  distinct(OperNum,.keep_all = TRUE)%>%
  filter(!Country %in% countries$Country)%>%
  filter(Year.of.text.entry_min<2019)%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  group_by(Country)%>%
  fill(ration_finding,.direction = 'downup')%>%
  ungroup()
findings_per_document
## # A tibble: 512 x 18
##    OperNum Country finding PMR.cycle.ID_min PMR.cycle.ID_max Date.of.text.en…
##    <chr>   <chr>   <chr>              <dbl>            <dbl> <date>          
##  1 BA-L10… BA      "overa…               10               14 2014-08-13      
##  2 BA-L10… BA      "given…               14               14 2015-03-02      
##  3 BA-L10… BA      "1) th…               15               15 2015-07-15      
##  4 BA-L10… BA      "other…                6               14 2014-06-05      
##  5 BA-L10… BA      "inter…               22               23 2016-10-09      
##  6 BA-L10… BA      "durin…               28               28 2018-09-28      
##  7 BA-L10… BA      "after…               28               28 2018-09-20      
##  8 BA-L10… BA      "1.2 (…               15               15 2015-07-15      
##  9 BA-L10… BA      "2.1 a…               22               23 2016-10-20      
## 10 BA-L10… BA      "proje…                9               14 2014-04-07      
## # … with 502 more rows, and 12 more variables: Date.of.text.entry_max <date>,
## #   Year.of.text.entry_min <int>, Year.of.text.entry_max <int>, language <chr>,
## #   lang_spanish <dbl>, lang_english <dbl>, lang_port <dbl>, lang_french <dbl>,
## #   lang_unknown <dbl>, count.character_finding <int>,
## #   count.word_finding <int>, ration_finding <dbl>
findings_per_document%>%
  filter(!is.na(finding))%>%
  group_by(Country)%>%
  summarize(correlation=cor(count.character_finding,count.word_finding))
## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 17 x 2
##    Country correlation
##    <chr>         <dbl>
##  1 BA            0.995
##  2 BL            0.997
##  3 BO            0.997
##  4 BR            0.998
##  5 CO            0.992
##  6 CR            0.999
##  7 DR            0.999
##  8 ES            0.999
##  9 GU            0.998
## 10 GY            0.998
## 11 HA            0.998
## 12 JA            0.998
## 13 ME            0.999
## 14 NI            0.998
## 15 RG            0.997
## 16 SU            0.982
## 17 UR            0.996
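
Every country sits near 0.99: character and word counts carry nearly the same information, so one of them (or just the ratio) is enough as a feature. A quick pooled check (a sketch over the same data):

findings_per_document%>%
  filter(!is.na(finding))%>%
  summarize(overall=cor(count.character_finding,count.word_finding))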
library(lubridate)
## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
findings_orig.lang%>%
  filter(!is.na(finding))%>%
  group_by(Country,Year.of.text.entry_min)%>%
  summarize(n=n(),
            avg_finding=round(mean(count.word_finding),0))%>%
  filter(avg_finding<400)%>%
  ggplot(aes(n,avg_finding))+
  geom_point()+
  geom_smooth(method = 'lm')
## `summarise()` regrouping output by 'Country' (override with `.groups` argument)

## `geom_smooth()` using formula 'y ~ x'
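
To put a number on the slope the smoother draws, a plain linear fit on the same aggregation (a sketch; the avg_finding < 400 filter mirrors the plot above):

findings_orig.lang%>%
  filter(!is.na(finding))%>%
  group_by(Country,Year.of.text.entry_min)%>%
  summarize(n=n(),
            avg_finding=round(mean(count.word_finding),0),
            .groups='drop')%>%
  filter(avg_finding<400)%>%
  lm(avg_finding~n,data=.)%>%
  summary()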

NLP

In this case I work with Colombia.

sentences<-findings_orig.lang%>%
  filter(Country=='CO',
         !is.na(finding))%>%
  mutate(finding=str_replace(finding,'others:',''))%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  unnest_tokens(token = 'sentences',output = 'sentences',input = finding)%>%
  select(OperNum,sentences,ration_finding)

## Numbers of sentences per document
sentences %>%
  count(OperNum)
## # A tibble: 41 x 2
##    OperNum      n
##    <chr>    <int>
##  1 CO-L1012     4
##  2 CO-L1019    50
##  3 CO-L1022     6
##  4 CO-L1028    41
##  5 CO-L1034    45
##  6 CO-L1041     5
##  7 CO-L1052     4
##  8 CO-L1059    24
##  9 CO-L1065     8
## 10 CO-L1076    10
## # … with 31 more rows

In this part I remove words that carry no meaning within the sentences (Spanish stopwords plus a custom list).

library(stopwords)
library(tm)
bad_words<-stopwords('spanish')
bad_words2<-c('año','usd','i','ii','iii','co','así','fin','mill','vez','si','l','tener','oc','iv','nuevos',
  'problema','santa','dio','p','enero','febrero','marzo','abril','mayo','junio','julio','agosto',
  'septiembre','octubre','noviembre','diciembre','aproximadamente','varios','ende','tipo','aún','fase',
  'costo','existe','logró','v','acerca','b','entrada','forma','gran','igualmente','misma','orden',
  'oficina','output','podría','presentaron','propuesta','propuestas','referencia','reporte','san',
  'tercer','térmico','zonas','caja','alrededor','asignados','decir','después','días','dicha','diseñado',
  'esperaba','esperado','finalizar','importancia','llevó','maestro','mejorar','necesarios','niveles',
  'particular','planteado','presente','principios','reflejado','registra','reportado','requiere','secado',
  'solo','supuesto','with','working','actualización','alto','asociado','considerando','correspondiente',
  'dichos','escasa','fa','figura','genera','justificación','km','llevado','medidas','menos','modo',
  'municipio','necesarias','necesidades','parcialmente','partir','pese','personas','pmasis','presentar',
  'previas','previstos','real','realizando','sentido','suscrito','tema','vía','acorde','acorde',
  'actividad','adecuada','adelantando','ándres','atención','comprometer','consultores','corresponde',
  'crítica','cuarto','efectos','ejecutores','entonces','especial','establecidos','crítica','ejecutores',
  'especial','establecidos','existen','incluido','lograr','logro','luego','mas','más','mayores',
  'mecanismo','mejor','notable','nuevas','objeto','oferta','original','pari','passu','posibilidad',
  'posterior','propósito','','realidad','realizaron','red','satisfactoria','solicitado','últimos',
  'varias','además','afectar','aplicación','aprobados','artículo','competencia','competencias',
  'conjunto','consecución','creó','considerable','consolidación','convocatoria','cronogramas','cuatro',
  'darle','definidos','desarrollado','efectiva','eficiente','ejecutados','especialista','estima',
  'extensión','gestor','igual','indica','informa','interno','internos','largo','llega','lugar','manual',
  'mismas','mll','múltiples','obtener','permitido','plazos','porcentaje','potenciales','presentación',
  'presentó','prevé','primeras','providencia','proyecciones','puedan','realizada','reglamento',
  'relacionada','renovación','restricciones','semanas','siguiente','solicitar','soluciones','suscribir',
  'suscripción','tales','todas','todo','tomo','usuarios','vías','abstuvo','acumulado','adelantaron',
  'afectó','apenas','apoya','c','buena','centros','compromisos','conjunta','considerablemente',
  'consorcio','contenido','continua','cronograma','deben','deberán','dialogo','disponibles','dificultad',
  'documentos','ejercicio','elegible','ello','esfuerzos','espacios','especiales','esquemas','etapas',
  'formalizó','financiada','fuertes','funcionamiento','generado','generados','generar','generó',
  'gerencial','gestiones','gestores','grupos','haber','hecho','inclusión','informó','inicios',
  'manifestado','mencionados','opinión','partes','pendiente','permitieran','podía','podrá','presentan',
  'presentando','proyectados','punto','puesta','realización','realizará','realizará','reporta',
  'respuesta','revisaron','sigue','sólidos','sigue','sólo','superior','sustancial','torno','tramitar',
  'tres','utilización','ve','vi','vio','acordado','actualizado','adecuado','agencia','ahora','ajustar',
  'ajustó','alcanzado','alcanzarán','ambos','anteriores','aplicativo','aportes','apoyar','aprobó',
  'asimismo','atender','calculado','cercano','cinco','continuar','corresponden','cumplido','da','dará',
  'deberá','decidió','decididas','decidió','definido','definidas','denominada','determinar',
  'determinación','detalles','detalle','diferencias','dinámica','dinamismo','directivas',
  'disponibilidad','distintos','eb','ejecutadas','entendimiento','establecidas','evidente','explica')
bad_words_def<-c(bad_words,bad_words2)
sentences_unnested<-sentences%>%
  unnest_tokens(word,sentences)%>%
  mutate(word=removeNumbers(word))%>%
  mutate(word=removePunctuation(word))%>%
  filter(!word %in% bad_words_def)%>%
  filter(word!='')%>%
  count(OperNum,word,sort=TRUE)

total_words<-sentences_unnested%>%
  group_by(OperNum)%>%
  summarize(total_words=sum(n))%>%
  ungroup()
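
Before moving to weighted log odds, classic tf-idf from tidytext gives a quick cross-check of which words characterize each document (a sketch; bind_tf_idf takes the term, document, and count columns):

sentences_unnested%>%
  bind_tf_idf(word,OperNum,n)%>%
  arrange(desc(tf_idf))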

Document philosophy

In this section I evaluate how each document is performing and what it addresses, using weighted log odds (tidylo) to surface its distinctive vocabulary.

odds_documents<-sentences_unnested%>%
  left_join(total_words,by='OperNum')%>%
  bind_log_odds(OperNum,word,n)

odds_documents
## # A tibble: 9,821 x 5
##    OperNum  word            n total_words log_odds_weighted
##    <chr>    <chr>       <int>       <int>             <dbl>
##  1 CO-L1019 proyecto       88        2073            -6.77 
##  2 CO-L1019 invias         54        2073            19.8  
##  3 CO-L1092 colciencias    45        1858            19.6  
##  4 CO-L1105 ejecución      45        1414            -8.77 
##  5 CO-L1059 programa       44        1042             0.873
##  6 CO-L1019 ejecución      42        2073           -10.6  
##  7 CO-L1091 ejecución      40        2242           -12.0  
##  8 CO-L1091 proyecto       37        2242           -10.2  
##  9 CO-L1105 programa       37        1414            -9.83 
## 10 CO-L1091 recursos       34        2242            -5.67 
## # … with 9,811 more rows
documents<-odds_documents%>%
  group_by(OperNum)%>%
  top_n(1,log_odds_weighted)%>%
  head(10)%>%
  select(OperNum)

Here is the word-level profile for each of those documents:

odds_documents%>%
  filter(OperNum %in% documents$OperNum)%>%
  group_by(OperNum)%>%
  top_n(10,log_odds_weighted)%>%
  ungroup()%>%
  mutate(word=reorder_within(word,log_odds_weighted,OperNum))%>%
  ggplot(aes(log_odds_weighted,word))+
  geom_col()+
  scale_y_reordered()+
  facet_wrap(~OperNum, scales='free')

Everything above the dotted line (weighted log odds > 0) is the most distinctive vocabulary, and therefore the most important for understanding the document.

odds_documents%>%
  filter(OperNum=='CO-L1019',
         n>10)%>%
  ggplot(aes(n, log_odds_weighted, label = word)) +
  geom_hline(yintercept = 0, color = "gray50", lty = 2, size = 1.5) +
  geom_point(alpha = 0.8, color = "midnightblue") +
  geom_text_repel() +
  scale_x_log10()

Tagging

documents_matrix<-odds_documents%>%
  count(OperNum,word,wt = n)%>% # wt = n keeps the word frequencies; a bare count() would collapse every pair to 1
  mutate(word=removeNumbers(word))%>%
  mutate(word=removePunctuation(word))%>%
  filter(!word %in% bad_words_def)%>%
  filter(word!='')%>%
  cast_dtm(document = OperNum,term = word,
           value = n,weighting = tm::weightTf)
set.seed(111) # make the 80/20 document split reproducible
sample_size<-floor(.80*nrow(documents_matrix))
train_id<-sample(nrow(documents_matrix),size = sample_size)
train<-documents_matrix[train_id,]
test<-documents_matrix[-train_id,]

documents_LDA<-LDA(train,k = 10,method = 'Gibbs',control = list(seed=111))
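
The 80/20 split above leaves test unused so far; held-out perplexity is one way to sanity-check the choice of k = 10 (a sketch, assuming topicmodels' perplexity() method for Gibbs-fitted models; this can be slow):

# lower perplexity on the held-out documents is better
tibble(k=c(5,10,20))%>%
  mutate(perplexity=map_dbl(k,~perplexity(
    LDA(train,k=.x,method='Gibbs',control=list(seed=111)),
    newdata=test)))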

Beta matrix (per-topic word probabilities)

documents_betas<-tidy(documents_LDA,matrix='beta')
documents_betas
## # A tibble: 40,640 x 3
##    topic term             beta
##    <int> <chr>           <dbl>
##  1     1 actualizada 0.0000811
##  2     2 actualizada 0.0000774
##  3     3 actualizada 0.0000834
##  4     4 actualizada 0.000993 
##  5     5 actualizada 0.0000717
##  6     6 actualizada 0.0000826
##  7     7 actualizada 0.0000792
##  8     8 actualizada 0.0000816
##  9     9 actualizada 0.0000760
## 10    10 actualizada 0.0000746
## # … with 40,630 more rows

Principal words per topic

documents_betas%>%
  filter(!term %in% c('actualizada','adecuan','años','nueva'))%>%
  group_by(topic)%>%
  top_n(10,beta)%>%
  arrange(topic,-beta)%>%
  mutate(term=reorder_within(term,beta,topic))%>%
  ggplot(aes(beta,term, fill=factor(topic)))+
  geom_col(show.legend = FALSE)+
  facet_wrap(~topic,scales = 'free_y')+
  scale_y_reordered()

Topics per Document

Gamma is the per-document topic proportion. Note that the maxima below hover near 1/k = 0.1, so the topic assignments are fairly diffuse.

documents_gamma<-tidy(documents_LDA,matrix='gamma')
documents_gamma%>%
  group_by(document)%>%
  top_n(1,gamma)
## # A tibble: 35 x 3
## # Groups:   document [32]
##    document topic gamma
##    <chr>    <int> <dbl>
##  1 CO-L1059     1 0.160
##  2 CO-L1022     2 0.133
##  3 CO-L1156     2 0.137
##  4 CO-L1097     2 0.156
##  5 CO-L1126     3 0.125
##  6 CO-L1109     3 0.211
##  7 CO-L1092     3 0.152
##  8 CO-L1019     4 0.146
##  9 CO-L1126     4 0.125
## 10 CO-L1065     4 0.177
## # … with 25 more rows
topics<-documents_betas%>%
  mutate(topic = case_when(
    # match on the number itself: str_detect(topic,'1') would also catch topic 10
    topic == 1 ~'Adquisición de equipos',
    topic == 2 ~'Planes de implementación',
    topic == 3 ~'Presupuesto del Gobierno',
    topic == 4 ~'Operación Institucional',
    topic == 5 ~'Planificación',
    topic == 6 ~'Créditos',
    topic == 7 ~'Prestamos Financieros',
    topic == 8 ~'Contratación Estatal',
    topic == 9 ~'Supervisión Financiera por parte del Estado',
    topic == 10 ~'Cumplimiento a los planes de las organizaciones'
  ))

topics
## # A tibble: 40,640 x 3
##    topic                                           term             beta
##    <chr>                                           <chr>           <dbl>
##  1 Adquisición de equipos                          actualizada 0.0000811
##  2 Planes de implementación                        actualizada 0.0000774
##  3 Presupuesto del Gobierno                        actualizada 0.0000834
##  4 Operación Institucional                         actualizada 0.000993 
##  5 Planificación                                   actualizada 0.0000717
##  6 Créditos                                        actualizada 0.0000826
##  7 Prestamos Financieros                           actualizada 0.0000792
##  8 Contratación Estatal                            actualizada 0.0000816
##  9 Supervisión Financiera por parte del Estado     actualizada 0.0000760
## 10 Cumplimiento a los planes de las organizaciones actualizada 0.0000746
## # … with 40,630 more rows
gamma_score<-documents_gamma%>%
  group_by(document)%>%
  summarise(max=max(gamma))
## `summarise()` ungrouping output (override with `.groups` argument)
documents_gamma%>%
  inner_join(gamma_score,by=c('document','gamma'='max'))
## # A tibble: 35 x 3
##    document topic gamma
##    <chr>    <int> <dbl>
##  1 CO-L1059     1 0.160
##  2 CO-L1022     2 0.133
##  3 CO-L1156     2 0.137
##  4 CO-L1097     2 0.156
##  5 CO-L1126     3 0.125
##  6 CO-L1109     3 0.211
##  7 CO-L1092     3 0.152
##  8 CO-L1019     4 0.146
##  9 CO-L1126     4 0.125
## 10 CO-L1065     4 0.177
## # … with 25 more rows

Tagging per document

documents_tagging<-documents_gamma%>%
  group_by(document)%>%
  top_n(1,gamma)%>%
  mutate(topic = case_when(
    topic == 1 ~'Adquisición de equipos',
    topic == 2 ~'Planes de implementación',
    topic == 3 ~'Presupuesto del Gobierno',
    topic == 4 ~'Operación Institucional',
    topic == 5 ~'Planificación',
    topic == 6 ~'Créditos',
    topic == 7 ~'Prestamos Financieros',
    topic == 8 ~'Contratación Estatal',
    topic == 9 ~'Supervisión Financiera por parte del Estado',
    topic == 10 ~'Cumplimiento a los planes de las organizaciones'
  ))%>%
  distinct()%>%
  select(-gamma)

documents_tagging
## # A tibble: 35 x 2
## # Groups:   document [32]
##    document topic                   
##    <chr>    <chr>                   
##  1 CO-L1059 Adquisición de equipos  
##  2 CO-L1022 Planes de implementación
##  3 CO-L1156 Planes de implementación
##  4 CO-L1097 Planes de implementación
##  5 CO-L1126 Presupuesto del Gobierno
##  6 CO-L1109 Presupuesto del Gobierno
##  7 CO-L1092 Presupuesto del Gobierno
##  8 CO-L1019 Operación Institucional 
##  9 CO-L1126 Operación Institucional 
## 10 CO-L1065 Operación Institucional 
## # … with 25 more rows

Feature Engineering – Colombian Case

Caso_Colombia<-findings_orig.lang%>%
  filter(Country=='CO')%>%
  mutate(ration_finding=count.word_finding/count.character_finding)%>%
  left_join(documents_tagging, by=c('OperNum'='document'))%>%
  filter(!is.na(finding))%>%
  mutate(topic=if_else(is.na(topic),'Otro',topic))
Caso_Colombia 
## # A tibble: 634 x 19
##    OperNum Country finding PMR.cycle.ID_min PMR.cycle.ID_max Date.of.text.en…
##    <chr>   <chr>   <chr>              <dbl>            <dbl> <date>          
##  1 CO-L10… CO      others…                4               14 2014-04-07      
##  2 CO-L10… CO      others…                4               14 2014-04-07      
##  3 CO-L10… CO      projec…                4               14 2014-04-07      
##  4 CO-L10… CO      projec…                4               14 2014-04-07      
##  5 CO-L10… CO      3.2 a …               17               17 2016-03-30      
##  6 CO-L10… CO      3.3 ac…               17               17 2016-03-28      
##  7 CO-L10… CO      a dici…               23               23 2016-10-03      
##  8 CO-L10… CO      a dici…               23               26 2016-10-03      
##  9 CO-L10… CO      a juni…               22               22 2016-10-03      
## 10 CO-L10… CO      a juni…               22               22 2016-10-03      
## # … with 624 more rows, and 13 more variables: Date.of.text.entry_max <date>,
## #   Year.of.text.entry_min <int>, Year.of.text.entry_max <int>, language <chr>,
## #   lang_spanish <dbl>, lang_english <dbl>, lang_port <dbl>, lang_french <dbl>,
## #   lang_unknown <dbl>, count.character_finding <int>,
## #   count.word_finding <int>, ration_finding <dbl>, topic <chr>

Text Model – Colombian Case

library(textrecipes)
set.seed(123)
Caso_Colombia$ration_finding%>%mean()
## [1] 0.1600061
# label each finding by whether its word/character ratio beats the overall mean (~0.16)
Caso_Colombia<-Caso_Colombia%>%
  mutate(rating=case_when(ration_finding>0.16~'Good',TRUE~'Bad'))
review_split <- initial_split(Caso_Colombia, strata = rating)
review_train <- training(review_split)
review_test <- testing(review_split)
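# sanity check: the mean-based rating should leave both classes represented
review_train %>% count(rating)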
review_rec <- recipe(rating ~ finding, data = review_train) %>%
  step_tokenize(finding) %>%
  step_stopwords(finding, language = "es") %>% # the findings are Spanish; the default stopword list is English
  step_tokenfilter(finding, max_tokens = 500) %>%
  step_tfidf(finding)

review_prep <- prep(review_rec)
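# a peek at the features the recipe produces (one tfidf_finding_* column per kept token);
# a quick inspection sketch, not part of the modeling pipeline
juice(review_prep) %>%
  select(1:5) %>%
  glimpse()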
lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

lasso_wf <- workflow() %>%
  add_recipe(review_rec) %>%
  add_model(lasso_spec)

lambda_grid <- grid_regular(penalty(), levels = 10)
review_folds <- bootstraps(review_train,strata = rating) # resample only the training data so the test set doesn't leak into tuning
doParallel::registerDoParallel()

set.seed(2020)
lasso_grid <- tune_grid(
  lasso_wf,
  resamples = review_folds,
  grid = lambda_grid,
  metrics = metric_set(roc_auc, ppv, npv)
)
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10()
## Warning: Removed 1 row(s) containing missing values (geom_path).

best_auc <- lasso_grid %>%
  select_best("roc_auc")

final_lasso <- finalize_workflow(lasso_wf, best_auc)
final_lasso %>%
  fit(review_train) %>%
  pull_workflow_fit() %>%
  vi(lambda = best_auc$penalty) %>%
  group_by(Sign) %>%
  top_n(20, wt = abs(Importance)) %>%
  ungroup() %>%
  mutate(
    Importance = abs(Importance),
    Variable = str_remove(Variable, "tfidf_finding_"),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  filter(!Variable %in% bad_words_def)%>%
  filter(!Variable %in% c('años','progreso'))%>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~Sign, scales = "free_y") +
  labs(y = NULL)
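
Note that review_test has not been scored yet; last_fit() refits the finalized workflow on the training data and evaluates it once on the held-out split (a minimal sketch; the default metrics are accuracy and roc_auc):

final_res <- last_fit(final_lasso, review_split)
collect_metrics(final_res)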

PCA: problems within the documents

colombian_df<-Caso_Colombia%>%
  unnest_tokens(word,finding)%>%
  mutate(word=removeNumbers(word))%>%
  mutate(word=removePunctuation(word))%>%
  filter(!word %in% bad_words_def)%>%
  filter(word!='')%>%
  add_count(word)%>%
  filter(n>50)%>%
  select(OperNum,count.character_finding,count.word_finding,topic,word,n,ration_finding,rating)

colombian_df<-colombian_df%>%
  mutate(row=row_number())%>% # a row id so pivot_wider keeps one row per original row
  pivot_wider(names_from=word,values_from=n,values_fill=0)%>%
  select(-row)%>%
  janitor::clean_names() %>%
  na.omit()
colombian_df
## # A tibble: 4,737 x 55
##    oper_num count_character… count_word_find… topic ration_finding rating diseno
##    <chr>               <int>            <int> <chr>          <dbl> <chr>   <int>
##  1 CO-L1012              284               45 Supe…          0.158 Bad        55
##  2 CO-L1012              284               45 Supe…          0.158 Bad         0
##  3 CO-L1012              284               45 Supe…          0.158 Bad         0
##  4 CO-L1012              284               45 Supe…          0.158 Bad        55
##  5 CO-L1012              284               45 Supe…          0.158 Bad         0
##  6 CO-L1012              284               45 Supe…          0.158 Bad         0
##  7 CO-L1012              284               45 Supe…          0.158 Bad         0
##  8 CO-L1012              141               21 Supe…          0.149 Bad         0
##  9 CO-L1012              141               21 Supe…          0.149 Bad         0
## 10 CO-L1012              160               25 Supe…          0.156 Bad         0
## # … with 4,727 more rows, and 48 more variables: debido <int>, sistema <int>,
## #   operacion <int>, primer <int>, desembolso <int>, informacion <int>,
## #   contrato <int>, obra <int>, ejecucion <int>, financiera <int>, obras <int>,
## #   proyecto <int>, recursos <int>, invias <int>, banco <int>, disenos <int>,
## #   us <int>, millones <int>, plazo <int>, cambio <int>, avance <int>,
## #   dado <int>, actividades <int>, fecha <int>, procesos <int>, proceso <int>,
## #   gestion <int>, desarrollo <int>, proyectos <int>, ministerio <int>,
## #   prestamo <int>, capacidad <int>, institucional <int>, ejecutor <int>,
## #   plan <int>, parte <int>, anterior <int>, equipo <int>, metas <int>,
## #   transporte <int>, gobierno <int>, bid <int>, programa <int>,
## #   semestre <int>, nacional <int>, credito <int>, programacion <int>,
## #   resultados <int>
# cast every integer count column to numeric before the recipe steps
colombian_df<-colombian_df%>%
  mutate_if(is.integer,as.numeric)
pca_rec <- recipe(~., data = colombian_df) %>%
  update_role(oper_num, topic,rating, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)

pca_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          3
##  predictor         52
## 
## Training data contained 4737 data points and no missing data.
## 
## Operations:
## 
## Centering and scaling for count_character_finding, ... [trained]
## PCA extraction with count_character_finding, ... [trained]
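
Before looking at loadings, it is worth checking how much variance the retained components capture (a sketch, assuming a recipes version whose tidy() method for step_pca supports type = 'variance'; step_pca keeps five components by default):

tidy(pca_prep, 2, type = 'variance') %>%
  filter(terms == 'percent variance') %>%
  head(5)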
tidied_pca <- tidy(pca_prep, 2)

tidied_pca %>%
  filter(!terms %in% bad_words_def)%>%
  filter(!terms %in% c('us','plan'))%>%
  filter(component %in% paste0("PC", 1:6)) %>%
  mutate(component = fct_inorder(component))%>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1,scales = 'free_x') +
  scale_y_reordered()+
  labs(y = NULL)

tidied_pca %>%
  filter(!terms %in% bad_words_def)%>%
  filter(!terms %in% c('us','plan','primer','ration_finding','project','parte'))%>%
  filter(component %in% paste0("PC", 1:6))%>%
  top_n(100,abs(value))%>% # keep the largest loadings in either direction; top_n(100,terms) ranked alphabetically
  mutate(terms=reorder_within(terms,value,component))%>%
  ggplot(aes(value,terms,fill=value>0))+
  geom_col(show.legend = FALSE)+
  facet_wrap(~component,scales='free')+
  scale_y_reordered()

juice(pca_prep) %>%
  distinct(oper_num,.keep_all = TRUE)%>%
  ggplot(aes(PC1, PC2, label = topic)) +
  geom_point(aes(color = oper_num), alpha = 0.7, size = 2,show.legend = FALSE) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

library(embed)
umap_rec <- recipe(~., data = colombian_df) %>%
  update_role(oper_num, topic,rating, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

umap_prep <- prep(umap_rec)

umap_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          3
##  predictor         52
## 
## Training data contained 4737 data points and no missing data.
## 
## Operations:
## 
## Centering and scaling for count_character_finding, ... [trained]
## UMAP embedding for count_character_finding, ... [trained]
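
UMAP is stochastic, so this embedding can shift between runs; embed's step_umap() accepts a seed argument if reproducibility matters (a sketch; umap_rec_fixed is just an illustrative name):

umap_rec_fixed <- recipe(~., data = colombian_df) %>%
  update_role(oper_num, topic, rating, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors(), seed = c(111, 222))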

UMAP embedding

juice(umap_prep) %>%
  distinct(oper_num,.keep_all = TRUE)%>%
  ggplot(aes(umap_1, umap_2, label = topic)) +
  geom_point(aes(color = oper_num), alpha = 0.7, size = 2,show.legend = FALSE) +
  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)