#libraries
library(tidyverse);
library(tidytext);
library(tidymodels)
library(tidymetrics)
library(widyr)
library(ggrepel)
library(scales)
library(ggthemes)
library(tidylo)
library(topicmodels)
library(textfeatures)
library(vip)
theme_set(theme_solarized())findings_orig.lang%>%
count(Country,sort = TRUE)%>%
mutate(avg=n/sum(n))%>%
mutate(Country=fct_reorder(Country,n))%>%
ggplot(aes(avg,Country))+
geom_col()+
labs(title = 'Freq per Country by Documents',
y='') * is a Sample? * is a universe?
findings_orig.lang%>%
ggplot(aes(OperNum,count.character_finding,color=Country))+
geom_point()+
facet_wrap(~Year.of.text.entry_max,scales = 'free')+
theme(axis.text.x = element_blank())## Warning: Removed 28 rows containing missing values (geom_point).
With this feature I think that is possible create filter for year <‘2019’, for create features.
countries<-findings_orig.lang%>%
distinct(OperNum,.keep_all = TRUE)%>%
distinct(Country,.keep_all = TRUE)%>%
top_n(10,count.character_finding)%>%
select(Country)findings_orig.lang%>%
distinct(OperNum,.keep_all = TRUE)%>%
filter(!Country %in% countries$Country)%>%
filter(Year.of.text.entry_min<2019)%>%
ggplot(aes(OperNum,count.character_finding))+
geom_line(group=1)+
geom_point(aes(color=Country),show.legend = FALSE)+
geom_text(aes(label=OperNum),vjust=1,hjust=1,check_overlap = TRUE,size=2)+
theme(axis.text.x = element_blank())+
facet_wrap(~Country,scales='free')## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
With this dinamic I think that I Can create some Features
findings_orig.lang%>%
distinct(OperNum,.keep_all = TRUE)%>%
filter(!Country %in% countries$Country)%>%
filter(Year.of.text.entry_min<2019)%>%
mutate(ration_finding=count.word_finding/count.character_finding)%>%
mutate(Country=fct_reorder(Country,ration_finding))%>%
ggplot(aes(Country,ration_finding))+
geom_boxplot()+
coord_flip()## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
Welll With this feature we Can Develop PCA’s for NLP sentences.
findings_per_document<-findings_orig.lang%>%
distinct(OperNum,.keep_all = TRUE)%>%
filter(!Country %in% countries$Country)%>%
filter(Year.of.text.entry_min<2019)%>%
mutate(ration_finding=count.word_finding/count.character_finding)%>%
group_by(Country)%>%
fill(ration_finding,.direction = 'downup')%>%
ungroup()
findings_per_document## # A tibble: 512 x 18
## OperNum Country finding PMR.cycle.ID_min PMR.cycle.ID_max Date.of.text.en…
## <chr> <chr> <chr> <dbl> <dbl> <date>
## 1 BA-L10… BA "overa… 10 14 2014-08-13
## 2 BA-L10… BA "given… 14 14 2015-03-02
## 3 BA-L10… BA "1) th… 15 15 2015-07-15
## 4 BA-L10… BA "other… 6 14 2014-06-05
## 5 BA-L10… BA "inter… 22 23 2016-10-09
## 6 BA-L10… BA "durin… 28 28 2018-09-28
## 7 BA-L10… BA "after… 28 28 2018-09-20
## 8 BA-L10… BA "1.2 (… 15 15 2015-07-15
## 9 BA-L10… BA "2.1 a… 22 23 2016-10-20
## 10 BA-L10… BA "proje… 9 14 2014-04-07
## # … with 502 more rows, and 12 more variables: Date.of.text.entry_max <date>,
## # Year.of.text.entry_min <int>, Year.of.text.entry_max <int>, language <chr>,
## # lang_spanish <dbl>, lang_english <dbl>, lang_port <dbl>, lang_french <dbl>,
## # lang_unknown <dbl>, count.character_finding <int>,
## # count.word_finding <int>, ration_finding <dbl>
findings_per_document%>%
filter(!is.na(finding))%>%
group_by(Country)%>%
summarize(correlation=cor(count.character_finding,count.word_finding))## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 17 x 2
## Country correlation
## <chr> <dbl>
## 1 BA 0.995
## 2 BL 0.997
## 3 BO 0.997
## 4 BR 0.998
## 5 CO 0.992
## 6 CR 0.999
## 7 DR 0.999
## 8 ES 0.999
## 9 GU 0.998
## 10 GY 0.998
## 11 HA 0.998
## 12 JA 0.998
## 13 ME 0.999
## 14 NI 0.998
## 15 RG 0.997
## 16 SU 0.982
## 17 UR 0.996
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
findings_orig.lang%>%
filter(!is.na(finding))%>%
group_by(Country,Year.of.text.entry_min)%>%
summarize(n=n(),
avg_finding=round(mean(count.word_finding),0))%>%
filter(avg_finding<400)%>%
ggplot(aes(n,avg_finding))+
geom_point()+
geom_smooth(method = 'lm')## `summarise()` regrouping output by 'Country' (override with `.groups` argument)
## `geom_smooth()` using formula 'y ~ x'
In this case I working with Colombia
sentences<-findings_orig.lang%>%
filter(Country=='CO',
!is.na(finding))%>%
mutate(finding=str_replace(finding,'others:',''))%>%
mutate(ration_finding=count.word_finding/count.character_finding)%>%
unnest_tokens(token = 'sentences',output = 'sentences',input = finding)%>%
select(OperNum,sentences,ration_finding)
## Numbers of sentences per document
sentences %>%
count(OperNum)## # A tibble: 41 x 2
## OperNum n
## <chr> <int>
## 1 CO-L1012 4
## 2 CO-L1019 50
## 3 CO-L1022 6
## 4 CO-L1028 41
## 5 CO-L1034 45
## 6 CO-L1041 5
## 7 CO-L1052 4
## 8 CO-L1059 24
## 9 CO-L1065 8
## 10 CO-L1076 10
## # … with 31 more rows
In this part I eliminate words that dont have meaning within sentences
library(stopwords)
library(tm)
bad_words<-stopwords('spanish')
bad_words2<-c('año','usd','i','ii','iii','co','asÃ','fin','mill','vez','si','l','tener','oc','iv','nuevos','problema','santa','dio','p','enero','febrero','marzo','abril','mayo','junio','julio','agosto','septiembre','octubre','noviembre','diciembre','aproximadamente','varios','ende','tipo','aún','fase','costo','existe','logró','v','acerca','b','entrada','forma','gran','igualmente','misma','orden','oficina','output','podrÃa','presentaron','propuesta','propuestas','referencia','reporte','san','tercer','térmico','zonas','caja','alrededor','asignados','decir','después','dÃas','dicha','diseñado','esperaba','esperado','finalizar','importancia','llevó','maestro','mejorar','necesarios','niveles','particular','planteado','presente','principios','reflejado','registra','reportado','requiere','secado','solo','supuesto','with','working','actualización','alto','asociado','considerando','correspondiente','dichos','escasa','fa','figura','genera','justificación','km','llevado','medidas','menos','modo','municipio','necesarias','necesidades','parcialmente','partir','pese','personas','pmasis','presentar','previas','previstos','real','realizando','sentido','suscrito','tema','vÃa','acorde','acorde','actividad','adecuada','adelantando','ándres','atención','comprometer','consultores','corresponde','crÃtica','cuarto','efectos','ejecutores','entonces','especial','establecidos','crÃtica','ejecutores','especial','establecidos','existen','incluido','lograr','logro','luego','mas','más','mayores','mecanismo','mejor','notable','nuevas','objeto','oferta','original','pari','passu','posibilidad','posterior','propósito','','realidad','realizaron','red','satisfactoria','solicitado','últimos','varias','además','afectar','aplicación','aprobados','artÃculo','competencia','competencias','conjunto','consecución','creó','considerable','consolidación','convocatoria','cronogramas','cuatro','darle','definidos','desarrollado','efectiva','eficiente','ejecutados','especialista','estima','extensión','gestor','igual','indica','informa','interno','internos','largo','llega','lugar','manual','mismas','mll','múltiples','obtener','permitido','plazos','porcentaje','potenciales','presentación','presentó','prevé','primeras','providencia','proyecciones','puedan','realizada','reglamento','relacionada','renovación','restricciones','semanas','siguiente','solicitar','soluciones','suscribir','suscripción','tales','todas','todo','tomo','usuarios','vÃas','abstuvo','acumulado','adelantaron','afectó','apenas','apoya','c','buena','centros','compromisos','conjunta','considerablemente','consorcio','contenido','continua','cronograma','deben','deberán','dialogo','disponibles','dificultad','documentos','ejercicio','elegible','ello','esfuerzos','espacios','especiales','esquemas','etapas','formalizó','financiada','fuertes','funcionamiento','generado','generados','generar','generó','gerencial','gestiones','gestores','grupos','haber','hecho','inclusión','informó','inicios','manifestado','mencionados','opinión','partes','pendiente','permitieran','podÃa','podrá','presentan','presentando','proyectados','punto','puesta','realización','realizará','realizará','reporta','respuesta','revisaron','sigue','sólidos','sigue','sólo','superior','sustancial','torno','tramitar','tres','utilización','ve','vi','vio','acordado','actualizado','adecuado','agencia','ahora','ajustar','ajustó','alcanzado','alcanzarán','ambos','anteriores','aplicativo','aportes','apoyar','aprobó','asimismo','atender','calculado','cercano','cinco','continuar','corresponden','cumplido','da','dará','deberá','decidió','decididas','decidió','definido','definidas','denominada','determinar','determinación','detalles','detalle','diferencias','dinámica','dinamismo','directivas','disponibilidad','distintos','eb','ejecutadas','entendimiento','establecidas','evidente','explica')
bad_words_def<-c(bad_words,bad_words2)
sentences_unnested<-sentences%>%
unnest_tokens(word,sentences)%>%
mutate(word=removeNumbers(word))%>%
mutate(word=removePunctuation(word))%>%
filter(!word %in% bad_words_def)%>%
filter(word!='')%>%
count(OperNum,word,sort=TRUE)
total_words<-sentences_unnested%>%
group_by(OperNum)%>%
summarize(total_words=sum(n))%>%
ungroup()In this section I evaluate how the document is performing and what it addresses
odds_documents<-sentences_unnested%>%
left_join(total_words,by='OperNum')%>%
bind_log_odds(OperNum,word,n)
odds_documents## # A tibble: 9,821 x 5
## OperNum word n total_words log_odds_weighted
## <chr> <chr> <int> <int> <dbl>
## 1 CO-L1019 proyecto 88 2073 -6.77
## 2 CO-L1019 invias 54 2073 19.8
## 3 CO-L1092 colciencias 45 1858 19.6
## 4 CO-L1105 ejecución 45 1414 -8.77
## 5 CO-L1059 programa 44 1042 0.873
## 6 CO-L1019 ejecución 42 2073 -10.6
## 7 CO-L1091 ejecución 40 2242 -12.0
## 8 CO-L1091 proyecto 37 2242 -10.2
## 9 CO-L1105 programa 37 1414 -9.83
## 10 CO-L1091 recursos 34 2242 -5.67
## # … with 9,811 more rows
documents<-odds_documents%>%
group_by(OperNum)%>%
top_n(1,log_odds_weighted)%>%
head(10)%>%
select(OperNum)Here is all the history per document
odds_documents%>%
filter(OperNum %in% documents$OperNum)%>%
group_by(OperNum)%>%
top_n(10,log_odds_weighted)%>%
ungroup()%>%
mutate(word=reorder_within(word,log_odds_weighted,OperNum))%>%
ggplot(aes(log_odds_weighted,word))+
geom_col()+
scale_y_reordered()+
facet_wrap(~OperNum, scales='free')Everything on the dotted line is the most important thing to understand within the document.
odds_documents%>%
filter(OperNum=='CO-L1019',
n>10)%>%
ggplot(aes(n, log_odds_weighted, label = word)) +
geom_hline(yintercept = 0, color = "gray50", lty = 2, size = 1.5) +
geom_point(alpha = 0.8, color = "midnightblue") +
geom_text_repel() +
scale_x_log10()documents_matrix<-odds_documents%>%
count(OperNum,word)%>%
mutate(word=removeNumbers(word))%>%
mutate(word=removePunctuation(word))%>%
filter(!word %in% bad_words_def)%>%
filter(word!='')%>%
cast_dtm(document = OperNum,term = word,
value = n,weighting = tm::weightTf)
sample_size<-floor(.80*nrow(documents_matrix))
train_id<-sample(nrow(documents_matrix),size = sample_size)
train<-documents_matrix[train_id,]
test<-documents_matrix[-train_id,]
documents_LDA<-LDA(train,k = 10,method = 'Gibbs',control = list(seed=111))Beta distribution
## # A tibble: 40,640 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 actualizada 0.0000811
## 2 2 actualizada 0.0000774
## 3 3 actualizada 0.0000834
## 4 4 actualizada 0.000993
## 5 5 actualizada 0.0000717
## 6 6 actualizada 0.0000826
## 7 7 actualizada 0.0000792
## 8 8 actualizada 0.0000816
## 9 9 actualizada 0.0000760
## 10 10 actualizada 0.0000746
## # … with 40,630 more rows
Principal words per Topics
documents_betas%>%
filter(!term %in% c('actualizada','adecuan','años','nueva'))%>%
group_by(topic)%>%
top_n(10,beta)%>%
arrange(topic,-beta)%>%
mutate(term=reorder_within(term,beta,topic))%>%
ggplot(aes(beta,term, fill=factor(topic)))+
geom_col(show.legend = FALSE)+
facet_wrap(~topic,scales = 'free_y')+
scale_y_reordered()documents_gamma<-tidy(documents_LDA,matrix='gamma')
documents_gamma%>%
group_by(document)%>%
top_n(1,gamma)## # A tibble: 35 x 3
## # Groups: document [32]
## document topic gamma
## <chr> <int> <dbl>
## 1 CO-L1059 1 0.160
## 2 CO-L1022 2 0.133
## 3 CO-L1156 2 0.137
## 4 CO-L1097 2 0.156
## 5 CO-L1126 3 0.125
## 6 CO-L1109 3 0.211
## 7 CO-L1092 3 0.152
## 8 CO-L1019 4 0.146
## 9 CO-L1126 4 0.125
## 10 CO-L1065 4 0.177
## # … with 25 more rows
topics<-documents_betas%>%
mutate(topic = case_when(
str_detect(topic,'1')~'Adquisición de equipos',
str_detect(topic,'2')~'Planes de implementación',
str_detect(topic,'3')~'Presupuesto del Gobierno',
str_detect(topic,'4')~'Operación Institucional',
str_detect(topic,'5')~'Planificación',
str_detect(topic,'6')~'Créditos',
str_detect(topic,'7')~'Prestamos Financieros',
str_detect(topic,'8')~'Contratación Estatal',
str_detect(topic,'9')~'Supervisión Financiera por parte del Estado',
str_detect(topic,'10')~'Cumplimiento a los planes de las organizaciones',
))
topics## # A tibble: 40,640 x 3
## topic term beta
## <chr> <chr> <dbl>
## 1 Adquisición de equipos actualizada 0.0000811
## 2 Planes de implementación actualizada 0.0000774
## 3 Presupuesto del Gobierno actualizada 0.0000834
## 4 Operación Institucional actualizada 0.000993
## 5 Planificación actualizada 0.0000717
## 6 Créditos actualizada 0.0000826
## 7 Prestamos Financieros actualizada 0.0000792
## 8 Contratación Estatal actualizada 0.0000816
## 9 Supervisión Financiera por parte del Estado actualizada 0.0000760
## 10 Adquisición de equipos actualizada 0.0000746
## # … with 40,630 more rows
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 35 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 CO-L1059 1 0.160
## 2 CO-L1022 2 0.133
## 3 CO-L1156 2 0.137
## 4 CO-L1097 2 0.156
## 5 CO-L1126 3 0.125
## 6 CO-L1109 3 0.211
## 7 CO-L1092 3 0.152
## 8 CO-L1019 4 0.146
## 9 CO-L1126 4 0.125
## 10 CO-L1065 4 0.177
## # … with 25 more rows
documents_tagging<-documents_gamma%>%
group_by(document)%>%
top_n(1,gamma)%>%
mutate(topic = case_when(
str_detect(topic,'1')~'Adquisición de equipos',
str_detect(topic,'2')~'Planes de implementación',
str_detect(topic,'3')~'Presupuesto del Gobierno',
str_detect(topic,'4')~'Operación Institucional',
str_detect(topic,'5')~'Planificación',
str_detect(topic,'6')~'Créditos',
str_detect(topic,'7')~'Prestamos Financieros',
str_detect(topic,'8')~'Contratación Estatal',
str_detect(topic,'9')~'Supervisión Financiera por parte del Estado',
str_detect(topic,'10')~'Cumplimiento a los planes de las organizaciones',
))%>%
distinct()%>%
select(-gamma)
documents_tagging## # A tibble: 35 x 2
## # Groups: document [32]
## document topic
## <chr> <chr>
## 1 CO-L1059 Adquisición de equipos
## 2 CO-L1022 Planes de implementación
## 3 CO-L1156 Planes de implementación
## 4 CO-L1097 Planes de implementación
## 5 CO-L1126 Presupuesto del Gobierno
## 6 CO-L1109 Presupuesto del Gobierno
## 7 CO-L1092 Presupuesto del Gobierno
## 8 CO-L1019 Operación Institucional
## 9 CO-L1126 Operación Institucional
## 10 CO-L1065 Operación Institucional
## # … with 25 more rows
Caso_Colombia<-findings_orig.lang%>%
filter(Country=='CO')%>%
mutate(ration_finding=count.word_finding/count.character_finding)%>%
left_join(documents_tagging, by=c('OperNum'='document'))%>%
filter(!is.na(finding))%>%
mutate(topic=if_else(is.na(topic),'Otro',topic))
Caso_Colombia ## # A tibble: 634 x 19
## OperNum Country finding PMR.cycle.ID_min PMR.cycle.ID_max Date.of.text.en…
## <chr> <chr> <chr> <dbl> <dbl> <date>
## 1 CO-L10… CO others… 4 14 2014-04-07
## 2 CO-L10… CO others… 4 14 2014-04-07
## 3 CO-L10… CO projec… 4 14 2014-04-07
## 4 CO-L10… CO projec… 4 14 2014-04-07
## 5 CO-L10… CO 3.2 a … 17 17 2016-03-30
## 6 CO-L10… CO 3.3 ac… 17 17 2016-03-28
## 7 CO-L10… CO a dici… 23 23 2016-10-03
## 8 CO-L10… CO a dici… 23 26 2016-10-03
## 9 CO-L10… CO a juni… 22 22 2016-10-03
## 10 CO-L10… CO a juni… 22 22 2016-10-03
## # … with 624 more rows, and 13 more variables: Date.of.text.entry_max <date>,
## # Year.of.text.entry_min <int>, Year.of.text.entry_max <int>, language <chr>,
## # lang_spanish <dbl>, lang_english <dbl>, lang_port <dbl>, lang_french <dbl>,
## # lang_unknown <dbl>, count.character_finding <int>,
## # count.word_finding <int>, ration_finding <dbl>, topic <chr>
## [1] 0.1600061
Caso_Colombia<-Caso_Colombia%>%
mutate(rating=case_when(ration_finding>0.16~'Good',TRUE~'Bad'))
review_split <- initial_split(Caso_Colombia, strata = rating)
review_train <- training(review_split)
review_test <- testing(review_split)
review_rec <- recipe(rating ~ finding, data = review_train) %>%
step_tokenize(finding) %>%
step_stopwords(finding) %>%
step_tokenfilter(finding, max_tokens = 500) %>%
step_tfidf(finding)
review_prep <- prep(review_rec)lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
lasso_wf <- workflow() %>%
add_recipe(review_rec) %>%
add_model(lasso_spec)
lambda_grid <- grid_regular(penalty(), levels = 10)
review_folds <- bootstraps(Caso_Colombia,strata = rating)
doParallel::registerDoParallel()
set.seed(2020)
lasso_grid <- tune_grid(
lasso_wf,
resamples = review_folds,
grid = lambda_grid,
metrics = metric_set(roc_auc, ppv, npv)
)lasso_grid %>%
collect_metrics() %>%
ggplot(aes(penalty, mean, color = .metric)) +
geom_line(size = 1.5, show.legend = FALSE) +
facet_wrap(~.metric) +
scale_x_log10()## Warning: Removed 1 row(s) containing missing values (geom_path).
best_auc <- lasso_grid %>%
select_best("roc_auc")
final_lasso <- finalize_workflow(lasso_wf, best_auc)final_lasso %>%
fit(review_train) %>%
pull_workflow_fit() %>%
vi(lambda = best_auc$penalty) %>%
group_by(Sign) %>%
top_n(20, wt = abs(Importance)) %>%
ungroup() %>%
mutate(
Importance = abs(Importance),
Variable = str_remove(Variable, "tfidf_finding_"),
Variable = fct_reorder(Variable, Importance)
) %>%
filter(!Variable %in% bad_words_def)%>%
filter(!Variable %in% c('años','progreso'))%>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col(show.legend = FALSE) +
facet_wrap(~Sign, scales = "free_y") +
labs(y = NULL)colombian_df<-Caso_Colombia%>%
unnest_tokens(word,finding)%>%
mutate(word=removeNumbers(word))%>%
mutate(word=removePunctuation(word))%>%
filter(!word %in% bad_words_def)%>%
filter(word!='')%>%
add_count(word)%>%
filter(n>50)%>%
select(OperNum,count.character_finding,count.word_finding,topic,word,n,ration_finding,rating)
colombian_df<-colombian_df%>%
mutate(OperNum)%>%
mutate(row=row_number())%>%
pivot_wider(names_from=word,values_from=n,values_fill=0)%>%
select(-row)%>%
janitor::clean_names() %>%
na.omit()
colombian_df## # A tibble: 4,737 x 55
## oper_num count_character… count_word_find… topic ration_finding rating diseno
## <chr> <int> <int> <chr> <dbl> <chr> <int>
## 1 CO-L1012 284 45 Supe… 0.158 Bad 55
## 2 CO-L1012 284 45 Supe… 0.158 Bad 0
## 3 CO-L1012 284 45 Supe… 0.158 Bad 0
## 4 CO-L1012 284 45 Supe… 0.158 Bad 55
## 5 CO-L1012 284 45 Supe… 0.158 Bad 0
## 6 CO-L1012 284 45 Supe… 0.158 Bad 0
## 7 CO-L1012 284 45 Supe… 0.158 Bad 0
## 8 CO-L1012 141 21 Supe… 0.149 Bad 0
## 9 CO-L1012 141 21 Supe… 0.149 Bad 0
## 10 CO-L1012 160 25 Supe… 0.156 Bad 0
## # … with 4,727 more rows, and 48 more variables: debido <int>, sistema <int>,
## # operacion <int>, primer <int>, desembolso <int>, informacion <int>,
## # contrato <int>, obra <int>, ejecucion <int>, financiera <int>, obras <int>,
## # proyecto <int>, recursos <int>, invias <int>, banco <int>, disenos <int>,
## # us <int>, millones <int>, plazo <int>, cambio <int>, avance <int>,
## # dado <int>, actividades <int>, fecha <int>, procesos <int>, proceso <int>,
## # gestion <int>, desarrollo <int>, proyectos <int>, ministerio <int>,
## # prestamo <int>, capacidad <int>, institucional <int>, ejecutor <int>,
## # plan <int>, parte <int>, anterior <int>, equipo <int>, metas <int>,
## # transporte <int>, gobierno <int>, bid <int>, programa <int>,
## # semestre <int>, nacional <int>, credito <int>, programacion <int>,
## # resultados <int>
colombian_df$count_character_finding<-as.numeric(colombian_df$count_character_finding)
colombian_df$count_word_finding<-as.numeric(colombian_df$count_word_finding)
colombian_df<-colombian_df%>%
mutate_if(is.integer,as.numeric)
pca_rec <- recipe(~., data = colombian_df) %>%
update_role(oper_num, topic,rating, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
pca_prep <- prep(pca_rec)
pca_prep## Data Recipe
##
## Inputs:
##
## role #variables
## id 3
## predictor 52
##
## Training data contained 4737 data points and no missing data.
##
## Operations:
##
## Centering and scaling for count_character_finding, ... [trained]
## PCA extraction with count_character_finding, ... [trained]
tidied_pca <- tidy(pca_prep, 2)
tidied_pca %>%
filter(!terms %in% bad_words_def)%>%
filter(!terms %in% c('us','plan'))%>%
filter(component %in% paste0("PC", 1:6)) %>%
mutate(component = fct_inorder(component))%>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1,scales = 'free_x') +
scale_y_reordered()+
labs(y = NULL)tidied_pca %>%
filter(!terms %in% bad_words_def)%>%
filter(!terms %in% c('us','plan','primer','ration_finding','project','parte'))%>%
filter(component %in% paste0("PC", 1:6))%>%
top_n(100,terms)%>%
mutate(terms=reorder_within(terms,value,component))%>%
ggplot(aes(value,terms,fill=value>0))+
geom_col(show.legend = FALSE)+
facet_wrap(~component,scales='free')+
scale_y_reordered()juice(pca_prep) %>%
distinct(oper_num,.keep_all = TRUE)%>%
ggplot(aes(PC1, PC2, label = topic)) +
geom_point(aes(color = oper_num), alpha = 0.7, size = 2,show.legend = FALSE) +
geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)library(embed)
umap_rec <- recipe(~., data = colombian_df) %>%
update_role(oper_num, topic,rating, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors())
umap_prep <- prep(umap_rec)
umap_prep## Data Recipe
##
## Inputs:
##
## role #variables
## id 3
## predictor 52
##
## Training data contained 4737 data points and no missing data.
##
## Operations:
##
## Centering and scaling for count_character_finding, ... [trained]
## UMAP embedding for count_character_finding, ... [trained]
juice(umap_prep) %>%
distinct(oper_num,.keep_all = TRUE)%>%
ggplot(aes(umap_1, umap_2, label = topic)) +
geom_point(aes(color = oper_num), alpha = 0.7, size = 2,show.legend = FALSE) +
geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)