setwd("~/Mestrado/Ciencias de Dados em Psicometria/Tijolos")
usos <- read_excel("usos.xlsx", sheet = "Usos")
# Activity 1: cardboard box
# Activity 2: bricks
# Select activity 2 (bricks), keeping only non-missing responses
dt_cx <- usos %>% filter(ativi == 2 & !is.na(resposta)) %>% select(1:5)
# Compute fluency (number of responses per subject and occasion) and examine its distribution
dt_cx %>% group_by(ID, pre_pos) %>% count() %>%
  arrange(ID, pre_pos) %>% filter(n < 25) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 2, color = "white", fill = "gray") +
  scale_x_continuous(breaks = seq(1, 25), limits = c(1, 25))
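# A minimal sketch (not in the original analysis): fluency summaries by
# occasion, assuming pre_pos codes pre- vs post-test
dt_cx %>% group_by(ID, pre_pos) %>% count() %>%
  group_by(pre_pos) %>% summarize(mean = mean(n), sd = sd(n))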
# Correlation between pre- and post-test fluency
dt_cx %>% group_by(ID, pre_pos) %>% count() %>%
  arrange(ID, pre_pos) %>% filter(n < 25) %>%
  spread(key = pre_pos, value = n) %>%
  ggplot(aes(x = `1`, y = `2`)) + geom_point()
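# A minimal sketch of the numeric correlation behind the scatterplot
# (same pipeline; occasions assumed coded 1 = pre, 2 = post)
dt_cx %>% group_by(ID, pre_pos) %>% count() %>%
  filter(n < 25) %>%
  spread(key = pre_pos, value = n) %>%
  ungroup() %>%
  summarize(r = cor(`1`, `2`, use = "pairwise.complete.obs"))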
# Tokenize responses into words
dt_cx2 <- dt_cx %>% unnest_tokens(words, resposta)
length(unique(dt_cx2$words))
## [1] 931
# Read stopword list
stopwords <- read_csv(
  file = "http://www.labape.com.br/rprimi/ds/stopwords.txt",
  col_names = FALSE)
names(stopwords) <- "words"
dt_cx2 %>% count(words, sort = TRUE)
## # A tibble: 931 x 2
## words n
## <chr> <int>
## 1 de 368
## 2 fazer 250
## 3 casa 223
## 4 uma 208
## 5 um 117
## 6 tijolo 104
## 7 para 99
## 8 construir 96
## 9 churrasqueira 79
## 10 mesa 75
## # ... with 921 more rows
# Most frequent words overall
dt_cx2 %>%
  count(words, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(words = reorder(words, n)) %>%
  ggplot(aes(words, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# Remove stopwords
dt_cx2 %>%
  count(words, sort = TRUE) %>%
  anti_join(stopwords) %>%
  filter(n > 15) %>%
  mutate(word = reorder(words, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
# Word cloud
library(wordcloud)
dt_cx2 %>%
  anti_join(stopwords) %>%
  count(words) %>%
  with(wordcloud(words, n,
                 colors = brewer.pal(9, "Set1"),  # Set1 has at most 9 colors
                 max.words = 100))
# Add two stopwords missed by the list
stopwords[221, "words"] <- "pra"
stopwords[222, "words"] <- "dá"
A term's inverse document frequency (idf) decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents. This can be combined with term frequency to calculate a term's tf-idf (the two quantities multiplied together), the frequency of a term adjusted for how rarely it is used (p. 31).
The idea of tf-idf is to find the important words for the content of each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection or corpus of documents (p. 37).
\[idf_{\text{term}} = \ln \frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\]
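As a minimal worked sketch (toy data, not from the study), the same weighting can be computed with tidytext's bind_tf_idf(), the function used below:
library(dplyr)
library(tibble)
library(tidytext)
toy <- tribble(
  ~doc, ~word,     ~n,
  "a",  "casa",     3,  # in both documents: idf = ln(2/2) = 0
  "b",  "casa",     1,
  "a",  "castelo",  1   # in one of two documents: idf = ln(2/1) ≈ 0.69
)
toy %>% bind_tf_idf(term = word, document = doc, n = n)
# "casa" gets tf_idf = 0 everywhere; "castelo" gets tf * ln(2) in document "a"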
# Count word occurrences per subject (each subject = one document)
subj_words <- dt_cx2 %>%
  count(ID, words, sort = TRUE) %>%
  ungroup()
# Total number of words per subject
total_words <- subj_words %>%
  group_by(ID) %>%
  summarize(total = sum(n))
# Join the totals back into subj_words
subj_words <- left_join(subj_words, total_words)
# Term frequency: word count / subject total
subj_words <- subj_words %>%
  mutate(term_freq = n/total)
# Number of documents (subjects)
subj_words <- subj_words %>%
  mutate(n_docs = n_distinct(ID))
# In how many documents each word appears (document frequency)
subj_words <- subj_words %>%
  group_by(words) %>%
  mutate(word_doc_freq = n_distinct(ID)/n_docs,
         inv_word_doc_freq = n_docs/n_distinct(ID),
         ln_inv_word_doc_freq = log(n_docs/n_distinct(ID)))
# Drop the spurious "0" token
subj_words <- subj_words %>% filter(words != "0")
# Same quantities via tidytext
subj_words <- subj_words %>%
  bind_tf_idf(term = words, document = ID, n = n)
# Document-term matrix of tf-idf weights (column 12 = tf_idf)
dtm <- subj_words %>% select(c(1, 2, 12)) %>%
  spread(key = words, value = tf_idf)
subj_words %>%
arrange(desc(tf_idf))
## # A tibble: 3,907 x 12
## # Groups: words [930]
## ID words n total term_freq n_docs word_doc_freq
## <dbl> <chr> <int> <int> <dbl> <int> <dbl>
## 1 107537444 cenários 1 2 0.5 253 0.00395
## 2 108004784 boné 1 2 0.5 253 0.00395
## 3 109580516 reta 1 2 0.5 253 0.00395
## 4 106299770 predio 1 1 1 253 0.0751
## 5 105537171 carinho 1 2 0.5 253 0.00791
## 6 109094091 maquete 1 2 0.5 253 0.00791
## 7 108644067 carrinho 1 1 1 253 0.0949
## 8 104811247 quadro 1 2 0.5 253 0.0119
## 9 109168832 chaminés 1 2 0.5 253 0.0158
## 10 110218509 espada 1 2 0.5 253 0.0158
## # ... with 3,897 more rows, and 5 more variables: inv_word_doc_freq <dbl>,
## # ln_inv_word_doc_freq <dbl>, tf <dbl>, idf <dbl>, tf_idf <dbl>
subj_words %>%
  ggplot(aes(x = tf, y = tf_idf, color = word_doc_freq)) +
  geom_point(alpha = 1/4) +
  scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
  scale_y_continuous(breaks = seq(0, 2.5, .5), limits = c(0, 2.5))
# install.packages('servr')
library(stringr)
library(text2vec)
prep_fun <- tolower
tok_fun <- word_tokenizer
it = itoken(
  dt_cx$resposta,
  ids = dt_cx$ID,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  progressbar = FALSE)
vocab = create_vocabulary(it, stopwords = stopwords$words) %>%
  prune_vocabulary(
    term_count_min = 1,
    doc_proportion_max = 0.8
  )
vectorizer = vocab_vectorizer(vocab)
dtm = create_dtm(it, vectorizer)
dim(dtm)
## [1] 2911 892
# define tfidf model
tfidf = TfIdf$new()
# fit model to train data and transform train data with fitted model
dtm_tfidf = fit_transform(dtm, tfidf)
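# Hedged sketch: the fitted tfidf model can re-weight new data built with the
# same vectorizer (it_new is hypothetical, not defined in this analysis):
# dtm_new_tfidf <- create_dtm(it_new, vectorizer) %>% transform(tfidf)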
lda_model = LDA$new(
  n_topics = 12,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
doc_topic_distr =
  lda_model$fit_transform(
    x = dtm_tfidf,
    n_iter = 1000,
    convergence_tol = 0.001,
    n_check_convergence = 25,
    progressbar = FALSE
  )
## INFO [2018-12-11 14:04:03] iter 25 loglikelihood = -48570.240
## INFO [2018-12-11 14:04:04] iter 50 loglikelihood = -46577.067
## INFO [2018-12-11 14:04:04] iter 75 loglikelihood = -46333.269
## INFO [2018-12-11 14:04:04] iter 100 loglikelihood = -46231.245
## INFO [2018-12-11 14:04:04] iter 125 loglikelihood = -46184.585
## INFO [2018-12-11 14:04:04] iter 150 loglikelihood = -46196.731
## INFO [2018-12-11 14:04:04] early stopping at 150 iteration
lda_model$get_top_words(n = 12, topic_number = 1:12, lambda = 0.2)
## [,1] [,2] [,3] [,4] [,5]
## [1,] "parede" "churrasqueira" "mesa" "prédio" "muros"
## [2,] "muro" "banco" "predios" "telhado" "cama"
## [3,] "teto" "castelo" "chão" "chaminé" "escolas"
## [4,] "mercados" "celular" "cadeiras" "fogão" "calçada"
## [5,] "pontes" "janelas" "avião" "lixo" "cabana"
## [6,] "lareira" "mansão" "chamine" "piso" "apartamento"
## [7,] "jogar" "telhados" "poste" "pisos" "relógio"
## [8,] "laje" "sentar" "beliche" "condominio" "horta"
## [9,] "calçadas" "carros" "parque" "biblioteca" "sabespe"
## [10,] "quadrado" "armarios" "luz" "construções" "salão"
## [11,] "caderno" "borracha" "apoio" "pessoas" "brinquedo"
## [12,] "chaminés" "enfeite" "bomba" "quarto" "forte"
## [,6] [,7] [,8] [,9] [,10]
## [1,] "casa" "prédios" "casas" "escola" "escada"
## [2,] "lojas" "casinha" "igreja" "armário" "porta"
## [3,] "mesas" "escadas" "restaurante" "mercado" "quebrar"
## [4,] "pia" "cachorro" "garagem" "loja" "rua"
## [5,] "poço" "ponte" "brincar" "portal" "vaso"
## [6,] "lanchonete" "quadra" "fogueira" "lage" "estante"
## [7,] "forno" "bancos" "pirâmide" "chao" "mercearia"
## [8,] "espada" "hotel" "estacionamento" "escultura" "piscinas"
## [9,] "sapato" "banquinho" "controle" "lousa" "boneco"
## [10,] "relogio" "cinema" "arquibancada" "igrejas" "construção"
## [11,] "minecraft" "vidro" "quadro" "arma" "lápis"
## [12,] "canil" "estátua" "gato" "farmácias" "pilar"
## [,11] [,12]
## [1,] "piscina" "predio"
## [2,] "cadeira" "shopping"
## [3,] "carrinho" "carro"
## [4,] "hospital" "paredes"
## [5,] "padaria" "janela"
## [6,] "sofá" "galinheiro"
## [7,] "banheiro" "brinquedos"
## [8,] "fogao" "muralha"
## [9,] "prateleira" "empilhar"
## [10,] "cidade" "chácaras"
## [11,] "fábrica" "torres"
## [12,] "peso" "hospitais"
# Interactive LDAvis visualization (needs the LDAvis and servr packages)
lda_model$plot()
# Bigram (2-gram) version of the same pipeline
library(stringr)
library(text2vec)
prep_fun = tolower
tok_fun = word_tokenizer
it = itoken(
  dt_cx$resposta,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = dt_cx$ID,
  progressbar = FALSE)
vocab = create_vocabulary(
  it,
  stopwords = stopwords$words,
  ngram = c(2, 2)
) %>%
  prune_vocabulary(
    term_count_min = 1,
    doc_proportion_max = 0.8)
dim(vocab)
## [1] 744 3
vectorizer = vocab_vectorizer(vocab)
dtm = create_dtm(it, vectorizer)
dim(dtm)
## [1] 2911 744
# define tfidf model
tfidf = TfIdf$new()
# fit model to train data and transform train data with fitted model
dtm_tfidf = fit_transform(dtm, tfidf)
dim(dtm_tfidf )
## [1] 2911 744
lda_model = LDA$new(
  n_topics = 8,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
doc_topic_distr =
  lda_model$fit_transform(
    x = dtm_tfidf,
    n_iter = 1000,
    convergence_tol = 0.001,
    n_check_convergence = 25,
    progressbar = FALSE
  )
## INFO [2018-12-11 14:04:04] iter 25 loglikelihood = -23029.533
## INFO [2018-12-11 14:04:04] iter 50 loglikelihood = -22841.390
## INFO [2018-12-11 14:04:04] iter 75 loglikelihood = -22828.579
## INFO [2018-12-11 14:04:04] early stopping at 75 iteration
# Topic distribution for the second document
barplot(
  doc_topic_distr[2, ], xlab = "topic",
  ylab = "proportion", ylim = c(0, 1),
  names.arg = 1:ncol(doc_topic_distr)
)
lda_model$get_top_words(n = 12, topic_number = 1:4, lambda = 0.2)
## [,1] [,2] [,3]
## [1,] "fogao_lenha" "fazer_casa" "casinha_cachorro"
## [2,] "construir_casas" "casa_tijolos" "casa_cachorro"
## [3,] "fazer_muro" "cama_tijolo" "fazer_parede"
## [4,] "construir_escola" "construir_prédio" "vaso_planta"
## [5,] "fazer_banco" "carro_brinquedo" "casa_árvore"
## [6,] "forno_lenha" "fazer_chão" "fazer_churrasqueira"
## [7,] "construir_prédios" "construir_coisas" "posto_saude"
## [8,] "casinha_brinquedo" "mesa_tijolo" "posto_gasolina"
## [9,] "balanço_tijolo" "árvore_tijolos" "fazer_hotel"
## [10,] "construir_lanchonete" "fazer_barco" "forno_pizza"
## [11,] "lage_tijolo" "matar_inimigo" "comer_tijolo"
## [12,] "vaso_flor" "quebrar_janela" "fazer_lousa"
## [,4]
## [1,] "banco_sentar"
## [2,] "casa_boneca"
## [3,] "fazer_chaminé"
## [4,] "fazer_muros"
## [5,] "carro_tijolo"
## [6,] "estojo_tijolo"
## [7,] "caderno_tijolo"
## [8,] "fazer_carrinho"
## [9,] "fazer_fornalha"
## [10,] "churrasqueira_tijolos"
## [11,] "parede_tijolos"
## [12,] "chão_tijolo"
lda_model$plot()
library(readr)
# Read pre-trained word embeddings (GloVe, 300 dimensions)
nilc_wv <- read_delim(
  file = "glove_s300.txt",
  delim = " ", quote = "",
  skip = 1,
  col_names = FALSE,
  progress = TRUE)
names(nilc_wv)[1] <- "words"
names(nilc_wv)[2:301] <- paste("V", 1:300, sep = "")
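# A minimal sketch (assumes both words occur in nilc_wv): cosine similarity
# between two embedding vectors, the kind of proximity the clustering below
# captures via euclidean distances on the same vectors
cos_sim <- function(a, b) sum(a * b) / sqrt(sum(a^2) * sum(b^2))
v1 <- unlist(nilc_wv[nilc_wv$words == "casa",   -1])
v2 <- unlist(nilc_wv[nilc_wv$words == "prédio", -1])
cos_sim(v1, v2)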
# Build the vocabulary from the tokenized responses
vocab <- dt_cx2 %>% count(words, sort = TRUE)
# Attach the embedding vectors
vocab <- left_join(vocab, nilc_wv)
# Logical vector flagging words found in the embedding vocabulary
select <- vocab %>% select(V1) %>% is.na %>% as.logical()
select <- !select
table(select)
# Hierarchical clustering of the word vectors into 200 clusters
d <- dist(vocab[select, 3:302], method = "euclidean")
cluster <- hclust(d, method = "ward.D2")
plot(cluster)
grp200 <- cutree(cluster, k = 200)
table(grp200)
# Attach cluster membership to the vocabulary
vocab <- vocab %>% filter(select) %>% select(1:2) %>%
  bind_cols(as.data.frame(grp200))
# Bring the clusters into the tokenized data
dt_cx2 <- dt_cx2[, 1:5]
dt_cx2 <- dt_cx2 %>% left_join(vocab)
# Turn cluster membership into dummy variables
library(psych)   # for dummy.code()
dt_cx2 <- bind_cols(dt_cx2, as.data.frame(dummy.code(dt_cx2$grp200)))
names(dt_cx2)
# Aggregate the dummies to one row per response
dt_cx3 <- dt_cx2 %>% select(c(1, 2, 4, 8:207)) %>%
  group_by(ID, resp_num, pre_pos) %>%
  summarise_all(.funs = sum, na.rm = TRUE)
names(dt_cx3)
# Hierarchical clustering of the responses into 60 clusters (binary distance)
d <- dist(dt_cx3[, 4:203], method = "binary")
cluster <- hclust(d, method = "ward.D2")
plot(cluster)
grp60 <- cutree(cluster, k = 60)
dt_cx3 <- bind_cols(dt_cx3, as.data.frame(grp60))
names(dt_cx3)
# Bring the response clusters back into the main data
dt_cx <- left_join(dt_cx[, 1:5], dt_cx3[, c(1:3, 204)])
library(Rtsne)
library(ggrepel)
library(ggthemes)
library(RColorBrewer)
# install.packages("artyfarty")
library(artyfarty)
# Re-read the word embeddings (vocab will be rebuilt below)
nilc_wv <- read_delim(
  file = "glove_s300.txt",
  delim = " ", quote = "",
  skip = 1,
  col_names = FALSE,
  progress = TRUE)
names(nilc_wv)[1] <- "words"
names(nilc_wv)[2:301] <- paste("V", 1:300, sep = "")
# Keep words, n, grp200 and re-attach the embedding vectors
vocab <- vocab[, 1:3]
vocab <- left_join(vocab, nilc_wv)
select <- vocab %>% select(V1) %>% is.na %>% as.logical()
select <- !select
table(select)
# Project the word vectors to two dimensions with t-SNE
tsne_out <- Rtsne(vocab[select, 4:303], perplexity = 18)
vocab <- cbind(vocab[select, 1:3], as.data.frame(tsne_out$Y))
ggplot(data = vocab,
       mapping = aes(y = V1, x = V2, color = grp200)) +
  geom_point() +
  geom_text_repel(aes(label = words), size = 2, vjust = -1.2) +
  theme_minimal() +
  scale_color_gradientn(colours = brewer.pal(12, "Paired"))
# k-means clustering in the 2-D t-SNE space
cluster <- kmeans(vocab[, 4:5], 200, nstart = 20)
dim(vocab)
# Attach the t-SNE-based clusters to the vocabulary
# (vocab is already restricted to words found in the embeddings)
vocab <- vocab %>% select(1:2) %>% bind_cols(tsne200 = cluster$cluster)
names(dt_cx2)
# Bring the t-SNE clusters into the tokenized data
dt_cx2b <- dt_cx2[, c(1:7)] %>% left_join(vocab, by = "words")
# Turn cluster membership into dummy variables
dt_cx2b <- bind_cols(dt_cx2b, as.data.frame(dummy.code(dt_cx2b$tsne200)))
names(dt_cx2b)
# Aggregate the dummies to one row per response
dt_cx3b <- dt_cx2b %>% select(c(1, 2, 4, 10:209)) %>%
  group_by(ID, resp_num, pre_pos) %>%
  summarise_all(.funs = sum, na.rm = TRUE)
names(dt_cx3b)
# Hierarchical clustering of the responses into 60 clusters (binary distance)
d <- dist(dt_cx3b[, 4:203], method = "binary")
cluster <- hclust(d, method = "ward.D2")
plot(cluster)
grp60b <- cutree(cluster, k = 60)
names(dt_cx3b)
dt_cx3b <- bind_cols(dt_cx3b[, 1:3], as.data.frame(grp60b))
# Join on the full data (which still carries grp60) so both cluster
# solutions can be cross-tabulated
dt_cx <- left_join(dt_cx, dt_cx3b)
table(dt_cx$grp60b)
table(dt_cx$grp60, dt_cx$grp60b)