Data

  setwd("~/Mestrado/Ciencias de Dados em Psicometria/Tijolos")

  usos <- read_excel("usos.xlsx", sheet = "Usos") 
  
  # Activity 1: cardboard box
  # Activity 2: bricks

Describing the data

# Select activity 2 (bricks) and keep non-missing responses    
  
 dt_cx <- usos %>% filter(ativi == 2 & !is.na(resposta)) %>% select(1:5)

# Compute fluency (responses per subject) and examine its distribution
    dt_cx %>% group_by(ID, pre_pos) %>% count() %>% 
      arrange(ID, pre_pos) %>% filter(n < 25) %>%
      ggplot(aes(x = n)) + 
      geom_histogram(binwidth = 2, color = "white", fill = "gray") +
      scale_x_continuous(breaks = 1:25, limits = c(1, 25))

# Correlation between pre and post fluency    
    dt_cx %>% group_by(ID, pre_pos) %>% count() %>% 
        arrange(ID, pre_pos) %>% filter(n < 25) %>%
        spread(key = pre_pos, value = n) %>%
        ggplot(aes(x = `1`, y = `2`)) + geom_point()
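
To put a number on this scatterplot, the same reshaping feeds straight into cor() (a quick sketch; columns `1` and `2` are pre and post fluency, as in the plot above):

    dt_cx %>% group_by(ID, pre_pos) %>% count() %>% 
        filter(n < 25) %>%
        spread(key = pre_pos, value = n) %>%
        with(cor(`1`, `2`, use = "pairwise.complete.obs"))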

Tokenize

# Tokenize words 
  dt_cx2 <- dt_cx %>% unnest_tokens(words, resposta)

length(unique(dt_cx2$words))
## [1] 931
# Read the stopword list

  stopwords <- read_csv(
    file = "http://www.labape.com.br/rprimi/ds/stopwords.txt", 
    col_names = FALSE)
  
  names(stopwords) <- "words"

Word frequencies

    dt_cx2 %>% count(words, sort = TRUE)
## # A tibble: 931 x 2
##    words             n
##    <chr>         <int>
##  1 de              368
##  2 fazer           250
##  3 casa            223
##  4 uma             208
##  5 um              117
##  6 tijolo          104
##  7 para             99
##  8 construir        96
##  9 churrasqueira    79
## 10 mesa             75
## # ... with 921 more rows
# Most frequent words overall   
    dt_cx2 %>%
        count(words, sort = TRUE) %>%
        filter(n > 50) %>%
        mutate(words = reorder(words, n)) %>%
        ggplot(aes(words, n)) +
        geom_col() +
        xlab(NULL) +
        coord_flip()

# Remove stopwords
     dt_cx2 %>%
        count(words, sort = TRUE) %>%
        anti_join(stopwords) %>%
        filter(n > 15)  %>%
        mutate(word = reorder(words, n)) %>%
        ggplot(aes(word, n)) +
        geom_col() +
        xlab(NULL) +
        coord_flip()

# Word cloud   
   library(wordcloud)

    dt_cx2 %>%
        anti_join(stopwords) %>%
        count(words) %>%
        with(wordcloud(words, n, 
          colors = brewer.pal(9, "Set1"),  # "Set1" has at most 9 colors
          max.words = 100)) 

    # Add two more words to the stopword list
    stopwords[221, "words"] <- "pra"
    stopwords[222, "words"] <- "dá"

Term’s inverse document frequency (idf)

A term's inverse document frequency (idf) decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents. This can be combined with term frequency to calculate a term's tf-idf (the two quantities multiplied together), the frequency of a term adjusted for how rarely it is used (p. 31).

The idea of tf-idf is to find the important words for the content of each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection or corpus of documents (p. 37).

\[idf_{term} = \ln \frac{n_{documents}}{n_{documents\ containing\ term}}\]
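
As a quick numeric illustration of the formula (hypothetical document frequencies; 253 matches the n_docs computed below): a word found in a single document gets ln(253/1) ≈ 5.53, while a word found in every document gets ln(253/253) = 0.

  n_docs <- 253                       # documents in the collection
  n_docs_with_term <- c(1, 10, 253)   # hypothetical document frequencies
  log(n_docs / n_docs_with_term)      # ≈ 5.53, 3.23, 0.00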

# Count word occurrences per subject (document)
 subj_words <- dt_cx2 %>%
  count(ID, words, sort = TRUE) %>%
  ungroup()

# Compute each subject's total word count
  total_words <- subj_words %>% 
    group_by(ID) %>% 
    summarize(total = sum(n))


# Join the totals back into subj_words
  subj_words <- left_join(subj_words, total_words, by = "ID")

# Term frequency: word count over the subject's total  
  subj_words <- subj_words %>% 
    mutate(term_freq = n/total)

 # Number of documents (subjects) in the collection  
  subj_words <- subj_words %>%
    mutate(n_docs = n_distinct(ID))
  
 # Count in how many documents each word appears (document frequency)
  
   subj_words <- subj_words %>%
    group_by(words) %>%
    mutate(word_doc_freq = n_distinct(ID) / n_docs,
      inv_word_doc_freq = n_docs / n_distinct(ID),
      ln_inv_word_doc_freq = log(n_docs / n_distinct(ID)))
   
   subj_words <-  subj_words %>% filter(words !="0") 

Adding tf-idf automatically

  subj_words <- subj_words %>%
    bind_tf_idf(term = words, document = ID, n=n)
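
If the manual steps above are right, ln_inv_word_doc_freq should agree with the idf column that bind_tf_idf creates; a quick sanity check (assuming the filter on "0" above did not drop any whole document):

  subj_words %>% ungroup() %>%
    summarise(max_abs_diff = max(abs(ln_inv_word_doc_freq - idf)))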

# Spread into a subject-by-word matrix of tf-idf weights
dtm <- subj_words %>% select(c(1, 2, 12)) %>%
  spread(key = words, value = tf_idf)
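
# Caveat: spread() leaves NA where a subject never used a word; these are
# true zeros for most downstream uses (or pass fill = 0 to spread())
dtm[is.na(dtm)] <- 0
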
 subj_words %>%
    arrange(desc(tf_idf))
## # A tibble: 3,907 x 12
## # Groups:   words [930]
##           ID words        n total term_freq n_docs word_doc_freq
##        <dbl> <chr>    <int> <int>     <dbl>  <int>         <dbl>
##  1 107537444 cenários     1     2       0.5    253       0.00395
##  2 108004784 boné         1     2       0.5    253       0.00395
##  3 109580516 reta         1     2       0.5    253       0.00395
##  4 106299770 predio       1     1       1      253       0.0751 
##  5 105537171 carinho      1     2       0.5    253       0.00791
##  6 109094091 maquete      1     2       0.5    253       0.00791
##  7 108644067 carrinho     1     1       1      253       0.0949 
##  8 104811247 quadro       1     2       0.5    253       0.0119 
##  9 109168832 chaminés     1     2       0.5    253       0.0158 
## 10 110218509 espada       1     2       0.5    253       0.0158 
## # ... with 3,897 more rows, and 5 more variables: inv_word_doc_freq <dbl>,
## #   ln_inv_word_doc_freq <dbl>, tf <dbl>, idf <dbl>, tf_idf <dbl>
  subj_words %>%
    ggplot(aes(x = tf, y = tf_idf, color = word_doc_freq)) + 
    geom_point(alpha = 1/4) +
    scale_colour_gradientn(colours = brewer.pal(7, "Paired")) +
    scale_y_continuous(breaks = seq(0, 2.5, .5), limits = c(0, 2.5))

Latent Semantic Analysis via topic modeling (Latent Dirichlet Allocation, LDA)
  # install.packages('servr') 
  library(stringr)
  library(text2vec)

  
  prep_fun <- tolower
  tok_fun <- word_tokenizer

  it = itoken(
      dt_cx$resposta, 
      ids = dt_cx$ID, 
      preprocessor = prep_fun, 
      tokenizer = tok_fun, 
      progressbar = FALSE)
  
  vocab = 
    create_vocabulary(it,
      stopwords = stopwords$words) %>%
    
    prune_vocabulary(
      term_count_min = 1, 
      doc_proportion_max = 0.8
      )

  vectorizer = vocab_vectorizer(vocab)
  dtm = create_dtm(it, vectorizer)
  
  dim(dtm)
## [1] 2911  892
  # define tfidf model
  tfidf = TfIdf$new()
  # fit model to train data and transform train data with fitted model
  dtm_tfidf = fit_transform(dtm, tfidf)

 lda_model = LDA$new(
   n_topics = 12, 
   doc_topic_prior = 0.1,
   topic_word_prior = 0.01
   )

  doc_topic_distr = 
    lda_model$fit_transform(
    x = dtm_tfidf, 
    n_iter = 1000, 
    convergence_tol = 0.001, 
    n_check_convergence = 25, 
    progressbar = FALSE
    )
## INFO [2018-12-11 14:04:03] iter 25 loglikelihood = -48570.240
## INFO [2018-12-11 14:04:04] iter 50 loglikelihood = -46577.067
## INFO [2018-12-11 14:04:04] iter 75 loglikelihood = -46333.269
## INFO [2018-12-11 14:04:04] iter 100 loglikelihood = -46231.245
## INFO [2018-12-11 14:04:04] iter 125 loglikelihood = -46184.585
## INFO [2018-12-11 14:04:04] iter 150 loglikelihood = -46196.731
## INFO [2018-12-11 14:04:04] early stopping at 150 iteration
  lda_model$get_top_words(n = 12, 
    topic_number = c(1:12), lambda = 0.2)
##       [,1]       [,2]            [,3]       [,4]          [,5]         
##  [1,] "parede"   "churrasqueira" "mesa"     "prédio"      "muros"      
##  [2,] "muro"     "banco"         "predios"  "telhado"     "cama"       
##  [3,] "teto"     "castelo"       "chão"     "chaminé"     "escolas"    
##  [4,] "mercados" "celular"       "cadeiras" "fogão"       "calçada"    
##  [5,] "pontes"   "janelas"       "avião"    "lixo"        "cabana"     
##  [6,] "lareira"  "mansão"        "chamine"  "piso"        "apartamento"
##  [7,] "jogar"    "telhados"      "poste"    "pisos"       "relógio"    
##  [8,] "laje"     "sentar"        "beliche"  "condominio"  "horta"      
##  [9,] "calçadas" "carros"        "parque"   "biblioteca"  "sabespe"    
## [10,] "quadrado" "armarios"      "luz"      "construções" "salão"      
## [11,] "caderno"  "borracha"      "apoio"    "pessoas"     "brinquedo"  
## [12,] "chaminés" "enfeite"       "bomba"    "quarto"      "forte"      
##       [,6]         [,7]        [,8]             [,9]        [,10]       
##  [1,] "casa"       "prédios"   "casas"          "escola"    "escada"    
##  [2,] "lojas"      "casinha"   "igreja"         "armário"   "porta"     
##  [3,] "mesas"      "escadas"   "restaurante"    "mercado"   "quebrar"   
##  [4,] "pia"        "cachorro"  "garagem"        "loja"      "rua"       
##  [5,] "poço"       "ponte"     "brincar"        "portal"    "vaso"      
##  [6,] "lanchonete" "quadra"    "fogueira"       "lage"      "estante"   
##  [7,] "forno"      "bancos"    "pirâmide"       "chao"      "mercearia" 
##  [8,] "espada"     "hotel"     "estacionamento" "escultura" "piscinas"  
##  [9,] "sapato"     "banquinho" "controle"       "lousa"     "boneco"    
## [10,] "relogio"    "cinema"    "arquibancada"   "igrejas"   "construção"
## [11,] "minecraft"  "vidro"     "quadro"         "arma"      "lápis"     
## [12,] "canil"      "estátua"   "gato"           "farmácias" "pilar"     
##       [,11]        [,12]       
##  [1,] "piscina"    "predio"    
##  [2,] "cadeira"    "shopping"  
##  [3,] "carrinho"   "carro"     
##  [4,] "hospital"   "paredes"   
##  [5,] "padaria"    "janela"    
##  [6,] "sofá"       "galinheiro"
##  [7,] "banheiro"   "brinquedos"
##  [8,] "fogao"      "muralha"   
##  [9,] "prateleira" "empilhar"  
## [10,] "cidade"     "chácaras"  
## [11,] "fábrica"    "torres"    
## [12,] "peso"       "hospitais"
  lda_model$plot()
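
Each row of doc_topic_distr is a single response. For a per-subject topic profile, rows can be aggregated by the ids passed to itoken (a minimal sketch, assuming those ids survive as rownames of the returned matrix):

  topic_by_subj <- rowsum(doc_topic_distr, group = rownames(doc_topic_distr))
  topic_by_subj <- topic_by_subj / rowSums(topic_by_subj)  # renormalize rows
  head(round(topic_by_subj, 2))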
LDA using bigrams
  library(stringr)
  library(text2vec)
  
  prep_fun = tolower
  tok_fun = word_tokenizer

  it = itoken(dt_cx$resposta, 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = dt_cx$ID, 
             progressbar = FALSE)
  
  vocab = create_vocabulary(
    it, 
    stopwords = stopwords$words,
    ngram = c(2, 2)
    ) %>%
    prune_vocabulary(
      term_count_min = 1, 
      doc_proportion_max = 0.8)
  
  dim(vocab)
## [1] 744   3
  vectorizer = vocab_vectorizer(vocab)
  dtm = create_dtm(it, vectorizer)
  
  dim(dtm)
## [1] 2911  744
  # define tfidf model
  tfidf = TfIdf$new()
  # fit model to train data and transform train data with fitted model
  dtm_tfidf = fit_transform(dtm, tfidf)

  dim(dtm_tfidf )
## [1] 2911  744
 lda_model = LDA$new(
   n_topics = 8, 
   doc_topic_prior = 0.1,
   topic_word_prior = 0.01
   )

  doc_topic_distr = 
    lda_model$fit_transform(
    x = dtm_tfidf, n_iter = 1000, 
    convergence_tol = 0.001, 
    n_check_convergence = 25, 
    progressbar = FALSE
    )
## INFO [2018-12-11 14:04:04] iter 25 loglikelihood = -23029.533
## INFO [2018-12-11 14:04:04] iter 50 loglikelihood = -22841.390
## INFO [2018-12-11 14:04:04] iter 75 loglikelihood = -22828.579
## INFO [2018-12-11 14:04:04] early stopping at 75 iteration
  barplot(
    doc_topic_distr[2, ], xlab = "topic", 
    ylab = "proportion", ylim = c(0, 1), 
    names.arg = 1:ncol(doc_topic_distr)
    )

  lda_model$get_top_words(n = 12, 
    topic_number = c(1:4), lambda = 0.2)
##       [,1]                   [,2]               [,3]                 
##  [1,] "fogao_lenha"          "fazer_casa"       "casinha_cachorro"   
##  [2,] "construir_casas"      "casa_tijolos"     "casa_cachorro"      
##  [3,] "fazer_muro"           "cama_tijolo"      "fazer_parede"       
##  [4,] "construir_escola"     "construir_prédio" "vaso_planta"        
##  [5,] "fazer_banco"          "carro_brinquedo"  "casa_árvore"        
##  [6,] "forno_lenha"          "fazer_chão"       "fazer_churrasqueira"
##  [7,] "construir_prédios"    "construir_coisas" "posto_saude"        
##  [8,] "casinha_brinquedo"    "mesa_tijolo"      "posto_gasolina"     
##  [9,] "balanço_tijolo"       "árvore_tijolos"   "fazer_hotel"        
## [10,] "construir_lanchonete" "fazer_barco"      "forno_pizza"        
## [11,] "lage_tijolo"          "matar_inimigo"    "comer_tijolo"       
## [12,] "vaso_flor"            "quebrar_janela"   "fazer_lousa"        
##       [,4]                   
##  [1,] "banco_sentar"         
##  [2,] "casa_boneca"          
##  [3,] "fazer_chaminé"        
##  [4,] "fazer_muros"          
##  [5,] "carro_tijolo"         
##  [6,] "estojo_tijolo"        
##  [7,] "caderno_tijolo"       
##  [8,] "fazer_carrinho"       
##  [9,] "fazer_fornalha"       
## [10,] "churrasqueira_tijolos"
## [11,] "parede_tijolos"       
## [12,] "chão_tijolo"
  lda_model$plot()

Classifying ideas using word embeddings

    library(readr)
  
 # Read word embeddings (GloVe, 300 dimensions)
  nilc_wv <- read_delim(
    file = "glove_s300.txt", 
    delim = " ",quote="", 
    skip = 1,  
    col_names = FALSE, 
    progress = TRUE)
  
  names(nilc_wv)[1] <- "words"
  names(nilc_wv)[2:301] <- paste("V", 1:300, sep = "")
  
  
  # Build the vocabulary (word counts)
  vocab <- dt_cx2 %>% count(words, sort = TRUE)
  
  # Attach an embedding vector to each word
  vocab <- left_join(vocab, nilc_wv, by = "words")
  
  # Logical vector: TRUE for words found in the embedding vocabulary
  select <- vocab %>% select(V1) %>% is.na %>% as.logical()
  select <- !select
  table(select)
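  
  # Unmatched words are worth a look; in free-response data they are
  # usually misspellings or rare inflections
  vocab$words[!select]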
  
  # Hierarchical clustering of the embedding vectors, cut into 200 clusters
  d   <- dist(vocab[select , 3:302], method="euclidean") 
  cluster  <- hclust(d, method="ward.D2")
  plot(cluster)
  grp200 <- cutree(cluster, k = 200)
  table(grp200)
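  
  # A quick way to eyeball the solution: list the words in one cluster
  # (cluster 1 is arbitrary here)
  vocab$words[select][grp200 == 1]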
  
  # Append cluster membership to the matched words
  vocab <- vocab %>% filter(select) %>% select(1:2) %>% bind_cols(as.data.frame(grp200))
  
  # Carry the clusters into the tokenized data
  dt_cx2 <- dt_cx2[, 1:5]
  dt_cx2 <- dt_cx2 %>% left_join(vocab)
  
  # Convert cluster membership into dummy variables (psych::dummy.code)
  dt_cx2 <- bind_cols(dt_cx2, as.data.frame(dummy.code(dt_cx2$grp200)))
  
  names(dt_cx2)
  
  # Aggregate the dummies into one row per response
  dt_cx3 <-  dt_cx2 %>% select(c(1, 2, 4, 8:207)) %>%
    group_by(ID, resp_num, pre_pos) %>%
    summarise_all(.funs = sum, na.rm=TRUE)
  
   names(dt_cx3)
  
  # Cluster the responses into 60 groups
  d   <- dist( dt_cx3[, 4:203], method="binary") 
  cluster  <- hclust(d, method="ward.D2")
  plot(cluster)
  grp60 <- cutree(cluster, k = 60)
   
  dt_cx3 <- bind_cols(dt_cx3, as.data.frame(grp60))
  names(dt_cx3) 
  # Bring grp60 back into the response-level data
  dt_cx <- left_join(dt_cx[, 1:5], dt_cx3[, c(1:3, 204)])

Visualizing clusters with t-SNE

  library(Rtsne)
  library(ggrepel)
  library(ggthemes)
  library(RColorBrewer)
  library(artyfarty)
install.packages("artyfarty")
 
 # Read the word embeddings again (same file as above)
  nilc_wv <- read_delim(
    file = "glove_s300.txt", 
    delim = " ",quote="", 
    skip = 1,  
    col_names = FALSE, 
    progress = TRUE)
  
  names(nilc_wv)[1] <- "words"
  names(nilc_wv)[2:301] <- paste("V", 1:300, sep = "")

  # Re-attach embedding vectors to the clustered vocabulary
  vocab <- vocab[ , 1:3]
  vocab <- left_join(vocab, nilc_wv, by = "words")
  select <- vocab %>% select(V1) %>% is.na %>% as.logical()
  select <- !select 
  table(select)
  
  
 # Project the 300-dimensional vectors onto 2D with t-SNE
 tsne_out <- Rtsne(vocab[select, 4:303], perplexity = 18) 
 vocab <- cbind(vocab[select, 1:3], as.data.frame(tsne_out$Y))

  ggplot(data = vocab,  
         mapping = aes(
           y = V1,
           x = V2,
           color = grp200)
         ) +
    geom_point() +
    geom_text_repel(
      aes(label = words), 
      size = 2, vjust = -1.2
      ) +
    theme_minimal() +
    scale_color_gradientn(colours = brewer.pal(12, "Paired"))
  
  
  # Cluster in the 2D t-SNE space with k-means
  
  cluster <- kmeans(vocab[, 4:5], centers = 200, nstart = 20)
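  
  # Rough sanity check on k = 200 (mirroring the hierarchical cut above):
  # total within-cluster SS over a few values of k (a sketch; slow to run)
  ks  <- c(50, 100, 150, 200)
  wss <- sapply(ks, function(k)
    kmeans(vocab[, 4:5], centers = k, nstart = 20)$tot.withinss)
  plot(ks, wss, type = "b", xlab = "k", ylab = "total within-cluster SS")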
 
  # Append t-SNE cluster membership to the vocabulary
  vocab <- vocab %>% filter(select) %>% 
    select(1:2) %>% bind_cols(tsne200 = cluster$cluster)
  
  names(dt_cx2)
  # Carry the t-SNE clusters into the tokenized data
  dt_cx2b <- dt_cx2[, c(1:7)] %>% left_join(vocab, by = "words")
  
  # Convert t-SNE cluster membership into dummy variables
  dt_cx2b <- bind_cols(dt_cx2b, as.data.frame(dummy.code(dt_cx2b$tsne200)))
  
  names(dt_cx2b)
  
  # Aggregate the dummies into one row per response
  dt_cx3b <-  dt_cx2b %>% select(c(1, 2, 4, 10:209)) %>%
    group_by(ID, resp_num, pre_pos) %>%
    summarise_all(.funs = sum, na.rm=TRUE)
  
   names(dt_cx3b)
  
    # Cluster the responses into 60 groups
  d   <- dist( dt_cx3b[, 4:203], method="binary") 
  cluster  <- hclust(d, method="ward.D2")
  plot(cluster)
  grp60b <- cutree(cluster, k = 60)
  
  names(dt_cx3b) 
  dt_cx3b <- bind_cols(dt_cx3b[, 1:3], as.data.frame(grp60b))
   
  dt_cx <- left_join(dt_cx, dt_cx3b)  # keep grp60 from the previous section
  
  table(dt_cx$grp60b)
  table(dt_cx$grp60, dt_cx$grp60b)
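  
  # A single agreement coefficient is easier to read than the 60 x 60 table;
  # the adjusted Rand index from mclust is one option (assumes mclust is installed)
  library(mclust)
  adjustedRandIndex(dt_cx$grp60, dt_cx$grp60b)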