# NOTE: the workspace is intentionally not cleared here. `rm(list = ls())`
# only removes objects (attached packages and options are left as they were),
# so it does not provide a clean session, and it silently deletes the caller's
# objects when this script is source()d. Restart R to run from a clean state.
library(pacman)
#p_load(tidyverse,tidytext)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
# Data ----
# Toy corpus: six short sentences, one document per vector element.
corpus <- c(
  "The sky is blue and beautiful.",
  "Love this blue and beautiful sky!",
  "The quick brown fox jumps over the lazy dog.",
  "The brown fox is quick and the blue dog is lazy!",
  "The sky is very blue and the sky is very beautiful today",
  "The dog is lazy but the brown fox is quick!"
)

# Topic label of each document, in the same order as `corpus`.
labels <- c(
  "weather", "weather",
  "animals", "animals",
  "weather", "animals"
)
# Corpus tibble ----
# Pair every document with its topic label: one row per document.
corpus_df <- tibble(
  Document = corpus,
  Category = labels
)
corpus_df
## # A tibble: 6 x 2
##   Document                                                 Category
##   <chr>                                                    <chr>   
## 1 The sky is blue and beautiful.                           weather 
## 2 Love this blue and beautiful sky!                        weather 
## 3 The quick brown fox jumps over the lazy dog.             animals 
## 4 The brown fox is quick and the blue dog is lazy!         animals 
## 5 The sky is very blue and the sky is very beautiful today weather 
## 6 The dog is lazy but the brown fox is quick!              animals
# Add a sequential integer document id; the tokenising and reshaping steps
# further down use it as the per-document key.
# row_number() is used instead of 1:n() because 1:n() evaluates to c(1, 0)
# on a zero-row tibble, whereas row_number() yields an empty integer vector.
corpus_df <- corpus_df %>%
  mutate(id = row_number())
corpus_df
## # A tibble: 6 x 3
##   Document                                                 Category    id
##   <chr>                                                    <chr>    <int>
## 1 The sky is blue and beautiful.                           weather      1
## 2 Love this blue and beautiful sky!                        weather      2
## 3 The quick brown fox jumps over the lazy dog.             animals      3
## 4 The brown fox is quick and the blue dog is lazy!         animals      4
## 5 The sky is very blue and the sky is very beautiful today weather      5
## 6 The dog is lazy but the brown fox is quick!              animals      6
# Split documents into words ----
# Split every document into one word per row, then count how often each word
# occurs in each document. The result has columns `id`, `word` and `n`, with
# the most frequent (id, word) pairs first.
# Counting by `id` and `word` directly yields the same ungrouped table as
# group_by(id) + count(word) + ungroup(), without a grouping to undo.
bag_of_words_raw <- corpus_df %>%
  unnest_tokens(word, Document) %>%
  count(id, word, sort = TRUE)

dim(bag_of_words_raw)
bag_of_words_raw
## [1] 45  3
## # A tibble: 45 x 3
##       id word      n
##    <int> <chr> <int>
##  1     3 the       2
##  2     4 is        2
##  3     4 the       2
##  4     5 is        2
##  5     5 sky       2
##  6     5 the       2
##  7     5 very      2
##  8     6 is        2
##  9     6 the       2
## 10     1 and       1
## # ... with 35 more rows
# Remove stop words ----
# Load the `stop_words` lexicon table (columns `word` and `lexicon`) and
# look at its dimensions and first rows.
data(list = "stop_words")
dim(stop_words)
head(stop_words, n = 6L)
## [1] 1149    2
## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART
# Keep only the words that do NOT appear in the stop-word lexicon, then order
# the rows by document id.
# The join key is spelled out so the result does not depend on which column
# names the two tables happen to share, and so dplyr does not emit its
# "Joining, by = ..." message.
bag_of_words_tidy <- bag_of_words_raw %>%
  anti_join(stop_words, by = "word") %>%
  arrange(id)

dim(bag_of_words_tidy)
bag_of_words_tidy
## [1] 27  3
## # A tibble: 27 x 3
##       id word          n
##    <int> <chr>     <int>
##  1     1 beautiful     1
##  2     1 blue          1
##  3     1 sky           1
##  4     2 beautiful     1
##  5     2 blue          1
##  6     2 love          1
##  7     2 sky           1
##  8     3 brown         1
##  9     3 dog           1
## 10     3 fox           1
## # ... with 17 more rows
# Per-row word counts as a plain integer vector, in row order.
bag_of_words_tidy[["n"]]
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1
# Print every row instead of the default truncated tibble view.
print(bag_of_words_tidy, n = Inf)
## # A tibble: 27 x 3
##       id word          n
##    <int> <chr>     <int>
##  1     1 beautiful     1
##  2     1 blue          1
##  3     1 sky           1
##  4     2 beautiful     1
##  5     2 blue          1
##  6     2 love          1
##  7     2 sky           1
##  8     3 brown         1
##  9     3 dog           1
## 10     3 fox           1
## 11     3 jumps         1
## 12     3 lazy          1
## 13     3 quick         1
## 14     4 blue          1
## 15     4 brown         1
## 16     4 dog           1
## 17     4 fox           1
## 18     4 lazy          1
## 19     4 quick         1
## 20     5 sky           2
## 21     5 beautiful     1
## 22     5 blue          1
## 23     6 brown         1
## 24     6 dog           1
## 25     6 fox           1
## 26     6 lazy          1
## 27     6 quick         1
# Vocabulary: each remaining word once, in order of first appearance.
bag_of_words_tidy %>%
  distinct(word) %>%
  pull(word)
##  [1] "beautiful" "blue"      "sky"       "love"      "brown"     "dog"      
##  [7] "fox"       "jumps"     "lazy"      "quick"
# Document-term matrix ----
# Reshape the long (id, word, n) table into a wide document-term table:
# one row per document id, one column per word, cell = count of that word.
# pivot_wider() replaces the superseded spread():
#   * names_sort = TRUE keeps the word columns in alphabetical order, which
#     is the order spread() produced;
#   * values_fill = 0L puts a zero where a word does not occur in a document.
# The count columns stay integer (spread(fill = 0) promoted them to double).
bag_of_words_dtm <- bag_of_words_tidy %>%
  pivot_wider(
    names_from = word,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  )
bag_of_words_dtm
## # A tibble: 6 x 11
##      id beautiful  blue brown   dog   fox jumps  lazy  love quick   sky
##   <int>     <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1     1         1     1     0     0     0     0     0     0     0     1
## 2     2         1     1     0     0     0     0     0     1     0     1
## 3     3         0     0     1     1     1     1     1     0     1     0
## 4     4         0     1     1     1     1     0     1     0     1     0
## 5     5         1     1     0     0     0     0     0     0     0     2
## 6     6         0     0     1     1     1     0     1     0     1     0
# Reference: https://www.icode9.com/content-4-1338410.html