# NOTE: the script no longer starts with rm(list = ls()). Clearing the global
# environment destroys the caller's objects when this file is sourced, and it
# does not reset loaded packages or options anyway. Every object used below is
# created by this script, so restart R if a truly clean session is needed.
library(pacman)
#p_load(tidyverse,tidytext)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
# Data: example documents and their category labels ----
# Six short example documents; `labels` holds the category of the document at
# the same position in `corpus`.
corpus <- c(
  "The sky is blue and beautiful.",
  "Love this blue and beautiful sky!",
  "The quick brown fox jumps over the lazy dog.",
  "The brown fox is quick and the blue dog is lazy!",
  "The sky is very blue and the sky is very beautiful today",
  "The dog is lazy but the brown fox is quick!"
)
labels <- c(
  "weather", "weather",
  "animals", "animals",
  "weather", "animals"
)
# Build the corpus data frame ----
# Pair each document with its category in a two-column tibble, then show it.
corpus_df <- tibble::tibble(
  Document = corpus,
  Category = labels
)
corpus_df
## # A tibble: 6 x 2
## Document Category
## <chr> <chr>
## 1 The sky is blue and beautiful. weather
## 2 Love this blue and beautiful sky! weather
## 3 The quick brown fox jumps over the lazy dog. animals
## 4 The brown fox is quick and the blue dog is lazy! animals
## 5 The sky is very blue and the sky is very beautiful today weather
## 6 The dog is lazy but the brown fox is quick! animals
# Give every document a sequential integer id so tokens can be traced back to
# the row they came from. row_number() is used instead of 1:n() because 1:n()
# evaluates to c(1, 0) when the table is empty, which makes mutate() fail.
corpus_df <- corpus_df %>%
  mutate(id = row_number())
corpus_df
## # A tibble: 6 x 3
## Document Category id
## <chr> <chr> <int>
## 1 The sky is blue and beautiful. weather 1
## 2 Love this blue and beautiful sky! weather 2
## 3 The quick brown fox jumps over the lazy dog. animals 3
## 4 The brown fox is quick and the blue dog is lazy! animals 4
## 5 The sky is very blue and the sky is very beautiful today weather 5
## 6 The dog is lazy but the brown fox is quick! animals 6
# Split documents into words and count them per document ----
# Break every document into one-word tokens and count how often each word
# occurs in each document. The result has columns id, word, n, with the most
# frequent (id, word) pairs first.
# Counting on (id, word) directly replaces the group_by(id) / ungroup() pair,
# and sort = TRUE is spelled out because `T` is an ordinary variable that can
# be reassigned.
bag_of_words_raw <- corpus_df %>%
  unnest_tokens(word, Document) %>%
  count(id, word, sort = TRUE)
dim(bag_of_words_raw)
bag_of_words_raw
## [1] 45 3
## # A tibble: 45 x 3
## id word n
## <int> <chr> <int>
## 1 3 the 2
## 2 4 is 2
## 3 4 the 2
## 4 5 is 2
## 5 5 sky 2
## 6 5 the 2
## 7 5 very 2
## 8 6 is 2
## 9 6 the 2
## 10 1 and 1
## # ... with 35 more rows
# Remove stop words ----
# Load the stop_words lexicon table into the workspace, then report its
# dimensions and preview the first rows.
data(stop_words)
dim(stop_words)
head(stop_words)
## [1] 1149 2
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
# Drop every token that appears in the stop-word lexicon, then order the
# remaining rows by document id.
# The join key is named explicitly so the join cannot silently start matching
# on any other column the two tables might come to share, and so dplyr has no
# need to emit its "Joining, by = ..." message.
bag_of_words_tidy <- bag_of_words_raw %>%
  anti_join(stop_words, by = "word") %>%
  arrange(id)
dim(bag_of_words_tidy)
bag_of_words_tidy
## [1] 27 3
## # A tibble: 27 x 3
## id word n
## <int> <chr> <int>
## 1 1 beautiful 1
## 2 1 blue 1
## 3 1 sky 1
## 4 2 beautiful 1
## 5 2 blue 1
## 6 2 love 1
## 7 2 sky 1
## 8 3 brown 1
## 9 3 dog 1
## 10 3 fox 1
## # ... with 17 more rows
# Per-row token counts as a plain vector.
bag_of_words_tidy[["n"]]
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1
# Print every row of the table instead of the default truncated preview.
print(bag_of_words_tidy, n = Inf)
## # A tibble: 27 x 3
## id word n
## <int> <chr> <int>
## 1 1 beautiful 1
## 2 1 blue 1
## 3 1 sky 1
## 4 2 beautiful 1
## 5 2 blue 1
## 6 2 love 1
## 7 2 sky 1
## 8 3 brown 1
## 9 3 dog 1
## 10 3 fox 1
## 11 3 jumps 1
## 12 3 lazy 1
## 13 3 quick 1
## 14 4 blue 1
## 15 4 brown 1
## 16 4 dog 1
## 17 4 fox 1
## 18 4 lazy 1
## 19 4 quick 1
## 20 5 sky 2
## 21 5 beautiful 1
## 22 5 blue 1
## 23 6 brown 1
## 24 6 dog 1
## 25 6 fox 1
## 26 6 lazy 1
## 27 6 quick 1
# Distinct words left after stop-word removal, in order of first appearance.
bag_of_words_tidy %>%
  pull(word) %>%
  unique()
## [1] "beautiful" "blue" "sky" "love" "brown" "dog"
## [7] "fox" "jumps" "lazy" "quick"
# Build the document-term matrix ----
# Reshape the tidy (id, word, n) counts into a document-term matrix: one row
# per document id, one column per word, and 0 where a word does not occur in a
# document.
# pivot_wider() replaces the superseded spread(). names_sort = TRUE keeps the
# alphabetical column order spread() produced, and n is converted to double
# first so the count columns keep the double type that spread(fill = 0)
# returned.
bag_of_words_dtm <- bag_of_words_tidy %>%
  mutate(n = as.double(n)) %>%
  pivot_wider(
    names_from = word,
    values_from = n,
    values_fill = 0,
    names_sort = TRUE
  )
bag_of_words_dtm
## # A tibble: 6 x 11
## id beautiful blue brown dog fox jumps lazy love quick sky
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 0 0 0 0 0 0 0 1
## 2 2 1 1 0 0 0 0 0 1 0 1
## 3 3 0 0 1 1 1 1 1 0 1 0
## 4 4 0 1 1 1 1 0 1 0 1 0
## 5 5 1 1 0 0 0 0 0 0 0 2
## 6 6 0 0 1 1 1 0 1 0 1 0
# Reference: https://www.icode9.com/content-4-1338410.html