# 1 Stem vs. Lemma

library(tidyverse) # R is better when it is tidy
library(stringr)  # for string manipulation
library(SnowballC)
library(koRpus)
library(ggplot2)
library(textstem)
library(tidytext)
library(gridExtra)
library(grid)

# Load the crash narratives and split them into one lowercase token per row.
# NOTE(review): the working directory is a machine-specific absolute path, so
# the script only runs as-is on the original author's machine.
setwd("C:/Users/subas/Syncplicity/MyProjects_IMP/MY_Papers_V2/TRB 2020/00000000 FINALz/0000001a NLP/0000001 LA_Crash_Narr")

# Force NARRATIVE to character so the tokenizer always receives text.
hor <- read.csv("horse_10_16.csv") %>%
  mutate(NARRATIVE = as.character(NARRATIVE))

# Keep only columns 2 and 3 (selected by position; one of them must be
# NARRATIVE) and replace NARRATIVE with a `word` column, one token per row.
hor1 <- unnest_tokens(hor[, c(2, 3)], output = word, input = NARRATIVE)


# Build a lookup table with one row per distinct raw token:
#   word       - the token exactly as produced by unnest_tokens()
#   word_clean - the token with a trailing possessive ('s / \u2019s) removed;
#                NA when anything other than letters remains (numbers, times,
#                other punctuation)
#   word_stem  - Snowball (English) stem of word_clean
#
# The table is de-duplicated on the raw token, because `word` is the key used
# to join it back onto hor1. De-duplicating on word_clean instead keeps only
# the first raw token for each cleaned form (e.g. "driver" OR "driver's"), so
# every occurrence of the other spelling would come back as NA from the join
# and be silently removed by na.omit() below.
lemma_unique <- hor1 %>%
  distinct(word) %>%
  mutate(
    word_clean = str_replace_all(word, "\u2019s|'s", ""),
    word_clean = ifelse(
      str_detect(word_clean, "[^[:alpha:]]"),
      NA_character_,
      word_clean
    )
  ) %>%
  filter(!is.na(word_clean)) %>%
  arrange(word) %>%
  mutate(word_stem = wordStem(word_clean, language = "english"))

# Attach the cleaned/stemmed forms to every token occurrence. `word` is unique
# in lemma_unique, so the join cannot duplicate rows of hor1.
hor2 <- left_join(hor1, lemma_unique, by = "word")

# Drop occurrences with no usable cleaned form (and any other row with an NA).
hor2a <- na.omit(hor2)



## Hunspell dictionary
# Build a token -> lemma dictionary from the cleaned tokens using the hunspell
# engine, then add the lemma of every cleaned token as a new column.
clean_tokens <- hor2a[["word_clean"]]
lemma_dictionary <- make_lemma_dictionary(clean_tokens, engine = "hunspell")
hor2a[["word_lemma"]] <- lemmatize_strings(
  clean_tokens,
  dictionary = lemma_dictionary
)

# Quick look at the token / stem / lemma columns side by side.
head(hor2a)
##              CRASH_NUM1          word    word_clean word_stem  word_lemma
## 1  LA10_100306084416530            on            on        on          on
## 2  LA10_100306084416530         march         march     march       march
## 5  LA10_100306084416530            at            at        at          at
## 6  LA10_100306084416530 approximately approximately  approxim approximate
## 9  LA10_100306084416530        deputy        deputy    deputi      deputy
## 10 LA10_100306084416530   christopher   christopher christoph christopher
## Stem
# Horizontal bar chart of the stems that occur more than 50 times.
#
# Stop words are removed by matching the cleaned, *unstemmed* token against
# tidytext's stop_words. That lexicon stores ordinary word forms, so matching
# the stems themselves lets stop words through whenever stemming alters them
# (e.g. "because" -> "becaus", "very" -> "veri").
data(stop_words)

# Filter on word_clean first, then keep the ID column (position 1) and expose
# the stem under the name `word` for counting and plotting.
hor4 <- hor2a %>%
  anti_join(stop_words, by = c("word_clean" = "word")) %>%
  select(1, word = word_stem)

hor2 <- hor4

p <- hor2 %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>% # order bars by frequency
  ggplot(aes(word, n)) +
  geom_col() +
  theme_bw() +
  ggtitle("Stem") +
  xlab(NULL) +
  coord_flip()


## Lemma
# Horizontal bar chart of the lemmas that occur more than 50 times, followed
# by the side-by-side comparison with the stem chart `p`.
data(stop_words)

# Keep the ID column and the lemma column (positions 1 and 5); call the lemma
# `word` so it lines up with the `word` column of stop_words.
hor4 <- hor2a[, c(1, 5)]
names(hor4)[2] <- "word"

# Remove lemmas that appear in the stop-word lexicon.
hor2 <- anti_join(hor4, stop_words)

p1 <- hor2 %>%
  count(word, sort = TRUE) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>% # order bars by frequency
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  xlab(NULL) +
  ggtitle("Lemma") +
  theme_bw()

# Stem chart on the left, lemma chart on the right.
grid.arrange(p, p1, ncol = 2)