# Stem vs. Lemma
library(tidyverse) # R is better when it is tidy
library(stringr) # for string manipulation
library(SnowballC)
library(koRpus)
library(ggplot2)
library(textstem)
library(tidytext)
library(gridExtra)
library(grid)
# Load the crash narratives and split them into one-token-per-row form.
#
# The data directory is kept in a single variable and combined with the file
# name via file.path() instead of calling setwd(), so running this script does
# not change the working directory of the calling R session. Adjust `data_dir`
# when running on another machine.
data_dir <- "C:/Users/subas/Syncplicity/MyProjects_IMP/MY_Papers_V2/TRB 2020/00000000 FINALz/0000001a NLP/0000001 LA_Crash_Narr"
hor <- read.csv(file.path(data_dir, "horse_10_16.csv"))

# unnest_tokens() needs a character column (read.csv may return a factor).
hor$NARRATIVE <- as.character(hor$NARRATIVE)

# Keep the crash identifier and the narrative text, selected by name rather
# than by position so a change in the CSV column order cannot silently pick
# the wrong columns. The result has one row per token in column `word`.
hor1 <- hor %>%
  select(CRASH_NUM1, NARRATIVE) %>%
  unnest_tokens(word, NARRATIVE)
# Lookup table with one row per distinct raw token (`word`):
#   word_clean - token with a trailing possessive ('s or \u2019s) removed;
#                tokens that still contain a non-letter character are dropped
#   word_stem  - Snowball (English) stem of word_clean
#
# De-duplication is done on `word`, the column the join below matches on.
# De-duplicating on `word_clean` instead keeps only the first spelling seen
# for each cleaned form (e.g. "driver" but not "driver's"), so every token
# with the other spelling would get no match in the join and be discarded by
# na.omit().
lemma_unique <- hor1 %>%
  distinct(word) %>%
  mutate(
    word_clean = str_replace_all(word, "\u2019s|'s", ""),
    word_clean = if_else(
      str_detect(word_clean, "[^[:alpha:]]"),
      NA_character_,
      word_clean
    )
  ) %>%
  filter(!is.na(word_clean)) %>%
  arrange(word) %>%
  mutate(word_stem = wordStem(word_clean, language = "english"))

# Attach the cleaned/stemmed forms to every token occurrence. The join key is
# stated explicitly; tokens without a lookup row (those containing non-letter
# characters) come back as NA and are removed by na.omit().
hor2 <- left_join(hor1, lemma_unique, by = "word")
hor2a <- na.omit(hor2)
## Hunspell dictionary
# Build a token -> lemma lookup from the cleaned tokens using the hunspell
# engine, then use it to add a `word_lemma` column next to the stems.
lemma_dictionary <- make_lemma_dictionary(
  hor2a[["word_clean"]],
  engine = "hunspell"
)
hor2a[["word_lemma"]] <- lemmatize_strings(
  hor2a[["word_clean"]],
  dictionary = lemma_dictionary
)

# Preview the first rows: raw token, cleaned token, stem and lemma.
head(hor2a)
## CRASH_NUM1 word word_clean word_stem word_lemma
## 1 LA10_100306084416530 on on on on
## 2 LA10_100306084416530 march march march march
## 5 LA10_100306084416530 at at at at
## 6 LA10_100306084416530 approximately approximately approxim approximate
## 9 LA10_100306084416530 deputy deputy deputi deputy
## 10 LA10_100306084416530 christopher christopher christoph christopher
# Bar chart of the most frequent terms for one normalization method.
#
# Arguments:
#   tokens    - data frame with one row per token occurrence.
#   token_col - name (string) of the column holding the normalized tokens,
#               e.g. "word_stem" or "word_lemma".
#   title     - plot title.
#   min_count - only terms occurring more than this many times are plotted.
#
# Returns a ggplot object: horizontal bars, terms ordered by frequency.
#
# NOTE(review): stop words are matched against the *normalized* token, as in
# the original analysis. A stem that no longer equals its stop-word spelling
# would not be filtered out here - confirm this is intended.
plot_top_terms <- function(tokens, token_col, title, min_count = 50) {
  tibble(word = tokens[[token_col]]) %>%
    anti_join(stop_words, by = "word") %>%
    count(word, sort = TRUE) %>%
    filter(n > min_count) %>%
    # Order the factor levels by count so the bars appear sorted.
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n)) +
    geom_col() +
    theme_bw() +
    ggtitle(title) +
    xlab(NULL) +
    coord_flip()
}

# tidytext stop-word lexicon (data frame with a `word` column).
data(stop_words)

## Stem
p <- plot_top_terms(hor2a, "word_stem", "Stem")

## Lemma
p1 <- plot_top_terms(hor2a, "word_lemma", "Lemma")

# Show both charts side by side for comparison.
grid.arrange(p, p1, ncol = 2)
