#Load library
library(tm)
library(tidyverse)
library(tidytext)
library(wordcloud)
library(RColorBrewer)
# Read files and combine files
# Load the demographic table first, then attach it to the processed moments
# by worker id and keep only the columns used in the analysis below.
demo_data <- read_csv("../data/demographic.csv")
df <- read_csv("../output/processed_moments.csv") %>%
  inner_join(demo_data, by = "wid") %>%
  select(text, country, gender, marital, cleaned_hm, parenthood)
# Keep only the moments written by males (gender coded as "m")
df_male <- filter(df, gender == "m")
# create stopwords
# Make the `stop_words` data set available in the workspace.
data("stop_words")
# Extra words to discard on top of the standard list; they are tagged with
# their own lexicon label ("updated") when appended below.
word <- c(
  "happy", "ago", "yesterday", "lot", "today", "months", "month",
  "happier", "happiest", "last", "week", "past", "finally", "found",
  "favorite", "received", "happiness", "excited", "surprise", "beautiful",
  "fun", "delicious", "celebrated", "pay", "expected", "prepared", "share"
)
# Standard stop words followed by the custom ones.
my_stop_words <- bind_rows(
  stop_words,
  tibble(word = word, lexicon = "updated")
)
# male's sentiment
# Tokenise each male moment, drop stop words, tag the remaining words with
# their NRC sentiments, and plot how many tagged tokens fall under each
# sentiment (bars ordered from most to least frequent).
df_male %>%
  unnest_tokens(word, text) %>%
  # Join keys are spelled out: `word` is the only column shared with either
  # lookup table, and naming it avoids the "Joining, by = ..." guesswork.
  anti_join(my_stop_words, by = "word") %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # Each row is one (token, sentiment) pair, so the number of rows per
  # sentiment equals the total obtained by counting per word and summing.
  group_by(sentiment) %>%
  summarize(Score = n()) %>%
  arrange(desc(Score)) %>%
  ggplot(aes(x = reorder(sentiment, -Score), y = Score)) +
  geom_col(aes(fill = sentiment)) +
  xlab("sentiment") + ylab("Score") + ggtitle("sentiment of males")
The plot shows the sentiments expressed in males’ happy moments. Positive, joy, trust, and anticipation account for most of the sentiment. So which words drive the joy sentiment among males?
# explore male's joy sentiment
# Bar chart of the ten most frequent joy words in male moments (ties at the
# cut-off are all kept, so more than ten bars can appear).
df_male %>%
  unnest_tokens(word, text) %>%
  anti_join(my_stop_words, by = "word") %>%
  # Only the joy entries of the lexicon are needed, so restrict the lexicon
  # before joining rather than joining every sentiment and filtering after.
  inner_join(
    filter(get_sentiments("nrc"), sentiment == "joy"),
    by = "word"
  ) %>%
  count(word, sentiment, sort = TRUE) %>%
  # Rank explicitly by the count column instead of "the last column".
  top_n(10, n) %>%
  # Order the factor by frequency so the bars are sorted in the plot.
  mutate(word = reorder(word, n)) %>%
  #### plot most common joy words of males
  # The bars use a fixed colour, so no fill aesthetic is mapped.
  ggplot(aes(word, n)) +
  geom_col(show.legend = FALSE, fill = "springgreen3") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
From the plot, we see that “friend” is the most frequent word and “birthday” is the second. Males also enjoy talking about their family, especially their daughters and mothers. Unsurprisingly, males are also happy about money, food, shopping, and vacations.
# split males into single and married
# Two subsets of the male moments, selected on the `marital` column.
male_single <- filter(df_male, marital == "single")
male_married <- filter(df_male, marital == "married")
#wordcloud
# Total frequency of every term in a character vector of moments.
# `moments`: character vector, one element per document.
# Returns a tibble with one row per term: `term` and its summed count `freq`.
term_totals <- function(moments) {
  corpus <- tm_map(Corpus(VectorSource(moments)), stripWhitespace)
  tidy(TermDocumentMatrix(corpus)) %>%
    group_by(term) %>%
    summarise(freq = sum(count))
}

# Draw one word cloud with the settings shared by both panels.
# `totals`: tibble produced by term_totals(); `palette`: vector of colours.
draw_term_cloud <- function(totals, palette) {
  wordcloud(totals$term, totals$freq,
    scale = c(3, 0.5),
    max.words = 20,
    min.freq = 1,
    random.order = FALSE,
    rot.per = 0.3,
    use.r.layout = TRUE,
    random.color = FALSE,
    colors = palette
  )
}

tdm.overall <- term_totals(male_single$text)
tdm.overall1 <- term_totals(male_married$text)

# Stack the two clouds vertically; keep the previous graphics settings so
# they can be put back once both panels are drawn.
old_par <- par(mfrow = c(2, 1), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))
### male married word cloud
draw_term_cloud(tdm.overall1, brewer.pal(7, "Greens"))
### male single word cloud
draw_term_cloud(tdm.overall, brewer.pal(9, "Blues"))
par(old_par)
Comparing the two word clouds, the top one is from married males. It is clear that married males talk about their wives most of the time and focus more on family members such as sons and daughters. The bottom one is from single males. We find that they care more about their friends. In addition, single males talk more about girlfriends and games.
# Extended custom stop-word list for the single vs. married comparison:
# the earlier custom words plus further frequent words to be excluded.
word <- c(
  "happy", "ago", "yesterday", "lot", "today", "months", "month",
  "happier", "happiest", "last", "week", "past", "finally", "found",
  "favorite", "received", "happiness", "excited", "surprise", "beautiful",
  "fun", "delicious", "celebrated", "pay", "expected", "prepared", "share",
  "friend", "money", "unexpected", "love", "food", "green", "birthday",
  "score", "mother", "shopping", "smile", "gift", "pretty", "vacation",
  "laugh", "special", "joy", "perfect", "wonderful", "pleasant", "bonus",
  "graduation"
)
# Standard stop words followed by the extended custom list, tagged "updated".
sm_stop_words <- bind_rows(
  stop_words,
  tibble(word = word, lexicon = "updated")
)
# For combining two plots. library() stops with an error if the package is
# missing, whereas require() would only return FALSE and fail later.
library(gridExtra)

# Bar chart of the most frequent joy words in a set of moments.
# `moments`:    data frame with a `text` column holding the moments.
# `bar_fill`:   colour used for every bar.
# `plot_title`: title shown above the chart.
# `n_words`:    how many top words to keep (ties at the cut-off are kept).
# Returns a ggplot object.
plot_top_joy_words <- function(moments, bar_fill, plot_title, n_words = 5) {
  moments %>%
    unnest_tokens(word, text) %>%
    anti_join(sm_stop_words, by = "word") %>%
    # Restrict the lexicon to joy before joining so only joy matches are made.
    inner_join(
      filter(get_sentiments("nrc"), sentiment == "joy"),
      by = "word"
    ) %>%
    count(word, sentiment, sort = TRUE) %>%
    # Rank explicitly by the count column.
    top_n(n_words, n) %>%
    # Order the factor by frequency so the bars are sorted in the plot.
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n)) +
    geom_col(show.legend = FALSE, fill = bar_fill) +
    labs(y = "Contribution to sentiment", x = NULL, title = plot_title) +
    coord_flip()
}

#### married male joy
plot1 <- plot_top_joy_words(male_married, "lightblue", "Married")
#### single male joy
plot2 <- plot_top_joy_words(male_single, "lightgoldenrod1", "Single")
# Show the two charts side by side.
grid.arrange(plot1, plot2, ncol = 2)
To make the comparison clearer, I removed the common words shared by both groups, such as “friend,” “money,” “love,” “food,” and “birthday.” We find that married males have happy moments when they spend time with their family, and they care more about their children and marriage. On the other hand, single males enjoy music, cream, and beer.
The most frequent words in males’ joy moments are friend, birthday, and love. The difference between single and married males regarding joy moments is that single males focus more on entertainment, while married males pay more attention to their family.
Silge, Robinson, 2018-12-21, “Text Mining with R”. https://www.tidytextmining.com/index.html https://github.com/TZstatsADS/ADS_Teaching/blob/master/Tutorials/wk2-TextMining/doc/InteractiveWordCloud.Rmd https://github.com/TZstatsADS/ADS_Teaching/blob/master/Tutorials/wk2-TextMining/doc/wk2-Tutorial-TextMining.Rmd