library(dplyr)
library(forcats)
library(ggplot2)
library(data.table)
library(ggpubr)
library(ggthemes)
library(tidyverse)
library(reshape2)
library(RColorBrewer)
library(readxl)
library(rgdal)
library(sp)
library(raster)
library(leaflet)
library(maps)
#library(ggmap)
library(plotly)
library(quanteda)
library(wordcloud)
library(wordcloud2)
library(readr)
library(dplyr)
library(tm)
library(tidytext)
library(patchwork)
library(SnowballC)
# Load the raw survey export (path is relative to the working directory).
mental <- read.csv("mental-heath-in-tech-2016_20161114.csv",
                   header = TRUE, encoding = "UTF-8")
## data preprocessing
# gender info
# Free-text answers are bucketed by their first letter (case-insensitive):
# F or W -> "Female", M -> "Male", everything else (NA included) -> "Others".
# NOTE(review): assumes exactly one column name contains "gender" -- confirm.
# as.character() guards against the column arriving as a factor, in which
# case assigning the new labels would silently produce NA.
temp <- as.character(mental[, grep("gender", colnames(mental))])
# The first letter is taken once from the raw answers so that the three
# recoding steps cannot interfere with each other.
gender_initial <- toupper(str_sub(temp, 1, 1))
temp[!gender_initial %in% c("F", "M", "W")] <- "Others"
temp[gender_initial %in% c("F", "W")] <- "Female"
temp[gender_initial %in% "M"] <- "Male"
gender <- temp
# age
age <- mental[, grep("What.is.your.age.", colnames(mental))]
# Decade bins 10, 20, ..., 70. With right = FALSE the bins are left-closed;
# include.lowest = TRUE additionally closes the last bin at 70. Ages outside
# [10, 70] become NA.
breaks <- 10 * (1:7)
age_category <- cut(as.numeric(age), breaks,
                    include.lowest = TRUE, right = FALSE, dig.lab = 10,
                    labels = c("10-20", "20-30", "30-40",
                               "40-50", "50-60", "60-70"))
# condition
condition <- mental[grep("condition", colnames(mental))]
#colnames(condition)
condition.diagnosed <- condition$Have.you.been.diagnosed.with.a.mental.health.condition.by.a.medical.professional.
condition.type <- condition$If.so..what.condition.s..were.you.diagnosed.with.
# Answers are "|"-separated; split them into one matrix column per diagnosis
# (shorter rows are padded with "").
temp <- str_split(condition.type, "[|]", simplify = TRUE)
# data for plot
plotdata <- data.frame(gender = gender, age = age,
                       age_category = as.character(age_category),
                       condition.diagnosed = condition.diagnosed, temp)
# Long format: one row per respondent and diagnosis slot.
plotdata_long <- melt(plotdata,
                      id.vars = c("gender", "age", "age_category",
                                  "condition.diagnosed"))
#colnames(plotdata_long)
## age distribution
# Drop the implausible age of 323 before plotting. A logical mask is used
# rather than `-which(...)`: when nothing matches, `-which()` evaluates to
# `integer(0)` and the subset would return zero rows instead of all rows.
# Rows with a missing age are kept, as before.
plotdata <- plotdata[!(plotdata$age %in% 323), ]
# Box plot of age per gender bucket.
box_gender <- ggplot(plotdata) +
  geom_boxplot(aes(x = gender, y = age, fill = gender, group = gender)) +
  scale_y_continuous(breaks = seq(19, 99, 10)) +
  scale_fill_manual(values = c("#E41A1C", "#449B75", "#AC5782")) +
  #scale_fill_manual(values=col) +
  theme_bw() +
  ggtitle("Age Distribution by Sex") +
  xlab("Gender") +
  ylab("Age") +
  theme(plot.title = element_text(hjust = 0.5))
library(plotly)
# Render the ggplot object as an interactive plotly widget.
ggplotly(box_gender)
## condition.diagnosed
library(ggpubr)
# Two dodged bar charts (by gender and by age bracket) of the answer to the
# "diagnosed by a medical professional" question, arranged side by side with
# one shared legend at the bottom.
figure <- ggpubr::ggarrange(
  ggplot(plotdata) +
    geom_bar(aes(x = gender, fill = condition.diagnosed),
             position = "dodge") +
    scale_fill_manual(values = c("#449B75", "#E41A1C")) +
    theme_bw() +
    ylab("Number of People") +
    xlab("Gender") +
    labs(fill = "Have you been diagnosed with a mental health \n condition by a medical professional?"),
  ggplot(plotdata) +
    geom_bar(aes(x = age_category, fill = condition.diagnosed),
             position = "dodge") +
    scale_fill_manual(values = c("#449B75", "#E41A1C")) +
    theme_bw() +
    ylab("Number of People") +
    xlab("Age") +
    theme(axis.text.x = element_text(angle = 0)),
  # TRUE rather than T: `T` is an ordinary variable that can be reassigned.
  common.legend = TRUE,
  legend = "bottom"
)
# Add a bold overall title above the arranged panels.
annotate_figure(figure,
                top = text_grob("Condition Diagnosed by Sex and Age",
                                face = "bold"))
## condition plot
# Six-colour palette interpolated from the nine-colour Brewer "Set1" scheme.
col <- colorRampPalette(brewer.pal(9, "Set1"))(6)
# Discard melted rows whose diagnosis cell is an empty string.
plotdata_long <- filter(plotdata_long, !value == "")
# Distinct diagnosis labels, printed for inspection.
a <- unique(plotdata_long$value)
a
# [1] "Anxiety Disorder (Generalized, Social, Phobia, etc)"
# [2] "Mood Disorder (Depression, Bipolar Disorder, etc)"
# [3] "Personality Disorder (Borderline, Antisocial, Paranoid, etc)"
# [4] "Attention Deficit Hyperactivity Disorder"
# [5] "Seasonal Affective Disorder"
# [6] "Depression"
# [7] "Substance Use Disorder"
# [8] "Obsessive-Compulsive Disorder"
# [9] "Post-traumatic Stress Disorder"
# [10] "Psychotic Disorder (Schizophrenia, Schizoaffective, etc)"
# [11] "Eating Disorder (Anorexia, Bulimia, etc)"
# [12] "Autism - while not a \"mental illness\", still greatly affects how I handle anxiety"
# [13] "Stress Response Syndromes"
# [14] "attention deficit disorder (but not the hyperactive version)"
# [15] "Asperger Syndrome"
# [16] "ADD (w/o Hyperactivity)"
# [17] "Addictive Disorder"
# [18] "Schizotypal Personality Disorder"
# [19] "PDD-NOS (see above)"
# [20] "Suicidal Ideation"
# [21] "Attention Deficit Disorder"
# [22] "Intimate Disorder"
# [23] "Dissociative Disorder"
# [24] "Aspergers"
# [25] "Autism"
# Keep the diagnosis labels found at positions 1, 2, 4, 8 and 9 of `a` and
# collapse every other label into a single "others" bucket. This is done as
# one vectorised assignment instead of looping over `1:length(a)`.
# NOTE(review): the positions depend on the order in which unique() returned
# the labels -- confirm they still point at the intended categories if the
# input data changes.
keep_idx <- c(1, 2, 4, 8, 9)
lumped_labels <- a[-keep_idx]
plotdata_long$value[plotdata_long$value %in% lumped_labels] <- "others"
# Stacked counts of the (lumped) diagnosis categories per gender bucket.
ggplot(plotdata_long) +
  geom_bar(aes(x = gender, fill = value), position = "stack") +
  scale_fill_manual(values = col) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    legend.text = element_text(size = 9)
  ) +
  # Two-column legend with 4 mm keys.
  guides(fill = guide_legend(
    ncol = 2,
    keywidth = unit(4, "mm"),
    keyheight = unit(4, "mm")
  )) +
  labs(
    y = "Number of People",
    title = "Specific Mental Health Disorders Composition by Gender"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = "Gender")
# Same chart with the age bracket on the x axis.
ggplot(plotdata_long) +
  geom_bar(aes(x = age_category, fill = value), position = "stack") +
  scale_fill_manual(values = col) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    legend.text = element_text(size = 9)
  ) +
  guides(fill = guide_legend(
    ncol = 2,
    keywidth = unit(4, "mm"),
    keyheight = unit(4, "mm")
  )) +
  labs(
    y = "Number of People",
    title = "Specific Mental Health Disorders Composition by Age"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = "Age category")
# Pull the five survey questions used for the state-level maps and give them
# short names.
df <- mental %>%
  dplyr::select(
    "Do.you.currently.have.a.mental.health.disorder.",
    "Have.you.ever.sought.treatment.for.a.mental.health.issue.from.a.mental.health.professional.",
    "What.US.state.or.territory.do.you.live.in.",
    "Do.you.have.medical.coverage..private.insurance.or.state.provided..which.includes.treatment.of..mental.health.issues.",
    "Does.your.employer.provide.mental.health.benefits.as.part.of.healthcare.coverage."
  )
names(df) <- c("disorder1", "treatment", "state",
               "mental_insurance", "employer_benefit1")
# Numeric coding: Yes = 1, Maybe = 0.5, No = 0 (any other answer becomes NA).
df <- df %>%
  mutate(disorder = case_when(
    disorder1 == "Yes" ~ 1,
    disorder1 == "Maybe" ~ 0.5,
    disorder1 == "No" ~ 0
  ))
# Employer benefit flag: 1 for "Yes", 0 for every other answer.
df <- df %>%
  mutate(employer_benefit = case_when(
    employer_benefit1 == "Yes" ~ 1,
    TRUE ~ 0
  ))
df <- df %>%
  dplyr::select("state", "treatment", "employer_benefit", "disorder")
head(df)
# Share of respondents per state with treatment == 1.
df1 <- df %>%
  dplyr::select("state", "treatment")
df_treatment1 <- df1 %>%
  group_by(state) %>%
  summarise(count = n())
df_treatment2 <- df1 %>%
  filter(treatment == 1) %>%
  group_by(state) %>%
  summarise(count1 = n())
# States with respondents but no positive answer are absent from
# df_treatment2, so the left join leaves count1 as NA. Treat that as zero so
# the share is 0 rather than NA.
df_treatment <- df_treatment1 %>%
  left_join(df_treatment2, by = "state") %>%
  mutate(count1 = coalesce(count1, 0L),
         treatment_prec = count1 / count)
# Share of respondents per state whose employer provides the benefit.
df2 <- df %>%
  dplyr::select("state", "employer_benefit")
df_employer_benefit1 <- df2 %>%
  group_by(state) %>%
  summarise(count = n())
df_employer_benefit2 <- df2 %>%
  filter(employer_benefit == 1) %>%
  group_by(state) %>%
  summarise(count1 = n())
df_employer_benefit <- df_employer_benefit1 %>%
  left_join(df_employer_benefit2, by = "state") %>%
  mutate(count1 = coalesce(count1, 0L),
         employer_benefit_prec = count1 / count)
# One row per state carrying both shares.
df_final <- merge(x = df_employer_benefit, y = df_treatment,
                  by = "state", all = TRUE) %>%
  dplyr::select("state", "treatment_prec", "employer_benefit_prec")
#' Round every numeric column of a data frame.
#'
#' Columns are selected with `is.numeric()`, so integer and double columns
#' are rounded while factors, dates, characters and logicals are returned
#' untouched. (Selecting on `mode(x) == "numeric"` would also pick up
#' factors, which makes rounding fail.)
#'
#' @param x A data frame.
#' @param digits Number of decimal places, passed on to `round()`.
#' @return `x` with its numeric columns rounded to `digits` places.
round_df <- function(x, digits) {
  numeric_columns <- vapply(x, is.numeric, logical(1))
  # Nothing to do when no column is numeric.
  if (any(numeric_columns)) {
    x[numeric_columns] <- lapply(x[numeric_columns], round, digits = digits)
  }
  x
}
# Round the shares to three decimals for display.
df_final <- round_df(df_final, 3)
# Number of respondents per state, largest first, stored as `sample`.
df_count <- count(df, state, sort = TRUE)
names(df_count) <- c("state", "sample")
df_final <- merge(x = df_final, y = df_count, by = "state", all = TRUE)
# From https://www.census.gov/geo/maps-data/data/cbf/cbf_state.html
states <- shapefile("cb_2018_us_state_20m.shp")
# Attach the survey figures to the polygon attribute table, matching the
# shapefile's NAME column against the survey's state column.
combined <- left_join(states@data, df_final, by = c(NAME = "state"))
states@data <- combined
# Choropleth of the per-state share of respondents who sought treatment.
# Shares are coloured in five equal-width bins; states without a value are
# drawn in grey (#C0C0C0).
bins <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
pal <- leaflet::colorBin("YlOrRd", domain = states$treatment_prec,
                         bins = bins, na.color = "#C0C0C0")
# Hover label text, one entry per polygon.
labels_states <- paste(states$NAME, ":",
                       "Frequency=", states$treatment_prec, ",",
                       "Sample=", states$sample)
m <- leaflet() %>%
  leaflet::setView(-96, 37.8, 4) %>%
  # The tile access token is read from the MAPBOX_ACCESS_TOKEN environment
  # variable.
  leaflet::addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv("MAPBOX_ACCESS_TOKEN"))) %>%
  leaflet::addPolygons(
    data = states,
    fillColor = ~pal(treatment_prec),
    weight = 2,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    # Full argument name; `highlight =` only worked through partial matching.
    highlightOptions = highlightOptions(
      weight = 5,
      color = "white",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE),
    label = labels_states,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto")) %>%
  leaflet::addLegend(data = states, pal = pal,
                     title = "Frequency of Employee Who Sought Treatment",
                     values = ~treatment_prec, opacity = 0.7,
                     position = "bottomright")
m
# Choropleth of the per-state share of respondents whose employer provides
# mental health benefits. Same binning and styling as the treatment map.
bins <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
pal <- colorBin("YlOrRd", domain = states$employer_benefit_prec,
                bins = bins, na.color = "#C0C0C0")
# Hover label text, one entry per polygon.
labels_states <- paste(states$NAME, ":",
                       "Frequency=", states$employer_benefit_prec, ",",
                       "Sample=", states$sample)
n <- leaflet() %>%
  setView(-96, 37.8, 4) %>%
  # The tile access token is read from the MAPBOX_ACCESS_TOKEN environment
  # variable.
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv("MAPBOX_ACCESS_TOKEN"))) %>%
  addPolygons(
    data = states,
    # The formula is evaluated against `data`, so the bare column name is
    # enough (and matches how the treatment map is written).
    fillColor = ~pal(employer_benefit_prec),
    weight = 2,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    # Full argument name; `highlight =` only worked through partial matching.
    highlightOptions = highlightOptions(
      weight = 5,
      color = "white",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE),
    label = labels_states,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto")) %>%
  addLegend(data = states, pal = pal,
            title = "Frequency of Employer with Mental Healthcare Benefit",
            values = ~employer_benefit_prec, opacity = 0.7,
            position = "bottomright")
n
# Balloon plot: answers to the "bring up the issue in an interview" question
# by company size, one facet per gender.
# NOTE(review): company_size and bringup_issue are not created in the code
# visible above -- presumably `mental` is renamed/extended elsewhere; confirm.
# Male respondents: raw counts per company size and answer, plus the share
# (in percent) of each answer within the groups left after summarise().
mental1 <- mental %>%
  filter(gender == "Male") %>%
  dplyr::group_by(company_size, gender, bringup_issue) %>%
  dplyr::summarise(frequency = n()) %>%
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)
# Female respondents: counts are multiplied by 2.32, the hard-coded
# male/female ratio, so both facets are on a comparable scale.
mental2 <- mental %>%
  filter(gender == "Female") %>%
  dplyr::group_by(company_size, gender, bringup_issue) %>%
  dplyr::summarise(frequency = 2.32 * n()) %>%
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)
mental3 <- rbind(mental1, mental2)
fig <- ggballoonplot(
  mental3,
  x = "bringup_issue",
  y = "company_size",
  size = "frequency",
  fill = "grey",
  facet.by = "gender",
  ggtheme = theme_bw()
) +
  scale_fill_viridis_c(option = "D", direction = -1)
fig <- ggpar(
  fig,
  main = "Will you bring up mental issue during interviews?",
  submain = "Faceting by company size and gender",
  xlab = "Answer",
  ylab = "Company size"
)
ggplotly(fig)
# Collapse the four detailed answers into plain "No" / "Yes"; other answers
# are left as they are.
# NOTE(review): mental_hurt_career and company_size are not created in the
# code visible above -- confirm where they come from.
no_answers <- c("No, it has not", "No, I don't think it would")
yes_answers <- c("Yes, I think it would", "Yes, it has")
mental$mental_hurt_career[mental$mental_hurt_career %in% no_answers] <- "No"
mental$mental_hurt_career[mental$mental_hurt_career %in% yes_answers] <- "Yes"
# Male respondents: counts and within-group percentage per answer.
mental4 <- mental %>%
  filter(gender == "Male") %>%
  dplyr::group_by(company_size, gender, mental_hurt_career) %>%
  dplyr::summarise(frequency = n()) %>%
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)
# Female respondents: counts scaled by the hard-coded male/female ratio 2.32.
mental5 <- mental %>%
  filter(gender == "Female") %>%
  dplyr::group_by(company_size, gender, mental_hurt_career) %>%
  dplyr::summarise(frequency = 2.32 * n()) %>%
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)
mental6 <- rbind(mental4, mental5)
fig2 <- ggballoonplot(
  mental6,
  x = "mental_hurt_career",
  y = "company_size",
  size = "frequency",
  fill = "grey",
  facet.by = "gender",
  ggtheme = theme_bw()
) +
  scale_fill_viridis_c(option = "D", direction = -1)
fig2 <- ggpar(
  fig2,
  main = "Will identification of mental health issues hurt your career?",
  submain = "Faceting by company size and gender",
  xlab = "Answer",
  ylab = "Company size"
)
ggplotly(fig2)
# Collapse the four detailed answers into plain "No" / "Yes"; other answers
# are left as they are.
# NOTE(review): coworker_views and company_size are not created in the code
# visible above -- confirm where they come from.
no_views <- c("No, they do not", "No, I don't think they would")
yes_views <- c("Yes, I think they would", "Yes, they do")
mental$coworker_views[mental$coworker_views %in% no_views] <- "No"
mental$coworker_views[mental$coworker_views %in% yes_views] <- "Yes"
# Male respondents: counts and within-group percentage per answer.
mental7 <- mental %>%
  filter(gender == "Male") %>%
  dplyr::group_by(company_size, gender, coworker_views) %>%
  dplyr::summarise(frequency = n()) %>%
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)
# Female respondents: counts scaled by the hard-coded male/female ratio 2.32.
mental8 <- mental %>%
  filter(gender == "Female") %>%
  dplyr::group_by(company_size, gender, coworker_views) %>%
  dplyr::summarise(frequency = 2.32 * n()) %>%
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)
mental9 <- rbind(mental7, mental8)
fig3 <- ggballoonplot(
  mental9,
  x = "coworker_views",
  y = "company_size",
  size = "frequency",
  fill = "grey",
  facet.by = "gender",
  ggtheme = theme_bw()
) +
  scale_fill_viridis_c(option = "D", direction = -1)
fig3 <- ggpar(
  fig3,
  main = "Will coworkers view you more negatively?",
  submain = "Faceting by company size and gender",
  xlab = "Answer",
  ylab = "Company size"
)
ggplotly(fig3)
#answer = maybe
#create corpus
# NOTE(review): `mental_health` is not created in the code visible above --
# confirm where it is loaded.
mental_health_maybe <- filter(mental_health, bringup_issue == "Maybe")
# Sequential document ids as strings. seq_len() yields an empty vector for a
# zero-row subset, whereas 1:nrow() would yield c(1, 0).
mental_health_maybe$doc_id <- as.character(seq_len(nrow(mental_health_maybe)))
# Columns are picked by position; DataframeSource() expects the id and the
# text as the first two columns.
# NOTE(review): presumably 67 = doc_id, 38 = free text, 65/64 = company size
# and its colour -- confirm against the column layout.
mental_health_maybe <- mental_health_maybe[, c(67, 38, 65, 64)]
maybe_source <- DataframeSource(mental_health_maybe)
maybe_corpus <- VCorpus(maybe_source)
#clean corpus
# Normalise a tm corpus for word counting.
#
# Steps, in order: strip punctuation, lower-case, remove a custom word list
# plus the English stop words, remove digits, collapse repeated whitespace.
#
# corpus: a tm corpus.
# Returns the transformed corpus.
clean_corpus <- function(corpus) {
  dropped_words <- c(
    "mental", "health", "interview", "feel", "bring", "want", "made", "get",
    "employer", "hire", "need", "know", "may", "affect", "job", "might",
    "reason", "answer", "think", stopwords("en")
  )
  corpus %>%
    tm_map(removePunctuation) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, dropped_words) %>%
    tm_map(removeNumbers) %>%
    tm_map(stripWhitespace)
}
maybe_clean <- clean_corpus(maybe_corpus)
#stem and stem completion
# Complete the stems of one document against a dictionary.
#
# x:          a document (coerced to character) whose words are stems.
# dictionary: passed to stemCompletion() as the completion dictionary.
# Returns a single string of the completed words separated by one space.
stemCompletion2 <- function(x, dictionary) {
  # Split on single spaces and drop the empty pieces left by repeated spaces.
  stems <- unlist(strsplit(as.character(x), " "))
  stems <- stems[stems != ""]
  completed <- stemCompletion(stems, dictionary = dictionary)
  stripWhitespace(paste(completed, collapse = " "))
}
# Stem every document, then complete the stems using the cleaned corpus as
# the dictionary and rebuild a corpus from the resulting strings.
maybe_stemmed <- tm_map(maybe_clean, stemDocument)
maybe_compl <- Corpus(VectorSource(
  lapply(maybe_stemmed, stemCompletion2, dictionary = maybe_clean)
))
#word cloud
maybe_tdm <- TermDocumentMatrix(maybe_compl)
# One row per (term, document) with its count, joined back to the
# per-document metadata, with tf / idf / tf-idf added; highest tf-idf first.
maybe_tf_idf <- tidy(maybe_tdm) %>%
  left_join(mental_health_maybe[, c(1, 3, 4)],
            by = c("document" = "doc_id")) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# NOTE(review): columns 1 and 6 are picked by position -- confirm that
# column 6 is the intended weight column.
wordcloud2(maybe_tf_idf[, c(1, 6)],
           color = maybe_tf_idf$color_size,
           shape = "diamond")
#answer = yes
#create corpus
# NOTE(review): `mental_health` is not created in the code visible above --
# confirm where it is loaded.
mental_health_yes <- filter(mental_health, bringup_issue == "Yes")
# seq_len() yields an empty vector for a zero-row subset, whereas 1:nrow()
# would yield c(1, 0).
mental_health_yes$doc_id <- as.character(seq_len(nrow(mental_health_yes)))
# Columns are picked by position; DataframeSource() expects the id and the
# text as the first two columns.
# NOTE(review): presumably 67 = doc_id, 38 = free text, 65/64 = company size
# and its colour -- confirm against the column layout.
mental_health_yes <- mental_health_yes[, c(67, 38, 65, 64)]
yes_source <- DataframeSource(mental_health_yes)
yes_corpus <- VCorpus(yes_source)
#clean corpus
yes_clean <- clean_corpus(yes_corpus)
#stem and stem completion
yes_stemmed <- tm_map(yes_clean, stemDocument)
yes_compl <- lapply(yes_stemmed, stemCompletion2, dictionary = yes_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
yes_tdm <- TermDocumentMatrix(yes_compl)
# Term counts per document joined to the per-document metadata, with
# tf / idf / tf-idf added; highest tf-idf first.
yes_tf_idf <- tidy(yes_tdm) %>%
  left_join(., mental_health_yes[, c(1, 3, 4)],
            by = c("document" = "doc_id")) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# NOTE(review): columns 1 and 6 are picked by position -- confirm that
# column 6 is the intended weight column.
wordcloud2(yes_tf_idf[, c(1, 6)], color = yes_tf_idf$color_size,
           shape = "diamond")
#answer = no
#create corpus
# NOTE(review): `mental_health` is not created in the code visible above --
# confirm where it is loaded.
mental_health_no <- filter(mental_health, bringup_issue == "No")
# seq_len() yields an empty vector for a zero-row subset, whereas 1:nrow()
# would yield c(1, 0).
mental_health_no$doc_id <- as.character(seq_len(nrow(mental_health_no)))
# Columns are picked by position; DataframeSource() expects the id and the
# text as the first two columns.
# NOTE(review): presumably 67 = doc_id, 38 = free text, 65/64 = company size
# and its colour -- confirm against the column layout.
mental_health_no <- mental_health_no[, c(67, 38, 65, 64)]
no_source <- DataframeSource(mental_health_no)
no_corpus <- VCorpus(no_source)
#clean corpus
no_clean <- clean_corpus(no_corpus)
#stem and stem completion
no_stemmed <- tm_map(no_clean, stemDocument)
no_compl <- lapply(no_stemmed, stemCompletion2, dictionary = no_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
no_tdm <- TermDocumentMatrix(no_compl)
# Term counts per document joined to the per-document metadata, with
# tf / idf / tf-idf added; highest tf-idf first.
no_tf_idf <- tidy(no_tdm) %>%
  left_join(., mental_health_no[, c(1, 3, 4)],
            by = c("document" = "doc_id")) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# NOTE(review): columns 1 and 6 are picked by position -- confirm that
# column 6 is the intended weight column.
wordcloud2(no_tf_idf[, c(1, 6)], color = no_tf_idf$color_size,
           shape = "diamond")
#top 5 words per company size in maybe (slice_max keeps ties, so a panel
#can show more than 5)
# A few uninformative terms are excluded before counting.
b1 <- filter(maybe_tf_idf,
             !(term %in% c("issue", "issues", "work", "make", "still"))) %>%
  group_by(`company size`, term) %>%
  # After summarize() the data is still grouped by company size, so
  # slice_max() picks the top terms within each company size.
  summarize(frequency = sum(count)) %>%
  slice_max(frequency, n = 5) %>%
  ggplot(aes(reorder(term, frequency), frequency, fill = `company size`)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~`company size`, scales = "free_x") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  # "none" replaces the deprecated `fill = FALSE` for hiding the legend.
  guides(fill = "none") +
  ggtitle("Answer = Maybe") +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(face = "bold", color = "black", size = 10))
b1
#top 4 words per company size in yes (slice_max keeps ties, so a panel can
#show more than 4)
# Uninformative terms are excluded before counting.
b2 <- yes_tf_idf %>%
  filter(!(term %in% c("issue", "issues", "work", "answer", "reason",
                       "hearing", "generally", "felt", "bringing", "allow",
                       "ahead", "time", "theyre", "see", "part", "let",
                       "talk", "point", "fact", "ensure"))) %>%
  group_by(`company size`, term) %>%
  # After summarize() the data is still grouped by company size, so
  # slice_max() picks the top terms within each company size.
  summarize(frequency = sum(count)) %>%
  slice_max(frequency, n = 4) %>%
  ggplot(aes(reorder(term, frequency), frequency, fill = `company size`)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~`company size`, scales = "free_x") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  # "none" replaces the deprecated `fill = FALSE` for hiding the legend.
  guides(fill = "none") +
  ggtitle("Answer = Yes") +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(face = "bold", color = "black", size = 10))
b2
#top 5 words per company size in no (slice_max keeps ties, so a panel can
#show more than 5)
# A few uninformative terms are excluded before counting.
b3 <- filter(no_tf_idf, !(term %in% c("issue", "issues", "work"))) %>%
  group_by(`company size`, term) %>%
  # After summarize() the data is still grouped by company size, so
  # slice_max() picks the top terms within each company size.
  summarize(frequency = sum(count)) %>%
  slice_max(frequency, n = 5) %>%
  ggplot(aes(reorder(term, frequency), frequency, fill = `company size`)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~`company size`, scales = "free_x") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  # "none" replaces the deprecated `fill = FALSE` for hiding the legend.
  guides(fill = "none") +
  ggtitle("Answer = No") +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(face = "bold", color = "black", size = 10))
b3
#import Hu & Liu Dictionary
# Positive and negative word lists, read as character (as.is = TRUE) into
# one-column data frames.
# NOTE(review): both paths are absolute paths on one machine's D: drive --
# the script will not run elsewhere until they are made relative or
# configurable.
pos <- read.table("D:/QMSS 2021 SPRING/DV/course_content4/Lectures/Week09/data/dictionaries/positive-words.txt", as.is = TRUE)
neg <- read.table("D:/QMSS 2021 SPRING/DV/course_content4/Lectures/Week09/data/dictionaries/negative-words.txt", as.is = TRUE)
#define sentiment function
# Polarity score of a text: (positive - negative) / (positive + negative),
# counting tokens found in the first column of the global `pos` and `neg`
# word lists.
#
# words: a character string; only the first document produced by
#        quanteda::tokens() is scored.
# Returns a number in [-1, 1], or NA when the text contains no word from
# either list (the ratio would otherwise be 0/0).
sentiment <- function(words) {
  tok <- quanteda::tokens(words)
  # Compare in lower case on both sides so capitalised words ("Good") are
  # matched as well.
  first_doc <- tolower(as.character(tok[[1]]))
  pos.count <- sum(first_doc %in% tolower(pos[, 1]))
  neg.count <- sum(first_doc %in% tolower(neg[, 1]))
  matched <- pos.count + neg.count
  if (matched == 0) {
    return(NA_real_)
  }
  (pos.count - neg.count) / matched
}
#calculate the sentiment
# vapply() guarantees one numeric score per text; sapply() would silently
# return a list if there were no rows or a call returned something else.
# NOTE(review): `mental_health` and its `text` column are not created in the
# code visible above -- confirm where they come from.
mental_health$sentiment <- vapply(mental_health$text, sentiment, numeric(1))
#plot the relationship between sentiment and answer
# Box plots of the score per answer, one facet per company size, with the
# per-answer mean overlaid as a white diamond.
p1 <- ggplot(mental_health, aes(x = bringup_issue, y = sentiment)) +
  geom_boxplot(aes(fill = `company size`)) +
  stat_summary(mapping = aes(group = bringup_issue), fun = "mean",
               geom = "point", shape = 23, size = 3, fill = "white") +
  facet_wrap(~`company size`) +
  scale_fill_manual(values = c("#8DA0CB", "#FC8D62", "#66C2A5")) +
  labs(title = "Distribution of Sentiment Score by Answer and Company Size",
       y = "Sentiment Score") +
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        axis.title.y = element_text(vjust = 2),
        panel.background = element_blank(),
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank(),
        plot.title = element_text(face = "bold", color = "black", size = 12))
ggplotly(p1, tooltip = "sentiment")
library(stringr)
library(waffle)
library(viridis)
# Give the work-position question a short name (setnames() renames the
# column of `mental` in place).
setnames(
  mental,
  "Which.of.the.following.best.describes.your.work.position.",
  "work_position_general"
)
# Answers can list several positions separated by "|"; keep the first one.
work_position <- mental$work_position_general
work_position <- word(work_position, 1, sep = "\\|")
work_position <- as.data.frame(work_position)
mental <- cbind(mental, work_position)
# Print the most frequent positions, leaving out "Other".
plyr::count(mental, "work_position") %>%
  arrange(desc(freq)) %>%
  filter(work_position != "Other") %>%
  top_n(6)
# Frequency table of the answers to the "hurt your career" question.
mental$mental_hurt_career <- as.character(mental$mental_hurt_career)
hurt_career <- as.data.frame(table(mental$mental_hurt_career))
names(hurt_career) <- c(
  "Do you feel that a mental health issue would hurt your career?",
  "Frequency"
)
hurt_career
#plyr::count(mental$mental_hurt_career)
#arrange by job types
# One waffle chart per work position; each shows how the respondents in that
# position answered the "hurt your career" question.
mental_wp_back <- dplyr::select(
  dplyr::filter(mental, work_position == "Back-end Developer"),
  mental_hurt_career, work_position
)
back <- table(mental_wp_back$mental_hurt_career)
back_w <- waffle(back, rows = 12) +
  labs(title = "Back-end Developer") +
  theme(plot.title = element_text(color = "black", size = 10)) +
  gradient_color("RdYlBu")

mental_wp_front <- dplyr::select(
  dplyr::filter(mental, work_position == "Front-end Developer"),
  mental_hurt_career, work_position
)
front <- table(mental_wp_front$mental_hurt_career)
front_w <- waffle(front, rows = 9) +
  labs(title = "Front-end Developer") +
  theme(plot.title = element_text(color = "black", size = 10)) +
  gradient_color("RdYlBu")

mental_wp_lead <- dplyr::select(
  dplyr::filter(mental, work_position == "Supervisor/Team Lead"),
  mental_hurt_career, work_position
)
lead <- table(mental_wp_lead$mental_hurt_career)
lead_w <- waffle(lead, rows = 9) +
  labs(title = "Supervisor/Team Lead") +
  theme(plot.title = element_text(color = "black", size = 10)) +
  gradient_color("RdYlBu")

mental_wp_devops <- dplyr::select(
  dplyr::filter(mental, work_position == "DevOps/SysAdmin"),
  mental_hurt_career, work_position
)
devops <- table(mental_wp_devops$mental_hurt_career)
devops_w <- waffle(devops, rows = 9) +
  labs(title = "DevOps/SysAdmin") +
  theme(plot.title = element_text(color = "black", size = 10)) +
  gradient_color("RdYlBu")

mental_wp_advocate <- dplyr::select(
  dplyr::filter(mental, work_position == "Dev Evangelist/Advocate"),
  mental_hurt_career, work_position
)
advocate <- table(mental_wp_advocate$mental_hurt_career)
advocate_w <- waffle(advocate, rows = 6) +
  labs(title = "Dev Evangelist/Advocate") +
  theme(plot.title = element_text(color = "black", size = 10)) +
  gradient_color("RdYlBu")

mental_wp_support <- dplyr::select(
  dplyr::filter(mental, work_position == "Support"),
  mental_hurt_career, work_position
)
support <- table(mental_wp_support$mental_hurt_career)
support_w <- waffle(support, rows = 6) +
  labs(title = "Support") +
  theme(plot.title = element_text(color = "black", size = 10)) +
  gradient_color("RdYlBu")

# Combine the six charts in a three-row grid, filled column by column.
back_w + lead_w + front_w + devops_w + advocate_w + support_w +
  plot_annotation(
    title = "Will identification of mental health issue would hurt your career?",
    subtitle = "Faceting by job types"
  ) +
  plot_layout(nrow = 3, byrow = FALSE)
#arrange answers
# Respondents in the six selected work positions. "Other" is not in the
# list, so no separate exclusion step is needed.
top6 <- mental %>%
  dplyr::select(mental_hurt_career, work_position) %>%
  dplyr::filter(work_position %in% c(
    "Back-end Developer", "Front-end Developer", "Support",
    "Supervisor/Team Lead", "DevOps/SysAdmin", "Dev Evangelist/Advocate"
  ))
# One waffle chart per answer, showing the mix of work positions among the
# respondents who gave that answer. The empty-title and duplicate theme
# layers that were immediately overridden have been removed.
mental_maybe <- top6 %>% dplyr::filter(mental_hurt_career == "Maybe")
maybe <- table(mental_maybe$work_position)
maybe_w <- waffle(maybe, rows = 16, legend_pos = "bottom") +
  gradient_color("RdYlBu") +
  labs(title = "Answer: Maybe") +
  theme(plot.title = element_text(color = "black", size = 10))
mental_yes <- top6 %>% dplyr::filter(mental_hurt_career == "Yes")
yes <- table(mental_yes$work_position)
yes_w <- waffle(yes, rows = 16, legend_pos = "bottom") +
  gradient_color("RdYlBu") +
  labs(title = "Answer: Yes") +
  theme(plot.title = element_text(color = "black", size = 10))
mental_no <- top6 %>% dplyr::filter(mental_hurt_career == "No")
no <- table(mental_no$work_position)
no_w <- waffle(no, rows = 8, legend_pos = "bottom") +
  gradient_color("RdYlBu") +
  labs(title = "Answer: No") +
  theme(plot.title = element_text(color = "black", size = 10))
# Three charts in one row with the legends collected; `&` applies the legend
# position to every panel.
yes_w + no_w + maybe_w +
  plot_annotation(
    title = "Will identification of mental health issue would hurt your career?",
    subtitle = "Faceting by people who think mental health issue would/maybe/would not hurt their careers"
  ) +
  plot_layout(ncol = 3, nrow = 1, byrow = FALSE, guides = "collect") &
  theme(legend.position = "bottom")
#answer = maybe
#create corpus
# NOTE(review): `mental_health` is not created in the code visible above --
# confirm where it is loaded.
mental_health_maybe_job <- filter(mental_health, bringup_issue == "Maybe")
# seq_len() yields an empty vector for a zero-row subset, whereas 1:nrow()
# would yield c(1, 0).
mental_health_maybe_job$doc_id <-
  as.character(seq_len(nrow(mental_health_maybe_job)))
# Columns are picked by position; DataframeSource() expects the id and the
# text as the first two columns.
# NOTE(review): presumably 68 = doc_id, 38 = free text, 66/63 = job type and
# its colour -- confirm against the column layout.
mental_health_maybe_job <- mental_health_maybe_job[, c(68, 38, 66, 63)]
maybe_job_corpus <- DataframeSource(mental_health_maybe_job) %>% VCorpus(.)
#clean
maybe_job_clean <- clean_corpus(maybe_job_corpus)
maybe_job_stemmed <- tm_map(maybe_job_clean, stemDocument)
maybe_job_compl <- lapply(maybe_job_stemmed, stemCompletion2,
                          dictionary = maybe_job_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
maybe_job_tdm <- TermDocumentMatrix(maybe_job_compl)
# Term counts per document joined to the per-document metadata, with
# tf / idf / tf-idf added; highest tf-idf first.
maybe_job_tf_idf <- tidy(maybe_job_tdm) %>%
  left_join(., mental_health_maybe_job[, c(1, 3, 4)],
            by = c("document" = "doc_id")) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# NOTE(review): columns 1 and 6 are picked by position -- confirm that
# column 6 is the intended weight column.
wordcloud2(maybe_job_tf_idf[, c(1, 6)], color = maybe_job_tf_idf$color_job,
           shape = "diamond")
#answer = yes
#create corpus
# NOTE(review): `mental_health` is not created in the code visible above --
# confirm where it is loaded.
mental_health_yes_job <- filter(mental_health, bringup_issue == "Yes")
# seq_len() yields an empty vector for a zero-row subset, whereas 1:nrow()
# would yield c(1, 0).
mental_health_yes_job$doc_id <-
  as.character(seq_len(nrow(mental_health_yes_job)))
# Columns are picked by position; DataframeSource() expects the id and the
# text as the first two columns.
# NOTE(review): presumably 68 = doc_id, 38 = free text, 66/63 = job type and
# its colour -- confirm against the column layout.
mental_health_yes_job <- mental_health_yes_job[, c(68, 38, 66, 63)]
yes_job_corpus <- DataframeSource(mental_health_yes_job) %>% VCorpus(.)
#clean
yes_job_clean <- clean_corpus(yes_job_corpus)
yes_job_stemmed <- tm_map(yes_job_clean, stemDocument)
yes_job_compl <- lapply(yes_job_stemmed, stemCompletion2,
                        dictionary = yes_job_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
yes_job_tdm <- TermDocumentMatrix(yes_job_compl)
# Term counts per document joined to the per-document metadata, with
# tf / idf / tf-idf added; highest tf-idf first.
yes_job_tf_idf <- tidy(yes_job_tdm) %>%
  left_join(., mental_health_yes_job[, c(1, 3, 4)],
            by = c("document" = "doc_id")) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# NOTE(review): columns 1 and 6 are picked by position -- confirm that
# column 6 is the intended weight column.
wordcloud2(yes_job_tf_idf[, c(1, 6)], color = yes_job_tf_idf$color_job,
           shape = "diamond")
#answer = no
#create corpus
# NOTE(review): `mental_health` is not created in the code visible above --
# confirm where it is loaded.
mental_health_no_job <- filter(mental_health, bringup_issue == "No")
# seq_len() yields an empty vector for a zero-row subset, whereas 1:nrow()
# would yield c(1, 0).
mental_health_no_job$doc_id <-
  as.character(seq_len(nrow(mental_health_no_job)))
# Columns are picked by position; DataframeSource() expects the id and the
# text as the first two columns.
# NOTE(review): presumably 68 = doc_id, 38 = free text, 66/63 = job type and
# its colour -- confirm against the column layout.
mental_health_no_job <- mental_health_no_job[, c(68, 38, 66, 63)]
no_job_corpus <- DataframeSource(mental_health_no_job) %>% VCorpus(.)
#clean
no_job_clean <- clean_corpus(no_job_corpus)
no_job_stemmed <- tm_map(no_job_clean, stemDocument)
no_job_compl <- lapply(no_job_stemmed, stemCompletion2,
                       dictionary = no_job_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
no_job_tdm <- TermDocumentMatrix(no_job_compl)
# Term counts per document joined to the per-document metadata, with
# tf / idf / tf-idf added; highest tf-idf first.
no_job_tf_idf <- tidy(no_job_tdm) %>%
  left_join(., mental_health_no_job[, c(1, 3, 4)],
            by = c("document" = "doc_id")) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# NOTE(review): columns 1 and 6 are picked by position -- confirm that
# column 6 is the intended weight column.
wordcloud2(no_job_tf_idf[, c(1, 6)], color = no_job_tf_idf$color_job,
           shape = "diamond")
#top 5 words per job type in maybe (slice_max keeps ties, so a panel can
#show more than 5)
# Table form of the same ranking. Note that it overwrites the earlier `df`
# and, unlike the plot below, does not exclude the term "issue".
# NOTE(review): this `df` is not used again in the visible code -- confirm
# it is still needed.
df <- filter(maybe_job_tf_idf,
             !(term %in% c("someone", "issues", "work", "etc", "just",
                           "felt"))) %>%
  group_by(job_type, term) %>%
  summarize(frequency = sum(count)) %>%
  slice_max(frequency, n = 5)
b4 <- filter(maybe_job_tf_idf,
             !(term %in% c("someone", "issues", "work", "etc", "just",
                           "felt", "issue"))) %>%
  group_by(job_type, term) %>%
  # After summarize() the data is still grouped by job type, so slice_max()
  # picks the top terms within each job type.
  summarize(frequency = sum(count)) %>%
  slice_max(frequency, n = 5) %>%
  ggplot(aes(reorder(term, frequency), frequency, fill = job_type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~job_type, scales = "free_x") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  # "none" replaces the deprecated `fill = FALSE` for hiding the legend.
  guides(fill = "none") +
  ggtitle("Answer = Maybe") +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(face = "bold", color = "black", size = 10))
b4
#top 5 words per job type in yes (slice_max keeps ties, so a panel can show
#more than 5)
# Uninformative terms are excluded before counting.
b5 <- filter(yes_job_tf_idf,
             !(term %in% c("issues", "work", "just", "felt", "though",
                           "keep", "front", "can", "time", "table", "part",
                           "looking", "ahead", "employers", "either",
                           "aware", "exactly", "end", "fair", "even",
                           "cards"))) %>%
  group_by(job_type, term) %>%
  # After summarize() the data is still grouped by job type, so slice_max()
  # picks the top terms within each job type.
  summarize(frequency = sum(count)) %>%
  slice_max(frequency, n = 5) %>%
  ggplot(aes(reorder(term, frequency), frequency, fill = job_type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~job_type, scales = "free_x") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  # "none" replaces the deprecated `fill = FALSE` for hiding the legend.
  guides(fill = "none") +
  ggtitle("Answer = Yes") +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(face = "bold", color = "black", size = 10))
b5
#top 5 words per job type in no (slice_max keeps ties, so a panel can show
#more than 5)
# Uninformative terms are excluded before counting.
b6 <- filter(no_job_tf_idf,
             !(term %in% c("issues", "work", "etc", "just", "felt",
                           "issue"))) %>%
  group_by(job_type, term) %>%
  # After summarize() the data is still grouped by job type, so slice_max()
  # picks the top terms within each job type.
  summarize(frequency = sum(count)) %>%
  slice_max(frequency, n = 5) %>%
  ggplot(aes(reorder(term, frequency), frequency, fill = job_type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~job_type, scales = "free_x") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  # "none" replaces the deprecated `fill = FALSE` for hiding the legend.
  guides(fill = "none") +
  ggtitle("Answer = No") +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(face = "bold", color = "black", size = 10))
b6
#plot the relationship between sentiment and answer
# Box plots of the sentiment score per answer, one facet per job type, with
# the per-answer mean overlaid as a white diamond (shape 23).
p2 <- ggplot(mental_health, aes(x = bringup_issue, y = sentiment)) +
  geom_boxplot(aes(fill = job_type)) +
  stat_summary(
    mapping = aes(group = bringup_issue),
    fun = "mean",
    geom = "point",
    shape = 23,
    size = 3,
    fill = "white"
  ) +
  facet_wrap(~job_type) +
  scale_fill_manual(values = c("#8DA0CB", "#FC8D62", "#66C2A5")) +
  labs(
    title = "Distribution of Sentiment Score by Answer and Job Type",
    y = "Sentiment Score"
  ) +
  # No legend and no x-axis title; only horizontal major grid lines remain.
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.title.y = element_text(vjust = 2),
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "gray50", size = 0.5),
    panel.grid.major.x = element_blank(),
    plot.title = element_text(face = "bold", color = "black", size = 12)
  )
ggplotly(p2, tooltip = "sentiment")