Sentiments on PhD Programs among Reddit Users

Jaegeon Lee

2024-11-11

# Package names
packages <- c("RedditExtractoR", "anytime", "magrittr", "httr", "tidytext", "tidyverse", "igraph", "ggraph", "wordcloud2", "textdata", "sf", "tmap", "here", "tsibble")

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}

# Load packages
invisible(lapply(packages, library, character.only = TRUE))
library(devtools)
## Loading required package: usethis
devtools::install_github("lchiffon/wordcloud2")
## Using GitHub PAT from the git credential store.
## Skipping install of 'wordcloud2' from a github remote, the SHA1 (8a12a3b6) has not changed since last install.
##   Use `force = TRUE` to force installation
library(wordcloud2)

Describe in one sentence what you aim to examine using user-generated text data and sentiment analysis.

I examine how the discourse surrounding pursuing a PhD changes across the seasons of a year (from Nov. 2023 to Nov. 2024).

Data collection and clean up

Download

I first searched for subreddits matching the keyword "PhD". Among the 73 subreddits returned, I chose three, namely PhD, AskAcademia, and GradSchool, because each subreddit's name is highly relevant to the keyword and each has a large number of subscribers.

keyword <- "PhD"
subreddit_list <- RedditExtractoR::find_subreddits(keywords = keyword)
subreddit_list %>%
  select(subreddit, title, subscribers) %>% 
  arrange(desc(subscribers)) %>%
  head(5) %>%
  knitr::kable()

I downloaded threads from each subreddit.

threads_1 <- find_thread_urls(subreddit = "PhD", 
                              sort_by = 'top', 
                              period = 'year') %>% 
  drop_na()
rownames(threads_1) <- NULL

threads_2 <- find_thread_urls(subreddit = "AskAcademia", 
                              sort_by = 'top', 
                              period = 'year') %>% 
  drop_na()
rownames(threads_2) <- NULL

threads_3 <- find_thread_urls(subreddit = "GradSchool", 
                              sort_by = 'top', 
                              period = 'year') %>% 
  drop_na()
rownames(threads_3) <- NULL

I filtered out threads that do not contain the keyword (allowing for slight variations in how it is written) in either the title or the text.

# Regex covering common variants of the keyword
phd_criterion <- "PhD|phd|Phd|Ph\\.D\\.|PhD\\.|Ph\\.D|doctorate|Doctorate|Doctoral|doctoral"

# Keep only threads whose title or text mentions the keyword
print(nrow(threads_1))
threads_1 <- threads_1 %>%
  filter(str_detect(title, phd_criterion) | str_detect(text, phd_criterion))
print(nrow(threads_1))

print(nrow(threads_2))
threads_2 <- threads_2 %>%
  filter(str_detect(title, phd_criterion) | str_detect(text, phd_criterion))
print(nrow(threads_2))

print(nrow(threads_3))
threads_3 <- threads_3 %>%
  filter(str_detect(title, phd_criterion) | str_detect(text, phd_criterion))
print(nrow(threads_3))
threads <- threads_1 %>%
  bind_rows(threads_2) %>%
  bind_rows(threads_3)
threads
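Since the same filter runs three times above, a small helper function would avoid the repetition. A minimal sketch (the name keep_phd_threads is mine, not part of the original pipeline):

# hypothetical helper (sketch): apply the keyword filter to any thread table
keep_phd_threads <- function(df, pattern = phd_criterion) {
  # keep threads whose title or text matches the keyword pattern
  df %>%
    filter(str_detect(title, pattern) | str_detect(text, pattern))
}
# usage: threads_1 <- keep_phd_threads(threads_1)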

# saving on my local env
#threads %>%
#  write_csv("asset/threads.csv")

Import pre-processed data

# import from my local env
threads <- read_csv("asset/threads.csv")
## Rows: 1324 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): title, text, subreddit, url
## dbl  (2): timestamp, comments
## date (1): date_utc
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
threads
## # A tibble: 1,324 × 7
##    date_utc    timestamp title                    text  subreddit comments url  
##    <date>          <dbl> <chr>                    <chr> <chr>        <dbl> <chr>
##  1 2024-10-30 1730301090 "It's finally done - di… "I w… PhD             18 http…
##  2 2024-10-12 1728769540 "I often feel like I'm … "I o… PhD             17 http…
##  3 2024-10-02 1727863168 "Am I finished my PhD"    <NA> PhD              5 http…
##  4 2024-02-20 1708388079 "Turned down potentiall… "Lik… PhD             32 http…
##  5 2023-12-16 1702768786 "What\u0019s your field… "I\u… PhD            466 http…
##  6 2023-12-14 1702566888 "Did you become friends… "I a… PhD             94 http…
##  7 2024-08-10 1723270716 "F1 as PhD students =\u…  <NA> PhD              4 http…
##  8 2024-07-31 1722406410 "Failed my PhD Proposal… "Hey… PhD             29 http…
##  9 2024-04-16 1713267474 "Absolutely hate the ex… "I p… PhD             56 http…
## 10 2024-09-09 1725841208 "I feel like the lazies… "I h… PhD             26 http…
## # ℹ 1,314 more rows
#rm(threads_1, threads_2, threads_3)

# Parse the date and drop rows without one
threads %<>% 
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date))

The dataset spans Nov. 2023 to Nov. 2024. The number of threads peaks in March and September, which fall in the middle of the Spring and Fall semesters, respectively.

# number of threads by week
threads %>% 
  ggplot(aes(x = date)) +
  geom_histogram(color = "black", position = 'stack', binwidth = 604800) + # 604800 seconds = 1 week
  scale_x_datetime(date_labels = "%b %y",
                   breaks = seq(min(threads$date, na.rm = T), 
                                max(threads$date, na.rm = T), 
                                by = "1 month")) +
  theme_minimal()
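To read the peak months off directly rather than eyeballing the histogram, a quick monthly count helps. A minimal sketch (using tsibble::yearmonth, which is loaded above):

# monthly thread counts (sketch): the top rows should match the histogram's peaks
threads %>%
  mutate(yearmonth = tsibble::yearmonth(date)) %>%
  count(yearmonth, sort = TRUE) %>%
  head(5)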

Tokenization and stop words

I tokenized the texts into words prior to the sentiment analysis, which is the main part of this study.

# Word tokenization
words <- threads %>% 
  tidytext::unnest_tokens(output = word, input = text, token = "words") %>%
  select(date, subreddit, title, comments, word)
words
## # A tibble: 276,736 × 5
##    date                subreddit title                            comments word 
##    <dttm>              <chr>     <chr>                               <dbl> <chr>
##  1 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 i    
##  2 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 want…
##  3 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 to   
##  4 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 share
##  5 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 some 
##  6 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 happy
##  7 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 news 
##  8 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 i    
##  9 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 subm…
## 10 2024-10-30 00:00:00 PhD       It's finally done - dissertatio…       18 my   
## # ℹ 276,726 more rows
words %>%
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")
## Selecting by n

I referred to a list of "stop words" and removed them from the word list.

# load list of stop words - from the tidytext package
data("stop_words")

stop_words %>% head()
## # A tibble: 6 × 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART
# Regex that matches URL-type string
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # drop non-alphabet-only strings
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removal of the stop words. There should be fewer words now
print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 276736, After: 87441

Obviously, the keyword "phd" has the highest frequency. It is helpful to remove the keyword before drawing a word cloud.

words_clean %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique wordcounts")



Generate a word cloud that illustrates the frequency of words except your keyword.

words_clean %>% 
  group_by(word) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## # A tibble: 10,818 × 2
##    word     count
##    <chr>    <int>
##  1 phd       2137
##  2 time       849
##  3 feel       773
##  4 research   762
##  5 people     586
##  6 student    527
##  7 students   501
##  8 school     468
##  9 job        451
## 10 lab        439
## # ℹ 10,808 more rows

It is worth noting that PhD students, or more broadly those concerned with this topic, also mention "time" and "research" often. I also see school-related words like "student". Words like "academia", "field", "PI" (principal investigator), and "advisor" are more particularly associated with the life of PhD students.
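To check these impressions against the counts, one can look up the specific words. A minimal sketch (note that unnest_tokens lowercases tokens, so "PI" appears as "pi"):

# counts for a few PhD-life words (sketch)
words_clean %>%
  count(word, sort = TRUE) %>%
  filter(word %in% c("academia", "field", "pi", "advisor"))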

n <- 20
h <- runif(n, 0, 1) # any hue
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
# color the 20 most frequent words; every word after that gets grey
pal <- c(pal, rep("grey", 10000))

words_clean %>% 
  filter(word != "phd") %>%
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)



Conduct a tri-gram analysis.

Extract tri-grams from your text data.

I extracted tri-grams using the unnest_tokens function.

# Get n=3 grams
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_trigram <- threads %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)
words_trigram
## # A tibble: 273,675 × 1
##    paired_words            
##    <chr>                   
##  1 i wanted to             
##  2 wanted to share         
##  3 to share some           
##  4 share some happy        
##  5 some happy news         
##  6 happy news i            
##  7 news i submitted        
##  8 i submitted my          
##  9 submitted my doctoral   
## 10 my doctoral dissertation
## # ℹ 273,665 more rows

Remove tri-grams containing stop words or non-alphabetic terms.

# separate the tri-grams into three word columns
words_trigram_stretched <- words_trigram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# filter out rows where any of the three word columns contains a stop word
words_trigram_stretched <- words_trigram_stretched %>%
  # drop stop words
  filter(!word1 %in% stop_words$word &
         !word2 %in% stop_words$word &
         !word3 %in% stop_words$word) %>% 
  # drop strings with no alphabetic characters
  filter(str_detect(word1, "[a-z]") &
         str_detect(word2, "[a-z]") &
         str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
library(stringi)
words_trigram_stretched %<>% 
  filter(stri_enc_isascii(word1) &
         stri_enc_isascii(word2) &
         stri_enc_isascii(word3))

Present the frequency of tri-grams in a table.

# Sort the tri-gram (n=3) counts:
words_counts <- words_trigram_stretched %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

words_counts %>% 
  head(10) %>%
  knitr::kable()
| word1    | word2   | word3    |   n |
|----------|---------|----------|----:|
| academic | job     | market   |   6 |
| post     | doc     | position |   6 |
| tenure   | track   | job      |   6 |
| fellow   | grad    | students |   4 |
| senior   | grad    | student  |   4 |
| social   | science | phd      |   4 |
| started  | grad    | school   |   4 |
| students | post    | docs     |   4 |
| sunk     | cost    | fallacy  |   4 |
| applied  | stem    | field    |   3 |

Discuss any noteworthy tri-grams you come across.

First, several tri-grams indicate that participants of the selected subreddits are concerned with career paths after graduation (academic job market, post doc position, tenure track job).

Second, PhD programs in certain fields (social sciences, humanities) are mentioned more often, despite the fact that engineering PhD students outnumber them at most universities in the US. Does this indicate that PhD students in certain fields are more likely to engage in (online) discussions?

Third, several words indicate which year a student is in, such as "fellow", "senior", or "incoming". I guess that social life within the PhD student community is very important.
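One way to probe the field question is to list the most frequent tri-grams ending in "phd". A quick sketch using the words_counts table from above:

# tri-grams whose last word is "phd" (tokens are lowercased)
words_counts %>%
  filter(word3 == "phd") %>%
  head(10)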



Perform a sentiment analysis on your text data using a dictionary method that accommodates negations.

I used the sentimentr package, which handles negations and takes other valence shifters (amplifiers, de-amplifiers, adversative conjunctions) into account.

library(sentimentr)
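As a quick sanity check that negation is handled, a negated sentence should score lower than its plain counterpart. A minimal sketch (the example sentences are mine):

# negation check (sketch): the second sentence should receive a lower,
# likely negative, score than the first
sentiment(c("I am happy with my program.",
            "I am not happy with my program."))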

I cleaned up the texts by removing URLs and HTML entities, replacing missing titles and texts with empty strings, and concatenating each title with its text.

replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

threads_clean <- threads %>%
  mutate(title = str_replace_all(title, replace_reg, ""),
         text = str_replace_all(text, replace_reg, "")) %>%
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". ")) 
threads_clean %>% head()
## # A tibble: 6 × 9
##   date_utc    timestamp title text  subreddit comments url   date               
##   <date>          <dbl> <chr> <chr> <chr>        <dbl> <chr> <dttm>             
## 1 2024-10-30 1730301090 "It'… "I w… PhD             18 http… 2024-10-30 00:00:00
## 2 2024-10-12 1728769540 "I o… "I o… PhD             17 http… 2024-10-12 00:00:00
## 3 2024-10-02 1727863168 "Am … ""    PhD              5 http… 2024-10-02 00:00:00
## 4 2024-02-20 1708388079 "Tur… "Lik… PhD             32 http… 2024-02-20 00:00:00
## 5 2023-12-16 1702768786 "Wha… "I\u… PhD            466 http… 2023-12-16 00:00:00
## 6 2023-12-14 1702566888 "Did… "I a… PhD             94 http… 2023-12-14 00:00:00
## # ℹ 1 more variable: title_text <chr>

I sliced each text into sentences.

threads_clean_getsen <- threads_clean %>%
  pull(title_text) %>%
  get_sentences()
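For illustration, get_sentences splits each document at sentence boundaries before any scoring happens. A tiny sketch with a made-up input:

# one input document becomes a vector of sentences
get_sentences("I passed my defense. It still feels unreal.")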

I applied the sentiment_by function to each text (the function keeps track of where each sentence begins and ends within a text).

threads_clean_sentiment <- sentiment_by(threads_clean_getsen) %>% 
  arrange(desc(ave_sentiment)) 
threads_clean_sentiment %>% 
  head(10) %>%
  knitr::kable()
| element_id | word_count |        sd | ave_sentiment |
|-----------:|-----------:|----------:|--------------:|
|        738 |         41 | 0.4322480 |     0.5681862 |
|        102 |         67 | 0.4867714 |     0.5546764 |
|        565 |         91 | 0.6738206 |     0.5191390 |
|        200 |        204 | 0.5285309 |     0.5153609 |
|        454 |         32 | 0.5006943 |     0.5134390 |
|        262 |         29 |        NA |     0.5013774 |
|        775 |        215 | 0.4918146 |     0.4566643 |
|        744 |         98 | 0.3095693 |     0.4546383 |
|        962 |        122 | 0.6788061 |     0.4500016 |
|        600 |         12 |        NA |     0.4474465 |



Display 10 sample texts alongside their sentiment scores and evaluate the credibility of the sentiment analysis outcomes.

Using 0 as the cutoff, 915 texts have positive sentiment and 409 texts have zero or negative sentiment.

threads_clean_sentiment_ex <- threads_clean %>%
  bind_cols(threads_clean_sentiment) %>%
  rename(sentiment = ave_sentiment) %>%
  mutate(sentiment_abs = abs(sentiment),
         sentiment_binary = case_when(sentiment > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>%
  group_by(sentiment_binary) %>%
  arrange(desc(sentiment_abs))
threads_clean_sentiment_ex %>% 
  select(sentiment_binary, sentiment_abs) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   sentiment_binary [2]
##   sentiment_binary     n
##   <chr>            <int>
## 1 negative           409
## 2 positive           915

The following is a list of the five texts with the largest negative sentiment scores. The two texts at the top of the list concern the writers, likely PhD students, worrying about interactions with their advisers: they feel their advisers think they are not working hard enough or lack motivation. The fifth text is about a student tying weight gain to the hardships of graduate school and struggling with self-esteem. However, the dictionary-based sentiment analysis is far from perfect: the third and fourth texts in fact convey positive sentiment. While each of those writers had struggled with mental health or disabilities, they are sharing stories of how they overcame those hardships during graduate school. I suspect the method picks up the negative expressions but fails to capture the positive sense the writers put forth at the end, as the conclusions of their stories.

threads_clean_sentiment_ex %>% 
  filter(sentiment_binary == 'negative') %>%
  head(5) %>% 
  select(sentiment_binary, sentiment_abs, title_text) %>%
  rowwise() %>%
  mutate(title_text = str_c(sentiment_binary, sentiment_abs, title_text, sep="\n")) %>%
  pull(title_text) %>%
  cat(sep = "\n===============\n")
## negative
## 0.436564125065399
## Prof reached out to mention I'm "less engaged" recently and I'm worried.. I got an email today from a professor I respect basically wondering why I wasn't at the last group meeting for our research collective and to mention that this spring, I seem to be somewhat "less engaged" in the materials and guest speakers than usual. So of course, I'm awake at almost 2:00 am worrying about this and what it means.
## 
## I'm in an MA/PhD program and I have my MA exams coming up in a week, and I technically work two jobs to afford the ridiculous living cost of the area I'm in. I've also been struggling the last few weeks with serious insomnia issues (think 1-2 hours of sleep a night), so I took a risk and skipped a single meeting for the research collective so I could go home and nap (that didn't work, unfortunately). 
## 
## I thought I was doing okay staying on top of all of my immediate responsibilities despite all this, but with this email, now I'm just not sure. I sent an email back saying I was worried about my exams and fighting off an illness, hence the missed meeting, but I'm still concerned I'm not doing enough.
## 
## Am I panicking over something deeply silly?
## ===============
## negative
## 0.413335104869347
## My PI picks up undergrads like strays. Im a 4th year chem PhD candidate. There are three other grad students in my lab: one 5th year, one 2nd year, and one 1st year. We now have SIX undergrads.
## 
## There are no more desks in the office. We have six undergrads split between four grad students, except its really between three of us because the 2nd year is not capable of mentoring an undergrad. When you factor in all of the lab upkeep jobs, teaching responsibilities, classes, meetings, writing, actually doing our own research&Im fucking exhausted.
## 
## And were getting visiting scholars and rotating 1st years over the summer. No idea where theyre gonna sit.
## 
## Ive had countless conversations about my PI about how the amount of undergrads he takes puts insane amounts of pressure on us and prevents us from making progress on our research. And hell say he understands, then the next day be yelling at me for not getting an experiment completed because I was too busy making sure my undergrad didnt accidentally kill himself in lab. And when I say yelling, I mean YELLING.
## 
## Im just so many layers of done that its not even funny anymore. These ducklings are going to be the death of me.
## ===============
## negative
## 0.360233373968818
## Advice for Grad Students. Hi, I'm a 27F who is currently working on a PhD and one piece of advice that I would like to share that has literally saved my life is: GO TO THERAPY!. For one it's free if you use the on-campus services and two it's confidential. If you are intimidated by 1-on-1 sessions, see if your school has group therapy options. At my university there is a Grad Student Support Group and even an Art Therapy group.
## 
## I really didn't think therapy was for me, but it has truly saved my life multiple times, so please do yourself a favor and check out your university's counseling services.
## ===============
## negative
## 0.348099322761
## Do I deserve my degree if I get accommodations?. I am an upper year PhD student. I have finally caved and got accommodations for my disability. I have ADHD and bipolar disorder. Ive been fighting getting help because I dont feel like I deserve my degree if I get accommodations. I never felt that way in undergrad but my MSc supervisor freaked out when she found out I had a disability. She said I was a liability and tried so hard to get rid of me. I feel like thats what I am - a liability. I will get help through grad school, get to work and realize I cant do my job because I dont have accommodations. And no one cares about accommodating me on the outside. They will just take the next able bodied person. And why should they take me? There are other students that can do the same things as me in less time. And if I dont complete the degree as it is presented like everyone else, do I even deserve it?
## Or is it a presentation award so the university can pat themselves on the back that they helped someone who was disabled?
## 
## EDIT: yes I am a researcher
## 
## UPDATE: thank you so much for your perspectives. I really appreciate you taking the time. I am feeling better about my accommodations. FYI, I graduated with my MSc and no longer have that supervisor for my PhD. I was supposed to do a PhD with her but her freak-out amongst other things convinced me to do my MSc and GTFO. I did report her (for everything she did) and she no longer works at the university. The university was terrified of getting sued on her behalf for the things she did to me and her other students. My current supervisor is a good person but does have ableist tendencies. I did talk to him and share my concerns about accommodations. He was very supportive. He said he just thought I was sick a lot and was upset I didnt do 60 hours like the other students. But he said he understood now and he apologized for forcing me to work.
## ===============
## negative
## 0.304857283640827
## How much weight have you gained!?. Finishing my MS in August. Have been working full time  getting my masters. My life has been shutting my work laptop to opening my personal laptop to do school work. I am definitely the most stationary I have ever been. I feel so burnt out and run down. Ive gained over 30 pounds. Completely miserable and questioning if it was worth the 80k in student loans. 
## 
## I hope this pays off and I hope I dont decide to go back to get another MS or PhD. I need to remember how miserable I am ahahah. Whats it like to be a person? I cant remember
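Because sentiment_by averages sentence-level scores, it is possible to drill into which sentences drive a text's overall score. A minimal sketch using sentimentr's sentence-level sentiment function (the element_id picked here is illustrative):

# per-sentence scores (sketch): inspect the sentences behind one text's average
sentiment(threads_clean_getsen) %>%
  filter(element_id == 3)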

The following is a list of the five texts with the largest positive sentiment scores. The first text, with the highest score, reads positively because the writer is proud of an achievement and grateful to others for the progress. The fifth text encourages discussion with welcoming and accommodating words. However, the second and fourth texts illustrate the limitations of the sentiment analysis. The main theme of the second text is that the writer does not feel competent enough to be in a PhD program; I suspect a few phrases like "good job buddy", "Really? All by yourself?", and "We're so proud" drive the high positive score, while the writer is in fact being sarcastic about those compliments. Similarly, the fourth text is about the writer having a hard time settling into a PhD program, yet its score indicates positive sentiment.

threads_clean_sentiment_ex %>% 
  filter(sentiment_binary == 'positive') %>%
  head(5) %>% 
  select(sentiment_binary, sentiment_abs, title_text) %>%
  rowwise() %>%
  mutate(title_text = str_c(sentiment_binary, sentiment_abs, title_text, sep="\n")) %>%
  pull(title_text) %>%
  cat(sep = "\n===============\n")
## positive
## 0.568186165635548
## It's finally done - dissertation submitted!. I wanted to share some happy news - I submitted my doctoral dissertation for review yesterday! 
## 
## The writing process took about 9 months, during which I literally became a hermit. For the last 6 months, I barely left my apartment - I worked from home, and every afternoon was dedicated to writing the dissertation.
## 
## I must admit this has been the most challenging experience of my life...
## 
## Throughout this process, this subreddit has been a huge support. Reading your experiences helped me realize I wasn't alone and that what I was going through was a normal part of the journey. Your posts and comments gave me strength to keep going.
## 
## To everyone still writing, I wish you lots of success and patience. Fingers crossed for you all to successfully complete your work. You've got this!
## ===============
## positive
## 0.554676442815944
## I often feel like I'm the dumbest doctoral candidate to exist. They keep saying things like "good job buddy" and "Really? All by yourself? We're so proud."/s. I often feel like my doctoral program must be a scam. If they accepted someone as dumb as me into the program there is no way it's legit. The guy who once stopped at a Chinese restaurant just to get soy sauce packets for the rice he cooked at home... and they let ME in to the program. "Sorry everyone, I single handedly just made your degrees mean less." Don't get me wrong, I've put in the hard work in every class and never rode the coat tails of other in my cohort but I often feel like "This year mom and dad let me sit at the grown-ups table for Christmas dinner. Sucks to be you cousin Becky! Enjoy listening to little Jacob explain the plot to Rainbow Friends again" *blows raspberry*
## ===============
## positive
## 0.519138979776862
## Am I finished my PhD. 
## ===============
## positive
## 0.515360890008477
## Turned down potentially only PhD offer!. Like the title says, I officially sent out my emails to the university that admitted me to decline offer of admission.
## 
## It was not a hard choice, Im out of state and they offered me no financial support besides a scholarship that I make more than in one month at my current job =
## 
## So I applied to 4 schools, rejected from 2 and still waiting for 1 ( I pretty much think its a rejection) 
## 
## Cycle 2025 here I come=
## ===============
## positive
## 0.513439036805265
## Whats your field?. Ive noticed that a lot of posts coming from STEM phds. Interested to know - whats your field? Feel free to be specific! Also - if if you started in a different field, tell us where you started and where you are now. 
## 
## Ill go first - started in religious studies - finished with a PhD in bioethics this November.



Discuss intriguing insights derived from the sentiment analysis, supporting your observations with at least two plots.

How does the overall sentiment change over time?

To see how the overall sentiment changes over time, I plotted the distribution of sentiment scores by month from Nov. 2023 to Nov. 2024.

threads_senti_4analysis <- threads_clean %>%
  bind_cols(threads_clean_sentiment) %>%
  rename(sentiment = ave_sentiment) %>%
  mutate(sentiment_abs = abs(sentiment),
         sentiment_binary = case_when(sentiment > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>%
  group_by(sentiment_binary) %>%
  arrange(desc(sentiment_abs))  %>%
  select(date, title, text, sentiment_binary, sentiment_abs)

threads_senti_4analysis %>%
  ungroup() %>%
  select(-text) %>%
  mutate(yearmonth = tsibble::yearmonth(date)) %>%
  group_by(yearmonth, sentiment_binary) %>%
  mutate(idx = str_c(yearmonth, sentiment_binary)) %>%
  # put negative scores back below zero for plotting
  mutate(sentiment = ifelse(sentiment_binary == "negative", sentiment_abs * (-1), sentiment_abs)) %>%
  mutate(sentiment_binary = factor(sentiment_binary)) %>%
  ggplot(aes(x = yearmonth, y = sentiment, group = idx, fill = sentiment_binary)) +
  geom_boxplot(position = position_dodge(width = 0.75)) +
  tsibble::scale_x_yearmonth(date_breaks = "1 month", date_labels = "%b %Y") +
  theme_minimal() +
  labs(
    title = "Average Sentiment Over Time",
    x = "Year-Month",
    y = "Average Sentiment",
    fill = "Sentiment"
  )

There are two interesting points to note in the above graph. First, the negative sentiment scores are smallest in absolute value at the beginning of a semester (Jan 2024 and Aug 2024) and largest at the end of a semester (Nov 2023 and Apr 2024). I suspect the stress of coursework, dissertation proposals, and defenses affects PhD students' well-being and mental health. Second, the positive and negative sides seem to move in opposite directions, forming wider and narrower gaps from month to month. Notably, in each month between Mar 2024 and Nov 2024, the absolute values on both sides rise and fall together, a repeated pattern of polarization in sentiment. This indicates that many PhD students feel differently within the same period, or at least that those who post on Reddit express their feelings quite differently.
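To make the polarization pattern concrete, the monthly mean score on each side and the gap between them can be computed directly. A sketch (pivot_wider comes with tidyverse):

# monthly polarization gap (sketch): mean |score| per side plus their sum,
# i.e., the vertical spread between the two sides in the plot above
threads_senti_4analysis %>%
  ungroup() %>%
  mutate(yearmonth = tsibble::yearmonth(date)) %>%
  group_by(yearmonth, sentiment_binary) %>%
  summarise(mean_abs = mean(sentiment_abs), .groups = "drop") %>%
  pivot_wider(names_from = sentiment_binary, values_from = mean_abs) %>%
  mutate(gap = positive + negative)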

What are the words used when the writers exhibit polarized feelings?

To dive into how the writers expressed their thoughts when they exhibited polarized feelings, I took the texts written in Apr 2024 as an example.

threads_senti_4analysis_polarized <- threads_senti_4analysis %>%
  ungroup() %>%
  mutate(yearmonth = tsibble::yearmonth(date)) %>%
  filter(as.character(yearmonth) == "2024 Apr") %>%
  select(-yearmonth) %>%
  select(-date)

threads_senti_4analysis_polarized %>%
  group_by(sentiment_binary) %>%
  count()
## # A tibble: 2 × 2
## # Groups:   sentiment_binary [2]
##   sentiment_binary     n
##   <chr>            <int>
## 1 negative            37
## 2 positive            81

The two sides are imbalanced (37 negative vs. 81 positive texts), so a fair comparison would draw the same number of examples from each side.
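One way to balance the sides is to sample each group down to the size of the smaller one. A sketch (not applied to the results below; the seed is arbitrary):

# optional balancing (sketch): sample both sides down to the smaller group's size
set.seed(1234)
n_min <- threads_senti_4analysis_polarized %>%
  count(sentiment_binary) %>%
  pull(n) %>%
  min()
threads_senti_balanced <- threads_senti_4analysis_polarized %>%
  group_by(sentiment_binary) %>%
  slice_sample(n = n_min) %>%
  ungroup()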

# Stop word removal and tokenization
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"
phd_criterion <- c("PhD", "phd", "Phd", "Ph.D.", "PhD.")

threads_senti_4analysis_polarized_clean <- threads_senti_4analysis_polarized %>%
  ungroup() %>%
  mutate(title_text = str_c(title, text, sep = "  ")) %>%
  mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
  select(-title) %>%
  select(-text) %>%
  # tokenize
  unnest_tokens(word, title_text, token = "words") %>%
  # remove stop words
  anti_join(stop_words, by = "word") %>%
  filter(str_detect(word, "[a-z]")) %>%
  filter(!word %in% phd_criterion) # remove the keyword itself
threads_senti_4analysis_polarized_clean
## # A tibble: 7,069 × 3
##    sentiment_binary sentiment_abs word      
##    <chr>                    <dbl> <chr>     
##  1 positive                 0.450 absolutely
##  2 positive                 0.450 hate      
##  3 positive                 0.450 experience
##  4 positive                 0.450 posted    
##  5 positive                 0.450 similar   
##  6 positive                 0.450 months    
##  7 positive                 0.450 ago       
##  8 positive                 0.450 student   
##  9 positive                 0.450 london    
## 10 positive                 0.450 housing   
## # ℹ 7,059 more rows

Using the anti_join function, I removed words that appear on both the negative and the positive sides (each side is compared against a snapshot of the other, so the order of removal does not matter).

# negative texts
threads_senti_4analysis_polarized_clean_negative <- threads_senti_4analysis_polarized_clean %>%
  filter(sentiment_binary == "negative") 
# positive texts
threads_senti_4analysis_polarized_clean_positive <- threads_senti_4analysis_polarized_clean %>%
  filter(sentiment_binary == "positive")

# Remove words that are seen in both negative and positive threads.
# Anti-join each side against a snapshot of the other side so that the
# second removal is not affected by the first.
negative_orig <- threads_senti_4analysis_polarized_clean_negative
positive_orig <- threads_senti_4analysis_polarized_clean_positive
threads_senti_4analysis_polarized_clean_negative <- negative_orig %>%
  anti_join(positive_orig, by = 'word')
threads_senti_4analysis_polarized_clean_positive <- positive_orig %>%
  anti_join(negative_orig, by = 'word')

The following word cloud shows words appearing in negative threads. From this, we can see that some of the writers feel pressure to attain tenure positions and grants. I guess most of them are junior professors who still join PhD subreddits after graduation. Besides, I see words like "decline" or "fired", which indicate that some of the writers have recently failed to achieve their goals.

# Wordcloud with a custom color palette
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

threads_senti_4analysis_polarized_clean_negative %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = -pi/6,
       maxRotation = -pi/6,
       rotateRatio = 1)

Meanwhile, from the word cloud of words frequently used in texts with positive sentiment, I notice that the writers usually have good feelings about the core elements of what they do as PhD students, namely research, lab, project, etc. Interestingly, I also see time-related words such as time, month, and day. While I would need to explore each text in more detail, a rough conclusion is that some of the writers find joy in their routine school life.

# Wordcloud with a custom color palette
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

threads_senti_4analysis_polarized_clean_positive %>%
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal,
       minRotation = pi/6,
       maxRotation = pi/6,
       rotateRatio = 1)