1. Choose topic

Examine changes in sentiment towards “internship” over the last 10 years.

2. Search Reddit threads using a keyword of your choice.

Downloading Reddit threads

# using keyword
threads_2 <- find_thread_urls(keywords = "intern", 
                              sort_by = 'relevance', 
                              period = 'all')
colnames(threads_2)
head(threads_2)
#saveRDS(threads_2,file = "(internship)threads_2.rds")

Downloading comments and additional information

# get individual comments
threads_2_content <- get_thread_content(threads_2$url)
names(threads_2_content)
#saveRDS(threads_2_content,file = "(internship)threads_2_content.rds")

# keep comments posted after the cutoff Unix timestamp
# (1639920077 corresponds to 2021-12-19 UTC)
threads_2_content_filter <- threads_2_content$comments[threads_2_content$comments$timestamp > 1639920077, ]
#saveRDS(threads_2_content_filter,file = "(internship)threads_2_content_filter.rds")
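Rather than hard-coding the Unix timestamp, the cutoff can be derived from a date string. A minimal sketch in base R, using a hypothetical 2022-01-01 UTC cutoff (which corresponds to 1640995200):

# derive the Unix timestamp for a chosen cutoff date (hypothetical date)
cutoff <- as.numeric(as.POSIXct("2022-01-01", tz = "UTC"))  # 1640995200
threads_2_content_filter <- threads_2_content$comments[threads_2_content$comments$timestamp > cutoff, ]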

Visualize data

threads_2_content<-readRDS("D:/Storage/Master/23fall/CP6088 Intro to Urban Analytics/6088assn4/(internship)threads_2_content.rds")
threads_2<-readRDS("D:/Storage/Master/23fall/CP6088 Intro to Urban Analytics/6088assn4/(internship)threads_2.rds")

# create new column: date
threads_2 %<>% 
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date))

# number of threads by month
Sys.setlocale(locale = "en_US.UTF-8") # use an English locale so month labels render in English
## [1] "LC_COLLATE=en_US.UTF-8;LC_CTYPE=en_US.UTF-8;LC_MONETARY=en_US.UTF-8;LC_NUMERIC=C;LC_TIME=en_US.UTF-8"
threads_2 %>% 
  ggplot(aes(x = date)) +
  geom_histogram(color="black", position = 'stack', binwidth = 60*60*24*7) +
  scale_x_datetime(date_labels = "%b %Y",
                   breaks = seq(min(threads_2$date, na.rm = T), 
                                max(threads_2$date, na.rm = T), 
                                by = "6 month")) +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3. Clean your text data and then tokenize it.

Before cleaning (raw word tokens)

# Tokenization (word tokens)
words <- threads_2 %>% 
  unnest_tokens(output = word, input = text, token = "words")

words %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique word counts")

# load list of stop words - from the tidytext package
data("stop_words")
# view a random sample of 50 stop words
print(stop_words$word[sample(1:nrow(stop_words), 50)])
##  [1] "early"      "quite"      "be"         "yourselves" "won't"     
##  [6] "new"        "any"        "state"      "will"       "best"      
## [11] "several"    "toward"     "everybody"  "before"     "most"      
## [16] "does"       "off"        "d"          "further"    "doing"     
## [21] "you've"     "often"      "if"         "non"        "although"  
## [26] "allows"     "make"       "you'd"      "newest"     "somehow"   
## [31] "that"       "that's"     "m"          "does"       "whereupon" 
## [36] "grouping"   "becomes"    "about"      "she'd"      "same"      
## [41] "working"    "selves"     "already"    "made"       "beside"    
## [46] "or"         "seems"      "than"       "wouldn't"   "nothing"
# Regex that matches URL-type string
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

words_clean <- threads_2 %>% 
  # drop URLs
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  # Tokenization (word tokens)
  unnest_tokens(word, text, token = "words") %>% 
  # drop stop words
  anti_join(stop_words, by = "word") %>% 
  # keep only tokens that contain at least one letter
  filter(str_detect(word, "[a-z]"))

# Check the number of rows after removal of the stop words. There should be fewer words now
print(
  glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 18318, After: 6166

After cleaning (stop words and URLs removed)

words_clean %>%
  count(word, sort = TRUE) %>%
  top_n(20, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "words",
       y = "counts",
       title = "Unique word counts")

4. Generate a word cloud that illustrates the frequency of words, excluding your keyword.

n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))
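To preview the sampled colors before rendering the cloud (optional; assumes the scales package is installed):

scales::show_col(pal[1:n])  # display the 20 random vivid colors as swatches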

# drop the keyword itself (matches "internship"/"internships"; the bare token "intern" is kept)
words_clean_noself <- words_clean %>% 
  filter(!grepl("internship", word, ignore.case = TRUE))



words_clean_noself %>% 
  count(word, sort = TRUE) %>% 
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

5. Conduct a tri-gram analysis

5.1 Extract tri-grams from your text data

# extract tri-grams (three-word sequences)
words_ngram <- threads_2 %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  select(text) %>%
  unnest_tokens(output = paired_words,
                input = text,
                token = "ngrams",
                n = 3)

# show the 20 most frequent tri-grams
words_ngram %>%
  count(paired_words, sort = TRUE) %>% 
  head(20) %>% 
  knitr::kable()
| paired_words             |   n |
|:-------------------------|----:|
| NA                       | 127 |
| a lot of                 |  18 |
| i feel like              |  12 |
| i am a                   |  10 |
| this is a                |  10 |
| for pathways internships |   9 |
| i don t                  |   9 |
| it is not                |   8 |
| be able to               |   7 |
| feel free to             |   7 |
| i have no                |   7 |
| to answer them           |   7 |
| an unpaid internship     |   6 |
| application window for   |   6 |
| best to answer           |   6 |
| do our best              |   6 |
| for an internship        |   6 |
| have already begun       |   6 |
| he didn t                |   6 |
| i want to                |   6 |

5.2 Remove tri-grams containing stop words or non-alphabetic terms.

words_ngram_pair <- words_ngram %>%
  separate(paired_words, c("word1", "word2", "word3"), sep = " ")

# drop rows where any of word1, word2, word3 is a stop word
words_ngram_pair_filtered <- words_ngram_pair %>%
  # drop stop words
  filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>% 
  # keep only rows where every word contains at least one letter
  filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]") & str_detect(word3, "[a-z]"))

# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'

words_ngram_pair_filtered %<>% 
  filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))

# Sort the new tri-gram (n=3) counts:
words_counts <- words_ngram_pair_filtered %>%
  count(word1, word2, word3) %>%
  arrange(desc(n))

5.3 Present the frequency of tri-grams in a table.

head(words_counts, 10) %>% 
  knitr::kable()
| word1       | word2       | word3                                  |   n |
|:------------|:------------|:---------------------------------------|----:|
| nasa        | internships | staff                                  |   6 |
| pathways    | internships | starting                               |   6 |
| intern      | supporting  | nasa                                   |   4 |
| nasa        | ostem       | intern                                 |   4 |
| ostem       | intern      | supporting                             |   4 |
| _internship | _threads    | _super_thread_other_internship_threads |   3 |
| _other      | _internship | _threads                               |   3 |
| _super      | _thread     | _other                                 |   3 |
| _thread     | _other      | _internship                            |   3 |
| begun       | scheduling  | interviews                             |   3 |
# plot word network
words_counts %>%
  filter(n >= 2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_width = n), edge_alpha = .6) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8) +
  labs(title = "Word Networks",
       x = "", y = "")
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

5.4 Discuss any noteworthy tri-grams you come across

In the tri-gram frequency table, “nasa” appears relatively frequently. Perhaps NASA offers many internship opportunities, or people are especially interested in NASA’s internship opportunities.

“ostem” also appears relatively frequently. OSTEM is NASA’s Office of STEM Engagement, the department that offers its internship opportunities. Since it is also NASA-related, its high frequency is likely because NASA OSTEM keeps posting job opportunities on Reddit.
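As a quick check on this reading, the filtered tri-gram counts built above can be scanned for NASA-related terms (a sketch; column names as defined in words_counts):

# tally tri-grams where any of the three words mentions nasa or ostem
words_counts %>%
  filter(if_any(c(word1, word2, word3), ~ str_detect(.x, "nasa|ostem"))) %>%
  summarise(distinct_trigrams = n(), total_occurrences = sum(n))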

6. Perform a sentiment analysis on your text data using a dictionary method that accommodates negations.

Apply the BERT model

# data ready for BERT
threads_2_bert<-threads_2%>%select(date_utc,timestamp,title,text,subreddit,comments,url)
write.csv(threads_2_bert,file = "threads_2_bert.csv",row.names = FALSE)

Get sentiment scores using the dictionary method for comparison

# Get sentiment scores using the dictionary method for comparison.

reddit_sentiment <- read_csv('sample_reddit_bert.csv', show_col_types = FALSE) %>% 
  drop_na('bert_label') %>%  
  mutate(title = replace_na(title, ""),
         text = replace_na(text, ""),
         title_text = str_c(title, text, sep = ". "))
## New names:
## • `` -> `...1`
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.3.2
## 
## Attaching package: 'sentimentr'
## 
## The following object is masked from 'package:syuzhet':
## 
##     get_sentences
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text) # by string (a group of sentences)

reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)

save(reddit_sentiment, file = "reddit_sentiment1.RData")
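sentimentr serves as the dictionary method here because its valence shifters accommodate negations, as the task requires. A minimal sanity check of that behavior, using made-up sentences rather than the Reddit data:

# the negated sentence should receive a markedly lower average sentiment
sentiment_by(c("I love this internship.", "I do not love this internship."))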

Check the correlation between the sentiment values from two different methods.

load("reddit_sentiment1.RData")

reddit_sentiment <- reddit_sentiment %>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric())

cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict)
## [1] 0.4539148
ggplot(data = reddit_sentiment, aes(x = bert_label_numeric, y = sentiment_dict)) +
  geom_jitter(width = 0.1, height = 0, color = 'Black') +
  geom_hline(yintercept = 0, color = '#FFD700', lwd = 1, linetype = 'dashed') +
  geom_smooth(method = "lm", se = FALSE, color = "gray")
## `geom_smooth()` using formula = 'y ~ x'

A correlation coefficient of 0.45 implies a moderate positive correlation between the BERT scores and the dictionary sentiment scores. The scatter plot above shows the same pattern: the threads rated 4-5 stars by the BERT model mostly score above 0 (i.e., positive) under the dictionary method.
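Because the BERT labels are ordinal (1-5 stars), a rank-based correlation is a natural robustness check alongside Pearson’s r; a sketch, not part of the original run:

# Spearman correlation is invariant to monotone rescaling of either score
cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict,
    method = "spearman")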

7. Display 10 sample texts alongside their sentiment scores and evaluate the credibility of the sentiment analysis outcomes.

BERT: 1 star (negative) vs. 5 stars (positive)

bert_example <- reddit_sentiment %>%
  filter(bert_label_numeric %in% c(1,5)) %>% 
  group_by(bert_label) %>%
  arrange(desc(bert_score)) %>%
  slice_head(n = 5) %>%
  ungroup()

# 1 star
bert_example %>% filter(bert_label_numeric == 1) %>% pull(title_text) %>% print()
## [1] "Fired from tax internship for smelling like weed. Today my boss let me go for \034smelling like weed.\035 I have never smoked, never been around it. Nothing! I even offered to take a drug test! I tried my hardest to convince him that it must be a mistake or my cologne. Despite all of this, he was immovable. Although I hated the job, I feel horrible. Is this racial profiling? Is there anything I can do? Should I just move on? \r\n\r\nI\031m just so dumbfounded. I would understand if I did smoke, but I don\031t! \r\n\r\nAny words of reassurance? =\023"                                                                                                                                                                                                                                                                                                                                        
## [2] "\030Blatantly Illegal:\031 NASCAR Bans White Applicants From \030Diversity Internship\031. "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
## [3] "I Hate That Internships Are Required. Most colleges now require you to have an internship of some kind before graduating.This is completely ridiculous and should be optional.How on earth am I supposed to work an unpaid internship (one recommended to me by my professor),carry a full load of classes, and work an actual real job because I need to pay my bills?I\031m paying for a class to tell me I need experience-no duh.I am failing my internship class even though I have an internship!There should be no other criteria to pass or fail this class other than securing the blasted internship.I don\031t have time to write about how \034enlightening this experience has been\035 when I\031m required to update university postings,send out fundraising letters,and email Congress multiple times a week for this internship.I\031m barely passing my other classes as is and I\031m so tired."
## [4] "Zero internship opportunities at my M7. Every presentation given to us by a company either ends with them saying they are hiring significantly less this year for the summer or they are from a group that is not hiring at all. The one presentation I just got done with, the guy said his team recruited for the summer every year since GFC, and this was the first time in 15 years they would not have interns. There are literally no jobs right now and the situation seems to be getting worse and worse."                                                                                                                                                                                                                                                                                                                                                                                                 
## [5] "12 Months UNPAID internship. What kind of Slavery is this??. "
# 5 star
bert_example %>% filter(bert_label_numeric == 5) %>% pull(title_text) %>% print()
## [1] "Best end-of-internship gift.. "                                                                                                                                                                                                                                                                                                                                                                                                                                             
## [2] "My internship search (incredibly successful). "                                                                                                                                                                                                                                                                                                                                                                                                                             
## [3] "I secured an internship at NASA!. Hello fellow CS majors. I am a non-traditional student (took a multi-year break), and this year has been the toughest job search I have ever had. I was losing hope, but it happened, I got an offer from NASA! This is seriously one of the most awesome moments of my life, and I kind of want to share it with everyone.\r\n\r\nEdit: Thank you, everyone! If I don't respond, sorry. Them DM's be poppin as the youth say these days."
## [4] "Peter Parker's Excellent Internship. "                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [5] "Field-work internships are the best. "

BERT scores seem accurate in evaluating the sentiment of these texts. All of the 5-star examples express excited or ecstatic emotions about internship experiences, and the 1-star texts express anger or frustration. However, BERT evaluates sentiment better on longer texts: for short texts, like the 4th and 5th examples in the 1-star group, the score may be less accurate given the lack of context. The same would be true for a human evaluating sentiment from a single short text. So, generally, the BERT sentiment analysis outcomes are reasonably credible.
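One way to probe this length effect is to compare how strongly the two methods agree on shorter versus longer texts, reusing the word_count column computed earlier. A sketch; the median split point is an arbitrary choice:

# correlation between the two methods, within short vs. long texts
reddit_sentiment %>%
  mutate(length_bin = if_else(word_count > median(word_count), "long", "short")) %>%
  group_by(length_bin) %>%
  summarise(r = cor(bert_label_numeric, sentiment_dict))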

Dictionary method: negative vs. positive

sentimentr_example <- reddit_sentiment %>%
  mutate(sentimentr_abs = abs(sentiment_dict),
         sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
                                       TRUE ~ 'negative')) %>% 
  group_by(sentimentr_binary) %>%
  arrange(desc(sentimentr_abs)) %>%
  slice_head(n = 5) %>%
  ungroup() %>% 
  arrange(sentiment_dict)

# negative
sentimentr_example %>% filter(sentimentr_binary == 'negative') %>% pull(title_text) %>% print()
## [1] "unpaid internship with no benefits. "                                                  
## [2] "how the fuck is \"unpaid internship\" a thing???. "                                    
## [3] "Local asshat complains that college won't let him post a YEAR LONG unpaid internship. "
## [4] "Unpaid Internships. "                                                                  
## [5] "Unpaid internships. "
# positive
sentimentr_example %>% filter(sentimentr_binary == 'positive') %>% pull(title_text) %>% print()
## [1] "2023 Internship Reference Thread. *(Couldn't seem to find one already made but please reference it if so)*\r\n\r\nFeel free to post any of your internship offers for 2023, or gain insight into internship programs for the upcoming year.\r\n\r\nAs a reminder, please respect people's privacy and personal information. Avoid unsolicited DMs--it is recommended to have discussions in the community so everyone can benefit from reading and weighing in.\r\n\r\n**Recommended Template:**\r\n\r\n* School (specific name, T10 CS, etc.)\r\n* Current Year\r\n* Company (F500, FAANG, specific name, etc.)\r\n* Program Term + Location (Spring, Summer, Fall)\r\n* Salary / Sign-on Bonus / Relocation / Other Perks"
## [2] "Unpaid internships shouldn't exist.. "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [3] "Internship money!. "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
## [4] "Best end-of-internship gift.. "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
## [5] "Peter Parker's Excellent Internship. "

For the dictionary method, the credibility is poor. Among the negative examples, the 3rd text is a narrative with no obvious sentiment from the poster. Among the positive examples, the 2nd text (“Unpaid internships shouldn’t exist.”) is clearly a complaint, not a positive statement. This indicates a serious inaccuracy in how the dictionary method evaluates sentiment, so its credibility is poor.
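To see why sentimentr mislabels a title such as “Unpaid internships shouldn’t exist.”, its extract_sentiment_terms() helper lists which lexicon words drove the score; a diagnostic sketch:

# show the positive and negative terms sentimentr matched in this sentence
extract_sentiment_terms("Unpaid internships shouldn't exist.")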

8. Discuss intriguing insights derived from the sentiment analysis, supporting your observations with at least two plots.

Dictionary method

8.1 Number of threads by sentiment category.

library(ggplot2)

# standardize the dictionary scores: bin them into 10 categories from -5 to 5
reddit_sentiment$sdsc_dict <- cut(reddit_sentiment$sentiment_dict,
                                  breaks = c(-1.0, -0.8, -0.6, -0.4, -0.2, 0,
                                             0.2, 0.4, 0.6, 0.8, 1.0),
                                  labels = c(-5, -4, -3, -2, -1, 1, 2, 3, 4, 5))
# scores of exactly 0 land in the (-0.2, 0] bin above; relabel them as "0"
reddit_sentiment$sdsc_dict <- as.character(reddit_sentiment$sdsc_dict)
reddit_sentiment$sdsc_dict[reddit_sentiment$sentiment_dict == 0] <- "0"
# no scores above 0.6 occur in the data, so levels 4 and 5 are omitted
reddit_sentiment$sdsc_dict <- factor(reddit_sentiment$sdsc_dict,
                                     levels = c("-5", "-4", "-3", "-2", "-1",
                                                "0", "1", "2", "3"))
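A quick tabulation verifies the binning, including the manually relabeled zeros:

table(reddit_sentiment$sdsc_dict, useNA = "ifany")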

# plot
reddit_sentiment %>% 
  ggplot(aes(x = sdsc_dict)) + 
  geom_bar() 

The dictionary scores are converted into standardized scores from -5 to 5. The plot shows that most scores concentrate between -2 and 2, meaning most texts about internships express relatively neutral sentiment. There are a few scores of -5 or -4 but none of 4 or 5, which suggests that people are more likely to express extremely negative sentiment than extremely positive sentiment.

8.2 Word counts by sentiment category.

reddit_sentiment %>% 
  ggplot(aes(x = sdsc_dict, y = word_count)) +
  geom_jitter(height = 0, width = 0.05,colour="black") +
  stat_summary(fun = mean, geom = "crossbar", width = 0.4, color = "red") 

For word counts, some texts that score 0 have very low word counts, which may mean the sentimentr package cannot evaluate very short texts. Word counts in positive texts (score > 0) are higher than in negative texts (score < 0), which may mean people tend to write more about their positive experiences and use fewer words to express negative feelings.

8.3 Association between a thread’s sentiment and the number of comments on the thread.

reddit_sentiment_rm_outlier <- reddit_sentiment %>%
  group_by(sdsc_dict) %>%
  filter(
    between(
      comments, 
      quantile(comments, 0.25) - 1.5 * IQR(comments), 
      quantile(comments, 0.75) + 1.5 * IQR(comments)))
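A quick count of how many threads the IQR rule dropped (a sketch; the grouping does not affect nrow()):

nrow(reddit_sentiment) - nrow(reddit_sentiment_rm_outlier)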

cor.test(reddit_sentiment_rm_outlier$comments, reddit_sentiment_rm_outlier$bert_label_numeric)
## 
##  Pearson's product-moment correlation
## 
## data:  reddit_sentiment_rm_outlier$comments and reddit_sentiment_rm_outlier$bert_label_numeric
## t = -0.87336, df = 192, p-value = 0.3836
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.20198904  0.07866781
## sample estimates:
##         cor 
## -0.06290415
reddit_sentiment_rm_outlier %>% 
  ggplot(aes(x = sdsc_dict, y = comments)) +
  geom_jitter(height = 0, width = 0.05,colour="black") 

There seems to be no obvious relationship between the number of comments and the sentiment scores.

8.4 Word clouds

data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&amp;|&lt;|&gt;"

reddit_sentiment_clean <- reddit_sentiment %>% 
  mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
  unnest_tokens(word, title_text, token = "words") %>% 
  anti_join(stop_words, by = "word") %>% 
  filter(str_detect(word, "[a-z]")) %>% 
  filter(!word %in% c('internship','internships'))

reddit_sentiment_clean_negative <- reddit_sentiment_clean %>% 
  filter(sdsc_dict %in% c(-5,-4,-3,-2,-1))
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>% 
  filter(sdsc_dict %in% c(1,2,3))

reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>% 
  anti_join(reddit_sentiment_clean_positive, by = 'word')
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
  anti_join(reddit_sentiment_clean_negative, by = 'word')
# Words appearing in negative threads
n <- 20
h <- runif(n, 0, 1) # any color
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright

df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000))

reddit_sentiment_clean_negative_unique %>% 
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

Words that appear frequently in negative threads include team, google, slavery, test, etc.

This may mean:

1. Teamwork may make interns suffer.

2. Interning at Google may not be a good experience.

3. Some companies treat interns like slaves.

# Words appearing in positive threads
reddit_sentiment_clean_positive_unique %>% 
  count(word, sort = TRUE) %>%
  wordcloud2(color = pal, 
             minRotation = 0, 
             maxRotation = 0, 
             ellipticity = 0.8)

Words that appear frequently in positive threads include community, learn, dream, cs, etc.

This may mean:

1. A good community at a company improves the internship experience.

2. Interns love to learn new things from their internships.

3. People who have dreams in their fields may gain more positive feelings during their internships.

8.5 Temporal analysis

# create new columns: date, year, day_of_week, is_weekend, time
reddit_sentiment %<>% 
  mutate(date = as.POSIXct(date_utc)) %>%
  filter(!is.na(date)) %>% 
  mutate(year = year(date),
         day_of_week = wday(date, label = TRUE),
         is_weekend = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday"),
         time = timestamp %>% 
           anytime(tz = anytime:::getTZ()) %>% 
           str_split('-| |:') %>% 
           sapply(function(x) as.numeric(x[4])))

reddit_sentiment %<>% 
  mutate(month = month(date)) %>%
  mutate(y_m = paste0(year, ".", month))
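Assuming lubridate is loaded (the year() and wday() calls above suggest it is), the hour-of-day extraction can be written more directly than the string-splitting approach; a sketch that keeps the same local time zone:

# equivalent hour-of-day extraction with lubridate, for comparison
reddit_sentiment %>%
  mutate(time_alt = hour(as_datetime(timestamp, tz = Sys.timezone()))) %>%
  select(time, time_alt) %>%
  head()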
# Sentiment by year using stacked bar plot.

reddit_sentiment %>% 
  ggplot(aes(x = year, fill = sdsc_dict)) +
  geom_bar(position = 'stack') +
  scale_x_continuous(breaks = seq(min(reddit_sentiment$year), 
                                  max(reddit_sentiment$year), 
                                  by = 1)) +
  scale_fill_brewer(palette = 'RdYlBu', direction = -1) + 
  dark_theme_grey()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

The proportions of the different sentiment score categories look similar across years. Year 2022 has a slightly higher proportion of negative sentiment, while year 2020 has a slightly higher proportion of positive sentiment.