# search threads by keyword
threads_2 <- find_thread_urls(keywords = "intern",
sort_by = 'relevance',
period = 'all')
colnames(threads_2)
head(threads_2)
#saveRDS(threads_2,file = "(internship)threads_2.rds")
# get individual comments
threads_2_content <- get_thread_content(threads_2$url)
names(threads_2_content)
#saveRDS(threads_2_content,file = "(internship)threads_2_content.rds")
# filter comments later than the cutoff timestamp 1639920077 (~2021-12-19 UTC)
threads_2_content_filter <- threads_2_content$comments[threads_2_content$comments$timestamp > 1639920077, ]
#saveRDS(threads_2_content_filter,file = "(internship)threads_2_content_filter.rds")
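# For reference, the cutoff for an exact calendar date can be derived rather than
# hard-coded (a minimal sketch; 2022-01-01 00:00:00 UTC is 1640995200):
cutoff <- as.numeric(as.POSIXct("2022-01-01", tz = "UTC"))
# threads_2_content$comments[threads_2_content$comments$timestamp > cutoff, ]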
# reload the cached results saved above
threads_2_content <- readRDS("D:/Storage/Master/23fall/CP6088 Intro to Urban Analytics/6088assn4/(internship)threads_2_content.rds")
threads_2 <- readRDS("D:/Storage/Master/23fall/CP6088 Intro to Urban Analytics/6088assn4/(internship)threads_2.rds")
# create new column: date
threads_2 %<>%
mutate(date = as.POSIXct(date_utc)) %>%
filter(!is.na(date))
# number of threads over time (weekly bins)
Sys.setlocale(locale = "en_US.UTF-8") # set locale so month labels render in English
## [1] "LC_COLLATE=en_US.UTF-8;LC_CTYPE=en_US.UTF-8;LC_MONETARY=en_US.UTF-8;LC_NUMERIC=C;LC_TIME=en_US.UTF-8"
threads_2 %>%
ggplot(aes(x = date)) +
geom_histogram(color="black", position = 'stack', binwidth = 60*60*24*7) +
scale_x_datetime(date_labels = "%b %Y",
breaks = seq(min(threads_2$date, na.rm = T),
max(threads_2$date, na.rm = T),
by = "6 month")) +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Tokenization (word tokens)
words <- threads_2 %>%
unnest_tokens(output = word, input = text, token = "words")
words %>%
count(word, sort = TRUE) %>%
top_n(20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
coord_flip() +
labs(x = "words",
y = "counts",
title = "Unique word counts")
## Selecting by n
# load list of stop words - from the tidytext package
data("stop_words")
# view random 50 words
print(stop_words$word[sample(1:nrow(stop_words), 50)])
## [1] "early" "quite" "be" "yourselves" "won't"
## [6] "new" "any" "state" "will" "best"
## [11] "several" "toward" "everybody" "before" "most"
## [16] "does" "off" "d" "further" "doing"
## [21] "you've" "often" "if" "non" "although"
## [26] "allows" "make" "you'd" "newest" "somehow"
## [31] "that" "that's" "m" "does" "whereupon"
## [36] "grouping" "becomes" "about" "she'd" "same"
## [41] "working" "selves" "already" "made" "beside"
## [46] "or" "seems" "than" "wouldn't" "nothing"
# Regex that matches URL-type strings and the leftover characters &, <, >
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
words_clean <- threads_2 %>%
# drop URLs
mutate(text = str_replace_all(text, replace_reg, "")) %>%
# Tokenization (word tokens)
unnest_tokens(word, text, token = "words") %>%
# drop stop words
anti_join(stop_words, by = "word") %>%
# keep only tokens that contain at least one letter
filter(str_detect(word, "[a-z]"))
# Check the number of rows after removal of the stop words. There should be fewer words now
print(
glue::glue("Before: {nrow(words)}, After: {nrow(words_clean)}")
)
## Before: 18318, After: 6166
words_clean %>%
count(word, sort = TRUE) %>%
top_n(20, n) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
coord_flip() +
labs(x = "words",
y = "counts",
title = "Unique word counts")
# build a random palette: n vivid colors for the top words, grey for the rest
n <- 20
h <- runif(n, 0, 1) # any hue
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000)) # grey for all remaining words
# drop the search keyword itself before plotting the cloud
words_clean_noself <- words_clean %>% filter(!grepl("internship", word, ignore.case = TRUE))
words_clean_noself %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
# get tri-grams (n = 3)
words_ngram <- threads_2 %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
select(text) %>%
unnest_tokens(output = paired_words,
input = text,
token = "ngrams",
n = 3)
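# a toy example of how the n-gram tokenizer slides over the text (hypothetical input):
tibble(text = "a lot of people") %>%
unnest_tokens(paired_words, text, token = "ngrams", n = 3)
# yields two rows: "a lot of" and "lot of people"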
# show n-grams sorted by count
words_ngram %>%
count(paired_words, sort = TRUE) %>%
head(20) %>%
knitr::kable()
| paired_words | n |
|---|---|
| NA | 127 |
| a lot of | 18 |
| i feel like | 12 |
| i am a | 10 |
| this is a | 10 |
| for pathways internships | 9 |
| i don t | 9 |
| it is not | 8 |
| be able to | 7 |
| feel free to | 7 |
| i have no | 7 |
| to answer them | 7 |
| an unpaid internship | 6 |
| application window for | 6 |
| best to answer | 6 |
| do our best | 6 |
| for an internship | 6 |
| have already begun | 6 |
| he didn t | 6 |
| i want to | 6 |
words_ngram_pair <- words_ngram %>%
separate(paired_words, c("word1", "word2", "word3"), sep = " ")
# drop rows where any of word1, word2, word3 is a stop word
words_ngram_pair_filtered <- words_ngram_pair %>%
# drop stop words
filter(!word1 %in% stop_words$word & !word2 %in% stop_words$word & !word3 %in% stop_words$word) %>%
# keep only tokens that contain at least one letter
filter(str_detect(word1, "[a-z]") & str_detect(word2, "[a-z]") & str_detect(word3, "[a-z]"))
# Filter out words that are not encoded in ASCII
# To see what's ASCII, google 'ASCII table'
words_ngram_pair_filtered %<>%
filter(stri_enc_isascii(word1) & stri_enc_isascii(word2) & stri_enc_isascii(word3))
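# what the ASCII filter drops, illustrated on made-up tokens (a minimal sketch):
stri_enc_isascii(c("nasa", "café", "日本"))
# TRUE for plain ASCII, FALSE for accented or non-Latin strings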
# Sort the new tri-gram (n=3) counts:
words_counts <- words_ngram_pair_filtered %>%
count(word1, word2, word3) %>%
arrange(desc(n))
head(words_counts, 10) %>%
knitr::kable()
| word1 | word2 | word3 | n |
|---|---|---|---|
| nasa | internships | staff | 6 |
| pathways | internships | starting | 6 |
| intern | supporting | nasa | 4 |
| nasa | ostem | intern | 4 |
| ostem | intern | supporting | 4 |
| _internship | _threads | _super_thread_other_internship_threads | 3 |
| _other | _internship | _threads | 3 |
| _super | _thread | _other | 3 |
| _thread | _other | _internship | 3 |
| begun | scheduling | interviews | 3 |
# plot word network
words_counts %>%
filter(n >= 2) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_width = n), edge_alpha = .6) + # constant alpha belongs outside aes()
geom_node_point(color = "darkslategray4", size = 3) +
geom_node_text(aes(label = name), vjust = 1.8) +
labs(title = "Word Networks",
x = "", y = "")
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# data ready for BERT
threads_2_bert <- threads_2 %>% select(date_utc, timestamp, title, text, subreddit, comments, url)
write.csv(threads_2_bert,file = "threads_2_bert.csv",row.names = FALSE)
# Get sentiment scores using the dictionary method for comparison.
reddit_sentiment <- read_csv('sample_reddit_bert.csv',show_col_types = FALSE) %>%
drop_na('bert_label')%>%
mutate(title = replace_na(title, ""),
text = replace_na(text, ""),
title_text = str_c(title, text, sep = ". "))
## New names:
## • `` -> `...1`
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.3.2
##
## Attaching package: 'sentimentr'
##
## The following object is masked from 'package:syuzhet':
##
## get_sentences
reddit_sentiment_dictionary <- sentiment_by(reddit_sentiment$title_text) # by string (a group of sentences)
reddit_sentiment$sentiment_dict <- reddit_sentiment_dictionary %>% pull(ave_sentiment)
reddit_sentiment$word_count <- reddit_sentiment_dictionary %>% pull(word_count)
save(reddit_sentiment, file = "reddit_sentiment1.RData")
load("reddit_sentiment1.RData")
reddit_sentiment <- reddit_sentiment %>% mutate(bert_label_numeric = str_sub(bert_label, 1, 1) %>% as.numeric()) # e.g. "5 stars" -> 5
cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict)
## [1] 0.4539148
ggplot(data = reddit_sentiment, aes(x = bert_label_numeric, y = sentiment_dict)) +
geom_jitter(width = 0.1, height = 0, color = 'Black') +
geom_hline(yintercept = 0, color = '#FFD700', linewidth = 1, linetype = 'dashed') +
geom_smooth(method = "lm", se = FALSE, color = "gray")
## `geom_smooth()` using formula = 'y ~ x'
#### A correlation coefficient of 0.45 implies a moderate positive correlation between the BERT score and the dictionary sentiment score. In the scatter plot above, the two appear correlated: threads rated 4-5 stars by the BERT model mostly score above 0 (i.e., positive) under the dictionary method.
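# as a robustness check (an addition, not part of the original analysis), a
# rank-based correlation is less sensitive to skew in the dictionary scores:
cor(reddit_sentiment$bert_label_numeric, reddit_sentiment$sentiment_dict, method = "spearman")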
bert_example <- reddit_sentiment %>%
filter(bert_label_numeric %in% c(1,5)) %>%
group_by(bert_label) %>%
arrange(desc(bert_score)) %>%
slice_head(n = 5) %>%
ungroup()
# 1 star
bert_example %>% filter(bert_label_numeric == 1) %>% pull(title_text) %>% print()
## [1] "Fired from tax internship for smelling like weed. Today my boss let me go for \034smelling like weed.\035 I have never smoked, never been around it. Nothing! I even offered to take a drug test! I tried my hardest to convince him that it must be a mistake or my cologne. Despite all of this, he was immovable. Although I hated the job, I feel horrible. Is this racial profiling? Is there anything I can do? Should I just move on? \r\n\r\nI\031m just so dumbfounded. I would understand if I did smoke, but I don\031t! \r\n\r\nAny words of reassurance? =\023"
## [2] "\030Blatantly Illegal:\031 NASCAR Bans White Applicants From \030Diversity Internship\031. "
## [3] "I Hate That Internships Are Required. Most colleges now require you to have an internship of some kind before graduating.This is completely ridiculous and should be optional.How on earth am I supposed to work an unpaid internship (one recommended to me by my professor),carry a full load of classes, and work an actual real job because I need to pay my bills?I\031m paying for a class to tell me I need experience-no duh.I am failing my internship class even though I have an internship!There should be no other criteria to pass or fail this class other than securing the blasted internship.I don\031t have time to write about how \034enlightening this experience has been\035 when I\031m required to update university postings,send out fundraising letters,and email Congress multiple times a week for this internship.I\031m barely passing my other classes as is and I\031m so tired."
## [4] "Zero internship opportunities at my M7. Every presentation given to us by a company either ends with them saying they are hiring significantly less this year for the summer or they are from a group that is not hiring at all. The one presentation I just got done with, the guy said his team recruited for the summer every year since GFC, and this was the first time in 15 years they would not have interns. There are literally no jobs right now and the situation seems to be getting worse and worse."
## [5] "12 Months UNPAID internship. What kind of Slavery is this??. "
# 5 star
bert_example %>% filter(bert_label_numeric == 5) %>% pull(title_text) %>% print()
## [1] "Best end-of-internship gift.. "
## [2] "My internship search (incredibly successful). "
## [3] "I secured an internship at NASA!. Hello fellow CS majors. I am a non-traditional student (took a multi-year break), and this year has been the toughest job search I have ever had. I was losing hope, but it happened, I got an offer from NASA! This is seriously one of the most awesome moments of my life, and I kind of want to share it with everyone.\r\n\r\nEdit: Thank you, everyone! If I don't respond, sorry. Them DM's be poppin as the youth say these days."
## [4] "Peter Parker's Excellent Internship. "
## [5] "Field-work internships are the best. "
sentimentr_example <- reddit_sentiment %>%
mutate(sentimentr_abs = abs(sentiment_dict),
sentimentr_binary = case_when(sentiment_dict > 0 ~ 'positive',
TRUE ~ 'negative')) %>%
group_by(sentimentr_binary) %>%
arrange(desc(sentimentr_abs)) %>%
slice_head(n = 5) %>%
ungroup() %>%
arrange(sentiment_dict)
# negative
sentimentr_example %>% filter(sentimentr_binary == 'negative') %>% pull(title_text) %>% print()
## [1] "unpaid internship with no benefits. "
## [2] "how the fuck is \"unpaid internship\" a thing???. "
## [3] "Local asshat complains that college won't let him post a YEAR LONG unpaid internship. "
## [4] "Unpaid Internships. "
## [5] "Unpaid internships. "
# positive
sentimentr_example %>% filter(sentimentr_binary == 'positive') %>% pull(title_text) %>% print()
## [1] "2023 Internship Reference Thread. *(Couldn't seem to find one already made but please reference it if so)*\r\n\r\nFeel free to post any of your internship offers for 2023, or gain insight into internship programs for the upcoming year.\r\n\r\nAs a reminder, please respect people's privacy and personal information. Avoid unsolicited DMs--it is recommended to have discussions in the community so everyone can benefit from reading and weighing in.\r\n\r\n**Recommended Template:**\r\n\r\n* School (specific name, T10 CS, etc.)\r\n* Current Year\r\n* Company (F500, FAANG, specific name, etc.)\r\n* Program Term + Location (Spring, Summer, Fall)\r\n* Salary / Sign-on Bonus / Relocation / Other Perks"
## [2] "Unpaid internships shouldn't exist.. "
## [3] "Internship money!. "
## [4] "Best end-of-internship gift.. "
## [5] "Peter Parker's Excellent Internship. "
library(ggplot2)
# standardize the dictionary scores: divide scores into 10 categories, from -5 to 5
reddit_sentiment$sdsc_dict <- cut(reddit_sentiment$sentiment_dict,
breaks = c(-1.0, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0),
labels = c(-5, -4, -3, -2, -1, 1, 2, 3, 4, 5))
# deal with exact-zero scores: cut() uses right-closed bins, so 0 lands in (-0.2, 0] (label -1)
reddit_sentiment$sdsc_dict <- as.character(reddit_sentiment$sdsc_dict)
reddit_sentiment$sdsc_dict[reddit_sentiment$sentiment_dict == 0] <- "0"
# levels stop at "3" because no standardized score exceeded 3 in this sample
reddit_sentiment$sdsc_dict <- factor(reddit_sentiment$sdsc_dict, levels = c("-5", "-4", "-3", "-2", "-1", "0", "1", "2", "3"))
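# why the zero reassignment above is needed, shown on a toy value (a minimal sketch):
cut(0, breaks = c(-0.2, 0, 0.2), labels = c(-1, 1))
# returns the level "-1": cut() bins are right-closed, so 0 falls in (-0.2, 0]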
# plot
reddit_sentiment %>%
ggplot(aes(x = sdsc_dict)) +
geom_bar()
#### The dictionary scores are converted into standardized scores from -5 to 5. The plot shows that most scores concentrate between -2 and 2, meaning most texts about internships express relatively neutral sentiment. There are a few scores of -5 or -4 but none of 4 or 5, which suggests people are more likely to express extreme negative sentiment than extreme positive sentiment.
reddit_sentiment %>%
ggplot(aes(x = sdsc_dict, y = word_count)) +
geom_jitter(height = 0, width = 0.05,colour="black") +
stat_summary(fun = mean, geom = "crossbar", width = 0.4, color = "red")
#### Some texts that score 0 have low word counts, which may mean the sentimentr package cannot evaluate very short texts reliably. Word counts in positive texts (score > 0) are higher than in negative texts (score < 0), suggesting that people tend to write more about their positive experiences and use fewer words to express negative feelings.
# remove comment-count outliers within each score category (1.5 * IQR rule)
reddit_sentiment_rm_outlier <- reddit_sentiment %>%
group_by(sdsc_dict) %>%
filter(
between(
comments,
quantile(comments, 0.25) - 1.5 * IQR(comments),
quantile(comments, 0.75) + 1.5 * IQR(comments)))
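# how the 1.5 * IQR fences behave, on a toy vector (a minimal sketch):
toy <- c(1, 2, 3, 4, 100)
between(toy, quantile(toy, 0.25) - 1.5 * IQR(toy), quantile(toy, 0.75) + 1.5 * IQR(toy))
# the extreme value 100 falls outside the upper fence and is flagged FALSE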
cor.test(reddit_sentiment_rm_outlier$comments, reddit_sentiment_rm_outlier$bert_label_numeric)
##
## Pearson's product-moment correlation
##
## data: reddit_sentiment_rm_outlier$comments and reddit_sentiment_rm_outlier$bert_label_numeric
## t = -0.87336, df = 192, p-value = 0.3836
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.20198904 0.07866781
## sample estimates:
## cor
## -0.06290415
reddit_sentiment_rm_outlier %>%
ggplot(aes(x = sdsc_dict, y = comments)) +
geom_jitter(height = 0, width = 0.05,colour="black")
#### There seems to be no obvious relationship between the number of comments and the sentiment scores.
data("stop_words")
replace_reg <- "http[s]?://[A-Za-z\\d/\\.]+|&|<|>"
reddit_sentiment_clean <- reddit_sentiment %>%
mutate(title_text = str_replace_all(title_text, replace_reg, "")) %>%
unnest_tokens(word, title_text, token = "words") %>%
anti_join(stop_words, by = "word") %>%
filter(str_detect(word, "[a-z]")) %>%
filter(!word %in% c('internship','internships'))
reddit_sentiment_clean_negative <- reddit_sentiment_clean %>%
filter(sdsc_dict %in% c(-5,-4,-3,-2,-1))
reddit_sentiment_clean_positive <- reddit_sentiment_clean %>%
filter(sdsc_dict %in% c(1,2,3))
# keep words that appear only in negative threads
reddit_sentiment_clean_negative_unique <- reddit_sentiment_clean_negative %>%
anti_join(reddit_sentiment_clean_positive, by = 'word')
# keep words that appear only in positive threads
reddit_sentiment_clean_positive_unique <- reddit_sentiment_clean_positive %>%
anti_join(reddit_sentiment_clean_negative, by = 'word')
# Words appearing in negative threads
# rebuild the random palette (same recipe as above)
n <- 20
h <- runif(n, 0, 1) # any hue
s <- runif(n, 0.6, 1) # vivid
v <- runif(n, 0.3, 0.7) # neither too dark nor too bright
df_hsv <- data.frame(h = h, s = s, v = v)
pal <- apply(df_hsv, 1, function(x) hsv(x['h'], x['s'], x['v']))
pal <- c(pal, rep("grey", 10000)) # grey for all remaining words
reddit_sentiment_clean_negative_unique %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
# Words appearing in positive threads
reddit_sentiment_clean_positive_unique %>%
count(word, sort = TRUE) %>%
wordcloud2(color = pal,
minRotation = 0,
maxRotation = 0,
ellipticity = 0.8)
# create new columns: date, year, day_of_week, is_weekend, time
reddit_sentiment %<>%
mutate(date = as.POSIXct(date_utc)) %>%
filter(!is.na(date)) %>%
mutate(year = year(date),
day_of_week = wday(date, label = TRUE),
is_weekend = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday"),
time = timestamp %>%
anytime(tz = anytime:::getTZ()) %>%
str_split('-| |:') %>%
sapply(function(x) as.numeric(x[4])))
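# the hour of day can be extracted more directly (a sketch; assumes lubridate is
# attached, as the year() and wday() calls above imply):
time_alt <- hour(anytime(reddit_sentiment$timestamp, tz = anytime:::getTZ()))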
reddit_sentiment %<>%
mutate(month = month(date)) %>%
mutate(y_m = paste0(year, ".", month)) # paste0 avoids the spaces paste() inserts, e.g. "2022.5"
# Sentiment by year using stacked bar plot.
reddit_sentiment %>%
ggplot(aes(x = year, fill = sdsc_dict)) +
geom_bar(position = 'stack') +
scale_x_continuous(breaks = seq(min(reddit_sentiment$year),
max(reddit_sentiment$year),
by = 1)) +
scale_fill_brewer(palette = 'RdYlBu', direction = -1) +
dark_theme_grey()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
#### The proportions of the different sentiment score categories look similar across years. Year 2022 has a slightly higher proportion of negative sentiment, while year 2020 has a slightly higher proportion of positive sentiment.