# ------------------------------------------------------------- #
# Herman Brurberg, Anna Pabst, Himay Panirwala                  #
# Mark Quitoy, Patricia Vargas, Ma Victoria Virasoro            #
# ------------------------------------------------------------- #
# Packages ----
# One library() call per line: several calls on a single line without `;`
# separators are a parse error in R.
library(tidyverse)
library(tidytext)
library(stringr)
library(dplyr)
library(ggplot2)
library(scales)
library(SnowballC)
library(topicmodels)
library(tm)
library(readtext)
library(tidyr)
library(textdata)
# Load the raw .docx corpus ----
# Every .docx file in `folder_path` becomes one row of `raw_docs`.
folder_path <- "C:/Users/patri/OneDrive/Documents/Hult/MBAN/Business Analysis with Unstructured Data - Kurnicki/Nike/ALL DATA (N,A,UA)/New folder"

raw_docs <- readtext(paste0(folder_path, "/*.docx"))
print(raw_docs)

# Rename the first two columns to the names used by the rest of the script.
colnames(raw_docs)[1] <- "source"
colnames(raw_docs)[2] <- "text"

# Drop the file extension so `source` holds the bare document name.
raw_docs$source <- str_remove(raw_docs$source, "\\.docx$")
print(raw_docs$source)

nike_df <- raw_docs %>%
  select(source, text)
nike_df
# Normalise source names to "<Brand> - <Channel>" ----
# Run this to confirm exact source names
unique(nike_df$source)

# Names not listed here are left unchanged by recode().
nike_df$source <- recode(
  nike_df$source,
  "Nike - App Store Reviews" = "Nike - App Store",
  "Nike - News_Headlines_NoYears" = "Nike - News",
  "Nike - Reddit_Comments_NoLinks" = "Nike - Reddit",
  "nike - Twitter mentions" = "Nike - Twitter",
  "Nike - Youtube_Comments_NoLinks" = "Nike - YouTube",
  "Nike Product Reviews" = "Nike - Product Reviews",
  "Adidas Twitter Mentions" = "Adidas - Twitter",
  "adidas_headlines" = "Adidas - News",
  "Adidas_Youtube_Comments" = "Adidas - YouTube",
  "appstore_adidas" = "Adidas - App Store",
  "reddit_adidas" = "Adidas - Reddit",
  "Adidas Reviews Website" = "Adidas - Product Reviews",
  "Under Armour Twitter Mentions" = "Under Armour - Twitter",
  "Under Armour Youtube Comments" = "Under Armour - YouTube",
  "under_armour_headlines" = "Under Armour - News",
  "appstore_underarmour" = "Under Armour - App Store",
  "reddit_underarmour" = "Under Armour - Reddit",
  "Under Armour Reviews Website" = "Under Armour - Product Reviews"
)
print(unique(nike_df$source))
# Source labels grouped by brand ----
# Used further down to map a `source` value back to its brand.
nike_sources <- c(
  "Nike - App Store",
  "Nike - News",
  "Nike - Reddit",
  "Nike - Twitter",
  "Nike - YouTube",
  "Nike - Product Reviews"
)

adidas_sources <- c(
  "Adidas - Twitter",
  "Adidas - News",
  "Adidas - YouTube",
  "Adidas - App Store",
  "Adidas - Reddit",
  "Adidas - Product Reviews"
)

ua_sources <- c(
  "Under Armour - Twitter",
  "Under Armour - YouTube",
  "Under Armour - News",
  "Under Armour - App Store",
  "Under Armour - Reddit",
  "Under Armour - Product Reviews"
)
# Tokenise: one row per (source, word) ----
nike_tokens <- nike_df %>%
  unnest_tokens(word, text)
nike_tokens

nike_tokens %>%
  count(word, sort = TRUE)

# Cleaned tokens: stop words and purely numeric tokens removed.
# The join key is spelled out so the join does not depend on (or announce)
# whichever columns the two tables happen to share.
tidy_nike <- nike_tokens %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "^[0-9]+$"))
tidy_nike

tidy_nike %>%
  count(word, sort = TRUE)

# Stemmed variant (SnowballC::wordStem) of the cleaned tokens.
tidy_nike_stem <- tidy_nike %>%
  mutate(word = wordStem(word))

tidy_nike_stem %>%
  count(word, sort = TRUE)
# Top 10 words per source ----
# reorder_within() + scale_x_reordered() sort the bars inside each facet;
# a plain reorder() would rank a word once across all sources and leave
# individual panels unsorted.
tidy_nike %>%
  count(source, word, sort = TRUE) %>%
  group_by(source) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, source)) %>%
  ggplot(aes(word, n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, ncol = 3, scales = "free") +
  scale_x_reordered() +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Top 10 Words per Source (Stopwords Removed)"
  ) +
  coord_flip()
# Bigrams ----
nike_bigrams <- nike_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

nike_bigrams %>%
  count(bigram, sort = TRUE)

bigrams_separated <- nike_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
bigram_counts

# Four-word sequences with no stop word in any position ----
nike_quadrogram <- nike_df %>%
  unnest_tokens(quadrogram, text, token = "ngrams", n = 4) %>%
  separate(quadrogram, c("word1", "word2", "word3", "word4"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  filter(!word4 %in% stop_words$word)

nike_quadrogram %>%
  count(word1, word2, word3, word4, sort = TRUE)
# Document-term matrix: one document per source ----
tidy_dtm <- nike_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(source, word, sort = TRUE) %>%
  cast_dtm(source, word, n)

tidy_dtm
# Peek at the first 6 documents x 10 terms.
inspect(tidy_dtm[1:6, 1:10])
# Sentiment lexicons ----
afinn <- get_sentiments("afinn")
nrc <- get_sentiments("nrc")
bing <- get_sentiments("bing")

# Stack the three lexicons, tagging each row with its origin.
sentiments <- bind_rows(
  mutate(afinn, lexicon = "afinn"),
  mutate(nrc, lexicon = "nrc"),
  mutate(bing, lexicon = "bing")
)

nrc_data <- subset(sentiments, lexicon == "nrc")
unique(nrc_data$sentiment) # joy, anger, trust, anticipation, fear, etc.

bing_data <- subset(sentiments, lexicon == "bing")
unique(bing_data$sentiment) # positive / negative

afinn_data <- subset(sentiments, lexicon == "afinn")
summary(afinn_data$value) # quantitative scale -5 to +5
# NRC emotion word lists ----
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

nrc_anger <- get_sentiments("nrc") %>%
  filter(sentiment == "anger")

nrc_trust <- get_sentiments("nrc") %>%
  filter(sentiment == "trust")

nrc_anticipation <- get_sentiments("nrc") %>%
  filter(sentiment == "anticipation")

# Most frequent words of each emotion in the Nike product reviews.
tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)

tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_anger, by = "word") %>%
  count(word, sort = TRUE)

tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_anticipation, by = "word") %>%
  count(word, sort = TRUE)
# Sentiment per Nike source, compared across AFINN / Bing / NRC ----

# Build the three-lexicon comparison chart for one set of tokens.
#
# tokens: data frame with a `word` column (one row per token).
# title:  plot title.
# Returns a ggplot with one facet per lexicon. AFINN is the sum of word
# scores; Bing and NRC are (# positive words) - (# negative words).
# Intermediate results stay local to the helper, so the `afinn` lexicon
# table loaded earlier in the script is not overwritten.
compare_lexicons <- function(tokens, title) {
  afinn_score <- tokens %>%
    inner_join(get_sentiments("afinn"), by = "word") %>%
    summarise(sentiment = sum(value)) %>%
    mutate(method = "AFINN")

  # NRC also carries emotion labels; only its polarity labels are used here.
  nrc_polarity <- get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative"))

  bing_and_nrc_score <- bind_rows(
    tokens %>%
      inner_join(get_sentiments("bing"), by = "word") %>%
      mutate(method = "Bing et al."),
    tokens %>%
      inner_join(nrc_polarity, by = "word") %>%
      mutate(method = "NRC")
  ) %>%
    count(method, sentiment) %>%
    pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
    mutate(sentiment = positive - negative)

  bind_rows(afinn_score, bing_and_nrc_score) %>%
    ggplot(aes(method, sentiment, fill = method)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~method, ncol = 1, scales = "free_y") +
    labs(title = title)
}

# Per-source token tables (also used by later sections of the script).
app_store <- tidy_nike %>%
  filter(source == "Nike - App Store")
product_reviews <- tidy_nike %>%
  filter(source == "Nike - Product Reviews")
reddit <- tidy_nike %>%
  filter(source == "Nike - Reddit")
twitter <- tidy_nike %>%
  filter(source == "Nike - Twitter")
youtube <- tidy_nike %>%
  filter(source == "Nike - YouTube")
news <- tidy_nike %>%
  filter(source == "Nike - News")

# Each top-level call returns a visible ggplot, so it is drawn immediately.
compare_lexicons(app_store, "App Store Reviews: Sentiment Across Lexicons")
compare_lexicons(product_reviews, "Nike Product Reviews: Sentiment Across Lexicons")
compare_lexicons(reddit, "Reddit Comments: Sentiment Across Lexicons")
compare_lexicons(twitter, "Twitter Mentions: Sentiment Across Lexicons")
compare_lexicons(youtube, "YouTube Comments: Sentiment Across Lexicons")
compare_lexicons(news, "News Headlines: Sentiment Across Lexicons")
# Most common positive / negative words per Nike source (Bing) ----

# Plot the ten most frequent positive and negative Bing words in `tokens`.
#
# tokens: data frame with a `word` column (one row per token).
# title:  plot title.
# Returns a ggplot faceted by sentiment.
plot_bing_words <- function(tokens, title) {
  tokens %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    slice_max(order_by = n, n = 10) %>%
    ungroup() %>%
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free_y") +
    labs(y = "Contribution to sentiment", x = NULL, title = title) +
    coord_flip()
}

plot_bing_words(app_store, "App Store Reviews: Most Common Positive and Negative Words")
plot_bing_words(product_reviews, "Nike Product Reviews: Most Common Positive and Negative Words")
plot_bing_words(reddit, "Reddit Comments: Most Common Positive and Negative Words")
plot_bing_words(twitter, "Twitter Mentions: Most Common Positive and Negative Words")
plot_bing_words(youtube, "YouTube Comments: Most Common Positive and Negative Words")
plot_bing_words(news, "News Headlines: Most Common Positive and Negative Words")
# Overall positive / negative words (stemmed tokens, Bing) ----
# NOTE(review): the tokens are stemmed but the Bing lexicon is not, so only
# stems that happen to equal a lexicon entry can match - confirm this is
# the intended input.
bing_counts <- tidy_nike_stem %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

bing_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(word = tidytext::reorder_within(word, n, sentiment)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  tidytext::scale_x_reordered() +
  labs(
    y = "Contribution to sentiment",
    x = NULL,
    title = "Overall: Most Common Positive and Negative Words"
  ) +
  coord_flip()
# LDA topic model (k = 4) with each source as one document ----
nike_lda_dtm <- tidy_nike %>%
  count(source, word, sort = TRUE) %>%
  cast_dtm(source, word, n)

# Fixed seed so the fitted topics are reproducible between runs.
nike_lda <- LDA(nike_lda_dtm, k = 4, control = list(seed = 123))
nike_lda

# beta: per-topic word probabilities.
nike_topics <- tidy(nike_lda, matrix = "beta")
nike_topics

top_terms <- nike_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip() +
  labs(
    title = "Nike LDA Topic Modeling - Top Terms per Topic",
    x = NULL,
    y = "Beta (Word Probability)"
  )

# gamma: per-document (here: per-source) topic proportions.
nike_gamma <- tidy(nike_lda, matrix = "gamma")
nike_gamma

nike_gamma %>%
  mutate(document = reorder(document, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~document) +
  labs(
    title = "Topic Distribution per Source",
    x = "Topic",
    y = "Gamma (Topic Probability)"
  )
# TF-IDF: most distinctive words per source ----
# Extra stop words removed on top of the standard list.
custom_stopwords <- tibble(word = c("nike", "just", "im", "ive"))

tidy_nike_tfidf <- tidy_nike %>%
  anti_join(custom_stopwords, by = "word")

tfidf_source <- tidy_nike_tfidf %>%
  count(source, word, sort = TRUE)

tfidf_source <- tfidf_source %>%
  bind_tf_idf(term = word, document = source, n = n)

top_tfidf_source <- tfidf_source %>%
  group_by(source) %>%
  slice_max(order_by = tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup()

top_tfidf_source %>%
  mutate(word = tidytext::reorder_within(word, tf_idf, source)) %>%
  ggplot(aes(word, tf_idf, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free_y") +
  tidytext::scale_x_reordered() +
  coord_flip() +
  labs(
    title = "TF-IDF: Most Distinctive Words by Source",
    x = NULL,
    y = "TF-IDF"
  )
# Hand-built dictionary mapping a word to the aspect it signals ----
aspect_dictionary <- tribble(
  ~word, ~aspect,
  "size", "Fit/Sizing",
  "fit", "Fit/Sizing",
  "fits", "Fit/Sizing",
  "half", "Fit/Sizing",
  "oversized", "Fit/Sizing",
  "sizing", "Fit/Sizing",
  "tight", "Fit/Sizing",
  "loose", "Fit/Sizing",

  "comfortable", "Comfort",
  "comfy", "Comfort",
  "comfort", "Comfort",
  "soft", "Comfort",
  "cushion", "Comfort",

  "quality", "Quality/Durability",
  "peeled", "Quality/Durability",
  "cheap", "Quality/Durability",
  "durable", "Quality/Durability",
  "fell", "Quality/Durability",
  "problem", "Quality/Durability",
  "awkward", "Quality/Durability",

  "style", "Style/Design",
  "design", "Style/Design",
  "colorway", "Style/Design",
  "look", "Style/Design",
  "black", "Style/Design",
  "fleece", "Style/Design",
  "bomber", "Style/Design",

  "app", "App Experience",
  "workout", "App Experience",
  "workouts", "App Experience",
  "fitness", "App Experience",
  "crashing", "App Experience",
  "crashes", "App Experience",
  "bugs", "App Experience",
  "issue", "App Experience",
  "issues", "App Experience",
  "frustrating", "App Experience",

  "revenue", "Corporate/Business",
  "strategy", "Corporate/Business",
  "investor", "Corporate/Business",
  "stock", "Corporate/Business",
  "ceo", "Corporate/Business",
  "wholesale", "Corporate/Business",
  "relations", "Corporate/Business",

  "team", "Sports/Cultural Visibility",
  "national", "Sports/Cultural Visibility",
  "shirt", "Sports/Cultural Visibility",
  "jersey", "Sports/Cultural Visibility",
  "cup", "Sports/Cultural Visibility",
  "brazil", "Sports/Cultural Visibility",
  "football", "Sports/Cultural Visibility"
)
# Aspect-based sentiment (Bing) ----
# Tokens that appear in the aspect dictionary.
aspect_terms <- tidy_nike %>%
  inner_join(aspect_dictionary, by = "word")

# Only aspect words that are themselves in the Bing lexicon survive this
# join: the polarity scored is that of the aspect word, not of its context.
aspect_sentiment_bing <- aspect_terms %>%
  inner_join(get_sentiments("bing"), by = "word")

aspect_summary_bing <- aspect_sentiment_bing %>%
  count(aspect, sentiment, sort = TRUE)
aspect_summary_bing

# Score each matched word +1 / -1 and aggregate per aspect.
aspect_net_bing <- aspect_sentiment_bing %>%
  mutate(score = if_else(sentiment == "positive", 1, -1)) %>%
  group_by(aspect) %>%
  summarise(
    mentions = n(),
    net_sentiment = sum(score),
    avg_sentiment = mean(score),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_sentiment))
aspect_net_bing

aspect_summary_bing %>%
  ggplot(aes(x = aspect, y = n, fill = sentiment)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Aspect-Based Sentiment Analysis (Bing)",
    x = NULL,
    y = "Word Count"
  )

# Same aggregation, broken down by source.
aspect_by_source_bing <- aspect_sentiment_bing %>%
  mutate(score = if_else(sentiment == "positive", 1, -1)) %>%
  group_by(source, aspect) %>%
  summarise(
    mentions = n(),
    net_sentiment = sum(score),
    avg_sentiment = mean(score),
    .groups = "drop"
  )
aspect_by_source_bing

aspect_by_source_bing %>%
  ggplot(aes(x = aspect, y = avg_sentiment, fill = aspect)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source) +
  coord_flip() +
  labs(
    title = "Aspect-Based Sentiment by Source",
    x = NULL,
    y = "Average Sentiment Score"
  )
# Word correlations across sources ----
library(widyr)
library(igraph)
library(ggraph)

# Correlation of word occurrence across sources, for words seen >= 5 times.
word_cors <- tidy_nike %>%
  group_by(word) %>%
  filter(n() >= 5) %>%
  pairwise_cor(word, source, sort = TRUE)
word_cors

# Five strongest correlates of each key term; the ranking column is named
# explicitly rather than left to top_n()'s "last column" default.
word_cors %>%
  filter(item1 %in% c("nike", "shoes", "quality", "price")) %>%
  group_by(item1) %>%
  slice_max(order_by = correlation, n = 5) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~item1, scales = "free") +
  coord_flip() +
  labs(title = "Word Correlations with Key Nike Terms")
# Emotion and net-sentiment overviews across all sources ----

# NRC emotion counts over the whole corpus.
tidy_nike %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment %in% c(
    "joy", "anger", "trust", "anticipation",
    "fear", "sadness", "surprise", "disgust"
  )) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(sentiment, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Nike: Overall Emotional Positioning Across All Sources"
  ) +
  coord_flip()

# Share of each NRC label within every source.
tidy_nike %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment %in% c(
    "joy", "anger", "trust", "anticipation", "fear",
    "sadness", "surprise", "disgust", "positive", "negative"
  )) %>%
  count(source, sentiment) %>%
  group_by(source) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  mutate(sentiment = reorder(sentiment, proportion)) %>%
  ggplot(aes(sentiment, proportion, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, ncol = 3, scales = "free_y") +
  scale_y_continuous(labels = percent_format()) +
  labs(
    x = NULL,
    y = "Percentage of Emotional Words",
    title = "Nike: Emotional Breakdown per Source (% of words)"
  ) +
  coord_flip()

# AFINN: summed word scores per source.
tidy_nike %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(source) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(source = reorder(source, sentiment)) %>%
  ggplot(aes(source, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Sentiment Score",
    title = "AFINN: Sentiment Intensity per Source"
  ) +
  coord_flip()

# Bing: positive minus negative word count per source.
tidy_nike %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(source, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(source = reorder(source, sentiment)) %>%
  ggplot(aes(source, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Net Sentiment Score (Positive - Negative)",
    title = "Bing: Net Sentiment Score per Source"
  ) +
  coord_flip()

# NRC (polarity labels only): positive minus negative per source.
tidy_nike %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(source, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(source = reorder(source, sentiment)) %>%
  ggplot(aes(source, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Net Sentiment Score (Positive - Negative)",
    title = "NRC: Net Sentiment Score per Source"
  ) +
  coord_flip()
# NRC "trust" words: counts per source and overall ----
tidy_nike %>%
  filter(source == "Nike - App Store") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

tidy_nike %>%
  filter(source == "Nike - Reddit") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

# Twenty most frequent trust words across every source.
tidy_nike %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Nike: Top Trust Words - Themes Driving Brand Loyalty"
  ) +
  coord_flip()
# Bigrams and trigrams around loyalty / trust words ----
loyalty_terms <- c(
  "wear", "favorite", "recommend", "real", "true", "perfect",
  "happy", "strength", "love", "free", "comfortable", "trust"
)
# Label-like words that are excluded from the n-grams.
label_terms <- c("positive", "negative", "neutral")
# Tokens made only of digits.
numeric_token <- "^[0-9]+$"
# Straight or typographic apostrophe (drops contractions / possessives).
# NOTE(review): both forms are matched because the intended character
# could not be determined - confirm.
apostrophe <- "['\u2019]"

bigrams_filtered %>%
  filter(
    !str_detect(word1, numeric_token),
    !str_detect(word2, numeric_token),
    !str_detect(word1, apostrophe),
    !str_detect(word2, apostrophe),
    !word1 %in% label_terms,
    !word2 %in% label_terms,
    word1 %in% loyalty_terms | word2 %in% loyalty_terms
  ) %>%
  count(word1, word2, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n, fill = bigram)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Count",
    title = "Nike: Bigrams Around Key Loyalty and Trust Words"
  ) +
  coord_flip()

nike_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word,
    !str_detect(word1, numeric_token),
    !str_detect(word2, numeric_token),
    !str_detect(word3, numeric_token),
    !str_detect(word1, apostrophe),
    !str_detect(word2, apostrophe),
    !str_detect(word3, apostrophe),
    !word1 %in% label_terms,
    !word2 %in% label_terms,
    !word3 %in% label_terms,
    word1 %in% loyalty_terms |
      word2 %in% loyalty_terms |
      word3 %in% loyalty_terms
  ) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  slice_max(order_by = n, n = 5) %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%
  mutate(trigram = reorder(trigram, n)) %>%
  ggplot(aes(trigram, n, fill = trigram)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Count",
    title = "Nike: Trigrams Around Key Loyalty and Trust Words"
  ) +
  coord_flip()
# Trust words as a share of all trust-word occurrences ----

# Bar chart of each NRC trust word's share among the trust words in `tokens`.
#
# tokens: data frame with a `word` column (one row per token).
# title:  plot title.
# top:    if given, keep only the `top` words with the largest share
#         (ties kept); NULL plots every trust word.
# Returns a ggplot. Relies on the script-level `nrc_trust` table.
plot_trust_share <- function(tokens, title, top = NULL) {
  trust_share <- tokens %>%
    inner_join(nrc_trust, by = "word") %>%
    count(word, sort = TRUE) %>%
    mutate(proportion = n / sum(n))

  if (!is.null(top)) {
    trust_share <- trust_share %>%
      slice_max(order_by = proportion, n = top)
  }

  trust_share %>%
    mutate(word = reorder(word, proportion)) %>%
    ggplot(aes(word, proportion, fill = word)) +
    geom_col(show.legend = FALSE) +
    scale_y_continuous(labels = percent_format()) +
    labs(x = NULL, y = "Percentage of Trust Words", title = title) +
    coord_flip()
}

plot_trust_share(
  filter(tidy_nike, source == "Nike - App Store"),
  "App Store Reviews: Top Trust Words"
)
plot_trust_share(
  filter(tidy_nike, source == "Nike - Product Reviews"),
  "Nike Product Reviews: Top Trust Words"
)
plot_trust_share(
  filter(tidy_nike, source == "Nike - Reddit"),
  "Reddit Comments: Top Trust Words"
)
plot_trust_share(
  tidy_nike,
  "Nike: Top Trust Words in Customer Reviews",
  top = 20
)
# Structure the Nike product reviews document ----
product_raw <- nike_df %>%
  filter(source == "Nike - Product Reviews")

names(product_raw)
glimpse(product_raw)
head(product_raw, 1)

# Fail with a clear message instead of a subscript error when the
# product-review document was not loaded.
stopifnot(
  "No 'Nike - Product Reviews' document found in nike_df" =
    nrow(product_raw) >= 1
)
product_text_blob <- product_raw$text[[1]]

# One row per non-empty line of the document.
# NOTE(review): the delimiter is taken to be a newline - confirm against the
# document layout.
product_lines <- tibble(
  raw_line = str_split(product_text_blob, "\n")[[1]]
) %>%
  mutate(raw_line = str_squish(raw_line)) %>%
  filter(raw_line != "")

head(product_lines, 20)

category_pattern <- "\\b(Pants|Hoodie|Sweatshirt|Leggings|Shoes|Jacket|Skirt|Dress|Shirt|Accessories|Accessorise)\\b"
audience_pattern <- "\\b(Women|Men|Kids|Everyone)\\b"

# Each line is parsed as "<product name> <category> ... <audience> <review>":
#   product_name = text before the first category word,
#   review_text  = text after the last audience word.
product_reviews_structured <- product_lines %>%
  mutate(
    audience = str_extract(raw_line, audience_pattern),
    product_category = str_extract(raw_line, category_pattern),
    product_name = str_trim(
      str_remove(raw_line, paste0(category_pattern, ".*$"))
    ),
    review_text = str_trim(
      str_remove(raw_line, paste0("^.*", audience_pattern))
    )
  ) %>%
  filter(!is.na(product_name), product_name != "") %>%
  filter(!is.na(product_category), product_category != "") %>%
  filter(!is.na(audience), audience != "") %>%
  filter(!is.na(review_text), review_text != "") %>%
  mutate(review_id = row_number())

glimpse(product_reviews_structured)
head(product_reviews_structured, 10)

table(product_reviews_structured$product_category)
table(product_reviews_structured$audience)
# Negative language per product ----
# Tokens of the review text: stop words, digit-only tokens and tokens of
# one or two characters are dropped.
product_tokens <- product_reviews_structured %>%
  unnest_tokens(word, review_text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "^[0-9]+$")) %>%
  filter(str_length(word) > 2)

glimpse(product_tokens)
head(product_tokens, 20)

product_negative_words <- product_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative")

negative_by_product <- product_negative_words %>%
  count(product_name, word, sort = TRUE)
head(negative_by_product, 20)

negative_volume_by_product <- product_negative_words %>%
  count(product_name, sort = TRUE)
negative_volume_by_product

review_volume_by_product <- product_reviews_structured %>%
  count(product_name, name = "review_count")

# Normalise by review count so heavily reviewed products are comparable.
negative_rate_by_product <- negative_volume_by_product %>%
  left_join(review_volume_by_product, by = "product_name") %>%
  mutate(negative_words_per_review = n / review_count) %>%
  arrange(desc(negative_words_per_review))
negative_rate_by_product
# Aspect dictionary used for the product-review analysis ----
product_aspect_dictionary <- tribble(
  ~word, ~aspect,
  "size", "Fit/Sizing",
  "fit", "Fit/Sizing",
  "fits", "Fit/Sizing",
  "half", "Fit/Sizing",
  "oversized", "Fit/Sizing",
  "tight", "Fit/Sizing",
  "loose", "Fit/Sizing",
  "awkward", "Fit/Sizing",
  "long", "Fit/Sizing",
  "large", "Fit/Sizing",
  "small", "Fit/Sizing",

  "comfortable", "Comfort",
  "comfy", "Comfort",
  "comfort", "Comfort",
  "soft", "Comfort",
  "roomy", "Comfort",

  "quality", "Quality/Durability",
  "peeled", "Quality/Durability",
  "fell", "Quality/Durability",
  "cheap", "Quality/Durability",
  "shrink", "Quality/Durability",
  "shrunk", "Quality/Durability",
  "durable", "Quality/Durability",
  "problem", "Quality/Durability",
  "issues", "Quality/Durability",

  "style", "Style/Design",
  "design", "Style/Design",
  "color", "Style/Design",
  "colorway", "Style/Design",
  "fleece", "Style/Design",
  "bomber", "Style/Design",
  "beautiful", "Style/Design",
  "feminine", "Style/Design"
)
# Aspect sentiment per product line ----
# Only tokens present in BOTH the aspect dictionary and the Bing lexicon
# remain after the two inner joins.
product_aspect_sentiment <- product_tokens %>%
  inner_join(product_aspect_dictionary, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word")

product_aspect_summary <- product_aspect_sentiment %>%
  mutate(score = if_else(sentiment == "positive", 1, -1)) %>%
  group_by(product_name, aspect) %>%
  summarise(
    mentions = n(),
    net_sentiment = sum(score),
    avg_sentiment = mean(score),
    .groups = "drop"
  ) %>%
  arrange(product_name, avg_sentiment)
product_aspect_summary

# Which negative words drive each product / aspect combination.
product_negative_drivers <- product_aspect_sentiment %>%
  filter(sentiment == "negative") %>%
  count(product_name, aspect, word, sort = TRUE)
head(product_negative_drivers, 30)

product_aspect_summary %>%
  ggplot(aes(x = aspect, y = avg_sentiment, fill = aspect)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~product_name, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Aspect Sentiment by Product Line",
    x = NULL,
    y = "Average Sentiment Score"
  )
# Keyword lists for the sustainability / greenwashing analysis ----
# tibble() replaces the deprecated data_frame() constructor.
sustainability_keywords <- tibble(
  word = c(
    "sustainable", "sustainability", "eco", "recycle", "recycled",
    "recycling", "environment", "environmental", "planet", "carbon",
    "footprint", "renewable", "organic", "biodegradable", "climate",
    "emissions", "waste", "ethical", "responsible", "circular",
    "vegan", "natural", "conservation", "impact"
  )
)

greenwashing_keywords <- tibble(
  word = c(
    "greenwashing", "greenwash", "misleading", "mislead", "propaganda",
    "dishonest", "pretend", "pretending", "performative", "hypocrisy",
    "hypocrite", "virtue"
  )
)
sustainability_tokens <- nike_tokens %>% select(source, word) %>% inner_join(sustainability_keywords, by = “word”)
sustainability_tokens <- sustainability_tokens %>% mutate(brand = case_when( source %in% adidas_sources ~ “Adidas”, source %in% ua_sources ~ “Under Armour”, TRUE ~ “Nike” ))
sustainability_tokens %>% count(source, sort = TRUE) %>% mutate(source = reorder(source, n)) %>% ggplot(aes(source, n, fill = source)) + geom_col(show.legend = FALSE) + coord_flip() + labs( title = “Sustainability Mentions by Source”, subtitle = “Higher count = more sustainability language in that channel”, x = NULL, y = “Count” )
bing_lexicon <- get_sentiments(“bing”)
sustainability_tokens %>% inner_join(bing_lexicon, by = “word”) %>% count(source, sentiment) %>% spread(sentiment, n, fill = 0) %>% mutate(net_sentiment = positive - negative) %>% mutate(source = reorder(source, net_sentiment)) %>% ggplot(aes(source, net_sentiment, fill = net_sentiment > 0)) + geom_col(show.legend = FALSE) + coord_flip() + scale_fill_manual(values = c(“TRUE” = “steelblue”, “FALSE” = “tomato”)) + labs( title = “Net Sentiment Around Sustainability Keywords (Bing)”, subtitle = “Blue = net positive | Red = net negative”, x = NULL, y = “Positive minus Negative” )
sustainability_tokens %>% count(brand, sort = TRUE) %>% mutate(brand = reorder(brand, n)) %>% ggplot(aes(brand, n, fill = brand)) + geom_col(show.legend = FALSE) + coord_flip() + labs( title = “Sustainability Mentions: Nike vs. Adidas vs. Under Armour”, x = NULL, y = “Total Keyword Occurrences” )
bigrams_filtered %>% filter( word1 %in% sustainability_keywords\(word | word2 %in% sustainability_keywords\)word ) %>% count(word1, word2, sort = TRUE) %>% head(20) %>% unite(bigram, word1, word2, sep = ” “) %>% mutate(bigram = reorder(bigram, n)) %>% ggplot(aes(bigram, n)) + geom_col(fill =”steelblue”) + coord_flip() + labs( title = “Top Sustainability Bigrams”, subtitle = “Context check: ‘green’ and ‘carbon’ often appear in product contexts”, x = NULL, y = “Count” )
greenwashing_tokens <- nike_tokens %>% select(source, word) %>% inner_join(greenwashing_keywords, by = “word”)
greenwashing_tokens %>% count(source, sort = TRUE) %>% mutate(source = reorder(source, n)) %>% ggplot(aes(source, n, fill = source)) + geom_col(show.legend = FALSE) + coord_flip() + labs( title = “Greenwashing Language by Source”, subtitle = “Low counts = limited public accusation of greenwashing”, x = NULL, y = “Count” )
# Greenwashing keyword hits per brand. The brand is derived from the source
# label: anything not listed in adidas_sources or ua_sources falls through to
# "Nike".
greenwashing_tokens %>%
  mutate(
    brand = case_when(
      source %in% adidas_sources ~ "Adidas",
      source %in% ua_sources ~ "Under Armour",
      TRUE ~ "Nike"
    )
  ) %>%
  count(brand, sort = TRUE) %>%
  mutate(brand = reorder(brand, n)) %>%
  ggplot(aes(brand, n, fill = brand)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Greenwashing Language: Nike vs. Adidas vs. Under Armour",
    x = NULL,
    y = "Greenwashing Keyword Occurrences"
  )
# Single-word vocabulary related to price and value, as a one-column tibble
# (`word`) so it can be joined against token tables.
pricing_terms <- tibble(
  word = c(
    "overpriced", "expensive", "price", "priced", "cost", "costly",
    "value", "worth", "retail", "premium", "cheap", "affordable"
  )
)
# Documents whose lower-cased text mentions pricing language, kept as
# (source, text) pairs for manual inspection.
pricing_examples <- nike_df %>%
  filter(
    str_detect(
      str_to_lower(text),
      # `\\b` is the regex word boundary. A single-backslash "\b" in an R
      # string is a literal backspace character and would never match.
      "overpriced|\\bexpensive\\b|price tag|\\bworth\\b|\\bpremium\\b"
    )
  ) %>%
  select(source, text)

pricing_examples
# Packages used by the brand-comparison section.
pkgs <- c(
  "tidyverse", "tidytext", "textdata", "SnowballC", "ggplot2",
  "scales", "readxl", "officer", "stringr", "forcats"
)

# Install any package that is not yet available, then attach each one.
# invisible() suppresses the list that lapply() would otherwise print.
invisible(lapply(pkgs, function(p) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p, dependencies = TRUE)
  }
  library(p, character.only = TRUE)
}))

# Fixed seed for the code that follows.
set.seed(42)
# Start the comparison data set from the documents already loaded in raw_docs.
comparison_raw <- raw_docs

# Name the first two columns by position.
colnames(comparison_raw)[1] <- "source"
colnames(comparison_raw)[2] <- "text"

# Normalise the source label: drop any directory part, decode percent-encoded
# characters, and strip a trailing ".docx" (the dot is escaped so it matches
# literally).
comparison_raw$source <- basename(comparison_raw$source)
comparison_raw$source <- URLdecode(comparison_raw$source)
comparison_raw$source <- str_remove(comparison_raw$source, "\\.docx$")

comparison_df <- comparison_raw %>%
  select(source, text)

# Show the distinct source labels so the matching rules can be checked.
print(unique(comparison_df$source))
# Derive `brand` and `source_type` from the lower-cased source label.
# case_when() takes the first matching rule, so the order of the rules
# matters; labels that match no rule get NA.
comparison_df <- comparison_df %>%
  mutate(
    brand = case_when(
      str_detect(str_to_lower(source), "nike") ~ "Nike",
      str_detect(str_to_lower(source), "adidas") ~ "Adidas",
      str_detect(
        str_to_lower(source),
        "under armour|under_armour|underarmour"
      ) ~ "UnderArmour",
      TRUE ~ NA_character_
    ),
    source_type = case_when(
      str_detect(str_to_lower(source), "twitter") ~ "Twitter",
      str_detect(str_to_lower(source), "youtube") ~ "YouTube",
      str_detect(str_to_lower(source), "reddit") ~ "Reddit",
      # Checked before the review rule so labels containing both "app store"
      # and "review" are classified as App Store.
      str_detect(str_to_lower(source), "app store|appstore") ~ "App Store",
      str_detect(str_to_lower(source), "headline|headlines|news") ~ "News",
      str_detect(
        str_to_lower(source),
        "reviews website|product reviews|review"
      ) ~ "Product Reviews",
      TRUE ~ NA_character_
    )
  )

# Document count per brand / source type; NA rows reveal unmatched labels.
comparison_df %>%
  count(brand, source_type)
# Extra stop words for the brand comparison: brand names, generic product
# terms and filler words, as a one-column tibble (`word`) for anti_join().
custom_stopwords_comp <- tibble(
  word = c(
    "nike", "adidas", "under", "armour", "ua", "shoe", "shoes", "brand",
    "just", "really", "like", "get", "got", "one", "also", "can", "use",
    "app", "run", "running", "workout"
  )
)
# One row per word token, with standard and custom stop words removed.
comparison_tokens <- comparison_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(custom_stopwords_comp, by = "word") %>%
  # Drop tokens made up only of digits.
  # NOTE(review): the pattern read "5+$" in this copy, which removes words
  # ending in "5"; restored as the digits-only filter it appears to have
  # been -- confirm against the original script.
  filter(!str_detect(word, "^[0-9]+$")) %>%
  # Keep words of three or more characters.
  filter(str_length(word) > 2)

comparison_tokens
# Heatmap of NRC emotion shares per brand. The positive/negative polarity
# labels are dropped so only the emotion categories remain; `pct` is each
# emotion's share of that brand's emotion-word matches.
comparison_tokens %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(brand, sentiment) %>%
  group_by(brand) %>%
  mutate(pct = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = brand, y = sentiment, fill = pct)) +
  geom_tile(color = "white") +
  geom_text(aes(label = percent(pct, accuracy = 1)), size = 4) +
  scale_fill_gradient(
    low = "white",
    high = "steelblue",
    labels = percent_format()
  ) +
  labs(
    title = "NRC Emotion Intensity Heatmap",
    subtitle = "Proportion of emotion words per brand",
    x = NULL,
    y = "Emotion",
    fill = "% of Words"
  ) +
  theme_minimal()
# Top 15 words per brand by TF-IDF, treating each brand as one document.
comparison_tokens %>%
  count(brand, word, sort = TRUE) %>%
  bind_tf_idf(word, brand, n) %>%
  group_by(brand) %>%
  slice_max(order_by = tf_idf, n = 15, with_ties = FALSE) %>%
  ungroup() %>%
  # reorder_within() + scale_x_reordered() order the bars independently
  # inside each facet.
  mutate(word = reorder_within(word, tf_idf, brand)) %>%
  ggplot(aes(word, tf_idf, fill = brand)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~brand, scales = "free_y") +
  scale_x_reordered() +
  coord_flip() +
  labs(
    title = "TF-IDF: Signature Language of Each Brand",
    subtitle = "Words most uniquely associated with each brand vs. competitors",
    x = NULL,
    y = "TF-IDF Score"
  )