# Nike Text Analytics - Strategic Brand Intelligence

# A2: NLP Pipeline in R

#---------------------------------------------------------------#
# Herman Brurberg, Anna Pabst, Himay Panirwala                  #
# Mark Quitoy, Patricia Vargas, Ma Victoria Virasoro            #
#################################################################

# ---- Package setup ----
# Install only the packages that are not yet available, so that re-running
# the script does not reinstall every dependency each time.
required_packages <- c(
  "tidyverse", "tidytext", "stringr", "SnowballC", "topicmodels",
  "textdata", "scales", "tm", "readtext"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# One library() call per line: several calls on a single line without a
# separator is a syntax error in R.
library(tidyverse)
library(tidytext)
library(stringr)
library(dplyr)
library(ggplot2)
library(scales)
library(SnowballC)
library(topicmodels)
library(tm)
library(readtext)
library(tidyr)
library(textdata)

# ---- DATA PREPROCESSING ----

# Step 1: Set Your Folder Path and Load Documents

# CHANGE THIS to the folder where your .docx files are saved !!
folder_path <- "C:/Users/patri/OneDrive/Documents/Hult/MBAN/Business Analysis with Unstructured Data - Kurnicki/Nike/ALL DATA (N,A,UA)/New folder"

# Stop early with a readable message when the folder does not exist, instead
# of failing later inside readtext().
if (!dir.exists(folder_path)) {
  stop("Folder not found: ", folder_path, call. = FALSE)
}

# Load all .docx files from the folder automatically
raw_docs <- readtext(file.path(folder_path, "*.docx"))

# Check what's loaded - there should be one row per .docx file
print(raw_docs)

# Step 2: Build a Data Frame with Source Labels

# Rename the first two columns to the names used in the rest of the script
colnames(raw_docs)[1] <- "source"
colnames(raw_docs)[2] <- "text"

# Strip the .docx extension from the source names.
# The dot is escaped ("\\.") so that only a literal ".docx" suffix matches;
# a single backslash ("\.") is not a valid escape in an R string.
raw_docs$source <- str_remove(raw_docs$source, "\\.docx$")

# Check the source names loaded correctly
print(raw_docs$source)

nike_df <- raw_docs %>%
  select(source, text)

nike_df

# Run this to confirm exact source names
unique(nike_df$source)

# Recode sources to consistent, readable labels.
# Names are the raw file names (without extension); values are the clean
# labels used in every later filter and plot.
source_labels <- c(
  "Nike - App Store Reviews" = "Nike - App Store",
  "Nike - News_Headlines_NoYears" = "Nike - News",
  "Nike - Reddit_Comments_NoLinks" = "Nike - Reddit",
  "nike - Twitter mentions" = "Nike - Twitter",
  "Nike - Youtube_Comments_NoLinks" = "Nike - YouTube",
  "Nike Product Reviews" = "Nike - Product Reviews",
  "Adidas Twitter Mentions" = "Adidas - Twitter",
  "adidas_headlines" = "Adidas - News",
  "Adidas_Youtube_Comments" = "Adidas - YouTube",
  "appstore_adidas" = "Adidas - App Store",
  "reddit_adidas" = "Adidas - Reddit",
  "Adidas Reviews Website" = "Adidas - Product Reviews",
  "Under Armour Twitter Mentions" = "Under Armour - Twitter",
  "Under Armour Youtube Comments" = "Under Armour - YouTube",
  "under_armour_headlines" = "Under Armour - News",
  "appstore_underarmour" = "Under Armour - App Store",
  "reddit_underarmour" = "Under Armour - Reddit",
  "Under Armour Reviews Website" = "Under Armour - Product Reviews"
)

nike_df$source <- recode(nike_df$source, !!!source_labels)

# recode() leaves unmatched values untouched, so a misspelled file name would
# silently keep its raw name and drop out of every source filter below.
unrecognised_sources <- setdiff(unique(nike_df$source), unname(source_labels))
if (length(unrecognised_sources) > 0) {
  warning(
    "Sources without a clean label: ",
    paste(unrecognised_sources, collapse = ", "),
    call. = FALSE
  )
}

# VERIFICATION: all names should now be clean labels
print(unique(nike_df$source))

# Define source groupings per brand
nike_sources <- c(
  "Nike - App Store", "Nike - News", "Nike - Reddit",
  "Nike - Twitter", "Nike - YouTube", "Nike - Product Reviews"
)

adidas_sources <- c(
  "Adidas - Twitter", "Adidas - News", "Adidas - YouTube",
  "Adidas - App Store", "Adidas - Reddit", "Adidas - Product Reviews"
)

ua_sources <- c(
  "Under Armour - Twitter", "Under Armour - YouTube", "Under Armour - News",
  "Under Armour - App Store", "Under Armour - Reddit",
  "Under Armour - Product Reviews"
)

# Step 3: Tokenization
# One row per (source, word); unnest_tokens() lower-cases and strips
# punctuation by default.
nike_tokens <- nike_df %>%
  unnest_tokens(word, text)

nike_tokens

# Count before stopword removal
nike_tokens %>%
  count(word, sort = TRUE)

# Step 4: Stopword Removal
# Drops tidytext stop words and tokens that consist only of digits.
tidy_nike <- nike_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "^[0-9]+$"))

tidy_nike

# Top words after stopword removal
tidy_nike %>%
  count(word, sort = TRUE)

# Step 5: Stemming
# NOTE: stemmed tokens no longer match the unstemmed sentiment lexicons, so
# use tidy_nike (not tidy_nike_stem) for lexicon joins.
tidy_nike_stem <- tidy_nike %>%
  mutate(word = wordStem(word))

tidy_nike_stem %>%
  count(word, sort = TRUE)

# Step 6: Top Words per Source - Visualisation
# reorder_within() orders the bars inside each facet; a plain reorder() uses
# one global ordering, which misplaces words that occur in several sources.
tidy_nike %>%
  count(source, word, sort = TRUE) %>%
  group_by(source) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = tidytext::reorder_within(word, n, source)) %>%
  ggplot(aes(word, n, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, ncol = 3, scales = "free") +
  tidytext::scale_x_reordered() +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Top 10 Words per Source (Stopwords Removed)"
  ) +
  coord_flip()

# Step 7: N-grams - Bigrams
nike_bigrams <- nike_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

nike_bigrams %>%
  count(bigram, sort = TRUE)

# Remove stop words from bigrams: split each bigram into its two words and
# keep only pairs where neither word is a stop word.
bigrams_separated <- nike_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_counts

# Step 8: Quadrogram
# Same approach with four-word sequences; a sequence is dropped as soon as
# any of its words is a stop word.
nike_quadrogram <- nike_df %>%
  unnest_tokens(quadrogram, text, token = "ngrams", n = 4) %>%
  separate(quadrogram, c("word1", "word2", "word3", "word4"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  filter(!word4 %in% stop_words$word)

nike_quadrogram %>%
  count(word1, word2, word3, word4, sort = TRUE)

# Step 9: Document-Term Matrix (DTM)
# Rows are sources, columns are words, cells are word counts.
tidy_dtm <- nike_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(source, word, sort = TRUE) %>%
  cast_dtm(source, word, n)

tidy_dtm

# Preview at most 6 documents x 10 terms; the bounds are clamped so the
# preview does not fail when fewer documents or terms were loaded.
inspect(tidy_dtm[
  seq_len(min(6, nrow(tidy_dtm))),
  seq_len(min(10, ncol(tidy_dtm)))
])

# ---- SENTIMENT ANALYSIS ----

# Step 1: Load and Explore the Lexicons
afinn <- get_sentiments("afinn")
nrc <- get_sentiments("nrc")
bing <- get_sentiments("bing")

# Stack the three lexicons and tag each row with the lexicon it came from
sentiments <- bind_rows(
  mutate(afinn, lexicon = "afinn"),
  mutate(nrc, lexicon = "nrc"),
  mutate(bing, lexicon = "bing")
)

# Explore each lexicon
nrc_data <- subset(sentiments, lexicon == "nrc")
unique(nrc_data$sentiment) # joy, anger, trust, anticipation, fear, etc.

bing_data <- subset(sentiments, lexicon == "bing")
unique(bing_data$sentiment) # positive / negative

afinn_data <- subset(sentiments, lexicon == "afinn")
summary(afinn_data$value) # quantitative scale -5 to +5

# Step 2: NRC Emotion - Filter specific emotions
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

nrc_anger <- get_sentiments("nrc") %>%
  filter(sentiment == "anger")

nrc_trust <- get_sentiments("nrc") %>%
  filter(sentiment == "trust")

nrc_anticipation <- get_sentiments("nrc") %>%
  filter(sentiment == "anticipation")

# The join key is stated explicitly (by = "word") so the joins do not emit
# a "Joining with ..." message on every call.

# Joy words in Nike Product Reviews
tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)

# Anger words in Nike Product Reviews
tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_anger, by = "word") %>%
  count(word, sort = TRUE)

# Trust words in Nike Product Reviews
tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

# Anticipation words in Nike Product Reviews
tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_anticipation, by = "word") %>%
  count(word, sort = TRUE)

# Step 3: Comparing Sentiment Libraries - Per Source

# Score one source's tokens with AFINN, Bing and NRC and plot the three net
# scores in separate panels.
#   tokens: data frame with a `word` column (one row per token)
#   label:  text placed in front of ": Sentiment Across Lexicons" in the title
# Returns the ggplot object.
#
# Previously only the App Store block was plotted; the other five sources
# computed their scores and discarded them. The intermediate results also
# overwrote the `afinn` lexicon object, which this helper no longer does.
compare_lexicons <- function(tokens, label) {
  afinn_score <- tokens %>%
    inner_join(get_sentiments("afinn"), by = "word") %>%
    summarise(sentiment = sum(value)) %>%
    mutate(method = "AFINN")

  nrc_polarity <- get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative"))

  # Counting with sum() instead of spread() keeps both the `positive` and
  # `negative` columns even when a source has no words of one polarity.
  bing_and_nrc_score <- bind_rows(
    tokens %>%
      inner_join(get_sentiments("bing"), by = "word") %>%
      mutate(method = "Bing et al."),
    tokens %>%
      inner_join(nrc_polarity, by = "word") %>%
      mutate(method = "NRC")
  ) %>%
    group_by(method) %>%
    summarise(
      negative = sum(sentiment == "negative"),
      positive = sum(sentiment == "positive"),
      .groups = "drop"
    ) %>%
    mutate(sentiment = positive - negative)

  bind_rows(afinn_score, bing_and_nrc_score) %>%
    ggplot(aes(method, sentiment, fill = method)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~method, ncol = 1, scales = "free_y") +
    labs(title = paste0(label, ": Sentiment Across Lexicons"))
}

# Per-source token tables (also used by Step 4 below)
app_store <- tidy_nike %>%
  filter(source == "Nike - App Store")

product_reviews <- tidy_nike %>%
  filter(source == "Nike - Product Reviews")

reddit <- tidy_nike %>%
  filter(source == "Nike - Reddit")

twitter <- tidy_nike %>%
  filter(source == "Nike - Twitter")

youtube <- tidy_nike %>%
  filter(source == "Nike - YouTube")

news <- tidy_nike %>%
  filter(source == "Nike - News")

# App Store Reviews
compare_lexicons(app_store, "App Store Reviews")

# Nike Product Reviews
compare_lexicons(product_reviews, "Nike Product Reviews")

# Reddit Comments
compare_lexicons(reddit, "Reddit Comments")

# Twitter Mentions
compare_lexicons(twitter, "Twitter Mentions")

# YouTube Comments
compare_lexicons(youtube, "YouTube Comments")

# News Headlines
compare_lexicons(news, "News Headlines")

# Step 4: Most Common Positive and Negative Words (per source)

# Plot the ten most frequent Bing-positive and Bing-negative words of one
# source (ties at the tenth place are kept).
#   tokens: data frame with a `word` column (one row per token)
#   label:  text placed in front of the fixed title suffix
# Returns the ggplot object.
plot_sentiment_words <- function(tokens, label) {
  tokens %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    slice_max(order_by = n, n = 10) %>%
    ungroup() %>%
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free_y") +
    labs(
      y = "Contribution to sentiment",
      x = NULL,
      title = paste0(label, ": Most Common Positive and Negative Words")
    ) +
    coord_flip()
}

# App Store Reviews
plot_sentiment_words(app_store, "App Store Reviews")

# Nike Product Reviews
plot_sentiment_words(product_reviews, "Nike Product Reviews")

# Reddit Comments
plot_sentiment_words(reddit, "Reddit Comments")

# Twitter Mentions
plot_sentiment_words(twitter, "Twitter Mentions")

# YouTube Comments
plot_sentiment_words(youtube, "YouTube Comments")

# News Headlines
plot_sentiment_words(news, "News Headlines")

# Overall
# Uses the unstemmed tokens (tidy_nike): the Bing lexicon contains full word
# forms, so stemmed tokens such as "happi" would never match and the counts
# would be biased towards words that stemming leaves unchanged.
bing_counts <- tidy_nike %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)

bing_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(word = tidytext::reorder_within(word, n, sentiment)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  tidytext::scale_x_reordered() +
  labs(
    y = "Contribution to sentiment",
    x = NULL,
    title = "Overall: Most Common Positive and Negative Words"
  ) +
  coord_flip()

# Source: Extra1_Day4_LDA_on_USA_and_EU_twitter-1.R

# Step 1: Build DTM for LDA
# Each source is one document.
nike_lda_dtm <- tidy_nike %>%
  count(source, word, sort = TRUE) %>%
  cast_dtm(source, word, n)

# Step 2: Run LDA Model
# Fixed seed so the four topics are reproducible between runs.
nike_lda <- LDA(nike_lda_dtm, k = 4, control = list(seed = 123))
nike_lda

# Step 3: Per-topic per-word probabilities (beta)
nike_topics <- tidy(nike_lda, matrix = "beta")
nike_topics

top_terms <- nike_topics %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms

# Step 4: Plot top terms per topic
# reorder_within() sorts terms inside each topic panel; a term that ranks
# high in several topics would otherwise be ordered by its pooled beta.
top_terms %>%
  mutate(term = tidytext::reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  tidytext::scale_x_reordered() +
  coord_flip() +
  labs(
    title = "Nike LDA Topic Modeling - Top Terms per Topic",
    x = NULL,
    y = "Beta (Word Probability)"
  )

# Step 5: Per-document topic distribution (gamma)
nike_gamma <- tidy(nike_lda, matrix = "gamma")
nike_gamma

nike_gamma %>%
  mutate(document = reorder(document, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~document) +
  labs(
    title = "Topic Distribution per Source",
    x = "Topic",
    y = "Gamma (Topic Probability)"
  )

# ---- BUSINESS-DRIVEN TEXT ANALYTICS ----

# 1: TF-IDF Brand Differentiation

# Extra words removed on top of the standard stop words
custom_stopwords <- tibble(word = c("nike", "just", "im", "ive"))

tidy_nike_tfidf <- tidy_nike %>%
  anti_join(custom_stopwords, by = "word")

# Word counts per source, then tf, idf and tf-idf with each source treated
# as one document.
tfidf_source <- tidy_nike_tfidf %>%
  count(source, word, sort = TRUE)

tfidf_source <- tfidf_source %>%
  bind_tf_idf(term = word, document = source, n = n)

# Ten highest tf-idf words per source (ties broken arbitrarily)
top_tfidf_source <- tfidf_source %>%
  group_by(source) %>%
  slice_max(order_by = tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup()

top_tfidf_source %>%
  mutate(word = tidytext::reorder_within(word, tf_idf, source)) %>%
  ggplot(aes(word, tf_idf, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, scales = "free_y") +
  tidytext::scale_x_reordered() +
  coord_flip() +
  labs(
    title = "TF-IDF: Most Distinctive Words by Source",
    x = NULL,
    y = "TF-IDF"
  )

# 2: Aspect-Based Sentiment

# ---- Aspect Dictionary ----
# Hand-built lookup from a single word to the business aspect it signals.
aspect_dictionary <- tribble(
  ~word, ~aspect,
  "size", "Fit/Sizing",
  "fit", "Fit/Sizing",
  "fits", "Fit/Sizing",
  "half", "Fit/Sizing",
  "oversized", "Fit/Sizing",
  "sizing", "Fit/Sizing",
  "tight", "Fit/Sizing",
  "loose", "Fit/Sizing",

  "comfortable", "Comfort",
  "comfy", "Comfort",
  "comfort", "Comfort",
  "soft", "Comfort",
  "cushion", "Comfort",

  "quality", "Quality/Durability",
  "peeled", "Quality/Durability",
  "cheap", "Quality/Durability",
  "durable", "Quality/Durability",
  "fell", "Quality/Durability",
  "problem", "Quality/Durability",
  "awkward", "Quality/Durability",

  "style", "Style/Design",
  "design", "Style/Design",
  "colorway", "Style/Design",
  "look", "Style/Design",
  "black", "Style/Design",
  "fleece", "Style/Design",
  "bomber", "Style/Design",

  "app", "App Experience",
  "workout", "App Experience",
  "workouts", "App Experience",
  "fitness", "App Experience",
  "crashing", "App Experience",
  "crashes", "App Experience",
  "bugs", "App Experience",
  "issue", "App Experience",
  "issues", "App Experience",
  "frustrating", "App Experience",

  "revenue", "Corporate/Business",
  "strategy", "Corporate/Business",
  "investor", "Corporate/Business",
  "stock", "Corporate/Business",
  "ceo", "Corporate/Business",
  "wholesale", "Corporate/Business",
  "relations", "Corporate/Business",

  "team", "Sports/Cultural Visibility",
  "national", "Sports/Cultural Visibility",
  "shirt", "Sports/Cultural Visibility",
  "jersey", "Sports/Cultural Visibility",
  "cup", "Sports/Cultural Visibility",
  "brazil", "Sports/Cultural Visibility",
  "football", "Sports/Cultural Visibility"
)

# ---- Map words to aspects ----
aspect_terms <- tidy_nike %>%
  inner_join(aspect_dictionary, by = "word")

# ---- Attach Bing sentiment ----
# NOTE(review): the join is on the aspect word itself, so only aspect words
# that also appear in the Bing lexicon survive; aspects whose dictionary words
# are not in Bing drop out entirely. Confirm this is the intended method.
aspect_sentiment_bing <- aspect_terms %>%
  inner_join(get_sentiments("bing"), by = "word")

# ---- Sentiment counts by aspect ----
aspect_summary_bing <- aspect_sentiment_bing %>%
  count(aspect, sentiment, sort = TRUE)

aspect_summary_bing

# Positive words score +1, negative words -1; computed once and reused for
# both the overall and the per-source summaries.
aspect_scored_bing <- aspect_sentiment_bing %>%
  mutate(score = if_else(sentiment == "positive", 1, -1))

# ---- Net sentiment by aspect ----
aspect_net_bing <- aspect_scored_bing %>%
  group_by(aspect) %>%
  summarise(
    mentions = n(),
    net_sentiment = sum(score),
    avg_sentiment = mean(score),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_sentiment))

aspect_net_bing

# ---- Plot positive vs negative counts by aspect ----
aspect_summary_bing %>%
  ggplot(aes(x = aspect, y = n, fill = sentiment)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Aspect-Based Sentiment Analysis (Bing)",
    x = NULL,
    y = "Word Count"
  )

# ---- Aspect sentiment by source ----
aspect_by_source_bing <- aspect_scored_bing %>%
  group_by(source, aspect) %>%
  summarise(
    mentions = n(),
    net_sentiment = sum(score),
    avg_sentiment = mean(score),
    .groups = "drop"
  )

aspect_by_source_bing

# ---- Plot average sentiment by aspect and source ----
aspect_by_source_bing %>%
  ggplot(aes(x = aspect, y = avg_sentiment, fill = aspect)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source) +
  coord_flip() +
  labs(
    title = "Aspect-Based Sentiment by Source",
    x = NULL,
    y = "Average Sentiment Score"
  )

# 3: Keyword Co-occurrence Network

# Install the network packages only when they are missing. All three are
# loaded below, so all three are checked.
network_packages <- c("widyr", "igraph", "ggraph")
missing_network_packages <- setdiff(
  network_packages,
  rownames(installed.packages())
)
if (length(missing_network_packages) > 0) {
  install.packages(missing_network_packages)
}

library(widyr)
library(igraph)
library(ggraph)

# Pairwise correlations between words.
# Words occurring fewer than 5 times are dropped first; the grouping is
# removed before the correlation step so it does not leak into later verbs.
word_cors <- tidy_nike %>%
  group_by(word) %>%
  filter(n() >= 5) %>%
  ungroup() %>%
  pairwise_cor(word, source, sort = TRUE)

word_cors

# Bar chart of correlations for key Nike brand words: the five most
# correlated partner words for each key term (ties kept).
word_cors %>%
  filter(item1 %in% c("nike", "shoes", "quality", "price")) %>%
  group_by(item1) %>%
  slice_max(order_by = correlation, n = 5) %>%
  ungroup() %>%
  mutate(item2 = tidytext::reorder_within(item2, correlation, item1)) %>%
  ggplot(aes(item2, correlation)) +
  geom_col() +
  facet_wrap(~item1, scales = "free") +
  tidytext::scale_x_reordered() +
  coord_flip() +
  labs(title = "Word Correlations with Key Nike Terms")

# ---- Brand Perception ----

# Q1: How is Nike emotionally positioned in online conversations?

# Overall counts for the eight NRC emotions (positive/negative excluded)
tidy_nike %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment %in% c(
    "joy", "anger", "trust", "anticipation",
    "fear", "sadness", "surprise", "disgust"
  )) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(sentiment, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Nike: Overall Emotional Positioning Across All Sources"
  ) +
  coord_flip()

# Emotion profile broken down per source, as a share of that source's
# NRC-matched words (this view also includes positive/negative).
tidy_nike %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment %in% c(
    "joy", "anger", "trust", "anticipation", "fear",
    "sadness", "surprise", "disgust", "positive", "negative"
  )) %>%
  count(source, sentiment) %>%
  group_by(source) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  mutate(sentiment = reorder(sentiment, proportion)) %>%
  ggplot(aes(sentiment, proportion, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source, ncol = 3, scales = "free_y") +
  scale_y_continuous(labels = percent_format()) +
  labs(
    x = NULL,
    y = "Percentage of Emotional Words",
    title = "Nike: Emotional Breakdown per Source (% of words)"
  ) +
  coord_flip()

# AFINN: score per source
tidy_nike %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(source) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(source = reorder(source, sentiment)) %>%
  ggplot(aes(source, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Sentiment Score",
    title = "AFINN: Sentiment Intensity per Source"
  ) +
  coord_flip()

# Bing: net score per source.
# Counting with sum() keeps both polarity columns even if one polarity never
# occurs, which spread() followed by `positive - negative` does not.
tidy_nike %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(source) %>%
  summarise(
    negative = sum(sentiment == "negative"),
    positive = sum(sentiment == "positive"),
    .groups = "drop"
  ) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(source = reorder(source, sentiment)) %>%
  ggplot(aes(source, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Net Sentiment Score (Positive - Negative)",
    title = "Bing: Net Sentiment Score per Source"
  ) +
  coord_flip()

# NRC: net score per source
tidy_nike %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  group_by(source) %>%
  summarise(
    negative = sum(sentiment == "negative"),
    positive = sum(sentiment == "positive"),
    .groups = "drop"
  ) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(source = reorder(source, sentiment)) %>%
  ggplot(aes(source, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Net Sentiment Score (Positive - Negative)",
    title = "NRC: Net Sentiment Score per Source"
  ) +
  coord_flip()

# Q2: What themes are most associated with brand loyalty?

# NRC Trust words in App Store Reviews
tidy_nike %>%
  filter(source == "Nike - App Store") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

# Trust words in Nike Product Reviews
tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

# Trust words in Reddit
tidy_nike %>%
  filter(source == "Nike - Reddit") %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE)

# Plot top trust words across all sources combined (top 20 by count, ties
# at the cut-off kept)
tidy_nike %>%
  inner_join(nrc_trust, by = "word") %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Nike: Top Trust Words - Themes Driving Brand Loyalty"
  ) +
  coord_flip()

# Shared filters for the loyalty n-gram plots below.
# Seed words that mark a loyalty / trust context:
loyalty_words <- c(
  "wear", "favorite", "recommend", "real", "true", "perfect",
  "happy", "strength", "love", "free", "comfortable", "trust"
)
# Label words excluded from the n-grams:
label_words <- c("positive", "negative", "neutral")
# Tokens made only of digits:
digits_only <- "^[0-9]+$"
# Tokens containing an apostrophe (contractions, possessives). Both the
# straight and the curly forms are matched, since either can appear in the
# source text.
apostrophe <- "['\u2018\u2019]"

# Bigrams around loyalty and trust words
bigrams_filtered %>%
  filter(!str_detect(word1, digits_only)) %>%
  filter(!str_detect(word2, digits_only)) %>%
  filter(!str_detect(word1, apostrophe)) %>%
  filter(!str_detect(word2, apostrophe)) %>%
  filter(!word1 %in% label_words) %>%
  filter(!word2 %in% label_words) %>%
  filter(word1 %in% loyalty_words | word2 %in% loyalty_words) %>%
  count(word1, word2, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n, fill = bigram)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Count",
    title = "Nike: Bigrams Around Key Loyalty and Trust Words"
  ) +
  coord_flip()

# Trigrams around loyalty and trust words
nike_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word3 %in% stop_words$word) %>%
  filter(!str_detect(word1, digits_only)) %>%
  filter(!str_detect(word2, digits_only)) %>%
  filter(!str_detect(word3, digits_only)) %>%
  filter(!str_detect(word1, apostrophe)) %>%
  filter(!str_detect(word2, apostrophe)) %>%
  filter(!str_detect(word3, apostrophe)) %>%
  filter(!word1 %in% label_words) %>%
  filter(!word2 %in% label_words) %>%
  filter(!word3 %in% label_words) %>%
  filter(
    word1 %in% loyalty_words |
      word2 %in% loyalty_words |
      word3 %in% loyalty_words
  ) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  slice_max(order_by = n, n = 5) %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%
  mutate(trigram = reorder(trigram, n)) %>%
  ggplot(aes(trigram, n, fill = trigram)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "Count",
    title = "Nike: Trigrams Around Key Loyalty and Trust Words"
  ) +
  coord_flip()

# In percentage

# Plot NRC trust words as a share of all trust-word occurrences in `tokens`.
#   tokens:    data frame with a `word` column (one row per token)
#   title:     plot title
#   top_words: keep only the `top_words` largest shares (ties kept);
#              the default Inf keeps every word
# Returns the ggplot object.
plot_trust_share <- function(tokens, title, top_words = Inf) {
  trust_share <- tokens %>%
    inner_join(nrc_trust, by = "word") %>%
    count(word, sort = TRUE) %>%
    mutate(proportion = n / sum(n))

  # The share is computed over all trust words before trimming, so the
  # percentages of the top words are relative to the full set.
  if (is.finite(top_words)) {
    trust_share <- trust_share %>%
      slice_max(order_by = proportion, n = top_words)
  }

  trust_share %>%
    mutate(word = reorder(word, proportion)) %>%
    ggplot(aes(word, proportion, fill = word)) +
    geom_col(show.legend = FALSE) +
    scale_y_continuous(labels = percent_format()) +
    labs(x = NULL, y = "Percentage of Trust Words", title = title) +
    coord_flip()
}

# Trust words in App Store Reviews
tidy_nike %>%
  filter(source == "Nike - App Store") %>%
  plot_trust_share("App Store Reviews: Top Trust Words")

# Trust words in Nike Product Reviews
tidy_nike %>%
  filter(source == "Nike - Product Reviews") %>%
  plot_trust_share("Nike Product Reviews: Top Trust Words")

# Trust words in Reddit
tidy_nike %>%
  filter(source == "Nike - Reddit") %>%
  plot_trust_share("Reddit Comments: Top Trust Words")

# Plot top trust words across all sources combined
tidy_nike %>%
  plot_trust_share("Nike: Top Trust Words in Customer Reviews", top_words = 20)

# ---- PRODUCT STRATEGY ANALYSIS ----

# 1. Isolate Nike Product Reviews from the full dataset
product_raw <- nike_df %>%
  filter(source == "Nike - Product Reviews")

# Inspect structure
names(product_raw)
glimpse(product_raw)
head(product_raw, 1)

# 2. Extract the single text blob from the product review source
product_text_blob <- product_raw$text[[1]]

# Split the blob into individual lines.
# The split pattern is a newline; an empty pattern ("") would split the text
# into single characters instead of lines.
product_lines <- tibble(
  raw_line = str_split(product_text_blob, "\n")[[1]]
) %>%
  mutate(raw_line = str_squish(raw_line)) %>%
  filter(raw_line != "")

# Inspect the first few raw lines
head(product_lines, 20)

# 3. Parse each line into structured fields
# NOTE(review): the code that builds `product_reviews_structured` from
# `product_lines` is not present in this script. The steps below read the
# columns product_name, product_category, audience and review_text from it.
# Restore the parsing step here; until then the script stops with a clear
# message instead of an "object not found" error further down.
if (!exists("product_reviews_structured")) {
  stop(
    "`product_reviews_structured` is not defined: the parsing step that ",
    "creates it from `product_lines` is missing.",
    call. = FALSE
  )
}

# Inspect parsed structure
glimpse(product_reviews_structured)
head(product_reviews_structured, 10)

# Optional checks
table(product_reviews_structured$product_category)
table(product_reviews_structured$audience)

# 4. Tokenize product reviews
# Drops stop words, digit-only tokens and tokens of one or two characters.
product_tokens <- product_reviews_structured %>%
  unnest_tokens(word, review_text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "^[0-9]+$")) %>%
  filter(str_length(word) > 2)

# Inspect tokenized output
glimpse(product_tokens)
head(product_tokens, 20)

# 5. Identify negative words by product line
product_negative_words <- product_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative")

negative_by_product <- product_negative_words %>%
  count(product_name, word, sort = TRUE)

head(negative_by_product, 20)

# 6. Measure negative complaint volume by product line
negative_volume_by_product <- product_negative_words %>%
  count(product_name, sort = TRUE)

negative_volume_by_product

# 7. Normalize complaint volume by number of reviews
# Divides the negative-word count by the number of rows (reviews) per
# product, so products with many reviews are not penalised for volume alone.
review_volume_by_product <- product_reviews_structured %>%
  count(product_name, name = "review_count")

negative_rate_by_product <- negative_volume_by_product %>%
  left_join(review_volume_by_product, by = "product_name") %>%
  mutate(negative_words_per_review = n / review_count) %>%
  arrange(desc(negative_words_per_review))

negative_rate_by_product

# 8. Define aspect dictionary for product strategy
# Product-review specific lookup from a single word to an aspect.
product_aspect_dictionary <- tribble(
  ~word, ~aspect,
  "size", "Fit/Sizing",
  "fit", "Fit/Sizing",
  "fits", "Fit/Sizing",
  "half", "Fit/Sizing",
  "oversized", "Fit/Sizing",
  "tight", "Fit/Sizing",
  "loose", "Fit/Sizing",
  "awkward", "Fit/Sizing",
  "long", "Fit/Sizing",
  "large", "Fit/Sizing",
  "small", "Fit/Sizing",

  "comfortable", "Comfort",
  "comfy", "Comfort",
  "comfort", "Comfort",
  "soft", "Comfort",
  "roomy", "Comfort",

  "quality", "Quality/Durability",
  "peeled", "Quality/Durability",
  "fell", "Quality/Durability",
  "cheap", "Quality/Durability",
  "shrink", "Quality/Durability",
  "shrunk", "Quality/Durability",
  "durable", "Quality/Durability",
  "problem", "Quality/Durability",
  "issues", "Quality/Durability",

  "style", "Style/Design",
  "design", "Style/Design",
  "color", "Style/Design",
  "colorway", "Style/Design",
  "fleece", "Style/Design",
  "bomber", "Style/Design",
  "beautiful", "Style/Design",
  "feminine", "Style/Design"
)

# 9. Join aspects and sentiment
# NOTE(review): both joins are on the same word, so only aspect words that
# are also in the Bing lexicon are kept. Confirm this is intended.
product_aspect_sentiment <- product_tokens %>%
  inner_join(product_aspect_dictionary, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word")

# Summarize sentiment by product line and aspect (+1 positive, -1 negative)
product_aspect_summary <- product_aspect_sentiment %>%
  mutate(score = if_else(sentiment == "positive", 1, -1)) %>%
  group_by(product_name, aspect) %>%
  summarise(
    mentions = n(),
    net_sentiment = sum(score),
    avg_sentiment = mean(score),
    .groups = "drop"
  ) %>%
  arrange(product_name, avg_sentiment)

product_aspect_summary

# 10. Extract negative complaint drivers
product_negative_drivers <- product_aspect_sentiment %>%
  filter(sentiment == "negative") %>%
  count(product_name, aspect, word, sort = TRUE)

head(product_negative_drivers, 30)

# 11. Plot aspect sentiment by product line
product_aspect_summary %>%
  ggplot(aes(x = aspect, y = avg_sentiment, fill = aspect)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~product_name, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Aspect Sentiment by Product Line",
    x = NULL,
    y = "Average Sentiment Score"
  )

Sustainability Narrative

sustainability_keywords <- data_frame( word = c( “sustainable”, “sustainability”, “eco”, “recycle”, “recycled”, “recycling”, “environment”, “environmental”, “planet”, “carbon”, “footprint”, “renewable”, “organic”, “biodegradable”, “climate”, “emissions”, “waste”, “ethical”, “responsible”, “circular”, “vegan”, “natural”, “conservation”, “impact” ) )

Greenwashing keywords - only clearly skeptical/critical words

greenwashing_keywords <- data_frame( word = c( “greenwashing”, “greenwash”, “misleading”, “mislead”, “propaganda”, “dishonest”, “pretend”, “pretending”, “performative”, “hypocrisy”, “hypocrite”, “virtue” ) )

Search across ALL raw tokens so no signal word is lost

# Reduce nike_tokens to its source and word columns and keep only the rows
# whose word is one of the sustainability keywords.
sustainability_tokens <- inner_join(
  select(nike_tokens, source, word),
  sustainability_keywords,
  by = "word"
)

Tag each row with its brand using case_when (safer than nested ifelse)

# Add a `brand` column derived from the source label: Adidas and Under
# Armour sources are matched against their source lists, and every other
# source falls through to Nike.
sustainability_tokens <- mutate(
  sustainability_tokens,
  brand = case_when(
    source %in% adidas_sources ~ "Adidas",
    source %in% ua_sources ~ "Under Armour",
    TRUE ~ "Nike"
  )
)

Q1: Is Nike’s sustainability messaging resonating?

Visual 1: volume of sustainability language per channel

# Horizontal bar chart of how many sustainability keywords occur in each
# source, with the bars ordered by their count.
sustainability_tokens %>%
  count(source, sort = TRUE) %>%
  mutate(source = reorder(source, n)) %>%
  ggplot(mapping = aes(x = source, y = n, fill = source)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Sustainability Mentions by Source",
    subtitle = "Higher count = more sustainability language in that channel",
    x = NULL,
    y = "Count"
  )

Visual 2: tone of sustainability language per channel

# Bing lexicon: one row per word with a positive / negative label.
bing_lexicon <- get_sentiments("bing")

# Net Bing sentiment (positive minus negative) of the sustainability
# keywords, per source.
# The polarity counts are summed explicitly instead of spreading the
# `sentiment` column: both `positive` and `negative` then always exist, even
# when one polarity never occurs in the matched keywords. With spread() the
# missing column made `positive - negative` fail.
sustainability_tokens %>%
  inner_join(bing_lexicon, by = "word") %>%
  group_by(source) %>%
  summarise(
    positive = sum(sentiment == "positive"),
    negative = sum(sentiment == "negative"),
    .groups = "drop"
  ) %>%
  mutate(net_sentiment = positive - negative) %>%
  mutate(source = reorder(source, net_sentiment)) %>%
  ggplot(aes(source, net_sentiment, fill = net_sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  # Keys are the character forms of the logical fill values.
  scale_fill_manual(values = c("TRUE" = "steelblue", "FALSE" = "tomato")) +
  labs(
    title = "Net Sentiment Around Sustainability Keywords (Bing)",
    subtitle = "Blue = net positive | Red = net negative",
    x = NULL,
    y = "Positive minus Negative"
  )

Visual 3: does Nike lead the sustainability conversation vs competitors?

# Horizontal bar chart of total sustainability keyword occurrences per
# brand, with the bars ordered by their count.
sustainability_tokens %>%
  count(brand, sort = TRUE) %>%
  mutate(brand = reorder(brand, n)) %>%
  ggplot(mapping = aes(x = brand, y = n, fill = brand)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Sustainability Mentions: Nike vs. Adidas vs. Under Armour",
    x = NULL,
    y = "Total Keyword Occurrences"
  )

Sustainability Bigrams

# Twenty most frequent bigrams in which at least one of the two words is a
# sustainability keyword, drawn as a horizontal bar chart.
# NOTE(review): the subtitle mentions 'green', which is not in
# sustainability_keywords - confirm the wording.
bigrams_filtered %>%
  filter(
    word1 %in% sustainability_keywords$word |
      word2 %in% sustainability_keywords$word
  ) %>%
  count(word1, word2, sort = TRUE) %>%
  head(n = 20) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(mapping = aes(x = bigram, y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top Sustainability Bigrams",
    subtitle = "Context check: 'green' and 'carbon' often appear in product contexts",
    x = NULL,
    y = "Count"
  )

Q2: Is greenwashing language appearing?

# Reduce nike_tokens to its source and word columns and keep only the rows
# whose word is one of the greenwashing keywords.
greenwashing_tokens <- inner_join(
  select(nike_tokens, source, word),
  greenwashing_keywords,
  by = "word"
)

Visual 1: greenwashing language by source channel

# Horizontal bar chart of greenwashing keyword counts per source, with the
# bars ordered by their count.
greenwashing_tokens %>%
  count(source, sort = TRUE) %>%
  mutate(source = reorder(source, n)) %>%
  ggplot(mapping = aes(x = source, y = n, fill = source)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Greenwashing Language by Source",
    subtitle = "Low counts = limited public accusation of greenwashing",
    x = NULL,
    y = "Count"
  )

Visual 2: greenwashing language by brand

# Tag each greenwashing token with its brand (sources not listed for Adidas
# or Under Armour fall through to Nike), then chart keyword counts per brand.
greenwashing_tokens %>%
  mutate(
    brand = case_when(
      source %in% adidas_sources ~ "Adidas",
      source %in% ua_sources ~ "Under Armour",
      TRUE ~ "Nike"
    )
  ) %>%
  count(brand, sort = TRUE) %>%
  mutate(brand = reorder(brand, n)) %>%
  ggplot(mapping = aes(x = brand, y = n, fill = brand)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Greenwashing Language: Nike vs. Adidas vs. Under Armour",
    x = NULL,
    y = "Greenwashing Keyword Occurrences"
  )

PRICING SENSITIVITY ANALYSIS

Pricing / value language dictionary

# Single-column lookup (`word`) of pricing and value vocabulary.
pricing_terms <- tibble(
  word = c(
    "overpriced", "expensive", "price", "priced",
    "cost", "costly", "value", "worth",
    "retail", "premium", "cheap", "affordable"
  )
)

Pull example text snippets containing explicit pricing language

# Rows of nike_df whose lower-cased text contains explicit pricing language.
# Word boundaries are written "\\b": inside an R string a single "\b" is a
# backspace character, so the boundary-anchored alternatives would never
# match as intended.
# NOTE(review): nike_df appears to hold one row per source file, so this
# returns whole documents rather than short snippets - confirm.
pricing_examples <- nike_df %>%
  filter(
    str_detect(
      str_to_lower(text),
      "overpriced|\\bexpensive\\b|price tag|\\bworth\\b|\\bpremium\\b"
    )
  ) %>%
  select(source, text)

pricing_examples

Competitive Positioning

# Packages needed for the competitive-positioning section.
pkgs <- c(
  "tidyverse", "tidytext", "textdata", "SnowballC", "ggplot2",
  "scales", "readxl", "officer", "stringr", "forcats"
)

# Install any package that is not available yet, then attach each one.
# The return values are discarded; only the side effects matter.
invisible(vapply(
  pkgs,
  function(pkg) {
    available <- requireNamespace(pkg, quietly = TRUE)
    if (!available) {
      install.packages(pkg, dependencies = TRUE)
    }
    library(pkg, character.only = TRUE)
    TRUE
  },
  logical(1)
))

# Fix the random seed so any randomised steps below are reproducible.
set.seed(42)

Step 1: Load Competition Data

# Start the competitive comparison from the documents already loaded into
# raw_docs. raw_docs still carries the file-based source names; the readable
# labels were only recoded on nike_df.
comparison_raw <- raw_docs

Step 2: Clean Source Names

# Name the first two columns, then reduce each source to a clean file stem:
# drop any directory part, decode URL escapes and strip a trailing ".docx".
# The dot is escaped as "\\." so it matches a literal period; a bare "\."
# is not a valid escape inside an R string and stops the script from parsing.
colnames(comparison_raw)[1:2] <- c("source", "text")

comparison_raw$source <- comparison_raw$source %>%
  basename() %>%
  URLdecode() %>%
  str_remove("\\.docx$")

# Keep only the two columns used by the comparison.
comparison_df <- comparison_raw %>%
  select(source, text)

# Sanity check: list the distinct cleaned source names.
print(unique(comparison_df$source))

Step 3: Assign Brand and Source Type

# Derive `brand` and `source_type` from the source name. Matching is done on
# a lower-cased helper column that is dropped again afterwards. Within each
# case_when() the first matching pattern wins, so the order of the rules
# matters (e.g. "app store" is tested before "review"). Unmatched names
# become NA.
# NOTE(review): the brand label here is "UnderArmour", whereas the
# sustainability section uses "Under Armour" - confirm which is intended.
comparison_df <- comparison_df %>%
  mutate(source_lower = str_to_lower(source)) %>%
  mutate(
    brand = case_when(
      str_detect(source_lower, "nike") ~ "Nike",
      str_detect(source_lower, "adidas") ~ "Adidas",
      str_detect(source_lower, "under armour|under_armour|underarmour") ~ "UnderArmour",
      TRUE ~ NA_character_
    ),
    source_type = case_when(
      str_detect(source_lower, "twitter") ~ "Twitter",
      str_detect(source_lower, "youtube") ~ "YouTube",
      str_detect(source_lower, "reddit") ~ "Reddit",
      str_detect(source_lower, "app store|appstore") ~ "App Store",
      str_detect(source_lower, "headline|headlines|news") ~ "News",
      str_detect(source_lower, "reviews website|product reviews|review") ~ "Product Reviews",
      TRUE ~ NA_character_
    )
  ) %>%
  select(-source_lower)

# Check the brand / source-type combinations that were assigned.
count(comparison_df, brand, source_type)

Step 4: Tokenization and Stopword Removal

# Extra stopwords for the brand comparison: brand names and generic filler
# words listed here are removed in addition to the standard stop_words.
custom_stopwords_comp <- tibble(
  word = c(
    "nike", "adidas", "under", "armour", "ua", "shoe", "shoes", "brand",
    "just", "really", "like", "get", "got", "one", "also", "can", "use",
    "app", "run", "running", "workout"
  )
)

# Tokenise into single words, drop standard and custom stopwords, then drop
# purely numeric tokens and tokens of two characters or fewer.
# The numeric filter is restored to "^[0-9]+$": document rendering had
# garbled the pattern, leaving an expression that no longer matched digit
# strings.
comparison_tokens <- comparison_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(custom_stopwords_comp, by = "word") %>%
  filter(!str_detect(word, "^[0-9]+$")) %>%
  filter(str_length(word) > 2)

comparison_tokens

Step 5: Figure 3 - NRC Emotion Intensity Heatmap

# Heatmap of NRC emotions per brand. The lexicon's "positive" / "negative"
# polarity labels are removed so only emotion categories remain; each cell
# shows that emotion's share of the brand's emotion-word count.
# The legend title is corrected from "% ofWords" to "% of Words".
comparison_tokens %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(brand, sentiment) %>%
  group_by(brand) %>%
  mutate(pct = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x = brand, y = sentiment, fill = pct)) +
  geom_tile(color = "white") +
  geom_text(aes(label = percent(pct, accuracy = 1)), size = 4) +
  scale_fill_gradient(
    low = "white",
    high = "steelblue",
    labels = percent_format()
  ) +
  labs(
    title = "NRC Emotion Intensity Heatmap",
    subtitle = "Proportion of emotion words per brand",
    x = NULL,
    y = "Emotion",
    fill = "% of Words"
  ) +
  theme_minimal()

Step 6: Figure 4 - TF-IDF Signature Language

# TF-IDF with each brand treated as one document: keep the 15 highest
# scoring words per brand (ties broken arbitrarily) and plot them in one
# panel per brand, ordered within each panel.
comparison_tokens %>%
  count(brand, word, sort = TRUE) %>%
  bind_tf_idf(word, brand, n) %>%
  group_by(brand) %>%
  slice_max(order_by = tf_idf, n = 15, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, brand)) %>%
  ggplot(mapping = aes(x = word, y = tf_idf, fill = brand)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~brand, scales = "free_y") +
  # Strips the suffix reorder_within() adds to keep per-panel ordering.
  scale_x_reordered() +
  coord_flip() +
  labs(
    title = "TF-IDF: Signature Language of Each Brand",
    subtitle = "Words most uniquely associated with each brand vs. competitors",
    x = NULL,
    y = "TF-IDF Score"
  )

0-9↩︎
0-9↩︎
0-9↩︎
0-9↩︎
0-9↩︎

A2 FINAL Nike Team 9

2026-03-27

Nike Text Analytics - Strategic Brand Intelligence

A2: NLP Pipeline in R

install.packages(“tidyverse”)

install.packages(“tidytext”)

install.packages(“stringr”)

install.packages(“SnowballC”)

install.packages(“topicmodels”)

install.packages(“textdata”)

install.packages(“scales”)

install.packages(“tm”)

install.packages(“readtext”)

DATA PREPROCESSING

Step 1: Set Your Folder Path and Load Documents

CHANGE THIS to the folder where your .docx files are saved !!

Load all .docx files from the folder automatically

Check what’s loaded - you should see one row per .docx file (18 when all Nike, Adidas, and Under Armour files are in the folder)

Step 2: Build a Data Frame with Source Labels

Rename columns

Strip the .docx extension from the source names

Check the source names loaded correctly

Recode sources to consistent, readable labels

VERIFICATION: all names should now be clean labels

Define source groupings per brand

Step 3: Tokenization

Count before stopword removal

Step 4: Stopword Removal

Top words after stopword removal

Step 5: Stemming

Step 6: Top Words per Source - Visualisation

Step 7: N-grams - Bigrams

Remove stop words from bigrams

Step 8: Quadrogram

Step 9: Document-Term Matrix (DTM)

SENTIMENT ANALYSIS

Step 1: Load and Explore the Lexicons

Explore each lexicon

Step 2: NRC Emotion - Filter specific emotions

Joy words in Nike Product Reviews

Anger words in Nike Product Reviews

Trust words in Nike Product Reviews

Anticipation words in Nike Product Reviews

Step 3: Comparing Sentiment Libraries - Per Source

App Store Reviews

Nike Product Reviews

Reddit Comments

Twitter Mentions

YouTube Comments

News Headlines

Step 4: Most Common Positive and Negative Words (per source)

App Store Reviews

Nike Product Reviews

Reddit Comments

Twitter Mentions

YouTube Comments

News Headlines

Overall

Source: Extra1_Day4_LDA_on_USA_and_EU_twitter-1.R

Step 1: Build DTM for LDA

Step 2: Run LDA Model

Step 3: Per-topic per-word probabilities (beta)

Step 4: Plot top terms per topic

Step 5: Per-document topic distribution (gamma)

BUSINESS-DRIVEN TEXT ANALYTICS

1: TF-IDF Brand Differentiation

2: Aspect-Based Sentiment

—- Aspect Dictionary —-

—- Map words to aspects —-

—- Attach Bing sentiment —-

—- Sentiment counts by aspect —-

—- Net sentiment by aspect —-

—- Plot positive vs negative counts by aspect —-

—- Aspect sentiment by source —-

—- Plot average sentiment by aspect and source —-

3: Keyword Co-occurrence Network

install.packages(“widyr”)

Pairwise correlations between words

Bar chart of correlations for key Nike brand words

Brand Perception