Code
library(ParseR)

library(LimpiaR)
library(devtools)
library(tidyverse)
library(lubridate)
library(DT)
library(DisplayR)
library(ggplot2)
library(wordcloud)
library(gridExtra)
library(knitr)
library(plotly)
library(scales)
library(patchwork)
library(viridis)
library(tidytext)


tiktok_raw <- readRDS('~/Google Drive/My Drive/data_science_project_work/loreal/loreal_im_landscape/data/raw_data/rev1/tiktok.rds')
Code
cat("Our dataset has", nrow(tiktok_raw), "videos and", ncol(tiktok_raw), "variables")
Our dataset has 249838 videos and 20 variables

We want to look at the relationship between the video content against post descriptions and audio transcriptions. We’ll be looking at text content characteristics, any missing data patterns and verify accuracy.

1 Data Overview

Let’s look at what we have:

Code
# Check columns
colnames(tiktok_raw)
 [1] "source_post_id"      "post_url"            "post_description"   
 [4] "source"              "likes_count"         "shares_count"       
 [7] "plays_count"         "comments_count"      "post_created"       
[10] "username"            "profile_url"         "play_url"           
[13] "audio_id"            "is_visible"          "post_id"            
[16] "region"              "ingested_sound_id"   "combined_tags"      
[19] "audio_transcription" "engagement_score"   

2 Missing Values

Before we dive into analysis, let’s check what’s missing:

Double checking!

Code
# What values are missing in each column
missing_values <- colSums(is.na(tiktok_raw))
missing_values[missing_values > 0]
   post_description            username         profile_url            play_url 
                 10               26369                  69               16490 
           audio_id              region   ingested_sound_id audio_transcription 
                108                1805                  69               90370 
Code
# Calculate percentages
missing_percent <- round(missing_values[missing_values > 0] / nrow(tiktok_raw) * 100, 2)
missing_percent
   post_description            username         profile_url            play_url 
               0.00               10.55                0.03                6.60 
           audio_id              region   ingested_sound_id audio_transcription 
               0.04                0.72                0.03               36.17 
Code
# Create data frame for visualization - PERCENTAGE
missing_df <- data.frame(
  column = names(missing_values[missing_values > 0]),
  percent = missing_percent
)

# Sort by percentage (descending)
missing_df <- missing_df[order(-missing_df$percent), ]

# Visualize PERCENTAGES using DisplayR
ggplot(missing_df, aes(x = reorder(column, -percent), y = percent)) +
  geom_bar(stat = "identity", fill = "#1e88e5") +
  DisplayR::dr_theme_capture() +
  labs(title = "Percentage of Missing Values by Column",
       subtitle = "Higher percentages indicate more missing data",
       x = NULL, 
       y = "Percent Missing (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
# Create data frame for visualization - COUNTS
missing_data <- data.frame(
  column = names(missing_values[missing_values > 0]),
  count = missing_values[missing_values > 0]
)

# Sort by count
missing_data <- missing_data[order(-missing_data$count), ]

# Visualize COUNTS using DisplayR
ggplot(missing_data, aes(x = reorder(column, -count), y = count)) +
  geom_bar(stat = "identity", fill = "#1e88e5") +
  DisplayR::dr_theme_capture() +
  labs(title = "Missing Values by Column (Counts)",
       subtitle = "Raw counts of missing values",
       x = NULL, 
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
# Note about audio_transcription missing values
cat("Number of missing audio_transcriptions:", missing_values["audio_transcription"], 
    "out of", nrow(tiktok_raw), "videos (", 
    missing_percent["audio_transcription"], "%)\n")
Number of missing audio_transcriptions: 90370 out of 249838 videos ( 36.17 %)
Code
# What values are missing in each column
missing_values <- colSums(is.na(tiktok_raw))
missing_values[missing_values > 0]
   post_description            username         profile_url            play_url 
                 10               26369                  69               16490 
           audio_id              region   ingested_sound_id audio_transcription 
                108                1805                  69               90370 
Code
# Calculate percentages
missing_percent <- round(missing_values[missing_values > 0] / nrow(tiktok_raw) * 100, 2)
missing_percent
   post_description            username         profile_url            play_url 
               0.00               10.55                0.03                6.60 
           audio_id              region   ingested_sound_id audio_transcription 
               0.04                0.72                0.03               36.17 
Code
# Create data frame for visualization - PERCENTAGE
missing_df <- data.frame(
  column = names(missing_values[missing_values > 0]),
  percent = missing_percent
)

# Sort by percentage
missing_df <- missing_df[order(-missing_df$percent), ]

# Visualize PERCENTAGES using DisplayR
ggplot(missing_df, aes(x = reorder(column, -percent), y = percent)) +
  geom_bar(stat = "identity", fill = "#1e88e5") +
  DisplayR::dr_theme_capture() +
  labs(title = paste0("Missing Values by Column (N = ", nrow(tiktok_raw), ")"),
       x = NULL, 
       y = "Percent Missing (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
cat("Number of missing audio_transcriptions:", missing_values["audio_transcription"], 
    "out of", nrow(tiktok_raw), "videos (", 
    missing_percent["audio_transcription"], "%)\n")
Number of missing audio_transcriptions: 90370 out of 249838 videos ( 36.17 %)

3 Data Availability for Text Content

We’re now going to check what text content is available for further analysis

Code
# Count descriptions and transcriptions
desc_count <- sum(!is.na(tiktok_raw$post_description))
trans_count <- sum(!is.na(tiktok_raw$audio_transcription))

cat("Post descriptions available:", desc_count, "of", nrow(tiktok_raw), 
    "(", round(desc_count/nrow(tiktok_raw)*100, 2), "%)\n")
Post descriptions available: 249828 of 249838 ( 100 %)
Code
cat("Audio transcriptions available:", trans_count, "of", nrow(tiktok_raw), 
    "(", round(trans_count/nrow(tiktok_raw)*100, 2), "%)\n")
Audio transcriptions available: 159468 of 249838 ( 63.83 %)
Code
# Check URLs for manual verification
url_count <- sum(!is.na(tiktok_raw$post_url))
cat("Post URLs available:", url_count, "of", nrow(tiktok_raw), 
    "(", round(url_count/nrow(tiktok_raw)*100, 2), "%)")
Post URLs available: 249838 of 249838 ( 100 %)

Let’s compare videos with and without audio transcriptions to see if there are any patterns affecting it:

Code
audio_metrics <- tiktok_raw %>% 
  mutate(has_transcription = !is.na(audio_transcription)) %>%
  group_by(has_transcription) %>% 
  summarise(
    count = n(),
    percent = n() / nrow(tiktok_raw) * 100,
    avg_likes = mean(likes_count, na.rm = TRUE),
    avg_comments = mean(comments_count, na.rm = TRUE),
    avg_shares = mean(shares_count, na.rm = TRUE),
    avg_duration = ifelse("video_duration_sec" %in% names(tiktok_raw), 
                          mean(video_duration_sec, na.rm = TRUE), NA),
    has_description = sum(!is.na(post_description)) / n() * 100
  ) %>% 
  mutate(across(where(is.numeric), ~round(., 2)))

knitr::kable(audio_metrics, 
             caption = "Comparison of Videos With and Without Audio Transcriptions")
Comparison of Videos With and Without Audio Transcriptions
has_transcription count percent avg_likes avg_comments avg_shares avg_duration has_description
FALSE 90370 36.17 24046.45 181.63 930.98 NA 99.99
TRUE 159468 63.83 24016.07 163.14 964.73 NA 100.00

4 Compare Text Content Lengths

Let’s compare the lengths of different text fields:

Code
# Calculate text lengths
tiktok_raw <- tiktok_raw %>%
  mutate(len_desc = str_length(post_description),
         len_tran = str_length(audio_transcription))

# Summarise the lengths
text_length_summary <- tiktok_raw %>%
  summarise(
    desc_min = min(len_desc, na.rm = TRUE),
    desc_median = median(len_desc, na.rm = TRUE),
    desc_mean = round(mean(len_desc, na.rm = TRUE), 2),
    desc_max = max(len_desc, na.rm = TRUE),
    tran_min = min(len_tran, na.rm = TRUE),
    tran_median = median(len_tran, na.rm = TRUE),
    tran_mean = round(mean(len_tran, na.rm = TRUE), 2),
    tran_max = max(len_tran, na.rm = TRUE)
  )


# Visualise it
tiktok_raw %>%
  pivot_longer(cols = c(len_desc, len_tran), 
               names_to = "text_type", 
               values_to = "length") %>%
  filter(!is.na(length)) %>%
  ggplot(aes(x = length, fill = text_type)) +
  geom_histogram(position = "dodge", bins = 30, alpha = 0.7) +
  scale_fill_manual(values = c("skyblue", "lightgreen"),
                   labels = c("Description", "Transcription")) +
  DisplayR::dr_theme_capture() +
  labs(title = "Distribution of Text Lengths",
       x = "Character Count",
       y = "Count",
       fill = "Text Type")

Text lengths do not tend to exceed more than 5000 characters and most texts tend to peak at well below 1000 characters.

5 Comparing Text Content Differences

Let’s prepare the text data for analysis:

Code
# Making text data
all_text <- tiktok_raw %>%
  select(post_description, audio_transcription) %>%
  pivot_longer(everything(), names_to = "name", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(
    value = tolower(value),
    value = tm::removeWords(value, tm::stopwords(kind = "smart"))
  ) %>%
  LimpiaR::limpiar_spaces(value)

# Check counts
text_counts <- table(all_text$name)
text_counts

audio_transcription    post_description 
             159468              249828 

This shows we have:

  • 159,468 audio transcriptions (about 64% of videos)

  • 249,828 post descriptions (almost all videos)

6 Comparing Language Patterns

Let’s explore language differences between descriptions and transcriptions:

Code
# Making text data
all_text <- tiktok_raw %>%
  select(post_description, audio_transcription) %>%
  pivot_longer(everything(), names_to = "name", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(
    value = tolower(value),
    value = tm::removeWords(value, tm::stopwords(kind = "smart"))
  ) %>%
  LimpiaR::limpiar_spaces(value)

# Check counts
text_counts <- table(all_text$name)
text_counts

audio_transcription    post_description 
             159468              249828 
Code
# Calculate WLOs to see words 
wlo_results <- ParseR::calculate_wlos(
  df = all_text,
  topic_var = name,      
  text_var = value,   
  top_n = 30
)

wlo_results
$viz


$view
# A tibble: 60 × 4
   name                word                 n log_odds_weighted
   <chr>               <chr>            <int>             <dbl>
 1 post_description    makeup          133435              50.1
 2 post_description    fyp             110090              82.1
 3 post_description    skincare        104971              56.7
 4 post_description    beauty           70795              49.1
 5 post_description    viral            48328              49.0
 6 post_description    grwm             45236              52.2
 7 post_description    skincareroutine  40898              49.7
 8 audio_transcription skin             38957              74.8
 9 post_description    foryou           36933              47.2
10 audio_transcription love             32868             109. 
# ℹ 50 more rows

Okay, so looking at the WLOs between audio_transcription vs post_descriptions, it appears:

  • the audio words appears more conversational indicating natural spoken language

  • More emotive language is used like ‘ooh’ and ‘love’ indicating emotions are demonstrated

  • Post descriptions are very hashtag heavy to target specific algorithms

  • Post descriptions is good for Topic Modelling as it offers more broad descriptions on the content

7 Bigram Comparison

Let’s compare common phrases between descriptions and transcriptions:

Code
extract_bigrams <- function(data, text_col, name_col) {
  bigrams <- data %>%
    unnest_tokens(bigram, {{text_col}}, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !is.na(word1),
           !is.na(word2)) %>%
    count({{name_col}}, word1, word2, sort = TRUE) %>%
    dplyr::rename(ngram_freq = n)
  
  return(bigrams)
}

audio_bigrams <- all_text %>% 
  filter(name == "audio_transcription") %>%
  extract_bigrams(value, name)

desc_bigrams <- all_text %>% 
  filter(name == "post_description") %>%
  extract_bigrams(value, name)

# Top bigram views
audio_top <- audio_bigrams %>% 
  head(20) %>%
  mutate(ngram = paste(word1, word2))

desc_top <- desc_bigrams %>% 
  head(20) %>%
  mutate(ngram = paste(word1, word2))

# Making the bigram counts object
bigram_counts <- list(
  list(name = "audio_transcription", view = audio_top),
  list(name = "post_description", view = desc_top)
)


post_desc_bigrams <- bigram_counts[[2]]$view
audio_trans_bigrams <- bigram_counts[[1]]$view

# For post descriptions
ggplot(post_desc_bigrams, aes(x = reorder(paste(word1, word2), ngram_freq), y = ngram_freq)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top Bigrams in Post Descriptions",
       x = NULL,
       y = "Frequency")

Code
# For audio transcriptions
ggplot(audio_trans_bigrams, aes(x = reorder(paste(word1, word2), ngram_freq), y = ngram_freq)) +
  geom_bar(stat = "identity", fill = "lightgreen") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top Bigrams in Audio Transcriptions",
       x = NULL,
       y = "Frequency")

There’s more hashtags in post descriptions than conversational language in audio transscriptions as we would expect

A lot of duplicated audio words which might be lyrics than actual conversation

Code
language_diff <- tibble(
  Characteristic = c("Main Purpose", "Common Words", "Emotional Content", 
                     "Marketing Style", "Hashtag Use", "How They Talk"),
  
  `Post Description` = c(
    "Getting more views and followers- hitting the algorithm", 
    "Lots of beauty terms, brand names, and product types",
    "Not very emotional, mostly just describing products",
    "Promotional language to get people to engage or buy",
    "Lots of hashtags (like #fyp, #skincare) to get discovered",
    "Direct statements without much conversation"
  ),
  
  `Audio Transcription` = c(
    "Showing how to use products and connecting with viewers", 
    "Everyday words, reactions, and explaining steps",
    "Much more emotional words like 'love' and 'amazing'",
    "Less obvious selling, more personal experiences",
    "Almost never mentions hashtags out loud",
    "Talks like they're having a conversation with questions and explanations"
  )
)

# Display as table
kable(language_diff, caption = "Language Pattern Comparison")
Language Pattern Comparison
Characteristic Post Description Audio Transcription
Main Purpose Getting more views and followers- hitting the algorithm Showing how to use products and connecting with viewers
Common Words Lots of beauty terms, brand names, and product types Everyday words, reactions, and explaining steps
Emotional Content Not very emotional, mostly just describing products Much more emotional words like ‘love’ and ‘amazing’
Marketing Style Promotional language to get people to engage or buy Less obvious selling, more personal experiences
Hashtag Use Lots of hashtags (like #fyp, #skincare) to get discovered Almost never mentions hashtags out loud
How They Talk Direct statements without much conversation Talks like they’re having a conversation with questions and explanations

8 Sampling for Manual Verification

Code
set.seed(123)
sample_size <- 50


sampled_videos <- tiktok_raw %>%
  filter(!is.na(post_description) & !is.na(audio_transcription)) %>%
  sample_n(sample_size) %>%
  select(post_url, post_description, audio_transcription, 
         likes_count, comments_count, engagement_score)


DT::datatable(sampled_videos, 
              options = list(pageLength = 5, scrollX = TRUE),
              caption = paste("Sample of", sample_size, "TikTok videos with both description and transcription"))

We then manually labelled the dataset and import it

Code
tiktok_manual_verify <- read.csv('tiktok_manual_verify.csv')


tiktok_manual_verify$X <- NULL

colnames(tiktok_manual_verify)
[1] "post_url"                                       
[2] "post_description"                               
[3] "audio_transcription"                            
[4] "likes_count"                                    
[5] "comments_count"                                 
[6] "engagement_score"                               
[7] "Is.post.description.accurately.representative." 
[8] "Is.audio.description.accurately.representative."
[9] "Speech.or.Lyrics"                               
Code
tiktok_manual_verify <- tiktok_manual_verify %>%
  rename(
    post_desc_accurate = `Is.post.description.accurately.representative.`,
    audio_desc_accurate = `Is.audio.description.accurately.representative.`,
    speech_lyric_classification = `Speech.or.Lyrics`
  )

verification_summary <- tiktok_manual_verify %>%
  summarise(
    total_videos = n(),
    post_desc_accurate_count = sum(post_desc_accurate == "Yes", na.rm = TRUE),
    post_desc_accurate_pct = mean(post_desc_accurate == "Yes", na.rm = TRUE) * 100,
    audio_desc_accurate_count = sum(audio_desc_accurate == "Yes", na.rm = TRUE),
    audio_desc_accurate_pct = mean(audio_desc_accurate == "Yes", na.rm = TRUE) * 100
  )

cat("Total videos analysed:", verification_summary$total_videos)
Total videos analysed: 50
Code
cat("Post description relevant: ", verification_summary$post_desc_accurate_pct, "%")
Post description relevant:  88 %
Code
cat("Audio transcription relevant: ", verification_summary$audio_desc_accurate_pct, "%")
Audio transcription relevant:  52 %

87% accuracy from post descriptions vs 51% from audio transcriptions shows a significant gap between what insights can be extracted from descriptive and spoken content

Code
verification_data <- data.frame(
  Content_Type = c("Post Description", "Audio Transcription"),   
  Accurate = c(87, 51),                                          
  Inaccurate = c(13, 49)                                         
)

verification_long <- verification_data %>% 
  tidyr::pivot_longer(cols = c(Accurate, Inaccurate),
                      names_to = "Accuracy",
                      values_to = "Percentage")


ggplot(verification_long, aes(x = Content_Type, y = Percentage, fill = Accuracy)) +
  geom_bar(stat = "identity", position = "stack") +
  geom_text(aes(label = sprintf("%.1f%%", Percentage)),
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("Accurate" = "#4CAF50", "Inaccurate" = "#F44336")) +
  labs(title = "Content Accuracy in TikTok Videos",
       subtitle = "Based on manual verification of 50 videos",
       x = NULL, y = "Percentage") +
  theme_minimal()

Videos with accurate post descriptions also tend to have higher engagement scores

9 Digging deeper into why Audio Transcriptions might be missing

Code
#Flag for audio transcription

tiktok_further_analysis <- tiktok_raw %>% 
  mutate(
    has_transcription = !is.na(audio_transcription),
    has_audio_id = !is.na(audio_id) & audio_id != "",
    has_sound_id = !is.na(ingested_sound_id) & ingested_sound_id != ""
  )

audio_indicators <- tiktok_further_analysis %>% 
  group_by(has_transcription) %>% 
  summarise(
    total_videos = n(),
    videos_with_audio_id = sum(has_audio_id),
    audio_id_percent = round(mean(has_audio_id) * 100, 2),
    videos_with_sound_id = sum(has_sound_id),
    sound_id_percent = round(mean(has_sound_id) * 100, 2)
  )

kable(audio_indicators, caption = "Audio Indicators")
Audio Indicators
has_transcription total_videos videos_with_audio_id audio_id_percent videos_with_sound_id sound_id_percent
FALSE 90370 90327 99.95 90359 99.99
TRUE 159468 159403 99.96 159410 99.96

Looking at the table above, almost all videos have audio however only 64% of videos contain speech that can be transcribed

We can manually look at a subset of Tiktok videos to confirm the absence of transcriptions contains a genuine lack of speech to transcribe.

10 Audio Content Classification

Based on our findings that virtually all Tiktok videos have audio but only 64% have transcriptions, we should try to understand the nature of the audio content.

We’ll try out different approaches but we’ve decided to try making a classification function and see how far that can take us in determining between lyrics vs speech (as is a much simpler and cost effective approach)

Code
library(reticulate)
library(purrr)
#py_install("tiktoken")
library(tokenizers)
#py_require(c("transformers"))
#transformers <- import("transformers")


#tiktoken_available <- test_tiktoken()

#tokenizer <- transformers$AutoTokenizer$from_pretrained("bert-base-uncased")
#tokenizer <- transformers$AutoTokenizer$from_pretrained("gpt2")  
Code
#text <- "Hello, how are you doing today?"
#tokens <- tokenizer$tokenize(text)
#print(tokens)
Code
#Testing out a function

classify_lyrics_speech <- function(audio_text) {
 #Gonna handle missing sentences first and just return No Audio
 if(is.na(audio_text) || audio_text == "") {
   return("No_Audio")
 }
 text_lower <- tolower(audio_text)
 
 #word counts
 words <- strsplit(text_lower, "\\s+")[[1]]
 word_count <- length(words)
 unique_words <- length(unique(words))
 repetition_ratio <- ifelse(word_count > 0, unique_words / word_count, 1)
 
 #do repetition checks for easy lyrics classification
 if(length(unique(words)) == 1 && word_count > 3) {
   return("Lyrics")
 }
 lyrics_patterns <- c(
   "(yeah yeah|na na|oh oh|la la|baby baby|love love)",
   "(whoa|ohh|ahh|mmm|uh huh|hey hey|woo)",
   "(tonight|forever|never|always.*love|heart.*soul)",
   "(live on the edge|destination|spotlight|good vibes|feel the beat)",
   "\\b(\\w+)\\s+\\1\\b"
 )
 
 #Should define speech patterns- CAN BUILD MORE HERE
 speech_patterns <- c(
   "(hi guys|hey everyone|how to|tutorial|today|gonna show|let me show)",
   "(step|first|next|then|so|basically|actually)",
   "(guys|everyone|you all|what do you think|comment|subscribe|follow)",
   "\\?(.*what|why|how|when|where)",
   "(this product|i tried|review|recommend|skincare|routine|makeup)",
   "(store|shopping|buy|purchase|charlotte tilbury|sephora)",
   "(amazing|obsessed|love this|so good|really like|honestly)"
 )
 
 lyrics_score <- sum(sapply(lyrics_patterns, function(p) length(gregexpr(p, text_lower)[[1]]) - 1))
 speech_score <- sum(sapply(speech_patterns, function(p) length(gregexpr(p, text_lower)[[1]]) - 1))
 
 # Categorising STARTS HERE
 if(word_count > 60 && repetition_ratio < 0.3) {
   return("Lyrics") 
 }
 
 if(lyrics_score > speech_score && lyrics_score > 0) {
   return("Lyrics")
 } else if(speech_score > 0) {
   return("Speech") 
 }
 
 if(word_count > 30 && grepl("(product|store|makeup|skincare|buy|shopping)", text_lower)) {
   return("Speech")
 }
 
 if(word_count > 40) {
   return("Lyrics")  
 } else if(word_count > 5) {
   return("Speech") 
 } else {
   return("Unclear")
 }
}
Code
#We should have 3 speech, 3 lyrics and 1 no audio
test_examples <- c(
  "Hey guys, today I'm gonna show you my skincare routine",  
  "Yeah yeah baby love me tonight forever yeah yeah",        
  "This serum is amazing I love how it makes my skin glow",  
  "Na na na oh oh whoa baby baby love love",                 
  "",                                                        
  "What do you think about this product?",                   
  "Tonight we dance forever love yeah tonight dance"        
)

for(i in seq_along(test_examples)) {
  result <- classify_lyrics_speech(test_examples[i])
  cat("Text:", str_trunc(test_examples[i], 50))
  cat("Classification:", result)
}
Text: Hey guys, today I'm gonna show you my skincare ...Classification: SpeechText: Yeah yeah baby love me tonight forever yeah yeahClassification: LyricsText: This serum is amazing I love how it makes my sk...Classification: SpeechText: Na na na oh oh whoa baby baby love loveClassification: LyricsText: Classification: No_AudioText: What do you think about this product?Classification: SpeechText: Tonight we dance forever love yeah tonight danceClassification: Lyrics

The function actually performed alright on the test examples, now lets test it out on the manual verification set and see what we get.

Code
tiktok_manual_verify$funclassification <- sapply(tiktok_manual_verify$audio_transcription, classify_lyrics_speech)

#table(tiktok_manual_verify$funclassification)

#table(tiktok_manual_verify$funclassification, tiktok_manual_verify$speech_lyric_classification)

head(tiktok_manual_verify[c("funclassification", "speech_lyric_classification")], 50)
   funclassification speech_lyric_classification
1             Speech                      Speech
2             Lyrics                      Lyrics
3            Unclear                     Unclear
4            Unclear                     Unclear
5             Lyrics                      Lyrics
6             Speech                      Speech
7             Speech                      Speech
8             Speech                      Speech
9            Unclear                      Speech
10            Speech                      Speech
11            Speech                      Speech
12            Speech                      Speech
13            Lyrics                      Speech
14            Speech                      Lyrics
15            Speech                      Lyrics
16            Lyrics                      Lyrics
17            Speech                      Speech
18            Speech                      Lyrics
19            Speech                      Speech
20            Speech                      Speech
21            Speech                      Lyrics
22            Speech                      Speech
23            Speech                      Speech
24           Unclear                     Unclear
25            Speech                      Speech
26            Speech                     Unclear
27            Speech                      Speech
28            Speech                      Speech
29            Speech                      Speech
30            Speech                      Lyrics
31           Unclear                     Unclear
32            Speech                     Unclear
33           Unclear                     Unclear
34            Speech                      Speech
35            Lyrics                      Lyrics
36            Speech                      Speech
37            Speech                      Lyrics
38            Speech                      Speech
39            Speech                      Speech
40            Speech                      Speech
41            Speech                      Lyrics
42            Speech                      Speech
43            Speech                      Speech
44            Speech                      Speech
45            Speech                      Speech
46            Speech                      Lyrics
47            Lyrics                      Lyrics
48            Lyrics                      Lyrics
49            Speech                      Speech
50            Speech                      Speech

So on the face of it the function seemed to be performing better than expected on classifying on the set of 50 tiktok verified set so looking at the confusion matrix to really get an overall view:

Code
library(caret)
library(ggplot2)
library(dplyr)

confusion_matrix <- table(Predicted = tiktok_manual_verify$funclassification, 
                         Actual = tiktok_manual_verify$speech_lyric_classification)

overall_accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
cat("Overall Accuracy:", round(overall_accuracy * 100, 2), "%")
Overall Accuracy: 76 %
Code
confusionMatrix(as.factor(tiktok_manual_verify$funclassification),
                as.factor(tiktok_manual_verify$speech_lyric_classification))
Confusion Matrix and Statistics

          Reference
Prediction Lyrics Speech Unclear
   Lyrics       6      1       0
   Speech       8     27       2
   Unclear      0      1       5

Overall Statistics
                                          
               Accuracy : 0.76            
                 95% CI : (0.6183, 0.8694)
    No Information Rate : 0.58            
    P-Value [Acc > NIR] : 0.006224        
                                          
                  Kappa : 0.5338          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: Lyrics Class: Speech Class: Unclear
Sensitivity                 0.4286        0.9310         0.7143
Specificity                 0.9722        0.5238         0.9767
Pos Pred Value              0.8571        0.7297         0.8333
Neg Pred Value              0.8140        0.8462         0.9545
Prevalence                  0.2800        0.5800         0.1400
Detection Rate              0.1200        0.5400         0.1000
Detection Prevalence        0.1400        0.7400         0.1200
Balanced Accuracy           0.7004        0.7274         0.8455

Looking at the confusion matrix: 6 out of 7 lyrics predicted, 27 out of 37 actual speech predicted and 5 out of 6 unclear predicted

Looks like the function is biased towards speech as it under predicts lyrics leading to 76% overall accuracy

11 Saving the transcriptions classified as speech into csv to test out Topic Modelling

Code
speech_data_manual <- tiktok_manual_verify %>%
  filter(speech_lyric_classification == "Speech" & !is.na(audio_transcription)) %>%
  mutate(
    video_id = row_number(),
    clean_transcription = str_trim(audio_transcription),
    transcription_length = str_length(clean_transcription)
  ) %>%
  select(
    video_id,
    audio_transcription,
    clean_transcription,
    transcription_length,
    speech_lyric_classification,
    likes_count,
    comments_count,
    engagement_score,
    post_description,
    post_desc_accurate,
    audio_desc_accurate
  ) %>%
  filter(transcription_length >= 15)

#nrow(speech_data_manual)

#write.csv(speech_data_manual, "tiktok_manual_transcritions", row.names = FALSE)

We can topic modelled using Bertopic in Google colab to determine how many topics come out of the 28 determined speech transcriptions. The results are:

Topic Info: Topic Count Name
0 -1 2 -1_hair_beard_or_the
1 0 7 0_the_in_that_there
2 1 6 1_the_it_and_is
3 2 5 2_is_one_they_tilbury
4 3 5 3_to_going_this_you
5 4 3 4_to_it_you_off

Clearly we need to add more stopwords to stop these common phrases to seep into the topics. So lets discuss next steps

12 Next steps

We can further build on the lyrics classification function and further optimise the accuracy and precision of the function

We can use Openai and test its performance on a much larger set of data i.e. 100-150

We will also need to further label this larger dataset to more accurately determine the function and Openai performance