This is an initial inspection of a sample of the Simpsons Dataset from Kaggle, originally made available by Prashant Banerjee and curated for TidyTuesday by Nicolas Foss, Ed.D., MS, with Iowa HHS.

library(tidytuesdayR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Fetch the TidyTuesday release dated 2025-02-04
tuesdata <- tt_load("2025-02-04")
## ---- Compiling #TidyTuesday Information for 2025-02-04 ----
## --- There are 4 files available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 4: "simpsons_characters.csv"
##   2 of 4: "simpsons_episodes.csv"
##   3 of 4: "simpsons_locations.csv"
##   4 of 4: "simpsons_script_lines.csv"
# Pull each table out of the tt_load() result into its own object
simpsons_characters <- tuesdata$simpsons_characters
simpsons_episodes <- tuesdata$simpsons_episodes
simpsons_locations <- tuesdata$simpsons_locations
simpsons_script_lines <- tuesdata$simpsons_script_lines
# Keep only the script lines whose 'location_id' matches an 'id' in
# 'simpsons_locations' (semi_join() filters rows; it adds no columns).
# Observed: reduces 'simpsons_script_lines' from 31793 obs. to 31698
loc_script <- simpsons_script_lines |>
  semi_join(simpsons_locations, by = c("location_id" = "id"))

# Of those, keep only the lines whose 'character_id' matches an 'id' in
# 'simpsons_characters'.
# Observed: reduces 'loc_script' from 31698 obs. to 28184
char_loc_script <- loc_script |>
  semi_join(simpsons_characters, by = c("character_id" = "id"))

# Attach the location columns by matching 'location_id' in
# 'char_loc_script' to 'id' in 'simpsons_locations', then drop the raw
# location text and the location 'name' so only the normalized location
# name is carried over.
# The row count is expected to equal that of 'char_loc_script' (this holds
# as long as each location 'id' occurs once).
# Note: merge() sorts on the key column by default, so the rows are no
# longer in their original script order.
df <- char_loc_script |>
  merge(simpsons_locations, by.x = "location_id", by.y = "id") |>
  select(-raw_location_text, -name)

# Rename the normalized-name column of 'simpsons_characters' by NAME rather
# than overwriting every column name by position, so a change in column
# order cannot silently mislabel the table.
# Assumes the column is called 'normalized_name' in the downloaded data --
# TODO confirm against the TidyTuesday data dictionary.
names(simpsons_characters)[names(simpsons_characters) == "normalized_name"] <-
  "normalized_char_name"
# Fail loudly if the expected column is not there after the rename
stopifnot("normalized_char_name" %in% names(simpsons_characters))

# Attach the character columns by matching 'character_id' in 'df' to 'id'
# in 'simpsons_characters', then drop the raw character text and the
# character 'name'. 'normalized_char_name' and 'gender' are kept.
# The row count is expected to equal that of 'df' (one row per character
# 'id').
df <- df |>
  merge(simpsons_characters, by.x = "character_id", by.y = "id") |>
  select(-raw_character_text, -name)

Now we have the normalized names for locations and characters included in the script dataset.

# Optional export of the merged data frame 'df' to a CSV file.
# NOTE(review): the destination is an absolute, machine-specific path --
# confirm it exists before running this on another computer.
write.csv(
  x = df,
  file = "C:/Users/falin/Desktop/RProjectBeginner_2025/simpsonsScript.csv"
)