library(tidytuesdayR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Download every file published for the 2025-02-04 TidyTuesday release
tuesdata <- tidytuesdayR::tt_load("2025-02-04")
## ---- Compiling #TidyTuesday Information for 2025-02-04 ----
## --- There are 4 files available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 4: "simpsons_characters.csv"
## 2 of 4: "simpsons_episodes.csv"
## 3 of 4: "simpsons_locations.csv"
## 4 of 4: "simpsons_script_lines.csv"
# Pull each downloaded table out of 'tuesdata' into its own variable
simpsons_characters <- tuesdata[["simpsons_characters"]]
simpsons_episodes <- tuesdata[["simpsons_episodes"]]
simpsons_locations <- tuesdata[["simpsons_locations"]]
simpsons_script_lines <- tuesdata[["simpsons_script_lines"]]
# Keep only the script lines whose 'location_id' matches an 'id' in
# 'simpsons_locations'; no columns are added by this filtering join
# (reduced 'simpsons_script_lines' from 31793 obs. to 31698 when run)
loc_script <- semi_join(
  simpsons_script_lines,
  simpsons_locations,
  by = c(location_id = "id")
)
# Of those, keep only the lines whose 'character_id' matches an 'id' in
# 'simpsons_characters'
# (reduced 'loc_script' from 31698 obs. to 28184 when run)
char_loc_script <- semi_join(
  loc_script,
  simpsons_characters,
  by = c(character_id = "id")
)
# Attach the location table to the filtered script lines, matching
# 'location_id' in 'char_loc_script' against 'id' in 'simpsons_locations'.
# The row count is expected to equal that of 'char_loc_script'
# (assumes each location 'id' occurs once -- TODO confirm).
# 'raw_location_text' and 'name' are dropped afterwards, leaving the
# normalized location name as the location column.
df <- char_loc_script |>
  merge(simpsons_locations, by.x = "location_id", by.y = "id") |>
  subset(select = -c(raw_location_text, name))
# Give the normalized-name column of 'simpsons_characters' the distinct
# name 'normalized_char_name'.
# The column is looked up by name instead of overwriting all names by
# position, so a change in column order or count cannot mislabel the other
# columns, and re-running this step is a no-op.
# NOTE(review): the source column is assumed to be called 'normalized_name',
# as in the published data dictionary -- confirm against the loaded data.
names(simpsons_characters)[
  names(simpsons_characters) == "normalized_name"
] <- "normalized_char_name"
# Fail loudly if the expected column was not there to rename
stopifnot("normalized_char_name" %in% names(simpsons_characters))
# Attach the character table to 'df', matching 'character_id' in 'df'
# against 'id' in 'simpsons_characters'.
# The row count is expected to equal that of 'df' before the merge
# (assumes each character 'id' occurs once -- TODO confirm).
# 'raw_character_text' and 'name' are dropped afterwards, leaving
# 'normalized_char_name' as the character name column.
df <- df |>
  merge(simpsons_characters, by.x = "character_id", by.y = "id") |>
  subset(select = -c(raw_character_text, name))