In the following code chunk, import your data.
#### Use read_csv() or another function
#### Make sure your data is converted into a tibble.
#### For demonstration purposes, this example uses the title.*.tsv and name.basics.tsv files read below.
# Load the per-title ratings table. The literal token \N is read as NA, and
# quote = "" turns off quote handling so quote characters stay in the data.
df_ratings <- read_tsv(
  file = "title.ratings.tsv",
  na = "\\N",
  quote = ""
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## tconst = col_character(),
## averageRating = col_double(),
## numVotes = col_double()
## )
# Load the title metadata table (\N is read as NA; quote handling is disabled).
# endYear is declared numeric explicitly: when left to type guessing it was
# inferred as logical, so every real end year (e.g. 1947) failed to parse and
# was replaced with NA. All other columns are still guessed.
df_basics <- read_tsv(
  "title.basics.tsv",
  na = "\\N",
  quote = "",
  col_types = cols(endYear = col_double())
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## tconst = col_character(),
## titleType = col_character(),
## primaryTitle = col_character(),
## originalTitle = col_character(),
## isAdult = col_double(),
## startYear = col_double(),
## endYear = col_logical(),
## runtimeMinutes = col_double(),
## genres = col_character()
## )
## Warning: 75427 parsing failures.
## row col expected actual file
## 34983 endYear 1/0/T/F/TRUE/FALSE 1947 'title.basics.tsv'
## 35184 endYear 1/0/T/F/TRUE/FALSE 1945 'title.basics.tsv'
## 37613 endYear 1/0/T/F/TRUE/FALSE 1955 'title.basics.tsv'
## 38447 endYear 1/0/T/F/TRUE/FALSE 1949 'title.basics.tsv'
## 38448 endYear 1/0/T/F/TRUE/FALSE 1949 'title.basics.tsv'
## ..... ....... .................. ...... ..................
## See problems(...) for more details.
# Attach title metadata to each rating row. The key is spelled out so the join
# cannot silently pick up additional shared column names.
df_ratings <- df_ratings %>%
  left_join(df_basics, by = "tconst")
## Joining, by = "tconst"
# Keep only people whose profession list mentions acting, and only the
# columns needed to look up a performer's name and birth year.
df_actors <- read_tsv(file = "name.basics.tsv", na = "\\N", quote = "") %>%
  filter(str_detect(string = primaryProfession, pattern = "actor|actress")) %>%
  select(nconst, primaryName, birthYear)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## nconst = col_character(),
## primaryName = col_character(),
## birthYear = col_double(),
## deathYear = col_double(),
## primaryProfession = col_character(),
## knownForTitles = col_character()
## )
# For every title keep the acting credit with the lowest `ordering` value
# (ties, if any, are all kept).
df_principals <- read_tsv("title.principals.tsv", na = "\\N", quote = "") %>%
  filter(str_detect(category, "actor|actress")) %>%
  select(tconst, ordering, nconst, category) %>%
  group_by(tconst) %>%
  filter(ordering == min(ordering)) %>%
  # Drop the per-title grouping once the filter is done so it does not leak
  # into later joins and verbs.
  ungroup()
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## tconst = col_character(),
## ordering = col_double(),
## nconst = col_character(),
## category = col_character(),
## job = col_character(),
## characters = col_character()
## )
# Add the performer's name and birth year to each credit; the join key is
# stated explicitly rather than inferred from shared column names.
df_principals <- df_principals %>%
  left_join(df_actors, by = "nconst")
## Joining, by = "nconst"
# Attach the selected acting credit to each rated title; the join key is
# stated explicitly rather than inferred from shared column names.
df_ratings <- df_ratings %>%
  left_join(df_principals, by = "tconst")
## Joining, by = "tconst"
# Restrict to movies that have a known birth year for the joined performer
# and at least 10 votes, then derive age_lead = startYear - birthYear.
df_ratings_movies <- df_ratings %>%
  filter(titleType == "movie") %>%
  filter(!is.na(birthYear)) %>%
  filter(numVotes >= 10) %>%
  mutate(age_lead = startYear - birthYear)
# Quartiles (25% / 50% / 75%) of age_lead for each startYear.
# TRUE is spelled out because T is an ordinary, reassignable variable.
# With a single grouping column the default already returns an ungrouped
# result; .groups = "drop" states that explicitly.
df_actor_ages <- df_ratings_movies %>%
  group_by(startYear) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` ungrouping output (override with `.groups` argument)
# Quartiles (25% / 50% / 75%) of age_lead for each startYear and category.
# TRUE is spelled out because T is an ordinary, reassignable variable.
# .groups = "drop_last" keeps the existing behaviour (result stays grouped by
# startYear) while making that choice explicit.
df_actor_ages_lead <- df_ratings_movies %>%
  group_by(startYear, category) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE),
    .groups = "drop_last"
  )
## `summarise()` regrouping output by 'startYear' (override with `.groups` argument)
# Number each person's rows in startYear order: nth_lead is 1 for the row with
# the earliest startYear for that nconst, 2 for the next, and so on.
# The rows are sorted globally first and grouped afterwards; the result is
# still grouped by nconst.
df_ratings_movies_nth <- df_ratings_movies %>%
  arrange(startYear) %>%
  group_by(nconst) %>%
  mutate(nth_lead = row_number())
## Warning: Removed 22 rows containing non-finite values (stat_bin2d).
## Warning: Removed 22 rows containing non-finite values (stat_smooth).