Thanks to Deepa Sharma for the analysis questions:
1- What is the highest-rated TV show of each year?
2- What is the highest rated TV show from each category in the data set?
For the first question, I will have to use tidyr to separate the “year” column, as it includes both the start year and the end year of each show, separated by a dash. I will assume the question asks for the highest-rated TV show per release (start) year. For the second question, I will assume that “category” refers to the genre column, which will also require separation, as each cell of that column contains several genres that the show might fall into.
# Fetch the dataset from GitHub. Despite its name, `my_git_url` ends up holding
# the downloaded file contents, which are then parsed as CSV text.
my_git_url <- getURL(
  "https://raw.githubusercontent.com/aelsaeyed/Data607/main/Project2/Heros/Dataset_Superhero-TV-Shows.csv"
)
superhero_tv_shows_raw <- read.csv(text = my_git_url)

# Preview the first ten rows of the raw data.
head(superhero_tv_shows_raw, n = 10)
# Split `release_year` on the dash into `start_year` / `end_year`, drop shows
# whose start year is the placeholder "TBA", and coerce the rating, start year
# and runtime columns to numeric.
shows_year_separated <- superhero_tv_shows_raw %>%
  separate(
    release_year,
    into = c("start_year", "end_year"),
    sep = "-",
    fill = "right",
    convert = TRUE
  ) %>%
  filter(start_year != "TBA") %>%
  # Values that cannot be parsed as numbers become NA, which is what triggers
  # the coercion warning recorded below.
  mutate(
    imdb_rating = as.numeric(imdb_rating),
    start_year = as.numeric(start_year),
    runtime = as.numeric(runtime)
  )
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
head(shows_year_separated, n = 10)
# Overview table: every distinct (start year, title, rating) combination among
# shows that have a numeric rating, newest year first and, within a year,
# highest rating first.
ratings_by_year <- shows_year_separated %>%
  filter(!is.na(imdb_rating)) %>%
  group_by(start_year, show_title, imdb_rating) %>%
  # group_keys() returns one row per unique combination of the grouping columns.
  group_keys() %>%
  arrange(desc(start_year), desc(imdb_rating))

head(ratings_by_year, n = 10)
# Question 1: the highest-rated show(s) for each start year.
# Shows without a numeric rating are excluded, and every show tied for a
# year's top rating is kept.
highest_rating_by_year <- shows_year_separated %>%
  filter(!is.na(imdb_rating)) %>%
  group_by(start_year) %>%
  filter(imdb_rating == max(imdb_rating)) %>%
  # Drop the grouping so later operations on the result are not silently
  # performed per year.
  ungroup() %>%
  # Select by name rather than by position, so the result does not depend on
  # the column order of the input data.
  select(start_year, show_title, imdb_rating) %>%
  arrange(desc(start_year))

head(highest_rating_by_year, n = 20)
# Keep only the columns needed for the genre question. They are selected by
# name rather than by position so the subset survives changes in the column
# order of the raw file.
shows_subset <- superhero_tv_shows_raw %>%
  select(show_title, imdb_rating, genre)

head(shows_subset, n = 10)
# Build a wide table with one row per (show_title, imdb_rating) and one 0/1
# indicator column per genre.
#
# The comma-separated `genre` cell is first split into long form (one row per
# show/genre pair) so that EVERY genre a show lists is counted, not only the
# first one. The cast then names its value column and aggregate explicitly
# instead of relying on dcast's guesses.
separated_genres <- shows_subset %>%
  cSplit("genre", sep = ",", direction = "long") %>%
  dcast(
    show_title + imdb_rating ~ genre,
    value.var = "genre",
    # 1 if the show lists the genre at least once, otherwise 0 (duplicate rows
    # in the source data must not inflate the flag beyond 1).
    fun.aggregate = function(x) as.integer(length(x) > 0L),
    fill = 0L
  )
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
## Using genre_3 as value column: use value.var to override.
## Aggregation function missing: defaulting to length
# Question 2 (Action genre): rated shows flagged as Action.
action_genre <- separated_genres %>%
  # Select by name; relying on positions assumes "Action" is always the first
  # genre column produced by the cast.
  select(show_title, imdb_rating, Action) %>%
  # The genre columns are produced by an aggregate over rows, so test for
  # "at least one" rather than "exactly one".
  filter(Action > 0) %>%
  # Remove the textual placeholder first so the numeric conversion below does
  # not have to coerce it.
  filter(imdb_rating != "Not-Rated") %>%
  mutate(imdb_rating = as.numeric(imdb_rating)) %>%
  filter(!is.na(imdb_rating))

# Highest-rated Action show(s). Ties for the top rating are all kept, and an
# empty input simply yields zero rows.
highest_ranked <- action_genre %>%
  slice_max(imdb_rating, n = 1, with_ties = TRUE)
highest_ranked