library("dslabs")
library(ggplot2)
library(tidyr)
library(dplyr)
library(highcharter)
library(RColorBrewer)
#data(package="dslabs")
#list.files(system.file("script", package = "dslabs"))
DS Labs HW Assignment
DS Labs Datasets
Load libraries and use the package DSLabs (Data Science Labs)
# Load the `movielens` dataset into the workspace.
data("movielens")

# Preview the first rows. The printed column header that had been fused onto
# this call is kept on the next line as a comment so the call parses as R.
head(movielens)
#   movieId title year
1 31 Dangerous Minds 1995
2 1029 Dumbo 1941
3 1061 Sleepers 1996
4 1129 Escape from New York 1981
5 1172 Cinema Paradiso (Nuovo cinema Paradiso) 1989
6 1263 Deer Hunter, The 1978
genres userId rating timestamp
1 Drama 1 2.5 1260759144
2 Animation|Children|Drama|Musical 1 3.0 1260759179
3 Thriller 1 3.0 1260759182
4 Action|Adventure|Sci-Fi|Thriller 1 2.0 1260759185
5 Drama 1 4.0 1260759205
6 Drama|War 1 2.0 1260759151
Separate the genres so that a movie with multiple genres gets one row per genre
# Expand the pipe-delimited `genres` column so that every genre of a title
# gets its own row; the remaining columns are repeated on each new row.
movielens_cleaned <- separate_rows(movielens, genres, sep = "\\|")

# Peek at the first rows of the long-format result.
head(movielens_cleaned)
# A tibble: 6 × 7
movieId title year genres userId rating timestamp
<int> <chr> <int> <chr> <int> <dbl> <int>
1 31 Dangerous Minds 1995 Drama 1 2.5 1260759144
2 1029 Dumbo 1941 Animation 1 3 1260759179
3 1029 Dumbo 1941 Children 1 3 1260759179
4 1029 Dumbo 1941 Drama 1 3 1260759179
5 1029 Dumbo 1941 Musical 1 3 1260759179
6 1061 Sleepers 1996 Thriller 1 3 1260759182
Explore data and groups
# Count highly rated entries per release year and genre.
# Every row of `movielens_cleaned` is one user rating of one title under one
# genre, so `n()` counts ratings of 4 or higher rather than distinct titles.
# NOTE(review): use n_distinct(movieId) instead if distinct titles are wanted.
good_movies <- movielens_cleaned |>
  filter(rating >= 4) |> # keep ratings of 4 and above
  group_by(year, genres) |> # one group per year-genre pair
  # `.groups = "drop"` returns an ungrouped tibble and silences the regrouping
  # message; without it the result would stay grouped by `year`.
  summarize(count_movies = n(), .groups = "drop")

# Data Visualization ----
The visualization shows highly rated movies over time, broken down by genre. It lets you see what audiences like and how each genre's market share grows among movies rated at least 4 out of 5. A movie with multiple genres is counted once in each of its genres.
# Colour palette: one colour per genre.
# Highcharts starts over from the first colour once the supplied vector is
# exhausted, so a fixed 7-colour palette makes several genres share a colour
# when there are more than 7 series. Keep the original Set1 palette when it is
# large enough; otherwise interpolate Set1 to the number of genres.
n_genres <- length(unique(good_movies$genres))
cols <- if (n_genres <= 7) {
  brewer.pal(7, "Set1")
} else {
  grDevices::colorRampPalette(brewer.pal(9, "Set1"))(n_genres)
}

# Stacked area chart of `count_movies` by `year`, one series per genre.
highchart() |>
  # Data series: `group = genres` splits the data into one area per genre.
  hc_add_series(
    data = good_movies,
    type = "area",
    hcaes(
      x = year,
      y = count_movies,
      group = genres
    )
  ) |>
  hc_colors(cols) |>
  hc_chart(
    style = list(
      fontFamily = "Georgia",
      fontWeight = "bold"
    )
  ) |>
  # Stack the series on top of each other, hide point markers (also on
  # hover), and outline each area with a thin white line.
  hc_plotOptions(
    series = list(
      stacking = "normal",
      marker = list(
        enabled = FALSE,
        states = list(hover = list(enabled = FALSE))
      ),
      lineWidth = 0.5,
      lineColor = "white"
    )
  ) |>
  # Labeling
  hc_title(
    text = "Stacked Area Chart of 'Good' Movies by Genre and Year",
    align = "left"
  ) |>
  hc_xAxis(title = list(text = "Year")) |>
  hc_yAxis(
    title = list(text = "Genres associated with 'good' movies [Rated 4-5]")
  ) |>
  hc_legend(
    align = "right",
    verticalAlign = "top",
    layout = "vertical"
  )