DS Labs HW Assignment

Author

Daniel B

DS Labs Datasets

Load the required libraries, including the dslabs (Data Science Labs) package

library("dslabs")
library(ggplot2)
library(tidyr)
library(dplyr)
library(highcharter)
library(RColorBrewer)
#data(package="dslabs")
#list.files(system.file("script", package = "dslabs"))

# Load the movielens ratings dataset from dslabs and preview its first rows.
data(movielens, package = "dslabs")
head(movielens, n = 6L)

  movieId                                   title year
1      31                         Dangerous Minds 1995
2    1029                                   Dumbo 1941
3    1061                                Sleepers 1996
4    1129                    Escape from New York 1981
5    1172 Cinema Paradiso (Nuovo cinema Paradiso) 1989
6    1263                        Deer Hunter, The 1978
                            genres userId rating  timestamp
1                            Drama      1    2.5 1260759144
2 Animation|Children|Drama|Musical      1    3.0 1260759179
3                         Thriller      1    3.0 1260759182
4 Action|Adventure|Sci-Fi|Thriller      1    2.0 1260759185
5                            Drama      1    4.0 1260759205
6                        Drama|War      1    2.0 1260759151

Separate the genres, adding one row per genre for movies with multiple genres

# Split the "|"-delimited genres field so that every rating row is repeated
# once per genre it lists; all other columns are carried along unchanged.
movielens_cleaned <- separate_rows(movielens, genres, sep = "\\|")

# Preview the reshaped data.
head(movielens_cleaned)

# A tibble: 6 × 7
  movieId title            year genres    userId rating  timestamp
    <int> <chr>           <int> <chr>      <int>  <dbl>      <int>
1      31 Dangerous Minds  1995 Drama          1    2.5 1260759144
2    1029 Dumbo            1941 Animation      1    3   1260759179
3    1029 Dumbo            1941 Children       1    3   1260759179
4    1029 Dumbo            1941 Drama          1    3   1260759179
5    1029 Dumbo            1941 Musical        1    3   1260759179
6    1061 Sleepers         1996 Thriller       1    3   1260759182

Explore data and groups

# Count the ratings of 4 or higher within each (year, genre) pair.
# Each row of movielens_cleaned is one user's rating of one movie under one
# genre, so n() tallies rating rows rather than distinct movie titles.
# NOTE(review): if distinct titles are what is wanted, n_distinct(movieId)
# would be the tally to use -- confirm the intended definition of "good".
good_movies <- movielens_cleaned |>
  filter(rating >= 4) |>
  group_by(year, genres) |>
  # .groups = "drop" returns an ungrouped result; without it the output stays
  # grouped by year and summarize() prints a "grouped output" message.
  summarize(count_movies = n(), .groups = "drop")

Data Visualization

The visualization displays good movies over time, broken down by genre. It shows what the audience likes and how the market share of each genre grows or shrinks among movies with a rating of at least 4 out of 5. If a movie had multiple genres, it was included in each of those genre groups.

# Colour palette: one distinct colour per genre.
# "Set1" only provides up to 9 colours and the data holds more genres than
# that, so a fixed-size palette makes Highcharts recycle colours and several
# genres end up sharing one. Interpolating the 9 base colours yields exactly
# as many colours as there are genre series.
genre_count <- length(unique(good_movies$genres))
cols <- grDevices::colorRampPalette(brewer.pal(9, "Set1"))(genre_count)

# Stacked area chart: x = release year, y = count of ratings >= 4,
# one stacked series per genre.
highchart() |>

  # Data series
  hc_add_series(
    data = good_movies,
    type = "area",
    mapping = hcaes(
      x = year,
      y = count_movies,
      group = genres
    )
  ) |>
  hc_colors(cols) |>
  hc_chart(style = list(
    fontFamily = "Georgia",
    fontWeight = "bold"
  )) |>

  # Stack the series and hide point markers (including on hover); thin white
  # outlines separate neighbouring areas.
  hc_plotOptions(series = list(
    stacking = "normal",
    marker = list(
      enabled = FALSE,
      states = list(hover = list(enabled = FALSE))
    ),
    lineWidth = 0.5,
    lineColor = "white"
  )) |>

  # Labeling
  hc_title(
    text = "Stacked Area Chart of 'Good' Movies by Genre and Year",
    align = "left"
  ) |>
  hc_xAxis(title = list(text = "Year")) |>
  hc_yAxis(
    title = list(text = "Genres associated with 'good' movies [Rated 4-5]")
  ) |>
  hc_legend(
    align = "right",
    verticalAlign = "top",
    layout = "vertical"
  )