# Load the built-in `cars` dataset into the workspace.
data("cars")

Q1

# Median of the first column of `cars`.
median(cars[[1]])

## [1] 15

Q2

library(jsonlite)

# Download daily BTC/USD history as JSON and report the highest closing price.
url <- "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=99"
raw_data <- fromJSON(url)

# The price rows are looked up at raw_data$Data$Data first, then at
# raw_data$Data. `[[` is used instead of `$` so that a partially matching
# element name is never picked up by accident.
payload <- raw_data[["Data"]]
if (is.null(payload)) {
  stop("Could not find the price data in the expected structure.")
}
nested_payload <- if (is.list(payload)) payload[["Data"]] else NULL
btc_data <- if (!is.null(nested_payload)) nested_payload else payload

# Fail loudly when there is no usable `close` column; otherwise max() would
# return -Inf with a warning and the message below would print "$-Inf".
close_prices <- if (is.list(btc_data)) btc_data[["close"]] else NULL
if (is.null(close_prices) || all(is.na(close_prices))) {
  stop("Price data does not contain any usable `close` values.")
}

max_daily_close_price <- max(close_prices, na.rm = TRUE)

print(paste0(
  "The maximum daily close price for BTC in the data is: $",
  format(max_daily_close_price, big.mark = ",", trim = TRUE, scientific = FALSE)
))

## [1] "The maximum daily close price for BTC in the data is: $124,723"

# Show the raw numeric value of the maximum close.
print(max_daily_close_price)

## [1] 124723

Q3

Topic - Netflix

library(readr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.1.0
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the Netflix titles CSV from the working directory.
netflix_data <- read_csv(file = "netflix_titles.csv")

## Rows: 8807 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): show_id, type, title, director, cast, country, date_added, rating,...
## dbl  (1): release_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Number of rows (titles) and columns (variables)

# Report row and column counts on one line.
data_dims <- dim(netflix_data)
cat("Dataset Dimensions:", data_dims[1], "rows,", data_dims[2], "columns\n")

## Dataset Dimensions: 8807 rows, 12 columns

Display the structure of the data frame (data types)

# Print the compact structure of netflix_data (one line per column with its
# type and leading values).
str(netflix_data)

## spc_tbl_ [8,807 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ show_id     : chr [1:8807] "s1" "s2" "s3" "s4" ...
##  $ type        : chr [1:8807] "Movie" "TV Show" "TV Show" "TV Show" ...
##  $ title       : chr [1:8807] "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
##  $ director    : chr [1:8807] "Kirsten Johnson" NA "Julien Leclercq" NA ...
##  $ cast        : chr [1:8807] NA "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ NA ...
##  $ country     : chr [1:8807] "United States" "South Africa" NA NA ...
##  $ date_added  : chr [1:8807] "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
##  $ release_year: num [1:8807] 2020 2021 2021 2021 2021 ...
##  $ rating      : chr [1:8807] "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
##  $ duration    : chr [1:8807] "90 min" "2 Seasons" "1 Season" "1 Season" ...
##  $ listed_in   : chr [1:8807] "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
##  $ description : chr [1:8807] "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   show_id = col_character(),
##   ..   type = col_character(),
##   ..   title = col_character(),
##   ..   director = col_character(),
##   ..   cast = col_character(),
##   ..   country = col_character(),
##   ..   date_added = col_character(),
##   ..   release_year = col_double(),
##   ..   rating = col_character(),
##   ..   duration = col_character(),
##   ..   listed_in = col_character(),
##   ..   description = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

View the first few rows

# Preview the first six rows.
head(netflix_data, n = 6)

## # A tibble: 6 × 12
##   show_id type    title    director cast  country date_added release_year rating
##   <chr>   <chr>   <chr>    <chr>    <chr> <chr>   <chr>             <dbl> <chr> 
## 1 s1      Movie   Dick Jo… Kirsten… <NA>  United… September…         2020 PG-13 
## 2 s2      TV Show Blood &… <NA>     Ama … South … September…         2021 TV-MA 
## 3 s3      TV Show Ganglan… Julien … Sami… <NA>    September…         2021 TV-MA 
## 4 s4      TV Show Jailbir… <NA>     <NA>  <NA>    September…         2021 TV-MA 
## 5 s5      TV Show Kota Fa… <NA>     Mayu… India   September…         2021 TV-MA 
## 6 s6      TV Show Midnigh… Mike Fl… Kate… <NA>    September…         2021 TV-MA 
## # ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>

Summarize missing values per column

# Per-column NA counts, and the share of all rows they represent (2 decimals).
missing_summary <- colSums(is.na(netflix_data))
missing_percent <- round(missing_summary / nrow(netflix_data) * 100, 2)

# Assemble the full table first, then keep only columns that actually have
# missing values, ordered from the largest gap to the smallest.
missing_df <- data.frame(
  Column = names(missing_summary),
  Missing_Count = missing_summary,
  Missing_Percent = missing_percent
)
missing_df <- filter(missing_df, Missing_Count > 0)
missing_df <- arrange(missing_df, desc(Missing_Count))

# Banner line printed ahead of the missing-value table.
cat("--- Summary of Missing Data ---\n")

## --- Summary of Missing Data ---

# Print the missing-value table.
print(missing_df)

##                Column Missing_Count Missing_Percent
## director     director          2634           29.91
## country       country           831            9.44
## cast             cast           825            9.37
## date_added date_added            10            0.11
## rating         rating             4            0.05
## duration     duration             3            0.03

Summarize key numerical variables

# Min, quartiles, median, mean and max of the release year.
summary(netflix_data$release_year)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1925    2013    2017    2014    2019    2021

# Replace missing director and country values with the placeholder "Unknown".
netflix_clean <- netflix_data %>%
  mutate(
    across(
      c(director, country),
      function(col) replace_na(col, "Unknown")
    )
  )

Verify imputation

# Confirm that no NAs remain in the two imputed columns.
remaining_director_na <- sum(is.na(netflix_clean$director))
cat("Missing director count after imputation:", remaining_director_na, "\n")

## Missing director count after imputation: 0

remaining_country_na <- sum(is.na(netflix_clean$country))
cat("Missing country count after imputation:", remaining_country_na, "\n")

## Missing country count after imputation: 0

# Expand the comma-delimited genre list so each title/genre pair gets a row.
genre_rows <- separate_rows(netflix_clean, listed_in, sep = ", ")

# Strip any whitespace left at the ends of the individual genre labels.
netflix_genres_unnested <- mutate(genre_rows, listed_in = trimws(listed_in))

# Row counts before and after the genre expansion.
rows_before <- nrow(netflix_clean)
cat("Original rows:", rows_before, "\n")

## Original rows: 8807

rows_after <- nrow(netflix_genres_unnested)
cat("Un-nested rows (Genre per row):", rows_after, "\n")

## Un-nested rows (Genre per row): 19323

# Age of each title in years, measured from the fixed reference year 2021.
reference_year <- 2021
netflix_final <- mutate(
  netflix_genres_unnested,
  content_age_years = reference_year - release_year
)

Review the new variable

# Min, quartiles, median, mean and max of the derived age column. The rows are
# title/genre pairs, so multi-genre titles contribute more than once.
summary(netflix_final$content_age_years)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   4.000   6.949   8.000  96.000

# Research Question 1: What are the Top 10 most frequent genres on Netflix?

# Frequency of every genre label, most common first; blank labels are dropped.
genre_counts <- netflix_final %>%
  filter(listed_in != "") %>%
  count(listed_in, sort = TRUE)

# Keep the ten most frequent genres.
top_10_genres <- head(genre_counts, 10)

print(top_10_genres)

## # A tibble: 10 × 2
##    listed_in                    n
##    <chr>                    <int>
##  1 International Movies      2752
##  2 Dramas                    2427
##  3 Comedies                  1674
##  4 International TV Shows    1351
##  5 Documentaries              869
##  6 Action & Adventure         859
##  7 TV Dramas                  763
##  8 Independent Movies         756
##  9 Children & Family Movies   641
## 10 Romantic Movies            616

Visualize Top 10 Genres

# Horizontal bar chart of the ten most frequent genres, ordered by count.
genre_mapping <- aes(x = reorder(listed_in, n), y = n)
genre_labels <- labs(
  title = "Top 10 Most Frequent Genres on Netflix",
  x = "Genre",
  y = "Count of Titles"
)

ggplot(top_10_genres, genre_mapping) +
  geom_col(fill = "#E50914") + # Netflix red color
  coord_flip() +
  genre_labels +
  theme_minimal()

# It is clear that International Movies and International TV Shows rank among the most frequent entries, which exposes an imperfection in the data: these are not genres but catch-all groups of otherwise uncategorized titles.

# Research Question 2: Which Directors have the highest number of titles?

# Five directors with the most titles in the catalogue.
#
# netflix_final holds one row per (title, genre) pair, so counting its rows
# directly would credit a director once per genre of each title. Collapsing to
# one row per show_id first makes `n` the number of titles.
top_5_directors <- netflix_final %>%
  distinct(show_id, .keep_all = TRUE) %>%
  # Skip titles whose director was missing and filled with the placeholder
  filter(director != "Unknown") %>%
  # count(sort = TRUE) already returns one row per director, largest n first
  count(director, sort = TRUE) %>%
  head(5)

print(top_5_directors)

## # A tibble: 5 × 2
##   director                n
##   <chr>               <int>
## 1 Cathy Garcia-Molina    37
## 2 Youssef Chahine        33
## 3 Suhas Kadav            29
## 4 David Dhawan           27
## 5 Martin Scorsese        25

We can see that Cathy Garcia-Molina is the leader in content released.

# Research Question 3: How does the distribution of content type (Movie vs. TV Show) change over time?

library(dplyr)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Collapse the genre expansion back to one row per title, then count titles
# for every release year / content type combination.
yearly_releases <- netflix_final %>%
  distinct(show_id, .keep_all = TRUE) %>%
  count(release_year, type, name = "Count")

# Restrict the series to release years 2000 through 2021 (inclusive).
yearly_releases_filtered <- yearly_releases %>%
  filter(between(release_year, 2000, 2021))

# Line chart of titles per release year, one line per content type.
ggplot(yearly_releases_filtered, aes(x = release_year, y = Count, color = type)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(linewidth = 1.2) +
  labs(
    title = "Annual Content Release Trend (2000 - 2021)",
    x = "Release Year",
    y = "Count of Titles Released",
    color = "Type"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Movies were typically the most common type of content released on Netflix, although since around 2018 the gap has been shrinking.

Research Question 4: Is there a correlation between content genre and its rating (maturity level)?

# Names of the ten most frequent genres.
top_genres_list <- pull(top_10_genres, listed_in)

# Genre/rating pairs for those genres only, dropping rows whose rating is
# missing, empty, or the literal "Not Given".
netflix_ratings_by_genre <- netflix_final %>%
  filter(
    listed_in %in% top_genres_list,
    !is.na(rating),
    rating != "",
    rating != "Not Given"
  ) %>%
  select(listed_in, rating)

# 100% stacked bars: share of each maturity rating within each top-10 genre.
#
# The Brewer "Set3" palette provides at most 12 colours, which is fewer than
# the number of distinct ratings here, so some ratings would be left without a
# fill colour. Build a qualitative palette with exactly one colour per rating.
rating_colours <- hcl.colors(
  length(unique(netflix_ratings_by_genre$rating)),
  palette = "Set 3"
)

ggplot(netflix_ratings_by_genre, aes(x = listed_in, fill = rating)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = rating_colours) + # one distinct colour per rating
  coord_flip() +
  labs(
    title = "Rating Distribution Across Top 10 Netflix Genres",
    subtitle = "Analysis of Content Maturity Level (Rating) by Genre",
    x = "Genre",
    y = "Proportion of Titles (100% Stacked)",
    fill = "Maturity Rating"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.y = element_text(size = 9)
  )

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set3 is 12
## Returning the palette you asked for with that many colors

Research Question 5: How does the distribution of content releases by country change over time?

# Titles per release year and country, on one row per title (the genre
# expansion is undone first). Only 2000-2021 is kept and the "Unknown"
# placeholder country is excluded.
# NOTE(review): `country` is grouped as a raw string; if it can hold several
# comma-separated countries for co-productions, each combination is counted as
# its own "country" here - confirm against the data.
country_yearly_releases <- netflix_final %>%
  distinct(show_id, .keep_all = TRUE) %>%
  count(release_year, country, name = "Count") %>%
  filter(between(release_year, 2000, 2021), country != "Unknown")

# The five countries with the largest total over that period.
country_totals <- country_yearly_releases %>%
  group_by(country) %>%
  summarise(Total_Releases = sum(Count)) %>%
  arrange(desc(Total_Releases))
top_5_countries <- pull(head(country_totals, 5), country)

# Yearly series restricted to those five countries.
country_yearly_releases_filtered <- country_yearly_releases %>%
  filter(country %in% top_5_countries)

# Line chart of titles per release year, one line per top-5 country.
ggplot(country_yearly_releases_filtered, aes(x = release_year, y = Count, color = country)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(linewidth = 1.2) +
  labs(
    title = "Annual Content Releases by Top 5 Contributing Countries (2000 - 2021)",
    subtitle = "Tracking content production volume over time",
    x = "Release Year",
    y = "Count of Titles Released",
    color = "Country"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Surprisingly, American releases did not have a large lead over other countries until around 2015.

Mini Project 1

Andrew Miller

2025-10-16

Q1

Q2

Q3