data(cars)
median(cars[, 1 ])
## [1] 15
library(jsonlite)
url <- "https://min-api.cryptocompare.com/data/histoday?fsym=BTC&tsym=USD&limit=99"
raw_data <- fromJSON(url)
if (!is.null(raw_data$Data) && !is.null(raw_data$Data$Data)) {
btc_data <- raw_data$Data$Data
} else if (!is.null(raw_data$Data)) {
# Fallback for alternative structure
btc_data <- raw_data$Data
} else {
stop("Could not find the price data in the expected structure.")
}
max_daily_close_price <- max(btc_data$close, na.rm = TRUE)
print(paste0("The maximum daily close price for BTC in the data is: $",
format(max_daily_close_price, big.mark = ",", trim = TRUE, scientific = FALSE)))
## [1] "The maximum daily close price for BTC in the data is: $124,723"
max_daily_close_price
## [1] 124723
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.1.0
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
netflix_data <- read_csv("netflix_titles.csv")
## Rows: 8807 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): show_id, type, title, director, cast, country, date_added, rating,...
## dbl (1): release_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cat("Dataset Dimensions:", nrow(netflix_data), "rows,", ncol(netflix_data), "columns\n")
## Dataset Dimensions: 8807 rows, 12 columns
str(netflix_data)
## spc_tbl_ [8,807 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ show_id : chr [1:8807] "s1" "s2" "s3" "s4" ...
## $ type : chr [1:8807] "Movie" "TV Show" "TV Show" "TV Show" ...
## $ title : chr [1:8807] "Dick Johnson Is Dead" "Blood & Water" "Ganglands" "Jailbirds New Orleans" ...
## $ director : chr [1:8807] "Kirsten Johnson" NA "Julien Leclercq" NA ...
## $ cast : chr [1:8807] NA "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile "| __truncated__ "Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, G"| __truncated__ NA ...
## $ country : chr [1:8807] "United States" "South Africa" NA NA ...
## $ date_added : chr [1:8807] "September 25, 2021" "September 24, 2021" "September 24, 2021" "September 24, 2021" ...
## $ release_year: num [1:8807] 2020 2021 2021 2021 2021 ...
## $ rating : chr [1:8807] "PG-13" "TV-MA" "TV-MA" "TV-MA" ...
## $ duration : chr [1:8807] "90 min" "2 Seasons" "1 Season" "1 Season" ...
## $ listed_in : chr [1:8807] "Documentaries" "International TV Shows, TV Dramas, TV Mysteries" "Crime TV Shows, International TV Shows, TV Action & Adventure" "Docuseries, Reality TV" ...
## $ description : chr [1:8807] "As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical wa"| __truncated__ "After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is h"| __truncated__ "To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled "| __truncated__ "Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Or"| __truncated__ ...
## - attr(*, "spec")=
## .. cols(
## .. show_id = col_character(),
## .. type = col_character(),
## .. title = col_character(),
## .. director = col_character(),
## .. cast = col_character(),
## .. country = col_character(),
## .. date_added = col_character(),
## .. release_year = col_double(),
## .. rating = col_character(),
## .. duration = col_character(),
## .. listed_in = col_character(),
## .. description = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
head(netflix_data)
## # A tibble: 6 × 12
## show_id type title director cast country date_added release_year rating
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 s1 Movie Dick Jo… Kirsten… <NA> United… September… 2020 PG-13
## 2 s2 TV Show Blood &… <NA> Ama … South … September… 2021 TV-MA
## 3 s3 TV Show Ganglan… Julien … Sami… <NA> September… 2021 TV-MA
## 4 s4 TV Show Jailbir… <NA> <NA> <NA> September… 2021 TV-MA
## 5 s5 TV Show Kota Fa… <NA> Mayu… India September… 2021 TV-MA
## 6 s6 TV Show Midnigh… Mike Fl… Kate… <NA> September… 2021 TV-MA
## # ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>
missing_summary <- colSums(is.na(netflix_data))
missing_percent <- round(missing_summary / nrow(netflix_data) * 100, 2)
missing_df <- data.frame(
Column = names(missing_summary),
Missing_Count = missing_summary,
Missing_Percent = missing_percent
) %>%
filter(Missing_Count > 0) %>%
arrange(desc(Missing_Count))
cat("--- Summary of Missing Data ---\n")
## --- Summary of Missing Data ---
print(missing_df)
## Column Missing_Count Missing_Percent
## director director 2634 29.91
## country country 831 9.44
## cast cast 825 9.37
## date_added date_added 10 0.11
## rating rating 4 0.05
## duration duration 3 0.03
summary(netflix_data$release_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1925 2013 2017 2014 2019 2021
netflix_clean <- netflix_data %>%
mutate(
director = replace_na(director, "Unknown"),
country = replace_na(country, "Unknown")
)
cat("Missing director count after imputation:", sum(is.na(netflix_clean$director)), "\n")
## Missing director count after imputation: 0
cat("Missing country count after imputation:", sum(is.na(netflix_clean$country)), "\n")
## Missing country count after imputation: 0
netflix_genres_unnested <- netflix_clean %>%
# Separate the comma-delimited strings into multiple rows
separate_rows(listed_in, sep = ", ") %>%
# Remove any leading/trailing whitespace that might result from splitting
mutate(listed_in = trimws(listed_in))
cat("Original rows:", nrow(netflix_clean), "\n")
## Original rows: 8807
cat("Un-nested rows (Genre per row):", nrow(netflix_genres_unnested), "\n")
## Un-nested rows (Genre per row): 19323
netflix_final <- netflix_genres_unnested %>%
mutate(
content_age_years = 2021 - release_year
)
summary(netflix_final$content_age_years)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 4.000 6.949 8.000 96.000
#Research Question 1: What are the Top 10 most frequent genres on Netflix?
top_10_genres <- netflix_final %>%
# Filter out potential empty or unwanted entries
filter(listed_in != "") %>%
count(listed_in, sort = TRUE) %>%
head(10)
print(top_10_genres)
## # A tibble: 10 × 2
## listed_in n
## <chr> <int>
## 1 International Movies 2752
## 2 Dramas 2427
## 3 Comedies 1674
## 4 International TV Shows 1351
## 5 Documentaries 869
## 6 Action & Adventure 859
## 7 TV Dramas 763
## 8 Independent Movies 756
## 9 Children & Family Movies 641
## 10 Romantic Movies 616
ggplot(top_10_genres, aes(x = reorder(listed_in, n), y = n)) +
geom_col(fill = "#E50914") + # Netflix red color
coord_flip() +
labs(
title = "Top 10 Most Frequent Genres on Netflix",
x = "Genre",
y = "Count of Titles"
) +
theme_minimal()
# It is clear that international Movies and TV Shows are the most
common, which shows data imperfection as these are not genres, but
instead are groups of uncategorized movies.
#Research Question 2: Which Directors have the highest number of titles?
top_5_directors <- netflix_final %>%
filter(director != "Unknown") %>%
count(director, sort = TRUE) %>%
# The director column is still duplicated due to the genre un-nesting, so we group again
distinct(director, .keep_all = TRUE) %>%
arrange(desc(n)) %>%
head(5)
print(top_5_directors)
## # A tibble: 5 × 2
## director n
## <chr> <int>
## 1 Cathy Garcia-Molina 37
## 2 Youssef Chahine 33
## 3 Suhas Kadav 29
## 4 David Dhawan 27
## 5 Martin Scorsese 25
#Research Question 3: How does the distribution of content type (Movie vs. TV Show) change over time?
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
yearly_releases <- netflix_final %>%
# Remove duplicates introduced by genre un-nesting (one row per title is needed)
distinct(show_id, .keep_all = TRUE) %>%
group_by(release_year, type) %>%
count(name = "Count") %>%
ungroup()
yearly_releases_filtered <- yearly_releases %>%
filter(release_year >= 2000 & release_year <= 2021)
ggplot(yearly_releases_filtered, aes(x = release_year, y = Count, color = type)) +
geom_line(size = 1.2) +
labs(
title = "Annual Content Release Trend (2000 - 2021)",
x = "Release Year",
y = "Count of Titles Released",
color = "Type"
) +
theme_minimal() +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Movies were typically the most common form of media released on
Netflix although since around 2018 the difference is shrinking
top_genres_list <- top_10_genres$listed_in
netflix_ratings_by_genre <- netflix_final %>%
filter(listed_in %in% top_genres_list) %>%
filter(!is.na(rating) & rating != "" & rating != "Not Given") %>%
select(listed_in, rating)
ggplot(netflix_ratings_by_genre, aes(x = listed_in, fill = rating)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette = "Set3") + # Use a distinct color palette
coord_flip() +
labs(
title = "Rating Distribution Across Top 10 Netflix Genres",
subtitle = "Analysis of Content Maturity Level (Rating) by Genre",
x = "Genre",
y = "Proportion of Titles (100% Stacked)",
fill = "Maturity Rating"
) +
theme_minimal() +
theme(
legend.position = "bottom",
axis.text.y = element_text(size = 9)
)
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set3 is 12
## Returning the palette you asked for with that many colors
country_yearly_releases <- netflix_final %>%
# Remove duplicates introduced by genre un-nesting (one row per title is needed)
distinct(show_id, .keep_all = TRUE) %>%
# Group by year and country, counting the number of titles
group_by(release_year, country) %>%
count(name = "Count") %>%
ungroup() %>%
# Filter to focus on recent years and remove the "Unknown" category
filter(release_year >= 2000 & release_year <= 2021 & country != "Unknown")
top_5_countries <- country_yearly_releases %>%
group_by(country) %>%
summarise(Total_Releases = sum(Count)) %>%
arrange(desc(Total_Releases)) %>%
head(5) %>%
pull(country)
country_yearly_releases_filtered <- country_yearly_releases %>%
filter(country %in% top_5_countries)
ggplot(country_yearly_releases_filtered, aes(x = release_year, y = Count, color = country)) +
geom_line(size = 1.2) +
labs(
title = "Annual Content Releases by Top 5 Contributing Countries (2000 - 2021)",
subtitle = "Tracking content production volume over time",
x = "Release Year",
y = "Count of Titles Released",
color = "Country"
) +
theme_minimal() +
theme(legend.position = "bottom")