# Project 1

# Author

# Last Benchers

# =========================================
# Netflix Dataset Full Analysis (Single Code)
# =========================================

# Install packages (run once if needed)
# install.packages(c("tidyverse","lubridate"))

library(tidyverse)
# ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
# ✔ dplyr     1.2.1     ✔ readr     2.2.0
# ✔ forcats   1.0.1     ✔ stringr   1.6.0
# ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
# ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
# ✔ purrr     1.2.2
# ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
# ✖ dplyr::filter() masks stats::filter()
# ✖ dplyr::lag()    masks stats::lag()
# ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)

# ------------------------------
# Load Dataset (file upload)
# ------------------------------
# Ask the user to pick the CSV file interactively, then read it into `df`,
# keeping text columns as character vectors rather than factors.
df <- file.choose() %>%
  read.csv(stringsAsFactors = FALSE)

# ------------------------------
# Data Cleaning
# ------------------------------
# Parse `date_added` as month-day-year dates, then store the year of each
# parsed date in a new `year_added` column.
df <- df %>%
  mutate(
    date_added = mdy(date_added),
    year_added = year(date_added)
  )

# ------------------------------
# 1. Bar Chart: Content Type
# ------------------------------
# One bar per distinct value of `type`; bar height is the number of rows
# with that value.
df %>%
  ggplot(mapping = aes(x = type)) +
  geom_bar(fill = "steelblue") +
  ggtitle("Content Type Distribution") +
  xlab("Type") +
  ylab("Count")

# ------------------------------
# 2. Time-Series: Content Added Per Year
# ------------------------------
# Number of titles added in each calendar year, drawn as a line with a
# point marker per year.
#
# Rows whose `date_added` could not be parsed have `year_added` = NA.
# Counting them produces an NA group that ggplot2 cannot place on the
# x axis, which triggers "Removed 1 row containing missing values"
# warnings from geom_line() and geom_point(). Drop those rows before
# counting so the plot is built only from titles with a known year.
#
# count() already returns its rows ordered by `year_added`, so no separate
# arrange() step is needed.
df %>%
  filter(!is.na(year_added)) %>%
  count(year_added) %>%
  ggplot(aes(x = year_added, y = n)) +
  geom_line(color = "darkgreen") +
  geom_point() +
  labs(title = "Content Added Per Year",
       x = "Year", y = "Number of Titles")

# ------------------------------
# 3. Heatmap: Genre vs Country
# ------------------------------
# Build one row per (title, country, genre) combination.
# - `sep` is a regular expression: splitting on a comma followed by optional
#   whitespace also copes with entries such as "A,B" or "A, B,".
# - read.csv() keeps empty text cells as "" (not NA), so drop_na() alone
#   would let a blank country/genre through and rank it among the top 10.
#   Blank and NA values are therefore removed explicitly after splitting.
df2 <- df %>%
  separate_rows(country, sep = ",\\s*") %>%
  separate_rows(listed_in, sep = ",\\s*") %>%
  mutate(
    country = trimws(country),
    listed_in = trimws(listed_in)
  ) %>%
  filter(
    !is.na(country), country != "",
    !is.na(listed_in), listed_in != ""
  )

# Number of titles for every country/genre pair that occurs in the data.
heatmap_data <- df2 %>%
  count(country, listed_in)

# Select top 10 countries and genres by total title count.
# with_ties = FALSE guarantees exactly 10 rows even when totals are tied.
top_countries <- heatmap_data %>%
  count(country, wt = n, name = "total") %>%
  slice_max(total, n = 10, with_ties = FALSE)

top_genres <- heatmap_data %>%
  count(listed_in, wt = n, name = "total") %>%
  slice_max(total, n = 10, with_ties = FALSE)

# Keep only the top countries/genres. complete() then adds the pairs that
# never occur with a count of 0, so they are drawn as white tiles instead of
# leaving holes in the grid.
heatmap_filtered <- heatmap_data %>%
  filter(country %in% top_countries$country,
         listed_in %in% top_genres$listed_in) %>%
  complete(country, listed_in, fill = list(n = 0L))

ggplot(heatmap_filtered, aes(x = listed_in, y = country, fill = n)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Genre vs Country Heatmap",
       x = "Genre", y = "Country") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ------------------------------
# 4. Ratings Distribution (Bar Chart)
# ------------------------------
# One bar per distinct value of `rating`; the x-axis labels are rotated 45
# degrees and right-aligned.
df %>%
  ggplot(mapping = aes(x = rating)) +
  geom_bar(fill = "purple") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Rating", y = "Count", title = "Ratings Distribution")

# ------------------------------
# (Optional) Pie Chart for Ratings
# ------------------------------
# Tabulate the number of rows per rating value.
rating_counts <- count(df, rating)

# A single stacked bar (one segment per rating) wrapped into polar
# coordinates along y, which turns the segments into pie slices.
rating_counts %>%
  ggplot(mapping = aes(x = "", y = n, fill = rating)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  ggtitle("Ratings Distribution (Pie Chart)")