# =========================================
# Netflix Dataset Full Analysis (Single Script)
# =========================================
# Install packages (run once if needed)
# install.packages(c("tidyverse","lubridate"))
# Attach the tidyverse (dplyr, tidyr, ggplot2, stringr, ...) used for all
# wrangling and plotting below.
library(tidyverse)
# lubridate supplies mdy() and year(). tidyverse 2.0.0 already attaches it as a
# core package; the explicit call keeps the script working with older
# tidyverse releases that do not.
library(lubridate)
# ------------------------------
# Load Dataset (file upload)
# ------------------------------
# Prompt for the CSV interactively and read it into `df`.
# Blank cells are read as NA rather than "" so that the missing-value handling
# further down (e.g. drop_na() on country / listed_in) actually treats them as
# missing; by default read.csv() keeps blank character fields as empty strings.
df <- read.csv(
  file.choose(),
  stringsAsFactors = FALSE,
  na.strings = c("", "NA")
)
# ------------------------------
# Data Cleaning
# ------------------------------
# Parse date_added (month-day-year order) into dates, then derive the calendar
# year each title was added. Values that fail to parse become NA.
df <- df %>%
  mutate(
    date_added = mdy(date_added),
    year_added = year(date_added)
  )
# ------------------------------
# 1. Bar Chart: Content Type
# ------------------------------
# One bar per value of `type`; geom_bar() tallies the rows for each value.
df %>%
  ggplot(mapping = aes(x = type)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Content Type Distribution", x = "Type", y = "Count")
# ------------------------------
# 2. Time-Series: Content Added Per Year
# ------------------------------
# Number of titles added per calendar year, drawn as a line with point markers.
# Rows with an NA year_added (date_added missing or unparseable) are dropped
# before counting; otherwise count() produces an NA group that ggplot2 discards
# with a "Removed 1 row containing missing values" warning for each layer.
# geom_line() connects observations in x order, so no explicit sort is needed.
df %>%
  filter(!is.na(year_added)) %>%
  count(year_added) %>%
  ggplot(aes(x = year_added, y = n)) +
  geom_line(color = "darkgreen") +
  geom_point() +
  labs(
    title = "Content Added Per Year",
    x = "Year", y = "Number of Titles"
  )
# ------------------------------
# 3. Heatmap: Genre vs Country
# ------------------------------
# Expand the comma-separated country and listed_in (genre) columns so each row
# holds exactly one country/genre pair.
# - Blank or whitespace-only cells are converted to NA first, so drop_na()
#   really removes titles with no country or genre.
# - The separator tolerates any amount of whitespace after the comma, and empty
#   tokens (e.g. from a leading or trailing comma) are discarded afterwards.
df2 <- df %>%
  mutate(across(c(country, listed_in), ~ na_if(str_trim(.x), ""))) %>%
  drop_na(country, listed_in) %>%
  separate_rows(country, sep = ",\\s*") %>%
  separate_rows(listed_in, sep = ",\\s*") %>%
  mutate(across(c(country, listed_in), str_trim)) %>%
  filter(country != "", listed_in != "")

# Number of titles for every country/genre combination.
heatmap_data <- df2 %>%
  count(country, listed_in)

# Select top 10 countries and genres by total title count.
# with_ties = FALSE caps each list at exactly 10 entries even when totals tie.
top_countries <- heatmap_data %>%
  count(country, wt = n, name = "total") %>%
  slice_max(total, n = 10, with_ties = FALSE)

top_genres <- heatmap_data %>%
  count(listed_in, wt = n, name = "total") %>%
  slice_max(total, n = 10, with_ties = FALSE)

# Keep only the cells where both the country and the genre are in the top 10.
heatmap_filtered <- heatmap_data %>%
  filter(
    country %in% top_countries$country,
    listed_in %in% top_genres$listed_in
  )

# Tile colour encodes the title count; x labels are rotated for legibility.
ggplot(heatmap_filtered, aes(x = listed_in, y = country, fill = n)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(
    title = "Genre vs Country Heatmap",
    x = "Genre", y = "Country"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ------------------------------
# 4. Ratings Distribution (Bar Chart)
# ------------------------------
# One bar per rating value; x labels are rotated 45 degrees so they do not
# overlap.
df %>%
  ggplot(mapping = aes(x = rating)) +
  geom_bar(fill = "purple") +
  labs(title = "Ratings Distribution", x = "Rating", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ------------------------------
# (Optional) Pie Chart for Ratings
# ------------------------------
# Tally titles per rating, then draw a single stacked bar and wrap it around
# the y axis in polar coordinates to form a pie chart.
rating_counts <- count(df, rating)

ggplot(data = rating_counts, mapping = aes(x = "", y = n, fill = rating)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Ratings Distribution (Pie Chart)")