## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
This report provides a comprehensive analysis of the “Top Popular Anime” dataset from Kaggle, containing 28,000+ anime entries (28,825 rows in this snapshot) with rich metadata from MyAnimeList.
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## corrplot 0.95 loaded
## Loading required package: RColorBrewer
## Rows: 28825 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): name, genres, type, status, duration_per_ep, rating, studios, pro...
## dbl (5): id, episodes, score, scored_by, rank
## dttm (2): aired_from, aired_to
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display basic information about the dataset.
# dim() returns c(n_rows, n_cols); interpolating it twice printed both numbers
# on each side of "rows x", so report the two counts separately instead.
cat("Dataset dimensions:", nrow(anime_df), "rows x", ncol(anime_df), "columns\n")
## Dataset dimensions: 28825 rows x 18 columns
## Rows: 28,825
## Columns: 18
## $ id <dbl> 52991, 5114, 9253, 38524, 28977, 60022, 39486, 11061, …
## $ name <chr> "Frieren: Beyond Journey's End", "Fullmetal Alchemist:…
## $ genres <chr> "Adventure, Drama, Fantasy", "Action, Adventure, Drama…
## $ type <chr> "TV", "TV", "TV", "TV", "TV", "TV Special", "Movie", "…
## $ episodes <dbl> 28, 64, 24, 10, 51, 1, 1, 148, 51, 13, 110, 13, 12, 13…
## $ status <chr> "Finished Airing", "Finished Airing", "Finished Airing…
## $ aired_from <dttm> 2023-09-29, 2009-04-05, 2011-04-06, 2019-04-29, 2015-…
## $ aired_to <dttm> 2024-03-22, 2010-07-04, 2011-09-14, 2019-07-01, 2016-…
## $ duration_per_ep <chr> "24 min per ep", "24 min per ep", "24 min per ep", "23…
## $ score <dbl> 9.30, 9.10, 9.07, 9.05, 9.05, 9.04, 9.04, 9.03, 9.02, …
## $ scored_by <dbl> 676737, 2223666, 1467570, 1700946, 264260, 83738, 8078…
## $ rank <dbl> 1, 2, 3, 4, 5, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ rating <chr> "PG-13 - Teens 13 or older", "R - 17+ (violence & prof…
## $ studios <chr> "Madhouse", "Bones", "White Fox", "Wit Studio", "Banda…
## $ producers <chr> "Aniplex, Dentsu, Shogakukan-Shueisha Productions, Nip…
## $ image <chr> "https://cdn.myanimelist.net/images/anime/1015/138006l…
## $ trailer <chr> "https://www.youtube.com/watch?v=ZEkwCGJ3o7M", "https:…
## $ synopsis <chr> "During their decade-long quest to defeat the Demon Ki…
## id name genres type
## Min. : 1 Length:28825 Length:28825 Length:28825
## 1st Qu.:15979 Class :character Class :character Class :character
## Median :38070 Mode :character Mode :character Mode :character
## Mean :34132
## 3rd Qu.:51060
## Max. :61871
##
## episodes status aired_from
## Min. : 1.00 Length:28825 Min. :1917-01-01 00:00:00
## 1st Qu.: 1.00 Class :character 1st Qu.:2004-03-06 00:00:00
## Median : 2.00 Mode :character Median :2014-01-10 00:00:00
## Mean : 13.83 Mean :2009-08-25 11:09:28
## 3rd Qu.: 13.00 3rd Qu.:2020-01-01 00:00:00
## Max. :3057.00 Max. :2027-01-01 00:00:00
## NA's :779 NA's :955
## aired_to duration_per_ep score
## Min. :1962-02-25 00:00:00 Length:28825 Min. :1.890
## 1st Qu.:2006-03-24 00:00:00 Class :character 1st Qu.:5.780
## Median :2014-11-24 12:00:00 Mode :character Median :6.370
## Mean :2011-08-31 20:10:29 Mean :6.398
## 3rd Qu.:2020-03-23 00:00:00 3rd Qu.:7.030
## Max. :2025-12-05 00:00:00 Max. :9.300
## NA's :17895 NA's :10226
## scored_by rank rating studios
## Min. : 101 Min. : 0 Length:28825 Length:28825
## 1st Qu.: 340 1st Qu.: 4989 Class :character Class :character
## Median : 1544 Median :10462 Mode :character Mode :character
## Mean : 29892 Mean :10523
## 3rd Qu.: 10092 3rd Qu.:16023
## Max. :2943048 Max. :21729
## NA's :10226 NA's :6863
## producers image trailer synopsis
## Length:28825 Length:28825 Length:28825 Length:28825
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
# Check for missing values.
# Builds one row per column of `anime_df` with the number of NA entries and
# the share of rows (in percent, 2 decimals) that are missing, ordered from
# the most to the least incomplete column.
missing_values <- anime_df %>%
  # across()/pivot_longer() replace the superseded summarise_all()/gather()
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Column",
    values_to = "Missing_Count"
  ) %>%
  mutate(Missing_Percentage = round((Missing_Count / nrow(anime_df)) * 100, 2)) %>%
  arrange(desc(Missing_Count))

# Interactive table of the result
DT::datatable(missing_values, caption = "Missing Values Analysis")
# Visualize missing values: horizontal bars, one per column that has at least
# one NA, ordered by the number of missing entries.
missing_values %>%
  filter(Missing_Count > 0) %>%
  ggplot(aes(x = reorder(Column, Missing_Count), y = Missing_Count)) +
  geom_col(fill = "coral") +
  coord_flip() +
  ggtitle("Missing Values by Column") +
  xlab("Column") +
  ylab("Number of Missing Values") +
  theme_minimal()
##Clean and preprocess the data
## === DATASET DEBUG INFO ===
## Dimensions: 28825 18
## Column names:
## [1] "id" "name" "genres" "type"
## [5] "episodes" "status" "aired_from" "aired_to"
## [9] "duration_per_ep" "score" "scored_by" "rank"
## [13] "rating" "studios" "producers" "image"
## [17] "trailer" "synopsis"
##
## Score column info:
## Class: numeric
## First 10 values:
## [1] 9.30 9.10 9.07 9.05 9.05 9.04 9.04 9.03 9.02 9.02
## Unique values (first 20):
## [1] 9.30 9.10 9.07 9.05 9.04 9.03 9.02 9.01 8.99 8.98 8.95 8.93 8.91 8.90 8.89
## [16] 8.88 8.87 8.86 8.84 8.83
# Clean and preprocess the data

# Convert a MyAnimeList-style duration string to total minutes.
# Handles "24 min per ep", "2 hr" and combined forms such as "1 hr 30 min";
# the previous logic read only the first number, so "1 hr 30 min" became 60.
# Strings with neither an hour nor a minute component (and NA) yield NA.
parse_duration_minutes <- function(duration) {
  hours <- as.numeric(str_extract(duration, "\\d+(?=\\s*hr)"))
  minutes <- as.numeric(str_extract(duration, "\\d+(?=\\s*min)"))
  total <- coalesce(hours, 0) * 60 + coalesce(minutes, 0)
  if_else(is.na(hours) & is.na(minutes), NA_real_, total)
}

anime_clean <- anime_df %>%
  mutate(
    # The date columns are already parsed as date-times on import, so convert
    # them to Date directly rather than re-parsing them as "ymd" strings.
    aired_from = as_date(aired_from),
    aired_to = as_date(aired_to),
    # Year the title started airing
    year = year(aired_from),
    # Split the comma-separated genre string into a list-column
    genres_list = str_split(genres, ",\\s*"),
    # Numeric episode count
    episodes_clean = as.numeric(episodes),
    # Per-episode duration in minutes
    duration_minutes = parse_duration_minutes(duration_per_ep)
  ) %>%
  # Keep only scored entries whose start year lies in the analysed window
  filter(
    !is.na(score),
    !is.na(year),
    year >= 1960,
    year <= 2024
  )

cat("Cleaned dataset dimensions:", nrow(anime_clean), "rows x", ncol(anime_clean), "columns\n")
## Cleaned dataset dimensions: 18161 rows x 22 columns
# Distribution of anime scores: histogram in 0.2-point bins with a dashed
# vertical line at the mean score (the mean is also shown in the subtitle).
p1 <- anime_clean %>%
  ggplot(aes(x = score)) +
  geom_histogram(binwidth = 0.2, fill = "skyblue", color = "white", alpha = 0.8) +
  # `linewidth` replaces `size` for line geoms (deprecated since ggplot2 3.4.0)
  geom_vline(aes(xintercept = mean(score, na.rm = TRUE)),
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title = "Distribution of Anime Scores",
       subtitle = paste("Mean Score:", round(mean(anime_clean$score, na.rm = TRUE), 2)),
       x = "Score (out of 10)",
       y = "Count") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Genre frequencies: one row per genre with the number of titles carrying it,
# most frequent first; missing and empty genre labels are discarded.
genres_expanded <- anime_clean %>%
  unnest(cols = genres_list) %>%
  mutate(genres_list = str_trim(genres_list)) %>%
  count(genres_list) %>%
  arrange(desc(n)) %>%
  filter(!(is.na(genres_list) | genres_list == ""))

# The 15 most frequent genres
top_genres <- slice_head(genres_expanded, n = 15)

# Horizontal bar chart of the top genres, rendered interactively via plotly
p2 <- ggplot(top_genres, aes(x = reorder(genres_list, n), y = n)) +
  geom_col(fill = "purple", alpha = 0.7) +
  coord_flip() +
  ggtitle("Top 15 Anime Genres") +
  xlab("Genre") +
  ylab("Number of Anime") +
  theme_minimal()
ggplotly(p2)
# Average scores by genre (top 10 genres).
# Each title contributes once to every genre it lists. Genres with fewer than
# 50 titles are dropped so the ranking is not driven by tiny groups.
genre_scores <- anime_clean %>%
  unnest(genres_list) %>%
  mutate(genres_list = str_trim(genres_list)) %>%
  # Drop missing/empty genre labels, matching the genre frequency count, so
  # titles without genre information cannot form a pseudo-genre in the ranking
  filter(!is.na(genres_list), genres_list != "") %>%
  group_by(genres_list) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  filter(count >= 50) %>% # Only genres with at least 50 anime
  arrange(desc(avg_score)) %>%
  head(10)

# Horizontal bar chart, best-scoring genre at the top
genre_scores %>%
  ggplot(aes(x = reorder(genres_list, avg_score), y = avg_score)) +
  geom_col(fill = "orange", alpha = 0.8) +
  coord_flip() +
  labs(title = "Average Scores by Genre (Min. 50 Anime)",
       x = "Genre",
       y = "Average Score") +
  theme_minimal()
## Studio Analysis
# Top studios by number of anime.
# A title credited to several studios is counted once for each of them.
studio_counts <- anime_clean %>%
  separate_rows(studios, sep = ",") %>%
  mutate(studios = str_trim(studios)) %>%
  filter(!(is.na(studios) | studios == "")) %>%
  count(studios) %>%
  arrange(desc(n)) %>%
  slice_head(n = 15)

# Horizontal bar chart of the 15 most prolific studios
ggplot(studio_counts, aes(x = reorder(studios, n), y = n)) +
  geom_col(fill = "green", alpha = 0.7) +
  coord_flip() +
  ggtitle("Top 15 Studios by Number of Anime") +
  xlab("Studio") +
  ylab("Number of Anime") +
  theme_minimal()
# Average scores by studio (top studios only).
# Studios need at least 10 titles to be ranked; the 10 best averages are kept.
studio_scores <- anime_clean %>%
  separate_rows(studios, sep = ",") %>%
  mutate(studios = str_trim(studios)) %>%
  filter(!(is.na(studios) | studios == "")) %>%
  group_by(studios) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE),
    count = n()
  ) %>%
  ungroup() %>%
  filter(count >= 10) %>%
  arrange(desc(avg_score)) %>%
  slice_head(n = 10)

# Horizontal bar chart, highest-rated studio at the top
ggplot(studio_scores, aes(x = reorder(studios, avg_score), y = avg_score)) +
  geom_col(fill = "darkblue", alpha = 0.8) +
  coord_flip() +
  ggtitle("Top 10 Studios by Average Score (Min. 10 Anime)") +
  xlab("Studio") +
  ylab("Average Score") +
  theme_minimal()
# Anime count and average score by year (1980 onwards)
yearly_stats <- anime_clean %>%
  group_by(year) %>%
  summarise(
    count = n(),
    avg_score = mean(score, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(year >= 1980) # Focus on modern era

# Dual-axis plot.
# The average score is multiplied by 100 to share the primary (count) axis;
# the secondary axis divides by the same factor to show the original scale.
p3 <- ggplot(yearly_stats, aes(x = year)) +
  # `linewidth` replaces `size` for line geoms (deprecated since ggplot2 3.4.0)
  geom_line(aes(y = count), color = "blue", linewidth = 1) +
  geom_line(aes(y = avg_score * 100), color = "red", linewidth = 1) +
  scale_y_continuous(
    name = "Number of Anime",
    # `transform` replaces the `trans` argument deprecated in ggplot2 3.5.0
    sec.axis = sec_axis(transform = ~ . / 100, name = "Average Score")
  ) +
  labs(title = "Anime Production and Quality Trends Over Time",
       subtitle = "Blue: Count, Red: Average Score",
       x = "Year") +
  theme_minimal()
## Warning: The `trans` argument of `sec_axis()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Anime types distribution: number of titles per release type, most common
# first; titles without a recorded type are left out.
type_distribution <- anime_clean %>%
  filter(!is.na(type)) %>%
  count(type) %>%
  arrange(desc(n))

# Horizontal bar chart of the type counts
ggplot(type_distribution, aes(x = reorder(type, n), y = n)) +
  geom_col(fill = "#008080", alpha = 0.8) +
  coord_flip() +
  ggtitle("Distribution of Anime Types") +
  xlab("Type") +
  ylab("Count") +
  theme_minimal()
# Episode count analysis for TV series.
# Restricted to TV titles with a known episode count of at most 200.
tv_episodes <- anime_clean %>%
  filter(type == "TV" & !is.na(episodes_clean) & episodes_clean <= 200) %>%
  select(name, episodes_clean, score, year)

# Scatter plot of score against episode count with a linear trend line
ggplot(tv_episodes, aes(x = episodes_clean, y = score)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("Relationship between Episode Count and Score (TV Series)") +
  xlab("Number of Episodes") +
  ylab("Score") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Correlation between episode count and score for the TV subset,
# computed on rows where both values are present.
cor_episodes_score <- with(
  tv_episodes,
  cor(episodes_clean, score, use = "complete.obs")
)
cat("Correlation between episodes and score:", round(cor_episodes_score, 3), "\n")
## Correlation between episodes and score: 0.022
#' Simple content-based recommendation by genre
#'
#' Returns the highest-scoring ranked titles whose `genres` string matches
#' `target_genre`.
#'
#' @param target_genre A single pattern passed to `str_detect()`; a plain
#'   string is therefore interpreted as a regular expression (e.g.
#'   `"Action|Comedy"` matches either genre).
#' @param min_score Minimum score a title must have to be recommended.
#' @param max_results Maximum number of rows to return.
#' @param data Data frame to search. Defaults to `anime_clean`, which is what
#'   the function always used before this argument existed. It must provide
#'   the columns `name`, `score`, `genres`, `type`, `episodes`, `year`,
#'   `studios` and `rank`.
#' @return A data frame with columns `name`, `score`, `genres`, `type`,
#'   `episodes`, `year` and `studios`, sorted by descending score and then by
#'   rank, containing at most `max_results` rows.
recommend_anime <- function(target_genre, min_score = 7.0, max_results = 10,
                            data = anime_clean) {
  # An NA or non-scalar pattern would otherwise just produce an empty or
  # recycled match silently; fail loudly instead.
  stopifnot(length(target_genre) == 1L, !is.na(target_genre))

  data %>%
    filter(
      str_detect(genres, target_genre),
      score >= min_score,
      !is.na(rank) # unranked titles are excluded
    ) %>%
    arrange(desc(score), rank) %>%
    select(name, score, genres, type, episodes, year, studios) %>%
    head(max_results)
}
##Example
# Numerical variables correlation.
# Keep only rows where every selected numeric variable is present, then draw
# the upper triangle of the correlation matrix with hierarchical ordering.
numeric_vars <- anime_clean %>%
  select(score, episodes_clean, year, rank, scored_by, duration_minutes) %>%
  drop_na()

if (nrow(numeric_vars) > 0) {
  cor_matrix <- cor(numeric_vars, use = "complete.obs")
  corrplot(
    cor_matrix,
    method = "color",
    type = "upper",
    order = "hclust",
    tl.cex = 0.8,
    tl.col = "black"
  )
}
#Generate summary statistics
library(dplyr)
library(knitr)
# Unnest genres first: one row per (title, genre) pair
genres_expanded <- anime_clean %>%
  unnest(genres_list)

# Most common genre. Missing/empty labels are excluded so that titles without
# genre information can never be reported as the "top genre".
top_genre <- genres_expanded %>%
  filter(!is.na(genres_list), genres_list != "") %>%
  count(genres_list, sort = TRUE) %>%
  slice(1) %>%
  pull(genres_list)

# One-row summary of the cleaned dataset
summary_stats <- anime_clean %>%
  summarise(
    total_anime = n(),
    avg_score = round(mean(score, na.rm = TRUE), 2),
    median_score = round(median(score, na.rm = TRUE), 2),
    most_common_type = names(sort(table(type), decreasing = TRUE))[1],
    year_range = paste(min(year, na.rm = TRUE), "-", max(year, na.rm = TRUE)),
    # `.env$` makes explicit that this is the variable computed above rather
    # than a column of the data being summarised
    top_genre = .env$top_genre
  )

# Display table
knitr::kable(summary_stats, caption = "Dataset Summary Statistics")
total_anime | avg_score | median_score | most_common_type | year_range | top_genre |
---|---|---|---|---|---|
18161 | 6.41 | 6.38 | TV | 1960 - 2024 | Comedy |
# Top 10 highest rated anime.
# Only ranked titles with at least 1000 votes are considered, so that the
# ranking is not dominated by titles scored by very few users.
top_anime <- anime_clean %>%
  filter(!is.na(rank) & scored_by >= 1000) %>%
  arrange(desc(score)) %>%
  slice_head(n = 10) %>%
  select(name, score, type, year, genres, studios)

DT::datatable(top_anime, caption = "Top 10 Highest Rated Anime (Min. 1000 votes)")
This analysis reveals several key insights about the anime landscape:
Quality Distribution: The average anime score is 6.41 (median 6.38), with the middle half of scores falling between roughly 5.8 and 7.0.
Genre Preferences: Comedy is the most common genre in the cleaned dataset (see the summary statistics table above).
Production Trends: Anime production has significantly increased over the years, with peak activity in recent decades.
Studio Impact: Different studios show varying quality patterns, with some consistently producing higher-rated content.
Type Distribution: TV series dominate the dataset, but movies and OVAs also represent significant portions.
This dataset provides excellent opportunities for building recommendation systems, analyzing industry trends, and understanding audience preferences in the anime medium.
Report generated on 12 August 2025 using R Markdown