# Install tidyverse only when it is missing, so that re-running or knitting
# this report does not reinstall the package (and hit the network) every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
# Load the tidyverse library for use in this session
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Dataset Overview

This report provides a comprehensive analysis of the “Top Popular Anime” dataset from Kaggle, containing 28,000+ anime entries (28,825 rows in the file loaded below) with rich metadata from MyAnimeList.

Key Features

  • 28,000+ anime entries with diverse metadata
  • Covers TV series, movies, OVAs, and continuing/upcoming anime
  • Suitable for recommendation models, trend analysis, genre studies
#Load required packages
library(tidyverse)
library(lubridate)
library(DT)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(corrplot)
## corrplot 0.95 loaded
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

# Make the minimal theme the session-wide default for every ggplot
ggplot2::theme_set(ggplot2::theme_minimal())

# Read the anime dataset from the CSV file in the working directory
anime_df <- readr::read_csv(file = "popular_anime.csv")
## Rows: 28825 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (11): name, genres, type, status, duration_per_ep, rating, studios, pro...
## dbl   (5): id, episodes, score, scored_by, rank
## dttm  (2): aired_from, aired_to
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display basic information about the dataset.
# dim() returns c(rows, columns), so passing it to cat() twice printed both
# numbers on each side of "rows x"; nrow()/ncol() report each count once.
cat("Dataset dimensions:", nrow(anime_df), "rows x", ncol(anime_df), "columns\n")
## Dataset dimensions: 28825 rows x 18 columns
# Print a compact, column-wise overview (name, type, first values)
anime_df %>% glimpse()
## Rows: 28,825
## Columns: 18
## $ id              <dbl> 52991, 5114, 9253, 38524, 28977, 60022, 39486, 11061, …
## $ name            <chr> "Frieren: Beyond Journey's End", "Fullmetal Alchemist:…
## $ genres          <chr> "Adventure, Drama, Fantasy", "Action, Adventure, Drama…
## $ type            <chr> "TV", "TV", "TV", "TV", "TV", "TV Special", "Movie", "…
## $ episodes        <dbl> 28, 64, 24, 10, 51, 1, 1, 148, 51, 13, 110, 13, 12, 13…
## $ status          <chr> "Finished Airing", "Finished Airing", "Finished Airing…
## $ aired_from      <dttm> 2023-09-29, 2009-04-05, 2011-04-06, 2019-04-29, 2015-…
## $ aired_to        <dttm> 2024-03-22, 2010-07-04, 2011-09-14, 2019-07-01, 2016-…
## $ duration_per_ep <chr> "24 min per ep", "24 min per ep", "24 min per ep", "23…
## $ score           <dbl> 9.30, 9.10, 9.07, 9.05, 9.05, 9.04, 9.04, 9.03, 9.02, …
## $ scored_by       <dbl> 676737, 2223666, 1467570, 1700946, 264260, 83738, 8078…
## $ rank            <dbl> 1, 2, 3, 4, 5, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ rating          <chr> "PG-13 - Teens 13 or older", "R - 17+ (violence & prof…
## $ studios         <chr> "Madhouse", "Bones", "White Fox", "Wit Studio", "Banda…
## $ producers       <chr> "Aniplex, Dentsu, Shogakukan-Shueisha Productions, Nip…
## $ image           <chr> "https://cdn.myanimelist.net/images/anime/1015/138006l…
## $ trailer         <chr> "https://www.youtube.com/watch?v=ZEkwCGJ3o7M", "https:…
## $ synopsis        <chr> "During their decade-long quest to defeat the Demon Ki…
# Per-column summary: quantiles and NA counts for numeric and date-time
# columns, length/class/mode for character columns
anime_df %>% summary()
##        id            name              genres              type          
##  Min.   :    1   Length:28825       Length:28825       Length:28825      
##  1st Qu.:15979   Class :character   Class :character   Class :character  
##  Median :38070   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :34132                                                           
##  3rd Qu.:51060                                                           
##  Max.   :61871                                                           
##                                                                          
##     episodes          status            aired_from                 
##  Min.   :   1.00   Length:28825       Min.   :1917-01-01 00:00:00  
##  1st Qu.:   1.00   Class :character   1st Qu.:2004-03-06 00:00:00  
##  Median :   2.00   Mode  :character   Median :2014-01-10 00:00:00  
##  Mean   :  13.83                      Mean   :2009-08-25 11:09:28  
##  3rd Qu.:  13.00                      3rd Qu.:2020-01-01 00:00:00  
##  Max.   :3057.00                      Max.   :2027-01-01 00:00:00  
##  NA's   :779                          NA's   :955                  
##     aired_to                   duration_per_ep        score      
##  Min.   :1962-02-25 00:00:00   Length:28825       Min.   :1.890  
##  1st Qu.:2006-03-24 00:00:00   Class :character   1st Qu.:5.780  
##  Median :2014-11-24 12:00:00   Mode  :character   Median :6.370  
##  Mean   :2011-08-31 20:10:29                      Mean   :6.398  
##  3rd Qu.:2020-03-23 00:00:00                      3rd Qu.:7.030  
##  Max.   :2025-12-05 00:00:00                      Max.   :9.300  
##  NA's   :17895                                    NA's   :10226  
##    scored_by            rank          rating            studios         
##  Min.   :    101   Min.   :    0   Length:28825       Length:28825      
##  1st Qu.:    340   1st Qu.: 4989   Class :character   Class :character  
##  Median :   1544   Median :10462   Mode  :character   Mode  :character  
##  Mean   :  29892   Mean   :10523                                        
##  3rd Qu.:  10092   3rd Qu.:16023                                        
##  Max.   :2943048   Max.   :21729                                        
##  NA's   :10226     NA's   :6863                                         
##   producers            image             trailer            synopsis        
##  Length:28825       Length:28825       Length:28825       Length:28825      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
## 
# Count missing values per column and express them as a share of all rows.
# across() and pivot_longer() replace the superseded summarise_all() and
# gather(); the resulting columns (Column, Missing_Count) are unchanged.
missing_values <- anime_df %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Column",
    values_to = "Missing_Count"
  ) %>%
  mutate(
    Missing_Percentage = round((Missing_Count / nrow(anime_df)) * 100, 2)
  ) %>%
  # Columns with the most gaps first
  arrange(desc(Missing_Count))

# Interactive table of the result
DT::datatable(missing_values, caption = "Missing Values Analysis")
# Horizontal bar chart of the columns that contain at least one NA,
# ordered by how many values are missing
ggplot(
  data = filter(missing_values, Missing_Count > 0),
  mapping = aes(x = reorder(Column, Missing_Count), y = Missing_Count)
) +
  geom_col(fill = "coral") +
  labs(
    title = "Missing Values by Column",
    x = "Column",
    y = "Number of Missing Values"
  ) +
  coord_flip() +
  theme_minimal()

## Inspect the raw dataset before cleaning

# Banner marking the start of the diagnostic output
cat("=== DATASET DEBUG INFO ===\n")
## === DATASET DEBUG INFO ===
# Row count followed by column count
cat("Dimensions:", c(nrow(anime_df), ncol(anime_df)), "\n")
## Dimensions: 28825 18
cat("Column names:\n")
## Column names:
print(names(anime_df))
##  [1] "id"              "name"            "genres"          "type"           
##  [5] "episodes"        "status"          "aired_from"      "aired_to"       
##  [9] "duration_per_ep" "score"           "scored_by"       "rank"           
## [13] "rating"          "studios"         "producers"       "image"          
## [17] "trailer"         "synopsis"
cat("\nScore column info:\n")
## 
## Score column info:
# Storage class of the score column
cat("Class:", class(anime_df[["score"]]), "\n")
## Class: numeric
cat("First 10 values:\n")
## First 10 values:
print(head(anime_df[["score"]], n = 10))
##  [1] 9.30 9.10 9.07 9.05 9.05 9.04 9.04 9.03 9.02 9.02
cat("Unique values (first 20):\n")
## Unique values (first 20):
# Distinct scores in order of first appearance, truncated to 20
print(anime_df[["score"]] %>% unique() %>% head(n = 20))
##  [1] 9.30 9.10 9.07 9.05 9.04 9.03 9.02 9.01 8.99 8.98 8.95 8.93 8.91 8.90 8.89
## [16] 8.88 8.87 8.86 8.84 8.83
# Convert a duration label such as "24 min per ep" into minutes.
#
# Each unit is extracted separately and summed, so a combined label like
# "1 hr 30 min" yields 90 rather than 60 (the previous logic multiplied the
# first number by 60 and ignored the minutes part whenever "hr" was present).
# Labels with no recognised unit, and NA inputs, return NA.
# NOTE(review): the "sec" unit is handled on the assumption that very short
# entries are labelled in seconds -- confirm against the raw data.
parse_duration_minutes <- function(duration_text) {
  hours <- as.numeric(str_extract(duration_text, "\\d+(?=\\s*hr)"))
  minutes <- as.numeric(str_extract(duration_text, "\\d+(?=\\s*min)"))
  seconds <- as.numeric(str_extract(duration_text, "\\d+(?=\\s*sec)"))

  has_no_unit <- is.na(hours) & is.na(minutes) & is.na(seconds)
  total_minutes <- coalesce(hours, 0) * 60 +
    coalesce(minutes, 0) +
    coalesce(seconds, 0) / 60

  if_else(has_no_unit, NA_real_, total_minutes)
}

# Clean and preprocess the data
anime_clean <- anime_df %>%
  mutate(
    # read_csv() already parsed these columns as date-times; as_date() drops
    # the time part directly instead of re-parsing a text representation
    aired_from = as_date(aired_from),
    aired_to = as_date(aired_to),
    # Extract year from aired_from
    year = year(aired_from),
    # Split the comma-separated genre string into a list column
    genres_list = str_split(genres, ",\\s*"),
    # Keep a numeric copy of the episode count
    episodes_clean = as.numeric(episodes),
    # Episode duration in minutes
    duration_minutes = parse_duration_minutes(duration_per_ep)
  ) %>%
  # Keep only scored entries whose start year falls in 1960-2024
  filter(
    !is.na(score),
    !is.na(year),
    year >= 1960,
    year <= 2024
  )

cat("Cleaned dataset dimensions:", nrow(anime_clean), "rows x", ncol(anime_clean), "columns\n")
## Cleaned dataset dimensions: 18161 rows x 22 columns
# Distribution of anime scores
# The mean is computed once and passed as a constant: mapping it inside aes()
# makes geom_vline() draw one identical line for every row of anime_clean.
mean_score <- mean(anime_clean$score, na.rm = TRUE)

p1 <- anime_clean %>%
  ggplot(aes(x = score)) +
  geom_histogram(binwidth = 0.2, fill = "skyblue", color = "white", alpha = 0.8) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_vline(
    xintercept = mean_score,
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  labs(
    title = "Distribution of Anime Scores",
    subtitle = paste("Mean Score:", round(mean_score, 2)),
    x = "Score (out of 10)",
    y = "Count"
  ) +
  theme_minimal()

# Interactive version of the histogram
ggplotly(p1)

Top Genres Analysis

# One row per (anime, genre) pair, then tally how often each genre occurs.
# Missing and empty genre labels are excluded from the tally.
genres_expanded <- anime_clean %>%
  unnest(cols = genres_list) %>%
  mutate(genres_list = str_trim(genres_list)) %>%
  filter(!is.na(genres_list), genres_list != "") %>%
  count(genres_list, sort = TRUE)

# Keep the 15 most frequent genres
top_genres <- slice_head(genres_expanded, n = 15)
# Horizontal bar chart of the 15 most frequent genres, largest at the top
p2 <- ggplot(
  data = top_genres,
  mapping = aes(x = reorder(genres_list, n), y = n)
) +
  geom_col(fill = "purple", alpha = 0.7) +
  labs(
    title = "Top 15 Anime Genres",
    x = "Genre",
    y = "Number of Anime"
  ) +
  coord_flip() +
  theme_minimal()

# Interactive version of the chart
ggplotly(p2)

## Average scores by genre (10 best-scoring genres with at least 50 anime)

genre_scores <- anime_clean %>%
  unnest(genres_list) %>%
  mutate(genres_list = str_trim(genres_list)) %>%
  # Drop missing/empty labels, as the genre-count and studio analyses do;
  # otherwise anime without genres form an "NA" group that can be ranked
  filter(!is.na(genres_list), genres_list != "") %>%
  group_by(genres_list) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  filter(count >= 50) %>% # Only genres with at least 50 anime
  arrange(desc(avg_score)) %>%
  head(10)

# Horizontal bar chart, best-scoring genre at the top
genre_scores %>%
  ggplot(aes(x = reorder(genres_list, avg_score), y = avg_score)) +
  geom_col(fill = "orange", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Average Scores by Genre (Min. 50 Anime)",
    x = "Genre",
    y = "Average Score"
  ) +
  theme_minimal()

## Studio Analysis

# Split multi-studio entries into one row per studio, discard blank or
# missing names, and keep the 15 studios credited on the most anime
studio_counts <- anime_clean %>%
  separate_rows(studios, sep = ",") %>%
  mutate(studios = str_trim(studios)) %>%
  filter(!is.na(studios), studios != "") %>%
  count(studios, sort = TRUE) %>%
  slice_head(n = 15)

# Horizontal bar chart, most prolific studio at the top
ggplot(
  data = studio_counts,
  mapping = aes(x = reorder(studios, n), y = n)
) +
  geom_col(fill = "green", alpha = 0.7) +
  labs(
    title = "Top 15 Studios by Number of Anime",
    x = "Studio",
    y = "Number of Anime"
  ) +
  coord_flip() +
  theme_minimal()

## Mean score per studio, restricted to studios with at least 10 anime;
## the 10 highest-scoring studios are kept

studio_scores <- anime_clean %>%
  separate_rows(studios, sep = ",") %>%
  mutate(studios = str_trim(studios)) %>%
  filter(!is.na(studios), studios != "") %>%
  group_by(studios) %>%
  summarise(
    avg_score = mean(score, na.rm = TRUE),
    count = n()
  ) %>%
  ungroup() %>%
  filter(count >= 10) %>%
  arrange(desc(avg_score)) %>%
  slice_head(n = 10)

# Horizontal bar chart, best-scoring studio at the top
ggplot(
  data = studio_scores,
  mapping = aes(x = reorder(studios, avg_score), y = avg_score)
) +
  geom_col(fill = "darkblue", alpha = 0.8) +
  labs(
    title = "Top 10 Studios by Average Score (Min. 10 Anime)",
    x = "Studio",
    y = "Average Score"
  ) +
  coord_flip() +
  theme_minimal()

Temporal Analysis

# Anime count and average score by year
yearly_stats <- anime_clean %>%
  group_by(year) %>%
  summarise(
    count = n(),
    avg_score = mean(score, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(year >= 1980) # Focus on modern era

# Scores (out of 10) are multiplied by this factor so they can share the left
# axis with the yearly counts; the secondary axis divides it back out.
score_axis_scale <- 100

# Dual-axis plot
# `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0) and
# `transform` replaces the `trans` argument of sec_axis() (deprecated in 3.5.0).
p3 <- ggplot(yearly_stats, aes(x = year)) +
  geom_line(aes(y = count), color = "blue", linewidth = 1) +
  geom_line(aes(y = avg_score * score_axis_scale), color = "red", linewidth = 1) +
  scale_y_continuous(
    name = "Number of Anime",
    sec.axis = sec_axis(
      transform = ~ . / score_axis_scale,
      name = "Average Score"
    )
  ) +
  labs(
    title = "Anime Production and Quality Trends Over Time",
    subtitle = "Blue: Count, Red: Average Score",
    x = "Year"
  ) +
  theme_minimal()

# Interactive version of the plot
ggplotly(p3)

Type and Episode Analysis

# Number of anime per release type, most common first; entries without a
# type are left out
type_distribution <- anime_clean %>%
  filter(!is.na(type)) %>%
  count(type, sort = TRUE)

# Horizontal bar chart, most common type at the top
ggplot(
  data = type_distribution,
  mapping = aes(x = reorder(type, n), y = n)
) +
  geom_col(fill = "#008080", alpha = 0.8) +
  labs(
    title = "Distribution of Anime Types",
    x = "Type",
    y = "Count"
  ) +
  coord_flip() +
  theme_minimal()

## TV series with a known episode count of at most 200

tv_episodes <- anime_clean %>%
  filter(type == "TV" & !is.na(episodes_clean) & episodes_clean <= 200) %>%
  select(name, episodes_clean, score, year)

# Scatter plot of score against episode count with a linear trend line
ggplot(
  data = tv_episodes,
  mapping = aes(x = episodes_clean, y = score)
) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Relationship between Episode Count and Score (TV Series)",
    x = "Number of Episodes",
    y = "Score"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## Correlation between episode count and score for the TV subset

cor_episodes_score <- with(
  tv_episodes,
  cor(episodes_clean, score, use = "complete.obs")
)
cat("Correlation between episodes and score:", round(cor_episodes_score, 3), "\n")
## Correlation between episodes and score: 0.022

## Simple content-based recommendation function
#
# Returns the best-scored anime whose `genres` string matches a genre.
#
# Arguments:
#   target_genre  Single non-empty string. It is passed to str_detect() as a
#                 pattern, so it is matched as a regular expression anywhere
#                 in the comma-separated `genres` text.
#   min_score     Minimum score (inclusive) an anime needs to be returned.
#   max_results   Passed to head() to limit the number of rows returned.
#   data          Data frame to search; must contain the columns name, score,
#                 genres, type, episodes, year, studios and rank. Defaults to
#                 the global `anime_clean`, so existing calls keep working.
#
# Returns a data frame with columns name, score, genres, type, episodes,
# year and studios, ordered by descending score and then ascending rank.
# Unranked anime (NA rank) are excluded.
recommend_anime <- function(target_genre, min_score = 7.0, max_results = 10,
                            data = anime_clean) {
  # Fail early with a clear message instead of an obscure pattern error
  # (empty string) or a silently empty result (NA pattern)
  stopifnot(
    is.character(target_genre),
    length(target_genre) == 1L,
    !is.na(target_genre),
    nzchar(target_genre),
    is.numeric(min_score),
    length(min_score) == 1L,
    is.numeric(max_results),
    length(max_results) == 1L
  )

  data %>%
    filter(
      str_detect(genres, target_genre),
      score >= min_score,
      !is.na(rank)
    ) %>%
    arrange(desc(score), rank) %>%
    select(name, score, genres, type, episodes, year, studios) %>%
    head(max_results)
}

## Example: ranked Action anime scoring 8.0 or higher

action_recommendations <- recommend_anime(
  target_genre = "Action",
  min_score = 8.0
)
DT::datatable(
  action_recommendations,
  caption = "Top Action Anime Recommendations (Score >= 8.0)"
)

Correlation Matrix

# Numeric columns used for the correlation matrix; rows with an NA in any
# of them are dropped
numeric_vars <- anime_clean %>%
  select(score, episodes_clean, year, rank, scored_by, duration_minutes) %>%
  drop_na()

# Only compute and plot the matrix when at least one complete row remains
if (nrow(numeric_vars) > 0) {
  cor_matrix <- cor(numeric_vars, use = "complete.obs")
  corrplot(
    cor_matrix,
    method = "color",
    type = "upper",
    order = "hclust",
    tl.cex = 0.8,
    tl.col = "black"
  )
}

Summary and Insights

Key Findings

# Generate summary statistics
library(dplyr)
library(knitr)

## Compute most common genre
# Done in a single pipeline so that `genres_expanded` (the genre count table
# built in the genre analysis) is not overwritten with a differently shaped
# object. Missing and empty labels are dropped so that "NA" can never be
# reported as the top genre.
top_genre <- anime_clean %>%
  unnest(genres_list) %>%
  mutate(genres_list = str_trim(genres_list)) %>%
  filter(!is.na(genres_list), genres_list != "") %>%
  count(genres_list, sort = TRUE) %>%
  slice(1) %>%
  pull(genres_list)

## Summary stats
# One-row table: size of the cleaned dataset, central tendency of the score,
# the most frequent type, the span of start years and the most common genre
summary_stats <- anime_clean %>%
  summarise(
    total_anime = n(),
    avg_score = round(mean(score, na.rm = TRUE), 2),
    median_score = round(median(score, na.rm = TRUE), 2),
    most_common_type = names(sort(table(type), decreasing = TRUE))[1],
    year_range = paste(min(year, na.rm = TRUE), "-", max(year, na.rm = TRUE)),
    top_genre = top_genre
  )

## Display table
knitr::kable(summary_stats, caption = "Dataset Summary Statistics")
Dataset Summary Statistics
total_anime avg_score median_score most_common_type year_range top_genre
18161 6.41 6.38 TV 1960 - 2024 Comedy

Top Performing Anime

# Ten best-scored anime among ranked entries with at least 1000 votes
# (the vote floor keeps scores based on few ratings out of the list)
top_anime <- anime_clean %>%
  filter(!is.na(rank) & scored_by >= 1000) %>%
  arrange(desc(score)) %>%
  select(name, score, type, year, genres, studios) %>%
  slice_head(n = 10)

DT::datatable(
  top_anime,
  caption = "Top 10 Highest Rated Anime (Min. 1000 votes)"
)

Conclusion

This analysis reveals several key insights about the anime landscape:

  1. Quality Distribution: The average anime score is 6.41, with most anime clustering around 6-8 points.

  2. Genre Preferences: Comedy is the most common genre in the cleaned dataset.

  3. Production Trends: Anime production has significantly increased over the years, with peak activity in recent decades.

  4. Studio Impact: Different studios show varying quality patterns, with some consistently producing higher-rated content.

  5. Type Distribution: TV series dominate the dataset, but movies and OVAs also represent significant portions.

This dataset provides excellent opportunities for building recommendation systems, analyzing industry trends, and understanding audience preferences in the anime medium.


Report generated on 12 August 2025 using R Markdown