myanimelist

library(tidyverse)
library(rstatix)
library(skimr)
library(naniar)

# Read the MyAnimeList CSV into a tibble, then inspect the parsed column types.
# NOTE(review): the path is an absolute, machine-specific Windows location.
anime_data <- read_csv(
  file = "D:\\wallpapers and photos\\csv\\mal_top2000_anime.csv"
)

str(object = anime_data)

## spc_tbl_ [2,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1            : num [1:2000] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Name            : chr [1:2000] "Fullmetal Alchemist: Brotherhood" "Spy x Family" "Shingeki no Kyojin Season 3 Part 2" "Steins;Gate" ...
##  $ Type            : chr [1:2000] "TV" "TV" "TV" "TV" ...
##  $ Score           : num [1:2000] 9.14 9.09 9.08 9.08 9.08 9.05 9.05 9.05 9.04 9.04 ...
##  $ Score Rank      : num [1:2000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Popularity Rank : num [1:2000] 3 350 32 13 335 ...
##  $ Air Date        : chr [1:2000] "Apr 5, 2009 to Jul 4, 2010" "Apr 9, 2022 to ?" "Apr 29, 2019 to Jul 1, 2019" "Apr 6, 2011 to Sep 14, 2011" ...
##  $ Studio          : chr [1:2000] "['Bones']" "['Wit Studio', '            CloverWorks']" "['Wit Studio']" "['White Fox']" ...
##  $ Num. of episodes: num [1:2000] 64 12 10 24 51 51 1 148 13 13 ...
##  $ Genres          : chr [1:2000] "['Action', 'Adventure', 'Drama', 'Fantasy']" "['Action', 'Comedy']" "['Action', 'Drama']" "['Drama', 'Sci-Fi', 'Suspense']" ...
##  $ Theme(s)        : chr [1:2000] "['Military']" "['Childcare']" "['Gore', 'Military', 'Survival']" "['Psychological', 'Time Travel']" ...
##  $ Demographic     : chr [1:2000] "Shounen" "Shounen" "Shounen" "None" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   Name = col_character(),
##   ..   Type = col_character(),
##   ..   Score = col_double(),
##   ..   `Score Rank` = col_double(),
##   ..   `Popularity Rank` = col_double(),
##   ..   `Air Date` = col_character(),
##   ..   Studio = col_character(),
##   ..   `Num. of episodes` = col_double(),
##   ..   Genres = col_character(),
##   ..   `Theme(s)` = col_character(),
##   ..   Demographic = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Per-column summary of the raw data (completeness, uniqueness, quantiles).
skim(anime_data)

Data summary
Name	anime_data
Number of rows	2000
Number of columns	12
_______________________
Column type frequency:
character	7
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Name	1	1	96	1999
Type	1	2	7	6
Air Date	1	4	28	1832
Studio	1	7	110	273
Genres	1	8	81	370
Theme(s)	1	8	87	322
Demographic	1	4	7	6

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
…1	1	999.50	577.49	0.00	499.75	999.50	1499.25	1999.00	▇▇▇▇▇
Score	1	7.84	0.35	7.41	7.55	7.74	8.05	9.14	▇▅▂▁▁
Score Rank	1	1000.51	577.50	1.00	500.75	1000.50	1500.25	2000.00	▇▇▇▇▇
Popularity Rank	1	2311.68	2061.08	1.00	696.50	1778.00	3369.25	12164.00	▇▃▁▁▁
Num. of episodes	1	16.86	49.63	0.00	1.00	12.00	21.00	1787.00	▇▁▁▁▁

# Plot the missing-value pattern of every column of the raw data.
vis_miss(anime_data)

There are no missing values in the data.

Let's do some data cleaning to get a better understanding of the data.

library(lubridate)
# Strip the list syntax from a list-like string column, e.g.
# "['Wit Studio', '     CloverWorks']" -> "Wit Studio, CloverWorks".
# Some entries appear to be wrapped in double quotes instead of single quotes
# (the studio counts later in this file strip a leftover `"]`), so both quote
# styles are removed here.
strip_list_syntax <- function(x) {
  x <- str_remove(x, "['\"]\\]")   # closing quote + bracket
  x <- str_remove(x, "\\[.")       # opening bracket + first quote
  x <- str_remove_all(x, "['\"]")  # quotes around the individual entries
  str_squish(x)                    # squish last so no double spaces remain
}

# Build the cleaned table used by the rest of the analysis:
#   * friendlier column names (`no`, `Theme`, `Episode`),
#   * list-like columns reduced to comma-separated text,
#   * every character column title-cased (this also turns "TV" into "Tv"),
#   * `Demographic` and `Type` as factors,
#   * `Air Date` split into `start` / `End` dates.
anime <- anime_data %>%
  rename(no = "...1", Theme = "Theme(s)", Episode = "Num. of episodes") %>%
  mutate(across(c(Studio, Theme, Genres), strip_list_syntax)) %>%
  mutate(across(where(is.character), str_to_title)) %>%
  mutate(across(c(Demographic, Type), as.factor)) %>%
  # Title-casing turned the " to " separator into " To ". Rows holding a
  # single date have no second piece, so fill `End` with NA without warning.
  separate(`Air Date`, into = c("start", "End"), sep = "To", fill = "right") %>%
  # Values that are not month-day-year dates (e.g. "?") become NA.
  mutate(across(c(start, End), mdy))

anime

## # A tibble: 2,000 × 13
##       no Name   Type  Score Score…¹ Popul…² start      End        Studio Episode
##    <dbl> <chr>  <fct> <dbl>   <dbl>   <dbl> <date>     <date>     <chr>    <dbl>
##  1     0 Fullm… Tv     9.14       1       3 2009-04-05 2010-07-04 Bones       64
##  2     1 Spy X… Tv     9.09       2     350 2022-04-09 NA         Wit S…      12
##  3     2 Shing… Tv     9.08       3      32 2019-04-29 2019-07-01 Wit S…      10
##  4     3 Stein… Tv     9.08       4      13 2011-04-06 2011-09-14 White…      24
##  5     4 Ginta… Tv     9.08       5     335 2015-04-08 2016-03-30 Banda…      51
##  6     5 Ginta… Tv     9.05       6     385 2011-04-04 2012-03-26 Sunri…      51
##  7     6 Ginta… Movie  9.05       7    1746 2021-01-08 NA         Banda…       1
##  8     7 Hunte… Tv     9.05       8      10 2011-10-02 2014-09-24 Madho…     148
##  9     8 Fruit… Tv     9.04       9     551 2021-04-06 2021-06-29 Tms E…      13
## 10     9 Ginta… Tv     9.04      10     695 2012-10-04 2013-03-28 Sunri…      13
## # … with 1,990 more rows, 3 more variables: Genres <chr>, Theme <chr>,
## #   Demographic <fct>, and abbreviated variable names ¹`Score Rank`,
## #   ²`Popularity Rank`

# Count anime by their first-listed studio, most frequent first.
anime %>%
  # Drop any double quote / closing bracket left over from the list syntax.
  mutate(Studio = str_remove_all(Studio, "[\"\\]]")) %>%
  separate(
    Studio,
    into = c("studio 1", "studio 2"),
    sep = ",\\s*",    # also swallow the whitespace after the comma
    extra = "drop",   # only the first two studios are kept
    fill = "right"    # single-studio rows get NA in `studio 2`
  ) %>%
  count(`studio 1`, sort = TRUE)

## # A tibble: 188 × 2
##    `studio 1`            n
##    <chr>             <int>
##  1 Production I.g      114
##  2 Sunrise             112
##  3 Toei Animation      105
##  4 Madhouse             95
##  5 Tms Entertainment    79
##  6 J.c.staff            78
##  7 A-1 Pictures         76
##  8 Studio Deen          76
##  9 Bones                63
## 10 Kyoto Animation      62
## # … with 178 more rows

# Count anime by their second-listed studio, most frequent first.
# Rows with a single studio are excluded.
anime %>%
  # Drop any double quote / closing bracket left over from the list syntax.
  mutate(Studio = str_remove_all(Studio, "[\"\\]]")) %>%
  separate(
    Studio,
    into = c("studio 1", "studio 2"),
    sep = ",\\s*",    # swallow the whitespace after the comma so names are trimmed
    extra = "drop",   # only the first two studios are kept
    fill = "right"    # single-studio rows get NA in `studio 2`
  ) %>%
  filter(!is.na(`studio 2`)) %>%
  count(`studio 2`, sort = TRUE)

## # A tibble: 79 × 2
##    `studio 2`             n
##    <chr>              <int>
##  1 "  Xebec"              7
##  2 "  Studio Deen"        6
##  3 "  Lidenfilms"         5
##  4 "  Aic"                4
##  5 "  Animation Do"       4
##  6 "  Egg Firm"           4
##  7 "  M.s.c"              4
##  8 "  Madhouse"           4
##  9 "  Mappa"              3
## 10 "  Production I.g"     3
## # … with 69 more rows

Hence, the data is now pretty much clean.

# Print the cleaned tibble again. The two studio pipelines above only print
# their results; they are not assigned back to `anime`.
anime

## # A tibble: 2,000 × 13
##       no Name   Type  Score Score…¹ Popul…² start      End        Studio Episode
##    <dbl> <chr>  <fct> <dbl>   <dbl>   <dbl> <date>     <date>     <chr>    <dbl>
##  1     0 Fullm… Tv     9.14       1       3 2009-04-05 2010-07-04 Bones       64
##  2     1 Spy X… Tv     9.09       2     350 2022-04-09 NA         Wit S…      12
##  3     2 Shing… Tv     9.08       3      32 2019-04-29 2019-07-01 Wit S…      10
##  4     3 Stein… Tv     9.08       4      13 2011-04-06 2011-09-14 White…      24
##  5     4 Ginta… Tv     9.08       5     335 2015-04-08 2016-03-30 Banda…      51
##  6     5 Ginta… Tv     9.05       6     385 2011-04-04 2012-03-26 Sunri…      51
##  7     6 Ginta… Movie  9.05       7    1746 2021-01-08 NA         Banda…       1
##  8     7 Hunte… Tv     9.05       8      10 2011-10-02 2014-09-24 Madho…     148
##  9     8 Fruit… Tv     9.04       9     551 2021-04-06 2021-06-29 Tms E…      13
## 10     9 Ginta… Tv     9.04      10     695 2012-10-04 2013-03-28 Sunri…      13
## # … with 1,990 more rows, 3 more variables: Genres <chr>, Theme <chr>,
## #   Demographic <fct>, and abbreviated variable names ¹`Score Rank`,
## #   ²`Popularity Rank`

library(patchwork)
library(plotly)
# One score column per demographic: each row keeps its score in the column of
# its own demographic and is NA in all the others.
popularity_score <- anime %>%
  pivot_wider(names_from = Demographic, values_from = Score)

# Number of Shounen anime at each score value.
shounen <- popularity_score %>%
  filter(!is.na(Shounen)) %>%
  count(Shounen)

# Bar chart of the Shounen score distribution.
g1 <- ggplot(shounen, aes(x = Shounen, y = n)) +
  geom_bar(stat = "identity", fill = "#FF61C3") +
  labs(title = "Shounen", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g1)

# Number of Shoujo anime at each score value.
shoujo <- popularity_score %>%
  filter(!is.na(Shoujo)) %>%
  count(Shoujo)

# Bar chart of the Shoujo score distribution.
g2 <- ggplot(shoujo, aes(x = Shoujo, y = n)) +
  geom_bar(stat = "identity", fill = "#D39200") +
  labs(title = "Shoujo", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g2)

# Number of Seinen anime at each score value.
seinen <- popularity_score %>%
  filter(!is.na(Seinen)) %>%
  count(Seinen)

# Bar chart of the Seinen score distribution.
# The x-axis label was previously passed as `c = "Ratings"`, so it was never
# applied; it is now set through `x =` like the other demographic plots.
g3 <- ggplot(seinen, aes(x = Seinen, y = n)) +
  geom_bar(stat = "identity", fill = "#93AA00") +
  labs(title = "Seinen", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g3)

# Number of Kids anime at each score value.
kids <- popularity_score %>%
  filter(!is.na(Kids)) %>%
  count(Kids)

# Bar chart of the Kids score distribution.
g4 <- ggplot(kids, aes(x = Kids, y = n)) +
  geom_bar(stat = "identity", fill = "#00C19F") +
  labs(title = "Kids", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g4)

# Number of Josei anime at each score value.
Josei <- popularity_score %>%
  filter(!is.na(Josei)) %>%
  count(Josei)

# Bar chart of the Josei score distribution.
# Stored as `g5`: it used to be assigned to `g4`, which overwrote the Kids plot
# and dropped it from the combined figure below. The `bins` argument was also
# removed because bars drawn with stat = "identity" have no bins.
g5 <- ggplot(Josei, aes(x = Josei, y = n)) +
  geom_bar(stat = "identity", fill = "#DB72FB") +
  labs(title = "Josei", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g5)

# Combined figure: Shounen | Shoujo over Seinen | Kids over Josei.
g1 + g2 / g3 + g4 / g5

In terms of ratings, shounen has the most anime rated above 9, followed by shoujo. Seinen has the most stable ratings but does not have any anime rated above 9.

# Violin plot of the score distribution for each anime type.
x1 <- anime %>%
  ggplot(aes(x = Type, y = Score, colour = Type, fill = Type)) +
  geom_violin() +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text = element_text(colour = "deepskyblue4"),
    plot.title = element_text(colour = "azure3"),
    panel.background = element_rect(fill = "gray99"),
    plot.background = element_rect(colour = "aliceblue")
  )


# One point per anime: score against anime type.
x2 <- anime %>%
  ggplot(aes(x = Type, y = Score, colour = Type, fill = Type)) +
  geom_point() +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text = element_text(colour = "deepskyblue4"),
    plot.title = element_text(colour = "azure3"),
    panel.background = element_rect(fill = "gray99"),
    plot.background = element_rect(colour = "aliceblue")
  )


# Number of anime of each type, one panel per demographic.
x3 <- anime %>%
  ggplot(aes(x = Type, fill = Type)) +
  geom_bar() +
  facet_wrap(vars(Demographic))


# Interactive versions of the violin and point plots.
x1 %>% ggplotly()

x2 %>% ggplotly()

# Static type counts per demographic.
print(x3)

# Violin plot stacked above the point plot (patchwork).
x1 / x2

Thus it is clear that, in terms of anime type, TV anime is the most popular, followed by movies.

# Mean score per demographic, highest first.
avg_success <- anime %>%
  group_by(Demographic) %>%
  summarise(mean_score = mean(Score)) %>%
  arrange(desc(mean_score))

# Bar chart of the mean score per demographic, with Josei left out.
# geom_col() draws the values as-is and replaces the non-canonical
# `stat = "Identity"` spelling.
avg_success %>%
  filter(!Demographic %in% "Josei") %>%
  ggplot(aes(
    x = Demographic,
    y = mean_score,
    colour = Demographic,
    fill = Demographic
  )) +
  geom_col()

Because of the sheer number of shounen anime, its mean score is fairly high. However, seinen is a close second rather than shoujo, which suggests that seinen and shounen may produce the most consistently well-rated anime.

library(DataExplorer)
library(corrgram)
# Correlation heatmap of the cleaned table (DataExplorer).
plot_correlation(anime)

# Draw a corrgram for `data` with the row index `no` removed and correlation
# coefficients shown in the upper panel.
#   data  - the cleaned anime table, or a subset of its rows
#   title - main title of the plot
plot_anime_corrgram <- function(data, title) {
  data %>%
    select(-no) %>%
    corrgram(order = TRUE, upper.panel = panel.cor, main = title)
}

# All anime.
plot_anime_corrgram(anime, "cor matrix for all data")

# One corrgram per demographic of interest.
anime %>%
  filter(Demographic %in% "Shoujo") %>%
  plot_anime_corrgram("corr score matrix for shoujo")

anime %>%
  filter(Demographic %in% "Seinen") %>%
  plot_anime_corrgram("corr score matrix for seinen")

anime %>%
  filter(Demographic %in% "Shounen") %>%
  plot_anime_corrgram("corr score matrix for shounen")

From the correlation plot it is clear that the popularity of an anime depends on its score, so an anime with a better score has a better chance of becoming popular.

H0: the median score is the same for every anime type.

# Kruskal-Wallis rank sum test: does the score distribution differ by type?
kruskal.test(Score ~ Type, data = anime)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Score by Type
## Kruskal-Wallis chi-squared = 43.863, df = 5, p-value = 2.47e-08

Thus the null hypothesis is rejected, meaning that ratings differ between at least some of the types: Movie, ONA, Music, Special, TV and OVA.

# Chi-squared test of independence between anime type and score band.
# Scores are cut into three equal-width bands before cross-tabulating.
with(
  anime,
  table(
    Score = cut(Score, breaks = 3, labels = c("meh", "good", "very good")),
    Type = Type
  )
) %>%
  chisq.test() %>%
  tidy()

## # A tibble: 1 × 4
##   statistic    p.value parameter method                    
##       <dbl>      <dbl>     <int> <chr>                     
## 1      46.1 0.00000137        10 Pearson's Chi-squared test

Thus the null hypothesis of independence is rejected, meaning that a relationship between type and score exists.

myanimelist

omon das

2023-04-01