library(tidyverse)
library(rstatix)
library(skimr)
library(naniar)
# Read the top-2000 anime CSV and inspect the column types readr guessed.
anime_data <- read_csv("D:\\wallpapers and photos\\csv\\mal_top2000_anime.csv")
anime_data %>% str()
## spc_tbl_ [2,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1            : num [1:2000] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Name            : chr [1:2000] "Fullmetal Alchemist: Brotherhood" "Spy x Family" "Shingeki no Kyojin Season 3 Part 2" "Steins;Gate" ...
##  $ Type            : chr [1:2000] "TV" "TV" "TV" "TV" ...
##  $ Score           : num [1:2000] 9.14 9.09 9.08 9.08 9.08 9.05 9.05 9.05 9.04 9.04 ...
##  $ Score Rank      : num [1:2000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Popularity Rank : num [1:2000] 3 350 32 13 335 ...
##  $ Air Date        : chr [1:2000] "Apr 5, 2009 to Jul 4, 2010" "Apr 9, 2022 to ?" "Apr 29, 2019 to Jul 1, 2019" "Apr 6, 2011 to Sep 14, 2011" ...
##  $ Studio          : chr [1:2000] "['Bones']" "['Wit Studio', '            CloverWorks']" "['Wit Studio']" "['White Fox']" ...
##  $ Num. of episodes: num [1:2000] 64 12 10 24 51 51 1 148 13 13 ...
##  $ Genres          : chr [1:2000] "['Action', 'Adventure', 'Drama', 'Fantasy']" "['Action', 'Comedy']" "['Action', 'Drama']" "['Drama', 'Sci-Fi', 'Suspense']" ...
##  $ Theme(s)        : chr [1:2000] "['Military']" "['Childcare']" "['Gore', 'Military', 'Survival']" "['Psychological', 'Time Travel']" ...
##  $ Demographic     : chr [1:2000] "Shounen" "Shounen" "Shounen" "None" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   Name = col_character(),
##   ..   Type = col_character(),
##   ..   Score = col_double(),
##   ..   `Score Rank` = col_double(),
##   ..   `Popularity Rank` = col_double(),
##   ..   `Air Date` = col_character(),
##   ..   Studio = col_character(),
##   ..   `Num. of episodes` = col_double(),
##   ..   Genres = col_character(),
##   ..   `Theme(s)` = col_character(),
##   ..   Demographic = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary (missing counts, uniqueness, quantiles) of the raw import.
skim(anime_data)
Data summary
Name anime_data
Number of rows 2000
Number of columns 12
_______________________
Column type frequency:
character 7
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 1 96 0 1999 0
Type 0 1 2 7 0 6 0
Air Date 0 1 4 28 0 1832 0
Studio 0 1 7 110 0 273 0
Genres 0 1 8 81 0 370 0
Theme(s) 0 1 8 87 0 322 0
Demographic 0 1 4 7 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
…1 0 1 999.50 577.49 0.00 499.75 999.50 1499.25 1999.00 ▇▇▇▇▇
Score 0 1 7.84 0.35 7.41 7.55 7.74 8.05 9.14 ▇▅▂▁▁
Score Rank 0 1 1000.51 577.50 1.00 500.75 1000.50 1500.25 2000.00 ▇▇▇▇▇
Popularity Rank 0 1 2311.68 2061.08 1.00 696.50 1778.00 3369.25 12164.00 ▇▃▁▁▁
Num. of episodes 0 1 16.86 49.63 0.00 1.00 12.00 21.00 1787.00 ▇▁▁▁▁
# Visual check for missing cells in the raw import.
anime_data %>% vis_miss()

There are no missing values in the data.

Let's do some data cleaning to get a better understanding of the data.

library(lubridate)
# Clean the raw import into the analysis table `anime`:
#   * give the awkward column names short, syntactic ones;
#   * turn the list-literal text columns ("['Action', 'Drama']") into plain
#     comma-separated text;
#   * title-case every character column;
#   * store Demographic and Type as factors;
#   * split `Air Date` into `start` / `End` Date columns.
anime <- anime_data %>%
  rename(no = "...1", Theme = "Theme(s)", Episode = "Num. of episodes") %>%
  mutate(
    across(c(Studio, Theme, Genres), str_squish),
    # Strip every bracket and quote character in one pass. Removing only the
    # first "']" / "[." left a stray `"]` behind on double-quoted entries.
    across(c(Studio, Theme, Genres), ~ str_remove_all(.x, "[\\[\\]'\"]")),
    across(where(is.character), str_to_title),
    across(c(Demographic, Type), as.factor)
  ) %>%
  # Title-casing above turned the " to " between the two dates into " To ".
  # Rows with a single date have no second piece; fill = "right" gives them
  # End = NA without a warning.
  separate(`Air Date`, into = c("start", "End"), sep = "To", fill = "right") %>%
  # Anything mdy() cannot parse (e.g. the "?" end date of a running series)
  # becomes NA.
  mutate(start = mdy(start), End = mdy(End))
anime
## # A tibble: 2,000 × 13
##       no Name   Type  Score Score…¹ Popul…² start      End        Studio Episode
##    <dbl> <chr>  <fct> <dbl>   <dbl>   <dbl> <date>     <date>     <chr>    <dbl>
##  1     0 Fullm… Tv     9.14       1       3 2009-04-05 2010-07-04 Bones       64
##  2     1 Spy X… Tv     9.09       2     350 2022-04-09 NA         Wit S…      12
##  3     2 Shing… Tv     9.08       3      32 2019-04-29 2019-07-01 Wit S…      10
##  4     3 Stein… Tv     9.08       4      13 2011-04-06 2011-09-14 White…      24
##  5     4 Ginta… Tv     9.08       5     335 2015-04-08 2016-03-30 Banda…      51
##  6     5 Ginta… Tv     9.05       6     385 2011-04-04 2012-03-26 Sunri…      51
##  7     6 Ginta… Movie  9.05       7    1746 2021-01-08 NA         Banda…       1
##  8     7 Hunte… Tv     9.05       8      10 2011-10-02 2014-09-24 Madho…     148
##  9     8 Fruit… Tv     9.04       9     551 2021-04-06 2021-06-29 Tms E…      13
## 10     9 Ginta… Tv     9.04      10     695 2012-10-04 2013-03-28 Sunri…      13
## # … with 1,990 more rows, 3 more variables: Genres <chr>, Theme <chr>,
## #   Demographic <fct>, and abbreviated variable names ¹​`Score Rank`,
## #   ²​`Popularity Rank`
# Number of titles per first-listed studio, largest first.
count(
  separate(
    mutate(anime, Studio = str_remove_all(Studio, "\"\\]")),
    Studio,
    into = c("studio 1", "studio 2"),
    sep = ","
  ),
  `studio 1`,
  sort = TRUE
)
## # A tibble: 188 × 2
##    `studio 1`            n
##    <chr>             <int>
##  1 Production I.g      114
##  2 Sunrise             112
##  3 Toei Animation      105
##  4 Madhouse             95
##  5 Tms Entertainment    79
##  6 J.c.staff            78
##  7 A-1 Pictures         76
##  8 Studio Deen          76
##  9 Bones                63
## 10 Kyoto Animation      62
## # … with 178 more rows
# Number of titles per second-listed studio (co-productions only), largest
# first.
anime %>%
  mutate(Studio = str_remove_all(Studio, "\"\\]")) %>%
  separate(
    Studio,
    into = c("studio 1", "studio 2"),
    sep = ",",
    extra = "drop",  # studios beyond the second are discarded
    fill = "right"   # single-studio titles get NA for `studio 2`
  ) %>%
  # The text after the comma keeps its leading blanks; squish them so the
  # names match the spelling used in `studio 1`.
  mutate(`studio 2` = str_squish(`studio 2`)) %>%
  filter(!is.na(`studio 2`)) %>%
  count(`studio 2`, sort = TRUE)
## # A tibble: 79 × 2
##    `studio 2`             n
##    <chr>              <int>
##  1 "  Xebec"              7
##  2 "  Studio Deen"        6
##  3 "  Lidenfilms"         5
##  4 "  Aic"                4
##  5 "  Animation Do"       4
##  6 "  Egg Firm"           4
##  7 "  M.s.c"              4
##  8 "  Madhouse"           4
##  9 "  Mappa"              3
## 10 "  Production I.g"     3
## # … with 69 more rows

Hence the data is now reasonably clean.

# Show the cleaned table.
print(anime)
## # A tibble: 2,000 × 13
##       no Name   Type  Score Score…¹ Popul…² start      End        Studio Episode
##    <dbl> <chr>  <fct> <dbl>   <dbl>   <dbl> <date>     <date>     <chr>    <dbl>
##  1     0 Fullm… Tv     9.14       1       3 2009-04-05 2010-07-04 Bones       64
##  2     1 Spy X… Tv     9.09       2     350 2022-04-09 NA         Wit S…      12
##  3     2 Shing… Tv     9.08       3      32 2019-04-29 2019-07-01 Wit S…      10
##  4     3 Stein… Tv     9.08       4      13 2011-04-06 2011-09-14 White…      24
##  5     4 Ginta… Tv     9.08       5     335 2015-04-08 2016-03-30 Banda…      51
##  6     5 Ginta… Tv     9.05       6     385 2011-04-04 2012-03-26 Sunri…      51
##  7     6 Ginta… Movie  9.05       7    1746 2021-01-08 NA         Banda…       1
##  8     7 Hunte… Tv     9.05       8      10 2011-10-02 2014-09-24 Madho…     148
##  9     8 Fruit… Tv     9.04       9     551 2021-04-06 2021-06-29 Tms E…      13
## 10     9 Ginta… Tv     9.04      10     695 2012-10-04 2013-03-28 Sunri…      13
## # … with 1,990 more rows, 3 more variables: Genres <chr>, Theme <chr>,
## #   Demographic <fct>, and abbreviated variable names ¹​`Score Rank`,
## #   ²​`Popularity Rank`
library(patchwork)
library(plotly)
# Bar chart of how many titles of one demographic received each score.
#
# data        : cleaned anime table; must contain `Demographic` and `Score`.
# demographic : demographic label to keep (e.g. "Shounen"); also the title.
# fill        : bar colour.
# Returns a ggplot object.
plot_demographic_scores <- function(data, demographic, fill) {
  score_counts <- data %>%
    filter(Demographic %in% demographic) %>%
    count(Score)

  ggplot(score_counts, aes(x = Score, y = n)) +
    geom_col(fill = fill) +
    labs(title = demographic, x = "Ratings", y = "numbers") +
    theme(
      panel.grid.major = element_line(linetype = "blank"),
      panel.grid.minor = element_line(linetype = "blank"),
      axis.text.y = element_text(size = 5),
      axis.title = element_text(size = 8),
      plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
      panel.background = element_rect(fill = "gray98"),
      plot.background = element_rect(fill = "aliceblue")
    )
}

g1 <- plot_demographic_scores(anime, "Shounen", "#FF61C3")
ggplotly(g1)

g2 <- plot_demographic_scores(anime, "Shoujo", "#D39200")
ggplotly(g2)

g3 <- plot_demographic_scores(anime, "Seinen", "#93AA00")
ggplotly(g3)

g4 <- plot_demographic_scores(anime, "Kids", "#00C19F")
ggplotly(g4)

# Josei gets its own object; reusing `g4` here would drop the Kids chart from
# the combined figure below.
g5 <- plot_demographic_scores(anime, "Josei", "#DB72FB")
ggplotly(g5)

# patchwork layout: three charts on the top row, two on the bottom row.
(g1 + g2 + g3) / (g4 + g5)

In terms of ratings, shounen has the most anime rated above 9, followed by shoujo. Seinen has the most stable ratings but has no anime rated above 9.

# Score by release type: x1 (violin) and x2 (points) share the same mapping
# and styling, so both are built from one base plot and one theme object.
score_by_type <- ggplot(
  anime,
  aes(x = Type, y = Score, color = Type, fill = Type)
)

score_by_type_theme <- theme(
  panel.grid.major = element_line(linetype = "blank"),
  panel.grid.minor = element_line(linetype = "blank"),
  axis.text = element_text(colour = "deepskyblue4"),
  plot.title = element_text(colour = "azure3"),
  panel.background = element_rect(fill = "gray99"),
  plot.background = element_rect(colour = "aliceblue")
)

x1 <- score_by_type + geom_violin() + score_by_type_theme
x2 <- score_by_type + geom_point() + score_by_type_theme

# Number of titles per type, one panel per demographic.
x3 <- ggplot(anime, aes(x = Type, fill = Type)) +
  geom_bar() +
  facet_wrap(~Demographic)

ggplotly(x1)
ggplotly(x2)
x3

# patchwork: violin plot stacked above the point plot.
x1 / x2

Thus it is clear that, in terms of anime preference, TV anime is the most popular, followed by movies.

# Mean score per demographic, highest first.
avg_success <- arrange(
  summarise(group_by(anime, Demographic), mean_score = mean(Score)),
  desc(mean_score)
)

# Bar chart of the means; Josei is left out of the chart.
avg_success %>%
  filter(!Demographic %in% "Josei") %>%
  ggplot(
    aes(
      x = Demographic,
      y = mean_score,
      color = Demographic,
      fill = Demographic
    )
  ) +
  geom_bar(stat = "Identity")

Because of the sheer number of shounen anime, its average score is pretty high. But seinen, rather than shoujo, is a close second, which may mean that seinen and shounen produce the most decent anime among the demographics.

library(DataExplorer)
library(corrgram)
# Correlation heatmap across the whole cleaned table.
plot_correlation(anime)

# Draw a corrgram for `data` (without the row-index column `no`) under the
# given title.
score_corrgram <- function(data, title) {
  data %>%
    select(-no) %>%
    corrgram(order = TRUE, upper.panel = panel.cor, main = title)
}

score_corrgram(anime, "cor matrix for all data")

score_corrgram(
  filter(anime, Demographic %in% "Shoujo"),
  "corr score matrix for shoujo"
)

score_corrgram(
  filter(anime, Demographic %in% "Seinen"),
  "corr score matrix for seinen"
)

score_corrgram(
  filter(anime, Demographic %in% "Shounen"),
  "corr score matrix for shounen"
)

From the correlation plot it is clear that the popularity of an anime depends on its score, so an anime with a better score has a chance to become more popular.

H0: the median score is the same for every type.

# Kruskal-Wallis rank sum test: does Score differ between the release types?
kruskal.test(Score ~ Type, data = select(anime, Type, Score))
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Score by Type
## Kruskal-Wallis chi-squared = 43.863, df = 5, p-value = 2.47e-08

Thus the null hypothesis is rejected, meaning there is a difference between Movie, ONA, Music, Special, TV and OVA ratings.

# Chi-squared test of independence between release type and score band.
# cut() splits the observed Score range into three equal-width bands.
anime %>%
  select(Score, Type) %>%
  mutate(
    Score = cut(Score, breaks = 3, labels = c("meh", "good", "very good"))
  ) %>%
  table() %>%
  chisq.test() %>%
  tidy()
## # A tibble: 1 × 4
##   statistic    p.value parameter method                    
##       <dbl>      <dbl>     <int> <chr>                     
## 1      46.1 0.00000137        10 Pearson's Chi-squared test

Thus the null hypothesis is rejected, meaning that a relationship between type and score exists.