library(tidyverse)
library(rstatix)
library(skimr)
library(naniar)
# Read the MyAnimeList top-2000 export into a tibble, then print its
# structure (column names, types and the first few values of each column).
anime_data <- read_csv(
  "D:\\wallpapers and photos\\csv\\mal_top2000_anime.csv"
)
str(anime_data)
## spc_tbl_ [2,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:2000] 0 1 2 3 4 5 6 7 8 9 ...
## $ Name : chr [1:2000] "Fullmetal Alchemist: Brotherhood" "Spy x Family" "Shingeki no Kyojin Season 3 Part 2" "Steins;Gate" ...
## $ Type : chr [1:2000] "TV" "TV" "TV" "TV" ...
## $ Score : num [1:2000] 9.14 9.09 9.08 9.08 9.08 9.05 9.05 9.05 9.04 9.04 ...
## $ Score Rank : num [1:2000] 1 2 3 4 5 6 7 8 9 10 ...
## $ Popularity Rank : num [1:2000] 3 350 32 13 335 ...
## $ Air Date : chr [1:2000] "Apr 5, 2009 to Jul 4, 2010" "Apr 9, 2022 to ?" "Apr 29, 2019 to Jul 1, 2019" "Apr 6, 2011 to Sep 14, 2011" ...
## $ Studio : chr [1:2000] "['Bones']" "['Wit Studio', ' CloverWorks']" "['Wit Studio']" "['White Fox']" ...
## $ Num. of episodes: num [1:2000] 64 12 10 24 51 51 1 148 13 13 ...
## $ Genres : chr [1:2000] "['Action', 'Adventure', 'Drama', 'Fantasy']" "['Action', 'Comedy']" "['Action', 'Drama']" "['Drama', 'Sci-Fi', 'Suspense']" ...
## $ Theme(s) : chr [1:2000] "['Military']" "['Childcare']" "['Gore', 'Military', 'Survival']" "['Psychological', 'Time Travel']" ...
## $ Demographic : chr [1:2000] "Shounen" "Shounen" "Shounen" "None" ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. Name = col_character(),
## .. Type = col_character(),
## .. Score = col_double(),
## .. `Score Rank` = col_double(),
## .. `Popularity Rank` = col_double(),
## .. `Air Date` = col_character(),
## .. Studio = col_character(),
## .. `Num. of episodes` = col_double(),
## .. Genres = col_character(),
## .. `Theme(s)` = col_character(),
## .. Demographic = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary of the raw data: missing counts, unique values and
# distribution statistics, grouped by column type.
skim(anime_data)
| Name | anime_data |
| Number of rows | 2000 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Name | 0 | 1 | 1 | 96 | 0 | 1999 | 0 |
| Type | 0 | 1 | 2 | 7 | 0 | 6 | 0 |
| Air Date | 0 | 1 | 4 | 28 | 0 | 1832 | 0 |
| Studio | 0 | 1 | 7 | 110 | 0 | 273 | 0 |
| Genres | 0 | 1 | 8 | 81 | 0 | 370 | 0 |
| Theme(s) | 0 | 1 | 8 | 87 | 0 | 322 | 0 |
| Demographic | 0 | 1 | 4 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| …1 | 0 | 1 | 999.50 | 577.49 | 0.00 | 499.75 | 999.50 | 1499.25 | 1999.00 | ▇▇▇▇▇ |
| Score | 0 | 1 | 7.84 | 0.35 | 7.41 | 7.55 | 7.74 | 8.05 | 9.14 | ▇▅▂▁▁ |
| Score Rank | 0 | 1 | 1000.51 | 577.50 | 1.00 | 500.75 | 1000.50 | 1500.25 | 2000.00 | ▇▇▇▇▇ |
| Popularity Rank | 0 | 1 | 2311.68 | 2061.08 | 1.00 | 696.50 | 1778.00 | 3369.25 | 12164.00 | ▇▃▁▁▁ |
| Num. of episodes | 0 | 1 | 16.86 | 49.63 | 0.00 | 1.00 | 12.00 | 21.00 | 1787.00 | ▇▁▁▁▁ |
# Visual check for NA cells across every column of the raw data.
vis_miss(anime_data)
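There are no missing (`NA`) values in the data. Note, however, that some fields use placeholders instead (e.g. `"?"` for an unknown end date and `"None"` for Demographic), which `vis_miss()` does not count as missing.
Let's do some data cleaning to get a better understanding of the data.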
library(lubridate)
# Build the cleaned `anime` tibble from the raw import:
#   1. give the awkwardly named columns short names;
#   2. strip the Python-list punctuation (['a', 'b']) from Studio, Theme and
#      Genres, leaving a plain comma-separated string;
#   3. title-case every character column;
#   4. store Demographic and Type as factors;
#   5. split `Air Date` into `start` / `End` and parse both as dates.
anime <- anime_data %>%
  rename(
    no = "...1",
    Theme = "Theme(s)",
    Episode = "Num. of episodes"
  ) %>%
  mutate(
    across(
      c(Studio, Theme, Genres),
      ~ .x %>%
        str_squish() %>%
        str_remove("']") %>%     # first closing `']`
        str_remove("\\[.") %>%   # opening `[` plus the quote that follows it
        str_remove_all("\\'")    # every remaining single quote
    )
  ) %>%
  # Title-casing also turns the " to " inside `Air Date` into " To ", which
  # is the separator the split below relies on.
  mutate(across(where(is.character), str_to_title)) %>%
  mutate(
    Demographic = as.factor(Demographic),
    Type = as.factor(Type)
  ) %>%
  # Entries with a single date have no "To" part; `fill = "right"` states
  # explicitly that their `End` is left as NA.
  separate(
    `Air Date`,
    into = c("start", "End"),
    sep = "To",
    fill = "right"
  ) %>%
  # Values that are not month-day-year dates (e.g. the "?" placeholder for an
  # unknown end date) cannot be parsed and become NA.
  mutate(
    start = mdy(start),
    End = mdy(End)
  )
anime
## # A tibble: 2,000 × 13
## no Name Type Score Score…¹ Popul…² start End Studio Episode
## <dbl> <chr> <fct> <dbl> <dbl> <dbl> <date> <date> <chr> <dbl>
## 1 0 Fullm… Tv 9.14 1 3 2009-04-05 2010-07-04 Bones 64
## 2 1 Spy X… Tv 9.09 2 350 2022-04-09 NA Wit S… 12
## 3 2 Shing… Tv 9.08 3 32 2019-04-29 2019-07-01 Wit S… 10
## 4 3 Stein… Tv 9.08 4 13 2011-04-06 2011-09-14 White… 24
## 5 4 Ginta… Tv 9.08 5 335 2015-04-08 2016-03-30 Banda… 51
## 6 5 Ginta… Tv 9.05 6 385 2011-04-04 2012-03-26 Sunri… 51
## 7 6 Ginta… Movie 9.05 7 1746 2021-01-08 NA Banda… 1
## 8 7 Hunte… Tv 9.05 8 10 2011-10-02 2014-09-24 Madho… 148
## 9 8 Fruit… Tv 9.04 9 551 2021-04-06 2021-06-29 Tms E… 13
## 10 9 Ginta… Tv 9.04 10 695 2012-10-04 2013-03-28 Sunri… 13
## # … with 1,990 more rows, 3 more variables: Genres <chr>, Theme <chr>,
## # Demographic <fct>, and abbreviated variable names ¹`Score Rank`,
## # ²`Popularity Rank`
# Count titles by their first-listed studio, most frequent first.
anime %>%
  # Drop the `"]` left behind on entries that were wrapped in double quotes.
  mutate(Studio = str_remove_all(Studio, "\"\\]")) %>%
  # Keep only the first two studios. Rows with more than two drop the rest
  # (`extra = "drop"`), rows with one get NA for `studio 2` (`fill = "right"`).
  separate(
    Studio,
    into = c("studio 1", "studio 2"),
    sep = ",",
    extra = "drop",
    fill = "right"
  ) %>%
  count(`studio 1`, sort = TRUE)
## # A tibble: 188 × 2
## `studio 1` n
## <chr> <int>
## 1 Production I.g 114
## 2 Sunrise 112
## 3 Toei Animation 105
## 4 Madhouse 95
## 5 Tms Entertainment 79
## 6 J.c.staff 78
## 7 A-1 Pictures 76
## 8 Studio Deen 76
## 9 Bones 63
## 10 Kyoto Animation 62
## # … with 178 more rows
# Count titles by their second-listed studio, most frequent first, ignoring
# titles that list only one studio.
anime %>%
  # Drop the `"]` left behind on entries that were wrapped in double quotes.
  mutate(Studio = str_remove_all(Studio, "\"\\]")) %>%
  # Keep only the first two studios. Rows with more than two drop the rest
  # (`extra = "drop"`), rows with one get NA for `studio 2` (`fill = "right"`).
  separate(
    Studio,
    into = c("studio 1", "studio 2"),
    sep = ",",
    extra = "drop",
    fill = "right"
  ) %>%
  count(`studio 2`, sort = TRUE) %>%
  filter(!is.na(`studio 2`))
## # A tibble: 79 × 2
## `studio 2` n
## <chr> <int>
## 1 " Xebec" 7
## 2 " Studio Deen" 6
## 3 " Lidenfilms" 5
## 4 " Aic" 4
## 5 " Animation Do" 4
## 6 " Egg Firm" 4
## 7 " M.s.c" 4
## 8 " Madhouse" 4
## 9 " Mappa" 3
## 10 " Production I.g" 3
## # … with 69 more rows
Hence the data is now reasonably clean.
# Print the cleaned tibble again as the starting point for the analysis.
anime
## # A tibble: 2,000 × 13
## no Name Type Score Score…¹ Popul…² start End Studio Episode
## <dbl> <chr> <fct> <dbl> <dbl> <dbl> <date> <date> <chr> <dbl>
## 1 0 Fullm… Tv 9.14 1 3 2009-04-05 2010-07-04 Bones 64
## 2 1 Spy X… Tv 9.09 2 350 2022-04-09 NA Wit S… 12
## 3 2 Shing… Tv 9.08 3 32 2019-04-29 2019-07-01 Wit S… 10
## 4 3 Stein… Tv 9.08 4 13 2011-04-06 2011-09-14 White… 24
## 5 4 Ginta… Tv 9.08 5 335 2015-04-08 2016-03-30 Banda… 51
## 6 5 Ginta… Tv 9.05 6 385 2011-04-04 2012-03-26 Sunri… 51
## 7 6 Ginta… Movie 9.05 7 1746 2021-01-08 NA Banda… 1
## 8 7 Hunte… Tv 9.05 8 10 2011-10-02 2014-09-24 Madho… 148
## 9 8 Fruit… Tv 9.04 9 551 2021-04-06 2021-06-29 Tms E… 13
## 10 9 Ginta… Tv 9.04 10 695 2012-10-04 2013-03-28 Sunri… 13
## # … with 1,990 more rows, 3 more variables: Genres <chr>, Theme <chr>,
## # Demographic <fct>, and abbreviated variable names ¹`Score Rank`,
## # ²`Popularity Rank`
library(patchwork)
library(plotly)
# Widen the data so that each Demographic level becomes its own column
# holding that title's Score (NA in every other demographic column).
# `pivot_wider()` replaces the superseded `spread()`.
popularity_score <- anime %>%
  pivot_wider(names_from = Demographic, values_from = Score)
# Shounen: number of titles at each score value, shown as an interactive
# bar chart.
shounen <- popularity_score %>%
  filter(!is.na(Shounen)) %>%
  count(Shounen)

g1 <- ggplot(shounen, aes(x = Shounen, y = n)) +
  geom_bar(stat = "identity", fill = "#FF61C3") +
  labs(title = "Shounen", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g1)
# Shoujo: number of titles at each score value, shown as an interactive
# bar chart.
shoujo <- popularity_score %>%
  filter(!is.na(Shoujo)) %>%
  count(Shoujo)

g2 <- ggplot(shoujo, aes(x = Shoujo, y = n)) +
  geom_bar(stat = "identity", fill = "#D39200") +
  labs(title = "Shoujo", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g2)
# Seinen: number of titles at each score value, shown as an interactive
# bar chart.
seinen <- popularity_score %>%
  filter(!is.na(Seinen)) %>%
  count(Seinen)

g3 <- ggplot(seinen, aes(x = Seinen, y = n)) +
  geom_bar(stat = "identity", fill = "#93AA00") +
  # The original passed `c = "Ratings"`, so the x-axis label was never set;
  # `x =` matches the sibling demographic plots.
  labs(title = "Seinen", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g3)
# Kids: number of titles at each score value, shown as an interactive
# bar chart.
kids <- popularity_score %>%
  filter(!is.na(Kids)) %>%
  count(Kids)

g4 <- ggplot(kids, aes(x = Kids, y = n)) +
  geom_bar(stat = "identity", fill = "#00C19F") +
  labs(title = "Kids", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g4)
# Josei: number of titles at each score value, shown as an interactive
# bar chart.
Josei <- popularity_score %>%
  filter(!is.na(Josei)) %>%
  count(Josei)

# Stored as `g5`: the original reused `g4`, which overwrote the Kids plot and
# silently dropped it from the combined figure below. The `bins` argument was
# removed because `geom_bar()` has no such parameter.
g5 <- ggplot(Josei, aes(x = Josei, y = n)) +
  geom_bar(stat = "identity", fill = "#DB72FB") +
  labs(title = "Josei", x = "Ratings", y = "numbers") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.y = element_text(size = 5),
    axis.title = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = "gray98"),
    plot.background = element_rect(fill = "aliceblue")
  )
ggplotly(g5)

# Combined patchwork figure. `/` binds tighter than `+`, so the parentheses
# only spell out the layout the original expression already produced;
# `g5` adds the Josei panel next to the restored Kids panel.
g1 + (g2 / g3) + g4 + g5
In terms of ratings, shounen has the most anime rated above 9, followed by shoujo. Seinen has the most stable ratings but does not have any anime rated above 9.
# Score by anime Type: a violin plot (x1) and a point plot (x2) that share
# one theme, plus a per-Demographic bar chart of Type counts (x3).
type_theme <- theme(
  panel.grid.major = element_line(linetype = "blank"),
  panel.grid.minor = element_line(linetype = "blank"),
  axis.text = element_text(colour = "deepskyblue4"),
  plot.title = element_text(colour = "azure3"),
  panel.background = element_rect(fill = "gray99"),
  plot.background = element_rect(colour = "aliceblue")
)

x1 <- ggplot(anime, aes(x = Type, y = Score, colour = Type, fill = Type)) +
  geom_violin() +
  type_theme

x2 <- ggplot(anime, aes(x = Type, y = Score, colour = Type, fill = Type)) +
  geom_point() +
  type_theme

x3 <- ggplot(anime, aes(x = Type, fill = Type)) +
  geom_bar() +
  facet_wrap(~Demographic)

# Interactive versions of the two score plots, the static faceted bar chart,
# and finally the violin plot stacked above the point plot.
ggplotly(x1)
ggplotly(x2)
x3
x1 / x2
Thus it is clear that, in terms of anime preference, TV anime is the most popular, followed by movies.
# Mean score per demographic, highest first.
avg_success <- anime %>%
  group_by(Demographic) %>%
  summarise(mean_score = mean(Score)) %>%
  arrange(desc(mean_score))

# Bar chart of the mean scores, leaving out the Josei demographic.
avg_success %>%
  filter(!Demographic %in% "Josei") %>%
  ggplot(
    aes(
      x = Demographic,
      y = mean_score,
      colour = Demographic,
      fill = Demographic
    )
  ) +
  geom_bar(stat = "Identity")
Because of the sheer number of shounen anime, its average score is quite high. However, seinen, not shoujo, is a close second, which may mean that seinen and shounen produce the most consistently well-rated anime.
library(DataExplorer)
library(corrgram)
# Correlation heatmap of the cleaned data (DataExplorer).
plot_correlation(anime)

# Correlograms with coefficients in the upper panel: first for all titles,
# then separately for the Shoujo, Seinen and Shounen demographics.
# The row index `no` is dropped each time.
anime %>%
  select(!no) %>%
  corrgram(
    order = TRUE,
    upper.panel = panel.cor,
    main = "cor matrix for all data"
  )

anime %>%
  filter(Demographic == "Shoujo") %>%
  select(!no) %>%
  corrgram(
    order = TRUE,
    upper.panel = panel.cor,
    main = "corr score matrix for shoujo"
  )

anime %>%
  filter(Demographic == "Seinen") %>%
  select(!no) %>%
  corrgram(
    order = TRUE,
    upper.panel = panel.cor,
    main = "corr score matrix for seinen"
  )

anime %>%
  filter(Demographic == "Shounen") %>%
  select(!no) %>%
  corrgram(
    order = TRUE,
    upper.panel = panel.cor,
    main = "corr score matrix for shounen"
  )
From the correlation plot it is clear that the popularity of an anime
depends on its score, so an anime with a better score has a chance to become
more popular.
H0: the median score is the same across all anime types.
# Kruskal-Wallis rank sum test of Score across the levels of Type.
kruskal.test(Score ~ Type, data = anime)
##
## Kruskal-Wallis rank sum test
##
## data: Score by Type
## Kruskal-Wallis chi-squared = 43.863, df = 5, p-value = 2.47e-08
Thus the null hypothesis is rejected (p < 0.001), meaning the ratings differ between at least some of the types Movie, ONA, Music, Special, TV and OVA.
# Chi-squared test of independence between Type and Score, with Score cut
# into three equal-width bands; the result is returned as a one-row tibble.
anime %>%
  transmute(
    Score = cut(Score, breaks = 3, labels = c("meh", "good", "very good")),
    Type
  ) %>%
  table() %>%
  chisq.test() %>%
  tidy()
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 46.1 0.00000137 10 Pearson's Chi-squared test
Thus the null hypothesis of independence is rejected (p < 0.001), meaning a relationship between type and score exists.