Myanimelist collects the ratings of several million users, and uses
those to make an overall anime rank. The problem with this method is
that some people systematically give higher ratings than others, and
those people might be more likely to watch certain shows. Conversely,
there is the same issue on the user side, where some people might
deliberately search out shows with higher ratings than others.
Normally this issue would be solved with regression, which cannot be
done here because there are 10,000 different titles and 1,000,000
different users, making it impossible to run the algorithm due to memory
constraints. Instead, I used an iterated method where I observed which
raters gave higher ratings than others, controlling for the anime they
watched, and which anime received higher ratings than others, controlling
for the users who rated them. Because the confounding goes both ways, I
had to repeat the method several times to get roughly reliable results.
Loading and cleaning data
# Start from the home directory so the relative project path set next resolves.
# NOTE(review): knitr resets the working directory when the chunk ends (see
# the warning printed for this chunk); the knitr root.dir option is the
# recommended way to set it for the whole notebook.
setwd('~')
Warning: The working directory was changed to C:/Users/micha/OneDrive/Documents inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
setwd('rfolder/anime')

# Anime metadata. Only the anime_id -> title lookup is used below, so the file
# is read once (the original read it twice into the same variable).
animeids <- read.csv('data/anime.csv', sep='\t')
names <- animeids %>% select(anime_id, title)

# Per-anime score tables, one file per pass of the adjustment procedure.
# File version k holds the result after k - 1 adjustment passes, so v1 is the
# unadjusted ranking and v11 is the tenth iteration.
ogranks <- read.csv('data2/aranks_v1.csv', sep=',')   # original anime ranks
aranks2 <- read.csv('data2/aranks_v2.csv', sep=',')   # first iteration
aranks5 <- read.csv('data2/aranks_v6.csv', sep=',')   # fifth iteration
aranks10 <- read.csv('data2/aranks_v11.csv', sep=',') # tenth iteration

# Sequel flag per anime; only the id and the flag are kept.
sequel <- read.csv('data2/sequels.csv', sep=',') %>% select(anime_id, Sequel)

# Attach titles to every score table and sort from highest to lowest score.
original_ranks <- ogranks %>%
  left_join(names, by='anime_id') %>%
  arrange(-animescore)
first_iteration <- aranks2 %>%
  left_join(names, by='anime_id') %>%
  arrange(-animescore)
fifth_iteration <- aranks5 %>%
  left_join(names, by='anime_id') %>%
  arrange(-animescore)
tenth_iteration <- aranks10 %>%
  left_join(names, by='anime_id') %>%
  arrange(-animescore)
# Mean of the unadjusted scores; spelled TRUE rather than T, which can be reassigned.
mean(original_ranks$animescore, na.rm = TRUE)
[1] 6.284712
# Standard deviation of the unadjusted scores; TRUE rather than the reassignable T.
sd(original_ranks$animescore, na.rm = TRUE)
[1] 1.052224
# Put every iteration's scores on the scale of the original ratings so the
# columns can be compared directly. The target mean and SD are computed from
# the data instead of being pasted in as rounded constants, so they stay
# correct if the input files change.
# NOTE(review): normalise() is defined outside this view; presumably it
# standardises a vector to mean 0 / SD 1 -- confirm.
original_mean <- mean(original_ranks$animescore, na.rm = TRUE)
original_sd <- sd(original_ranks$animescore, na.rm = TRUE)

first_iteration$animescore <- normalise(first_iteration$animescore) * original_sd + original_mean
fifth_iteration$animescore <- normalise(fifth_iteration$animescore) * original_sd + original_mean
tenth_iteration$animescore <- normalise(tenth_iteration$animescore) * original_sd + original_mean

# Give each iteration's score a distinct column name so they can sit side by
# side in one table.
first_iteration$animescore1 <- first_iteration$animescore
fifth_iteration$animescore5 <- fifth_iteration$animescore
tenth_iteration$animescore10 <- tenth_iteration$animescore

# One row per anime: original score, the three adjusted scores and the sequel
# flag. left_join() keeps the row order of its left-hand table, so sorting
# once at the end gives the same order as sorting after every join.
newdata <- original_ranks %>%
  left_join(first_iteration %>% select(anime_id, animescore1), by='anime_id') %>%
  left_join(fifth_iteration %>% select(anime_id, animescore5), by='anime_id') %>%
  left_join(tenth_iteration %>% select(anime_id, animescore10), by='anime_id') %>%
  left_join(sequel %>% select(anime_id, Sequel), by='anime_id') %>%
  arrange(-animescore)

# adv: how much an anime gains (positive) or loses (negative) from the user
# adjustment, measured as tenth-iteration score minus original score.
newdata$adv <- newdata$animescore10 - newdata$animescore
Correlation matrix. The correlation between the fifth-iteration and
tenth-iteration ratings is about .996.
# Pairwise correlations between the original scores and the scores after the
# first, fifth and tenth iterations.
score_columns <- newdata %>%
  select(animescore, animescore1, animescore5, animescore10)
correlation_matrix(score_columns)
animescore animescore1 animescore5 animescore10
animescore "NA" "0.912 ***" "0.952 ***" "0.952 ***"
animescore1 "0.912 ***" "NA" "0.967 ***" "0.959 ***"
animescore5 "0.952 ***" "0.967 ***" "NA" "1 ***"
animescore10 "0.952 ***" "0.959 ***" "1 ***" "NA"
The rankings
Excluding anime with fewer than 5,000 ratings
# Rank every anime twice: by its original score and by its tenth-iteration
# (user-adjusted) score. ranker() is a helper defined outside this view.
newdata <- newdata %>%
  mutate(
    originalrank = ranker(animescore),
    newrank = ranker(animescore10)
  )

# Keep only anime whose rating count `an` exceeds 4999.
newdata2 <- filter(newdata, an > 4999)
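Top 20 for each category. `animescore` is the original rating and
`animescore10` is the user-adjusted rating (tenth iteration); `adv` is the
difference between them (`animescore10` minus `animescore`).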
# Split the filtered table into non-sequels and sequels once, then build every
# top-20 table from these subsets instead of re-filtering newdata2 each time.
# NOTE: `sequel` reuses the name of the sequel lookup table read during
# loading and replaces it from here on.
original <- newdata2 %>% filter(Sequel==0)
sequel <- newdata2 %>% filter(Sequel==1)

# Sequels: top 20 by original score, then by user-adjusted score.
sequel %>% select(title, animescore) %>% arrange(desc(animescore)) %>% head(n=20)
sequel %>% select(title, animescore10) %>% arrange(desc(animescore10)) %>% head(n=20)

# Non-sequels: top 20 by original score, then by user-adjusted score.
original %>% select(title, animescore) %>% arrange(desc(animescore)) %>% head(n=20)
original %>% select(title, animescore10) %>% arrange(desc(animescore10)) %>% head(n=20)

# Non-sequels: 20 largest gains, then 20 largest losses from the adjustment.
original %>% select(title, adv) %>% arrange(desc(adv)) %>% head(n=20)
original %>% select(title, adv) %>% arrange(adv) %>% head(n=20)

# Sequels: 20 largest gains, then 20 largest losses from the adjustment.
sequel %>% select(title, adv) %>% arrange(desc(adv)) %>% head(n=20)
sequel %>% select(title, adv) %>% arrange(adv) %>% head(n=20)
Some notes:

- Traditionally “elitist” anime like Ashita no Joe, Kaiba, and Legend of
  the Galactic Heroes gain the most from the user-adjusted method, as the
  kind of people who are predisposed to watching them are harsher raters.
- Anime watched mainly by men tend to gain more from user adjustment than
  anime watched mainly by women.
- Legend of the Galactic Heroes goes from top 4 to indisputably number 1
  after adjusting for user ratings.
This is the code snippet I used to calculate the user-adjusted
ratings. Because of memory limitations, I had to run each of the 10
iterations separately rather than in a loop.
```r
# One pass of the user-adjustment procedure: writes the naive per-anime and
# per-user means as version 1, then the means after one round of mutual
# adjustment as version 2.
setwd('~')
setwd('Documents/rstuff/anime')

# Inputs: anime metadata (for titles) and the table of individual ratings.
# The ratings table is used through its user_id, anime_id and score columns.
# Nothing else is loaded, to keep memory use down.
animeids <- read.csv('data/anime.csv', sep='\t')
test <- read.csv('data2/fullfilev1.csv', sep=',')
names <- animeids %>% select(anime_id, title)

# Working copy of the ratings. ascore2 / uscore2 are the scores that feed the
# anime and user means respectively; both start out as the raw score, and
# score2 keeps the raw score for the adjustment step.
test2 <- test %>%
  mutate(ascore2 = score, uscore2 = score, score2 = score)

# ---- Version 1: naive means ----
# ascore2 and uscore2 still equal the raw score here, so these are the plain
# per-user and per-anime averages, with rating counts un and an.
uranks <- test2 %>%
  group_by(user_id) %>%
  summarise(userscore = mean(uscore2, na.rm = TRUE), un = n()) %>%
  arrange(-userscore)
aranks <- test2 %>%
  group_by(anime_id) %>%
  summarise(animescore = mean(ascore2, na.rm = TRUE), an = n()) %>%
  arrange(-animescore)
write.csv(aranks, paste0('data2/aranks_v', 1, '.csv'))
write.csv(uranks, paste0('data2/uranks_v', 1, '.csv'))

# ---- Version 2: one round of adjustment ----
# Attach each rating's user mean and anime mean.
test2 <- left_join(test2, uranks %>% select(user_id, userscore, un), by='user_id')
test2 <- left_join(test2, aranks %>% select(anime_id, animescore, an), by='anime_id')

# Grand means, added back so the adjusted scores stay on the rating scale.
mean_animescore <- mean(aranks$animescore, na.rm = TRUE)
mean_userscore <- mean(uranks$userscore, na.rm = TRUE)

# adiff: the rating relative to what this user usually gives (feeds the anime
# score). udiff: the rating relative to what this anime usually receives
# (feeds the user score).
test2 <- test2 %>%
  mutate(
    adiff = score2 - userscore,
    udiff = score2 - animescore,
    ascore2 = adiff + mean_animescore,
    uscore2 = udiff + mean_userscore
  )

aranks2 <- test2 %>%
  group_by(anime_id) %>%
  summarise(animescore = mean(ascore2, na.rm = TRUE), an = n()) %>%
  arrange(-animescore)
uranks2 <- test2 %>%
  group_by(user_id) %>%
  summarise(userscore = mean(uscore2, na.rm = TRUE), un = n()) %>%
  arrange(-userscore)

# Titles go on the left of the join, so the version 2 anime file has one row
# per title in the metadata, with NA scores for titles that have no ratings.
aranks2 <- left_join(names, aranks2 %>% select(anime_id, animescore, an), by='anime_id')
write.csv(aranks2, paste0('data2/aranks_v', 2, '.csv'))
write.csv(uranks2, paste0('data2/uranks_v', 2, '.csv'))
```
