HF Hub Models

library(huggingfaceR)
library(tidyverse)

# Grab all of the models' data
# NOTE(review): presumably a list with one entry per model — confirm against
# the huggingfaceR documentation for hf_search_models().
all_models <- hf_search_models()

# Return, for each element of `x`, the names of the fields it carries.
get_names <- function(x) map(x, names)
all_models_names <- get_names(all_models)

# Positional id for every model. seq_along() (rather than 1:length()) keeps
# this zero-length when the search returns nothing, so the tibble below is
# simply empty instead of failing on mismatched column sizes.
id_col <- seq_along(all_models_names)

# Create tibble for pipeline: one row per model, pairing its position in
# `all_models` with the list-column of its field names
all_models_data <- tibble(id_col, all_models_names)

# Flag the models whose field names include "downloads" (later steps read
# that field), then collect the ids of the flagged models to filter on.
has_downloads <- map_lgl(
  all_models_data$all_models_names,
  function(field_names) "downloads" %in% field_names
)
filter_ids <- all_models_data$id_col[has_downloads]

# Keep only models with downloads info
all_models <- all_models[filter_ids]

# Pull the fields of interest out of each model entry as a one-row tibble and
# row-bind the results into a single table.
# NOTE(review): fields missing from an entry come back as NULL; presumably
# tibble() drops them and the row-bind fills NA — confirm.
models_with_downloads <- map_dfr(
  all_models,
  function(model_info) {
    tibble(
      model = model_info$modelId,
      downloads = model_info$downloads,
      task = model_info$pipeline_tag,
      sha = model_info$sha,
      private = model_info$private
    )
  }
)

# Number of models per task, most frequent task first.
# NOTE(review): `huggingfaceR::` reads the object exported by the package, not
# the `models_with_downloads` assigned earlier in this script — confirm that
# is intended.
count(huggingfaceR::models_with_downloads, task, sort = TRUE)

## # A tibble: 30 × 2
##    task                             n
##    <chr>                        <int>
##  1 <NA>                         18549
##  2 text-classification           7935
##  3 text-generation               5136
##  4 text2text-generation          4780
##  5 fill-mask                     3473
##  6 automatic-speech-recognition  2654
##  7 token-classification          2648
##  8 question-answering            1954
##  9 reinforcement-learning        1621
## 10 translation                   1612
## # … with 20 more rows

# Rows of the packaged models table whose task is text classification
filter(huggingfaceR::models_with_downloads, task == "text-classification")

## # A tibble: 7,935 × 5
##    model                                           downloads task  sha   private
##    <chr>                                               <int> <chr> <chr> <lgl>  
##  1 distilbert-base-uncased-finetuned-sst-2-english   8786731 text… 83db… FALSE  
##  2 cardiffnlp/twitter-roberta-base-sentiment         1269394 text… b636… FALSE  
##  3 cross-encoder/ms-marco-MiniLM-L-12-v2             1104884 text… 97f7… FALSE  
##  4 cardiffnlp/twitter-xlm-roberta-base-sentiment      935857 text… f3e3… FALSE  
##  5 cardiffnlp/twitter-roberta-base-sentiment-late…    843827 text… 5916… FALSE  
##  6 ProsusAI/finbert                                   764115 text… 5ea6… FALSE  
##  7 bhadresh-savani/distilbert-base-uncased-emotion    563660 text… fff2… FALSE  
##  8 cross-encoder/ms-marco-TinyBERT-L-2                477985 text… e9ed… FALSE  
##  9 daigo/bert-base-japanese-sentiment                 368812 text… 51ac… FALSE  
## 10 yiyanghkust/finbert-tone                           313288 text… 6950… FALSE  
## # … with 7,925 more rows

HF Hub Models

Jack Penzer

2022-08-05