Import your data

# Read languages.csv from the sibling 00_data folder (path is relative)
data <- readr::read_csv(file = "../00_data/languages.csv")
## Rows: 4303 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): pldb_id, title, description, type, creators, website, domain_name,...
## dbl (24): appeared, domain_name_registered, isbndb, book_count, semantic_sch...
## lgl  (4): features_has_comments, features_has_semantic_indentation, features...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chapter 15

Create a factor

# `type` has 40 distinct values; the 10 most common are used for this exercise.
# Tabulate them, most frequent first.
count(data, type, sort = TRUE)
## # A tibble: 40 × 2
##    type                n
##    <chr>           <int>
##  1 pl               3368
##  2 textMarkup         97
##  3 queryLanguage      94
##  4 xmlFormat          69
##  5 dataNotation       68
##  6 grammarLanguage    67
##  7 esolang            66
##  8 template           55
##  9 textDataFormat     50
## 10 protocol           49
## # ℹ 30 more rows
# The ten most frequent values of `type`, listed from most to least common
type_levels <- c(
  "pl", "textMarkup", "queryLanguage", "xmlFormat", "dataNotation",
  "grammarLanguage", "esolang", "template", "textDataFormat", "protocol"
)

# Keep only rows whose type is one of the chosen levels (this also drops
# missing types), then store `type` as a factor with that level order.
data_rev <- data %>%
  filter(type %in% type_levels) %>%
  select(title, type, number_of_users) %>%
  mutate(type = factor(type, levels = type_levels))

data_rev
## # A tibble: 3,983 × 3
##    title      type          number_of_users
##    <chr>      <fct>                   <dbl>
##  1 Java       pl                    5550123
##  2 JavaScript pl                    5962666
##  3 C          pl                    3793768
##  4 Python     pl                    2818037
##  5 SQL        queryLanguage         7179119
##  6 C++        pl                    4128238
##  7 HTML       textMarkup            5570873
##  8 XML        dataNotation          1917452
##  9 PHP        pl                    2356101
## 10 Perl       pl                     491984
## # ℹ 3,973 more rows

Modify factor order

Make two bar charts here: one before reordering the factor and another after.

# Mean number_of_users for each type; NA values are left out of each mean.
# group_by() keeps the rows in factor-level order.
data_summary <- data_rev %>%
  group_by(type) %>%
  summarise(number_of_users = mean(number_of_users, na.rm = TRUE))

data_summary
## # A tibble: 10 × 2
##    type            number_of_users
##    <fct>                     <dbl>
##  1 pl                       10794.
##  2 textMarkup               59693.
##  3 queryLanguage           109797.
##  4 xmlFormat                 3699.
##  5 dataNotation             34728.
##  6 grammarLanguage            711.
##  7 esolang                    436.
##  8 template                  3323.
##  9 textDataFormat             343.
## 10 protocol                  4442.
# Bar chart before reordering: bars follow the factor's existing level order
ggplot(data_summary, aes(number_of_users, type)) +
  geom_col() +
  labs(x = "Mean number of users", y = "Type")

# Bar chart after reordering: levels are sorted by mean number_of_users.
# Reordering in mutate() rather than inside aes() keeps the axis title clean.
data_summary %>%
  mutate(type = fct_reorder(type, number_of_users)) %>%
  ggplot(aes(number_of_users, type)) +
  geom_col() +
  labs(x = "Mean number of users", y = "Type")

Modify factor levels

Show examples of three functions:

  • fct_recode
# Rename every level to a display label (new_name = "old_name"), then count
# the rows per renamed level.
data_rev %>%
  mutate(
    type = fct_recode(
      type,
      Programming = "pl",
      TextMarkup  = "textMarkup",
      Query       = "queryLanguage",
      XML         = "xmlFormat",
      Data        = "dataNotation",
      Grammar     = "grammarLanguage",
      Esoteric    = "esolang",
      Template    = "template",
      TextData    = "textDataFormat",
      Protocol    = "protocol"
    )
  ) %>%
  count(type)
## # A tibble: 10 × 2
##    type            n
##    <fct>       <int>
##  1 Programming  3368
##  2 TextMarkup     97
##  3 Query          94
##  4 XML            69
##  5 Data           68
##  6 Grammar        67
##  7 Esoteric       66
##  8 Template       55
##  9 TextData       50
## 10 Protocol       49
  • fct_collapse
# Collapse the ten levels into two groups: "pl" becomes Programming and the
# remaining nine levels are merged into Other.
data_rev %>%
  mutate(
    type = fct_collapse(
      type,
      Programming = "pl",
      Other = c(
        "textMarkup", "queryLanguage", "xmlFormat",
        "dataNotation", "grammarLanguage", "esolang",
        "template", "textDataFormat", "protocol"
      )
    )
  ) %>%
  count(type)
## # A tibble: 2 × 2
##   type            n
##   <fct>       <int>
## 1 Programming  3368
## 2 Other         615
  • fct_lump
# Lump the least frequent levels into "Other", keeping "Other" as the
# smallest level. fct_lump() without n/prop dispatches to
# fct_lump_lowfreq(); calling the specific function makes the rule explicit,
# since fct_lump() is retained mainly for historical reasons.
data_rev %>%
  mutate(type = fct_lump_lowfreq(type)) %>%
  count(type)
## # A tibble: 2 × 2
##   type      n
##   <fct> <int>
## 1 pl     3368
## 2 Other   615

Chapter 16

No need to do anything here.