# Read the programming-languages CSV into `data`; column types are guessed by readr
data <- read_csv(file = "../00_data/languages.csv")
## Rows: 4303 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): pldb_id, title, description, type, creators, website, domain_name,...
## dbl (24): appeared, domain_name_registered, isbndb, book_count, semantic_sch...
## lgl (4): features_has_comments, features_has_semantic_indentation, features...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# `type` takes 40 distinct values; this exercise keeps only the 10 most common.
# Tally the languages per type, most frequent first.
count(data, type, sort = TRUE)
## # A tibble: 40 × 2
## type n
## <chr> <int>
## 1 pl 3368
## 2 textMarkup 97
## 3 queryLanguage 94
## 4 xmlFormat 69
## 5 dataNotation 68
## 6 grammarLanguage 67
## 7 esolang 66
## 8 template 55
## 9 textDataFormat 50
## 10 protocol 49
## # ℹ 30 more rows
# The ten most frequent language types, listed from most to least common
type_levels <- c(
  "pl", "textMarkup", "queryLanguage", "xmlFormat", "dataNotation",
  "grammarLanguage", "esolang", "template", "textDataFormat", "protocol"
)

# Keep only languages of those ten types, then store `type` as a factor whose
# levels follow the order given in `type_levels`
data_rev <- data %>%
  filter(type %in% type_levels) %>%
  select(title, type, number_of_users) %>%
  mutate(type = factor(type, levels = type_levels))

data_rev
## # A tibble: 3,983 × 3
## title type number_of_users
## <chr> <fct> <dbl>
## 1 Java pl 5550123
## 2 JavaScript pl 5962666
## 3 C pl 3793768
## 4 Python pl 2818037
## 5 SQL queryLanguage 7179119
## 6 C++ pl 4128238
## 7 HTML textMarkup 5570873
## 8 XML dataNotation 1917452
## 9 PHP pl 2356101
## 10 Perl pl 491984
## # ℹ 3,973 more rows
Make two bar charts here: one before reordering the factor levels and another after.
# Mean number of users within each language type (missing values ignored).
# Note: the result column keeps the name `number_of_users` but holds the mean.
data_summary <- summarise(
  group_by(data_rev, type),
  number_of_users = mean(number_of_users, na.rm = TRUE)
)

data_summary
## # A tibble: 10 × 2
## type number_of_users
## <fct> <dbl>
## 1 pl 10794.
## 2 textMarkup 59693.
## 3 queryLanguage 109797.
## 4 xmlFormat 3699.
## 5 dataNotation 34728.
## 6 grammarLanguage 711.
## 7 esolang 436.
## 8 template 3323.
## 9 textDataFormat 343.
## 10 protocol 4442.
# Bar chart of the mean number of users per language type, with the bars in the
# original factor-level order
ggplot(data_summary, aes(x = number_of_users, y = type)) +
  geom_col() +
  labs(x = "Mean number of users", y = "Language type")

# Same bar chart after reordering the `type` levels by their mean number of
# users, so the bars are sorted. Reordering inside mutate() (rather than inside
# aes()) keeps the axis title from showing the raw fct_reorder() expression.
data_summary %>%
  mutate(type = fct_reorder(type, number_of_users)) %>%
  ggplot(aes(x = number_of_users, y = type)) +
  geom_col() +
  labs(x = "Mean number of users", y = "Language type")
Show examples of three functions:
# fct_recode(): give each level a more readable label (new name = old level),
# then count the languages under each new label
data_rev %>%
  count(
    type = fct_recode(
      type,
      Programming = "pl",
      TextMarkup  = "textMarkup",
      Query       = "queryLanguage",
      XML         = "xmlFormat",
      Data        = "dataNotation",
      Grammar     = "grammarLanguage",
      Esoteric    = "esolang",
      Template    = "template",
      TextData    = "textDataFormat",
      Protocol    = "protocol"
    )
  )
## # A tibble: 10 × 2
## type n
## <fct> <int>
## 1 Programming 3368
## 2 TextMarkup 97
## 3 Query 94
## 4 XML 69
## 5 Data 68
## 6 Grammar 67
## 7 Esoteric 66
## 8 Template 55
## 9 TextData 50
## 10 Protocol 49
# fct_collapse(): merge levels into broader groups. "pl" becomes "Programming";
# `other_level` gathers every level not named here into "Other" (placed last),
# so the remaining levels do not have to be listed by hand and kept in sync
# with `type_levels`.
data_rev %>%
  mutate(
    type = fct_collapse(
      type,
      Programming = "pl",
      other_level = "Other"
    )
  ) %>%
  count(type)
## # A tibble: 2 × 2
## type n
## <fct> <int>
## 1 Programming 3368
## 2 Other 615
# fct_lump_*(): lump rare levels together. fct_lump() is superseded in forcats;
# called without `n` or `prop` it dispatches to fct_lump_lowfreq(), which lumps
# the least frequent levels into "Other" while keeping "Other" the smallest
# level. Calling fct_lump_lowfreq() directly gives the same result.
data_rev %>%
  mutate(type = fct_lump_lowfreq(type)) %>%
  count(type)
## # A tibble: 2 × 2
## type n
## <fct> <int>
## 1 pl 3368
## 2 Other 615
No need to do anything here.