Import Data
# Read the languages CSV (path is relative to this document) into a tibble.
languages <- read_csv(file = "../00_data/languages.csv")
## Rows: 4303 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): pldb_id, title, description, type, creators, website, domain_name,...
## dbl (24): appeared, domain_name_registered, isbndb, book_count, semantic_sch...
## lgl (4): features_has_comments, features_has_semantic_indentation, features...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to preview its first rows and column types.
languages
## # A tibble: 4,303 × 49
## pldb_id title description type appeared creators website domain_name
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 java Java <NA> pl 1995 James G… https:… <NA>
## 2 javascript JavaScript <NA> pl 1995 Brendan… <NA> <NA>
## 3 c C <NA> pl 1972 Dennis … <NA> <NA>
## 4 python Python <NA> pl 1991 Guido v… https:… python.org
## 5 sql SQL <NA> quer… 1974 Donald … <NA> <NA>
## 6 cpp C++ <NA> pl 1985 Bjarne … http:/… isocpp.org
## 7 html HTML <NA> text… 1991 Tim Ber… <NA> <NA>
## 8 xml XML <NA> data… 1996 <NA> <NA> <NA>
## 9 php PHP <NA> pl 1995 Rasmus … https:… php.net
## 10 perl Perl <NA> pl 1987 Larry W… https:… perl.org
## # ℹ 4,293 more rows
## # ℹ 41 more variables: domain_name_registered <dbl>, reference <chr>,
## # isbndb <dbl>, book_count <dbl>, semantic_scholar <dbl>,
## # language_rank <dbl>, github_repo <chr>, github_repo_stars <dbl>,
## # github_repo_forks <dbl>, github_repo_updated <dbl>,
## # github_repo_subscribers <dbl>, github_repo_created <dbl>,
## # github_repo_description <chr>, github_repo_issues <dbl>, …
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of languages per GitHub language type.
# Rows without a recorded type are dropped so NA does not get its own bar.
languages %>%
  filter(!is.na(github_language_type)) %>%
  ggplot() +
  geom_bar(mapping = aes(x = github_language_type))

# Histogram of book counts for every language with a recorded value.
languages %>%
  filter(!is.na(book_count)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = book_count))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Same histogram, restricted to languages with more than 10 books.
# filter() drops rows whose condition evaluates to NA, so this single
# comparison also removes missing book counts.
languages %>%
  filter(book_count > 10) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = book_count))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Frequency polygons of user counts, one coloured line per language type.
languages %>%
  filter(!is.na(github_language_type)) %>%
  ggplot(mapping = aes(x = number_of_users)) +
  geom_freqpoly(mapping = aes(color = github_language_type))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values
Unusual values
# Book-count histogram with the y axis zoomed to 0-75 so that sparsely
# populated bins (unusual values) become visible. coord_cartesian() zooms
# the view without discarding any observations.
languages %>%
  filter(!is.na(book_count)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = book_count)) +
  coord_cartesian(ylim = c(0, 75))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing Values
# Scatter plot of job counts against book counts, with job counts outside
# [100, 100000] replaced by NA instead of dropping the rows.
# between() is inclusive on both ends, matching the original
# `< 100 | > 100000` exclusion; NA job counts stay NA.
languages %>%
  filter(!is.na(book_count)) %>%
  mutate(
    y = if_else(between(number_of_jobs, 100, 100000), number_of_jobs, NA_real_)
  ) %>%
  ggplot(mapping = aes(x = book_count, y = y)) +
  geom_point()
## Warning: Removed 4232 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation
A categorical and continuous variable
# Box plots of job counts per GitHub language type, limited to languages
# that have a recorded type and at least one job.
languages %>%
  filter(!is.na(github_language_type) & number_of_jobs > 0) %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = github_language_type, y = number_of_jobs))

Two categorical variables
# Heat map of how many languages fall into each combination of GitHub
# language type and a derived "more than 50,000 jobs" flag.
languages %>%
  # Create a new, arbitrary categorical variable. It is computed from
  # number_of_jobs, so it is named for jobs (the previous name,
  # over_50k_users, mislabelled the axis as a user count).
  mutate(over_50k_jobs = number_of_jobs > 50000) %>%
  filter(!is.na(github_language_type)) %>%
  # count() yields one row per (type, flag) pair with the tally in `n`.
  count(github_language_type, over_50k_jobs) %>%
  ggplot(aes(x = github_language_type, y = over_50k_jobs, fill = n)) +
  geom_tile()

Two continuous variables
# geom_hex() relies on the hexbin package for its binning.
library(hexbin)
# Hexagonal 2-D binning of book counts against job counts, keeping only
# languages where both counts are positive.
languages %>%
  filter(book_count > 0 & number_of_jobs > 0) %>%
  ggplot(mapping = aes(x = book_count, y = number_of_jobs)) +
  geom_hex()

Patterns and models
# This doesn't really make sense for my dataset: there aren't any clear trend lines between the variables I'm testing.
#library(modelr)
#model <- lm(log(number_of_jobs) ~ log(book_count), data = languages)
#languages2 <- languages %>%
# filter(book_count > 0, number_of_jobs > 0) %>%
# modelr::add_residuals(model) %>%
# mutate(resid = exp(resid))
#languages2 %>%
# ggplot(aes(book_count, resid)) +
# geom_point()
#languages2 %>%
# ggplot(aes(number_of_jobs, resid)) +
# geom_boxplot()