Import Data

# Read languages.csv into a tibble; column types are guessed by read_csv().
languages <- read_csv(file = "../00_data/languages.csv")
## Rows: 4303 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): pldb_id, title, description, type, creators, website, domain_name,...
## dbl (24): appeared, domain_name_registered, isbndb, book_count, semantic_sch...
## lgl  (4): features_has_comments, features_has_semantic_indentation, features...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
languages
## # A tibble: 4,303 × 49
##    pldb_id    title      description type  appeared creators website domain_name
##    <chr>      <chr>      <chr>       <chr>    <dbl> <chr>    <chr>   <chr>      
##  1 java       Java       <NA>        pl        1995 James G… https:… <NA>       
##  2 javascript JavaScript <NA>        pl        1995 Brendan… <NA>    <NA>       
##  3 c          C          <NA>        pl        1972 Dennis … <NA>    <NA>       
##  4 python     Python     <NA>        pl        1991 Guido v… https:… python.org 
##  5 sql        SQL        <NA>        quer…     1974 Donald … <NA>    <NA>       
##  6 cpp        C++        <NA>        pl        1985 Bjarne … http:/… isocpp.org 
##  7 html       HTML       <NA>        text…     1991 Tim Ber… <NA>    <NA>       
##  8 xml        XML        <NA>        data…     1996 <NA>     <NA>    <NA>       
##  9 php        PHP        <NA>        pl        1995 Rasmus … https:… php.net    
## 10 perl       Perl       <NA>        pl        1987 Larry W… https:… perl.org   
## # ℹ 4,293 more rows
## # ℹ 41 more variables: domain_name_registered <dbl>, reference <chr>,
## #   isbndb <dbl>, book_count <dbl>, semantic_scholar <dbl>,
## #   language_rank <dbl>, github_repo <chr>, github_repo_stars <dbl>,
## #   github_repo_forks <dbl>, github_repo_updated <dbl>,
## #   github_repo_subscribers <dbl>, github_repo_created <dbl>,
## #   github_repo_description <chr>, github_repo_issues <dbl>, …

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: row counts per github_language_type, skipping rows where the
# type is missing.
ggplot(
    data = filter(languages, !is.na(github_language_type)),
    mapping = aes(x = github_language_type)
) +
    geom_bar()

# Histogram of book_count over the rows where it is recorded, using the
# default binning of geom_histogram().
ggplot(
    filter(languages, !is.na(book_count)),
    aes(x = book_count)
) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Same histogram, restricted to rows with more than 10 books so the large
# pile of small counts does not dominate the picture.
languages %>%
    filter(!is.na(book_count), book_count > 10) %>%
    ggplot(aes(x = book_count)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Frequency polygons of number_of_users, one line per github_language_type
# (rows with a missing type are dropped first).
ggplot(
    data = filter(languages, !is.na(github_language_type)),
    mapping = aes(x = number_of_users, colour = github_language_type)
) +
    geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values

Unusual values

# Histogram of book_count with the y axis zoomed to 0-75 so that sparsely
# populated bins become visible. coord_cartesian() only changes the view;
# it does not drop any data before binning.
ggplot(
    data = filter(languages, !is.na(book_count)),
    mapping = aes(x = book_count)
) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 75))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing Values

# Scatter plot of book_count against number_of_jobs, where job counts
# outside the 100-100000 range are treated as missing (NA) rather than
# dropping the whole row.
languages %>%
    filter(!is.na(book_count)) %>%
    mutate(
        y = ifelse(
            number_of_jobs >= 100 & number_of_jobs <= 100000,
            number_of_jobs,
            NA
        )
    ) %>%
    ggplot(mapping = aes(x = book_count, y = y)) +
    geom_point()
## Warning: Removed 4232 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Box plots of number_of_jobs for each github_language_type, limited to rows
# that have a type and a positive job count.
ggplot(
    data = filter(languages, !is.na(github_language_type), number_of_jobs > 0),
    mapping = aes(x = github_language_type, y = number_of_jobs)
) +
    geom_boxplot()

Two categorical variables

# Tile plot of two categorical variables: github_language_type against an
# arbitrary derived flag, over_50k_users. Tile fill shows the number of rows
# in each combination.
languages %>%
    # Derive the flag from number_of_users (not number_of_jobs) so that the
    # variable, and therefore the y-axis label, matches what is measured.
    mutate(over_50k_users = number_of_users > 50000) %>%
    filter(!is.na(github_language_type)) %>%
    count(github_language_type, over_50k_users) %>%
    ggplot(aes(x = github_language_type, y = over_50k_users, fill = n)) +
    geom_tile()

Two continuous variables

# geom_hex() relies on the hexbin package.
library(hexbin)

# Hexagonal 2-D binning of book_count against number_of_jobs, keeping only
# rows where both values are positive.
ggplot(
    data = filter(languages, book_count > 0, number_of_jobs > 0),
    mapping = aes(x = book_count, y = number_of_jobs)
) +
    geom_hex()

Patterns and models

# This section doesn't really make sense for my dataset: there aren't any clear trend lines between the variables I'm testing.

#library(modelr)

#model <- lm(log(number_of_jobs) ~ log(book_count), data = languages)

#languages2 <- languages %>%
#    filter(book_count > 0, number_of_jobs > 0) %>%
#    modelr::add_residuals(model) %>%
#    mutate(resid = exp(resid))

#languages2 %>%
#    ggplot(aes(book_count, resid)) + 
#    geom_point()

#languages2 %>%
#    ggplot(aes(number_of_jobs, resid)) +
#    geom_boxplot()