Week 8: Apply it to your data 7

Import your data

data <- read_csv("00_data/languages.csv")

## Rows: 4303 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): pldb_id, title, description, type, creators, website, domain_name,...
## dbl (24): appeared, domain_name_registered, isbndb, book_count, semantic_sch...
## lgl  (4): features_has_comments, features_has_semantic_indentation, features...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data_small <- data %>%
    select(title, number_of_jobs, number_of_users) %>%
    filter(number_of_users > 1000000)

data_small

## # A tibble: 13 × 3
##    title      number_of_jobs number_of_users
##    <chr>               <dbl>           <dbl>
##  1 Java                85206         5550123
##  2 JavaScript          63993         5962666
##  3 C                   59919         3793768
##  4 Python              46976         2818037
##  5 SQL                219617         7179119
##  6 C++                 61098         4128238
##  7 HTML                69531         5570873
##  8 XML                 42277         1917452
##  9 PHP                 30349         2356101
## 10 MATLAB              32228         2661579
## 11 MySQL               47466         2608362
## 12 CSS                 45617         2881846
## 13 R                   14173         1075613

Pivoting

wide to long form

# Performing pivot_longer first since the data is already wide

long_data <- data_small %>%
    pivot_longer(
        cols = c(number_of_jobs, number_of_users),
        names_to = "type",
        values_to = "quantity"
    )

long_data

## # A tibble: 26 × 3
##    title      type            quantity
##    <chr>      <chr>              <dbl>
##  1 Java       number_of_jobs     85206
##  2 Java       number_of_users  5550123
##  3 JavaScript number_of_jobs     63993
##  4 JavaScript number_of_users  5962666
##  5 C          number_of_jobs     59919
##  6 C          number_of_users  3793768
##  7 Python     number_of_jobs     46976
##  8 Python     number_of_users  2818037
##  9 SQL        number_of_jobs    219617
## 10 SQL        number_of_users  7179119
## # ℹ 16 more rows

long to wide form

wide_data <- long_data %>%
    pivot_wider(
        names_from = "type",
        values_from = "quantity"
    )

wide_data

## # A tibble: 13 × 3
##    title      number_of_jobs number_of_users
##    <chr>               <dbl>           <dbl>
##  1 Java                85206         5550123
##  2 JavaScript          63993         5962666
##  3 C                   59919         3793768
##  4 Python              46976         2818037
##  5 SQL                219617         7179119
##  6 C++                 61098         4128238
##  7 HTML                69531         5570873
##  8 XML                 42277         1917452
##  9 PHP                 30349         2356101
## 10 MATLAB              32228         2661579
## 11 MySQL               47466         2608362
## 12 CSS                 45617         2881846
## 13 R                   14173         1075613

Separating and Uniting

Unite two columns

data_small <- data %>%
    select(title, appeared, domain_name_registered)

data_small

## # A tibble: 4,303 × 3
##    title      appeared domain_name_registered
##    <chr>         <dbl>                  <dbl>
##  1 Java           1995                     NA
##  2 JavaScript     1995                     NA
##  3 C              1972                     NA
##  4 Python         1991                   1995
##  5 SQL            1974                     NA
##  6 C++            1985                   2012
##  7 HTML           1991                     NA
##  8 XML            1996                     NA
##  9 PHP            1995                   1997
## 10 Perl           1987                   1995
## # ℹ 4,293 more rows

data_united <- data_small %>%
    filter(!is.na(domain_name_registered)) %>%
    unite(col = "years", c("appeared", "domain_name_registered"), sep = "/", remove = TRUE)

data_united

## # A tibble: 502 × 2
##    title  years    
##    <chr>  <chr>    
##  1 Python 1991/1995
##  2 C++    1985/2012
##  3 PHP    1995/1997
##  4 Perl   1987/1995
##  5 MySQL  1995/1999
##  6 R      1993/1999
##  7 JSON   2001/2000
##  8 Scala  2004/2007
##  9 Ada    1980/1998
## 10 Rust   2010/2010
## # ℹ 492 more rows

Separate a column

data_united %>%
    separate(col = "years", into = c("appeared", "domain_name_registered"), sep = "/")

## # A tibble: 502 × 3
##    title  appeared domain_name_registered
##    <chr>  <chr>    <chr>                 
##  1 Python 1991     1995                  
##  2 C++    1985     2012                  
##  3 PHP    1995     1997                  
##  4 Perl   1987     1995                  
##  5 MySQL  1995     1999                  
##  6 R      1993     1999                  
##  7 JSON   2001     2000                  
##  8 Scala  2004     2007                  
##  9 Ada    1980     1998                  
## 10 Rust   2010     2010                  
## # ℹ 492 more rows

Missing Values

data_small <- data %>%
    select(title, type, description) %>%
    filter(type %in% c("queryLanguage", "textMarkup"))

data_small

## # A tibble: 191 × 3
##    title               type          description                                
##    <chr>               <chr>         <chr>                                      
##  1 SQL                 queryLanguage <NA>                                       
##  2 HTML                textMarkup    <NA>                                       
##  3 MySQL               queryLanguage <NA>                                       
##  4 PostgreSQL          queryLanguage <NA>                                       
##  5 Regular Expressions queryLanguage <NA>                                       
##  6 PostScript          textMarkup    Postscript is a graphical page description…
##  7 GraphQL             queryLanguage <NA>                                       
##  8 LaTeX               textMarkup    <NA>                                       
##  9 Markdown            textMarkup    <NA>                                       
## 10 SQLite              queryLanguage <NA>                                       
## # ℹ 181 more rows

data_small %>%
    complete(title, type)

## # A tibble: 380 × 3
##    title           type          description                                    
##    <chr>           <chr>         <chr>                                          
##  1 Amazon Redshift queryLanguage <NA>                                           
##  2 Amazon Redshift textMarkup    <NA>                                           
##  3 Apache Lucene   queryLanguage <NA>                                           
##  4 Apache Lucene   textMarkup    <NA>                                           
##  5 Argdown         queryLanguage <NA>                                           
##  6 Argdown         textMarkup    <NA>                                           
##  7 AsciiDoc        queryLanguage <NA>                                           
##  8 AsciiDoc        textMarkup    <NA>                                           
##  9 AsciiMath       queryLanguage <NA>                                           
## 10 AsciiMath       textMarkup    AsciiMath is an easy-to-write markup language …
## # ℹ 370 more rows

data_small %>%
    fill(description, .direction = "up")

## # A tibble: 191 × 3
##    title               type          description                                
##    <chr>               <chr>         <chr>                                      
##  1 SQL                 queryLanguage Postscript is a graphical page description…
##  2 HTML                textMarkup    Postscript is a graphical page description…
##  3 MySQL               queryLanguage Postscript is a graphical page description…
##  4 PostgreSQL          queryLanguage Postscript is a graphical page description…
##  5 Regular Expressions queryLanguage Postscript is a graphical page description…
##  6 PostScript          textMarkup    Postscript is a graphical page description…
##  7 GraphQL             queryLanguage Org is a mode for keeping notes, maintaini…
##  8 LaTeX               textMarkup    Org is a mode for keeping notes, maintaini…
##  9 Markdown            textMarkup    Org is a mode for keeping notes, maintaini…
## 10 SQLite              queryLanguage Org is a mode for keeping notes, maintaini…
## # ℹ 181 more rows

Week 8: Apply it to your data 7

Riley Levasseur

2026-03-07

Import your data

Pivoting

wide to long form

long to wide form

Separating and Uniting

Unite two columns

Separate a column

Missing Values