Import your data
data <- read_csv("00_data/languages.csv")
## Rows: 4303 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): pldb_id, title, description, type, creators, website, domain_name,...
## dbl (24): appeared, domain_name_registered, isbndb, book_count, semantic_sch...
## lgl (4): features_has_comments, features_has_semantic_indentation, features...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_small <- data %>%
select(title, number_of_jobs, number_of_users) %>%
filter(number_of_users > 1000000)
data_small
## # A tibble: 13 × 3
## title number_of_jobs number_of_users
## <chr> <dbl> <dbl>
## 1 Java 85206 5550123
## 2 JavaScript 63993 5962666
## 3 C 59919 3793768
## 4 Python 46976 2818037
## 5 SQL 219617 7179119
## 6 C++ 61098 4128238
## 7 HTML 69531 5570873
## 8 XML 42277 1917452
## 9 PHP 30349 2356101
## 10 MATLAB 32228 2661579
## 11 MySQL 47466 2608362
## 12 CSS 45617 2881846
## 13 R 14173 1075613
Separating and Uniting
Unite two columns
data_small <- data %>%
select(title, appeared, domain_name_registered)
data_small
## # A tibble: 4,303 × 3
## title appeared domain_name_registered
## <chr> <dbl> <dbl>
## 1 Java 1995 NA
## 2 JavaScript 1995 NA
## 3 C 1972 NA
## 4 Python 1991 1995
## 5 SQL 1974 NA
## 6 C++ 1985 2012
## 7 HTML 1991 NA
## 8 XML 1996 NA
## 9 PHP 1995 1997
## 10 Perl 1987 1995
## # ℹ 4,293 more rows
data_united <- data_small %>%
filter(!is.na(domain_name_registered)) %>%
unite(col = "years", c("appeared", "domain_name_registered"), sep = "/", remove = TRUE)
data_united
## # A tibble: 502 × 2
## title years
## <chr> <chr>
## 1 Python 1991/1995
## 2 C++ 1985/2012
## 3 PHP 1995/1997
## 4 Perl 1987/1995
## 5 MySQL 1995/1999
## 6 R 1993/1999
## 7 JSON 2001/2000
## 8 Scala 2004/2007
## 9 Ada 1980/1998
## 10 Rust 2010/2010
## # ℹ 492 more rows
Separate a column
data_united %>%
separate(col = "years", into = c("appeared", "domain_name_registered"), sep = "/")
## # A tibble: 502 × 3
## title appeared domain_name_registered
## <chr> <chr> <chr>
## 1 Python 1991 1995
## 2 C++ 1985 2012
## 3 PHP 1995 1997
## 4 Perl 1987 1995
## 5 MySQL 1995 1999
## 6 R 1993 1999
## 7 JSON 2001 2000
## 8 Scala 2004 2007
## 9 Ada 1980 1998
## 10 Rust 2010 2010
## # ℹ 492 more rows
Missing Values
data_small <- data %>%
select(title, type, description) %>%
filter(type %in% c("queryLanguage", "textMarkup"))
data_small
## # A tibble: 191 × 3
## title type description
## <chr> <chr> <chr>
## 1 SQL queryLanguage <NA>
## 2 HTML textMarkup <NA>
## 3 MySQL queryLanguage <NA>
## 4 PostgreSQL queryLanguage <NA>
## 5 Regular Expressions queryLanguage <NA>
## 6 PostScript textMarkup Postscript is a graphical page description…
## 7 GraphQL queryLanguage <NA>
## 8 LaTeX textMarkup <NA>
## 9 Markdown textMarkup <NA>
## 10 SQLite queryLanguage <NA>
## # ℹ 181 more rows
data_small %>%
complete(title, type)
## # A tibble: 380 × 3
## title type description
## <chr> <chr> <chr>
## 1 Amazon Redshift queryLanguage <NA>
## 2 Amazon Redshift textMarkup <NA>
## 3 Apache Lucene queryLanguage <NA>
## 4 Apache Lucene textMarkup <NA>
## 5 Argdown queryLanguage <NA>
## 6 Argdown textMarkup <NA>
## 7 AsciiDoc queryLanguage <NA>
## 8 AsciiDoc textMarkup <NA>
## 9 AsciiMath queryLanguage <NA>
## 10 AsciiMath textMarkup AsciiMath is an easy-to-write markup language …
## # ℹ 370 more rows
data_small %>%
fill(description, .direction = "up")
## # A tibble: 191 × 3
## title type description
## <chr> <chr> <chr>
## 1 SQL queryLanguage Postscript is a graphical page description…
## 2 HTML textMarkup Postscript is a graphical page description…
## 3 MySQL queryLanguage Postscript is a graphical page description…
## 4 PostgreSQL queryLanguage Postscript is a graphical page description…
## 5 Regular Expressions queryLanguage Postscript is a graphical page description…
## 6 PostScript textMarkup Postscript is a graphical page description…
## 7 GraphQL queryLanguage Org is a mode for keeping notes, maintaini…
## 8 LaTeX textMarkup Org is a mode for keeping notes, maintaini…
## 9 Markdown textMarkup Org is a mode for keeping notes, maintaini…
## 10 SQLite queryLanguage Org is a mode for keeping notes, maintaini…
## # ℹ 181 more rows