Module 12: Apply it to your data 11

Import your data

# Load the `flights` data set into the workspace.
# NOTE(review): presumably provided by the nycflights13 package, which must
# already be attached for this call to work -- confirm.
data(flights)

# Print a skimr summary of every column, grouped by variable type
flights %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	336776
Number of columns	19
_______________________
Column type frequency:
character	4
numeric	14
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
carrier	0	1.00	2	2	16
tailnum	2512	0.99	5	6	4043
origin	0	1.00	3	3	3
dest	0	1.00	3	3	105

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013	2013	2013	2013	2013	▁▁▇▁▁
month	0	1.00	6.55	3.41	1	4	7	10	12	▇▆▆▆▇
day	0	1.00	15.71	8.77	1	8	16	23	31	▇▇▇▇▆
dep_time	8255	0.98	1349.11	488.28	1	907	1401	1744	2400	▁▇▆▇▃
sched_dep_time	0	1.00	1344.25	467.34	106	906	1359	1729	2359	▁▇▇▇▃
dep_delay	8255	0.98	12.64	40.21	-43	-5	-2	11	1301	▇▁▁▁▁
arr_time	8713	0.97	1502.05	533.26	1	1104	1535	1940	2400	▁▃▇▇▇
sched_arr_time	0	1.00	1536.38	497.46	1	1124	1556	1945	2359	▁▃▇▇▇
arr_delay	9430	0.97	6.90	44.63	-86	-17	-5	14	1272	▇▁▁▁▁
flight	0	1.00	1971.92	1632.47	1	553	1496	3465	8500	▇▃▃▁▁
air_time	9430	0.97	150.69	93.69	20	82	129	192	695	▇▂▂▁▁
distance	0	1.00	1039.91	733.23	17	502	872	1389	4983	▇▃▂▁▁
hour	0	1.00	13.18	4.66	1	9	13	17	23	▁▇▇▇▅
minute	0	1.00	26.23	19.30	0	8	29	44	59	▇▃▆▃▅

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-31 23:00:00	2013-07-03 10:00:00	6936

# Download the ratings table from the TidyTuesday repository (2022-01-25
# data set); the column-type message is silenced
rating <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-01-25/ratings.csv",
  show_col_types = FALSE
)

# Show the imported tibble
rating

## # A tibble: 21,831 × 10
##      num     id name          year  rank average bayes_average users_rated url  
##    <dbl>  <dbl> <chr>        <dbl> <dbl>   <dbl>         <dbl>       <dbl> <chr>
##  1   105  30549 Pandemic      2008   106    7.59          7.49      108975 /boa…
##  2   189    822 Carcassonne   2000   190    7.42          7.31      108738 /boa…
##  3   428     13 Catan         1995   429    7.14          6.97      108024 /boa…
##  4    72  68448 7 Wonders     2010    73    7.74          7.63       89982 /boa…
##  5   103  36218 Dominion      2008   104    7.61          7.50       81561 /boa…
##  6   191   9209 Ticket to R…  2004   192    7.41          7.30       76171 /boa…
##  7   100 178900 Codenames     2015   101    7.6           7.51       74419 /boa…
##  8     3 167791 Terraformin…  2016     4    8.42          8.27       74216 /boa…
##  9    15 173346 7 Wonders D…  2015    16    8.11          7.98       69472 /boa…
## 10    35  31260 Agricola      2007    36    7.93          7.81       66093 /boa…
## # ℹ 21,821 more rows
## # ℹ 1 more variable: thumbnail <chr>

# Keep only the first 50 rows so the later examples work on a small table
ratings <- rating %>% head(n = 50)
ratings

## # A tibble: 50 × 10
##      num     id name          year  rank average bayes_average users_rated url  
##    <dbl>  <dbl> <chr>        <dbl> <dbl>   <dbl>         <dbl>       <dbl> <chr>
##  1   105  30549 Pandemic      2008   106    7.59          7.49      108975 /boa…
##  2   189    822 Carcassonne   2000   190    7.42          7.31      108738 /boa…
##  3   428     13 Catan         1995   429    7.14          6.97      108024 /boa…
##  4    72  68448 7 Wonders     2010    73    7.74          7.63       89982 /boa…
##  5   103  36218 Dominion      2008   104    7.61          7.50       81561 /boa…
##  6   191   9209 Ticket to R…  2004   192    7.41          7.30       76171 /boa…
##  7   100 178900 Codenames     2015   101    7.6           7.51       74419 /boa…
##  8     3 167791 Terraformin…  2016     4    8.42          8.27       74216 /boa…
##  9    15 173346 7 Wonders D…  2015    16    8.11          7.98       69472 /boa…
## 10    35  31260 Agricola      2007    36    7.93          7.81       66093 /boa…
## # ℹ 40 more rows
## # ℹ 1 more variable: thumbnail <chr>

Create Data frame functions

Example 1: count columns

code snippets

# Number of numeric columns in flights:
# keep the numeric columns, then count what is left
ncol_num <- ncol(select(flights, where(is.numeric)))

ncol_num

## [1] 14

Turn them into a function

# Count the numeric columns of a data frame.
#
# .data: a data frame (or tibble).
# Returns the number of columns for which is.numeric() is TRUE.
count_ncol_numeric <- function(.data) {
  # Drop every column that is not numeric
  numeric_only <- select(.data, where(is.numeric))

  # The width of what remains is the answer
  ncol(numeric_only)
}

count_ncol_numeric(flights)

## [1] 14

count_ncol_numeric(flights[1:10, -(1:13)])

## [1] 4

Adding arguments for details of operation

# Count the columns of a data frame that are of a given type.
#
# .data:     a data frame (or tibble).
# type_data: which kind of column to count, "numeric" (the default) or
#            "character".
# Returns the number of matching columns. Any other `type_data` value stops
# with an informative error instead of failing later with
# "object 'ncol_type' not found".
count_ncol_type <- function(.data, type_data = "numeric") {

  # Reject anything that is not exactly one non-missing value up front
  if (length(type_data) != 1L || is.na(type_data)) {
    stop("`type_data` must be a single, non-missing string.", call. = FALSE)
  }

  # Map the requested type to its predicate; the unnamed last argument is the
  # fall-through branch for unsupported values
  is_type <- switch(
    as.character(type_data),
    numeric = is.numeric,
    character = is.character,
    stop(
      "`type_data` must be \"numeric\" or \"character\", not \"",
      type_data, "\".",
      call. = FALSE
    )
  )

  .data %>%

    # Select a type of variables
    select(where(is_type)) %>%

    # Count columns
    ncol()
}
count_ncol_type(flights)

## [1] 14

count_ncol_type(flights, type_data = "character")

## [1] 4

count_ncol_type(flights[1:10, 1:5], type_data = "character")

## [1] 0

Example 2: count rows

code snippets

# Number of flights whose carrier code is "UA":
# keep the matching rows, then count them
nrow_num <- nrow(filter(flights, carrier == "UA"))

nrow_num

## [1] 58665

Turn them into a function

# Count the rows of a flights table that belong to one carrier.
#
# .data:        a data frame with a `carrier` column.
# carrier_name: the carrier code to match, e.g. "UA".
# Returns the number of rows whose `carrier` equals `carrier_name`.
count_num_flight_by_carrier <- function(.data, carrier_name) {
  .data %>%

    # filter rows that meet a condition; .env$ makes sure carrier_name is read
    # from the function argument, so a column that happens to be called
    # `carrier_name` cannot shadow it
    filter(carrier == .env$carrier_name) %>%

    # Count rows
    nrow()
}

count_num_flight_by_carrier(flights[1:10, "carrier"], carrier_name = "AA")

## [1] 2

Example 3: Count rows

Create your own.

code snippets

Use the filter() function to select rows that meet a condition. For details, refer to Chapter 5.2, "Filter rows with filter()".

# Print the 50-row ratings subset that the snippet below filters
ratings

## # A tibble: 50 × 10
##      num     id name          year  rank average bayes_average users_rated url  
##    <dbl>  <dbl> <chr>        <dbl> <dbl>   <dbl>         <dbl>       <dbl> <chr>
##  1   105  30549 Pandemic      2008   106    7.59          7.49      108975 /boa…
##  2   189    822 Carcassonne   2000   190    7.42          7.31      108738 /boa…
##  3   428     13 Catan         1995   429    7.14          6.97      108024 /boa…
##  4    72  68448 7 Wonders     2010    73    7.74          7.63       89982 /boa…
##  5   103  36218 Dominion      2008   104    7.61          7.50       81561 /boa…
##  6   191   9209 Ticket to R…  2004   192    7.41          7.30       76171 /boa…
##  7   100 178900 Codenames     2015   101    7.6           7.51       74419 /boa…
##  8     3 167791 Terraformin…  2016     4    8.42          8.27       74216 /boa…
##  9    15 173346 7 Wonders D…  2015    16    8.11          7.98       69472 /boa…
## 10    35  31260 Agricola      2007    36    7.93          7.81       66093 /boa…
## # ℹ 40 more rows
## # ℹ 1 more variable: thumbnail <chr>

# Number of games in the subset whose `year` is 2008:
# keep the matching rows, then count them
nrow_num <- nrow(filter(ratings, year == 2008))

nrow_num

## [1] 4

Turn them into a function

# Count the rows of a ratings table that belong to one year.
#
# .data: a data frame with a `year` column.
# yr:    the year to match, e.g. 2008.
# Returns the number of rows whose `year` equals `yr`.
count_num_games_by_year <- function(.data, yr) {
  .data %>%

    # filter rows that meet a condition; .env$ makes sure yr is read from the
    # function argument, so a column that happens to be called `yr` cannot
    # shadow it
    filter(year == .env$yr) %>%

    # Count rows
    nrow()
}
ratings %>% count_num_games_by_year(2008)

## [1] 4

ratings %>% count_num_games_by_year(2009)

## [1] 3

Module 12: Apply it to your data 11

Chapter 19 Functions

Simon Champney

Import your data

Create Data frame functions

Example 1: count columns

code snippets

Turn them into a function

Adding arguments for details of operation

Example 2: count rows

code snippets

Turn them into a function

Example 3: Count rows

code snippets

Turn them into a function