Module 12: Apply it to your data 11

Import your data

# Load the `flights` data set into the workspace
# NOTE(review): presumably supplied by the nycflights13 package -- confirm
# that it is attached in an earlier (hidden) setup chunk
data(flights)

# Print a per-column overview (missing values, summary statistics) with skimr
flights %>% skimr::skim()

Data summary
Name	Piped data
Number of rows	336776
Number of columns	19
_______________________
Column type frequency:
character	4
numeric	14
POSIXct	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
carrier	0	1.00	2	2	16
tailnum	2512	0.99	5	6	4043
origin	0	1.00	3	3	3
dest	0	1.00	3	3	105

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013	2013	2013	2013	2013	▁▁▇▁▁
month	0	1.00	6.55	3.41	1	4	7	10	12	▇▆▆▆▇
day	0	1.00	15.71	8.77	1	8	16	23	31	▇▇▇▇▆
dep_time	8255	0.98	1349.11	488.28	1	907	1401	1744	2400	▁▇▆▇▃
sched_dep_time	0	1.00	1344.25	467.34	106	906	1359	1729	2359	▁▇▇▇▃
dep_delay	8255	0.98	12.64	40.21	-43	-5	-2	11	1301	▇▁▁▁▁
arr_time	8713	0.97	1502.05	533.26	1	1104	1535	1940	2400	▁▃▇▇▇
sched_arr_time	0	1.00	1536.38	497.46	1	1124	1556	1945	2359	▁▃▇▇▇
arr_delay	9430	0.97	6.90	44.63	-86	-17	-5	14	1272	▇▁▁▁▁
flight	0	1.00	1971.92	1632.47	1	553	1496	3465	8500	▇▃▃▁▁
air_time	9430	0.97	150.69	93.69	20	82	129	192	695	▇▂▂▁▁
distance	0	1.00	1039.91	733.23	17	502	872	1389	4983	▇▃▂▁▁
hour	0	1.00	13.18	4.66	1	9	13	17	23	▁▇▇▇▅
minute	0	1.00	26.23	19.30	0	8	29	44	59	▇▃▆▃▅

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-31 23:00:00	2013-07-03 10:00:00	6936

# Read the workbook (path is relative to the working directory) and show it
data <- read_excel(path = "myData.xlsx")
print(data)

## # A tibble: 32,754 × 20
##         id original_title original_language overview tagline release_date       
##      <dbl> <chr>          <chr>             <chr>    <chr>   <dttm>             
##  1  760161 Orphan: First… en                After e… "There… 2022-07-27 00:00:00
##  2  760741 Beast          en                A recen… "Fight… 2022-08-11 00:00:00
##  3  882598 Smile          en                After w… "Once … 2022-09-23 00:00:00
##  4  717728 Jeepers Creep… en                Forced … "Evil … 2022-09-15 00:00:00
##  5  772450 Presencias     es                A man w…  <NA>   2022-09-07 00:00:00
##  6 1014226 Sonríe         es                <NA>      <NA>   2022-08-18 00:00:00
##  7  913290 Barbarian      en                In town… "Some … 2022-09-08 00:00:00
##  8  830788 The Invitation en                After t… "You a… 2022-08-24 00:00:00
##  9  927341 Hunting Ava B… en                Billion… "\"If … 2022-04-01 00:00:00
## 10  762504 Nope           en                Residen… "What’… 2022-07-20 00:00:00
## # ℹ 32,744 more rows
## # ℹ 14 more variables: title <chr>, popularity <dbl>, revenue <dbl>,
## #   budget <dbl>, poster_path <chr>, vote_count <dbl>, vote_average <dbl>,
## #   runtime <dbl>, status <chr>, adult <lgl>, backdrop_path <chr>,
## #   genre_names <chr>, collection <chr>, collection_name <chr>

# Keep a small sample for experiments: five columns of the first five rows
small_data <- data %>%
  select(id, original_title, release_date, popularity, revenue) %>%
  slice(1:5)

Create Data frame functions

Example 1: count columns

code snippets

# Number of numeric columns in flights: keep only the columns that
# satisfy is.numeric(), then count what is left
ncol_num <- ncol(select(flights, where(is.numeric)))

print(ncol_num)

## [1] 14

Turn them into a function

#' Count the numeric columns of a data frame
#'
#' @param .data A data frame or tibble.
#' @return The number of columns of `.data` for which `is.numeric()` is
#'   `TRUE`.
count_ncol_numeric <- function(.data) {

  # Start from the argument rather than the global `flights`, so the
  # result describes whatever the caller passed in
  .data %>%

    # Select a type of variables
    select(where(is.numeric)) %>%

    # Count columns
    ncol()

}

flights %>% count_ncol_numeric()

## [1] 14

flights %>% .[1:10, -1:-13] %>% count_ncol_numeric()

## [1] 4

Adding arguments for details of operation

#' Count the columns of a given type in a data frame
#'
#' @param .data A data frame or tibble.
#' @param type_data Which kind of column to count: "numeric" (the
#'   default) or "character".
#' @return The number of columns of `.data` of the requested type.
count_ncol_type <- function(.data, type_data = "numeric") {

  # Reject unsupported types up front with a message that lists the valid
  # choices, instead of failing later on an undefined result variable
  type_data <- match.arg(type_data, choices = c("numeric", "character"))

  # Pick the predicate that identifies the requested type
  is_type <- switch(type_data,
    numeric = is.numeric,
    character = is.character
  )

  .data %>%

    # Select a type of variables
    select(where(is_type)) %>%

    # Count columns
    ncol()

}

# Default type: numeric columns
count_ncol_type(flights)

## [1] 14

# Character columns
count_ncol_type(flights, type_data = "character")

## [1] 4

# Character columns among the first 15 columns of the first 10 rows
count_ncol_type(flights[1:10, 1:15], type_data = "character")

## [1] 4

Example 2: count rows

code snippets

# Number of flights with carrier code "DL": keep the matching rows,
# then count them
nrow_num <- nrow(filter(flights, carrier == "DL"))

print(nrow_num)

## [1] 48110

Turn them into a function

#' Count the rows that belong to one carrier
#'
#' @param .data A data frame with a `carrier` column.
#' @param carrier_name A single carrier code to match exactly, e.g. "UA".
#' @return The number of rows of `.data` whose `carrier` equals
#'   `carrier_name`.
count_num_flights_by_carrier <- function(.data, carrier_name) {

  # A longer vector would be recycled silently against the column and
  # give a misleading count, so insist on exactly one value
  stopifnot(length(carrier_name) == 1L)

  .data %>%

    # filter rows that meet a condition; `.env$` makes `carrier_name`
    # refer to the function argument even if `.data` happens to contain
    # a column with that name
    filter(carrier == .env$carrier_name) %>%

    # Count rows
    nrow()

}

# Count "UA" among the first 10 rows, passing only the carrier column
count_num_flights_by_carrier(
  flights[1:10, "carrier"],
  carrier_name = "UA"
)

## [1] 3

Example 3: count rows

Create your own.

code snippets

Use the filter() function to select rows that meet a condition. Refer to Chapter 5.2 Filter rows with filter()

Turn them into a function

# Define the function

#' Count the movies whose popularity exceeds a threshold
#'
#' @param data A data frame with a `popularity` column.
#' @param limit A single number; only rows with `popularity > limit`
#'   are counted.
#' @return The number of matching rows.
count_movies <- function(data, limit) {
  # Guard against a non-numeric or multi-valued threshold, which would
  # otherwise be compared lexically or recycled without any warning
  stopifnot(is.numeric(limit), length(limit) == 1L)

  data %>%
    # `.env$` makes `limit` refer to the function argument even if
    # `data` happens to contain a column with that name
    filter(popularity > .env$limit) %>%
    nrow()
}

# Apply the function to the sample, counting rows with popularity above 1000
small_data %>% count_movies(limit = 1000)

## [1] 4

Module 12: Apply it to your data 11

Chapter 19 Functions

Luke Davies

Import your data

Create Data frame functions

Example 1: count columns

code snippets

Turn them into a function

Adding arguments for details of operation

Example 2: count rows

code snippets

Turn them into a function

Example 3: count rows

code snippets

Turn them into a function