Week 12: Apply it to your data 11

Data summary
Name	Piped data
Number of rows	336776
Number of columns	19
_______________________
Column type frequency:
character	4
numeric	14
POSIXct	1
________________________
Group variables	None

skim_variable	n_missing	complete_rate	min	max	n_unique
carrier	0	1.00	2	2	16
tailnum	2512	0.99	5	6	4043
origin	0	1.00	3	3	3
dest	0	1.00	3	3	105

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2013.00	0.00	2013	2013	2013	2013	2013	▁▁▇▁▁
month	0	1.00	6.55	3.41	1	4	7	10	12	▇▆▆▆▇
day	0	1.00	15.71	8.77	1	8	16	23	31	▇▇▇▇▆
dep_time	8255	0.98	1349.11	488.28	1	907	1401	1744	2400	▁▇▆▇▃
sched_dep_time	0	1.00	1344.25	467.34	106	906	1359	1729	2359	▁▇▇▇▃
dep_delay	8255	0.98	12.64	40.21	-43	-5	-2	11	1301	▇▁▁▁▁
arr_time	8713	0.97	1502.05	533.26	1	1104	1535	1940	2400	▁▃▇▇▇
sched_arr_time	0	1.00	1536.38	497.46	1	1124	1556	1945	2359	▁▃▇▇▇
arr_delay	9430	0.97	6.90	44.63	-86	-17	-5	14	1272	▇▁▁▁▁
flight	0	1.00	1971.92	1632.47	1	553	1496	3465	8500	▇▃▃▁▁
air_time	9430	0.97	150.69	93.69	20	82	129	192	695	▇▂▂▁▁
distance	0	1.00	1039.91	733.23	17	502	872	1389	4983	▇▃▂▁▁
hour	0	1.00	13.18	4.66	1	9	13	17	23	▁▇▇▇▅
minute	0	1.00	26.23	19.30	0	8	29	44	59	▇▃▆▃▅

skim_variable	n_missing	complete_rate	min	max	median	n_unique
time_hour	0	1	2013-01-01 05:00:00	2013-12-31 23:00:00	2013-07-03 10:00:00	6936

Create Data frame functions

Example 1: count columns

code snippets

# Keep only the numeric columns of flights, then count them
ncol_num <- ncol(select(flights, where(is.numeric)))

ncol_num

## [1] 14

Turn them into a function

# Count the numeric columns of a data frame.
#
# .data: a data frame (or tibble).
# Returns the number of columns for which is.numeric() is TRUE.
count_ncol_numeric <- function(.data) {

    # Drop every non-numeric column first
    numeric_only <- select(.data, where(is.numeric))

    # The width of what is left is the answer
    ncol(numeric_only)
}

# Whole data set
count_ncol_numeric(flights)

## [1] 14

# First 10 rows with the first 13 columns dropped
count_ncol_numeric(flights[1:10, -(1:13)])

## [1] 4

Adding arguments for details of operation

# Count the columns of a data frame that are of a given type.
#
# .data:     a data frame (or tibble).
# type_data: which kind of column to count, "numeric" (the default) or
#            "character". Any other value raises an error from match.arg().
# Returns the number of columns of the requested type.
count_ncol_type <- function(.data, type_data = "numeric") {

    # Validate the requested type up front. The previous if / else-if chain
    # had no final else, so an unsupported value reached return() with
    # `ncol_type` never assigned and failed with the unhelpful
    # "object 'ncol_type' not found".
    type_data <- match.arg(type_data, choices = c("numeric", "character"))

    # Pick the predicate that identifies columns of the requested type,
    # so the select-and-count pipeline is written only once
    is_type <- switch(type_data,
        numeric   = is.numeric,
        character = is.character
    )

    .data %>%

        # Select a type of variables
        select(where(is_type)) %>%

        # Count columns
        ncol()
}

# Default type_data ("numeric")
count_ncol_type(flights)

## [1] 14

count_ncol_type(flights, type_data = "character")

## [1] 4

# First 10 rows and first 5 columns only
count_ncol_type(flights[1:10, 1:5], type_data = "character")

## [1] 0

Example 2: count rows

code snippets

# Number of rows of flights whose carrier is "UA"
nrow_num <- nrow(filter(flights, carrier == "UA"))

nrow_num

## [1] 58665

Turn them into a function

# Count the rows of a data frame whose `carrier` column equals a given value.
#
# .data:        a data frame with a `carrier` column.
# carrier_name: the carrier value to match.
# Returns the number of matching rows.
count_num_flights_by_carrier <- function(.data, carrier_name) {

    .data %>%

        # filter rows that meet a condition
        # .env$ forces carrier_name to be read from the function argument;
        # a bare carrier_name would silently resolve to a column of that
        # name if the data happened to have one.
        filter(carrier == .env$carrier_name) %>%

        # Count rows
        nrow()
}

count_num_flights_by_carrier(flights[1:10, "carrier"], carrier_name = "AA")

## [1] 2

Example 3: count rows

Create your own.

code snippets

Use the filter() function to select rows that meet a condition. Refer to Chapter 5.2 Filter rows with filter()

library(tidyverse)
library(tidyquant)

## Warning: package 'tidyquant' was built under R version 4.2.3

## Loading required package: PerformanceAnalytics

## Loading required package: xts

## Warning: package 'xts' was built under R version 4.2.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 4.2.3

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## ################################### WARNING ###################################
## # We noticed you have dplyr installed. The dplyr lag() function breaks how    #
## # base R's lag() function is supposed to work, which breaks lag(my_xts).      #
## #                                                                             #
## # Calls to lag(my_xts) that you enter or source() into this session won't     #
## # work correctly.                                                             #
## #                                                                             #
## # All package code is unaffected because it is protected by the R namespace   #
## # mechanism.                                                                  #
## #                                                                             #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## # You can use stats::lag() to make sure you're not using dplyr::lag(), or you #
## # can add conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop   #
## # dplyr from breaking base R's lag() function.                                #
## ################################### WARNING ###################################

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

## Loading required package: quantmod

## Warning: package 'quantmod' was built under R version 4.2.3

## Loading required package: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# Ticker symbols to download
symbols <- c("NVDA", "AAPL", "NFLX", "MSFT", "TSLA")

# Fetch data for every symbol starting 2012-12-31 (no end date is given)
prices <- symbols %>%
    tq_get(from = "2012-12-31")

Turn them into a function

# Rows where AAPL has a close above 150
nrow(filter(prices, symbol == "AAPL", close > 150))

## [1] 241

# Rows where NFLX has a close above 200
nrow(filter(prices, symbol == "NFLX", close > 200))

## [1] 1280

# Count the rows for one symbol whose close is above a threshold.
#
# .data:        a data frame with `symbol` and `close` columns.
# symbol_txt:   the symbol value to match.
# close_dollar: threshold; only rows with close strictly greater than this
#               are counted.
# Returns the number of matching rows.
count_num_by_symbol_by_close <- function(.data, symbol_txt, close_dollar) {

    .data %>%

        # .env$ makes both arguments resolve to the function's own
        # parameters; written bare they would be shadowed by any column
        # of the same name in .data.
        filter(
            symbol == .env$symbol_txt,
            close > .env$close_dollar
        ) %>%

        # Count rows
        nrow()
}

prices %>%
    count_num_by_symbol_by_close(symbol_txt = "NFLX", close_dollar = 250)

## [1] 1188

# Per symbol, count the rows with a close above 200
prices %>%
    filter(close > 200) %>%
    group_by(symbol) %>%
    # .groups = "drop" returns an ungrouped result directly
    summarise(n_days = n(), .groups = "drop")

## # A tibble: 4 × 2
##   symbol n_days
##   <chr>   <int>
## 1 MSFT      707
## 2 NFLX     1280
## 3 NVDA      236
## 4 TSLA      481

# For each symbol, count the rows whose close is above a threshold.
#
# .data:          a data frame with `symbol` and `close` columns.
# closing_dollar: threshold; only rows with close strictly greater than
#                 this are counted.
# Returns an ungrouped data frame with one row per symbol that has at least
# one qualifying row, and columns `symbol` and `n_days`.
get_nDays_above_close <- function(.data, closing_dollar) {

    .data %>%

        # .env$ makes closing_dollar resolve to the function argument even
        # if .data contains a column with the same name.
        filter(close > .env$closing_dollar) %>%

        group_by(symbol) %>%

        # One grouping variable, so dropping the groups here gives the same
        # ungrouped result as a separate ungroup() step
        summarise(n_days = n(), .groups = "drop")
}

prices %>%
    get_nDays_above_close(closing_dollar = 100)

## # A tibble: 5 × 2
##   symbol n_days
##   <chr>   <int>
## 1 AAPL      682
## 2 MSFT     1217
## 3 NFLX     1788
## 4 NVDA      699
## 5 TSLA      684

Week 12: Apply it to your data 11

Chapter 19 Functions

Cody Grube

2022-04-17

Import your data

Create Data frame functions

Example 1: count columns

code snippets

Turn them into a function

Adding arguments for details of operation

Example 2: count rows

code snippets

Turn them into a function

Example 3: count rows

code snippets

Turn them into a function