Import your data
# Load the `flights` data set into the workspace. It must come from an
# attached package (presumably nycflights13 -- TODO confirm in the setup chunk).
data("flights")

# Print a per-column summary; piping keeps the reported name as "Piped data".
flights %>%
  skimr::skim()
Data summary
Name |
Piped data |
Number of rows |
336776 |
Number of columns |
19 |
_______________________ |
|
Column type frequency: |
|
character |
4 |
numeric |
14 |
POSIXct |
1 |
________________________ |
|
Group variables |
None |
Variable type: character
carrier |
0 |
1.00 |
2 |
2 |
0 |
16 |
0 |
tailnum |
2512 |
0.99 |
5 |
6 |
0 |
4043 |
0 |
origin |
0 |
1.00 |
3 |
3 |
0 |
3 |
0 |
dest |
0 |
1.00 |
3 |
3 |
0 |
105 |
0 |
Variable type: numeric
year |
0 |
1.00 |
2013.00 |
0.00 |
2013 |
2013 |
2013 |
2013 |
2013 |
▁▁▇▁▁ |
month |
0 |
1.00 |
6.55 |
3.41 |
1 |
4 |
7 |
10 |
12 |
▇▆▆▆▇ |
day |
0 |
1.00 |
15.71 |
8.77 |
1 |
8 |
16 |
23 |
31 |
▇▇▇▇▆ |
dep_time |
8255 |
0.98 |
1349.11 |
488.28 |
1 |
907 |
1401 |
1744 |
2400 |
▁▇▆▇▃ |
sched_dep_time |
0 |
1.00 |
1344.25 |
467.34 |
106 |
906 |
1359 |
1729 |
2359 |
▁▇▇▇▃ |
dep_delay |
8255 |
0.98 |
12.64 |
40.21 |
-43 |
-5 |
-2 |
11 |
1301 |
▇▁▁▁▁ |
arr_time |
8713 |
0.97 |
1502.05 |
533.26 |
1 |
1104 |
1535 |
1940 |
2400 |
▁▃▇▇▇ |
sched_arr_time |
0 |
1.00 |
1536.38 |
497.46 |
1 |
1124 |
1556 |
1945 |
2359 |
▁▃▇▇▇ |
arr_delay |
9430 |
0.97 |
6.90 |
44.63 |
-86 |
-17 |
-5 |
14 |
1272 |
▇▁▁▁▁ |
flight |
0 |
1.00 |
1971.92 |
1632.47 |
1 |
553 |
1496 |
3465 |
8500 |
▇▃▃▁▁ |
air_time |
9430 |
0.97 |
150.69 |
93.69 |
20 |
82 |
129 |
192 |
695 |
▇▂▂▁▁ |
distance |
0 |
1.00 |
1039.91 |
733.23 |
17 |
502 |
872 |
1389 |
4983 |
▇▃▂▁▁ |
hour |
0 |
1.00 |
13.18 |
4.66 |
1 |
9 |
13 |
17 |
23 |
▁▇▇▇▅ |
minute |
0 |
1.00 |
26.23 |
19.30 |
0 |
8 |
29 |
44 |
59 |
▇▃▆▃▅ |
Variable type: POSIXct
time_hour |
0 |
1 |
2013-01-01 05:00:00 |
2013-12-31 23:00:00 |
2013-07-03 10:00:00 |
6936 |
# Read the NFL attendance workbook (path is relative to this document).
# NOTE(review): the summary below reports `weekly_attendance` as character
# with a minimum string length of 2 -- presumably literal "NA" cells; consider
# `read_excel(..., na = "NA")` if the column is meant to be numeric.
attendance <- read_excel(path = "../00_data/nfl_attendance.xlsx")

# Print a per-column summary of the imported table.
attendance %>%
  skimr::skim()
Data summary
Name |
Piped data |
Number of rows |
10846 |
Number of columns |
8 |
_______________________ |
|
Column type frequency: |
|
character |
3 |
numeric |
5 |
________________________ |
|
Group variables |
None |
Variable type: character
team |
0 |
1 |
5 |
13 |
0 |
32 |
0 |
team_name |
0 |
1 |
4 |
10 |
0 |
32 |
0 |
weekly_attendance |
0 |
1 |
2 |
6 |
0 |
4074 |
0 |
Variable type: numeric
year |
0 |
1 |
2009.53 |
5.75 |
2000 |
2005 |
2010 |
2015 |
2019 |
▇▇▇▇▇ |
total |
0 |
1 |
1080910.03 |
72876.97 |
760644 |
1040509 |
1081090 |
1123230 |
1322087 |
▁▁▇▆▁ |
home |
0 |
1 |
540455.01 |
66774.65 |
202687 |
504360 |
543185 |
578342 |
741775 |
▁▁▅▇▁ |
away |
0 |
1 |
540455.01 |
25509.33 |
450295 |
524974 |
541757 |
557741 |
601655 |
▁▂▇▇▂ |
week |
0 |
1 |
9.00 |
4.90 |
1 |
5 |
9 |
13 |
17 |
▇▆▆▆▇ |
# Read the NFL standings workbook (path is relative to this document).
standings <- read_excel(path = "../00_data/nfl_standings.xlsx")

# Print a per-column summary of the imported table.
standings %>%
  skimr::skim()
Data summary
Name |
Piped data |
Number of rows |
638 |
Number of columns |
15 |
_______________________ |
|
Column type frequency: |
|
character |
4 |
numeric |
11 |
________________________ |
|
Group variables |
None |
Variable type: character
team |
0 |
1 |
5 |
13 |
0 |
32 |
0 |
team_name |
0 |
1 |
4 |
10 |
0 |
32 |
0 |
playoffs |
0 |
1 |
8 |
11 |
0 |
2 |
0 |
sb_winner |
0 |
1 |
12 |
13 |
0 |
2 |
0 |
Variable type: numeric
year |
0 |
1 |
2009.53 |
5.76 |
2000.0 |
2005.00 |
2010.0 |
2014.75 |
2019.0 |
▇▇▇▇▇ |
wins |
0 |
1 |
7.98 |
3.08 |
0.0 |
6.00 |
8.0 |
10.00 |
16.0 |
▂▆▇▆▂ |
loss |
0 |
1 |
7.98 |
3.08 |
0.0 |
6.00 |
8.0 |
10.00 |
16.0 |
▂▆▇▆▂ |
points_for |
0 |
1 |
350.28 |
71.40 |
161.0 |
299.00 |
348.0 |
396.00 |
606.0 |
▂▇▇▂▁ |
points_against |
0 |
1 |
350.28 |
59.55 |
165.0 |
310.00 |
347.0 |
391.50 |
517.0 |
▁▃▇▆▁ |
points_differential |
0 |
1 |
0.00 |
101.09 |
-261.0 |
-75.00 |
1.5 |
72.75 |
315.0 |
▂▆▇▅▁ |
margin_of_victory |
0 |
1 |
0.00 |
6.32 |
-16.3 |
-4.70 |
0.1 |
4.57 |
19.7 |
▂▆▇▅▁ |
strength_of_schedule |
0 |
1 |
0.00 |
1.63 |
-4.6 |
-1.10 |
0.0 |
1.20 |
4.3 |
▁▅▇▅▁ |
simple_rating |
0 |
1 |
0.00 |
6.20 |
-17.4 |
-4.47 |
0.0 |
4.50 |
20.1 |
▁▆▇▅▁ |
offensive_ranking |
0 |
1 |
0.00 |
4.34 |
-11.7 |
-3.18 |
0.0 |
2.70 |
15.9 |
▁▇▇▂▁ |
defensive_ranking |
0 |
1 |
0.00 |
3.57 |
-9.8 |
-2.40 |
0.1 |
2.50 |
9.8 |
▁▅▇▅▁ |
Create Data frame functions
Example 1: count columns
code snippets
# Keep only the numeric columns of `flights`, then count how many remain.
ncol_num <- ncol(select(flights, where(is.numeric)))
ncol_num
## [1] 14
Turn them into a function
# Count the numeric columns of a data frame.
#
# .data: a data frame (or tibble).
# Returns the number of columns for which `is.numeric()` is TRUE.
count_ncol_numeric <- function(.data) {
  numeric_cols <- select(.data, where(is.numeric))
  ncol(numeric_cols)
}
# Whole table
count_ncol_numeric(flights)
## [1] 14
# First ten rows, with the first thirteen columns dropped
count_ncol_numeric(flights[1:10, -(1:13)])
## [1] 4
Adding arguments for details of operation
# Count the columns of a data frame that are of a given type.
#
# .data:     a data frame (or tibble).
# type_data: which kind of column to count, either "numeric" (the default)
#            or "character".
# Returns the number of matching columns. Any other `type_data` stops with an
# error listing the accepted values.
count_ncol_type <- function(.data, type_data = "numeric") {
  # Validate up front: previously an unsupported value fell through both
  # branches and failed later with "object 'ncol_type' not found".
  type_data <- match.arg(type_data, choices = c("numeric", "character"))

  # Pick the predicate once instead of duplicating the whole pipeline.
  is_type <- switch(type_data,
    numeric = is.numeric,
    character = is.character
  )

  ncol_type <- .data %>%
    # Select the columns of the requested type
    select(where(is_type)) %>%
    # Count columns
    ncol()

  return(ncol_type)
}
# Default type ("numeric")
count_ncol_type(flights)
## [1] 14
# Character columns of the whole table
count_ncol_type(flights, type_data = "character")
## [1] 4
# Character columns among the first five columns of the first ten rows
count_ncol_type(flights[1:10, 1:5], type_data = "character")
## [1] 0
Example 2: count rows
code snippets
# Keep the rows whose carrier is "UA", then count them.
nrow_num <- nrow(filter(flights, carrier == "UA"))
nrow_num
## [1] 58665
Turn them into a function
# Count the rows of a data frame that belong to one carrier.
#
# .data:        a data frame with a `carrier` column.
# carrier_name: the carrier code to match (a single value, e.g. "UA").
# Returns the number of rows whose `carrier` equals `carrier_name`.
count_num_flights_by_carrier <- function(.data, carrier_name) {
  nrow_num <- .data %>%
    # `.env$` forces `carrier_name` to be the function argument; without it a
    # column of the same name in `.data` would silently take precedence.
    filter(carrier == .env$carrier_name) %>%
    # Count rows
    nrow()

  return(nrow_num)
}
# First ten rows, carrier column only: count the "UA" flights
count_num_flights_by_carrier(flights[1:10, "carrier"], carrier_name = "UA")
## [1] 3
# Same subset: count the "AA" flights
count_num_flights_by_carrier(flights[1:10, "carrier"], carrier_name = "AA")
## [1] 2
Example 3: count rows
Create your own.
code snippets
Use the filter() function to select rows that meet a condition. Refer
to Chapter 5.2 Filter rows with filter()
# Keep the standings rows for the Patriots, then count them.
nrows_num <- nrow(filter(standings, team_name == "Patriots"))
nrows_num
## [1] 20
Turn them into a function
# Count the rows of a data frame that belong to one team.
#
# .data:         a data frame with a `team_name` column.
# team_name_txt: the team name to match (a single value, e.g. "Patriots").
# Returns the number of rows whose `team_name` equals `team_name_txt`.
count_num_seasons_by_team <- function(.data, team_name_txt) {
  nrows_num <- .data %>%
    # `.env$` forces `team_name_txt` to be the function argument; without it a
    # column of the same name in `.data` would silently take precedence.
    filter(team_name == .env$team_name_txt) %>%
    # Count rows
    nrow()

  return(nrows_num)
}
count_num_seasons_by_team(standings, team_name_txt = "Patriots")
## [1] 20
count_num_seasons_by_team(standings, team_name_txt = "Texans")
## [1] 18
# The Texans have only 18 seasons in the data, versus the Patriots' 20,
# because the Texans were founded in 2002.