# Load the flights data set into the workspace
# NOTE(review): presumably shipped with the nycflights13 package, which must
# already be attached — confirm; the library() call is not shown here.
data(flights)
# Print a per-column summary (missing counts, completeness, quantiles,
# histograms), grouped by column type
flights %>% skimr::skim()
| Name | Piped data |
| Number of rows | 336776 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| numeric | 14 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| carrier | 0 | 1.00 | 2 | 2 | 0 | 16 | 0 |
| tailnum | 2512 | 0.99 | 5 | 6 | 0 | 4043 | 0 |
| origin | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| dest | 0 | 1.00 | 3 | 3 | 0 | 105 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2013.00 | 0.00 | 2013 | 2013 | 2013 | 2013 | 2013 | ▁▁▇▁▁ |
| month | 0 | 1.00 | 6.55 | 3.41 | 1 | 4 | 7 | 10 | 12 | ▇▆▆▆▇ |
| day | 0 | 1.00 | 15.71 | 8.77 | 1 | 8 | 16 | 23 | 31 | ▇▇▇▇▆ |
| dep_time | 8255 | 0.98 | 1349.11 | 488.28 | 1 | 907 | 1401 | 1744 | 2400 | ▁▇▆▇▃ |
| sched_dep_time | 0 | 1.00 | 1344.25 | 467.34 | 106 | 906 | 1359 | 1729 | 2359 | ▁▇▇▇▃ |
| dep_delay | 8255 | 0.98 | 12.64 | 40.21 | -43 | -5 | -2 | 11 | 1301 | ▇▁▁▁▁ |
| arr_time | 8713 | 0.97 | 1502.05 | 533.26 | 1 | 1104 | 1535 | 1940 | 2400 | ▁▃▇▇▇ |
| sched_arr_time | 0 | 1.00 | 1536.38 | 497.46 | 1 | 1124 | 1556 | 1945 | 2359 | ▁▃▇▇▇ |
| arr_delay | 9430 | 0.97 | 6.90 | 44.63 | -86 | -17 | -5 | 14 | 1272 | ▇▁▁▁▁ |
| flight | 0 | 1.00 | 1971.92 | 1632.47 | 1 | 553 | 1496 | 3465 | 8500 | ▇▃▃▁▁ |
| air_time | 9430 | 0.97 | 150.69 | 93.69 | 20 | 82 | 129 | 192 | 695 | ▇▂▂▁▁ |
| distance | 0 | 1.00 | 1039.91 | 733.23 | 17 | 502 | 872 | 1389 | 4983 | ▇▃▂▁▁ |
| hour | 0 | 1.00 | 13.18 | 4.66 | 1 | 9 | 13 | 17 | 23 | ▁▇▇▇▅ |
| minute | 0 | 1.00 | 26.23 | 19.30 | 0 | 8 | 29 | 44 | 59 | ▇▃▆▃▅ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| time_hour | 0 | 1 | 2013-01-01 05:00:00 | 2013-12-31 23:00:00 | 2013-07-03 10:00:00 | 6936 |
# Number of numeric columns in flights: keep the numeric columns, count them
ncol_num <- ncol(select(flights, where(is.numeric)))
# Print the count
ncol_num
## [1] 14
# Count the numeric columns of a data frame.
#
# .data: a data frame (or tibble).
#
# Returns the column count of the numeric-only selection of `.data`.
count_numeric_var <- function(.data) {
  # Drop every non-numeric column, then count what is left
  numeric_only <- select(.data, where(is.numeric))
  ncol(numeric_only)
}
# All 19 columns of flights: 14 of them are numeric
flights %>% count_numeric_var()
## [1] 14
# Negative indices drop the first 13 columns; 4 of the remaining 6 are numeric
flights %>% .[, -1:-13] %>% count_numeric_var()
## [1] 4
# Count the columns of a data frame that are of a given type.
#
# .data: a data frame (or tibble).
# type:  which kind of column to count, as a single string; one of
#        "numeric" (the default), "character", "logical" or "factor".
#
# Returns the number of columns of `.data` that are of the requested type.
# Stops with an error listing the valid choices when `type` is not supported.
count_type_of_var <- function(.data, type = "numeric") {
  # Map each supported type name to the predicate that recognises it
  type_checks <- list(
    numeric = is.numeric,
    character = is.character,
    logical = is.logical,
    factor = is.factor
  )

  # Fail fast on an unknown type, so a typo can never produce a result that
  # has nothing to do with .data
  if (!is.character(type) || length(type) != 1L ||
      !type %in% names(type_checks)) {
    stop(
      "`type` must be one of: ",
      paste0('"', names(type_checks), '"', collapse = ", "),
      call. = FALSE
    )
  }
  is_type <- type_checks[[type]]

  .data %>%
    # Select a type of variables
    select(where(is_type)) %>%
    # Count columns
    ncol()
}
# All 19 columns of flights: 4 of them are character
flights %>% count_type_of_var(type = "character")
## [1] 4
# Negative indices drop the first 13 columns; 1 of the remaining 6 is character
flights %>% .[, -1:-13] %>% count_type_of_var(type = "character")
## [1] 1
# Number of rows in flights whose carrier is "UA": keep those rows, count them
nrow_num <- nrow(filter(flights, carrier == "UA"))
# Print the count
nrow_num
## [1] 58665
# Count the rows (flights) that belong to one carrier.
#
# .data:        a data frame with a `carrier` column, e.g. flights.
# carrier_name: carrier code to compare against `carrier`, e.g. "UA".
#
# Returns the number of rows of `.data` whose `carrier` equals `carrier_name`.
count_n_flights_by_carrier <- function(.data, carrier_name) {
  .data %>%
    # filter rows that meet a condition; `.env$` forces the function argument
    # to be used even if .data happens to have a column named carrier_name
    filter(carrier == .env$carrier_name) %>%
    # Count rows
    nrow()
}
# Whole data set: 58665 rows have carrier "UA"
flights %>% count_n_flights_by_carrier(carrier_name = "UA")
## [1] 58665
# Only the first 10 rows: 3 of them have carrier "UA"
flights %>% .[1:10, ] %>% count_n_flights_by_carrier(carrier_name = "UA")
## [1] 3
Create your own.
Use the filter() function to select rows that meet a condition. Refer to Chapter 5.2, "Filter rows with filter()".
# Attach the troopdata package
library(troopdata)
# Convert troopdata to a tibble so it prints compactly; the result is stored
# in the workspace under the same name
# NOTE(review): troopdata is presumably the data set shipped with the package
# — confirm.
troopdata <- tibble(troopdata)
# Print the first rows
troopdata
## # A tibble: 14,435 × 10
## countryname ccode iso3c year troops army navy air_force marine…¹ region
## <chr> <int> <chr> <int> <int> <int> <int> <int> <int> <chr>
## 1 United States 2 USA 1950 941231 NA NA NA NA North…
## 2 United States 2 USA 1951 1645490 NA NA NA NA North…
## 3 United States 2 USA 1952 2338379 NA NA NA NA North…
## 4 United States 2 USA 1953 2017164 NA NA NA NA North…
## 5 United States 2 USA 1954 2159404 NA NA NA NA North…
## 6 United States 2 USA 1955 2003012 NA NA NA NA North…
## 7 United States 2 USA 1956 1913912 NA NA NA NA North…
## 8 United States 2 USA 1957 1830532 NA NA NA NA North…
## 9 United States 2 USA 1958 1786761 NA NA NA NA North…
## 10 United States 2 USA 1959 1556335 NA NA NA NA North…
## # … with 14,425 more rows, and abbreviated variable name ¹marine_corps
# Number of rows recorded for each year
count(troopdata, year)
## # A tibble: 72 × 2
## year n
## <int> <int>
## 1 1950 186
## 2 1951 186
## 3 1952 186
## 4 1953 186
## 5 1954 187
## 6 1955 187
## 7 1956 187
## 8 1957 187
## 9 1958 187
## 10 1959 187
## # … with 62 more rows
# Number of rows recorded for each country name
count(troopdata, countryname)
## # A tibble: 239 × 2
## countryname n
## <chr> <int>
## 1 Aden 56
## 2 Afghanistan 72
## 3 Afloat 2
## 4 Akrotiri 6
## 5 Albania 72
## 6 Algeria 72
## 7 Andorra 29
## 8 Angola 72
## 9 Antigua 58
## 10 Antigua & Barbuda 41
## # … with 229 more rows
# Number of rows recorded for each region
count(troopdata, region)
## # A tibble: 9 × 2
## region n
## <chr> <int>
## 1 Afloat 2
## 2 East Asia & Pacific 2269
## 3 Europe & Central Asia 3806
## 4 Latin America & Caribbean 2581
## 5 Middle East & North Africa 1524
## 6 North America 215
## 7 South Asia 655
## 8 Sub-Saharan Africa 3369
## 9 <NA> 14
# Sum of the marine_corps column over the North America rows for 2021
troopdata %>%
  filter(region == "North America" & year == 2021) %>%
  summarise(n_marines = sum(marine_corps))
## # A tibble: 1 × 1
## n_marines
## <int>
## 1 147863
# Sum of the marine_corps column over the East Asia & Pacific rows for 1990
# NOTE(review): the result printed below is NA. sum() returns NA as soon as
# one summed value is missing, so presumably marine_corps is NA for at least
# one matching row. Use sum(marine_corps, na.rm = TRUE) to total only the
# non-missing values.
troopdata %>%
filter(region == "East Asia & Pacific", year == 1990) %>%
summarise(n_marines = sum(marine_corps))
## # A tibble: 1 × 1
## n_marines
## <int>
## 1 NA