# Load the `flights` dataset into the workspace.
# NOTE(review): presumably supplied by the nycflights13 package, which would
# need to be attached before this line runs — confirm against the setup chunk.
data(flights)
# Print a skimr summary of every column. The data frame is piped in rather
# than passed by name, and the summary below labels it "Piped data".
flights %>% skimr::skim()
Name | Piped data |
Number of rows | 336776 |
Number of columns | 19 |
_______________________ | |
Column type frequency: | |
character | 4 |
numeric | 14 |
POSIXct | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
carrier | 0 | 1.00 | 2 | 2 | 0 | 16 | 0 |
tailnum | 2512 | 0.99 | 5 | 6 | 0 | 4043 | 0 |
origin | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
dest | 0 | 1.00 | 3 | 3 | 0 | 105 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2013.00 | 0.00 | 2013 | 2013 | 2013 | 2013 | 2013 | ▁▁▇▁▁ |
month | 0 | 1.00 | 6.55 | 3.41 | 1 | 4 | 7 | 10 | 12 | ▇▆▆▆▇ |
day | 0 | 1.00 | 15.71 | 8.77 | 1 | 8 | 16 | 23 | 31 | ▇▇▇▇▆ |
dep_time | 8255 | 0.98 | 1349.11 | 488.28 | 1 | 907 | 1401 | 1744 | 2400 | ▁▇▆▇▃ |
sched_dep_time | 0 | 1.00 | 1344.25 | 467.34 | 106 | 906 | 1359 | 1729 | 2359 | ▁▇▇▇▃ |
dep_delay | 8255 | 0.98 | 12.64 | 40.21 | -43 | -5 | -2 | 11 | 1301 | ▇▁▁▁▁ |
arr_time | 8713 | 0.97 | 1502.05 | 533.26 | 1 | 1104 | 1535 | 1940 | 2400 | ▁▃▇▇▇ |
sched_arr_time | 0 | 1.00 | 1536.38 | 497.46 | 1 | 1124 | 1556 | 1945 | 2359 | ▁▃▇▇▇ |
arr_delay | 9430 | 0.97 | 6.90 | 44.63 | -86 | -17 | -5 | 14 | 1272 | ▇▁▁▁▁ |
flight | 0 | 1.00 | 1971.92 | 1632.47 | 1 | 553 | 1496 | 3465 | 8500 | ▇▃▃▁▁ |
air_time | 9430 | 0.97 | 150.69 | 93.69 | 20 | 82 | 129 | 192 | 695 | ▇▂▂▁▁ |
distance | 0 | 1.00 | 1039.91 | 733.23 | 17 | 502 | 872 | 1389 | 4983 | ▇▃▂▁▁ |
hour | 0 | 1.00 | 13.18 | 4.66 | 1 | 9 | 13 | 17 | 23 | ▁▇▇▇▅ |
minute | 0 | 1.00 | 26.23 | 19.30 | 0 | 8 | 29 | 44 | 59 | ▇▃▆▃▅ |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
time_hour | 0 | 1 | 2013-01-01 05:00:00 | 2013-12-31 23:00:00 | 2013-07-03 10:00:00 | 6936 |
# Read the UFO sightings CSV; the path is relative to this document, and
# column types are guessed by read_csv() (see the message printed below).
ufo_sightings <- read_csv(file = "../00_data/ufo_sightings.csv")
## Rows: 60632 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Location.City, Location.State, Location.Country, Data.Shape, Data....
## dbl (11): Data.Encounter duration, Location.Coordinates.Latitude, Location.C...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a skimr summary of every column in the UFO data. The data frame is
# piped in rather than passed by name, and the summary labels it "Piped data".
ufo_sightings %>% skimr::skim()
Name | Piped data |
Number of rows | 60632 |
Number of columns | 16 |
_______________________ | |
Column type frequency: | |
character | 5 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Location.City | 0 | 1 | 3 | 23 | 0 | 9149 | 0 |
Location.State | 0 | 1 | 2 | 2 | 0 | 51 | 0 |
Location.Country | 0 | 1 | 2 | 2 | 0 | 1 | 0 |
Data.Shape | 0 | 1 | 3 | 9 | 0 | 28 | 0 |
Data.Description excerpt | 0 | 1 | 1 | 246 | 0 | 60410 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Data.Encounter duration | 0 | 1 | 5410.13 | 414386.67 | 0.01 | 30.00 | 180.00 | 600.00 | 66276000.00 | ▇▁▁▁▁ |
Location.Coordinates.Latitude | 0 | 1 | 38.31 | 5.55 | 19.43 | 34.09 | 38.90 | 41.92 | 70.64 | ▁▇▆▁▁ |
Location.Coordinates.Longitude | 0 | 1 | -95.58 | 18.03 | -170.48 | -114.34 | -89.91 | -81.03 | -66.98 | ▁▁▅▅▇ |
Dates.Sighted.Year | 0 | 1 | 2004.45 | 10.18 | 1910.00 | 2002.00 | 2007.00 | 2011.00 | 2014.00 | ▁▁▁▁▇ |
Dates.Sighted.Month | 0 | 1 | 6.87 | 3.25 | 1.00 | 4.00 | 7.00 | 10.00 | 12.00 | ▆▅▇▆▇ |
Date.Sighted.Day | 0 | 1 | 15.03 | 8.92 | 1.00 | 7.00 | 15.00 | 22.00 | 31.00 | ▇▆▇▆▅ |
Dates.Sighted.Hour | 0 | 1 | 15.81 | 7.54 | 0.00 | 11.00 | 19.00 | 21.00 | 23.00 | ▂▁▁▂▇ |
Dates.Sighted.Minute | 0 | 1 | 17.72 | 17.92 | 0.00 | 0.00 | 15.00 | 30.00 | 59.00 | ▇▂▅▂▁ |
Dates.Documented.Year | 0 | 1 | 2007.40 | 4.48 | 1998.00 | 2004.00 | 2008.00 | 2012.00 | 2014.00 | ▃▃▅▅▇ |
Dates.Documented.Month | 0 | 1 | 6.71 | 3.49 | 1.00 | 4.00 | 7.00 | 10.00 | 12.00 | ▇▅▅▅▇ |
Dates.Documented.Day | 0 | 1 | 15.23 | 8.79 | 1.00 | 8.00 | 14.00 | 22.00 | 31.00 | ▇▇▆▅▆ |
# Count the numeric columns in flights: keep only the columns that pass
# is.numeric(), then take the column count of what remains.
ncol_num <- ncol(select(flights, where(is.numeric)))

# Show the result
ncol_num
## [1] 14
#' Count the numeric columns of a data frame
#'
#' @param .data A data frame or tibble.
#' @return The number of columns kept by `select(where(is.numeric))`.
count_ncol_numeric <- function(.data) {
  # Keep only the columns that pass is.numeric()
  numeric_cols <- select(.data, where(is.numeric))

  # The column count of the reduced data is the function's value
  ncol(numeric_cols)
}
# Numeric columns in the full data
count_ncol_numeric(flights)
## [1] 14
# Numeric columns among the first 10 rows and first 5 columns
count_ncol_numeric(flights[1:10, 1:5])
## [1] 5
# Numeric columns that remain after dropping the first 13 columns
count_ncol_numeric(flights[1:10, -(1:13)])
## [1] 4
#' Count the columns of a given type in a data frame
#'
#' @param .data A data frame or tibble.
#' @param type_data Which column type to count: `"numeric"` (the default) or
#'   `"character"`. Any other value is an error.
#' @return The number of columns of the requested type.
count_ncol_type <- function(.data, type_data = "numeric") {
  # Map each supported type name to the predicate that detects it
  type_checks <- list(numeric = is.numeric, character = is.character)

  # Fail early with a clear message instead of falling through every branch
  # and ending in an "object not found" error on the result variable
  is_supported <- is.character(type_data) &&
    length(type_data) == 1L &&
    type_data %in% names(type_checks)
  if (!is_supported) {
    stop(
      "`type_data` must be one of: ",
      paste0("\"", names(type_checks), "\"", collapse = ", "),
      call. = FALSE
    )
  }

  is_type <- type_checks[[type_data]]

  .data %>%
    # Select the columns of the requested type
    select(where(is_type)) %>%
    # Count columns
    ncol()
}
# Default type: numeric columns in the full data
count_ncol_type(flights)
## [1] 14
# Character columns in the full data
count_ncol_type(flights, type_data = "character")
## [1] 4
# Character columns among the first 10 rows and first 5 columns
count_ncol_type(flights[1:10, 1:5], type_data = "character")
## [1] 0
# Count the flights whose carrier code is "DL": keep the matching rows,
# then take the row count of what remains.
nrow_num <- nrow(filter(flights, carrier == "DL"))

# Show the result
nrow_num
## [1] 48110
#' Count the rows belonging to one carrier
#'
#' @param .data A data frame or tibble with a `carrier` column.
#' @param carrier_name Carrier code to match against `carrier`, e.g. `"AA"`.
#' @return The number of rows whose `carrier` equals `carrier_name`.
count_num_flights_by_carrier <- function(.data, carrier_name) {
  .data %>%
    # `.env$` reads `carrier_name` from the function argument, so a data
    # column that happens to share that name cannot shadow it
    filter(carrier == .env$carrier_name) %>%
    # Count rows
    nrow()
}
# "AA" flights among the first 10 rows (only the carrier column is needed)
count_num_flights_by_carrier(flights[1:10, "carrier"], carrier_name = "AA")
## [1] 2
Create your own.
Use the filter() function to select rows that meet a condition. Refer to Chapter 5.2, "Filter rows with filter()".
nrow_num <- ufo_sightings %>%
  # Dates.Sighted.Year is a numeric column, so compare it with a number;
  # comparing with the string "1910" would coerce the whole column to text
  filter(Dates.Sighted.Year == 1910) %>%
  # Count rows
  nrow()
nrow_num
## [1] 1
#' Count the sightings recorded for one year
#'
#' @param .data A data frame or tibble with a `Dates.Sighted.Year` column.
#' @param year_sighted Year to match against `Dates.Sighted.Year`.
#' @return The number of rows whose `Dates.Sighted.Year` equals
#'   `year_sighted`.
count_num_year_of_sighting <- function(.data, year_sighted) {
  .data %>%
    # `.env$` reads `year_sighted` from the function argument, so a data
    # column that happens to share that name cannot shadow it
    filter(Dates.Sighted.Year == .env$year_sighted) %>%
    # Count rows
    nrow()
}
# The year is passed as a number because Dates.Sighted.Year is a numeric column
# 2014 sightings in the full data, piped
ufo_sightings %>% count_num_year_of_sighting(year_sighted = 2014)
## [1] 1894
# Same call without the pipe
count_num_year_of_sighting(ufo_sightings, year_sighted = 2014)
## [1] 1894
# 2014 sightings among the first 10 rows
ufo_sightings %>% .[1:10, "Dates.Sighted.Year"] %>% count_num_year_of_sighting(year_sighted = 2014)
## [1] 1
# 2014 sightings among the first 500 rows
ufo_sightings %>% .[1:500, "Dates.Sighted.Year"] %>% count_num_year_of_sighting(year_sighted = 2014)
## [1] 34
# 2014 sightings among the first 30316 rows
ufo_sightings %>% .[1:30316, "Dates.Sighted.Year"] %>% count_num_year_of_sighting(year_sighted = 2014)
## [1] 1018