# Read the CSV file from the project's data folder into a data frame.
data <- read.csv("../00_data/myData.csv")
# Print a skimr summary of every column, grouped by variable type
# (missing counts, completeness, and per-type statistics).
data %>% skimr::skim()
Name | Piped data |
Number of rows | 691 |
Number of columns | 22 |
_______________________ | |
Column type frequency: | |
character | 8 |
numeric | 14 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
sort_name | 0 | 1.00 | 1 | 34 | 0 | 391 | 0 |
clean_name | 0 | 1.00 | 1 | 34 | 0 | 386 | 0 |
album | 0 | 1.00 | 1 | 69 | 0 | 685 | 0 |
genre | 164 | 0.76 | 5 | 35 | 0 | 16 | 0 |
type | 0 | 1.00 | 4 | 13 | 0 | 5 | 0 |
spotify_url | 36 | 0.95 | 22 | 36 | 0 | 655 | 0 |
artist_gender | 5 | 0.99 | 4 | 11 | 0 | 3 | 0 |
album_id | 0 | 1.00 | 6 | 22 | 0 | 691 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
X | 0 | 1.00 | 346.00 | 199.62 | 1 | 173.50 | 346.0 | 518.50 | 691 | ▇▇▇▇▇ |
rank_2003 | 191 | 0.72 | 250.50 | 144.49 | 1 | 125.75 | 250.5 | 375.25 | 500 | ▇▇▇▇▇ |
rank_2012 | 191 | 0.72 | 250.50 | 144.48 | 1 | 125.75 | 250.5 | 375.25 | 500 | ▇▇▇▇▇ |
rank_2020 | 191 | 0.72 | 250.50 | 144.48 | 1 | 125.75 | 250.5 | 375.25 | 500 | ▇▇▇▇▇ |
differential | 0 | 1.00 | -12.32 | 199.04 | -501 | -137.50 | -8.0 | 106.00 | 484 | ▂▅▇▃▂ |
release_year | 0 | 1.00 | 1982.87 | 14.55 | 1955 | 1971.00 | 1979.0 | 1994.00 | 2019 | ▂▇▃▃▂ |
weeks_on_billboard | 119 | 0.83 | 64.27 | 75.14 | 1 | 20.75 | 44.5 | 81.00 | 741 | ▇▁▁▁▁ |
peak_billboard_position | 0 | 1.00 | 61.19 | 77.16 | 1 | 2.00 | 17.0 | 111.50 | 201 | ▇▁▁▁▂ |
spotify_popularity | 37 | 0.95 | 55.81 | 14.95 | 10 | 46.00 | 57.0 | 68.00 | 91 | ▁▃▇▇▂ |
artist_member_count | 5 | 0.99 | 2.75 | 2.02 | 1 | 1.00 | 2.0 | 4.00 | 12 | ▇▅▁▁▁ |
artist_birth_year_sum | 5 | 0.99 | 5363.21 | 3947.13 | 1910 | 1948.00 | 3896.0 | 7845.00 | 23368 | ▇▅▁▁▁ |
debut_album_release_year | 5 | 0.99 | 1976.87 | 14.96 | 1934 | 1966.25 | 1973.0 | 1989.00 | 2019 | ▁▇▇▅▂ |
ave_age_at_top_500 | 5 | 0.99 | 29.61 | 9.35 | 17 | 24.04 | 27.0 | 31.00 | 88 | ▇▂▁▁▁ |
years_between | 5 | 0.99 | 5.93 | 8.42 | 0 | 1.00 | 3.0 | 7.00 | 54 | ▇▁▁▁▁ |
# Keep only the numeric columns of flights and count how many remain
ncol_num <- ncol(select(flights, where(is.numeric)))
# Show the result
ncol_num
## [1] 14
# Count how many columns of a data frame are numeric.
#
# .data: a data frame (or tibble).
# Returns the number of columns for which is.numeric() is TRUE.
count_ncol_numeric <- function(.data) {
  # Drop every column that is not numeric
  numeric_only <- select(.data, where(is.numeric))
  # The width of what is left is the answer
  ncol(numeric_only)
}
# Numeric columns in the full flights table
count_ncol_numeric(flights)
## [1] 14
# Numeric columns in the first 10 rows after dropping the first 13 columns
count_ncol_numeric(flights[1:10, -(1:13)])
## [1] 4
# Count how many columns of a data frame are of a given type.
#
# .data:     a data frame (or tibble).
# type_data: which kind of column to count; one of "numeric" (the default)
#            or "character".
# Returns the number of columns of the requested type.
# Stops with an explicit error when type_data is not a supported value.
count_ncol_type <- function(.data, type_data = "numeric") {
  # One predicate per supported type; this replaces two copy-pasted branches
  type_checks <- list(
    numeric = is.numeric,
    character = is.character
  )

  # Validate up front: an unsupported value used to skip both branches and
  # fail later with the unhelpful "object 'ncol_type' not found"
  if (!is.character(type_data) || length(type_data) != 1L ||
      !type_data %in% names(type_checks)) {
    stop(
      "`type_data` must be one of: ",
      paste0('"', names(type_checks), '"', collapse = ", "),
      call. = FALSE
    )
  }

  is_requested_type <- type_checks[[type_data]]

  .data %>%
    # Select the columns of the requested type
    select(where(is_requested_type)) %>%
    # Count columns
    ncol()
}
# Default type ("numeric") on the full flights table
count_ncol_type(flights)
## [1] 14
# Character columns in the full flights table
count_ncol_type(flights, type_data = "character")
## [1] 4
# Character columns among the first 5 columns of the first 10 rows
count_ncol_type(flights[1:10, 1:5], type_data = "character")
## [1] 0
# Keep only the rows whose carrier code is "DL" and count them
nrow_num <- nrow(filter(flights, carrier == "DL"))
# Show the result
nrow_num
## [1] 48110
# Count the rows of a data frame that belong to one carrier.
#
# .data:        a data frame (or tibble) with a `carrier` column.
# carrier_name: the carrier value to match, e.g. "UA".
# Returns the number of rows whose `carrier` equals carrier_name.
count_num_flights_by_carrier <- function(.data, carrier_name) {
  # Keep only the rows for the requested carrier
  matching_rows <- filter(.data, carrier == carrier_name)
  # The height of what is left is the answer
  nrow(matching_rows)
}
# "UA" rows among the first 10 rows of the carrier column
count_num_flights_by_carrier(flights[1:10, "carrier"], carrier_name = "UA")
## [1] 3
# Read the same CSV file again so this section starts from the raw data.
data <- read.csv("../00_data/myData.csv")
Create your own.
Use the filter() function to select rows that meet a condition. Refer to Chapter 5.2 Filter rows with filter()
# Keep only the rows whose type is "Studio" and count them
nrow_num <- nrow(filter(data, type == "Studio"))
# Show the result
nrow_num
## [1] 608
# Count the rows of a data frame whose `type` column equals a given value.
#
# data: a data frame (or tibble) with a `type` column.
# type: the value to match against the `type` column, e.g. "Studio".
# Returns the number of rows whose `type` equals the requested value.
count_num_studio_by_type <- function(data, type) {
  data %>%
    # The argument has the same name as the column, so a bare `type == type`
    # would compare the column with itself and keep every non-missing row.
    # `.data$type` is the column; `.env$type` is the function argument.
    # (The previous code compared against an undefined symbol, `studio`.)
    filter(.data$type == .env$type) %>%
    # Count rows
    nrow()
}