# Point the session at the module's data folder so the workbook below can be
# read by its bare file name.
# NOTE(review): setwd() with a machine-specific path makes this script
# non-portable; consider passing a project-relative path to read_excel()
# instead -- confirm nothing else depends on the working directory first.
setwd("~/Desktop/PSU_DAT3000_IntroToDA/05_module8/data/")
# Read the workbook into `data`.
# NOTE(review): read_excel() is presumably readxl's, attached outside this
# view -- confirm.
data <- read_excel("My_Data.xlsx")
# Print the data to inspect it
data
## # A tibble: 1,302 × 9
## Language Endonym `World Region` Country `Global Speakers` `Language Family`
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Abakuá Abakuá Caribbean "Cuba" NA <NA>
## 2 Abaza Абаза Western Asia "Turke… 49800 Abkhaz-Adyge
## 3 Abruzzese… Abruzz… Southern Euro… "Italy" NA Indo-European
## 4 Abruzzese… Abruzz… Southern Euro… "Italy" NA Indo-European
## 5 Acehnese Bahsa … Southeastern … "Indon… 3500000 Austronesian
## 6 Acehnese Bahsa … Southeastern … "Indon… 3500000 Austronesian
## 7 Adjoukrou <NA> Western Africa "Ivory… 140000 Atlantic-Congo
## 8 Adyghe <NA> Western Asia "Turke… 117500 Abkhaz-Adyge
## 9 Afenmai Afenmai Western Africa "Niger… 270000 Atlantic-Congo
## 10 African-A… Black … Northern Amer… "Unite… 45109521 Indo-European
## # ℹ 1,292 more rows
## # ℹ 3 more variables: Location <chr>, Size <chr>, Status <chr>
# Summarise every column (missing counts, unique values, distribution
# statistics) with skimr, called through its namespace.
data %>% skimr::skim()
| Name | Piped data |
| Number of rows | 1302 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Language | 0 | 1.00 | 2 | 29 | 0 | 749 | 0 |
| Endonym | 168 | 0.87 | 2 | 27 | 0 | 635 | 0 |
| World Region | 0 | 1.00 | 9 | 25 | 0 | 22 | 0 |
| Country | 0 | 1.00 | 4 | 59 | 0 | 354 | 0 |
| Language Family | 8 | 0.99 | 4 | 23 | 0 | 49 | 0 |
| Location | 0 | 1.00 | 4 | 41 | 0 | 259 | 0 |
| Size | 0 | 1.00 | 5 | 8 | 0 | 5 | 0 |
| Status | 0 | 1.00 | 8 | 11 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Global Speakers | 176 | 0.86 | 30428972 | 105984908 | 100 | 346647.5 | 2408000 | 15870835 | 1116596640 | ▇▁▁▁▁ |
# Keep only the columns that pass is.numeric(), then count what is left
ncol_num <- ncol(select(data, where(is.numeric)))
# Show the count
ncol_num
## [1] 1
# Count the numeric columns of a data frame.
#
# .data: a data frame or tibble.
# Returns the number of columns for which is.numeric() is TRUE.
count_ncol_numeric <- function(.data) {
  # Drop every column that is not numeric
  numeric_only <- select(.data, where(is.numeric))
  # The remaining column count is the answer
  ncol(numeric_only)
}
# Count the numeric columns of the full data set
count_ncol_numeric(data)
## [1] 1
# Same count on a subset: first ten rows, with the first three columns dropped
count_ncol_numeric(data[1:10, -(1:3)])
## [1] 1
# Count the columns of a given type in a data frame.
#
# .data:     a data frame or tibble.
# type_data: which kind of column to count, either "numeric" (the default)
#            or "character".
# Returns the number of columns of the requested type.
# Stops with an informative error when `type_data` is not a supported value.
count_ncol_type <- function(.data, type_data = "numeric") {
  # Validate before touching the data, so that a bad argument is reported as
  # such instead of surfacing later as "object 'ncol_type' not found".
  if (!is.character(type_data) || length(type_data) != 1L || is.na(type_data)) {
    stop(
      "`type_data` must be a single string: \"numeric\" or \"character\".",
      call. = FALSE
    )
  }

  # Map the requested type to its predicate; the unnamed last element is the
  # fallback for any unsupported value.
  type_test <- switch(type_data,
    numeric = is.numeric,
    character = is.character,
    stop(
      "`type_data` must be \"numeric\" or \"character\", not \"",
      type_data, "\".",
      call. = FALSE
    )
  )

  .data %>%
    # Select the columns of the requested type
    select(where(type_test)) %>%
    # Count columns
    ncol()
}
# Default type ("numeric") on the full data set
count_ncol_type(data)
## [1] 1
# Count the character columns instead
count_ncol_type(data, type_data = "character")
## [1] 8
# Character columns in a subset: first ten rows, columns one to nine
count_ncol_type(data[1:10, 1:9], type_data = "character")
## [1] 8
# Keep the rows whose Status is "Historical", then count them
nrow_num <- nrow(filter(data, Status == "Historical"))
# Show the count
nrow_num
## [1] 84
# Count the rows of a data frame that have a given status.
#
# .data:       a data frame or tibble with a `Status` column.
# status_name: the status value to match.
# Returns the number of rows whose `Status` equals `status_name`.
count_num_data_by_status <- function(.data, status_name) {
  # Keep only the rows with the requested status
  matching_rows <- filter(.data, Status == status_name)
  # The number of rows kept is the answer
  nrow(matching_rows)
}
# Count "Historical" rows among the first ten, using only the Status column
count_num_data_by_status(data[1:10, "Status"], status_name = "Historical")
## [1] 1
Create your own.
Use the filter() function to select rows that meet a condition. Refer to Chapter 5.2, "Filter rows with filter()".
# Number of rows whose Status is "Historical": filter first, count second
nrow_num <- nrow(
  filter(data, Status == "Historical")
)
# Print the result
nrow_num
## [1] 84
# Count how many rows of `.data` carry the status `status_name`.
#
# .data:       a data frame or tibble with a `Status` column.
# status_name: the status value to look for.
# Returns the number of matching rows.
count_num_data_by_status <- function(.data, status_name) {
  # Filter to the requested status and count in one expression; its value is
  # what the function returns.
  nrow(filter(.data, Status == status_name))
}
# Try the function on the Status column of the first ten rows
count_num_data_by_status(
  data[1:10, "Status"],
  status_name = "Historical"
)
## [1] 1