# excel file
data <- read_xlsx("../00_data/my data q&a.xlsx")
data
## # A tibble: 269,732 × 15
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 1 A Dijia… M 24 180 80 China CHN 1992… 1992 Summer Barc…
## 2 2 A Lamusi M 23 170 60 China CHN 2012… 2012 Summer Lond…
## 3 3 Gunnar … M 24 NA NA Denm… DEN 1920… 1920 Summer Antw…
## 4 4 Edgar L… M 34 NA NA Denm… DEN 1900… 1900 Summer Paris
## 5 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## 6 5 Christi… F 21 185 82 Neth… NED 1988… 1988 Winter Calg…
## 7 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
## 8 5 Christi… F 25 185 82 Neth… NED 1992… 1992 Winter Albe…
## 9 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
## 10 5 Christi… F 27 185 82 Neth… NED 1994… 1994 Winter Lill…
## # ℹ 269,722 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
set.seed(1234)
depart_small <- data %>%
sample_n(5) %>%
select(age, height, weight)
depart_small
## # A tibble: 5 × 3
## age height weight
## <chr> <chr> <chr>
## 1 25 NA NA
## 2 25 196 86
## 3 25 190 90
## 4 33 173 73
## 5 15 149 45
data %>%
pivot_longer(
cols = c(height, weight),
names_to = "measurement",
values_to = "value"
)
## # A tibble: 539,464 × 15
## id name sex age team noc games year season city sport event
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 A Dijiang M 24 China CHN 1992… 1992 Summer Barc… Bask… Bask…
## 2 1 A Dijiang M 24 China CHN 1992… 1992 Summer Barc… Bask… Bask…
## 3 2 A Lamusi M 23 China CHN 2012… 2012 Summer Lond… Judo Judo…
## 4 2 A Lamusi M 23 China CHN 2012… 2012 Summer Lond… Judo Judo…
## 5 3 Gunnar Ni… M 24 Denm… DEN 1920… 1920 Summer Antw… Foot… Foot…
## 6 3 Gunnar Ni… M 24 Denm… DEN 1920… 1920 Summer Antw… Foot… Foot…
## 7 4 Edgar Lin… M 34 Denm… DEN 1900… 1900 Summer Paris Tug-… Tug-…
## 8 4 Edgar Lin… M 34 Denm… DEN 1900… 1900 Summer Paris Tug-… Tug-…
## 9 5 Christine… F 21 Neth… NED 1988… 1988 Winter Calg… Spee… Spee…
## 10 5 Christine… F 21 Neth… NED 1988… 1988 Winter Calg… Spee… Spee…
## # ℹ 539,454 more rows
## # ℹ 3 more variables: medal <chr>, measurement <chr>, value <chr>
set.seed(1234)
depart_small <- data %>%
sample_n(5) %>%
select(age, height, weight)
depart_small
## # A tibble: 5 × 3
## age height weight
## <chr> <chr> <chr>
## 1 25 NA NA
## 2 25 196 86
## 3 25 190 90
## 4 33 173 73
## 5 15 149 45
data <- data %>%
mutate(weight = na_if(weight, "")) %>%
mutate(weight = na_if(weight, "NA"))
data <- tibble(
id = 1:3,
height = c(180, 170, 185),
weight = c(80, 60, 82)
)
data <- data %>%
mutate(rate = paste(height, weight, sep = "/")) %>% # create rate column
separate(rate, into = c("cases", "population"), sep = "/") %>%
mutate(across(c(cases, population), as.numeric))
print(data)
## # A tibble: 3 × 5
## id height weight cases population
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 180 80 180 80
## 2 2 170 60 170 60
## 3 3 185 82 185 82
data %>%
unite(new_column, weight, height, sep = "", na.rm = TRUE)
## # A tibble: 3 × 4
## id new_column cases population
## <int> <chr> <dbl> <dbl>
## 1 1 80180 180 80
## 2 2 60170 170 60
## 3 3 82185 185 82
colnames(data)
## [1] "id" "height" "weight" "cases" "population"
data <- data %>% unite(new_column, cases, id, sep = "", remove = FALSE)