Import your data

# excel file
data <- read_xlsx("../00_data/my data q&a.xlsx")
data
## # A tibble: 269,732 × 15
##       id name     sex   age   height weight team  noc   games  year season city 
##    <dbl> <chr>    <chr> <chr> <chr>  <chr>  <chr> <chr> <chr> <dbl> <chr>  <chr>
##  1     1 A Dijia… M     24    180    80     China CHN   1992…  1992 Summer Barc…
##  2     2 A Lamusi M     23    170    60     China CHN   2012…  2012 Summer Lond…
##  3     3 Gunnar … M     24    NA     NA     Denm… DEN   1920…  1920 Summer Antw…
##  4     4 Edgar L… M     34    NA     NA     Denm… DEN   1900…  1900 Summer Paris
##  5     5 Christi… F     21    185    82     Neth… NED   1988…  1988 Winter Calg…
##  6     5 Christi… F     21    185    82     Neth… NED   1988…  1988 Winter Calg…
##  7     5 Christi… F     25    185    82     Neth… NED   1992…  1992 Winter Albe…
##  8     5 Christi… F     25    185    82     Neth… NED   1992…  1992 Winter Albe…
##  9     5 Christi… F     27    185    82     Neth… NED   1994…  1994 Winter Lill…
## 10     5 Christi… F     27    185    82     Neth… NED   1994…  1994 Winter Lill…
## # ℹ 269,722 more rows
## # ℹ 3 more variables: sport <chr>, event <chr>, medal <chr>
set.seed(1234)
depart_small <- data %>%

  sample_n(5) %>%
  select(age, height, weight)

depart_small
## # A tibble: 5 × 3
##   age   height weight
##   <chr> <chr>  <chr> 
## 1 25    NA     NA    
## 2 25    196    86    
## 3 25    190    90    
## 4 33    173    73    
## 5 15    149    45

Pivoting

long to wide form

data %>%
  pivot_longer(
    cols = c(height, weight),
    names_to = "measurement",
    values_to = "value"
  )
## # A tibble: 539,464 × 15
##       id name       sex   age   team  noc   games  year season city  sport event
##    <dbl> <chr>      <chr> <chr> <chr> <chr> <chr> <dbl> <chr>  <chr> <chr> <chr>
##  1     1 A Dijiang  M     24    China CHN   1992…  1992 Summer Barc… Bask… Bask…
##  2     1 A Dijiang  M     24    China CHN   1992…  1992 Summer Barc… Bask… Bask…
##  3     2 A Lamusi   M     23    China CHN   2012…  2012 Summer Lond… Judo  Judo…
##  4     2 A Lamusi   M     23    China CHN   2012…  2012 Summer Lond… Judo  Judo…
##  5     3 Gunnar Ni… M     24    Denm… DEN   1920…  1920 Summer Antw… Foot… Foot…
##  6     3 Gunnar Ni… M     24    Denm… DEN   1920…  1920 Summer Antw… Foot… Foot…
##  7     4 Edgar Lin… M     34    Denm… DEN   1900…  1900 Summer Paris Tug-… Tug-…
##  8     4 Edgar Lin… M     34    Denm… DEN   1900…  1900 Summer Paris Tug-… Tug-…
##  9     5 Christine… F     21    Neth… NED   1988…  1988 Winter Calg… Spee… Spee…
## 10     5 Christine… F     21    Neth… NED   1988…  1988 Winter Calg… Spee… Spee…
## # ℹ 539,454 more rows
## # ℹ 3 more variables: medal <chr>, measurement <chr>, value <chr>
set.seed(1234)
depart_small <- data %>%

  sample_n(5) %>%
  select(age, height, weight)

depart_small
## # A tibble: 5 × 3
##   age   height weight
##   <chr> <chr>  <chr> 
## 1 25    NA     NA    
## 2 25    196    86    
## 3 25    190    90    
## 4 33    173    73    
## 5 15    149    45

wide to long form

data <- data %>%
  mutate(weight = na_if(weight, "")) %>%
  mutate(weight = na_if(weight, "NA"))

Separating and Uniting

Separate a column

data <- tibble(
  id = 1:3,
  height = c(180, 170, 185),
  weight = c(80, 60, 82)
)

data <- data %>%
  mutate(rate = paste(height, weight, sep = "/")) %>%  # create rate column
  separate(rate, into = c("cases", "population"), sep = "/") %>% 
  mutate(across(c(cases, population), as.numeric))

print(data)
## # A tibble: 3 × 5
##      id height weight cases population
##   <int>  <dbl>  <dbl> <dbl>      <dbl>
## 1     1    180     80   180         80
## 2     2    170     60   170         60
## 3     3    185     82   185         82

Unite two columns

data %>% 
  unite(new_column, weight, height, sep = "", na.rm = TRUE)
## # A tibble: 3 × 4
##      id new_column cases population
##   <int> <chr>      <dbl>      <dbl>
## 1     1 80180        180         80
## 2     2 60170        170         60
## 3     3 82185        185         82

Missing Values

colnames(data)
## [1] "id"         "height"     "weight"     "cases"      "population"
data <- data %>% unite(new_column, cases, id, sep = "", remove = FALSE)