This report is a summary of the lesson by Jeroen Boeye, Datacamp

1. Tidy Data

Multiple variables per column

netflix_df
## # A tibble: 637 × 3
##    title                  type    duration
##    <chr>                  <chr>   <chr>   
##  1 Article 15             Movie   125 min 
##  2 Kill Me If You Dare    Movie   100 min 
##  3 The Spy                TV Show 1 Season
##  4 The World We Make      Movie   108 min 
##  5 Watchman               Movie   93 min  
##  6 Mo Gilligan: Momentum  Movie   64 min  
##  7 Domino                 Movie   89 min  
##  8 TUNA GIRL              Movie   90 min  
##  9 Bard of Blood          TV Show 1 Season
## 10 Dragons: Rescue Riders TV Show 1 Season
## # ℹ 627 more rows

1) separate

  • separate(data, into = c(new column names), sep = ..., convert = TF)
netflix_df %>% 
  separate(col = duration, into = c("value", "unit"), sep = " ", convert = TRUE)
## # A tibble: 637 × 4
##    title                  type    value unit  
##    <chr>                  <chr>   <int> <chr> 
##  1 Article 15             Movie     125 min   
##  2 Kill Me If You Dare    Movie     100 min   
##  3 The Spy                TV Show     1 Season
##  4 The World We Make      Movie     108 min   
##  5 Watchman               Movie      93 min   
##  6 Mo Gilligan: Momentum  Movie      64 min   
##  7 Domino                 Movie      89 min   
##  8 TUNA GIRL              Movie      90 min   
##  9 Bard of Blood          TV Show     1 Season
## 10 Dragons: Rescue Riders TV Show     1 Season
## # ℹ 627 more rows

2) unite

  • unite(data, new column name ..., sep = ...)
phone_nr_df
## # A tibble: 6 × 3
##   country        country_code national_number
##   <chr>          <chr>        <chr>          
## 1 USA            +1           2025550117     
## 2 United Kingdom +44          1632960924     
## 3 Brazil         +55          95552452220    
## 4 Australia      +61          1900654321     
## 5 China          +86          13555953217    
## 6 India          +91          8555843898
phone_nr_df %>%
  unite("international_number", country_code, national_number, sep = " ")
## # A tibble: 6 × 2
##   country        international_number
##   <chr>          <chr>               
## 1 USA            +1 2025550117       
## 2 United Kingdom +44 1632960924      
## 3 Brazil         +55 95552452220     
## 4 Australia      +61 1900654321      
## 5 China          +86 13555953217     
## 6 India          +91 8555843898

3) separate_row

  • separate_row(column name, sep = ...)
drink_df
## # A tibble: 3 × 2
##   drink          ingredients                                    
##   <chr>          <chr>                                          
## 1 Chocolate milk milk 0.3 L; chocolate 40 g; sugar 10 g         
## 2 Orange juice   oranges 3; sugar 20 g                          
## 3 Cappuccino     milk 0.1 L; water 0.1 L; coffee 30 g; sugar 5 g
drink_df %>%
  separate_rows(ingredients, sep = "; ")
## # A tibble: 9 × 2
##   drink          ingredients   
##   <chr>          <chr>         
## 1 Chocolate milk milk 0.3 L    
## 2 Chocolate milk chocolate 40 g
## 3 Chocolate milk sugar 10 g    
## 4 Orange juice   oranges 3     
## 5 Orange juice   sugar 20 g    
## 6 Cappuccino     milk 0.1 L    
## 7 Cappuccino     water 0.1 L   
## 8 Cappuccino     coffee 30 g   
## 9 Cappuccino     sugar 5 g

Missing values - NA

1) replace_na

  • replace_na(): imputing with a default value
moon_df
## # A tibble: 5 × 2
##    year people_on_moon
##   <dbl>          <dbl>
## 1  1969              4
## 2  1970             NA
## 3  1971              4
## 4  1972              4
## 5  1973             NA
moon_df %>%
  replace_na(replace = list(people_on_moon = 0L))
## # A tibble: 5 × 2
##    year people_on_moon
##   <dbl>          <dbl>
## 1  1969              4
## 2  1970              0
## 3  1971              4
## 4  1972              4
## 5  1973              0

2) fill

  • fill(): imputing with the most recent value
cumul_moon_df
## # A tibble: 5 × 3
##    year people_on_moon total_people_on_moon
##   <dbl>          <dbl>                <dbl>
## 1  1969              4                    4
## 2  1970             NA                   NA
## 3  1971              4                    8
## 4  1972              4                   NA
## 5  1973             NA                   12
## .direction = "dwon" 기본값
cumul_moon_df %>%
  fill(total_people_on_moon, .direction = "down")
## # A tibble: 5 × 3
##    year people_on_moon total_people_on_moon
##   <dbl>          <dbl>                <dbl>
## 1  1969              4                    4
## 2  1970             NA                    4
## 3  1971              4                    8
## 4  1972              4                    8
## 5  1973             NA                   12
## .direction = "up"
cumul_moon_df %>%
  fill(total_people_on_moon, .direction = "up")
## # A tibble: 5 × 3
##    year people_on_moon total_people_on_moon
##   <dbl>          <dbl>                <dbl>
## 1  1969              4                    4
## 2  1970             NA                    8
## 3  1971              4                    8
## 4  1972              4                   12
## 5  1973             NA                   12

3) drop_na

  • drop_na(): removing rows with missing values
director_df
## # A tibble: 4,265 × 2
##    director                        title                                   
##    <chr>                           <chr>                                   
##  1 Richard Finn, Tim Maltby        Norm of the North: King Sized Adventure 
##  2 <NA>                            Jandino: Whatever it Takes              
##  3 Fernando Lebrija                #realityhigh                            
##  4 Gabe Ibáñez                     Automata                                
##  5 Rodrigo Toro, Francisco Schultz Fabrizio Copano: Solo pienso en mi      
##  6 Henrik Ruben Genz               Good People                             
##  7 José Miguel Contreras           Joaquín Reyes: Una y no más             
##  8 Daniel Alfredson                Kidnapping Mr. Heineken                 
##  9 <NA>                            Krish Trish and Baltiboy                
## 10 Munjal Shroff, Tilak Shetty     Krish Trish and Baltiboy: Battle of Wits
## # ℹ 4,255 more rows
director_df %>% 
  ## Drop rows with NA values in the director column
  drop_na(director) %>% 
  separate_rows(director, sep = ", ") %>% 
  ## Count the number of movies per director
  count(director, sort = TRUE)
## # A tibble: 3,535 × 2
##    director             n
##    <chr>            <int>
##  1 Jan Suter           21
##  2 Raúl Campos         19
##  3 Jay Karas           14
##  4 Marcus Raboy        14
##  5 Jay Chapman         12
##  6 Martin Scorsese      9
##  7 Steven Spielberg     9
##  8 David Dhawan         8
##  9 Johnnie To           8
## 10 Lance Bangs          8
## # ℹ 3,525 more rows

2. From wide to long and back

From wide to long data

1) pivot_longer

  • pivot_longer(cols = ..., names_to = ..., values_to = ..., values_drop_na = TF, names_transform = list(...))
    • cols: pivot 대상 칼럼명
      • starts_with: 동일한 패턴이 있는 칼럼명 일괄 지정 ex) starts_with(“name_”)
    • names_to: 기존 칼럼명의 새 칼럼명
      • names_sep: 새 칼럼이 2개 이상일 경우 구분자
      • .value를 사용하면 칼럼명에 따라서도 분류 가능
    • names_pattern: 정규표현식으로 구분
    • values_to: 기존 데이터 값의 새 칼럼명
    • values_drop_na: NA drop 여부
    • names_transform: 데이터 타입 지정 ex) list(id = as.integer)
    • names_prefix: 사전에 지정한 텍스트를 제외하고 출력
who %>% 
  select(-c("iso2", "iso3")) %>% 
  pivot_longer(-c("country", "year"),
               names_to = c("diagnosis", "gender", "age"),
               names_pattern = "new_?(.*)_(.)(.*)",
               values_to = "count",
               values_drop_na = T,
               values_transform = list(count = as.integer)) %>% 
  group_by(country, diagnosis) %>% 
  summarise(count = sum(count), .groups = "keep") %>% 
  arrange(desc(count)) %>% 
  ## 상위 50개
  head(50) %>% 
  
  ggplot(aes(x = reorder(country, sqrt(count), sum), y = sqrt(count), fill = diagnosis)) +
  geom_col() +
  coord_flip() +
  scale_fill_discrete(
    labels = c("ep" = "폐외",
               "rel" = "재발",
               "sn" = "음성 폐결핵",
               "sp" = "양성 폐결핵")
  ) +
  labs(title = "Counts of cases recorded per country, broken down by diagnosis", x = "country", y = "count", caption = "Source: who") +
  theme(plot.title.position = "plot") +
  scale_y_continuous(expand = c(0,0))

2) uncount

  • uncount 데이터프레임의 각 행을 반복하여 확장하는 함수
    • weights: 반복할 횟수를 지정한 열(정수형도 가능)
    • .id: 각 행의 고유 ID를 추가
nuke_df_e <- nuke_df %>% 
  pivot_longer(
    -year,
    names_to = "country",
    values_to = "n_bombs"
  ) %>% 
  group_by(country) %>% 
  summarise(n_bombs = sum(n_bombs, na.rm = T)) %>% 
  arrange(n_bombs)

nuke_df_e %>% 
  uncount(n_bombs, .id = "bomb_id")
## # A tibble: 2,156 × 2
##    country     bomb_id
##    <chr>         <int>
##  1 Pakistan          1
##  2 Pakistan          2
##  3 India             1
##  4 India             2
##  5 India             3
##  6 India             4
##  7 India             5
##  8 India             6
##  9 North Korea       1
## 10 North Korea       2
## # ℹ 2,146 more rows

From long to wide data

pivot_wider

  • pivot_wider(cols = ..., names_from = ..., values_from = ..., values_drop_na = TF, names_transform = list(...))
    • cols: pivot 대상 칼럼명
      • starts_with: 동일한 패턴이 있는 칼럼명 일괄 지정 ex) starts_with(“name_”)
    • names_from: 새롭게 배치할 칼럼명이 있는 칼럼
    • values_from: 새롭게 배치할 데이터가 있는 칼럼
    • names_prefix: 사전에 지정한 텍스트를 포함하여 출력
planet_df %>% 
  pivot_wider(names_from = metric, values_from = value) %>% 
  ggplot(aes(x = distance_to_sun, y = temperature)) +
  geom_point(aes(size = diameter)) +
  geom_text(aes(label = planet), vjust = -1) +
  labs(x = "Distance to sun (million km)",
       y = "Mean temperature (ºC)") +
  theme(legend.position = "none")

planet_df_2 %>% 
  # Pivot all columns except metric to long format
  pivot_longer(-metric, names_to = "planet") %>% 
  # Put each metric in its own column
  pivot_wider(names_from = metric, values_from = value) %>% 
  ggplot(aes(x = diameter, y = number_of_moons)) +
  geom_point(aes(size = diameter)) +
  geom_text(aes(label = planet), vjust = -1) +
  labs(x = "Diameter (km)", y = "Number of moons") +
  theme(legend.position = "none")

3. Expanding data

Creating unique combination of vectors

what if not values are missing but full observations?

  • expand_grid
  • right_join
# create a tibble with all combination of years and species
full_df <- expand_grid(
  year = 1951:1970,
  species = c("Human", "Dog")
)

space_df %>% 
  right_join(full_df, by = c("year", "species")) %>% 
  replace_na(list(n_in_space = 0L)) %>% 
  ggplot(aes(x = year, y = n_in_space, color = species)) +
  geom_line()

  • anti_join : find the combinations not present in df

Completing data with all value combinations

  • complete(col..., fill = list(...)) : missing combination 데이터를 자동으로 생성하여 데이터프레임을 완전한 상태로 만드는 함수
planets
## [1] "Mercury" "Venus"   "Earth"   "Mars"    "Jupiter" "Saturn"  "Uranus" 
## [8] "Neptune"
planet_df_3 %>% 
  complete(
    planet = planets,
    # overwrite NA values for n_moons with 0L
    fill = list(n_moons = 0L)
  )
## # A tibble: 9 × 2
##   planet  n_moons
##   <chr>     <dbl>
## 1 Earth         1
## 2 Jupiter      79
## 3 Mars          2
## 4 Mercury       0
## 5 Neptune       0
## 6 Saturn       62
## 7 Uranus       27
## 8 Venus         0
## 9 Neptuen      14
medal_df %>% 
  count(team, year, name = "n_medals") %>% 
  complete(team, year, fill = list(n_medals= 0)) %>% 
  
  ggplot(aes(x = year, y = n_medals, color = team)) +
  geom_line() +
  scale_color_brewer(palette = "Paired")

  • full_seq( , period = ...)
outer_dates <- c(as.Date("1980-01-01"), as.Date("1980-12-31"))

# generate the dates for all days in 1980
head(full_seq(outer_dates, period = 1), 50)
##  [1] "1980-01-01" "1980-01-02" "1980-01-03" "1980-01-04" "1980-01-05"
##  [6] "1980-01-06" "1980-01-07" "1980-01-08" "1980-01-09" "1980-01-10"
## [11] "1980-01-11" "1980-01-12" "1980-01-13" "1980-01-14" "1980-01-15"
## [16] "1980-01-16" "1980-01-17" "1980-01-18" "1980-01-19" "1980-01-20"
## [21] "1980-01-21" "1980-01-22" "1980-01-23" "1980-01-24" "1980-01-25"
## [26] "1980-01-26" "1980-01-27" "1980-01-28" "1980-01-29" "1980-01-30"
## [31] "1980-01-31" "1980-02-01" "1980-02-02" "1980-02-03" "1980-02-04"
## [36] "1980-02-05" "1980-02-06" "1980-02-07" "1980-02-08" "1980-02-09"
## [41] "1980-02-10" "1980-02-11" "1980-02-12" "1980-02-13" "1980-02-14"
## [46] "1980-02-15" "1980-02-16" "1980-02-17" "1980-02-18" "1980-02-19"
nukes_1962 %>% 
  complete(country, date = full_seq(date, period = 1)) %>% 
  group_by(country) %>% 
  # impute missing values with the last known observation
  fill(total_bombs) %>% 
  ggplot(aes(x = date, y = total_bombs, color = country)) +
  geom_rect(xmin = as.Date("1962-10-16"), xmax = as.Date("1962-10-29"), ymin = -Inf, ymax = Inf, color = NA) +
  geom_text(x = as.Date("1962-10-22"), y = 15, label = "Cuban Missile Crisis", angle = 90, color = "white") +
  geom_line()

Advanced completions

  • nesting() : complete 내 변수를 지정할 때 단일 변수로 취급하도록 함

4. Rectangling Data

Intro to non-rectangular data

spreadsheet, csv 등과는 다른 Json, XML을 의미

  • tibble
  • unnest_wider : list 개수가 동일할 때
  • unnest_longer: list 개수가 다를 때
tibble(movie = movie_list) %>% 
  unnest_wider(movie)
## # A tibble: 7 × 5
##   title                   episode_id release_date director         producer     
##   <chr>                        <int> <chr>        <chr>            <chr>        
## 1 A New Hope                       4 1977-05-25   George Lucas     Gary Kurtz, …
## 2 Attack of the Clones             2 2002-05-16   George Lucas     Rick McCallum
## 3 The Phantom Menace               1 1999-05-19   George Lucas     Rick McCallum
## 4 Revenge of the Sith              3 2005-05-19   George Lucas     Rick McCallum
## 5 Return of the Jedi               6 1983-05-25   Richard Marquand Howard G. Ka…
## 6 The Empire Strikes Back          5 1980-05-17   Irvin Kershner   Gary Kurtz, …
## 7 The Force Awakens                7 2015-12-11   J. J. Abrams     Kathleen Ken…
tibble(movie = movie_planets_list) %>% 
  unnest_wider(movie) %>% 
  unnest_longer(planets)
## # A tibble: 34 × 3
##    title                   release_date planets    
##    <chr>                   <chr>        <chr>      
##  1 A New Hope              1977-05-25   Alderaan   
##  2 A New Hope              1977-05-25   Yavin IV   
##  3 A New Hope              1977-05-25   Tatooine   
##  4 The Empire Strikes Back 1980-05-17   Hoth       
##  5 The Empire Strikes Back 1980-05-17   Dagobah    
##  6 The Empire Strikes Back 1980-05-17   Bespin     
##  7 The Empire Strikes Back 1980-05-17   Ord Mantell
##  8 Return of the Jedi      1983-05-25   Dagobah    
##  9 Return of the Jedi      1983-05-25   Endor      
## 10 Return of the Jedi      1983-05-25   Naboo      
## # ℹ 24 more rows
star_wars_list %>% 
  unnest_wider(metadata) %>% 
  unnest_longer(films)
## # A tibble: 166 × 4
##    name             species      homeworld films               
##    <chr>            <chr>        <chr>     <chr>               
##  1 Ackbar           Mon Calamari Mon Cala  Return of the Jedi  
##  2 Ackbar           Mon Calamari Mon Cala  The Force Awakens   
##  3 Adi Gallia       Tholothian   Coruscant The Phantom Menace  
##  4 Adi Gallia       Tholothian   Coruscant Revenge of the Sith 
##  5 Anakin Skywalker Human        Tatooine  Attack of the Clones
##  6 Anakin Skywalker Human        Tatooine  The Phantom Menace  
##  7 Anakin Skywalker Human        Tatooine  Revenge of the Sith 
##  8 Arvel Crynyd     Human        unknown   Return of the Jedi  
##  9 Ayla Secura      Twi'lek      Ryloth    Attack of the Clones
## 10 Ayla Secura      Twi'lek      Ryloth    The Phantom Menace  
## # ℹ 156 more rows

Selecting nested variables

  • hoist(data = ..., col = ..., ...): unnest_를 중복하여 사용하지 않고 한번에 데이터 선택 가능, 하지만 데이터 구조를 먼저 파악해야 함.
    • data: 원본 데이터
    • col: 리스트가 포함된 열
    • ...: 리스트에서 추출할 열 (새로운 열이름 = …)
    • str로 먼저 구조를 파악하는게 필요함
star_wars_list %>% 
  hoist(
    metadata,
    homeworld = list(2),
    first_film = list("films", 1)
  )
## # A tibble: 82 × 4
##    name                homeworld first_film           metadata        
##    <chr>               <chr>     <chr>                <list>          
##  1 Ackbar              Mon Cala  Return of the Jedi   <named list [2]>
##  2 Adi Gallia          Coruscant The Phantom Menace   <named list [2]>
##  3 Anakin Skywalker    Tatooine  Attack of the Clones <named list [2]>
##  4 Arvel Crynyd        unknown   Return of the Jedi   <named list [2]>
##  5 Ayla Secura         Ryloth    Attack of the Clones <named list [2]>
##  6 Bail Prestor Organa Alderaan  Attack of the Clones <named list [2]>
##  7 Barriss Offee       Mirial    Attack of the Clones <named list [2]>
##  8 BB8                 unknown   The Force Awakens    <named list [2]>
##  9 Ben Quadinaros      Tund      The Phantom Menace   <named list [2]>
## 10 Beru Whitesun lars  Tatooine  Attack of the Clones <named list [2]>
## # ℹ 72 more rows
## metadata는 삭제되지 않고 그대로 유지됨

Nesting data for modeling

  • nest(): list 칼럼 생성
  • unnest()
  • map: list 칼럼별 회귀분석을 하거나 함수를 적용할 수 있음
ansur_df %>% 
  group_by(sex) %>% 
  nest() %>% 
  mutate(
    fit = purrr::map(data, function(df) lm(weight_kg ~ waist_circum_m + stature_m, data = df)),
    # tibble별 glance 칼럼 생성, tidy 함수도 가능
    glanced = purrr::map(fit, glance)
  ) %>% 
  unnest(glanced)
## # A tibble: 2 × 15
## # Groups:   sex [2]
##   sex    data     fit    r.squared adj.r.squared sigma statistic p.value    df
##   <chr>  <list>   <list>     <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>
## 1 Female <tibble> <lm>       0.822         0.822  4.63     4585.       0     2
## 2 Male   <tibble> <lm>       0.873         0.873  5.06    14076.       0     2
## # ℹ 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## #   df.residual <int>, nobs <int>