This report is a summary of the lesson by Jeroen Boeye, Datacamp
netflix_df
## # A tibble: 637 × 3
## title type duration
## <chr> <chr> <chr>
## 1 Article 15 Movie 125 min
## 2 Kill Me If You Dare Movie 100 min
## 3 The Spy TV Show 1 Season
## 4 The World We Make Movie 108 min
## 5 Watchman Movie 93 min
## 6 Mo Gilligan: Momentum Movie 64 min
## 7 Domino Movie 89 min
## 8 TUNA GIRL Movie 90 min
## 9 Bard of Blood TV Show 1 Season
## 10 Dragons: Rescue Riders TV Show 1 Season
## # ℹ 627 more rows
separate(data, into = c(new column names), sep = ..., convert = TF)netflix_df %>%
separate(col = duration, into = c("value", "unit"), sep = " ", convert = TRUE)
## # A tibble: 637 × 4
## title type value unit
## <chr> <chr> <int> <chr>
## 1 Article 15 Movie 125 min
## 2 Kill Me If You Dare Movie 100 min
## 3 The Spy TV Show 1 Season
## 4 The World We Make Movie 108 min
## 5 Watchman Movie 93 min
## 6 Mo Gilligan: Momentum Movie 64 min
## 7 Domino Movie 89 min
## 8 TUNA GIRL Movie 90 min
## 9 Bard of Blood TV Show 1 Season
## 10 Dragons: Rescue Riders TV Show 1 Season
## # ℹ 627 more rows
unite(data, new column name ..., sep = ...)phone_nr_df
## # A tibble: 6 × 3
## country country_code national_number
## <chr> <chr> <chr>
## 1 USA +1 2025550117
## 2 United Kingdom +44 1632960924
## 3 Brazil +55 95552452220
## 4 Australia +61 1900654321
## 5 China +86 13555953217
## 6 India +91 8555843898
phone_nr_df %>%
unite("international_number", country_code, national_number, sep = " ")
## # A tibble: 6 × 2
## country international_number
## <chr> <chr>
## 1 USA +1 2025550117
## 2 United Kingdom +44 1632960924
## 3 Brazil +55 95552452220
## 4 Australia +61 1900654321
## 5 China +86 13555953217
## 6 India +91 8555843898
separate_row(column name, sep = ...)drink_df
## # A tibble: 3 × 2
## drink ingredients
## <chr> <chr>
## 1 Chocolate milk milk 0.3 L; chocolate 40 g; sugar 10 g
## 2 Orange juice oranges 3; sugar 20 g
## 3 Cappuccino milk 0.1 L; water 0.1 L; coffee 30 g; sugar 5 g
drink_df %>%
separate_rows(ingredients, sep = "; ")
## # A tibble: 9 × 2
## drink ingredients
## <chr> <chr>
## 1 Chocolate milk milk 0.3 L
## 2 Chocolate milk chocolate 40 g
## 3 Chocolate milk sugar 10 g
## 4 Orange juice oranges 3
## 5 Orange juice sugar 20 g
## 6 Cappuccino milk 0.1 L
## 7 Cappuccino water 0.1 L
## 8 Cappuccino coffee 30 g
## 9 Cappuccino sugar 5 g
NAreplace_na(): imputing with a default
valuemoon_df
## # A tibble: 5 × 2
## year people_on_moon
## <dbl> <dbl>
## 1 1969 4
## 2 1970 NA
## 3 1971 4
## 4 1972 4
## 5 1973 NA
moon_df %>%
replace_na(replace = list(people_on_moon = 0L))
## # A tibble: 5 × 2
## year people_on_moon
## <dbl> <dbl>
## 1 1969 4
## 2 1970 0
## 3 1971 4
## 4 1972 4
## 5 1973 0
fill(): imputing with the most recent
valuecumul_moon_df
## # A tibble: 5 × 3
## year people_on_moon total_people_on_moon
## <dbl> <dbl> <dbl>
## 1 1969 4 4
## 2 1970 NA NA
## 3 1971 4 8
## 4 1972 4 NA
## 5 1973 NA 12
## .direction = "dwon" 기본값
cumul_moon_df %>%
fill(total_people_on_moon, .direction = "down")
## # A tibble: 5 × 3
## year people_on_moon total_people_on_moon
## <dbl> <dbl> <dbl>
## 1 1969 4 4
## 2 1970 NA 4
## 3 1971 4 8
## 4 1972 4 8
## 5 1973 NA 12
## .direction = "up"
cumul_moon_df %>%
fill(total_people_on_moon, .direction = "up")
## # A tibble: 5 × 3
## year people_on_moon total_people_on_moon
## <dbl> <dbl> <dbl>
## 1 1969 4 4
## 2 1970 NA 8
## 3 1971 4 8
## 4 1972 4 12
## 5 1973 NA 12
drop_na(): removing rows with missing valuesdirector_df
## # A tibble: 4,265 × 2
## director title
## <chr> <chr>
## 1 Richard Finn, Tim Maltby Norm of the North: King Sized Adventure
## 2 <NA> Jandino: Whatever it Takes
## 3 Fernando Lebrija #realityhigh
## 4 Gabe Ibáñez Automata
## 5 Rodrigo Toro, Francisco Schultz Fabrizio Copano: Solo pienso en mi
## 6 Henrik Ruben Genz Good People
## 7 José Miguel Contreras Joaquín Reyes: Una y no más
## 8 Daniel Alfredson Kidnapping Mr. Heineken
## 9 <NA> Krish Trish and Baltiboy
## 10 Munjal Shroff, Tilak Shetty Krish Trish and Baltiboy: Battle of Wits
## # ℹ 4,255 more rows
director_df %>%
## Drop rows with NA values in the director column
drop_na(director) %>%
separate_rows(director, sep = ", ") %>%
## Count the number of movies per director
count(director, sort = TRUE)
## # A tibble: 3,535 × 2
## director n
## <chr> <int>
## 1 Jan Suter 21
## 2 Raúl Campos 19
## 3 Jay Karas 14
## 4 Marcus Raboy 14
## 5 Jay Chapman 12
## 6 Martin Scorsese 9
## 7 Steven Spielberg 9
## 8 David Dhawan 8
## 9 Johnnie To 8
## 10 Lance Bangs 8
## # ℹ 3,525 more rows
pivot_longer(cols = ..., names_to = ..., values_to = ..., values_drop_na = TF, names_transform = list(...))
cols: pivot 대상 칼럼명
starts_with: 동일한 패턴이 있는 칼럼명 일괄 지정 ex)
starts_with(“name_”)names_to: 기존 칼럼명의 새 칼럼명
names_sep: 새 칼럼이 2개 이상일 경우 구분자.value를 사용하면 칼럼명에 따라서도 분류 가능names_pattern: 정규표현식으로 구분values_to: 기존 데이터 값의 새 칼럼명values_drop_na: NA drop 여부names_transform: 데이터 타입 지정 ex) list(id =
as.integer)names_prefix: 사전에 지정한 텍스트를 제외하고 출력who %>%
select(-c("iso2", "iso3")) %>%
pivot_longer(-c("country", "year"),
names_to = c("diagnosis", "gender", "age"),
names_pattern = "new_?(.*)_(.)(.*)",
values_to = "count",
values_drop_na = T,
values_transform = list(count = as.integer)) %>%
group_by(country, diagnosis) %>%
summarise(count = sum(count), .groups = "keep") %>%
arrange(desc(count)) %>%
## 상위 50개
head(50) %>%
ggplot(aes(x = reorder(country, sqrt(count), sum), y = sqrt(count), fill = diagnosis)) +
geom_col() +
coord_flip() +
scale_fill_discrete(
labels = c("ep" = "폐외",
"rel" = "재발",
"sn" = "음성 폐결핵",
"sp" = "양성 폐결핵")
) +
labs(title = "Counts of cases recorded per country, broken down by diagnosis", x = "country", y = "count", caption = "Source: who") +
theme(plot.title.position = "plot") +
scale_y_continuous(expand = c(0,0))
uncount 데이터프레임의 각 행을 반복하여 확장하는 함수
weights: 반복할 횟수를 지정한 열(정수형도 가능).id: 각 행의 고유 ID를 추가nuke_df_e <- nuke_df %>%
pivot_longer(
-year,
names_to = "country",
values_to = "n_bombs"
) %>%
group_by(country) %>%
summarise(n_bombs = sum(n_bombs, na.rm = T)) %>%
arrange(n_bombs)
nuke_df_e %>%
uncount(n_bombs, .id = "bomb_id")
## # A tibble: 2,156 × 2
## country bomb_id
## <chr> <int>
## 1 Pakistan 1
## 2 Pakistan 2
## 3 India 1
## 4 India 2
## 5 India 3
## 6 India 4
## 7 India 5
## 8 India 6
## 9 North Korea 1
## 10 North Korea 2
## # ℹ 2,146 more rows
pivot_wider(cols = ..., names_from = ..., values_from = ..., values_drop_na = TF, names_transform = list(...))
cols: pivot 대상 칼럼명
starts_with: 동일한 패턴이 있는 칼럼명 일괄 지정 ex)
starts_with(“name_”)names_from: 새롭게 배치할 칼럼명이 있는 칼럼values_from: 새롭게 배치할 데이터가 있는 칼럼names_prefix: 사전에 지정한 텍스트를 포함하여 출력planet_df %>%
pivot_wider(names_from = metric, values_from = value) %>%
ggplot(aes(x = distance_to_sun, y = temperature)) +
geom_point(aes(size = diameter)) +
geom_text(aes(label = planet), vjust = -1) +
labs(x = "Distance to sun (million km)",
y = "Mean temperature (ºC)") +
theme(legend.position = "none")
planet_df_2 %>%
# Pivot all columns except metric to long format
pivot_longer(-metric, names_to = "planet") %>%
# Put each metric in its own column
pivot_wider(names_from = metric, values_from = value) %>%
ggplot(aes(x = diameter, y = number_of_moons)) +
geom_point(aes(size = diameter)) +
geom_text(aes(label = planet), vjust = -1) +
labs(x = "Diameter (km)", y = "Number of moons") +
theme(legend.position = "none")
expand_gridright_join# create a tibble with all combination of years and species
full_df <- expand_grid(
year = 1951:1970,
species = c("Human", "Dog")
)
space_df %>%
right_join(full_df, by = c("year", "species")) %>%
replace_na(list(n_in_space = 0L)) %>%
ggplot(aes(x = year, y = n_in_space, color = species)) +
geom_line()
anti_join : find the combinations not present in
dfcomplete(col..., fill = list(...)) : missing
combination 데이터를 자동으로 생성하여 데이터프레임을 완전한 상태로
만드는 함수planets
## [1] "Mercury" "Venus" "Earth" "Mars" "Jupiter" "Saturn" "Uranus"
## [8] "Neptune"
planet_df_3 %>%
complete(
planet = planets,
# overwrite NA values for n_moons with 0L
fill = list(n_moons = 0L)
)
## # A tibble: 9 × 2
## planet n_moons
## <chr> <dbl>
## 1 Earth 1
## 2 Jupiter 79
## 3 Mars 2
## 4 Mercury 0
## 5 Neptune 0
## 6 Saturn 62
## 7 Uranus 27
## 8 Venus 0
## 9 Neptuen 14
medal_df %>%
count(team, year, name = "n_medals") %>%
complete(team, year, fill = list(n_medals= 0)) %>%
ggplot(aes(x = year, y = n_medals, color = team)) +
geom_line() +
scale_color_brewer(palette = "Paired")
full_seq( , period = ...)outer_dates <- c(as.Date("1980-01-01"), as.Date("1980-12-31"))
# generate the dates for all days in 1980
head(full_seq(outer_dates, period = 1), 50)
## [1] "1980-01-01" "1980-01-02" "1980-01-03" "1980-01-04" "1980-01-05"
## [6] "1980-01-06" "1980-01-07" "1980-01-08" "1980-01-09" "1980-01-10"
## [11] "1980-01-11" "1980-01-12" "1980-01-13" "1980-01-14" "1980-01-15"
## [16] "1980-01-16" "1980-01-17" "1980-01-18" "1980-01-19" "1980-01-20"
## [21] "1980-01-21" "1980-01-22" "1980-01-23" "1980-01-24" "1980-01-25"
## [26] "1980-01-26" "1980-01-27" "1980-01-28" "1980-01-29" "1980-01-30"
## [31] "1980-01-31" "1980-02-01" "1980-02-02" "1980-02-03" "1980-02-04"
## [36] "1980-02-05" "1980-02-06" "1980-02-07" "1980-02-08" "1980-02-09"
## [41] "1980-02-10" "1980-02-11" "1980-02-12" "1980-02-13" "1980-02-14"
## [46] "1980-02-15" "1980-02-16" "1980-02-17" "1980-02-18" "1980-02-19"
nukes_1962 %>%
complete(country, date = full_seq(date, period = 1)) %>%
group_by(country) %>%
# impute missing values with the last known observation
fill(total_bombs) %>%
ggplot(aes(x = date, y = total_bombs, color = country)) +
geom_rect(xmin = as.Date("1962-10-16"), xmax = as.Date("1962-10-29"), ymin = -Inf, ymax = Inf, color = NA) +
geom_text(x = as.Date("1962-10-22"), y = 15, label = "Cuban Missile Crisis", angle = 90, color = "white") +
geom_line()
nesting() : complete 내 변수를 지정할 때
단일 변수로 취급하도록 함spreadsheet, csv 등과는 다른 Json, XML을 의미
tibbleunnest_wider : list 개수가 동일할 때unnest_longer: list 개수가 다를 때tibble(movie = movie_list) %>%
unnest_wider(movie)
## # A tibble: 7 × 5
## title episode_id release_date director producer
## <chr> <int> <chr> <chr> <chr>
## 1 A New Hope 4 1977-05-25 George Lucas Gary Kurtz, …
## 2 Attack of the Clones 2 2002-05-16 George Lucas Rick McCallum
## 3 The Phantom Menace 1 1999-05-19 George Lucas Rick McCallum
## 4 Revenge of the Sith 3 2005-05-19 George Lucas Rick McCallum
## 5 Return of the Jedi 6 1983-05-25 Richard Marquand Howard G. Ka…
## 6 The Empire Strikes Back 5 1980-05-17 Irvin Kershner Gary Kurtz, …
## 7 The Force Awakens 7 2015-12-11 J. J. Abrams Kathleen Ken…
tibble(movie = movie_planets_list) %>%
unnest_wider(movie) %>%
unnest_longer(planets)
## # A tibble: 34 × 3
## title release_date planets
## <chr> <chr> <chr>
## 1 A New Hope 1977-05-25 Alderaan
## 2 A New Hope 1977-05-25 Yavin IV
## 3 A New Hope 1977-05-25 Tatooine
## 4 The Empire Strikes Back 1980-05-17 Hoth
## 5 The Empire Strikes Back 1980-05-17 Dagobah
## 6 The Empire Strikes Back 1980-05-17 Bespin
## 7 The Empire Strikes Back 1980-05-17 Ord Mantell
## 8 Return of the Jedi 1983-05-25 Dagobah
## 9 Return of the Jedi 1983-05-25 Endor
## 10 Return of the Jedi 1983-05-25 Naboo
## # ℹ 24 more rows
star_wars_list %>%
unnest_wider(metadata) %>%
unnest_longer(films)
## # A tibble: 166 × 4
## name species homeworld films
## <chr> <chr> <chr> <chr>
## 1 Ackbar Mon Calamari Mon Cala Return of the Jedi
## 2 Ackbar Mon Calamari Mon Cala The Force Awakens
## 3 Adi Gallia Tholothian Coruscant The Phantom Menace
## 4 Adi Gallia Tholothian Coruscant Revenge of the Sith
## 5 Anakin Skywalker Human Tatooine Attack of the Clones
## 6 Anakin Skywalker Human Tatooine The Phantom Menace
## 7 Anakin Skywalker Human Tatooine Revenge of the Sith
## 8 Arvel Crynyd Human unknown Return of the Jedi
## 9 Ayla Secura Twi'lek Ryloth Attack of the Clones
## 10 Ayla Secura Twi'lek Ryloth The Phantom Menace
## # ℹ 156 more rows
hoist(data = ..., col = ..., ...):
unnest_를 중복하여 사용하지 않고 한번에 데이터 선택 가능,
하지만 데이터 구조를 먼저 파악해야 함.
data: 원본 데이터col: 리스트가 포함된 열...: 리스트에서 추출할 열 (새로운 열이름 = …)str로 먼저 구조를 파악하는게 필요함star_wars_list %>%
hoist(
metadata,
homeworld = list(2),
first_film = list("films", 1)
)
## # A tibble: 82 × 4
## name homeworld first_film metadata
## <chr> <chr> <chr> <list>
## 1 Ackbar Mon Cala Return of the Jedi <named list [2]>
## 2 Adi Gallia Coruscant The Phantom Menace <named list [2]>
## 3 Anakin Skywalker Tatooine Attack of the Clones <named list [2]>
## 4 Arvel Crynyd unknown Return of the Jedi <named list [2]>
## 5 Ayla Secura Ryloth Attack of the Clones <named list [2]>
## 6 Bail Prestor Organa Alderaan Attack of the Clones <named list [2]>
## 7 Barriss Offee Mirial Attack of the Clones <named list [2]>
## 8 BB8 unknown The Force Awakens <named list [2]>
## 9 Ben Quadinaros Tund The Phantom Menace <named list [2]>
## 10 Beru Whitesun lars Tatooine Attack of the Clones <named list [2]>
## # ℹ 72 more rows
## metadata는 삭제되지 않고 그대로 유지됨
nest(): list 칼럼 생성unnest()map: list 칼럼별 회귀분석을 하거나 함수를 적용할 수
있음ansur_df %>%
group_by(sex) %>%
nest() %>%
mutate(
fit = purrr::map(data, function(df) lm(weight_kg ~ waist_circum_m + stature_m, data = df)),
# tibble별 glance 칼럼 생성, tidy 함수도 가능
glanced = purrr::map(fit, glance)
) %>%
unnest(glanced)
## # A tibble: 2 × 15
## # Groups: sex [2]
## sex data fit r.squared adj.r.squared sigma statistic p.value df
## <chr> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female <tibble> <lm> 0.822 0.822 4.63 4585. 0 2
## 2 Male <tibble> <lm> 0.873 0.873 5.06 14076. 0 2
## # ℹ 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## # df.residual <int>, nobs <int>