This report is a summary of the lesson by DataCamp
inner_join : macth perfectly between tables
by: 조인 대상 칼럼suffix: 조인 칼럼명이 중복일 경우 추가로 입력될
접미사parts %>%
inner_join(part_categories, by = c("part_cat_id" = "id"), suffix = c("_part", "_category"))
## # A tibble: 17,501 × 4
## part_num name_part part_cat_id name_category
## <chr> <chr> <dbl> <chr>
## 1 0901 Baseplate 16 x 30 with Set 080 Yellow H… 1 Baseplates
## 2 0902 Baseplate 16 x 24 with Set 080 Small Wh… 1 Baseplates
## 3 0903 Baseplate 16 x 24 with Set 080 Red Hous… 1 Baseplates
## 4 0904 Baseplate 16 x 24 with Set 080 Large Wh… 1 Baseplates
## 5 1 Homemaker Bookcase 2 x 4 x 4 7 Containers
## 6 10016414 Sticker Sheet #1 for 41055-1 58 Stickers
## 7 10026stk01 Sticker for Set 10026 - (44942/4184185) 58 Stickers
## 8 10039 Pullback Motor 8 x 4 x 2/3 44 Mechanical
## 9 10048 Minifig Hair Tousled 65 Minifig Head…
## 10 10049 Minifig Shield Broad with Spiked Bottom… 27 Minifig Acce…
## # ℹ 17,491 more rows
inner_join을 여러번 사용하면 됨
sets %>%
# table 1
inner_join(inventories, by = "set_num") %>%
# table 2
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
# table 3
inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
# count the number of colors and sort
count(name_color, sort = T)
## # A tibble: 134 × 2
## name_color n
## <chr> <int>
## 1 Black 48068
## 2 White 30105
## 3 Light Bluish Gray 26024
## 4 Red 21602
## 5 Dark Bluish Gray 19948
## 6 Yellow 17088
## 7 Blue 12980
## 8 Light Gray 8632
## 9 Reddish Brown 6960
## 10 Tan 6664
## # ℹ 124 more rows
left_join : 왼쪽 테이블값 모두 유지right_join: 오른쪽 테이블값 모두 유지sets %>%
count(theme_id, sort = T) %>%
right_join(themes, by = c("theme_id" = "id")) %>%
# filter for NA
filter(is.na(n))
## # A tibble: 96 × 4
## theme_id n name parent_id
## <dbl> <int> <chr> <dbl>
## 1 8 NA Farm 5
## 2 24 NA Airport 23
## 3 25 NA Castle 23
## 4 26 NA Construction 23
## 5 27 NA Race 23
## 6 28 NA Harbor 23
## 7 29 NA Train 23
## 8 32 NA Robot 23
## 9 34 NA Building 23
## 10 35 NA Cargo 23
## # ℹ 86 more rows
sets %>%
count(theme_id, sort = T) %>%
right_join(themes, by = c("theme_id" = "id")) %>%
# replace NA
replace_na(list(n = 0L))
## # A tibble: 665 × 4
## theme_id n name parent_id
## <dbl> <int> <chr> <dbl>
## 1 501 122 Gear NA
## 2 494 111 Friends NA
## 3 435 94 Ninjago NA
## 4 505 94 Basic Set 504
## 5 632 93 Town 504
## 6 371 89 Supplemental 365
## 7 497 86 Books NA
## 8 503 82 Key Chain 501
## 9 516 78 Duplo and Explore 507
## 10 220 72 City 217
## # ℹ 655 more rows
themes 를 살펴보면 id와
parent_id가 계층적 관계를 가지고
있음
themes
## # A tibble: 665 × 3
## id name parent_id
## <dbl> <chr> <dbl>
## 1 1 Technic NA
## 2 2 Arctic Technic 1
## 3 3 Competition 1
## 4 4 Expert Builder 1
## 5 5 Model 1
## 6 6 Airport 5
## 7 7 Construction 5
## 8 8 Farm 5
## 9 9 Fire 5
## 10 10 Harbor 5
## # ℹ 655 more rows
inner_join을 사용하여 자체 조인 가능
themes %>%
inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))
## # A tibble: 158 × 7
## id_parent name_parent parent_id id_child name_child id_grandchild name
## <dbl> <chr> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 Technic NA 5 Model 6 Airport
## 2 1 Technic NA 5 Model 7 Constructi…
## 3 1 Technic NA 5 Model 8 Farm
## 4 1 Technic NA 5 Model 9 Fire
## 5 1 Technic NA 5 Model 10 Harbor
## 6 1 Technic NA 5 Model 11 Off-Road
## 7 1 Technic NA 5 Model 12 Race
## 8 1 Technic NA 5 Model 13 Riding Cyc…
## 9 1 Technic NA 5 Model 14 Robot
## 10 1 Technic NA 5 Model 15 Traffic
## # ℹ 148 more rows
full_join## 1
inventory_parts_joined <- inventories %>%
inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
arrange(desc(quantity)) %>%
select(-id, -version)
inventory_parts_joined
## # A tibble: 258,958 × 4
## set_num part_num color_id quantity
## <chr> <chr> <dbl> <dbl>
## 1 40179-1 3024 72 900
## 2 40179-1 3024 15 900
## 3 40179-1 3024 0 900
## 4 40179-1 3024 71 900
## 5 40179-1 3024 14 900
## 6 k34434-1 3024 15 810
## 7 21010-1 3023 320 771
## 8 k34431-1 3024 0 720
## 9 42083-1 2780 0 684
## 10 k34434-1 3024 0 540
## # ℹ 258,948 more rows
## 2
inventory_sets_themes <- inventory_parts_joined %>%
inner_join(sets, by = "set_num") %>%
inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))
batman_parts <- inventory_sets_themes %>%
filter(name_theme == "Batman") %>%
count(part_num, color_id, wt = quantity)
star_wars_parts <- inventory_sets_themes %>%
filter(name_theme == "Star Wars") %>%
count(part_num, color_id, wt = quantity)
## 3
parts_joined <- batman_parts %>%
full_join(star_wars_parts, by = c("part_num", "color_id"), suffix = c("_batman", "_star_wars")) %>%
# replace NA
replace_na(list(n_batman = 0, n_star_wars = 0))
parts_joined
## # A tibble: 3,628 × 4
## part_num color_id n_batman n_star_wars
## <chr> <dbl> <dbl> <dbl>
## 1 10113 0 11 0
## 2 10113 272 1 0
## 3 10113 320 1 0
## 4 10183 57 1 0
## 5 10190 0 2 0
## 6 10201 0 1 21
## 7 10201 4 3 0
## 8 10201 14 1 0
## 9 10201 15 6 0
## 10 10201 71 4 5
## # ℹ 3,618 more rows
## 4
parts_joined %>%
arrange(desc(n_star_wars)) %>%
inner_join(colors, by = c("color_id" = "id")) %>%
inner_join(parts, by = c("part_num"), suffix = c("_color", "_part"))
## # A tibble: 3,628 × 8
## part_num color_id n_batman n_star_wars name_color rgb name_part part_cat_id
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 2780 0 104 392 Black #051… Technic … 53
## 2 32062 0 1 141 Black #051… Technic … 46
## 3 4274 1 56 118 Blue #005… Technic … 53
## 4 6141 36 11 117 Trans-Red #C91… Plate Ro… 21
## 5 3023 71 10 106 Light Blu… #A0A… Plate 1 … 14
## 6 6558 1 30 106 Blue #005… Technic … 53
## 7 43093 1 44 99 Blue #005… Technic … 53
## 8 3022 72 14 95 Dark Blui… #6C6… Plate 2 … 14
## 9 2357 19 0 84 Tan #E4C… Brick 2 … 11
## 10 6141 179 90 81 Flat Silv… #898… Plate Ro… 21
## # ℹ 3,618 more rows
semi_join: A테이블 관측값이 B테이블에도
있습니까?
inner_join과 다르게 왼쪽 테이블 칼럼만 유지anti_join: A테이블 관측값이 B테이블에
없습니까?
두 함수 모두 왼쪽 테이블만 출력하기에 suffix는
사용안함
## Aggregating sets to look at their differences
batman_colors <- inventory_sets_themes %>%
filter(name_theme == "Batman") %>%
group_by(color_id) %>%
summarise(total = sum(quantity)) %>%
mutate(fraction = total / sum(total))
batman_colors
## # A tibble: 57 × 3
## color_id total fraction
## <dbl> <dbl> <dbl>
## 1 0 2807 0.296
## 2 1 243 0.0256
## 3 2 158 0.0167
## 4 4 529 0.0558
## 5 5 1 0.000105
## 6 10 13 0.00137
## 7 14 426 0.0449
## 8 15 404 0.0426
## 9 19 142 0.0150
## 10 25 36 0.00380
## # ℹ 47 more rows
star_wars_colors <- inventory_sets_themes %>%
filter(name_theme == "Star Wars") %>%
group_by(color_id) %>%
summarise(total = sum(quantity)) %>%
mutate(fraction = total / sum(total))
star_wars_colors
## # A tibble: 52 × 3
## color_id total fraction
## <dbl> <dbl> <dbl>
## 1 0 3258 0.207
## 2 1 410 0.0261
## 3 2 36 0.00229
## 4 3 25 0.00159
## 5 4 434 0.0276
## 6 6 40 0.00254
## 7 7 209 0.0133
## 8 8 51 0.00324
## 9 10 6 0.000382
## 10 14 207 0.0132
## # ℹ 42 more rows
## Combining sets
colors_joined <- batman_colors %>%
full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
replace_na(list(total_batman = 0, total_star_wars = 0, fraction_batman = 0, fraction_star_wars = 0)) %>%
inner_join(colors, by = c("color_id" = "id")) %>%
# create the difference and total columns
mutate(difference = fraction_batman - fraction_star_wars,
total = total_batman + total_star_wars) %>%
filter(total > 200) %>%
# factoring name
mutate(name = fct_reorder(name, difference))
color_palette <- setNames(colors_joined$rgb, colors_joined$name)
# visualizing
ggplot(colors_joined, aes(x = name, y = difference, fill = name)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = color_palette, guide = "none")
questions %>%
inner_join(answers, by = c("id" = "question_id"), suffix = c("_question", "_answer")) %>%
mutate(
gap = as.integer(creation_date_answer - creation_date_question)
)
## # A tibble: 380,643 × 7
## id creation_date_question score_question id_answer creation_date_answer
## <int> <date> <int> <int> <date>
## 1 22557677 2014-03-21 1 22560670 2014-03-21
## 2 22557707 2014-03-21 2 22558516 2014-03-21
## 3 22557707 2014-03-21 2 22558726 2014-03-21
## 4 22558084 2014-03-21 2 22558085 2014-03-21
## 5 22558084 2014-03-21 2 22606545 2014-03-24
## 6 22558084 2014-03-21 2 22610396 2014-03-24
## 7 22558084 2014-03-21 2 34374729 2015-12-19
## 8 22558395 2014-03-21 2 22559327 2014-03-21
## 9 22558395 2014-03-21 2 22560102 2014-03-21
## 10 22558395 2014-03-21 2 22560288 2014-03-21
## # ℹ 380,633 more rows
## # ℹ 2 more variables: score_answer <int>, gap <int>
##
answer_counts <- answers %>%
count(question_id, sort = TRUE)
answer_counts
## # A tibble: 243,930 × 2
## question_id n
## <int> <int>
## 1 1295955 34
## 2 2547402 30
## 3 1358003 27
## 4 4090169 26
## 5 1535021 25
## 6 1189759 24
## 7 1815606 24
## 8 5963269 23
## 9 17200114 22
## 10 102056 21
## # ℹ 243,920 more rows
##
question_answer_counts <- questions %>%
left_join(answer_counts, by = c("id" = "question_id")) %>%
replace_na(list(n = 0))
question_answer_counts
## # A tibble: 294,735 × 4
## id creation_date score n
## <int> <date> <int> <int>
## 1 22557677 2014-03-21 1 1
## 2 22557707 2014-03-21 2 2
## 3 22558084 2014-03-21 2 4
## 4 22558395 2014-03-21 2 3
## 5 22558613 2014-03-21 0 1
## 6 22558677 2014-03-21 2 2
## 7 22558887 2014-03-21 8 1
## 8 22559180 2014-03-21 1 1
## 9 22559312 2014-03-21 0 1
## 10 22559322 2014-03-21 2 5
## # ℹ 294,725 more rows
##
tagged_answers <- question_answer_counts %>%
inner_join(question_tags, by = c("id" = "question_id")) %>%
inner_join(tags, by = c("tag_id" = "id"))
tagged_answers
## # A tibble: 497,153 × 6
## id creation_date score n tag_id tag_name
## <int> <date> <int> <int> <dbl> <chr>
## 1 22557677 2014-03-21 1 1 18 regex
## 2 22557677 2014-03-21 1 1 139 string
## 3 22557677 2014-03-21 1 1 16088 time-complexity
## 4 22557677 2014-03-21 1 1 1672 backreference
## 5 22558084 2014-03-21 2 4 6419 time-series
## 6 22558084 2014-03-21 2 4 92764 panel-data
## 7 22558395 2014-03-21 2 3 5569 function
## 8 22558395 2014-03-21 2 3 134 sorting
## 9 22558395 2014-03-21 2 3 9412 vectorization
## 10 22558395 2014-03-21 2 3 18621 operator-precedence
## # ℹ 497,143 more rows
##
tagged_answers %>%
group_by(tag_name) %>%
summarize(
questions = n(),
average_answers = mean(n)
) %>%
arrange(desc(questions))
## # A tibble: 7,840 × 3
## tag_name questions average_answers
## <chr> <int> <dbl>
## 1 ggplot2 28228 1.15
## 2 dataframe 18874 1.67
## 3 shiny 14219 0.921
## 4 dplyr 14039 1.55
## 5 plot 11315 1.23
## 6 data.table 8809 1.47
## 7 matrix 6205 1.45
## 8 loops 5149 1.39
## 9 regex 4912 1.91
## 10 function 4892 1.30
## # ℹ 7,830 more rows
bind_rows##
questions_with_tags <- questions %>%
inner_join(question_tags, by = c("id" = "question_id")) %>%
inner_join(tags, by = c("tag_id" = "id"))
questions_with_tags
## # A tibble: 497,153 × 5
## id creation_date score tag_id tag_name
## <int> <date> <int> <dbl> <chr>
## 1 22557677 2014-03-21 1 18 regex
## 2 22557677 2014-03-21 1 139 string
## 3 22557677 2014-03-21 1 16088 time-complexity
## 4 22557677 2014-03-21 1 1672 backreference
## 5 22558084 2014-03-21 2 6419 time-series
## 6 22558084 2014-03-21 2 92764 panel-data
## 7 22558395 2014-03-21 2 5569 function
## 8 22558395 2014-03-21 2 134 sorting
## 9 22558395 2014-03-21 2 9412 vectorization
## 10 22558395 2014-03-21 2 18621 operator-precedence
## # ℹ 497,143 more rows
##
answers_with_tags <- answers %>%
inner_join(question_tags, by = "question_id") %>%
inner_join(tags, by = c("tag_id" = "id"))
answers_with_tags
## # A tibble: 625,845 × 6
## id creation_date question_id score tag_id tag_name
## <int> <date> <int> <int> <dbl> <chr>
## 1 39143935 2016-08-25 39142481 0 4240 average
## 2 39143935 2016-08-25 39142481 0 5571 summary
## 3 39144014 2016-08-25 39024390 0 85748 shiny
## 4 39144014 2016-08-25 39024390 0 83308 r-markdown
## 5 39144014 2016-08-25 39024390 0 116736 htmlwidgets
## 6 39144252 2016-08-25 39096741 6 67746 rstudio
## 7 39144375 2016-08-25 39143885 5 105113 data.table
## 8 39144430 2016-08-25 39144077 0 276 variables
## 9 39144625 2016-08-25 39142728 1 46457 dataframe
## 10 39144625 2016-08-25 39142728 1 9047 subset
## # ℹ 625,835 more rows
##
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"),
answers_with_tags %>% mutate(type = "answer"))
by_type_year_tag <- posts_with_tags %>%
mutate(
year = lubridate::year(creation_date),
) %>%
count(type, year, tag_name)
by_type_year_tag
## # A tibble: 58,299 × 4
## type year tag_name n
## <chr> <dbl> <chr> <int>
## 1 answer 2008 bayesian 1
## 2 answer 2008 dataframe 3
## 3 answer 2008 dirichlet 1
## 4 answer 2008 eof 1
## 5 answer 2008 file 1
## 6 answer 2008 file-io 1
## 7 answer 2008 function 7
## 8 answer 2008 global-variables 7
## 9 answer 2008 math 2
## 10 answer 2008 mathematical-optimization 1
## # ℹ 58,289 more rows
##
by_type_year_tag_filtered <- by_type_year_tag %>%
filter(tag_name %in% c("dplyr", "ggplot2"))
ggplot(by_type_year_tag_filtered, aes(x = year, y = n, color = type)) +
geom_line() +
facet_wrap( ~ tag_name)