Missing Data / Joining Data - week #4 - FDS - 09/22/20

access libraries

library(readr)
library(tidyverse)
library(naniar)

read in data

draft_66_pass <- read_csv("draft_66_pass.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   draft_team = col_character(),
##   name = col_character(),
##   college = col_character(),
##   nba_season = col_character(),
##   nba_tm = col_character(),
##   nba_lg = col_character(),
##   nba_pos = col_character(),
##   nba_g = col_character(),
##   nba_gs = col_logical(),
##   nba_mip = col_logical(),
##   nba_6th_man = col_logical(),
##   nba_finals_mvp = col_logical(),
##   nba_mvp = col_logical(),
##   nba_all_rookie_2nd = col_logical(),
##   nba_all_3rd = col_logical(),
##   nba_def_mvp = col_logical(),
##   nba_def_1st = col_logical(),
##   nba_def_2nd = col_logical()
## )
## See spec(...) for full column specifications.
draft_66_fail <- read_csv("draft_66_fail.csv")
## Parsed with column specification:
## cols(
##   draft_year = col_double(),
##   pick_number = col_double(),
##   draft_team = col_character(),
##   name = col_character(),
##   college = col_character()
## )

select the variables of interest in the “pass” group

draft_66_pass <-select(draft_66_pass, c("nba_g","pick_number","name","draft_team","college",
                                        "nba_pts","nba_trb","nba_ast","nba_blk","nba_stl"))

visualize missing data

draft_66_pass %>% arrange(pick_number) %>% vis_miss()

gg_miss_var(draft_66_pass, facet = draft_team)

open up data viewer and show name/team missing info that needs to be filled…

fill down these variables

draft_66_pass <- draft_66_pass %>% fill(name,college)

identify strange missing values

miss_scan_count(data = draft_66_pass, search = list("N/A","missing","na","NA","n/a"))
## # A tibble: 10 x 2
##    Variable        n
##    <chr>       <int>
##  1 nba_g           7
##  2 pick_number     0
##  3 name            0
##  4 draft_team      0
##  5 college        11
##  6 nba_pts         0
##  7 nba_trb         0
##  8 nba_ast         0
##  9 nba_blk         0
## 10 nba_stl         0

Replace the strange missing values w/ zeroes for nba_g; college is ok

draft_66_pass <- replace_with_na(draft_66_pass, replace = list(nba_g = c("N/A","missing","na","NA","n/a")))

fixed?

miss_scan_count(data = draft_66_pass, search = list("N/A","missing","na","NA","n/a"))
## # A tibble: 10 x 2
##    Variable        n
##    <chr>       <int>
##  1 nba_g           0
##  2 pick_number     0
##  3 name            0
##  4 draft_team      0
##  5 college        11
##  6 nba_pts         0
##  7 nba_trb         0
##  8 nba_ast         0
##  9 nba_blk         0
## 10 nba_stl         0

missing values –> zeros in preparation for sum

draft_66_pass$nba_g<-as.numeric(draft_66_pass$nba_g) # need to convert this variable to numeric first
draft_66_pass[is.na(draft_66_pass)] <- 0

aggregate the season level observations of the “pass” group

draft_66_pass <- draft_66_pass %>%
  group_by(pick_number,name,draft_team,college) %>%
  summarize(nba_g=sum(nba_g),nba_pts=sum(nba_pts),nba_trb=sum(nba_trb),
            nba_ast=sum(nba_ast),nba_blk=sum(nba_blk),nba_stl=sum(nba_stl))
## `summarise()` regrouping output by 'pick_number', 'name', 'draft_team' (override with `.groups` argument)

#Now Draft_66_Pass has summary variables

head(draft_66_pass)
## # A tibble: 6 x 10
## # Groups:   pick_number, name, draft_team [6]
##   pick_number name  draft_team college nba_g nba_pts nba_trb nba_ast nba_blk
##         <dbl> <chr> <chr>      <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1           1 Cazz~ NYK        Univer~   817   12377    3068    1838      33
## 2           2 Dave~ DET        Syracu~   901   18327    3420    5397      89
## 3           3 Clyd~ SFW        Vander~   822    6204    8383     893      84
## 4           4 Lou ~ STL        Univer~   890   17940    3926    2432      98
## 5           5 Jack~ BAL        Duke U~  1002   13922    4885    2121      96
## 6           6 Walt~ CIN        Univer~   692    5282    3457     428      30
## # ... with 1 more variable: nba_stl <dbl>

inner join

draft_66_test <- draft_66_pass %>% 
  inner_join(draft_66_fail, by = c("name" = "name"), suffix = c("_pass", "_fail")) ## no matches

no matches

str(draft_66_test)
## tibble [0 x 14] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ pick_number_pass: num(0) 
##  $ name            : chr(0) 
##  $ draft_team_pass : chr(0) 
##  $ college_pass    : chr(0) 
##  $ nba_g           : num(0) 
##  $ nba_pts         : num(0) 
##  $ nba_trb         : num(0) 
##  $ nba_ast         : num(0) 
##  $ nba_blk         : num(0) 
##  $ nba_stl         : num(0) 
##  $ draft_year      : num(0) 
##  $ pick_number_fail: num(0) 
##  $ draft_team_fail : chr(0) 
##  $ college_fail    : chr(0) 
##  - attr(*, "groups")= tibble [0 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ name : chr(0) 
##   ..$ .rows: list<int> [1:0] 
##  list()
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

left join

draft_66_test <- draft_66_pass %>% 
  left_join(draft_66_fail, by = c("name" = "name"), suffix = c("_pass", "_fail")) ## 25 observations (pass only)

25 observations

str(draft_66_pass)
## tibble [25 x 10] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ pick_number: num [1:25] 1 2 3 4 5 6 7 8 9 10 ...
##  $ name       : chr [1:25] "Cazzie Russell" "Dave Bing" "Clyde Lee" "Lou Hudson" ...
##  $ draft_team : chr [1:25] "NYK" "DET" "SFW" "STL" ...
##  $ college    : chr [1:25] "University of Michigan" "Syracuse University" "Vanderbilt University" "University of Minnesota" ...
##  $ nba_g      : num [1:25] 817 901 822 890 1002 ...
##  $ nba_pts    : num [1:25] 12377 18327 6204 17940 13922 ...
##  $ nba_trb    : num [1:25] 3068 3420 8383 3926 4885 ...
##  $ nba_ast    : num [1:25] 1838 5397 893 2432 2121 ...
##  $ nba_blk    : num [1:25] 33 89 84 98 96 30 3 46 65 0 ...
##  $ nba_stl    : num [1:25] 239 483 110 516 246 23 11 178 189 0 ...
##  - attr(*, "groups")= tibble [25 x 4] (S3: tbl_df/tbl/data.frame)
##   ..$ pick_number: num [1:25] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ name       : chr [1:25] "Cazzie Russell" "Dave Bing" "Clyde Lee" "Lou Hudson" ...
##   ..$ draft_team : chr [1:25] "NYK" "DET" "SFW" "STL" ...
##   ..$ .rows      : list<int> [1:25] 
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

right join

draft_66_test <- draft_66_pass %>% 
  right_join(draft_66_fail, by = c("name" = "name"), suffix = c("_pass", "_fail")) ## 35 observations (fail only)

35 Observations Only

str(draft_66_test)
## tibble [35 x 14] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ pick_number_pass: num [1:35] NA NA NA NA NA NA NA NA NA NA ...
##  $ name            : chr [1:35] "Jerry Lee Wells" "Leon Clark" "Stew Johnson" "Ollie Darden" ...
##  $ draft_team_pass : chr [1:35] NA NA NA NA ...
##  $ college_pass    : chr [1:35] NA NA NA NA ...
##  $ nba_g           : num [1:35] NA NA NA NA NA NA NA NA NA NA ...
##  $ nba_pts         : num [1:35] NA NA NA NA NA NA NA NA NA NA ...
##  $ nba_trb         : num [1:35] NA NA NA NA NA NA NA NA NA NA ...
##  $ nba_ast         : num [1:35] NA NA NA NA NA NA NA NA NA NA ...
##  $ nba_blk         : num [1:35] NA NA NA NA NA NA NA NA NA NA ...
##  $ nba_stl         : num [1:35] NA NA NA NA NA NA NA NA NA NA ...
##  $ draft_year      : num [1:35] 1966 1966 1966 1966 1966 ...
##  $ pick_number_fail: num [1:35] 16 18 21 22 23 25 28 30 31 32 ...
##  $ draft_team_fail : chr [1:35] "CIN" "BOS" "NYK" "DET" ...
##  $ college_fail    : chr [1:35] "Oklahoma City University" "University of Wyoming" "Murray State University" "University of Michigan" ...
##  - attr(*, "groups")= tibble [35 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ name : chr [1:35] "Bob McIntyre" "Carroll Hooser" "Charles Schmaus" "Charlie Hunter" ...
##   ..$ .rows: list<int> [1:35] 
##   .. ..$ : int 12
##   .. ..$ : int 27
##   .. ..$ : int 14
##   .. ..$ : int 33
##   .. ..$ : int 6
##   .. ..$ : int 19
##   .. ..$ : int 8
##   .. ..$ : int 7
##   .. ..$ : int 26
##   .. ..$ : int 13
##   .. ..$ : int 30
##   .. ..$ : int 10
##   .. ..$ : int 1
##   .. ..$ : int 28
##   .. ..$ : int 15
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 32
##   .. ..$ : int 25
##   .. ..$ : int 9
##   .. ..$ : int 2
##   .. ..$ : int 29
##   .. ..$ : int 35
##   .. ..$ : int 4
##   .. ..$ : int 34
##   .. ..$ : int 22
##   .. ..$ : int 16
##   .. ..$ : int 23
##   .. ..$ : int 5
##   .. ..$ : int 11
##   .. ..$ : int 31
##   .. ..$ : int 3
##   .. ..$ : int 24
##   .. ..$ : int 18
##   .. ..$ : int 17
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

full join

draft_66_test <- draft_66_pass %>% 
  full_join(draft_66_fail, by = c("name" = "name"), suffix = c("_pass", "_fail")) ## 60 observations (pass + fail)

60 Observations

str(draft_66_test)
## tibble [60 x 14] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ pick_number_pass: num [1:60] 1 2 3 4 5 6 7 8 9 10 ...
##  $ name            : chr [1:60] "Cazzie Russell" "Dave Bing" "Clyde Lee" "Lou Hudson" ...
##  $ draft_team_pass : chr [1:60] "NYK" "DET" "SFW" "STL" ...
##  $ college_pass    : chr [1:60] "University of Michigan" "Syracuse University" "Vanderbilt University" "University of Minnesota" ...
##  $ nba_g           : num [1:60] 817 901 822 890 1002 ...
##  $ nba_pts         : num [1:60] 12377 18327 6204 17940 13922 ...
##  $ nba_trb         : num [1:60] 3068 3420 8383 3926 4885 ...
##  $ nba_ast         : num [1:60] 1838 5397 893 2432 2121 ...
##  $ nba_blk         : num [1:60] 33 89 84 98 96 30 3 46 65 0 ...
##  $ nba_stl         : num [1:60] 239 483 110 516 246 23 11 178 189 0 ...
##  $ draft_year      : num [1:60] NA NA NA NA NA NA NA NA NA NA ...
##  $ pick_number_fail: num [1:60] NA NA NA NA NA NA NA NA NA NA ...
##  $ draft_team_fail : chr [1:60] NA NA NA NA ...
##  $ college_fail    : chr [1:60] NA NA NA NA ...
##  - attr(*, "groups")= tibble [60 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ name : chr [1:60] "Archie Clark" "Bill Melchionni" "Bob McIntyre" "Carroll Hooser" ...
##   ..$ .rows: list<int> [1:60] 
##   .. ..$ : int 23
##   .. ..$ : int 17
##   .. ..$ : int 37
##   .. ..$ : int 52
##   .. ..$ : int 1
##   .. ..$ : int 39
##   .. ..$ : int 58
##   .. ..$ : int 3
##   .. ..$ : int 2
##   .. ..$ : int 10
##   .. ..$ : int 31
##   .. ..$ : int 44
##   .. ..$ : int 14
##   .. ..$ : int 22
##   .. ..$ : int 12
##   .. ..$ : int 33
##   .. ..$ : int 18
##   .. ..$ : int 32
##   .. ..$ : int 51
##   .. ..$ : int 38
##   .. ..$ : int 16
##   .. ..$ : int 11
##   .. ..$ : int 5
##   .. ..$ : int 55
##   .. ..$ : int 35
##   .. ..$ : int 7
##   .. ..$ : int 26
##   .. ..$ : int 8
##   .. ..$ : int 53
##   .. ..$ : int 20
##   .. ..$ : int 40
##   .. ..$ : int 13
##   .. ..$ : int 45
##   .. ..$ : int 21
##   .. ..$ : int 46
##   .. ..$ : int 24
##   .. ..$ : int 57
##   .. ..$ : int 25
##   .. ..$ : int 50
##   .. ..$ : int 34
##   .. ..$ : int 27
##   .. ..$ : int 54
##   .. ..$ : int 4
##   .. ..$ : int 9
##   .. ..$ : int 60
##   .. ..$ : int 15
##   .. ..$ : int 29
##   .. ..$ : int 59
##   .. ..$ : int 47
##   .. ..$ : int 41
##   .. ..$ : int 48
##   .. ..$ : int 30
##   .. ..$ : int 36
##   .. ..$ : int 56
##   .. ..$ : int 28
##   .. ..$ : int 49
##   .. ..$ : int 43
##   .. ..$ : int 19
##   .. ..$ : int 6
##   .. ..$ : int 42
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

Notes

You can stack joins w/ pipes to join 3+ relational datasets in one code run. You can join on multiple match variables at once

full join + filter

draft_66_test <- draft_66_pass %>% 
  full_join(draft_66_fail, by = c("name" = "name"), suffix = c("_pass", "_fail")) %>%
  filter(college_pass=="University of Utah") ## 1 match (players that made the league from the U. of Utah in '66)

1 match (players that made the league from the U. of Utah in ’66)

str(draft_66_test)
## tibble [1 x 14] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ pick_number_pass: num 7
##  $ name            : chr "Jerry Chambers"
##  $ draft_team_pass : chr "LAL"
##  $ college_pass    : chr "University of Utah"
##  $ nba_g           : num 320
##  $ nba_pts         : num 2667
##  $ nba_trb         : num 1032
##  $ nba_ast         : num 270
##  $ nba_blk         : num 3
##  $ nba_stl         : num 11
##  $ draft_year      : num NA
##  $ pick_number_fail: num NA
##  $ draft_team_fail : chr NA
##  $ college_fail    : chr NA
##  - attr(*, "groups")= tibble [1 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ name : chr "Jerry Chambers"
##   ..$ .rows: list<int> [1:1] 
##   .. ..$ : int 1
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE

separate function

draft_66_test <- draft_66_test %>% separate(name,into=c("first_name","last_name"))

unite function

draft_66_test <- draft_66_test %>% unite(name,first_name,last_name,sep="|")