Import data

https://www.hockey-reference.com/leagues/NHL_2023_skaters.html#stats::goals

# excel file
colony <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv')
## Rows: 1222 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): months, state
## dbl (8): year, colony_n, colony_max, colony_lost, colony_lost_pct, colony_ad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colony
## # A tibble: 1,222 × 10
##     year months    state colon…¹ colon…² colon…³ colon…⁴ colon…⁵ colon…⁶ colon…⁷
##    <dbl> <chr>     <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1  2015 January-… Alab…    7000    7000    1800      26    2800     250       4
##  2  2015 January-… Ariz…   35000   35000    4600      13    3400    2100       6
##  3  2015 January-… Arka…   13000   14000    1500      11    1200      90       1
##  4  2015 January-… Cali… 1440000 1690000  255000      15  250000  124000       7
##  5  2015 January-… Colo…    3500   12500    1500      12     200     140       1
##  6  2015 January-… Conn…    3900    3900     870      22     290      NA      NA
##  7  2015 January-… Flor…  305000  315000   42000      13   54000   25000       8
##  8  2015 January-… Geor…  104000  105000   14500      14   47000    9500       9
##  9  2015 January-… Hawa…   10500   10500     380       4    3400     760       7
## 10  2015 January-… Idaho   81000   88000    3700       4    2600    8000       9
## # … with 1,212 more rows, and abbreviated variable names ¹​colony_n,
## #   ²​colony_max, ³​colony_lost, ⁴​colony_lost_pct, ⁵​colony_added, ⁶​colony_reno,
## #   ⁷​colony_reno_pct
skimr::skim(colony)
Data summary
Name colony
Number of rows 1222
Number of columns 10
_______________________
Column type frequency:
character 2
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
months 0 1 10 16 0 4 0
state 0 1 4 14 0 47 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2017.77 1.89 2015 2016 2018 2019 2021 ▇▃▃▃▆
colony_n 47 0.96 123578.04 437835.18 1300 8000 17500 55500 3181180 ▇▁▁▁▁
colony_max 72 0.94 79112.77 190823.42 1700 9000 21000 68750 1710000 ▇▁▁▁▁
colony_lost 47 0.96 16551.32 60544.42 20 950 2200 6500 502350 ▇▁▁▁▁
colony_lost_pct 54 0.96 11.38 7.23 1 6 10 15 52 ▇▅▁▁▁
colony_added 83 0.93 17243.20 68167.65 10 420 1800 6500 736920 ▇▁▁▁▁
colony_reno 131 0.89 15278.86 62588.04 10 260 960 4585 806170 ▇▁▁▁▁
colony_reno_pct 260 0.79 9.10 9.66 1 2 6 12 77 ▇▁▁▁▁
set.seed(123)

colony_small <- colony %>% 
    sample_n(10) %>% 
    select(year, state, colony_lost)

colony_small
## # A tibble: 10 × 3
##     year state      colony_lost
##    <dbl> <chr>            <dbl>
##  1  2017 Utah              2700
##  2  2017 Vermont            170
##  3  2015 Texas            25000
##  4  2017 Hawaii             130
##  5  2016 Florida          45000
##  6  2019 Wyoming           3300
##  7  2021 Kansas            1400
##  8  2020 California       69000
##  9  2018 Florida          30000
## 10  2018 Texas            22000

Pivoting

long to wide form

colony_wide <- colony_small %>%

    pivot_wider(names_from = year, 
                values_from = colony_lost)

colony_wide
## # A tibble: 8 × 8
##   state      `2017` `2015` `2016` `2019` `2021` `2020` `2018`
##   <chr>       <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Utah         2700     NA     NA     NA     NA     NA     NA
## 2 Vermont       170     NA     NA     NA     NA     NA     NA
## 3 Texas          NA  25000     NA     NA     NA     NA  22000
## 4 Hawaii        130     NA     NA     NA     NA     NA     NA
## 5 Florida        NA     NA  45000     NA     NA     NA  30000
## 6 Wyoming        NA     NA     NA   3300     NA     NA     NA
## 7 Kansas         NA     NA     NA     NA   1400     NA     NA
## 8 California     NA     NA     NA     NA     NA  69000     NA

wide to long form

colony_wide %>% 
    pivot_longer(cols           = `2017`:`2018`, 
                 names_to       = "year", 
                 values_to      = "colony_lost", 
                 values_drop_na = TRUE)
## # A tibble: 10 × 3
##    state      year  colony_lost
##    <chr>      <chr>       <dbl>
##  1 Utah       2017         2700
##  2 Vermont    2017          170
##  3 Texas      2015        25000
##  4 Texas      2018        22000
##  5 Hawaii     2017          130
##  6 Florida    2016        45000
##  7 Florida    2018        30000
##  8 Wyoming    2019         3300
##  9 Kansas     2021         1400
## 10 California 2020        69000

Separating and Uniting

Unite two columns

colony_small %>%
    
    unite(col = "state_colony", c(state, colony_lost), sep = "/")
## # A tibble: 10 × 2
##     year state_colony    
##    <dbl> <chr>           
##  1  2017 Utah/2700       
##  2  2017 Vermont/170     
##  3  2015 Texas/25000     
##  4  2017 Hawaii/130      
##  5  2016 Florida/45000   
##  6  2019 Wyoming/3300    
##  7  2021 Kansas/1400     
##  8  2020 California/69000
##  9  2018 Florida/30000   
## 10  2018 Texas/22000

Separate a column

colony_small %>% 
  separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 10 × 4
##    century year  state      colony_lost
##    <chr>   <chr> <chr>            <dbl>
##  1 20      17    Utah              2700
##  2 20      17    Vermont            170
##  3 20      15    Texas            25000
##  4 20      17    Hawaii             130
##  5 20      16    Florida          45000
##  6 20      19    Wyoming           3300
##  7 20      21    Kansas            1400
##  8 20      20    California       69000
##  9 20      18    Florida          30000
## 10 20      18    Texas            22000

Missing Values