https://www.hockey-reference.com/leagues/NHL_2023_skaters.html#stats::goals
# excel file
colony <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv')
## Rows: 1222 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): months, state
## dbl (8): year, colony_n, colony_max, colony_lost, colony_lost_pct, colony_ad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colony
## # A tibble: 1,222 × 10
## year months state colon…¹ colon…² colon…³ colon…⁴ colon…⁵ colon…⁶ colon…⁷
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2015 January-… Alab… 7000 7000 1800 26 2800 250 4
## 2 2015 January-… Ariz… 35000 35000 4600 13 3400 2100 6
## 3 2015 January-… Arka… 13000 14000 1500 11 1200 90 1
## 4 2015 January-… Cali… 1440000 1690000 255000 15 250000 124000 7
## 5 2015 January-… Colo… 3500 12500 1500 12 200 140 1
## 6 2015 January-… Conn… 3900 3900 870 22 290 NA NA
## 7 2015 January-… Flor… 305000 315000 42000 13 54000 25000 8
## 8 2015 January-… Geor… 104000 105000 14500 14 47000 9500 9
## 9 2015 January-… Hawa… 10500 10500 380 4 3400 760 7
## 10 2015 January-… Idaho 81000 88000 3700 4 2600 8000 9
## # … with 1,212 more rows, and abbreviated variable names ¹colony_n,
## # ²colony_max, ³colony_lost, ⁴colony_lost_pct, ⁵colony_added, ⁶colony_reno,
## # ⁷colony_reno_pct
skimr::skim(colony)
Name | colony |
Number of rows | 1222 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 2 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
months | 0 | 1 | 10 | 16 | 0 | 4 | 0 |
state | 0 | 1 | 4 | 14 | 0 | 47 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
year | 0 | 1.00 | 2017.77 | 1.89 | 2015 | 2016 | 2018 | 2019 | 2021 | ▇▃▃▃▆ |
colony_n | 47 | 0.96 | 123578.04 | 437835.18 | 1300 | 8000 | 17500 | 55500 | 3181180 | ▇▁▁▁▁ |
colony_max | 72 | 0.94 | 79112.77 | 190823.42 | 1700 | 9000 | 21000 | 68750 | 1710000 | ▇▁▁▁▁ |
colony_lost | 47 | 0.96 | 16551.32 | 60544.42 | 20 | 950 | 2200 | 6500 | 502350 | ▇▁▁▁▁ |
colony_lost_pct | 54 | 0.96 | 11.38 | 7.23 | 1 | 6 | 10 | 15 | 52 | ▇▅▁▁▁ |
colony_added | 83 | 0.93 | 17243.20 | 68167.65 | 10 | 420 | 1800 | 6500 | 736920 | ▇▁▁▁▁ |
colony_reno | 131 | 0.89 | 15278.86 | 62588.04 | 10 | 260 | 960 | 4585 | 806170 | ▇▁▁▁▁ |
colony_reno_pct | 260 | 0.79 | 9.10 | 9.66 | 1 | 2 | 6 | 12 | 77 | ▇▁▁▁▁ |
set.seed(123)
colony_small <- colony %>%
sample_n(10) %>%
select(year, state, colony_lost)
colony_small
## # A tibble: 10 × 3
## year state colony_lost
## <dbl> <chr> <dbl>
## 1 2017 Utah 2700
## 2 2017 Vermont 170
## 3 2015 Texas 25000
## 4 2017 Hawaii 130
## 5 2016 Florida 45000
## 6 2019 Wyoming 3300
## 7 2021 Kansas 1400
## 8 2020 California 69000
## 9 2018 Florida 30000
## 10 2018 Texas 22000
colony_wide <- colony_small %>%
pivot_wider(names_from = year,
values_from = colony_lost)
colony_wide
## # A tibble: 8 × 8
## state `2017` `2015` `2016` `2019` `2021` `2020` `2018`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Utah 2700 NA NA NA NA NA NA
## 2 Vermont 170 NA NA NA NA NA NA
## 3 Texas NA 25000 NA NA NA NA 22000
## 4 Hawaii 130 NA NA NA NA NA NA
## 5 Florida NA NA 45000 NA NA NA 30000
## 6 Wyoming NA NA NA 3300 NA NA NA
## 7 Kansas NA NA NA NA 1400 NA NA
## 8 California NA NA NA NA NA 69000 NA
colony_wide %>%
pivot_longer(cols = `2017`:`2018`,
names_to = "year",
values_to = "colony_lost",
values_drop_na = TRUE)
## # A tibble: 10 × 3
## state year colony_lost
## <chr> <chr> <dbl>
## 1 Utah 2017 2700
## 2 Vermont 2017 170
## 3 Texas 2015 25000
## 4 Texas 2018 22000
## 5 Hawaii 2017 130
## 6 Florida 2016 45000
## 7 Florida 2018 30000
## 8 Wyoming 2019 3300
## 9 Kansas 2021 1400
## 10 California 2020 69000
colony_small %>%
unite(col = "state_colony", c(state, colony_lost), sep = "/")
## # A tibble: 10 × 2
## year state_colony
## <dbl> <chr>
## 1 2017 Utah/2700
## 2 2017 Vermont/170
## 3 2015 Texas/25000
## 4 2017 Hawaii/130
## 5 2016 Florida/45000
## 6 2019 Wyoming/3300
## 7 2021 Kansas/1400
## 8 2020 California/69000
## 9 2018 Florida/30000
## 10 2018 Texas/22000
colony_small %>%
separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 10 × 4
## century year state colony_lost
## <chr> <chr> <chr> <dbl>
## 1 20 17 Utah 2700
## 2 20 17 Vermont 170
## 3 20 15 Texas 25000
## 4 20 17 Hawaii 130
## 5 20 16 Florida 45000
## 6 20 19 Wyoming 3300
## 7 20 21 Kansas 1400
## 8 20 20 California 69000
## 9 20 18 Florida 30000
## 10 20 18 Texas 22000