colony <- read_excel("../00_data/myData.xlsx")
set.seed(123)
colony_small <- colony %>%
sample_n(10) %>%
select(year, state, colony_lost)
colony_small
## # A tibble: 10 × 3
## year state colony_lost
## <dbl> <chr> <dbl>
## 1 2017 Utah 2700
## 2 2017 Vermont 170
## 3 2015 Texas 25000
## 4 2017 Hawaii 130
## 5 2016 Florida 45000
## 6 2019 Wyoming 3300
## 7 2021 Kansas 1400
## 8 2020 California 69000
## 9 2018 Florida 30000
## 10 2018 Texas 22000
colony_small %>% count(state, colony_lost)
## # A tibble: 10 × 3
## state colony_lost n
## <chr> <dbl> <int>
## 1 California 69000 1
## 2 Florida 30000 1
## 3 Florida 45000 1
## 4 Hawaii 130 1
## 5 Kansas 1400 1
## 6 Texas 22000 1
## 7 Texas 25000 1
## 8 Utah 2700 1
## 9 Vermont 170 1
## 10 Wyoming 3300 1
colony_small %>% count(state, colony_lost) %>% filter(n > 1)
## # A tibble: 0 × 3
## # … with 3 variables: state <chr>, colony_lost <dbl>, n <int>
Divide it using dplyr::select in a way the two have a common variable, which you could use to join the two.
colony_year.state <- colony_small %>% select(year:state)
colony_year.state
## # A tibble: 10 × 2
## year state
## <dbl> <chr>
## 1 2017 Utah
## 2 2017 Vermont
## 3 2015 Texas
## 4 2017 Hawaii
## 5 2016 Florida
## 6 2019 Wyoming
## 7 2021 Kansas
## 8 2020 California
## 9 2018 Florida
## 10 2018 Texas
colony_state.lost <- colony_small %>% select(state:colony_lost)
colony_state.lost
## # A tibble: 10 × 2
## state colony_lost
## <chr> <dbl>
## 1 Utah 2700
## 2 Vermont 170
## 3 Texas 25000
## 4 Hawaii 130
## 5 Florida 45000
## 6 Wyoming 3300
## 7 Kansas 1400
## 8 California 69000
## 9 Florida 30000
## 10 Texas 22000
Use tidyr::left_join or other joining functions.
left_join(colony_year.state, colony_state.lost)
## Joining with `by = join_by(state)`
## Warning in left_join(colony_year.state, colony_state.lost): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 3 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
## warning.
## # A tibble: 14 × 3
## year state colony_lost
## <dbl> <chr> <dbl>
## 1 2017 Utah 2700
## 2 2017 Vermont 170
## 3 2015 Texas 25000
## 4 2015 Texas 22000
## 5 2017 Hawaii 130
## 6 2016 Florida 45000
## 7 2016 Florida 30000
## 8 2019 Wyoming 3300
## 9 2021 Kansas 1400
## 10 2020 California 69000
## 11 2018 Florida 45000
## 12 2018 Florida 30000
## 13 2018 Texas 25000
## 14 2018 Texas 22000
x <- c("state", "months", "colony_lost")
str_detect(x, "s")
## [1] TRUE TRUE TRUE
str_extract(x, "state")
## [1] "state" NA NA
str_replace(x, "months", "NA")
## [1] "state" "NA" "colony_lost"