Import your data
Mydata <- read_csv("../00_data/Mydata.csv")
## New names:
## Rows: 65706 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): lake, species, comments, region dbl (4): ...1, year, grand_total, values
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Mydata
## # A tibble: 65,706 × 8
## ...1 year lake species grand_total comments region values
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 1 1991 Erie American Eel 1 <NA> Michigan (MI) 0
## 2 2 1991 Erie American Eel 1 <NA> New York (NY) 0
## 3 3 1991 Erie American Eel 1 <NA> Ohio (OH) 0
## 4 4 1991 Erie American Eel 1 <NA> Pennsylvania (PA) 0
## 5 5 1991 Erie American Eel 1 <NA> U.S. Total 0
## 6 6 1991 Erie American Eel 1 <NA> Canada (ONT) 1
## 7 7 1992 Erie American Eel 0 <NA> Michigan (MI) 0
## 8 8 1992 Erie American Eel 0 <NA> New York (NY) 0
## 9 9 1992 Erie American Eel 0 <NA> Ohio (OH) 0
## 10 10 1992 Erie American Eel 0 <NA> Pennsylvania (PA) 0
## # ℹ 65,696 more rows
Separating and Uniting
Separate a column
Mydata_sep <- Mydata %>%
separate(col = region, into = c("City", "State"))
## Warning: Expected 2 pieces. Additional pieces discarded in 65706 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Most values were successfully separated apart from anomalies
Unite two columns
Mydata_sep %>%
unite(col = "Region", City:State, sep = "/")
## # A tibble: 65,706 × 8
## ...1 year lake species grand_total comments Region values
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 1 1991 Erie American Eel 1 <NA> Michigan/MI 0
## 2 2 1991 Erie American Eel 1 <NA> New/York 0
## 3 3 1991 Erie American Eel 1 <NA> Ohio/OH 0
## 4 4 1991 Erie American Eel 1 <NA> Pennsylvania/PA 0
## 5 5 1991 Erie American Eel 1 <NA> U/S 0
## 6 6 1991 Erie American Eel 1 <NA> Canada/ONT 1
## 7 7 1992 Erie American Eel 0 <NA> Michigan/MI 0
## 8 8 1992 Erie American Eel 0 <NA> New/York 0
## 9 9 1992 Erie American Eel 0 <NA> Ohio/OH 0
## 10 10 1992 Erie American Eel 0 <NA> Pennsylvania/PA 0
## # ℹ 65,696 more rows
# Again most values were successfully separated apart from anomalies
Missing Values
#Use function to change explicit missing values to implicit in a column to get rid of useless rows
library(dplyr)
Mydata %>%
filter(!is.na(grand_total))
## # A tibble: 33,939 × 8
## ...1 year lake species grand_total comments region values
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 1 1991 Erie American Eel 1 <NA> Michigan (MI) 0
## 2 2 1991 Erie American Eel 1 <NA> New York (NY) 0
## 3 3 1991 Erie American Eel 1 <NA> Ohio (OH) 0
## 4 4 1991 Erie American Eel 1 <NA> Pennsylvania (PA) 0
## 5 5 1991 Erie American Eel 1 <NA> U.S. Total 0
## 6 6 1991 Erie American Eel 1 <NA> Canada (ONT) 1
## 7 7 1992 Erie American Eel 0 <NA> Michigan (MI) 0
## 8 8 1992 Erie American Eel 0 <NA> New York (NY) 0
## 9 9 1992 Erie American Eel 0 <NA> Ohio (OH) 0
## 10 10 1992 Erie American Eel 0 <NA> Pennsylvania (PA) 0
## # ℹ 33,929 more rows