library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(stringr)
library(rmarkdown)
library(here)
## here() starts at C:/Users/ajb22/Documents/school/dacss_601
data_path <- paste(here(), "/DACSS601Fall21/_data/", sep="")
knitr::opts_chunk$set(echo = TRUE)
Below is the example we did in class with some extra notes.
# OPTION 1: select names of column names that you want - not good for lots of data
trains_path <- paste(data_path, "StateCounty2012.xls", sep="")
trains <- read_excel(trains_path, skip=3) %>%
select(STATE, COUNTY, TOTAL)
## New names:
## * `` -> ...2
## * `` -> ...4
# further improvements - insert column names and call certain ones delete to remove later
Notes, select is for columns and filter is for rows
# OPTION 2: name columns to make it easy to delete them - be sure to skip the old col names
trains <- read_excel(trains_path, skip=4,
col_names = c("State", "D1", "County", "D2", "Total")) %>%
select(!starts_with("d")) %>%
filter(!str_detect(State, "[Tt]otal")) #will look for both cases Total or total
trains
## # A tibble: 2,933 x 3
## State County Total
## <chr> <chr> <dbl>
## 1 AE APO 2
## 2 AK ANCHORAGE 7
## 3 AK FAIRBANKS NORTH STAR 2
## 4 AK JUNEAU 3
## 5 AK MATANUSKA-SUSITNA 2
## 6 AK SITKA 1
## 7 AK SKAGWAY MUNICIPALITY 88
## 8 AL AUTAUGA 102
## 9 AL BALDWIN 143
## 10 AL BARBOUR 1
## # ... with 2,923 more rows
Now we want to take a unstructured dataset and convert it to “tidy” data. This example will use the organiceggpoultry.xlsx found in the "_data" folder. The first step is to define what the end result data should look like. We can figure this out by defining the case. A case encompasses the things that uniquely identify a value. For the Organic Poultry dataset, the case is the product, year, and month. For this particular dataset, it makes sense to further break down the product into two groups, chicken and egg products. The value for these two groups is the price for some quantity.
First let us read in the data by defining the data path from our base path, "C:/Users/ajb22/Documents/school/dacss_601/DACSS601Fall21/_data/".
# here we
# 1. skip the empty rows and the inaccurate header row
# 2. we rename the first column with the header "egg_month"
eggs_path <- paste(data_path, "organiceggpoultry.xls", sep="")
eggs_dirty <- read_excel(eggs_path, sheet=1, skip=4) %>%
rename(egg_month = 1)
## New names:
## * `` -> ...1
## * `` -> ...6
# show the raw data
print(eggs_dirty, width = Inf)
## # A tibble: 120 x 11
## egg_month `Extra Large \nDozen` `Extra Large 1/2 Doz.\n1/2 Dozen`
## <chr> <dbl> <dbl>
## 1 Jan 2004 230 132
## 2 February 230 134.
## 3 March 230 137
## 4 April 234. 137
## 5 May 236 137
## 6 June 241 137
## 7 July 241 137
## 8 August 241 137
## 9 September 241 136.
## 10 October 241 136.
## `Large \nDozen` `Large \n1/2 Doz.` ...6 Whole `B/S Breast` `Bone-in Breast`
## <dbl> <dbl> <lgl> <dbl> <dbl> <chr>
## 1 230 126 NA 198. 646. too few
## 2 226. 128. NA 198. 642. too few
## 3 225 131 NA 209 642. too few
## 4 225 131 NA 212 642. too few
## 5 225 131 NA 214. 642. too few
## 6 231. 134. NA 216. 641 too few
## 7 234. 134. NA 217 642. 390.5
## 8 234. 134. NA 217 642. 390.5
## 9 234. 130. NA 217 642. 390.5
## 10 234. 128. NA 217 642. 390.5
## `Whole Legs` Thighs
## <dbl> <chr>
## 1 194. too few
## 2 194. 203
## 3 194. 203
## 4 194. 203
## 5 194. 203
## 6 202. 200.375
## 7 204. 199.5
## 8 204. 199.5
## 9 204. 199.5
## 10 204. 199.5
## # ... with 110 more rows
After successfully reading in the data, it is time to start cleaning up the data. A good first step is to delete any unnecessary columns. First we will delete the columns that only contain “NA” as the data using the comparison of the amount of empty rows to the total number of rows.
# we then take out any columns that have only NA - compare #NA to #rows
eggs_dirty <- eggs_dirty %>% select_if(!colSums(is.na(eggs_dirty)) == nrow(eggs_dirty))
eggs_dirty
## # A tibble: 120 x 10
## egg_month `Extra Large \nD~ `Extra Large 1/~ `Large \nDozen` `Large \n1/2 Do~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Jan 2004 230 132 230 126
## 2 February 230 134. 226. 128.
## 3 March 230 137 225 131
## 4 April 234. 137 225 131
## 5 May 236 137 225 131
## 6 June 241 137 231. 134.
## 7 July 241 137 234. 134.
## 8 August 241 137 234. 134.
## 9 September 241 136. 234. 130.
## 10 October 241 136. 234. 128.
## # ... with 110 more rows, and 5 more variables: Whole <dbl>, B/S Breast <dbl>,
## # Bone-in Breast <chr>, Whole Legs <dbl>, Thighs <chr>
Next we make everything the same data type, so that we can pivot. Then we pivot the columns to make them longer and to contain the products as a variable. Then we replace any response that is not valid, such as “too few” with 0.
# convert all types to characters so we can pivot, then we pivot the products to price and replace all of the "too few" responses in price to 0
eggs_dirty <- eggs_dirty %>% mutate(across(where(is.double), as.character)) %>%
pivot_longer(cols = contains("doz"),
names_to = "egg products",
values_to = "price per carton") %>%
pivot_longer(cols=contains(c("whole", "breast", "leg", "thigh")),
names_to = "chicken products",
values_to = "price per lb") %>%
mutate(`price per carton` = str_replace(`price per carton`, "[a-zA-Z ]+", "0")) %>%
mutate(`price per lb`= str_replace(`price per lb`, "[a-zA-Z ]+", "0"))
eggs_dirty
## # A tibble: 2,400 x 5
## egg_month `egg products` `price per cart~ `chicken produc~ `price per lb`
## <chr> <chr> <chr> <chr> <chr>
## 1 Jan 2004 "Extra Large \nDo~ 230 Whole 197.5
## 2 Jan 2004 "Extra Large \nDo~ 230 Whole Legs 193.5
## 3 Jan 2004 "Extra Large \nDo~ 230 B/S Breast 645.5
## 4 Jan 2004 "Extra Large \nDo~ 230 Bone-in Breast 0
## 5 Jan 2004 "Extra Large \nDo~ 230 Thighs 0
## 6 Jan 2004 "Extra Large 1/2 ~ 132 Whole 197.5
## 7 Jan 2004 "Extra Large 1/2 ~ 132 Whole Legs 193.5
## 8 Jan 2004 "Extra Large 1/2 ~ 132 B/S Breast 645.5
## 9 Jan 2004 "Extra Large 1/2 ~ 132 Bone-in Breast 0
## 10 Jan 2004 "Extra Large 1/2 ~ 132 Thighs 0
## # ... with 2,390 more rows
Now we separate out the month and the year within the first column to make it into two separate columns.
# take out any of the "/1" that got filled in instead of the year (month (o)year|/1) to (month (o)year), then separate the column egg_month to month and year
eggs_dirty <- eggs_dirty %>%
mutate(egg_month = str_remove(egg_month, "/[0-9]+")) %>%
separate(egg_month, c("month", "year"),
extra = "drop", fill = "right")
# filling in the year from the jan month to the rest of the months, default direction is down
eggs_dirty <- eggs_dirty %>% fill(year)
eggs_dirty
## # A tibble: 2,400 x 6
## month year `egg products` `price per cart~ `chicken produc~ `price per lb`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Jan 2004 "Extra Large \n~ 230 Whole 197.5
## 2 Jan 2004 "Extra Large \n~ 230 Whole Legs 193.5
## 3 Jan 2004 "Extra Large \n~ 230 B/S Breast 645.5
## 4 Jan 2004 "Extra Large \n~ 230 Bone-in Breast 0
## 5 Jan 2004 "Extra Large \n~ 230 Thighs 0
## 6 Jan 2004 "Extra Large 1/~ 132 Whole 197.5
## 7 Jan 2004 "Extra Large 1/~ 132 Whole Legs 193.5
## 8 Jan 2004 "Extra Large 1/~ 132 B/S Breast 645.5
## 9 Jan 2004 "Extra Large 1/~ 132 Bone-in Breast 0
## 10 Jan 2004 "Extra Large 1/~ 132 Thighs 0
## # ... with 2,390 more rows
Now we can return the value of price back to the double data type instead of a character. We can further process this data by breaking up the products. However, this might be excessive, since there is not a main pattern between the product types.
# further separate the products into type and amount?? not sure
eggs_clean <- eggs_dirty %>%
mutate(`price per carton`, `price per carton`= as.double(`price per carton`)) %>%
mutate(`price per lb`, `price per lb`= as.double(`price per lb`))
# show the clean data
eggs_clean
## # A tibble: 2,400 x 6
## month year `egg products` `price per cart~ `chicken produc~ `price per lb`
## <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Jan 2004 "Extra Large \n~ 230 Whole 198.
## 2 Jan 2004 "Extra Large \n~ 230 Whole Legs 194.
## 3 Jan 2004 "Extra Large \n~ 230 B/S Breast 646.
## 4 Jan 2004 "Extra Large \n~ 230 Bone-in Breast 0
## 5 Jan 2004 "Extra Large \n~ 230 Thighs 0
## 6 Jan 2004 "Extra Large 1/~ 132 Whole 198.
## 7 Jan 2004 "Extra Large 1/~ 132 Whole Legs 194.
## 8 Jan 2004 "Extra Large 1/~ 132 B/S Breast 646.
## 9 Jan 2004 "Extra Large 1/~ 132 Bone-in Breast 0
## 10 Jan 2004 "Extra Large 1/~ 132 Thighs 0
## # ... with 2,390 more rows