library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(stringr)
library(rmarkdown)
library(here)
## here() starts at C:/Users/ajb22/Documents/school/dacss_601
data_path <- paste(here(), "/DACSS601Fall21/_data/", sep="")
knitr::opts_chunk$set(echo = TRUE)

Read In Advance Dataset: Railroad

Below is the example we did in class with some extra notes.

# OPTION 1: select names of column names that you want - not good for lots of data 
trains_path <- paste(data_path, "StateCounty2012.xls", sep="")
trains <- read_excel(trains_path, skip=3) %>% 
  select(STATE, COUNTY, TOTAL)
## New names:
## * `` -> ...2
## * `` -> ...4
# further improvements - insert column names and call certain ones delete to remove later

Notes, select is for columns and filter is for rows

# OPTION 2: name columns to make it easy to delete them  - be sure to skip the old col names
trains <- read_excel(trains_path, skip=4, 
                     col_names = c("State", "D1", "County", "D2", "Total")) %>% 
  select(!starts_with("d")) %>% 
  filter(!str_detect(State, "[Tt]otal")) #will look for both cases Total or total
trains
## # A tibble: 2,933 x 3
##    State County               Total
##    <chr> <chr>                <dbl>
##  1 AE    APO                      2
##  2 AK    ANCHORAGE                7
##  3 AK    FAIRBANKS NORTH STAR     2
##  4 AK    JUNEAU                   3
##  5 AK    MATANUSKA-SUSITNA        2
##  6 AK    SITKA                    1
##  7 AK    SKAGWAY MUNICIPALITY    88
##  8 AL    AUTAUGA                102
##  9 AL    BALDWIN                143
## 10 AL    BARBOUR                  1
## # ... with 2,923 more rows

Now we want to take a unstructured dataset and convert it to “tidy” data. This example will use the organiceggpoultry.xlsx found in the "_data" folder. The first step is to define what the end result data should look like. We can figure this out by defining the case. A case encompasses the things that uniquely identify a value. For the Organic Poultry dataset, the case is the product, year, and month. For this particular dataset, it makes sense to further break down the product into two groups, chicken and egg products. The value for these two groups is the price for some quantity.

First let us read in the data by defining the data path from our base path, "C:/Users/ajb22/Documents/school/dacss_601/DACSS601Fall21/_data/".

# here we 
# 1. skip the empty rows and the inaccurate header row
# 2. we rename the first column with the header "egg_month"
eggs_path <- paste(data_path, "organiceggpoultry.xls", sep="")
eggs_dirty <- read_excel(eggs_path, sheet=1, skip=4) %>%
  rename(egg_month = 1) 
## New names:
## * `` -> ...1
## * `` -> ...6
# show the raw data 
print(eggs_dirty, width = Inf)
## # A tibble: 120 x 11
##    egg_month `Extra Large \nDozen` `Extra Large 1/2 Doz.\n1/2 Dozen`
##    <chr>                     <dbl>                             <dbl>
##  1 Jan 2004                   230                               132 
##  2 February                   230                               134.
##  3 March                      230                               137 
##  4 April                      234.                              137 
##  5 May                        236                               137 
##  6 June                       241                               137 
##  7 July                       241                               137 
##  8 August                     241                               137 
##  9 September                  241                               136.
## 10 October                    241                               136.
##    `Large \nDozen` `Large \n1/2 Doz.` ...6  Whole `B/S Breast` `Bone-in Breast`
##              <dbl>              <dbl> <lgl> <dbl>        <dbl> <chr>           
##  1            230                126  NA     198.         646. too few         
##  2            226.               128. NA     198.         642. too few         
##  3            225                131  NA     209          642. too few         
##  4            225                131  NA     212          642. too few         
##  5            225                131  NA     214.         642. too few         
##  6            231.               134. NA     216.         641  too few         
##  7            234.               134. NA     217          642. 390.5           
##  8            234.               134. NA     217          642. 390.5           
##  9            234.               130. NA     217          642. 390.5           
## 10            234.               128. NA     217          642. 390.5           
##    `Whole Legs` Thighs 
##           <dbl> <chr>  
##  1         194. too few
##  2         194. 203    
##  3         194. 203    
##  4         194. 203    
##  5         194. 203    
##  6         202. 200.375
##  7         204. 199.5  
##  8         204. 199.5  
##  9         204. 199.5  
## 10         204. 199.5  
## # ... with 110 more rows

After successfully reading in the data, it is time to start cleaning up the data. A good first step is to delete any unnecessary columns. First we will delete the columns that only contain “NA” as the data using the comparison of the amount of empty rows to the total number of rows.

# we then take out any columns that have only NA - compare #NA to #rows
eggs_dirty <- eggs_dirty %>% select_if(!colSums(is.na(eggs_dirty)) == nrow(eggs_dirty))
eggs_dirty
## # A tibble: 120 x 10
##    egg_month `Extra Large \nD~ `Extra Large 1/~ `Large \nDozen` `Large \n1/2 Do~
##    <chr>                 <dbl>            <dbl>           <dbl>            <dbl>
##  1 Jan 2004               230              132             230              126 
##  2 February               230              134.            226.             128.
##  3 March                  230              137             225              131 
##  4 April                  234.             137             225              131 
##  5 May                    236              137             225              131 
##  6 June                   241              137             231.             134.
##  7 July                   241              137             234.             134.
##  8 August                 241              137             234.             134.
##  9 September              241              136.            234.             130.
## 10 October                241              136.            234.             128.
## # ... with 110 more rows, and 5 more variables: Whole <dbl>, B/S Breast <dbl>,
## #   Bone-in Breast <chr>, Whole Legs <dbl>, Thighs <chr>

Next we make everything the same data type, so that we can pivot. Then we pivot the columns to make them longer and to contain the products as a variable. Then we replace any response that is not valid, such as “too few” with 0.

# convert all types to characters so we can pivot, then we pivot the products to price and replace all of the "too few" responses in price to 0
eggs_dirty <- eggs_dirty %>% mutate(across(where(is.double), as.character)) %>% 
  pivot_longer(cols = contains("doz"), 
               names_to = "egg products", 
               values_to = "price per carton") %>% 
  pivot_longer(cols=contains(c("whole", "breast", "leg", "thigh")), 
               names_to = "chicken products", 
               values_to = "price per lb") %>% 
  mutate(`price per carton` = str_replace(`price per carton`, "[a-zA-Z ]+", "0")) %>% 
  mutate(`price per lb`= str_replace(`price per lb`, "[a-zA-Z ]+", "0"))
eggs_dirty
## # A tibble: 2,400 x 5
##    egg_month `egg products`     `price per cart~ `chicken produc~ `price per lb`
##    <chr>     <chr>              <chr>            <chr>            <chr>         
##  1 Jan 2004  "Extra Large \nDo~ 230              Whole            197.5         
##  2 Jan 2004  "Extra Large \nDo~ 230              Whole Legs       193.5         
##  3 Jan 2004  "Extra Large \nDo~ 230              B/S Breast       645.5         
##  4 Jan 2004  "Extra Large \nDo~ 230              Bone-in Breast   0             
##  5 Jan 2004  "Extra Large \nDo~ 230              Thighs           0             
##  6 Jan 2004  "Extra Large 1/2 ~ 132              Whole            197.5         
##  7 Jan 2004  "Extra Large 1/2 ~ 132              Whole Legs       193.5         
##  8 Jan 2004  "Extra Large 1/2 ~ 132              B/S Breast       645.5         
##  9 Jan 2004  "Extra Large 1/2 ~ 132              Bone-in Breast   0             
## 10 Jan 2004  "Extra Large 1/2 ~ 132              Thighs           0             
## # ... with 2,390 more rows

Now we separate out the month and the year within the first column to make it into two separate columns.

# take out any of the "/1" that got filled in instead of the year (month (o)year|/1) to (month (o)year), then separate the column egg_month to month and year  
eggs_dirty <- eggs_dirty %>% 
  mutate(egg_month = str_remove(egg_month, "/[0-9]+")) %>% 
  separate(egg_month, c("month", "year"), 
           extra = "drop", fill = "right")

# filling in the year from the jan month to the rest of the months, default direction is down
eggs_dirty <- eggs_dirty %>% fill(year) 
eggs_dirty
## # A tibble: 2,400 x 6
##    month year  `egg products`   `price per cart~ `chicken produc~ `price per lb`
##    <chr> <chr> <chr>            <chr>            <chr>            <chr>         
##  1 Jan   2004  "Extra Large \n~ 230              Whole            197.5         
##  2 Jan   2004  "Extra Large \n~ 230              Whole Legs       193.5         
##  3 Jan   2004  "Extra Large \n~ 230              B/S Breast       645.5         
##  4 Jan   2004  "Extra Large \n~ 230              Bone-in Breast   0             
##  5 Jan   2004  "Extra Large \n~ 230              Thighs           0             
##  6 Jan   2004  "Extra Large 1/~ 132              Whole            197.5         
##  7 Jan   2004  "Extra Large 1/~ 132              Whole Legs       193.5         
##  8 Jan   2004  "Extra Large 1/~ 132              B/S Breast       645.5         
##  9 Jan   2004  "Extra Large 1/~ 132              Bone-in Breast   0             
## 10 Jan   2004  "Extra Large 1/~ 132              Thighs           0             
## # ... with 2,390 more rows

Now we can return the value of price back to the double data type instead of a character. We can further process this data by breaking up the products. However, this might be excessive, since there is not a main pattern between the product types.

# further separate the products into type and amount?? not sure 
eggs_clean <- eggs_dirty %>% 
  mutate(`price per carton`, `price per carton`= as.double(`price per carton`)) %>% 
  mutate(`price per lb`, `price per lb`= as.double(`price per lb`))

# show the clean data 
eggs_clean
## # A tibble: 2,400 x 6
##    month year  `egg products`   `price per cart~ `chicken produc~ `price per lb`
##    <chr> <chr> <chr>                       <dbl> <chr>                     <dbl>
##  1 Jan   2004  "Extra Large \n~              230 Whole                      198.
##  2 Jan   2004  "Extra Large \n~              230 Whole Legs                 194.
##  3 Jan   2004  "Extra Large \n~              230 B/S Breast                 646.
##  4 Jan   2004  "Extra Large \n~              230 Bone-in Breast               0 
##  5 Jan   2004  "Extra Large \n~              230 Thighs                       0 
##  6 Jan   2004  "Extra Large 1/~              132 Whole                      198.
##  7 Jan   2004  "Extra Large 1/~              132 Whole Legs                 194.
##  8 Jan   2004  "Extra Large 1/~              132 B/S Breast                 646.
##  9 Jan   2004  "Extra Large 1/~              132 Bone-in Breast               0 
## 10 Jan   2004  "Extra Large 1/~              132 Thighs                       0 
## # ... with 2,390 more rows