Files and data stored at: https://github.com/rweberc/Data607_Project2

Electricity demand (contributed by Rose Koh)

Net actual interchange (electric): The algebraic sum of all metered interchange over all interconnections between two physically Adjacent Balancing Authority Areas. NERC definition https://www.eia.gov/tools/glossary/?id=electricity

Read in data

elecDf <- read.csv("https://raw.githubusercontent.com/rweberc/Data607_Project2/master/electricity.csv", stringsAsFactors = FALSE, colClasses = "character", skip = 4)

Confirm data structure:

  • Every second line “Demand”
  • Every third line “”
  • Every fourth line “Total net actual interchange”
stopifnot(all(elecDf$megawatthours[c(FALSE, TRUE, FALSE, FALSE)] == "Demand"))
stopifnot(all(elecDf$megawatthours[c(FALSE, FALSE, TRUE, FALSE)] == "Net generation"))
stopifnot(all(elecDf$megawatthours[c(FALSE, FALSE, FALSE, TRUE)] == "Total net actual interchange"))

Extract region names

elecDf$region <- NA
elecDf$region[c(TRUE, FALSE, FALSE, FALSE)] <- elecDf$megawatthours[c(TRUE, FALSE, FALSE, FALSE)]

elecDf <- elecDf %>% fill(region)
table(elecDf$region, useNA = "ifany")
## 
##                           California (region) 
##                                             4 
##                            Carolinas (region) 
##                                             4 
##                              Central (region) 
##                                             4 
##         Electric Reliability Council of Texas 
##                                             4 
##                              Florida (region) 
##                                             4 
##                         Mid-Atlantic (region) 
##                                             4 
##                              Midwest (region) 
##                                             4 
##                      New England ISO (region) 
##                                             4 
## New York Independent System Operator (region) 
##                                             4 
##                            Northwest (region) 
##                                             4 
##                            Southeast (region) 
##                                             4 
##                            Southwest (region) 
##                                             4 
##           Tennessee Valley Authority (region) 
##                                             4 
##               United States Lower 48 (region) 
##                                             4
elecDf$region[c(TRUE, FALSE, FALSE, FALSE)] <- ""

Update column names

elecDf <- elecDf %>% rename(category = megawatthours)
elecDf <- elecDf %>% select(category, region, everything())

colnames(elecDf) <- str_replace(colnames(elecDf), "X", "")
colnames(elecDf) <- str_replace_all(colnames(elecDf), "\\.", "/")

colnames(elecDf)
##  [1] "category"   "region"     "01/01/2018" "01/02/2018" "01/03/2018"
##  [6] "01/04/2018" "01/05/2018" "01/06/2018" "01/07/2018" "01/08/2018"
## [11] "01/09/2018" "01/10/2018" "01/11/2018" "01/12/2018" "01/13/2018"
## [16] "01/14/2018" "01/15/2018" "01/16/2018" "01/17/2018" "01/18/2018"
## [21] "01/19/2018" "01/20/2018" "01/21/2018" "01/22/2018" "01/23/2018"
## [26] "01/24/2018" "01/25/2018" "01/26/2018" "01/27/2018" "01/28/2018"
## [31] "01/29/2018" "01/30/2018" "01/31/2018"

Wide to long

elecDf_long <- elecDf %>% gather("evalDay", "megawatthours", 3:length(.))

Clean data in long form

# data check
elecDf_long$megawatthours[elecDf_long$megawatthours == " Inc. (region)"] <- ""
stopifnot(elecDf_long$megawatthours[str_trim(elecDf_long$region) == ""] == "")

# filter out unneed "region" rows
elecDf_long <- elecDf_long %>% filter(str_trim(region) != "")

# parse day and megawatthours
elecDf_long$evalDay <- mdy(elecDf_long$evalDay)
elecDf_long$megawatthours <- as.numeric(elecDf_long$megawatthours)

Long to wide

elecDf <- elecDf_long %>% spread(category, megawatthours)
rm(elecDf_long)

# data check: confirm no megawatt hours are NA
stopifnot(nrow(elecDf %>% filter_if(is.Date, any_vars(is.na(.)))) == 0)

Suggested analyses

  • daily demand per region
  • daily net generation per region
  • daily total net actual interchange
perRegionAvg <- elecDf %>% 
  group_by(region) %>%
  summarize(demandAvg = mean(Demand),
                                     netGenAvg = mean(`Net generation`),
                                     totInterchgAvg = mean(`Total net actual interchange`))

ggplot(perRegionAvg, aes(x=region, y=totInterchgAvg)) + 
  geom_bar(stat = "identity") +
  labs(title = "Avg Interchange values", x = "Region", y = "Averge Interchange") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0))

California looks to have the greatest deficits, while the northwest region looks to have the greatest surplus.

  • Any particular days were demand was greater than production?
dayDf <- elecDf %>% 
  ungroup() %>%
  mutate(evalDayOfWeek = wday(evalDay, label = TRUE)) %>%
  group_by(region, evalDayOfWeek) %>%
  summarize(demandAvg = mean(Demand),
                                     netGenAvg = mean(`Net generation`),
                                     totInterchgAvg = mean(`Total net actual interchange`))

ggplot(dayDf, aes(x = region, y = totInterchgAvg)) + 
  geom_bar(aes(fill = evalDayOfWeek), position = "dodge", stat = "identity") +
  labs(title = "Avg Interchange values", x = "Region", y = "Averge Interchange") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0))

Net values do not seem dependent on days.

Marraige/Divorce Rates (contributed by Jiadi Li)

Read in data

mdDf <- read.csv("https://raw.githubusercontent.com/xiaoxiaogao-DD/DATA607-Project1/master/national_marriage_divorce_rates_00-16.csv", stringsAsFactors = FALSE, skip = 2, header = TRUE, colClasses = "character")

Remove blank columns

# Confirm all values are blank for last columns
stopifnot(nrow(mdDf %>% filter_if(str_detect(colnames(.), "X"), all_vars(. != ""))) == 0)

# Remove these rows
mdDf <- mdDf[, !str_detect(colnames(mdDf), "X")]

Split apart separate dataframes

marrDf <- mdDf[1:27,]
divDf <- mdDf[29:nrow(mdDf),]

Adust column names

colnames(divDf) <- divDf[1, ]

Extract footnote data into dataframe

# Save out total row numbers to confirm logic to split off footnotes was reasonable
marrRows <- nrow(marrDf)
divRows <- nrow(divDf)

marrFtDf <- marrDf %>% filter(!str_detect(Year, "^20."))
divFtDf <- divDf %>% filter(!str_detect(Year, "^20."))

marrDf <- marrDf %>% filter(str_detect(Year, "^20."))
divDf <- divDf %>% filter(str_detect(Year, "^20."))

stopifnot(marrRows == nrow(marrFtDf) + nrow(marrDf))
stopifnot(divRows == nrow(divFtDf) + nrow(divDf))

Parse footnote dataframes

marrFtDf <- marrFtDf %>%
  rename(raw = Year) %>%
  select(raw) %>%
  filter(str_trim(raw) != "") %>%
  mutate(footId = as.numeric(str_extract(raw, "^[:digit:]+")),
         footnote = str_replace(raw, "^[:digit:]+/", "")) %>%
  filter(!is.na(footId)) %>%
  select(footId, footnote)

divFtDf <- divFtDf %>%
  rename(raw = Year) %>%
  select(raw) %>%
  filter(str_trim(raw) != "") %>%
  mutate(footId = as.numeric(str_extract(raw, "^[:digit:]+")),
         footnote = str_replace(raw, "^[:digit:]+ ", "")) %>%
  filter(!is.na(footId)) %>%
  select(footId, footnote)

Parse numerics in main dataframes

# Extract footnote information from Year column
marrDf$footId <- str_extract(marrDf$Year, "/.")
marrDf$Year <- str_replace(marrDf$Year, "/.", "")
marrDf$footId <- str_replace(marrDf$footId, "/", "")

divDf$footId <- str_extract(divDf$Year, "/.")
divDf$Year <- str_replace(divDf$Year, "/.", "")
divDf$footId <- str_replace(divDf$footId, "/", "")

# remove commas
marrDf <- marrDf %>% mutate_all(funs(str_replace_all(., ",", "")))
divDf <- divDf %>% mutate_all(funs(str_replace_all(., ",", "")))

# parse all columns as numerics
marrDf <- marrDf %>% mutate_all(as.numeric)
divDf <- divDf %>% mutate_all(as.numeric)

Merge in footnotes

marrDf <- left_join(marrDf, marrFtDf, by="footId")
divDf <- left_join(divDf, divFtDf, by="footId")
rm(mdDf, marrFtDf, divFtDf)

Merge marraige and divorce dataframes

marrDf <- marrDf %>%
  rename(marriages = Marriages,
         marrPopulation = Population,
         marrRatePer1000 = Rate.per.1.000.total.population,
         marrFootnote = footnote) %>%
  select(-footId)

divDf <- divDf %>%
  rename(divPopulation = Population,
         divRatePer1000 = `Rate per 1,000 total population`,
         divFootnote = footnote,
         divAnnul = `Divorces & annulments`) %>%
  select(-footId)

finalDf <- full_join(marrDf, divDf, by = "Year") %>%
  select(Year, marriages, divAnnul, marrPopulation, divPopulation, marrRatePer1000, divRatePer1000,
         marrFootnote, divFootnote)

rm(divDf, marrDf)

Suggested Analyses

  • Is the decrease in the divorce rate due to the decrease in the marriage rate?

    As noted, there are pretty large differences in population between the two data sets… So it’s not easy to compare marriage and divorce rates per year, but we can look at the data broadly.

    Since couples may be unlikely to divorce just in the first year of marriage, maybe it would be useful to look at overlaid line graphs to see if there’s any trends that seem to occur on a delay of several years

ggplot(finalDf, aes(Year)) + 
  geom_line(aes(y = marrRatePer1000, colour = "marrRatePer1000")) + 
  geom_line(aes(y = divRatePer1000, colour = "divRatePer1000"))  +
  labs(title = "Marriage/Divorce Rates", x = "Year", y = "Rate Per 1000") +
  theme(plot.title = element_text(hjust = 0.5))

RNA_DNA (contributed by Heather Geiger)

Read in data

rawDf <- read.csv("https://raw.githubusercontent.com/heathergeiger/Data607_data_to_tidy/master/RNA_vs_DNA_example_sample_sheet.csv", stringsAsFactors = FALSE, header = FALSE, colClasses = "character")

Split dataframe vertically, bind into one data set

rawDf1 <- rawDf[, 1:6]
rawDf2 <- rawDf[, 8:13]

colnames(rawDf1) <- letters[1:length(rawDf1)]
colnames(rawDf2) <- letters[1:length(rawDf2)]

fullDf <- bind_rows(rawDf1, rawDf2)

rm(rawDf, rawDf1, rawDf2)

Extract group numbers

fullDf$groupNum <- str_extract(fullDf$a, "[^0][:digit:]+") #Not sure why [^0][:digit:]+ is returning "p068", for instance
fullDf$groupNum <- str_replace(fullDf$groupNum, "p0", "")

fullDf <- fullDf %>% 
  fill(groupNum) %>%
  filter(!str_detect(a, "Group*"))

Remove unneeded rows, adjust column names

colnames(fullDf) <- fullDf[1, ]
fullDf <- fullDf %>% rename(GroupNum = `68`)

fullDf <- fullDf %>%
  filter(!(Index == "" | Index == "Index"))

Seems worth keeping group number for now, in case we wanted to compare the data sources this references in later analysis

Additional tidying to be done:

  • Replace the “_" in columns “Sample” and “WGS ID” with a “-”.
fullDf <- fullDf %>% mutate_at(vars("Sample", "WGS ID"), funs(str_replace_all(., "_", "-")))
  • Remove any text or underscores from WGS Quote ID to get just a five-digit number. We can count on this always being 5 digits if you want to use a regex.
fullDf$`WGS Quote ID` <- str_extract(fullDf$`WGS Quote ID`, "[:digit:]+")
stopifnot(all(str_count(fullDf$`WGS Quote ID`) == 5))

fullDf$`WGS Quote ID` <- as.numeric(fullDf$`WGS Quote ID`)
  • Include an additional column that simplifies tissue type into just either “SpinalCord”,“Cerebellum”, or “Cortex”. Note that different sources may use either underscores or spaces between words in the tissue column, so best to use a regex rather than a simple match to translate these here.
fullDf$`Tissue Type` <- str_replace_all(fullDf$`Tissue Type`, "_", " ")
table(fullDf$`Tissue Type`, useNA = "ifany")
## 
##           Cerebellum       Cortex Frontal     Cortex Occipital 
##                    6                   10                    6 
## Motor Cortex Lateral  Motor Cortex Medial Spinal Cord Cervical 
##                    7                    6                    4 
##   Spinal Cord Lumbar Spinal Cord Thoracic 
##                    5                    4
  • Also would be good to make sure there is only one project and gender per unique value in “WGS ID”
# Is project number "Sample" or WGS Quote Id"?  or "GroupNum"
fullDf %>% 
  group_by(`WGS ID`, Gender, Sample) %>%
  filter(n() != 1) %>% 
  nrow()
## [1] 0
fullDf %>% 
  group_by(`WGS ID`, Gender, `WGS Quote ID`) %>%
  filter(n() != 1) %>%
  nrow()
## [1] 47

Project seems to be “Sample”, which has a unique WGS ID when grouped also by Gender

  • …and only unique tissues for each “WGS ID” (so no repeats of say “Spinal_Cord_Lumbar” for the same WGS ID).
fullDf %>% 
  group_by(`WGS ID`, `Tissue Type`) %>%
  filter(n() != 1) %>% 
  nrow()
## [1] 0

Tissue Type is unique by WGS ID