Files and data stored at: https://github.com/rweberc/Data607_Project2
Net actual interchange (electric): The algebraic sum of all metered interchange over all interconnections between two physically Adjacent Balancing Authority Areas. NERC definition https://www.eia.gov/tools/glossary/?id=electricity
elecDf <- read.csv("https://raw.githubusercontent.com/rweberc/Data607_Project2/master/electricity.csv", stringsAsFactors = FALSE, colClasses = "character", skip = 4)stopifnot(all(elecDf$megawatthours[c(FALSE, TRUE, FALSE, FALSE)] == "Demand"))
stopifnot(all(elecDf$megawatthours[c(FALSE, FALSE, TRUE, FALSE)] == "Net generation"))
stopifnot(all(elecDf$megawatthours[c(FALSE, FALSE, FALSE, TRUE)] == "Total net actual interchange"))elecDf$region <- NA
elecDf$region[c(TRUE, FALSE, FALSE, FALSE)] <- elecDf$megawatthours[c(TRUE, FALSE, FALSE, FALSE)]
elecDf <- elecDf %>% fill(region)
table(elecDf$region, useNA = "ifany")##
## California (region)
## 4
## Carolinas (region)
## 4
## Central (region)
## 4
## Electric Reliability Council of Texas
## 4
## Florida (region)
## 4
## Mid-Atlantic (region)
## 4
## Midwest (region)
## 4
## New England ISO (region)
## 4
## New York Independent System Operator (region)
## 4
## Northwest (region)
## 4
## Southeast (region)
## 4
## Southwest (region)
## 4
## Tennessee Valley Authority (region)
## 4
## United States Lower 48 (region)
## 4
elecDf$region[c(TRUE, FALSE, FALSE, FALSE)] <- ""elecDf <- elecDf %>% rename(category = megawatthours)
elecDf <- elecDf %>% select(category, region, everything())
colnames(elecDf) <- str_replace(colnames(elecDf), "X", "")
colnames(elecDf) <- str_replace_all(colnames(elecDf), "\\.", "/")
colnames(elecDf)## [1] "category" "region" "01/01/2018" "01/02/2018" "01/03/2018"
## [6] "01/04/2018" "01/05/2018" "01/06/2018" "01/07/2018" "01/08/2018"
## [11] "01/09/2018" "01/10/2018" "01/11/2018" "01/12/2018" "01/13/2018"
## [16] "01/14/2018" "01/15/2018" "01/16/2018" "01/17/2018" "01/18/2018"
## [21] "01/19/2018" "01/20/2018" "01/21/2018" "01/22/2018" "01/23/2018"
## [26] "01/24/2018" "01/25/2018" "01/26/2018" "01/27/2018" "01/28/2018"
## [31] "01/29/2018" "01/30/2018" "01/31/2018"
elecDf_long <- elecDf %>% gather("evalDay", "megawatthours", 3:length(.))# data check
elecDf_long$megawatthours[elecDf_long$megawatthours == " Inc. (region)"] <- ""
stopifnot(elecDf_long$megawatthours[str_trim(elecDf_long$region) == ""] == "")
# filter out unneed "region" rows
elecDf_long <- elecDf_long %>% filter(str_trim(region) != "")
# parse day and megawatthours
elecDf_long$evalDay <- mdy(elecDf_long$evalDay)
elecDf_long$megawatthours <- as.numeric(elecDf_long$megawatthours)elecDf <- elecDf_long %>% spread(category, megawatthours)
rm(elecDf_long)
# data check: confirm no megawatt hours are NA
stopifnot(nrow(elecDf %>% filter_if(is.Date, any_vars(is.na(.)))) == 0)perRegionAvg <- elecDf %>%
group_by(region) %>%
summarize(demandAvg = mean(Demand),
netGenAvg = mean(`Net generation`),
totInterchgAvg = mean(`Total net actual interchange`))
ggplot(perRegionAvg, aes(x=region, y=totInterchgAvg)) +
geom_bar(stat = "identity") +
labs(title = "Avg Interchange values", x = "Region", y = "Averge Interchange") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0))California looks to have the greatest deficits, while the northwest region looks to have the greatest surplus.
dayDf <- elecDf %>%
ungroup() %>%
mutate(evalDayOfWeek = wday(evalDay, label = TRUE)) %>%
group_by(region, evalDayOfWeek) %>%
summarize(demandAvg = mean(Demand),
netGenAvg = mean(`Net generation`),
totInterchgAvg = mean(`Total net actual interchange`))
ggplot(dayDf, aes(x = region, y = totInterchgAvg)) +
geom_bar(aes(fill = evalDayOfWeek), position = "dodge", stat = "identity") +
labs(title = "Avg Interchange values", x = "Region", y = "Averge Interchange") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0))Net values do not seem dependent on days.
mdDf <- read.csv("https://raw.githubusercontent.com/xiaoxiaogao-DD/DATA607-Project1/master/national_marriage_divorce_rates_00-16.csv", stringsAsFactors = FALSE, skip = 2, header = TRUE, colClasses = "character")# Confirm all values are blank for last columns
stopifnot(nrow(mdDf %>% filter_if(str_detect(colnames(.), "X"), all_vars(. != ""))) == 0)
# Remove these rows
mdDf <- mdDf[, !str_detect(colnames(mdDf), "X")]marrDf <- mdDf[1:27,]
divDf <- mdDf[29:nrow(mdDf),]colnames(divDf) <- divDf[1, ]# Save out total row numbers to confirm logic to split off footnotes was reasonable
marrRows <- nrow(marrDf)
divRows <- nrow(divDf)
marrFtDf <- marrDf %>% filter(!str_detect(Year, "^20."))
divFtDf <- divDf %>% filter(!str_detect(Year, "^20."))
marrDf <- marrDf %>% filter(str_detect(Year, "^20."))
divDf <- divDf %>% filter(str_detect(Year, "^20."))
stopifnot(marrRows == nrow(marrFtDf) + nrow(marrDf))
stopifnot(divRows == nrow(divFtDf) + nrow(divDf))marrFtDf <- marrFtDf %>%
rename(raw = Year) %>%
select(raw) %>%
filter(str_trim(raw) != "") %>%
mutate(footId = as.numeric(str_extract(raw, "^[:digit:]+")),
footnote = str_replace(raw, "^[:digit:]+/", "")) %>%
filter(!is.na(footId)) %>%
select(footId, footnote)
divFtDf <- divFtDf %>%
rename(raw = Year) %>%
select(raw) %>%
filter(str_trim(raw) != "") %>%
mutate(footId = as.numeric(str_extract(raw, "^[:digit:]+")),
footnote = str_replace(raw, "^[:digit:]+ ", "")) %>%
filter(!is.na(footId)) %>%
select(footId, footnote)# Extract footnote information from Year column
marrDf$footId <- str_extract(marrDf$Year, "/.")
marrDf$Year <- str_replace(marrDf$Year, "/.", "")
marrDf$footId <- str_replace(marrDf$footId, "/", "")
divDf$footId <- str_extract(divDf$Year, "/.")
divDf$Year <- str_replace(divDf$Year, "/.", "")
divDf$footId <- str_replace(divDf$footId, "/", "")
# remove commas
marrDf <- marrDf %>% mutate_all(funs(str_replace_all(., ",", "")))
divDf <- divDf %>% mutate_all(funs(str_replace_all(., ",", "")))
# parse all columns as numerics
marrDf <- marrDf %>% mutate_all(as.numeric)
divDf <- divDf %>% mutate_all(as.numeric)marrDf <- left_join(marrDf, marrFtDf, by="footId")
divDf <- left_join(divDf, divFtDf, by="footId")
rm(mdDf, marrFtDf, divFtDf)marrDf <- marrDf %>%
rename(marriages = Marriages,
marrPopulation = Population,
marrRatePer1000 = Rate.per.1.000.total.population,
marrFootnote = footnote) %>%
select(-footId)
divDf <- divDf %>%
rename(divPopulation = Population,
divRatePer1000 = `Rate per 1,000 total population`,
divFootnote = footnote,
divAnnul = `Divorces & annulments`) %>%
select(-footId)
finalDf <- full_join(marrDf, divDf, by = "Year") %>%
select(Year, marriages, divAnnul, marrPopulation, divPopulation, marrRatePer1000, divRatePer1000,
marrFootnote, divFootnote)
rm(divDf, marrDf)Is the decrease in the divorce rate due to the decrease in the marriage rate?
As noted, there are pretty large differences in population between the two data sets… So it’s not easy to compare marriage and divorce rates per year, but we can look at the data broadly.
Since couples may be unlikely to divorce just in the first year of marriage, maybe it would be useful to look at overlaid line graphs to see if there’s any trends that seem to occur on a delay of several years
ggplot(finalDf, aes(Year)) +
geom_line(aes(y = marrRatePer1000, colour = "marrRatePer1000")) +
geom_line(aes(y = divRatePer1000, colour = "divRatePer1000")) +
labs(title = "Marriage/Divorce Rates", x = "Year", y = "Rate Per 1000") +
theme(plot.title = element_text(hjust = 0.5))rawDf <- read.csv("https://raw.githubusercontent.com/heathergeiger/Data607_data_to_tidy/master/RNA_vs_DNA_example_sample_sheet.csv", stringsAsFactors = FALSE, header = FALSE, colClasses = "character")rawDf1 <- rawDf[, 1:6]
rawDf2 <- rawDf[, 8:13]
colnames(rawDf1) <- letters[1:length(rawDf1)]
colnames(rawDf2) <- letters[1:length(rawDf2)]
fullDf <- bind_rows(rawDf1, rawDf2)
rm(rawDf, rawDf1, rawDf2)fullDf$groupNum <- str_extract(fullDf$a, "[^0][:digit:]+") #Not sure why [^0][:digit:]+ is returning "p068", for instance
fullDf$groupNum <- str_replace(fullDf$groupNum, "p0", "")
fullDf <- fullDf %>%
fill(groupNum) %>%
filter(!str_detect(a, "Group*"))colnames(fullDf) <- fullDf[1, ]
fullDf <- fullDf %>% rename(GroupNum = `68`)
fullDf <- fullDf %>%
filter(!(Index == "" | Index == "Index"))Seems worth keeping group number for now, in case we wanted to compare the data sources this references in later analysis
fullDf <- fullDf %>% mutate_at(vars("Sample", "WGS ID"), funs(str_replace_all(., "_", "-")))fullDf$`WGS Quote ID` <- str_extract(fullDf$`WGS Quote ID`, "[:digit:]+")
stopifnot(all(str_count(fullDf$`WGS Quote ID`) == 5))
fullDf$`WGS Quote ID` <- as.numeric(fullDf$`WGS Quote ID`)fullDf$`Tissue Type` <- str_replace_all(fullDf$`Tissue Type`, "_", " ")
table(fullDf$`Tissue Type`, useNA = "ifany")##
## Cerebellum Cortex Frontal Cortex Occipital
## 6 10 6
## Motor Cortex Lateral Motor Cortex Medial Spinal Cord Cervical
## 7 6 4
## Spinal Cord Lumbar Spinal Cord Thoracic
## 5 4
# Is project number "Sample" or WGS Quote Id"? or "GroupNum"
fullDf %>%
group_by(`WGS ID`, Gender, Sample) %>%
filter(n() != 1) %>%
nrow()## [1] 0
fullDf %>%
group_by(`WGS ID`, Gender, `WGS Quote ID`) %>%
filter(n() != 1) %>%
nrow()## [1] 47
Project seems to be “Sample”, which has a unique WGS ID when grouped also by Gender
fullDf %>%
group_by(`WGS ID`, `Tissue Type`) %>%
filter(n() != 1) %>%
nrow()## [1] 0
Tissue Type is unique by WGS ID