This project continues to practice the tasks of data transformation and data wrangling.
This is done by taking three “untidy” datasets and using R to “tidy” the data. Specifically this project will focus on using the R packages of tidyr and dplyr.
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
repeat.labels <- function(vec)
{
tmp <-""
for(i in 1:length(vec))
{
if (vec[i] != "") {tmp <- vec[i]}
if (vec[i] == "" & i != 1) {vec[i] <- tmp}
if (vec[i] == "" & i == 1) {vec[i] <- ""}
next
}
vec
}
Load the dataset into R
This dataset is split across two tables.
fileLocation_1 <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/04%20dataset1%20-%20Accidental%20Officer%20Deaths%20-%20modified.csv'
dataset1a <- read.table(file = fileLocation_1, header = TRUE, sep = ',', stringsAsFactors = FALSE)
#get the second table for this dataset
fileLocation_2 <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/04%20dataset1%20-%20Felony%20Officer%20Deaths%20-%20modified.csv'
dataset1b <- read.table(file = fileLocation_2, header = TRUE, sep = ',', stringsAsFactors = FALSE)
Check the names/structure
names(dataset1a)
## [1] "Area" "Total" "X2008" "X2009" "X2010" "X2011" "X2012" "X2013"
## [9] "X2014" "X2015" "X2016" "X2017"
names(dataset1b)
## [1] "Area" "Total" "X2008" "X2009" "X2010" "X2011" "X2012" "X2013"
## [9] "X2014" "X2015" "X2016" "X2017"
For each table, add a column to allow records to be identified after they are unioned (i.e., appended/stacked)
dataset1a <- dataset1a %>% mutate(Death_Type = "Felony")
dataset1b <- dataset1b %>% mutate(Death_Type = "Accidental")
For each table, seperate the values for region, sub-region, state/territory into three distinct columns
dataset1a <- dataset1a %>%
mutate(Region = ifelse(Area %in% c("NORTHEAST"
, "MIDWEST"
, "SOUTH"
, "WEST"
, "PUERTO RICO AND OTHER OUTLYING AREAS"), Area, "")) %>%
mutate(Sub_Region = ifelse(Area %in% c("New England"
, "Middle Atlantic"
, "East North Central"
, "West North Central"
, "South Atlantic"
, "East South Central"
, "West South Central"
, "Mountain"
, "Pacific"), Area, "")) %>%
mutate(Area = ifelse(Region != "", "Region Total",
ifelse(Sub_Region != "", "Sub_Region Total", Area))) %>%
mutate(Sub_Region = repeat.labels(Sub_Region)) %>%
mutate(Sub_Region = ifelse(Region != "", "Region Total", Sub_Region)) %>%
mutate(Region = repeat.labels(Region)) %>%
mutate(Region = ifelse(Region == "", "Grand Total", Region)) %>%
mutate(Sub_Region = ifelse(Sub_Region == "", "Grand Total", Sub_Region)) %>%
mutate(Area = ifelse(Area == "Number of victim officers", "Grand Total", Area)) %>%
rename(State_Territory = "Area")
dataset1b <- dataset1b %>%
mutate(Region = ifelse(Area %in% c("NORTHEAST"
, "MIDWEST"
, "SOUTH"
, "WEST"
, "PUERTO RICO AND OTHER OUTLYING AREAS"), Area, "")) %>%
mutate(Sub_Region = ifelse(Area %in% c("New England"
, "Middle Atlantic"
, "East North Central"
, "West North Central"
, "South Atlantic"
, "East South Central"
, "West South Central"
, "Mountain"
, "Pacific"), Area, "")) %>%
mutate(Area = ifelse(Region != "", "Region Total",
ifelse(Sub_Region != "", "Sub_Region Total", Area))) %>%
mutate(Sub_Region = repeat.labels(Sub_Region)) %>%
mutate(Sub_Region = ifelse(Region != "", "Region Total", Sub_Region)) %>%
mutate(Region = repeat.labels(Region)) %>%
mutate(Region = ifelse(Region == "", "Grand Total", Region)) %>%
mutate(Sub_Region = ifelse(Sub_Region == "", "Grand Total", Sub_Region)) %>%
mutate(Area = ifelse(Area == "Number of victim officers", "Grand Total", Area)) %>%
rename(State_Territory = "Area")
Union (append/stack) the tables and then check counts for reasonability
dataset1 <- union(dataset1a, dataset1b)
dataset1 %>%
group_by(Death_Type) %>%
summarise(n=n())
## # A tibble: 2 x 2
## Death_Type n
## <chr> <int>
## 1 Accidental 71
## 2 Felony 71
As requested, all subtotals are removed
dataset1 <- dataset1 %>%
#drop total rows
filter(Region != "Grand Total"
& !Sub_Region %in% c("Grand Total", "Region Total")
& !State_Territory %in% c("Grand Total", "Sub_Region Total")) %>%
#drop total column and re-order columns
select(Death_Type
,Region
, Sub_Region
, State_Territory
, X2008
, X2009
, X2010
, X2011
, X2012
, X2013
, X2014
, X2015
, X2016
, X2017
) %>%
arrange(Death_Type, Region, Sub_Region, State_Territory)
Normalize (gather) the data so that years move from being seperate columns to being rows
names(dataset1) = str_replace_all(names(dataset1), "X", "")
dataset1_tidy <- gather(dataset1, "Year", "N_Deaths", 5:14)
Denormalize (spread) the data so that each value of Death_Type becomes its own column.
Also add a total column (across death types)
dataset1_tidy <- spread(dataset1_tidy, Death_Type, N_Deaths)
dataset1_tidy <- dataset1_tidy %>%
mutate(Total = Felony + Accidental)
Filter to a single state to verify transformations were succuessful
dataset1_tidy %>%
filter(State_Territory == "Michigan") %>%
arrange(Region, Sub_Region, State_Territory)
## Region Sub_Region State_Territory Year Accidental Felony Total
## 1 MIDWEST East North Central Michigan 2008 1 0 1
## 2 MIDWEST East North Central Michigan 2009 0 2 2
## 3 MIDWEST East North Central Michigan 2010 3 0 3
## 4 MIDWEST East North Central Michigan 2011 4 1 5
## 5 MIDWEST East North Central Michigan 2012 1 0 1
## 6 MIDWEST East North Central Michigan 2013 1 1 2
## 7 MIDWEST East North Central Michigan 2014 0 1 1
## 8 MIDWEST East North Central Michigan 2015 0 2 2
## 9 MIDWEST East North Central Michigan 2016 2 2 4
## 10 MIDWEST East North Central Michigan 2017 2 2 4
Calculate some basic statistics
dataset1_summary <- dataset1_tidy %>%
group_by(Year, Region) %>%
summarise(nRecords = n()
,nTotalDeaths = sum(Total)
,nAccidentalDeaths = sum(Accidental)
,nFelonyDeaths = sum(Felony))
Generate a scatterplot for visual exploration
g <- ggplot(dataset1_summary, aes(x= nAccidentalDeaths, y = nFelonyDeaths))
g <- g + geom_point(aes(color = Region))
g <- g + facet_wrap(~Year)
g
Load the dataset into R
fileLocation <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/06%20dataset2%20-%20Race%20Counts%20-%20modified.csv'
dataset2 <- read.table(file = fileLocation, header = TRUE, sep = ',', stringsAsFactors = FALSE)
Check the names and structure
names(dataset2)
## [1] "Race" "Hispanic.or.Latino"
## [3] "X..of.H.L" "X..of.US"
## [5] "Not.Hispanic.or.Latino" "X..of.Not.H.L"
## [7] "X..of.US.1"
head(dataset2)
## Race Hispanic.or.Latino X..of.H.L X..of.US
## 1 All races 35305818 100.0 12.5
## 2 One race 33081736 93.7 11.8
## 3 White 16907852 47.9 6
## 4 Black or African A. 710353 2.0 0.3
## 5 A. Indian/Alaska Nat. 407073 1.2 0.1
## 6 Asian 119829 0.3 <0.1
## Not.Hispanic.or.Latino X..of.Not.H.L X..of.US.1
## 1 246116088 100.0 87.5
## 2 241513942 98.1 85.8
## 3 194552774 79.1 69.1
## 4 33947837 13.8 12.1
## 5 2068883 0.8 0.7
## 6 10123169 4.1 3.6
Begin tidying up the dataset.
dataset2 <- dataset2 %>%
mutate(Count_of_Races = ifelse(Race %in% c("All races", "One race", "2+ races"), Race, "")) %>%
mutate(Race = ifelse(Count_of_Races != "", "All races", Race)) %>%
mutate(Count_of_Races = repeat.labels(Count_of_Races)) %>%
mutate(Count_of_Races = str_replace_all(Count_of_Races, "2", "Two")) %>%
mutate(Race = str_replace_all(Race, "2", "Two")) %>%
rename(H_L = "Hispanic.or.Latino"
,Pct_of_H_L = "X..of.H.L"
,Pct_H_L_of_US = "X..of.US"
,Not_H_L = "Not.Hispanic.or.Latino"
,Pct_of_Not_H_L = "X..of.Not.H.L"
,Pct_Not_H_L_of_US = "X..of.US.1"
)
Add columns for an “All Ethnicities” block of columns
dataset2 <- dataset2 %>%
mutate(RaceTotal = H_L + Not_H_L) %>%
mutate(RacePct_ofEth = 100)
#get US grand total to calculate RaceTotal Pct of US
us_gt <-as.numeric(dataset2 %>%
filter(Count_of_Races == "All races") %>%
select(RaceTotal))
dataset2 <- dataset2 %>%
mutate(RacePct_ofUS = round(RaceTotal/us_gt*100, 2))
Seperate the three block of columns to seperate dataframes.
Then union (append/stack) those dataframes together.
#get just Hispanic records
eth_Hispanic <- dataset2 %>%
select(Count_of_Races ,Race ,H_L ,Pct_of_H_L ,Pct_H_L_of_US) %>%
mutate (Ethnicity = "Hispanic or Latino") %>%
rename(N_Race_Ethnicity = "H_L"
, Pct_Ethnicity_Total = "Pct_of_H_L"
, Pct_US_Total = "Pct_H_L_of_US") %>%
mutate(Pct_US_Total = as.character(Pct_US_Total))
#get just not-hispanic records
eth_Not_Hispanic <- dataset2 %>%
select(Count_of_Races ,Race ,Not_H_L ,Pct_of_Not_H_L ,Pct_Not_H_L_of_US) %>%
mutate (Ethnicity = "Not Hispanic or Latino") %>%
rename(N_Race_Ethnicity = "Not_H_L"
, Pct_Ethnicity_Total = "Pct_of_Not_H_L"
, Pct_US_Total = "Pct_Not_H_L_of_US") %>%
mutate(Pct_US_Total = as.character(Pct_US_Total))
#get just "all ethnicities" records
eth_Total <- dataset2 %>%
select(Count_of_Races ,Race ,RaceTotal ,RacePct_ofEth ,RacePct_ofUS) %>%
mutate (Ethnicity = "All Ethnicities") %>%
rename(N_Race_Ethnicity = "RaceTotal"
, Pct_Ethnicity_Total = "RacePct_ofEth"
, Pct_US_Total = "RacePct_ofUS") %>%
mutate(Pct_US_Total = as.character(Pct_US_Total))
# union (i.e., append/stack) the three datasets together
dataset2_tidy <- union(eth_Total, eth_Hispanic)
dataset2_tidy <- union(dataset2_tidy, eth_Not_Hispanic)
Check the count of records for each block (Ethnicity)
dataset2_tidy %>%
group_by(Ethnicity) %>%
summarise(n = n())
## # A tibble: 3 x 2
## Ethnicity n
## <chr> <int>
## 1 All Ethnicities 11
## 2 Hispanic or Latino 11
## 3 Not Hispanic or Latino 11
In some records the Pct_US_Total is shown as “<0.1” rather than as a true percent. The following uses the available data to recalculate the Pct_US_Total and rounds to two decimals to avoid this problem.
The same is done for Pct_Ethnicity_Total so the two variables will have the same precision.
#US grand total is already stored in us_gt
#get grand totals for each Ethnicity
eth_totals <- dataset2_tidy %>%
filter(Count_of_Races == "All races") %>%
rename(eth_Total = "N_Race_Ethnicity") %>%
select(Ethnicity, eth_Total)
#join ethnicity grand totals into dataset for calculations
dataset2_tidy <- left_join(dataset2_tidy, eth_totals, by = "Ethnicity")
#recalculate and stored pct columns
dataset2_tidy <- dataset2_tidy %>%
mutate(Pct_Ethnicity_Total = round(N_Race_Ethnicity/eth_Total*100, 2)) %>%
mutate(Pct_US_Total = round(N_Race_Ethnicity/us_gt*100, 2)) %>%
select(Ethnicity
, Count_of_Races
, Race
, N_Race_Ethnicity
, Pct_Ethnicity_Total
, Pct_US_Total) %>%
arrange(Ethnicity
, Count_of_Races
, Race)
dataset2_tidy %>%
filter(Ethnicity == "All Ethnicities")
## Ethnicity Count_of_Races Race
## 1 All Ethnicities All races All races
## 2 All Ethnicities One race A. Indian/Alaska Nat.
## 3 All Ethnicities One race All races
## 4 All Ethnicities One race Asian
## 5 All Ethnicities One race Black or African A.
## 6 All Ethnicities One race Hawaiian N. & Pacific Is.
## 7 All Ethnicities One race Some other
## 8 All Ethnicities One race White
## 9 All Ethnicities Two+ races All races
## 10 All Ethnicities Two+ races Some other + W/B/N/A
## 11 All Ethnicities Two+ races Two+ W/B/N/A
## N_Race_Ethnicity Pct_Ethnicity_Total Pct_US_Total
## 1 281421906 100.00 100.00
## 2 2475956 0.88 0.88
## 3 274595678 97.57 97.57
## 4 10242998 3.64 3.64
## 5 34658190 12.32 12.32
## 6 398835 0.14 0.14
## 7 15359073 5.46 5.46
## 8 211460626 75.14 75.14
## 9 6826228 2.43 2.43
## 10 3162413 1.12 1.12
## 11 3663815 1.30 1.30
Load the data into R
fileLocation <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/07%20dataset3%20-%20Poverty%20Counts.csv'
dataset3 <- read.table(file = fileLocation, header = TRUE, sep = ',')
Review the column names
names(dataset3)
## [1] "Characteristic"
## [2] "X2016.Total"
## [3] "X2016.Below.Poverty.Number"
## [4] "X2016.Below.Poverty.Number.Margin.of.Error"
## [5] "X2016.Below.Poverty.Percent"
## [6] "X2016.Below.Poverty.Percent.Margin.of.Error"
## [7] "X2017.Total"
## [8] "X2017.Below.Poverty.Number"
## [9] "X2017.Below.Poverty.Number.Margin.of.Error"
## [10] "X2017.Below.Poverty.Percent"
## [11] "X2017.Below.Poverty.Percent.Margin.of.Error"
## [12] "Change.in.Poverty.Number"
## [13] "Change.in.Poverty.Percent"
The column names are particularly long and cumbersome. Adjust column names to be more workable.
names(dataset3) <- str_replace_all(names(dataset3), "\\.", "_")
names(dataset3) <- str_replace_all(names(dataset3), "Poverty", "Pov")
names(dataset3) <- str_replace_all(names(dataset3), "Number", "N")
names(dataset3) <- str_replace_all(names(dataset3), "Percent", "Pct")
names(dataset3) <- str_replace_all(names(dataset3), "_Below", "")
names(dataset3) <- str_replace_all(names(dataset3), "_Margin_of_Error", "_MOE")
names(dataset3) <- str_replace_all(names(dataset3), "Total", "Total_N")
names(dataset3)
## [1] "Characteristic" "X2016_Total_N" "X2016_Pov_N"
## [4] "X2016_Pov_N_MOE" "X2016_Pov_Pct" "X2016_Pov_Pct_MOE"
## [7] "X2017_Total_N" "X2017_Pov_N" "X2017_Pov_N_MOE"
## [10] "X2017_Pov_Pct" "X2017_Pov_Pct_MOE" "Change_in_Pov_N"
## [13] "Change_in_Pov_Pct"
Add a Characterist_Type variable to group the types of characteristics
dataset3 <- dataset3 %>%
mutate(Characteristic = str_trim(Characteristic)) %>%
mutate(Characteristic_Type = ifelse(is.na(X2016_Total_N), Characteristic, "")) %>%
mutate(Characteristic_Type = repeat.labels(Characteristic_Type)) %>%
mutate(Characteristic_Type = str_replace_all(Characteristic_Type, " and Hispanic Origin", "/Ethnicity")) %>%
mutate(Characteristic_Type = str_replace_all(Characteristic_Type, "\\d", "")) %>%
filter(X2016_Total_N != "" #remove blank rows
& Characteristic != "Total" #remove total row
& Characteristic != "White, not Hispanic") #remove sub-element row
head(dataset3 %>% select(Characteristic_Type, Characteristic), n = 15)
## Characteristic_Type Characteristic
## 1 Race/Ethnicity White
## 2 Race/Ethnicity Black
## 3 Race/Ethnicity Asian
## 4 Race/Ethnicity Hispanic (any race)
## 5 Sex Male
## 6 Sex Female
## 7 Age Under age 18
## 8 Age Aged 18 to 64
## 9 Age Aged 65 and older
## 10 Nativity Native born
## 11 Nativity Foreign born
## 12 Nativity Naturalized citizen
## 13 Nativity Not a citizen
## 14 Region Northeast
## 15 Region Midwest
Seperate year-block of columns to its own dataframe and then union together.
#get just 2016 block of records
y_2016 <- dataset3 %>%
mutate(X2016_Year = 2016) %>%
select( X2016_Year
, Characteristic_Type
, Characteristic
, X2016_Total_N
, X2016_Pov_N
, X2016_Pov_N_MOE
, X2016_Pov_Pct
, X2016_Pov_Pct_MOE)
names(y_2016) <- str_replace_all(names(y_2016), "X2016_", "")
#get just 2017 block of records
y_2017 <- dataset3 %>%
mutate(X2017_Year = 2017) %>%
select( X2017_Year
, Characteristic_Type
, Characteristic
, X2017_Total_N
, X2017_Pov_N
, X2017_Pov_N_MOE
, X2017_Pov_Pct
, X2017_Pov_Pct_MOE)
names(y_2017) <- str_replace_all(names(y_2017), "X2017_", "")
#union (append/stack) the years back together
dataset3_tidy <- union(y_2016, y_2017)
The orginal table was showing counts in thousands. The following recalculates to the actual values.
dataset3_tidy <- dataset3_tidy %>%
mutate(Total_N = Total_N *1000, Pov_N = Pov_N*1000)
Review a small sample to verify the transformations were successful.
dataset3_tidy %>%
filter(Characteristic_Type %in% c("Race/Ethnicity", "Sex")) %>%
select(Year ,Characteristic_Type ,Characteristic ,Total_N ,Pov_N, Pov_Pct) %>%
arrange (Year ,Characteristic_Type ,Characteristic)
## Year Characteristic_Type Characteristic Total_N Pov_N Pov_Pct
## 1 2016 Race/Ethnicity Asian 18879000 1908000 10.1
## 2 2016 Race/Ethnicity Black 41962000 9234000 22.0
## 3 2016 Race/Ethnicity Hispanic (any race) 57556000 11137000 19.4
## 4 2016 Race/Ethnicity White 245985000 27113000 11.0
## 5 2016 Sex Female 163234000 22931000 14.0
## 6 2016 Sex Male 156677000 17685000 11.3
## 7 2017 Race/Ethnicity Asian 19475000 1953000 10.0
## 8 2017 Race/Ethnicity Black 42474000 8993000 21.2
## 9 2017 Race/Ethnicity Hispanic (any race) 59053000 10790000 18.3
## 10 2017 Race/Ethnicity White 247272000 26436000 10.7
## 11 2017 Sex Female 164433000 22333000 13.6
## 12 2017 Sex Male 158116000 17365000 11.0