Week6 Project2 Tidy Data

Overview

This project continues to practice the tasks of data transformation and data wrangling.

This is done by taking three “untidy” datasets and using R to “tidy” the data. Specifically this project will focus on using the R packages of tidyr and dplyr.

Load necessary packages and define supporting functions

library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)

repeat.labels <- function(vec)
{
  tmp <-""
  for(i in 1:length(vec))
  {
    if (vec[i] != "") {tmp <- vec[i]}
    if (vec[i] == "" & i != 1) {vec[i] <- tmp}
    if (vec[i] == "" & i == 1) {vec[i] <- ""}
    next
  }
  vec
}

Dataset1: Officer Deaths

Load the dataset into R

This dataset is split across two tables.

fileLocation_1 <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/04%20dataset1%20-%20Accidental%20Officer%20Deaths%20-%20modified.csv'

dataset1a <- read.table(file = fileLocation_1, header = TRUE, sep = ',', stringsAsFactors =  FALSE)



#get the second table for this dataset
fileLocation_2 <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/04%20dataset1%20-%20Felony%20Officer%20Deaths%20-%20modified.csv'

dataset1b <- read.table(file = fileLocation_2, header = TRUE, sep = ',', stringsAsFactors =  FALSE)

Check the names/structure

names(dataset1a)

##  [1] "Area"  "Total" "X2008" "X2009" "X2010" "X2011" "X2012" "X2013"
##  [9] "X2014" "X2015" "X2016" "X2017"

names(dataset1b)

##  [1] "Area"  "Total" "X2008" "X2009" "X2010" "X2011" "X2012" "X2013"
##  [9] "X2014" "X2015" "X2016" "X2017"

For each table, add a column to allow records to be identified after they are unioned (i.e., appended/stacked)

dataset1a <- dataset1a %>% mutate(Death_Type = "Felony")
dataset1b <- dataset1b %>% mutate(Death_Type = "Accidental")

For each table, seperate the values for region, sub-region, state/territory into three distinct columns

dataset1a <- dataset1a %>% 
  mutate(Region = ifelse(Area %in% c("NORTHEAST"
                                       , "MIDWEST"
                                       , "SOUTH"
                                       , "WEST"
                                       , "PUERTO RICO AND OTHER OUTLYING AREAS"), Area, "")) %>%
  mutate(Sub_Region = ifelse(Area %in% c("New England"
                                         , "Middle Atlantic"
                                         , "East North Central"
                                         , "West North Central"
                                         , "South Atlantic"
                                         , "East South Central"
                                         , "West South Central"
                                         , "Mountain"
                                         , "Pacific"), Area, "")) %>%
  mutate(Area = ifelse(Region != "", "Region Total", 
                       ifelse(Sub_Region != "", "Sub_Region Total", Area))) %>%
  mutate(Sub_Region = repeat.labels(Sub_Region)) %>%
  mutate(Sub_Region = ifelse(Region != "", "Region Total", Sub_Region)) %>%
  mutate(Region = repeat.labels(Region)) %>%
  mutate(Region = ifelse(Region == "", "Grand Total", Region)) %>%
  mutate(Sub_Region = ifelse(Sub_Region == "", "Grand Total", Sub_Region)) %>%
  mutate(Area = ifelse(Area == "Number of victim officers", "Grand Total", Area)) %>%
  rename(State_Territory = "Area")
  
dataset1b <- dataset1b %>% 
  mutate(Region = ifelse(Area %in% c("NORTHEAST"
                                       , "MIDWEST"
                                       , "SOUTH"
                                       , "WEST"
                                       , "PUERTO RICO AND OTHER OUTLYING AREAS"), Area, "")) %>%
  mutate(Sub_Region = ifelse(Area %in% c("New England"
                                         , "Middle Atlantic"
                                         , "East North Central"
                                         , "West North Central"
                                         , "South Atlantic"
                                         , "East South Central"
                                         , "West South Central"
                                         , "Mountain"
                                         , "Pacific"), Area, "")) %>%
  mutate(Area = ifelse(Region != "", "Region Total", 
                       ifelse(Sub_Region != "", "Sub_Region Total", Area))) %>%
  mutate(Sub_Region = repeat.labels(Sub_Region)) %>%
  mutate(Sub_Region = ifelse(Region != "", "Region Total", Sub_Region)) %>%
  mutate(Region = repeat.labels(Region)) %>%
  mutate(Region = ifelse(Region == "", "Grand Total", Region)) %>%
  mutate(Sub_Region = ifelse(Sub_Region == "", "Grand Total", Sub_Region)) %>%
  mutate(Area = ifelse(Area == "Number of victim officers", "Grand Total", Area)) %>%
  rename(State_Territory = "Area")

Union (append/stack) the tables and then check counts for reasonability

dataset1 <- union(dataset1a, dataset1b)

dataset1 %>%
  group_by(Death_Type) %>%
  summarise(n=n())

## # A tibble: 2 x 2
##   Death_Type     n
##   <chr>      <int>
## 1 Accidental    71
## 2 Felony        71

As requested, all subtotals are removed

dataset1 <- dataset1 %>%
  #drop total rows
  filter(Region != "Grand Total" 
         & !Sub_Region %in% c("Grand Total", "Region Total")
         & !State_Territory %in% c("Grand Total", "Sub_Region Total")) %>%
  #drop total column and re-order columns
  select(Death_Type
         ,Region
         , Sub_Region
         , State_Territory 
         , X2008
         , X2009
         , X2010
         , X2011
         , X2012
         , X2013
         , X2014
         , X2015
         , X2016
         , X2017
         ) %>%
  arrange(Death_Type, Region, Sub_Region, State_Territory)

Normalize (gather) the data so that years move from being seperate columns to being rows

names(dataset1) = str_replace_all(names(dataset1), "X", "")
dataset1_tidy <- gather(dataset1, "Year", "N_Deaths", 5:14)

Denormalize (spread) the data so that each value of Death_Type becomes its own column.
Also add a total column (across death types)

dataset1_tidy <- spread(dataset1_tidy, Death_Type, N_Deaths)

dataset1_tidy <- dataset1_tidy %>%
  mutate(Total = Felony + Accidental)

Filter to a single state to verify transformations were succuessful

dataset1_tidy %>%
  filter(State_Territory == "Michigan") %>%
  arrange(Region, Sub_Region, State_Territory)

##     Region         Sub_Region State_Territory Year Accidental Felony Total
## 1  MIDWEST East North Central        Michigan 2008          1      0     1
## 2  MIDWEST East North Central        Michigan 2009          0      2     2
## 3  MIDWEST East North Central        Michigan 2010          3      0     3
## 4  MIDWEST East North Central        Michigan 2011          4      1     5
## 5  MIDWEST East North Central        Michigan 2012          1      0     1
## 6  MIDWEST East North Central        Michigan 2013          1      1     2
## 7  MIDWEST East North Central        Michigan 2014          0      1     1
## 8  MIDWEST East North Central        Michigan 2015          0      2     2
## 9  MIDWEST East North Central        Michigan 2016          2      2     4
## 10 MIDWEST East North Central        Michigan 2017          2      2     4

Calculate some basic statistics

dataset1_summary <- dataset1_tidy %>%
  group_by(Year, Region) %>%
  summarise(nRecords = n()
            ,nTotalDeaths = sum(Total)
            ,nAccidentalDeaths = sum(Accidental)
            ,nFelonyDeaths = sum(Felony))

Generate a scatterplot for visual exploration

g <- ggplot(dataset1_summary, aes(x= nAccidentalDeaths, y = nFelonyDeaths)) 
g <- g + geom_point(aes(color = Region))
g <- g + facet_wrap(~Year)
g

Dataset2: US Race and Ethnicity Counts

Load the dataset into R

fileLocation <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/06%20dataset2%20-%20Race%20Counts%20-%20modified.csv'

dataset2 <- read.table(file = fileLocation, header = TRUE, sep = ',', stringsAsFactors =  FALSE)

Check the names and structure

names(dataset2)

## [1] "Race"                   "Hispanic.or.Latino"    
## [3] "X..of.H.L"              "X..of.US"              
## [5] "Not.Hispanic.or.Latino" "X..of.Not.H.L"         
## [7] "X..of.US.1"

head(dataset2)

##                    Race Hispanic.or.Latino X..of.H.L X..of.US
## 1             All races           35305818     100.0     12.5
## 2              One race           33081736      93.7     11.8
## 3                 White           16907852      47.9        6
## 4   Black or African A.             710353       2.0      0.3
## 5 A. Indian/Alaska Nat.             407073       1.2      0.1
## 6                 Asian             119829       0.3     <0.1
##   Not.Hispanic.or.Latino X..of.Not.H.L X..of.US.1
## 1              246116088         100.0       87.5
## 2              241513942          98.1       85.8
## 3              194552774          79.1       69.1
## 4               33947837          13.8       12.1
## 5                2068883           0.8        0.7
## 6               10123169           4.1        3.6

Begin tidying up the dataset.

Add a new column for “Count of Races” to hold current subtotals
Add change the “Race” value for these subtotal rows
Make sure row labels are repeated for each row
Replace the character “2” with the word “Two” for consistency
Rename columns for easier reference

dataset2 <- dataset2 %>%
  mutate(Count_of_Races = ifelse(Race %in% c("All races", "One race", "2+ races"), Race, "")) %>%
  mutate(Race = ifelse(Count_of_Races != "", "All races", Race)) %>%
  mutate(Count_of_Races = repeat.labels(Count_of_Races)) %>%
  mutate(Count_of_Races = str_replace_all(Count_of_Races, "2", "Two")) %>%
  mutate(Race = str_replace_all(Race, "2", "Two")) %>%
  rename(H_L = "Hispanic.or.Latino"
         ,Pct_of_H_L = "X..of.H.L"
         ,Pct_H_L_of_US = "X..of.US"
         ,Not_H_L = "Not.Hispanic.or.Latino"
         ,Pct_of_Not_H_L = "X..of.Not.H.L"
         ,Pct_Not_H_L_of_US = "X..of.US.1"
         )

Add columns for an “All Ethnicities” block of columns

Add column for race to be totaled across ethnicities
Add column for race pct of ethnicity
Get US grand total
Calculate the Pct of US total for each race total

dataset2 <- dataset2 %>%
  mutate(RaceTotal = H_L + Not_H_L) %>%
  mutate(RacePct_ofEth = 100)

#get US grand total to calculate RaceTotal Pct of US
us_gt <-as.numeric(dataset2 %>%
  filter(Count_of_Races == "All races") %>%
  select(RaceTotal))

dataset2 <- dataset2 %>%
  mutate(RacePct_ofUS = round(RaceTotal/us_gt*100, 2))

Seperate the three block of columns to seperate dataframes.
Then union (append/stack) those dataframes together.

#get just Hispanic records
eth_Hispanic <- dataset2 %>%
  select(Count_of_Races ,Race ,H_L ,Pct_of_H_L ,Pct_H_L_of_US) %>%
  mutate (Ethnicity = "Hispanic or Latino") %>%
  rename(N_Race_Ethnicity = "H_L"
         , Pct_Ethnicity_Total = "Pct_of_H_L"
         , Pct_US_Total = "Pct_H_L_of_US") %>%
  mutate(Pct_US_Total = as.character(Pct_US_Total))

#get just not-hispanic records
eth_Not_Hispanic <- dataset2 %>%
  select(Count_of_Races ,Race ,Not_H_L ,Pct_of_Not_H_L ,Pct_Not_H_L_of_US) %>%
  mutate (Ethnicity = "Not Hispanic or Latino") %>%
  rename(N_Race_Ethnicity = "Not_H_L"
         , Pct_Ethnicity_Total = "Pct_of_Not_H_L"
         , Pct_US_Total = "Pct_Not_H_L_of_US") %>%
  mutate(Pct_US_Total = as.character(Pct_US_Total))

#get just "all ethnicities" records
eth_Total <- dataset2 %>%
  select(Count_of_Races ,Race ,RaceTotal ,RacePct_ofEth ,RacePct_ofUS) %>%
  mutate (Ethnicity = "All Ethnicities") %>%
  rename(N_Race_Ethnicity = "RaceTotal"
         , Pct_Ethnicity_Total = "RacePct_ofEth"
         , Pct_US_Total = "RacePct_ofUS") %>%
  mutate(Pct_US_Total = as.character(Pct_US_Total))

# union (i.e., append/stack) the three datasets together
dataset2_tidy <- union(eth_Total, eth_Hispanic)
dataset2_tidy <- union(dataset2_tidy, eth_Not_Hispanic)

Check the count of records for each block (Ethnicity)

dataset2_tidy %>%
  group_by(Ethnicity) %>%
  summarise(n = n())

## # A tibble: 3 x 2
##   Ethnicity                  n
##   <chr>                  <int>
## 1 All Ethnicities           11
## 2 Hispanic or Latino        11
## 3 Not Hispanic or Latino    11

In some records the Pct_US_Total is shown as “<0.1” rather than as a true percent. The following uses the available data to recalculate the Pct_US_Total and rounds to two decimals to avoid this problem.

The same is done for Pct_Ethnicity_Total so the two variables will have the same precision.

#US grand total is already stored in us_gt

#get grand totals for each Ethnicity
eth_totals <- dataset2_tidy %>%
  filter(Count_of_Races == "All races") %>%
  rename(eth_Total = "N_Race_Ethnicity") %>%
  select(Ethnicity, eth_Total)

#join ethnicity grand totals into dataset for calculations
dataset2_tidy <- left_join(dataset2_tidy, eth_totals, by = "Ethnicity")

#recalculate and stored pct columns
dataset2_tidy <- dataset2_tidy %>%
  mutate(Pct_Ethnicity_Total = round(N_Race_Ethnicity/eth_Total*100, 2)) %>%
  mutate(Pct_US_Total = round(N_Race_Ethnicity/us_gt*100, 2)) %>%
  select(Ethnicity
         , Count_of_Races
         , Race
         , N_Race_Ethnicity
         , Pct_Ethnicity_Total
         , Pct_US_Total) %>%
  arrange(Ethnicity
         , Count_of_Races
         , Race)

dataset2_tidy %>% 
  filter(Ethnicity == "All Ethnicities")

##          Ethnicity Count_of_Races                      Race
## 1  All Ethnicities      All races                 All races
## 2  All Ethnicities       One race     A. Indian/Alaska Nat.
## 3  All Ethnicities       One race                 All races
## 4  All Ethnicities       One race                     Asian
## 5  All Ethnicities       One race       Black or African A.
## 6  All Ethnicities       One race Hawaiian N. & Pacific Is.
## 7  All Ethnicities       One race                Some other
## 8  All Ethnicities       One race                     White
## 9  All Ethnicities     Two+ races                 All races
## 10 All Ethnicities     Two+ races      Some other + W/B/N/A
## 11 All Ethnicities     Two+ races              Two+ W/B/N/A
##    N_Race_Ethnicity Pct_Ethnicity_Total Pct_US_Total
## 1         281421906              100.00       100.00
## 2           2475956                0.88         0.88
## 3         274595678               97.57        97.57
## 4          10242998                3.64         3.64
## 5          34658190               12.32        12.32
## 6            398835                0.14         0.14
## 7          15359073                5.46         5.46
## 8         211460626               75.14        75.14
## 9           6826228                2.43         2.43
## 10          3162413                1.12         1.12
## 11          3663815                1.30         1.30

Dataset3: US Poverty by Various Demographics

Load the data into R

fileLocation <- 'https://raw.githubusercontent.com/ChadRyanBailey/607-Week6-Project2/master/07%20dataset3%20-%20Poverty%20Counts.csv'

dataset3 <- read.table(file = fileLocation, header = TRUE, sep = ',')

Review the column names

names(dataset3)

##  [1] "Characteristic"                             
##  [2] "X2016.Total"                                
##  [3] "X2016.Below.Poverty.Number"                 
##  [4] "X2016.Below.Poverty.Number.Margin.of.Error" 
##  [5] "X2016.Below.Poverty.Percent"                
##  [6] "X2016.Below.Poverty.Percent.Margin.of.Error"
##  [7] "X2017.Total"                                
##  [8] "X2017.Below.Poverty.Number"                 
##  [9] "X2017.Below.Poverty.Number.Margin.of.Error" 
## [10] "X2017.Below.Poverty.Percent"                
## [11] "X2017.Below.Poverty.Percent.Margin.of.Error"
## [12] "Change.in.Poverty.Number"                   
## [13] "Change.in.Poverty.Percent"

The column names are particularly long and cumbersome. Adjust column names to be more workable.

names(dataset3) <- str_replace_all(names(dataset3), "\\.", "_")
names(dataset3) <- str_replace_all(names(dataset3), "Poverty", "Pov")
names(dataset3) <- str_replace_all(names(dataset3), "Number", "N")
names(dataset3) <- str_replace_all(names(dataset3), "Percent", "Pct")
names(dataset3) <- str_replace_all(names(dataset3), "_Below", "")
names(dataset3) <- str_replace_all(names(dataset3), "_Margin_of_Error", "_MOE")
names(dataset3) <- str_replace_all(names(dataset3), "Total", "Total_N")



names(dataset3)

##  [1] "Characteristic"    "X2016_Total_N"     "X2016_Pov_N"      
##  [4] "X2016_Pov_N_MOE"   "X2016_Pov_Pct"     "X2016_Pov_Pct_MOE"
##  [7] "X2017_Total_N"     "X2017_Pov_N"       "X2017_Pov_N_MOE"  
## [10] "X2017_Pov_Pct"     "X2017_Pov_Pct_MOE" "Change_in_Pov_N"  
## [13] "Change_in_Pov_Pct"

Add a Characterist_Type variable to group the types of characteristics

dataset3 <- dataset3 %>%
  mutate(Characteristic = str_trim(Characteristic)) %>%
  mutate(Characteristic_Type = ifelse(is.na(X2016_Total_N), Characteristic, "")) %>%
  mutate(Characteristic_Type = repeat.labels(Characteristic_Type)) %>%
  mutate(Characteristic_Type = str_replace_all(Characteristic_Type, " and Hispanic Origin", "/Ethnicity")) %>%
  mutate(Characteristic_Type = str_replace_all(Characteristic_Type, "\\d", "")) %>%
  filter(X2016_Total_N != "" #remove blank rows
         & Characteristic != "Total"  #remove total row
         & Characteristic != "White, not Hispanic") #remove sub-element row

head(dataset3 %>% select(Characteristic_Type, Characteristic), n = 15)

##    Characteristic_Type      Characteristic
## 1       Race/Ethnicity               White
## 2       Race/Ethnicity               Black
## 3       Race/Ethnicity               Asian
## 4       Race/Ethnicity Hispanic (any race)
## 5                  Sex                Male
## 6                  Sex              Female
## 7                  Age        Under age 18
## 8                  Age       Aged 18 to 64
## 9                  Age   Aged 65 and older
## 10            Nativity         Native born
## 11            Nativity        Foreign born
## 12            Nativity Naturalized citizen
## 13            Nativity       Not a citizen
## 14              Region           Northeast
## 15              Region             Midwest

Seperate year-block of columns to its own dataframe and then union together.

#get just 2016 block of records
y_2016 <- dataset3 %>%
  mutate(X2016_Year = 2016) %>%
  select( X2016_Year
         , Characteristic_Type
         , Characteristic
         , X2016_Total_N
         , X2016_Pov_N
         , X2016_Pov_N_MOE
         , X2016_Pov_Pct
         , X2016_Pov_Pct_MOE)

names(y_2016) <- str_replace_all(names(y_2016), "X2016_", "")


#get just 2017 block of records
y_2017 <- dataset3 %>%
  mutate(X2017_Year = 2017) %>%
  select( X2017_Year
         , Characteristic_Type
         , Characteristic
         , X2017_Total_N
         , X2017_Pov_N
         , X2017_Pov_N_MOE
         , X2017_Pov_Pct
         , X2017_Pov_Pct_MOE)

names(y_2017) <- str_replace_all(names(y_2017), "X2017_", "")
  
#union (append/stack) the years back together
dataset3_tidy <- union(y_2016, y_2017)

The orginal table was showing counts in thousands. The following recalculates to the actual values.

dataset3_tidy <- dataset3_tidy %>%
  mutate(Total_N = Total_N *1000, Pov_N = Pov_N*1000)

Review a small sample to verify the transformations were successful.

dataset3_tidy %>%
  filter(Characteristic_Type %in% c("Race/Ethnicity", "Sex")) %>%
  select(Year ,Characteristic_Type ,Characteristic ,Total_N ,Pov_N, Pov_Pct) %>%
  arrange (Year ,Characteristic_Type ,Characteristic)

##    Year Characteristic_Type      Characteristic   Total_N    Pov_N Pov_Pct
## 1  2016      Race/Ethnicity               Asian  18879000  1908000    10.1
## 2  2016      Race/Ethnicity               Black  41962000  9234000    22.0
## 3  2016      Race/Ethnicity Hispanic (any race)  57556000 11137000    19.4
## 4  2016      Race/Ethnicity               White 245985000 27113000    11.0
## 5  2016                 Sex              Female 163234000 22931000    14.0
## 6  2016                 Sex                Male 156677000 17685000    11.3
## 7  2017      Race/Ethnicity               Asian  19475000  1953000    10.0
## 8  2017      Race/Ethnicity               Black  42474000  8993000    21.2
## 9  2017      Race/Ethnicity Hispanic (any race)  59053000 10790000    18.3
## 10 2017      Race/Ethnicity               White 247272000 26436000    10.7
## 11 2017                 Sex              Female 164433000 22333000    13.6
## 12 2017                 Sex                Male 158116000 17365000    11.0