- Assignment
- Data Set 1 : Coal Consumption By Country
- Data Set 2 : Austin Texas Water Quality Sampling Data
- Read the data in the CSV format and select only 7 columns
- Reduce the size by filtering out certain parameters and parameterTypes
- Change the data-type of the columns to reflect the nature of the data
- Tidy up the data
- Boxplot
- Convert Fahrenheit to Celsius
- Boxplot after converting Fahrenheit into Celsius
- Data Set 3 : NYPD Motor Vehicle Collisions.
Assignment
Data Set 1 : Coal Consumption By Country
The first data set is downloaded from https://catalog.data.gov/dataset/annual-coal-consumption-by-country
To keep all the data sets for this project in the same place, the data has been saved in CSV format on GitHub. The URL for the file is https://raw.githubusercontent.com/jameskuruvilla/DATA607/master/Coal_Consumption_By_Country.csv
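The code throughout this document relies on the readr, dplyr, tidyr, ggplot2, and lubridate packages. A minimal setup chunk, assuming these packages are installed:
library(readr)     # read_csv()
library(dplyr)     # filter(), group_by(), summarise(), the %>% pipe
library(tidyr)     # gather(), drop_na()
library(ggplot2)   # plotting
library(lubridate) # mdy(), mdy_hms()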
Read data from GitHub
coal_url <-'https://raw.githubusercontent.com/jameskuruvilla/DATA607/master/Coal_Consumption_By_Country.csv'
Tidy the dataset
Skip the first 2 lines to eliminate the headings
coal <- read_csv(coal_url, skip = 2)
Give the name 'country' to the first column
colnames(coal)[1] <- 'country'
glimpse(coal)
## Observations: 234
## Variables: 35
## $ country <chr> NA, "North America", "Bermuda", "Canada", "Greenland",...
## $ X2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ 1980 <chr> NA, "16.48681", "0", "0.96156", "0.00005", "0.10239", ...
## $ 1981 <chr> NA, "17.00366", "0", "0.99047", "0.00005", "0.10562", ...
## $ 1982 <chr> NA, "16.49711", "0", "1.05584", "0.00003", "0.11967", ...
## $ 1983 <chr> NA, "17.1397", "0", "1.11653", "0.00003", "0.12869", "...
## $ 1984 <chr> NA, "18.43818", "0", "1.23682", "0.00003", "0.13071", ...
## $ 1985 <chr> NA, "18.83168", "0", "1.20679", "0", "0.14646", "0", "...
## $ 1986 <chr> NA, "18.54233", "0", "1.12583", "0", "0.15609", "0", "...
## $ 1987 <chr> NA, "19.42918", "0", "1.25072", "0", "0.17001", "0", "...
## $ 1988 <chr> NA, "20.36408", "0", "1.35809", "0", "0.15967", "0", "...
## $ 1989 <chr> NA, "20.59531", "0", "1.35196", "0", "0.17359", "0", "...
## $ 1990 <chr> NA, "20.55542", "0", "1.21338", "0", "0.1694", "0", "1...
## $ 1991 <chr> NA, "20.4154", "0", "1.26457", "0", "0.15916", "0", "1...
## $ 1992 <chr> NA, "20.6121", "0", "1.32379", "0", "0.16584", "0", "1...
## $ 1993 <chr> NA, "21.25508", "0", "1.22875", "0", "0.19118", "0", "...
## $ 1994 <chr> NA, "21.33798", "0", "1.24492", "0", "0.1836", "0", "1...
## $ 1995 <chr> NA, "21.58119", "0", "1.28479", "0", "0.20768", "0", "...
## $ 1996 <chr> NA, "22.5529", "0", "1.30032", "0", "0.25067", "0", "2...
## $ 1997 <chr> NA, "23.15846", "0", "1.44933", "0", "0.26373", "0", "...
## $ 1998 <chr> NA, "23.43312", "0", "1.50985", "0", "0.26753", "0", "...
## $ 1999 <chr> NA, "23.41701", "0", "1.505", "0", "0.28947", "0", "21...
## $ 2000 <chr> NA, "24.49048", "0", "1.61651", "0", "0.29444", "0", "...
## $ 2001 <chr> NA, "23.59779", "0", "1.35444", "0", "0.32908", "0", "...
## $ 2002 <chr> NA, "23.638", "0", "1.36876", "0", "0.36525", "0", "21...
## $ 2003 <chr> NA, "24.12736", "0", "1.38766", "0", "0.41878", "0", "...
## $ 2004 <chr> NA, "24.22248", "0", "1.43684", "0", "0.31944", "0", "...
## $ 2005 <chr> NA, "24.64341", "0", "1.44948", "0", "0.39739", "0", "...
## $ 2006 <chr> NA, "24.26095", "0", "1.42135", "0", "0.39244", "0", "...
## $ 2007 <chr> NA, "24.52227", "0", "1.38369", "0", "0.38911", "0", "...
## $ 2008 <chr> NA, "24.08137", "0", "1.37385", "0", "0.32008", "0", "...
## $ 2009 <chr> NA, "21.17627", "0", "1.14856", "0", "0.33651", "0", "...
## $ 2010 <chr> NA, "22.35421", "0", "1.15082", "0", "0.36942", "0", "...
## $ 2011 <chr> NA, "21.03609", "0", "0.99674", "0", "0.38157", "0", "...
## $ 2012 <chr> NA, "18.68227", "0", "0.96531", "0", "0.37785", "0", "...
Remove the 2nd column, X2, which is empty
coal <- coal[-2]
glimpse(coal)
## Observations: 234
## Variables: 34
## $ country <chr> NA, "North America", "Bermuda", "Canada", "Greenland",...
## $ 1980 <chr> NA, "16.48681", "0", "0.96156", "0.00005", "0.10239", ...
## $ 1981 <chr> NA, "17.00366", "0", "0.99047", "0.00005", "0.10562", ...
## $ 1982 <chr> NA, "16.49711", "0", "1.05584", "0.00003", "0.11967", ...
## $ 1983 <chr> NA, "17.1397", "0", "1.11653", "0.00003", "0.12869", "...
## $ 1984 <chr> NA, "18.43818", "0", "1.23682", "0.00003", "0.13071", ...
## $ 1985 <chr> NA, "18.83168", "0", "1.20679", "0", "0.14646", "0", "...
## $ 1986 <chr> NA, "18.54233", "0", "1.12583", "0", "0.15609", "0", "...
## $ 1987 <chr> NA, "19.42918", "0", "1.25072", "0", "0.17001", "0", "...
## $ 1988 <chr> NA, "20.36408", "0", "1.35809", "0", "0.15967", "0", "...
## $ 1989 <chr> NA, "20.59531", "0", "1.35196", "0", "0.17359", "0", "...
## $ 1990 <chr> NA, "20.55542", "0", "1.21338", "0", "0.1694", "0", "1...
## $ 1991 <chr> NA, "20.4154", "0", "1.26457", "0", "0.15916", "0", "1...
## $ 1992 <chr> NA, "20.6121", "0", "1.32379", "0", "0.16584", "0", "1...
## $ 1993 <chr> NA, "21.25508", "0", "1.22875", "0", "0.19118", "0", "...
## $ 1994 <chr> NA, "21.33798", "0", "1.24492", "0", "0.1836", "0", "1...
## $ 1995 <chr> NA, "21.58119", "0", "1.28479", "0", "0.20768", "0", "...
## $ 1996 <chr> NA, "22.5529", "0", "1.30032", "0", "0.25067", "0", "2...
## $ 1997 <chr> NA, "23.15846", "0", "1.44933", "0", "0.26373", "0", "...
## $ 1998 <chr> NA, "23.43312", "0", "1.50985", "0", "0.26753", "0", "...
## $ 1999 <chr> NA, "23.41701", "0", "1.505", "0", "0.28947", "0", "21...
## $ 2000 <chr> NA, "24.49048", "0", "1.61651", "0", "0.29444", "0", "...
## $ 2001 <chr> NA, "23.59779", "0", "1.35444", "0", "0.32908", "0", "...
## $ 2002 <chr> NA, "23.638", "0", "1.36876", "0", "0.36525", "0", "21...
## $ 2003 <chr> NA, "24.12736", "0", "1.38766", "0", "0.41878", "0", "...
## $ 2004 <chr> NA, "24.22248", "0", "1.43684", "0", "0.31944", "0", "...
## $ 2005 <chr> NA, "24.64341", "0", "1.44948", "0", "0.39739", "0", "...
## $ 2006 <chr> NA, "24.26095", "0", "1.42135", "0", "0.39244", "0", "...
## $ 2007 <chr> NA, "24.52227", "0", "1.38369", "0", "0.38911", "0", "...
## $ 2008 <chr> NA, "24.08137", "0", "1.37385", "0", "0.32008", "0", "...
## $ 2009 <chr> NA, "21.17627", "0", "1.14856", "0", "0.33651", "0", "...
## $ 2010 <chr> NA, "22.35421", "0", "1.15082", "0", "0.36942", "0", "...
## $ 2011 <chr> NA, "21.03609", "0", "0.99674", "0", "0.38157", "0", "...
## $ 2012 <chr> NA, "18.68227", "0", "0.96531", "0", "0.37785", "0", "...
Remove rows containing NA and make the data set tidy and long using gather()
coal_long <- drop_na(coal) %>% gather(year, consumption, -country) # '-country' => all columns except country
summary(coal_long)
## country year consumption
## Length:7689 Length:7689 Length:7689
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
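Aside: in tidyr 1.0 and later, gather() is superseded by pivot_longer(). An equivalent reshaping of the same coal tibble (a sketch; the row order differs but the content is the same) would be:
coal_long <- drop_na(coal) %>%
  pivot_longer(-country, names_to = 'year', values_to = 'consumption')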
Change the data type of 'consumption' to numeric and 'year' to integer. The coercion warning below is expected: a few cells apparently contain non-numeric placeholder text, which becomes NA (the 563 NAs shown in the summary).
coal_long$consumption <- as.numeric(coal_long$consumption)
## Warning: NAs introduced by coercion
coal_long$year <- as.integer(coal_long$year)
summary(coal_long)
## country year consumption
## Length:7689 Min. :1980 Min. : 0.0000
## Class :character 1st Qu.:1988 1st Qu.: 0.0000
## Mode :character Median :1996 Median : 0.0002
## Mean :1996 Mean : 1.3716
## 3rd Qu.:2004 3rd Qu.: 0.0740
## Max. :2012 Max. :148.5470
## NA's :563
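The two base-R conversions above can also be written as a single dplyr step (a sketch producing the same coal_long):
coal_long <- coal_long %>%
  mutate(consumption = as.numeric(consumption),
         year = as.integer(year))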
unique(coal_long$country)
## [1] "North America"
## [2] "Bermuda"
## [3] "Canada"
## [4] "Greenland"
## [5] "Mexico"
## [6] "Saint Pierre and Miquelon"
## [7] "United States"
## [8] "Central & South America"
## [9] "Antarctica"
## [10] "Antigua and Barbuda"
## [11] "Argentina"
## [12] "Aruba"
## [13] "Bahamas, The"
## [14] "Barbados"
## [15] "Belize"
## [16] "Bolivia"
## [17] "Brazil"
## [18] "Cayman Islands"
## [19] "Chile"
## [20] "Colombia"
## [21] "Costa Rica"
## [22] "Cuba"
## [23] "Dominica"
## [24] "Dominican Republic"
## [25] "Ecuador"
## [26] "El Salvador"
## [27] "Falkland Islands (Islas Malvinas)"
## [28] "French Guiana"
## [29] "Grenada"
## [30] "Guadeloupe"
## [31] "Guatemala"
## [32] "Guyana"
## [33] "Haiti"
## [34] "Honduras"
## [35] "Jamaica"
## [36] "Martinique"
## [37] "Montserrat"
## [38] "Netherlands Antilles"
## [39] "Nicaragua"
## [40] "Panama"
## [41] "Paraguay"
## [42] "Peru"
## [43] "Puerto Rico"
## [44] "Saint Kitts and Nevis"
## [45] "Saint Lucia"
## [46] "Saint Vincent/Grenadines"
## [47] "Suriname"
## [48] "Trinidad and Tobago"
## [49] "Turks and Caicos Islands"
## [50] "Uruguay"
## [51] "Venezuela"
## [52] "Virgin Islands, U.S."
## [53] "Virgin Islands, British"
## [54] "Europe"
## [55] "Albania"
## [56] "Austria"
## [57] "Belgium"
## [58] "Bosnia and Herzegovina"
## [59] "Bulgaria"
## [60] "Croatia"
## [61] "Cyprus"
## [62] "Czech Republic"
## [63] "Denmark"
## [64] "Faroe Islands"
## [65] "Finland"
## [66] "Former Czechoslovakia"
## [67] "Former Serbia and Montenegro"
## [68] "Former Yugoslavia"
## [69] "France"
## [70] "Germany"
## [71] "Germany, East"
## [72] "Germany, West"
## [73] "Gibraltar"
## [74] "Greece"
## [75] "Hungary"
## [76] "Iceland"
## [77] "Ireland"
## [78] "Italy"
## [79] "Kosovo"
## [80] "Luxembourg"
## [81] "Macedonia"
## [82] "Malta"
## [83] "Montenegro"
## [84] "Netherlands"
## [85] "Norway"
## [86] "Poland"
## [87] "Portugal"
## [88] "Romania"
## [89] "Serbia"
## [90] "Slovakia"
## [91] "Slovenia"
## [92] "Spain"
## [93] "Sweden"
## [94] "Switzerland"
## [95] "Turkey"
## [96] "United Kingdom"
## [97] "Eurasia"
## [98] "Armenia"
## [99] "Azerbaijan"
## [100] "Belarus"
## [101] "Estonia"
## [102] "Former U.S.S.R."
## [103] "Georgia"
## [104] "Kazakhstan"
## [105] "Kyrgyzstan"
## [106] "Latvia"
## [107] "Lithuania"
## [108] "Moldova"
## [109] "Russia"
## [110] "Tajikistan"
## [111] "Turkmenistan"
## [112] "Ukraine"
## [113] "Uzbekistan"
## [114] "Middle East"
## [115] "Bahrain"
## [116] "Iran"
## [117] "Iraq"
## [118] "Israel"
## [119] "Jordan"
## [120] "Kuwait"
## [121] "Lebanon"
## [122] "Oman"
## [123] "Palestinian Territories"
## [124] "Qatar"
## [125] "Saudi Arabia"
## [126] "Syria"
## [127] "United Arab Emirates"
## [128] "Yemen"
## [129] "Africa"
## [130] "Algeria"
## [131] "Angola"
## [132] "Benin"
## [133] "Botswana"
## [134] "Burkina Faso"
## [135] "Burundi"
## [136] "Cameroon"
## [137] "Cape Verde"
## [138] "Central African Republic"
## [139] "Chad"
## [140] "Comoros"
## [141] "Congo (Brazzaville)"
## [142] "Congo (Kinshasa)"
## [143] "Cote dIvoire (IvoryCoast)"
## [144] "Djibouti"
## [145] "Egypt"
## [146] "Equatorial Guinea"
## [147] "Eritrea"
## [148] "Ethiopia"
## [149] "Gabon"
## [150] "Gambia, The"
## [151] "Ghana"
## [152] "Guinea"
## [153] "Guinea-Bissau"
## [154] "Kenya"
## [155] "Lesotho"
## [156] "Liberia"
## [157] "Libya"
## [158] "Madagascar"
## [159] "Malawi"
## [160] "Mali"
## [161] "Mauritania"
## [162] "Mauritius"
## [163] "Morocco"
## [164] "Mozambique"
## [165] "Namibia"
## [166] "Niger"
## [167] "Nigeria"
## [168] "Reunion"
## [169] "Rwanda"
## [170] "Saint Helena"
## [171] "Sao Tome and Principe"
## [172] "Senegal"
## [173] "Seychelles"
## [174] "Sierra Leone"
## [175] "Somalia"
## [176] "South Africa"
## [177] "Sudan and South Sudan"
## [178] "Swaziland"
## [179] "Tanzania"
## [180] "Togo"
## [181] "Tunisia"
## [182] "Uganda"
## [183] "Western Sahara"
## [184] "Zambia"
## [185] "Zimbabwe"
## [186] "Asia & Oceania"
## [187] "Afghanistan"
## [188] "American Samoa"
## [189] "Australia"
## [190] "Bangladesh"
## [191] "Bhutan"
## [192] "Brunei"
## [193] "Burma (Myanmar)"
## [194] "Cambodia"
## [195] "China"
## [196] "Cook Islands"
## [197] "Fiji"
## [198] "French Polynesia"
## [199] "Guam"
## [200] "Hawaiian Trade Zone"
## [201] "Hong Kong"
## [202] "India"
## [203] "Indonesia"
## [204] "Japan"
## [205] "Kiribati"
## [206] "Korea, North"
## [207] "Korea, South"
## [208] "Laos"
## [209] "Macau"
## [210] "Malaysia"
## [211] "Maldives"
## [212] "Mongolia"
## [213] "Nauru"
## [214] "Nepal"
## [215] "New Caledonia"
## [216] "New Zealand"
## [217] "Niue"
## [218] "Pakistan"
## [219] "Papua New Guinea"
## [220] "Philippines"
## [221] "Samoa"
## [222] "Singapore"
## [223] "Solomon Islands"
## [224] "Sri Lanka"
## [225] "Taiwan"
## [226] "Thailand"
## [227] "Timor-Leste (East Timor)"
## [228] "Tonga"
## [229] "U.S. Pacific Islands"
## [230] "Vanuatu"
## [231] "Vietnam"
## [232] "Wake Island"
## [233] "World"
Separate the country and the non-country rows of the main dataset.
Save the rows with real country names in a tibble called coal_country
# We notice from the unique list above that the country column contains non-country elements.
# Make a list of those items
noncountries <- c("North America", "Central & South America" , "World","Antarctica", "Europe", "Eurasia","Middle East","Africa","Asia & Oceania")
# Find the indices of the rows whose 'country' is a non-country
matches <- which(!is.na(match(coal_long$country, noncountries)))
# Save countries in a separate dataset called coal_country
coal_country <- coal_long[-matches,]
Save non-countries in a separate dataset called coal_region
coal_region <- coal_long[matches,]
summary(coal_region) ; summary(coal_country)
## country year consumption
## Length:297 Min. :1980 Min. : 0.0000
## Class :character 1st Qu.:1988 1st Qu.: 0.6169
## Mode :character Median :1996 Median : 7.6032
## Mean :1996 Mean : 21.9396
## 3rd Qu.:2004 3rd Qu.: 22.5782
## Max. :2012 Max. :148.5470
## country year consumption
## Length:7392 Min. :1980 Min. : 0.0000
## Class :character 1st Qu.:1988 1st Qu.: 0.0000
## Mode :character Median :1996 Median : 0.0001
## Mean :1996 Mean : 0.4771
## 3rd Qu.:2004 3rd Qu.: 0.0462
## Max. :2012 Max. :70.8132
## NA's :563
coal_region
## # A tibble: 297 × 3
## country year consumption
## <chr> <int> <dbl>
## 1 North America 1980 16.48681
## 2 Central & South America 1980 0.42101
## 3 Antarctica 1980 0.00000
## 4 Europe 1980 19.58460
## 5 Eurasia 1980 11.45935
## 6 Middle East 1980 0.02776
## 7 Africa 1980 2.25784
## 8 Asia & Oceania 1980 19.69262
## 9 World 1980 69.92998
## 10 North America 1981 17.00366
## # ... with 287 more rows
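The same country/region split can be expressed with dplyr::filter() and the %in% operator (a sketch that should produce the same two tibbles):
coal_region  <- filter(coal_long, country %in% noncountries)
coal_country <- filter(coal_long, !(country %in% noncountries))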
Visualizing the coal dataset
ggplot(data=coal_region, mapping = aes(x=year, y = consumption)) +
geom_smooth(mapping = aes(color = country), method = 'loess')
Conclusion
The top pink line (World) shows that coal consumption increased dramatically worldwide in the last decade of the data. Nearly all of that increase comes from the green Asia & Oceania line; the rest of the world actually decreased its consumption slightly over the same period.
Data Set 2 : Austin Texas Water Quality Sampling Data
The second data set is water quality data from the City of Austin, Texas, obtained directly from the city's data portal at data.austintexas.gov. It contains the results of over 1.14 million water quality tests performed on creeks, springs, wells, lakes, and other bodies of water by city staff. Scrolling down the web page shows a description of the data set followed by its metadata: a description of each column, giving the column's name, the kind of data it holds, and the data type used for that variable.
The URL of the site is https://data.austintexas.gov/Environment/Water-Quality-Sampling-Data/5tye-7ray
And the link to download the csv file is https://data.austintexas.gov/api/views/5tye-7ray/rows.csv?accessType=DOWNLOAD
This has 1.14 million rows and 24 columns
Read the data in the CSV format and select only 7 columns
water <- read_csv('https://data.austintexas.gov/api/views/5tye-7ray/rows.csv?accessType=DOWNLOAD')
## Parsed with column specification:
## cols(
## .default = col_character(),
## RESULT = col_double(),
## SAMPLE_SITE_NO = col_integer(),
## DEPTH_IN_METERS = col_double(),
## DATA_REF_NO = col_integer(),
## LAT_DD_WGS84 = col_double(),
## LON_DD_WGS84 = col_double(),
## SAMPLE_REF_NO = col_integer()
## )
## See spec(...) for full column specifications.
# Select only 7 of the 24 columns to reduce the volume of data.
water <- tibble('siteName' = water$SITE_NAME,
'siteType' = water$SITE_TYPE ,
'sampleTime' = water$SAMPLE_DATE,
'parameterType' = water$PARAM_TYPE,
'parameter'=water$PARAMETER,
'result'= water$RESULT,
'unit'= water$UNIT)
glimpse(water)
## Observations: 1,142,257
## Variables: 7
## $ siteName <chr> "Old Mill (Sunken Gardens) Spring", "Old Mill (S...
## $ siteType <chr> "Spring", "Spring", "Spring", "Spring", "Spring"...
## $ sampleTime <chr> "07/18/2013 02:58:00 PM", "07/18/2013 02:58:00 P...
## $ parameterType <chr> "Benthic Macroinvertebrates", "Benthic Macroinve...
## $ parameter <chr> "ANNELIDA (WORM/LEECHES/PLANARIA)", "CRAYFISH (A...
## $ result <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0...
## $ unit <chr> "1=Present; 0=Absent", "Relative Abundance (1): ...
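Equivalently, the seven columns can be picked and renamed in a single dplyr step (a sketch):
water <- water %>%
  select(siteName = SITE_NAME, siteType = SITE_TYPE, sampleTime = SAMPLE_DATE,
         parameterType = PARAM_TYPE, parameter = PARAMETER,
         result = RESULT, unit = UNIT)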
Reduce the size by filtering out certain parameters and parameterTypes
# Even though this dataset now has only 7 columns, it still has 1.14 million rows. To make the dataset
# simpler, I reduce its size by filtering out rows as follows. After a couple of iterations I found that
# parameterType is the right column to filter on.
unique(water$parameterType)
## [1] "Benthic Macroinvertebrates"
## [2] "Fish"
## [3] "Grain Size/Substrate Characterization"
## [4] "Habitat Evaluation"
## [5] "Physicals"
## [6] "Salamanders"
## [7] "Spatial"
## [8] "Temporal"
## [9] "Flow/Rainfall"
## [10] "Alkalinity/Hardness/pH"
## [11] "Conventionals"
## [12] "Solids/Conductivity"
## [13] "Major Ions"
## [14] "Nutrients"
## [15] "Metals"
## [16] "Bacteria/Pathogens"
## [17] "Oxygen"
## [18] "Sample Collection Methods"
## [19] "Clarity"
## [20] "Chlorophyll/Pheophytin"
## [21] "Environmental Conditions"
## [22] "PAHs"
## [23] "Calculated Metrics"
## [24] "Channel Assessment"
## [25] "Diatoms"
## [26] "Isotope"
## [27] "Carbon"
## [28] "Hydrocarbons"
## [29] "Organochlorines"
## [30] "PCBs"
## [31] "Sample Descriptors"
## [32] "Bivalve"
## [33] "Alcohols"
## [34] "Amides"
## [35] "Benzoic acid"
## [36] "Carbamates"
## [37] "Chlorinated"
## [38] "Emerging Contaminants"
## [39] "Fungicides"
## [40] "Insecticides"
## [41] "Organics"
## [42] "Organophosphates"
## [43] "Semi-Volatiles"
## [44] "Triazines"
## [45] "Uracils"
## [46] "Volatiles"
## [47] "Riparian Shrubby Tree"
## [48] "Riparian Tree - Large"
## [49] "Riparian Tree - Small"
## [50] "Riparian Vegetation"
## [51] "Inorganics"
## [52] "Radiochemical"
## [53] "Troglophilic"
## [54] "Disinfectants/Disinfection Byproducts"
## [55] "Riparian Ferns"
## [56] "Riparian Forbs"
## [57] "Riparian Grasses"
## [58] "Riparian Tree"
## [59] "Riparian Vine"
## [60] "Riparian Sedges Rushes"
## [61] "Biologicals"
## [62] "Algae (non-filamentous)"
## [63] "Algae (filamentous)"
## [64] "Vascular Macrophytes"
## [65] "Chlorophyte"
## [66] "Xanthophyte"
## [67] "Cyanobacteria"
## [68] "Chrysophyte"
## [69] "Haptophyte"
## [70] "Charophyte"
## [71] "Troglobitic"
## [72] "Rhodophyte"
## [73] "Moss"
## [74] "Aromatic Acids"
## [75] "Benzothiadiazoles"
## [76] "Diphenolics"
## [77] "Phenoxys"
## [78] "Phthalates"
## [79] "Surfactants"
## [80] "Survival/Toxicity Testing"
## [81] "Defoliants"
## [82] "Phaeophyte"
## [83] "Amphibian"
## [84] "Fertilizers"
## [85] "Flagellate Algae"
## [86] "Riparian Succulents"
## [87] "Dinoflagellate"
## [88] "Euglenophyte"
## [89] "Detergent Metabolites"
## [90] "Synurophyte"
## [91] "Sterols"
## [92] "Riparian Shrub"
## [93] "Riparian Graminoids"
## [94] "Cryptophyte"
## [95] "Ochrophyta"
filtered_water <- filter(water, (parameterType == 'Alkalinity/Hardness/pH')
| parameterType == 'Conventionals')
glimpse(filtered_water) # The filtered data is reduced to only 60,372 observations
## Observations: 60,372
## Variables: 7
## $ siteName <chr> "North Branch of Sycamore Creek @ Foster Ranch R...
## $ siteType <chr> "Stream", "Stream", "Spring", "Spring", "Stream"...
## $ sampleTime <chr> "01/21/2015 12:00:00 PM", "01/21/2015 12:00:00 P...
## $ parameterType <chr> "Alkalinity/Hardness/pH", "Conventionals", "Alka...
## $ parameter <chr> "PH", "WATER TEMPERATURE", "PH", "WATER TEMPERAT...
## $ result <dbl> 8.35, 11.73, 7.82, 18.29, 16.35, 8.21, 15.47, 7....
## $ unit <chr> "Standard units", "Deg. Celsius", "Standard unit...
#Apply another filter on the column parameter
filtered_water <- subset(filtered_water, (parameter =='PH')|(parameter == 'WATER TEMPERATURE'))
glimpse(filtered_water) # The number of observations is now reduced to 52,929
## Observations: 52,929
## Variables: 7
## $ siteName <chr> "North Branch of Sycamore Creek @ Foster Ranch R...
## $ siteType <chr> "Stream", "Stream", "Spring", "Spring", "Stream"...
## $ sampleTime <chr> "01/21/2015 12:00:00 PM", "01/21/2015 12:00:00 P...
## $ parameterType <chr> "Alkalinity/Hardness/pH", "Conventionals", "Alka...
## $ parameter <chr> "PH", "WATER TEMPERATURE", "PH", "WATER TEMPERAT...
## $ result <dbl> 8.35, 11.73, 7.82, 18.29, 16.35, 8.21, 15.47, 7....
## $ unit <chr> "Standard units", "Deg. Celsius", "Standard unit...
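The two filtering steps can also be combined into one filter() call using %in% (a sketch that should yield the same 52,929 rows):
filtered_water <- filter(water,
                         parameterType %in% c('Alkalinity/Hardness/pH', 'Conventionals'),
                         parameter %in% c('PH', 'WATER TEMPERATURE'))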
summary(filtered_water)
## siteName siteType sampleTime
## Length:52929 Length:52929 Length:52929
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## parameterType parameter result
## Length:52929 Length:52929 Min. : 0.1
## Class :character Class :character 1st Qu.: 7.7
## Mode :character Mode :character Median : 9.8
## Mean : 45.1
## 3rd Qu.: 21.1
## Max. :1112340.0
## NA's :1
## unit
## Length:52929
## Class :character
## Mode :character
##
##
##
##
Change the data-type of the columns to reflect the nature of the data
# Some of the columns should be factors. A factor is a categorical variable: it takes only a limited number of distinct values.
unique(filtered_water$siteType)
## [1] "Stream"
## [2] "Spring"
## [3] "Sediment"
## [4] "Non-spatial or Protected Location"
## [5] "Lake"
## [6] "Soil"
## [7] "Well"
## [8] "BMP - Detention Pond"
## [9] "BMP - Oil Grit Separator"
## [10] "Storm Drain"
## [11] "VIP Site"
## [12] "BMP - Wet Pond"
## [13] "BMP-Ret/Irrigation Pond"
## [14] "BMP-Drainage Swale"
unique(filtered_water$unit)
## [1] "Standard units" "Deg. Celsius" "Deg. Fahrenheit" "MG/L"
## [5] "Feet"
unique(filtered_water$parameterType)
## [1] "Alkalinity/Hardness/pH" "Conventionals"
unique(filtered_water$parameter)
## [1] "PH" "WATER TEMPERATURE"
filtered_water$siteType <- as.factor(filtered_water$siteType)
filtered_water$unit <- as.factor(filtered_water$unit)
filtered_water$parameterType <- as.factor(filtered_water$parameterType)
glimpse(filtered_water)
## Observations: 52,929
## Variables: 7
## $ siteName <chr> "North Branch of Sycamore Creek @ Foster Ranch R...
## $ siteType <fctr> Stream, Stream, Spring, Spring, Stream, Stream,...
## $ sampleTime <chr> "01/21/2015 12:00:00 PM", "01/21/2015 12:00:00 P...
## $ parameterType <fctr> Alkalinity/Hardness/pH, Conventionals, Alkalini...
## $ parameter <chr> "PH", "WATER TEMPERATURE", "PH", "WATER TEMPERAT...
## $ result <dbl> 8.35, 11.73, 7.82, 18.29, 16.35, 8.21, 15.47, 7....
## $ unit <fctr> Standard units, Deg. Celsius, Standard units, D...
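The three as.factor() conversions can be collapsed into one mutate() call; with dplyr 1.0+ this is typically written with across() (a sketch):
filtered_water <- filtered_water %>%
  mutate(across(c(siteType, unit, parameterType), as.factor))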
# sampleTime is stored as character. It should be converted to a date-time using lubridate's mdy_hms()
filtered_water$sampleTime <- mdy_hms(filtered_water$sampleTime)
summary(filtered_water)
## siteName siteType
## Length:52929 Stream :24831
## Class :character Lake :17172
## Mode :character Spring : 7320
## Well : 1607
## Soil : 1591
## Non-spatial or Protected Location: 218
## (Other) : 190
## sampleTime parameterType
## Min. :1986-05-01 13:35:00 Alkalinity/Hardness/pH:26190
## 1st Qu.:1997-07-11 11:00:00 Conventionals :26739
## Median :2002-07-23 15:58:00
## Mean :2003-10-06 05:28:42
## 3rd Qu.:2010-05-20 10:40:00
## Max. :2017-09-12 15:44:00
##
## parameter result unit
## Length:52929 Min. : 0.1 Deg. Celsius :26687
## Class :character 1st Qu.: 7.7 Deg. Fahrenheit: 47
## Mode :character Median : 9.8 Feet : 1
## Mean : 45.1 MG/L : 7
## 3rd Qu.: 21.1 Standard units :26187
## Max. :1112340.0
## NA's :1
Tidy up the data
# There is 1 observation for which the unit is 'Feet'.
# That must be a mistake. Let's look at that observation using the following command
subset(filtered_water,unit=='Feet')
## # A tibble: 1 × 7
## siteName siteType sampleTime parameterType parameter
## <chr> <fctr> <dttm> <fctr> <chr>
## 1 Lanier Well Well 2012-09-25 10:25:00 Conventionals WATER TEMPERATURE
## # ... with 2 more variables: result <dbl>, unit <fctr>
# Considering the parameter and the result, the unit should be Fahrenheit.
# To change the value, let's find the index of that observation
convert <- which(filtered_water$unit == 'Feet')
filtered_water$unit[convert] <- 'Deg. Fahrenheit' # this is how the Fahrenheit unit is written for the other observations
# There are 7 observations with unit 'MG/L', which seems incorrect. Let's look at those observations
subset(filtered_water,unit == 'MG/L')
## # A tibble: 7 × 7
## siteName siteType
## <chr> <fctr>
## 1 Barton Creek Downstream of Barton Spring Pool Stream
## 2 Barton Spring Spring
## 3 Barton Creek @ Hwy 71 Downstream of Little Barton Stream
## 4 Avery Deer Spring Spring
## 5 Canyon Creek Spring 1 (Tubb Spring) Spring
## 6 Fern Gully Spring Spring
## 7 Lady Bird Lake @ Holly Dock Lake
## # ... with 5 more variables: sampleTime <dttm>, parameterType <fctr>,
## # parameter <chr>, result <dbl>, unit <fctr>
# For the observation with 'result' around 70, the unit should be 'Deg. Fahrenheit'. There is 1 such observation.
# For the observations with 'result' around 20, the unit should be 'Deg. Celsius'. There are 3 such observations.
# For the observations with 'result' around 7, the unit should be 'Standard units' (pH). There are 3 such observations.
# The following statements transform the unit column to the correct values; note that the >70 rule must run before the >19 rule
convert <- which(filtered_water$unit == 'MG/L' & filtered_water$result > 70)
filtered_water$unit[convert] <- 'Deg. Fahrenheit'
convert <- which(filtered_water$unit == 'MG/L' & filtered_water$result > 19)
filtered_water$unit[convert] <- 'Deg. Celsius'
convert <- which(filtered_water$unit == 'MG/L')
filtered_water$unit[convert] <- 'Standard units'
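The same sequential corrections can be expressed declaratively with dplyr::case_when(), which makes the precedence of the rules explicit (a sketch; as above, the >70 rule is listed before the >19 rule):
filtered_water <- filtered_water %>%
  mutate(unit = factor(case_when(
    unit == 'MG/L' & result > 70 ~ 'Deg. Fahrenheit',
    unit == 'MG/L' & result > 19 ~ 'Deg. Celsius',
    unit == 'MG/L'               ~ 'Standard units',
    TRUE                         ~ as.character(unit)
  )))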
summary(filtered_water)
## siteName siteType
## Length:52929 Stream :24831
## Class :character Lake :17172
## Mode :character Spring : 7320
## Well : 1607
## Soil : 1591
## Non-spatial or Protected Location: 218
## (Other) : 190
## sampleTime parameterType
## Min. :1986-05-01 13:35:00 Alkalinity/Hardness/pH:26190
## 1st Qu.:1997-07-11 11:00:00 Conventionals :26739
## Median :2002-07-23 15:58:00
## Mean :2003-10-06 05:28:42
## 3rd Qu.:2010-05-20 10:40:00
## Max. :2017-09-12 15:44:00
##
## parameter result unit
## Length:52929 Min. : 0.1 Deg. Celsius :26690
## Class :character 1st Qu.: 7.7 Deg. Fahrenheit: 49
## Mode :character Median : 9.8 Feet : 0
## Mean : 45.1 MG/L : 0
## 3rd Qu.: 21.1 Standard units :26190
## Max. :1112340.0
## NA's :1
# The result column has some unusually high values. Let's look at those observations
subset(filtered_water,result > 1000)
## # A tibble: 74 × 7
## siteName siteType sampleTime parameterType parameter
## <chr> <fctr> <dttm> <fctr> <chr>
## 1 JTMEC Well B-10C Well 2013-12-19 Conventionals WATER TEMPERATURE
## 2 JTMEC Well B-10C Well 2013-09-10 Conventionals WATER TEMPERATURE
## 3 JTMEC Well B-10C Well 2013-07-23 Conventionals WATER TEMPERATURE
## 4 JTMEC Well B-10C Well 2013-06-25 Conventionals WATER TEMPERATURE
## 5 JTMEC Well B-10C Well 2013-05-21 Conventionals WATER TEMPERATURE
## 6 JTMEC Well B-10C Well 2013-04-16 Conventionals WATER TEMPERATURE
## 7 JTMEC Well B-10C Well 2013-03-19 Conventionals WATER TEMPERATURE
## 8 JTMEC Well B-10C Well 2013-02-26 Conventionals WATER TEMPERATURE
## 9 JTMEC Well B-10C Well 2013-02-19 Conventionals WATER TEMPERATURE
## 10 JTMEC Well B-10C Well 2013-02-12 Conventionals WATER TEMPERATURE
## # ... with 64 more rows, and 2 more variables: result <dbl>, unit <fctr>
# There are 74 observations with water temperature higher than 1000.
# That is clearly a mistake, and I have no way to infer the correct
# values. There is also 1 observation with result = NA, so I remove all of those observations
remove <- which(filtered_water$result > 1000 | is.na(filtered_water$result))
filtered_water <- filtered_water[-remove,]
summary(filtered_water)
## siteName siteType
## Length:52854 Stream :24830
## Class :character Lake :17172
## Mode :character Spring : 7320
## Soil : 1591
## Well : 1533
## Non-spatial or Protected Location: 218
## (Other) : 190
## sampleTime parameterType
## Min. :1986-05-01 13:35:00 Alkalinity/Hardness/pH:26189
## 1st Qu.:1997-07-10 15:03:45 Conventionals :26665
## Median :2002-06-20 12:10:00
## Mean :2003-10-01 17:19:45
## 3rd Qu.:2010-05-17 14:20:00
## Max. :2017-09-12 15:44:00
##
## parameter result unit
## Length:52854 Min. : 0.114 Deg. Celsius :26616
## Class :character 1st Qu.: 7.700 Deg. Fahrenheit: 49
## Mode :character Median : 9.575 Feet : 0
## Mean :14.266 MG/L : 0
## 3rd Qu.:21.100 Standard units :26189
## Max. :78.920
##
# The unit is a factor. Even though no rows remain for the levels 'MG/L' and 'Feet', the summary still shows those levels.
# The following command drops the unused levels
filtered_water$unit <- droplevels(filtered_water$unit)
summary(filtered_water)
## siteName siteType
## Length:52854 Stream :24830
## Class :character Lake :17172
## Mode :character Spring : 7320
## Soil : 1591
## Well : 1533
## Non-spatial or Protected Location: 218
## (Other) : 190
## sampleTime parameterType
## Min. :1986-05-01 13:35:00 Alkalinity/Hardness/pH:26189
## 1st Qu.:1997-07-10 15:03:45 Conventionals :26665
## Median :2002-06-20 12:10:00
## Mean :2003-10-01 17:19:45
## 3rd Qu.:2010-05-17 14:20:00
## Max. :2017-09-12 15:44:00
##
## parameter result unit
## Length:52854 Min. : 0.114 Deg. Celsius :26616
## Class :character 1st Qu.: 7.700 Deg. Fahrenheit: 49
## Mode :character Median : 9.575 Standard units :26189
## Mean :14.266
## 3rd Qu.:21.100
## Max. :78.920
##
Boxplot
ggplot(data=filtered_water, mapping = aes(x=unit, y=result))+
geom_boxplot()
There are 2 outlier observations above 60. Most probably the unit for those observations should be 'Deg. Fahrenheit', so let's change their unit to 'Deg. Fahrenheit'.
convert<- which(filtered_water$result>60 & filtered_water$unit == 'Deg. Celsius')
filtered_water$unit[convert] <- 'Deg. Fahrenheit'
#Boxplot again
ggplot(data=filtered_water, mapping = aes(x=unit, y= result))+
geom_boxplot()
Convert Fahrenheit to Celsius
Convert the observations so that the unit of water temperature is the same for all observations.
fahrenheit <- which(filtered_water$unit=='Deg. Fahrenheit')
filtered_water$result[fahrenheit] <- (filtered_water$result[fahrenheit] -32) *(5.0/9.0)
filtered_water$unit[fahrenheit] <- 'Deg. Celsius'
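For reuse, the conversion formula C = (F - 32) * 5/9 can be wrapped in a small vectorized helper (f_to_c is a hypothetical name, not part of any package):
f_to_c <- function(f) (f - 32) * 5 / 9
f_to_c(c(32, 212))  # 0 100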
Boxplot after converting Fahrenheit into Celsius
ggplot(data=filtered_water, mapping = aes(x=unit, y= result))+
geom_boxplot()
Now the data is clean and ready for analysis.
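As a quick sanity check on the cleaned data, one could compare median water temperatures across site types (a sketch using the columns defined above):
filtered_water %>%
  filter(parameter == 'WATER TEMPERATURE') %>%
  group_by(siteType) %>%
  summarise(medianTempC = median(result), samples = n())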
Data Set 3 : NYPD Motor Vehicle Collisions.
Details of Motor Vehicle Collisions in New York City provided by the Police Department (NYPD)
The URL for the site is https://data.cityofnewyork.us/Public-Safety/NYPD-Motor-Vehicle-Collisions/h9gi-nx95 and the link for downloading the CSV file is https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD
As noted on the site, this file has about 1.13 million observations with 29 variables. The data was last updated on October 4, 2017.
Read the data from the URL and reduce the size by applying filters
url <- 'https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD'
mvc <- read.csv(url)
remove <- which(is.na(mvc$ZIP.CODE))  # is.na() already returns TRUE/FALSE
mvc <- mvc[-remove,]
remove <- which(is.na(mvc$LATITUDE))
mvc <- mvc[-remove,]
filterout <- which(mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Driver Inattention/Distraction'|
                   mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Failure to Yield Right-of-Way'|
                   mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Backing Unsafely'|
                   mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Fatigued/Drowsy')
mvc <- mvc[filterout,]
# LONGITUDE == 0 seems to be a mistake; remove those rows. The column is still named
# LONGITUDE at this point (the renaming to lowercase happens below), and the removal is
# guarded because mvc[-integer(0),] would silently drop every row when nothing matches.
remove <- which(mvc$LONGITUDE == 0)
if (length(remove) > 0) mvc <- mvc[-remove,]
# The vehicle-type column is likewise still named VEHICLE.TYPE.CODE.1 here
remove <- which(mvc$VEHICLE.TYPE.CODE.1 == "OTHER")
if (length(remove) > 0) mvc <- mvc[-remove,]
filterout <- which(mvc$VEHICLE.TYPE.CODE.1 == "PASSENGER VEHICLE" |
                   mvc$VEHICLE.TYPE.CODE.1 == "SPORT UTILITY / STATION WAGON" |
                   mvc$VEHICLE.TYPE.CODE.1 == "TAXI" |
                   mvc$VEHICLE.TYPE.CODE.1 == "VAN" |
                   mvc$VEHICLE.TYPE.CODE.1 == "PICK-UP TRUCK")
mvc <- mvc[filterout,]
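The whole row-filtering sequence can also be written as a single dplyr pipeline, which avoids the empty-index pitfall entirely because filter() simply keeps whichever rows match (a sketch):
mvc <- mvc %>%
  filter(!is.na(ZIP.CODE),
         !is.na(LATITUDE),
         LONGITUDE != 0,
         CONTRIBUTING.FACTOR.VEHICLE.1 %in% c('Driver Inattention/Distraction',
                                              'Failure to Yield Right-of-Way',
                                              'Backing Unsafely',
                                              'Fatigued/Drowsy'),
         VEHICLE.TYPE.CODE.1 %in% c('PASSENGER VEHICLE',
                                    'SPORT UTILITY / STATION WAGON',
                                    'TAXI', 'VAN', 'PICK-UP TRUCK'))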
glimpse(mvc)
## Observations: 0
## Variables: 29
## $ DATE <fctr>
## $ TIME <fctr>
## $ BOROUGH <fctr>
## $ ZIP.CODE <int>
## $ LATITUDE <dbl>
## $ LONGITUDE <dbl>
## $ LOCATION <fctr>
## $ ON.STREET.NAME <fctr>
## $ CROSS.STREET.NAME <fctr>
## $ OFF.STREET.NAME <fctr>
## $ NUMBER.OF.PERSONS.INJURED <int>
## $ NUMBER.OF.PERSONS.KILLED <int>
## $ NUMBER.OF.PEDESTRIANS.INJURED <int>
## $ NUMBER.OF.PEDESTRIANS.KILLED <int>
## $ NUMBER.OF.CYCLIST.INJURED <int>
## $ NUMBER.OF.CYCLIST.KILLED <int>
## $ NUMBER.OF.MOTORIST.INJURED <int>
## $ NUMBER.OF.MOTORIST.KILLED <int>
## $ CONTRIBUTING.FACTOR.VEHICLE.1 <fctr>
## $ CONTRIBUTING.FACTOR.VEHICLE.2 <fctr>
## $ CONTRIBUTING.FACTOR.VEHICLE.3 <fctr>
## $ CONTRIBUTING.FACTOR.VEHICLE.4 <fctr>
## $ CONTRIBUTING.FACTOR.VEHICLE.5 <fctr>
## $ UNIQUE.KEY <int>
## $ VEHICLE.TYPE.CODE.1 <fctr>
## $ VEHICLE.TYPE.CODE.2 <fctr>
## $ VEHICLE.TYPE.CODE.3 <fctr>
## $ VEHICLE.TYPE.CODE.4 <fctr>
## $ VEHICLE.TYPE.CODE.5 <fctr>
Select 14 columns out of 29 and tidy up the data
mvc <- tibble("date" = mvc$DATE,
"borough"=mvc$BOROUGH,
"lattitude"=mvc$LATITUDE,
"longitude"=mvc$LONGITUDE,
"NoOfInjured"=mvc$NUMBER.OF.PERSONS.INJURED,
"NoOfKilled" = mvc$NUMBER.OF.PERSONS.KILLED,
"NoOfPedInjured"= mvc$NUMBER.OF.PEDESTRIANS.INJURED,
"NoOfPedKilled" = mvc$NUMBER.OF.PEDESTRIANS.KILLED,
"NoOfCycInjured" = mvc$NUMBER.OF.CYCLIST.INJURED,
"NoOfCycKilled"= mvc$NUMBER.OF.CYCLIST.KILLED,
"NoOfMotInjured"=mvc$NUMBER.OF.MOTORIST.INJURED,
"NoOfMotKilled"=mvc$NUMBER.OF.MOTORIST.KILLED,
"ConFacVeh1"=mvc$CONTRIBUTING.FACTOR.VEHICLE.1,
"VehType1"=mvc$VEHICLE.TYPE.CODE.1)
summary(mvc)
## date borough lattitude longitude
## 01/01/2013:0 :0 Min. : NA Min. : NA
## 01/01/2014:0 BRONX :0 1st Qu.: NA 1st Qu.: NA
## 01/01/2015:0 BROOKLYN :0 Median : NA Median : NA
## 01/01/2016:0 MANHATTAN :0 Mean :NaN Mean :NaN
## 01/01/2017:0 QUEENS :0 3rd Qu.: NA 3rd Qu.: NA
## 01/02/2013:0 STATEN ISLAND:0 Max. : NA Max. : NA
## (Other) :0
## NoOfInjured NoOfKilled NoOfPedInjured NoOfPedKilled NoOfCycInjured
## Min. : NA Min. : NA Min. : NA Min. : NA Min. : NA
## 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
## Median : NA Median : NA Median : NA Median : NA Median : NA
## Mean :NaN Mean :NaN Mean :NaN Mean :NaN Mean :NaN
## 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
## Max. : NA Max. : NA Max. : NA Max. : NA Max. : NA
##
## NoOfCycKilled NoOfMotInjured NoOfMotKilled
## Min. : NA Min. : NA Min. : NA
## 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
## Median : NA Median : NA Median : NA
## Mean :NaN Mean :NaN Mean :NaN
## 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
## Max. : NA Max. : NA Max. : NA
##
## ConFacVeh1 VehType1
## :0 :0
## Accelerator Defective :0 AMBULANCE :0
## Aggressive Driving/Road Rage:0 BICYCLE :0
## Alcohol Involvement :0 BUS :0
## Animals Action :0 FIRE TRUCK :0
## Backing Unsafely :0 LARGE COM VEH(6 OR MORE TIRES):0
## (Other) :0 (Other) :0
mvc$date <- mdy(mvc$date) # Converting data type of date column from factor to date
## Warning: All formats failed to parse. No formats found.
# droplevels() drops the unused (empty) levels from the factor variables
mvc$ConFacVeh1 <- droplevels(mvc$ConFacVeh1)
mvc$VehType1 <- droplevels(mvc$VehType1)
mvc$borough <- droplevels(mvc$borough)
mvc_old <- mvc  # keep a copy before aggregating
glimpse(mvc)
## Observations: 0
## Variables: 14
## $ date <date>
## $ borough <fctr>
## $ lattitude <dbl>
## $ longitude <dbl>
## $ NoOfInjured <int>
## $ NoOfKilled <int>
## $ NoOfPedInjured <int>
## $ NoOfPedKilled <int>
## $ NoOfCycInjured <int>
## $ NoOfCycKilled <int>
## $ NoOfMotInjured <int>
## $ NoOfMotKilled <int>
## $ ConFacVeh1 <fctr>
## $ VehType1 <fctr>
Now the data is tidy enough for further analysis
No. of accidents by borough
mvc_acc_borough <- mvc %>%
  select(2, 5, 6, 7, 8, 9, 10, 11, 12) %>%
  group_by(borough) %>%
  summarise(totInjured = sum(NoOfInjured),
            totKilled = sum(NoOfKilled),
            totPedInjured = sum(NoOfPedInjured),
            totCycInjured = sum(NoOfCycInjured),
            totCycKilled = sum(NoOfCycKilled),
            totMotInjured = sum(NoOfMotInjured),
            totMotKilled = sum(NoOfMotKilled))
print(mvc_acc_borough)
## # A tibble: 0 × 8
## # ... with 8 variables: borough <fctr>, totInjured <int>, totKilled <int>,
## # totPedInjured <int>, totCycInjured <int>, totCycKilled <int>,
## # totMotInjured <int>, totMotKilled <int>
No. of accidents by Vehicle Type
mvc_acc_VehType <- mvc %>%
  select(5, 6, 7, 8, 9, 10, 11, 12, 14) %>%
  group_by(VehType1) %>%
  summarise(totInjured = sum(NoOfInjured),
            totKilled = sum(NoOfKilled),
            totPedInjured = sum(NoOfPedInjured),
            totCycInjured = sum(NoOfCycInjured),
            totCycKilled = sum(NoOfCycKilled),
            totMotInjured = sum(NoOfMotInjured),
            totMotKilled = sum(NoOfMotKilled))
print(mvc_acc_VehType)
## # A tibble: 0 × 8
## # ... with 8 variables: VehType1 <fctr>, totInjured <int>,
## # totKilled <int>, totPedInjured <int>, totCycInjured <int>,
## # totCycKilled <int>, totMotInjured <int>, totMotKilled <int>
No. of accidents by reason
mvc_acc_Reason <- mvc %>%
  select(2, 5, 6, 7, 8, 9, 10, 11, 12, 13) %>%
  group_by(borough, ConFacVeh1) %>%
  summarise(totInjured = sum(NoOfInjured),
            totKilled = sum(NoOfKilled),
            totPedInjured = sum(NoOfPedInjured),
            totCycInjured = sum(NoOfCycInjured),
            totCycKilled = sum(NoOfCycKilled),
            totMotInjured = sum(NoOfMotInjured),
            totMotKilled = sum(NoOfMotKilled))
print(mvc_acc_Reason)
## Source: local data frame [0 x 9]
## Groups: borough [?]
##
## # ... with 9 variables: borough <fctr>, ConFacVeh1 <fctr>,
## # totInjured <int>, totKilled <int>, totPedInjured <int>,
## # totCycInjured <int>, totCycKilled <int>, totMotInjured <int>,
## # totMotKilled <int>
ggplot(data = mvc_acc_Reason, aes(x = ConFacVeh1, y = totInjured, fill = borough)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(aes(label = totInjured), vjust = .9, hjust = 1, position = position_dodge(width = 0.9)) +
  xlab("Contributing Factor") + ylab("Total Number Of Injured") +
  coord_flip()