Tidy And Transform 3 DataSets

CUNY MSDS - DATA607 - Project 2

James Kuruvilla

October 3, 2017

Assignment

Data Set 1 : Coal Consumption By Country

————————————————————————————————————
————————————————————————————————————

The first data set is downloaded from the link https://catalog.data.gov/dataset/annual-coal-consumption-by-country

In order to keep all the data sets of the projects in the in the same place, the data has been saved as CSV format in gitHub. The URL for the file is https://raw.githubusercontent.com/jameskuruvilla/DATA607/master/Coal_Consumption_By_Country.csv

Read data from gitHub

coal_url <-'https://raw.githubusercontent.com/jameskuruvilla/DATA607/master/Coal_Consumption_By_Country.csv'

Tidy the dataset

Skip first 2 lines to eliminate the headings

coal <- read_csv(coal_url, skip = 2)

Give the title ‘country’ to the country column

colnames(coal)[1] <- 'country'
glimpse(coal)
## Observations: 234
## Variables: 35
## $ country <chr> NA, "North America", "Bermuda", "Canada", "Greenland",...
## $ X2      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ 1980    <chr> NA, "16.48681", "0", "0.96156", "0.00005", "0.10239", ...
## $ 1981    <chr> NA, "17.00366", "0", "0.99047", "0.00005", "0.10562", ...
## $ 1982    <chr> NA, "16.49711", "0", "1.05584", "0.00003", "0.11967", ...
## $ 1983    <chr> NA, "17.1397", "0", "1.11653", "0.00003", "0.12869", "...
## $ 1984    <chr> NA, "18.43818", "0", "1.23682", "0.00003", "0.13071", ...
## $ 1985    <chr> NA, "18.83168", "0", "1.20679", "0", "0.14646", "0", "...
## $ 1986    <chr> NA, "18.54233", "0", "1.12583", "0", "0.15609", "0", "...
## $ 1987    <chr> NA, "19.42918", "0", "1.25072", "0", "0.17001", "0", "...
## $ 1988    <chr> NA, "20.36408", "0", "1.35809", "0", "0.15967", "0", "...
## $ 1989    <chr> NA, "20.59531", "0", "1.35196", "0", "0.17359", "0", "...
## $ 1990    <chr> NA, "20.55542", "0", "1.21338", "0", "0.1694", "0", "1...
## $ 1991    <chr> NA, "20.4154", "0", "1.26457", "0", "0.15916", "0", "1...
## $ 1992    <chr> NA, "20.6121", "0", "1.32379", "0", "0.16584", "0", "1...
## $ 1993    <chr> NA, "21.25508", "0", "1.22875", "0", "0.19118", "0", "...
## $ 1994    <chr> NA, "21.33798", "0", "1.24492", "0", "0.1836", "0", "1...
## $ 1995    <chr> NA, "21.58119", "0", "1.28479", "0", "0.20768", "0", "...
## $ 1996    <chr> NA, "22.5529", "0", "1.30032", "0", "0.25067", "0", "2...
## $ 1997    <chr> NA, "23.15846", "0", "1.44933", "0", "0.26373", "0", "...
## $ 1998    <chr> NA, "23.43312", "0", "1.50985", "0", "0.26753", "0", "...
## $ 1999    <chr> NA, "23.41701", "0", "1.505", "0", "0.28947", "0", "21...
## $ 2000    <chr> NA, "24.49048", "0", "1.61651", "0", "0.29444", "0", "...
## $ 2001    <chr> NA, "23.59779", "0", "1.35444", "0", "0.32908", "0", "...
## $ 2002    <chr> NA, "23.638", "0", "1.36876", "0", "0.36525", "0", "21...
## $ 2003    <chr> NA, "24.12736", "0", "1.38766", "0", "0.41878", "0", "...
## $ 2004    <chr> NA, "24.22248", "0", "1.43684", "0", "0.31944", "0", "...
## $ 2005    <chr> NA, "24.64341", "0", "1.44948", "0", "0.39739", "0", "...
## $ 2006    <chr> NA, "24.26095", "0", "1.42135", "0", "0.39244", "0", "...
## $ 2007    <chr> NA, "24.52227", "0", "1.38369", "0", "0.38911", "0", "...
## $ 2008    <chr> NA, "24.08137", "0", "1.37385", "0", "0.32008", "0", "...
## $ 2009    <chr> NA, "21.17627", "0", "1.14856", "0", "0.33651", "0", "...
## $ 2010    <chr> NA, "22.35421", "0", "1.15082", "0", "0.36942", "0", "...
## $ 2011    <chr> NA, "21.03609", "0", "0.99674", "0", "0.38157", "0", "...
## $ 2012    <chr> NA, "18.68227", "0", "0.96531", "0", "0.37785", "0", "...

Remove the 2nd column X2 which is empty

coal <- coal[-2]

glimpse(coal)
## Observations: 234
## Variables: 34
## $ country <chr> NA, "North America", "Bermuda", "Canada", "Greenland",...
## $ 1980    <chr> NA, "16.48681", "0", "0.96156", "0.00005", "0.10239", ...
## $ 1981    <chr> NA, "17.00366", "0", "0.99047", "0.00005", "0.10562", ...
## $ 1982    <chr> NA, "16.49711", "0", "1.05584", "0.00003", "0.11967", ...
## $ 1983    <chr> NA, "17.1397", "0", "1.11653", "0.00003", "0.12869", "...
## $ 1984    <chr> NA, "18.43818", "0", "1.23682", "0.00003", "0.13071", ...
## $ 1985    <chr> NA, "18.83168", "0", "1.20679", "0", "0.14646", "0", "...
## $ 1986    <chr> NA, "18.54233", "0", "1.12583", "0", "0.15609", "0", "...
## $ 1987    <chr> NA, "19.42918", "0", "1.25072", "0", "0.17001", "0", "...
## $ 1988    <chr> NA, "20.36408", "0", "1.35809", "0", "0.15967", "0", "...
## $ 1989    <chr> NA, "20.59531", "0", "1.35196", "0", "0.17359", "0", "...
## $ 1990    <chr> NA, "20.55542", "0", "1.21338", "0", "0.1694", "0", "1...
## $ 1991    <chr> NA, "20.4154", "0", "1.26457", "0", "0.15916", "0", "1...
## $ 1992    <chr> NA, "20.6121", "0", "1.32379", "0", "0.16584", "0", "1...
## $ 1993    <chr> NA, "21.25508", "0", "1.22875", "0", "0.19118", "0", "...
## $ 1994    <chr> NA, "21.33798", "0", "1.24492", "0", "0.1836", "0", "1...
## $ 1995    <chr> NA, "21.58119", "0", "1.28479", "0", "0.20768", "0", "...
## $ 1996    <chr> NA, "22.5529", "0", "1.30032", "0", "0.25067", "0", "2...
## $ 1997    <chr> NA, "23.15846", "0", "1.44933", "0", "0.26373", "0", "...
## $ 1998    <chr> NA, "23.43312", "0", "1.50985", "0", "0.26753", "0", "...
## $ 1999    <chr> NA, "23.41701", "0", "1.505", "0", "0.28947", "0", "21...
## $ 2000    <chr> NA, "24.49048", "0", "1.61651", "0", "0.29444", "0", "...
## $ 2001    <chr> NA, "23.59779", "0", "1.35444", "0", "0.32908", "0", "...
## $ 2002    <chr> NA, "23.638", "0", "1.36876", "0", "0.36525", "0", "21...
## $ 2003    <chr> NA, "24.12736", "0", "1.38766", "0", "0.41878", "0", "...
## $ 2004    <chr> NA, "24.22248", "0", "1.43684", "0", "0.31944", "0", "...
## $ 2005    <chr> NA, "24.64341", "0", "1.44948", "0", "0.39739", "0", "...
## $ 2006    <chr> NA, "24.26095", "0", "1.42135", "0", "0.39244", "0", "...
## $ 2007    <chr> NA, "24.52227", "0", "1.38369", "0", "0.38911", "0", "...
## $ 2008    <chr> NA, "24.08137", "0", "1.37385", "0", "0.32008", "0", "...
## $ 2009    <chr> NA, "21.17627", "0", "1.14856", "0", "0.33651", "0", "...
## $ 2010    <chr> NA, "22.35421", "0", "1.15082", "0", "0.36942", "0", "...
## $ 2011    <chr> NA, "21.03609", "0", "0.99674", "0", "0.38157", "0", "...
## $ 2012    <chr> NA, "18.68227", "0", "0.96531", "0", "0.37785", "0", "...

Remove Null rows and make the data set tidy and long using gather()

coal_long<-drop_na(coal)%>%gather(year,consumption, -country) # '-country' => all columns except country

summary(coal_long)
##    country              year           consumption       
##  Length:7689        Length:7689        Length:7689       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

change the data type of ‘consumption’ to numeric and ‘year’ to integer

coal_long$consumption <- as.numeric(coal_long$consumption)
## Warning: NAs introduced by coercion
coal_long$year        <- as.integer(coal_long$year)

summary(coal_long)
##    country               year       consumption      
##  Length:7689        Min.   :1980   Min.   :  0.0000  
##  Class :character   1st Qu.:1988   1st Qu.:  0.0000  
##  Mode  :character   Median :1996   Median :  0.0002  
##                     Mean   :1996   Mean   :  1.3716  
##                     3rd Qu.:2004   3rd Qu.:  0.0740  
##                     Max.   :2012   Max.   :148.5470  
##                                    NA's   :563
unique(coal_long$country)
##   [1] "North America"                    
##   [2] "Bermuda"                          
##   [3] "Canada"                           
##   [4] "Greenland"                        
##   [5] "Mexico"                           
##   [6] "Saint Pierre and Miquelon"        
##   [7] "United States"                    
##   [8] "Central & South America"          
##   [9] "Antarctica"                       
##  [10] "Antigua and Barbuda"              
##  [11] "Argentina"                        
##  [12] "Aruba"                            
##  [13] "Bahamas, The"                     
##  [14] "Barbados"                         
##  [15] "Belize"                           
##  [16] "Bolivia"                          
##  [17] "Brazil"                           
##  [18] "Cayman Islands"                   
##  [19] "Chile"                            
##  [20] "Colombia"                         
##  [21] "Costa Rica"                       
##  [22] "Cuba"                             
##  [23] "Dominica"                         
##  [24] "Dominican Republic"               
##  [25] "Ecuador"                          
##  [26] "El Salvador"                      
##  [27] "Falkland Islands (Islas Malvinas)"
##  [28] "French Guiana"                    
##  [29] "Grenada"                          
##  [30] "Guadeloupe"                       
##  [31] "Guatemala"                        
##  [32] "Guyana"                           
##  [33] "Haiti"                            
##  [34] "Honduras"                         
##  [35] "Jamaica"                          
##  [36] "Martinique"                       
##  [37] "Montserrat"                       
##  [38] "Netherlands Antilles"             
##  [39] "Nicaragua"                        
##  [40] "Panama"                           
##  [41] "Paraguay"                         
##  [42] "Peru"                             
##  [43] "Puerto Rico"                      
##  [44] "Saint Kitts and Nevis"            
##  [45] "Saint Lucia"                      
##  [46] "Saint Vincent/Grenadines"         
##  [47] "Suriname"                         
##  [48] "Trinidad and Tobago"              
##  [49] "Turks and Caicos Islands"         
##  [50] "Uruguay"                          
##  [51] "Venezuela"                        
##  [52] "Virgin Islands,  U.S."            
##  [53] "Virgin Islands, British"          
##  [54] "Europe"                           
##  [55] "Albania"                          
##  [56] "Austria"                          
##  [57] "Belgium"                          
##  [58] "Bosnia and Herzegovina"           
##  [59] "Bulgaria"                         
##  [60] "Croatia"                          
##  [61] "Cyprus"                           
##  [62] "Czech Republic"                   
##  [63] "Denmark"                          
##  [64] "Faroe Islands"                    
##  [65] "Finland"                          
##  [66] "Former Czechoslovakia"            
##  [67] "Former Serbia and Montenegro"     
##  [68] "Former Yugoslavia"                
##  [69] "France"                           
##  [70] "Germany"                          
##  [71] "Germany, East"                    
##  [72] "Germany, West"                    
##  [73] "Gibraltar"                        
##  [74] "Greece"                           
##  [75] "Hungary"                          
##  [76] "Iceland"                          
##  [77] "Ireland"                          
##  [78] "Italy"                            
##  [79] "Kosovo"                           
##  [80] "Luxembourg"                       
##  [81] "Macedonia"                        
##  [82] "Malta"                            
##  [83] "Montenegro"                       
##  [84] "Netherlands"                      
##  [85] "Norway"                           
##  [86] "Poland"                           
##  [87] "Portugal"                         
##  [88] "Romania"                          
##  [89] "Serbia"                           
##  [90] "Slovakia"                         
##  [91] "Slovenia"                         
##  [92] "Spain"                            
##  [93] "Sweden"                           
##  [94] "Switzerland"                      
##  [95] "Turkey"                           
##  [96] "United Kingdom"                   
##  [97] "Eurasia"                          
##  [98] "Armenia"                          
##  [99] "Azerbaijan"                       
## [100] "Belarus"                          
## [101] "Estonia"                          
## [102] "Former U.S.S.R."                  
## [103] "Georgia"                          
## [104] "Kazakhstan"                       
## [105] "Kyrgyzstan"                       
## [106] "Latvia"                           
## [107] "Lithuania"                        
## [108] "Moldova"                          
## [109] "Russia"                           
## [110] "Tajikistan"                       
## [111] "Turkmenistan"                     
## [112] "Ukraine"                          
## [113] "Uzbekistan"                       
## [114] "Middle East"                      
## [115] "Bahrain"                          
## [116] "Iran"                             
## [117] "Iraq"                             
## [118] "Israel"                           
## [119] "Jordan"                           
## [120] "Kuwait"                           
## [121] "Lebanon"                          
## [122] "Oman"                             
## [123] "Palestinian Territories"          
## [124] "Qatar"                            
## [125] "Saudi Arabia"                     
## [126] "Syria"                            
## [127] "United Arab Emirates"             
## [128] "Yemen"                            
## [129] "Africa"                           
## [130] "Algeria"                          
## [131] "Angola"                           
## [132] "Benin"                            
## [133] "Botswana"                         
## [134] "Burkina Faso"                     
## [135] "Burundi"                          
## [136] "Cameroon"                         
## [137] "Cape Verde"                       
## [138] "Central African Republic"         
## [139] "Chad"                             
## [140] "Comoros"                          
## [141] "Congo (Brazzaville)"              
## [142] "Congo (Kinshasa)"                 
## [143] "Cote dIvoire (IvoryCoast)"        
## [144] "Djibouti"                         
## [145] "Egypt"                            
## [146] "Equatorial Guinea"                
## [147] "Eritrea"                          
## [148] "Ethiopia"                         
## [149] "Gabon"                            
## [150] "Gambia, The"                      
## [151] "Ghana"                            
## [152] "Guinea"                           
## [153] "Guinea-Bissau"                    
## [154] "Kenya"                            
## [155] "Lesotho"                          
## [156] "Liberia"                          
## [157] "Libya"                            
## [158] "Madagascar"                       
## [159] "Malawi"                           
## [160] "Mali"                             
## [161] "Mauritania"                       
## [162] "Mauritius"                        
## [163] "Morocco"                          
## [164] "Mozambique"                       
## [165] "Namibia"                          
## [166] "Niger"                            
## [167] "Nigeria"                          
## [168] "Reunion"                          
## [169] "Rwanda"                           
## [170] "Saint Helena"                     
## [171] "Sao Tome and Principe"            
## [172] "Senegal"                          
## [173] "Seychelles"                       
## [174] "Sierra Leone"                     
## [175] "Somalia"                          
## [176] "South Africa"                     
## [177] "Sudan and South Sudan"            
## [178] "Swaziland"                        
## [179] "Tanzania"                         
## [180] "Togo"                             
## [181] "Tunisia"                          
## [182] "Uganda"                           
## [183] "Western Sahara"                   
## [184] "Zambia"                           
## [185] "Zimbabwe"                         
## [186] "Asia & Oceania"                   
## [187] "Afghanistan"                      
## [188] "American Samoa"                   
## [189] "Australia"                        
## [190] "Bangladesh"                       
## [191] "Bhutan"                           
## [192] "Brunei"                           
## [193] "Burma (Myanmar)"                  
## [194] "Cambodia"                         
## [195] "China"                            
## [196] "Cook Islands"                     
## [197] "Fiji"                             
## [198] "French Polynesia"                 
## [199] "Guam"                             
## [200] "Hawaiian Trade Zone"              
## [201] "Hong Kong"                        
## [202] "India"                            
## [203] "Indonesia"                        
## [204] "Japan"                            
## [205] "Kiribati"                         
## [206] "Korea, North"                     
## [207] "Korea, South"                     
## [208] "Laos"                             
## [209] "Macau"                            
## [210] "Malaysia"                         
## [211] "Maldives"                         
## [212] "Mongolia"                         
## [213] "Nauru"                            
## [214] "Nepal"                            
## [215] "New Caledonia"                    
## [216] "New Zealand"                      
## [217] "Niue"                             
## [218] "Pakistan"                         
## [219] "Papua New Guinea"                 
## [220] "Philippines"                      
## [221] "Samoa"                            
## [222] "Singapore"                        
## [223] "Solomon Islands"                  
## [224] "Sri Lanka"                        
## [225] "Taiwan"                           
## [226] "Thailand"                         
## [227] "Timor-Leste (East Timor)"         
## [228] "Tonga"                            
## [229] "U.S. Pacific Islands"             
## [230] "Vanuatu"                          
## [231] "Vietnam"                          
## [232] "Wake Island"                      
## [233] "World"

Separate the country and the noncountry datasets from the main dataset.

Save the elements with country name in a tibble called coal_country
# We notice from the unique list of country colum that there are non-country elements in the column.
# Make a list of such items from this list
noncountries <- c("North America", "Central & South America" , "World","Antarctica", "Europe", "Eurasia","Middle East","Africa","Asia & Oceania")

#Find the ids of the the elements with noncountries as 'country'
matches<- which(!is.na(match(coal_long$country, noncountries)))

# Save countries in a separate dataset called coal_country
coal_country <- coal_long[-matches,]
Save non-countries in a separate dataset called coal_region
coal_region <- coal_long[matches,]

summary(coal_region) ; summary(coal_country)
##    country               year       consumption      
##  Length:297         Min.   :1980   Min.   :  0.0000  
##  Class :character   1st Qu.:1988   1st Qu.:  0.6169  
##  Mode  :character   Median :1996   Median :  7.6032  
##                     Mean   :1996   Mean   : 21.9396  
##                     3rd Qu.:2004   3rd Qu.: 22.5782  
##                     Max.   :2012   Max.   :148.5470
##    country               year       consumption     
##  Length:7392        Min.   :1980   Min.   : 0.0000  
##  Class :character   1st Qu.:1988   1st Qu.: 0.0000  
##  Mode  :character   Median :1996   Median : 0.0001  
##                     Mean   :1996   Mean   : 0.4771  
##                     3rd Qu.:2004   3rd Qu.: 0.0462  
##                     Max.   :2012   Max.   :70.8132  
##                                    NA's   :563
coal_region
## # A tibble: 297 × 3
##                    country  year consumption
##                      <chr> <int>       <dbl>
## 1            North America  1980    16.48681
## 2  Central & South America  1980     0.42101
## 3               Antarctica  1980     0.00000
## 4                   Europe  1980    19.58460
## 5                  Eurasia  1980    11.45935
## 6              Middle East  1980     0.02776
## 7                   Africa  1980     2.25784
## 8           Asia & Oceania  1980    19.69262
## 9                    World  1980    69.92998
## 10           North America  1981    17.00366
## # ... with 287 more rows

Visualizing the coal Daataset

ggplot(data=coal_region, mapping = aes(x=year, y = consumption)) +
  geom_smooth(mapping = aes(color = country), method = 'loess')

Conclusion

The top pink line says that, coal consumption has increased dramatically in the last decade worldwide. All of that increase appears to be due to changes in the green line Asia and Oceania. The rest of the world actually slightly decreased its consumption over that same time period.

Data Set 2 : Austin Texas Water Quality Sampling Data

The second data set is the water quality data from the City of Austin, Texas. This data set comes directly from the City of Austin through their data portal located at data.austintexas.gov. It contains the results of over 1.14 million water quality tests performed on creeks, springs, wells, lakes, and other bodies of water by city staff. If we scroll down on the webpage, you’ll see a description of the data set, followed by what we call metadata, a description of the specific columns that are included in the data set, and this includes the name of the column, a description of the kind of data you’ll find in that column, and then the data type used for that variable.

The URL of the site is https://data.austintexas.gov/Environment/Water-Quality-Sampling-Data/5tye-7ray

And the link to download the csv file is https://data.austintexas.gov/api/views/5tye-7ray/rows.csv?accessType=DOWNLOAD

This has 1.14 million rows and 24 columns

Read the data in the CSV format and select only 7 columns

water <- read_csv('https://data.austintexas.gov/api/views/5tye-7ray/rows.csv?accessType=DOWNLOAD')
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   RESULT = col_double(),
##   SAMPLE_SITE_NO = col_integer(),
##   DEPTH_IN_METERS = col_double(),
##   DATA_REF_NO = col_integer(),
##   LAT_DD_WGS84 = col_double(),
##   LON_DD_WGS84 = col_double(),
##   SAMPLE_REF_NO = col_integer()
## )
## See spec(...) for full column specifications.
# selet only 6 columns from the data set to reduce the volumn of data. 

water <- tibble('siteName' = water$SITE_NAME,
                'siteType' = water$SITE_TYPE ,
                'sampleTime' = water$SAMPLE_DATE,
                'parameterType' = water$PARAM_TYPE,
                'parameter'=water$PARAMETER,
                'result'= water$RESULT,
                'unit'= water$UNIT)
glimpse(water)
## Observations: 1,142,257
## Variables: 7
## $ siteName      <chr> "Old Mill (Sunken Gardens) Spring", "Old Mill (S...
## $ siteType      <chr> "Spring", "Spring", "Spring", "Spring", "Spring"...
## $ sampleTime    <chr> "07/18/2013 02:58:00 PM", "07/18/2013 02:58:00 P...
## $ parameterType <chr> "Benthic Macroinvertebrates", "Benthic Macroinve...
## $ parameter     <chr> "ANNELIDA (WORM/LEECHES/PLANARIA)", "CRAYFISH (A...
## $ result        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0...
## $ unit          <chr> "1=Present; 0=Absent", "Relative Abundance (1): ...

Reduce the size by filtering out certain parameeters and parameterTypes

# Eventhough this dataset has ony 7 columns, it has 1.14 million rows. Inorder to make the dataset more simple, I want to reduce the size 
# by filtering out rows as follows. After couple of iterations I figured out parameterType is the column I want to apply the filter.

unique(water$parameterType)
##  [1] "Benthic Macroinvertebrates"           
##  [2] "Fish"                                 
##  [3] "Grain Size/Substrate Characterization"
##  [4] "Habitat Evaluation"                   
##  [5] "Physicals"                            
##  [6] "Salamanders"                          
##  [7] "Spatial"                              
##  [8] "Temporal"                             
##  [9] "Flow/Rainfall"                        
## [10] "Alkalinity/Hardness/pH"               
## [11] "Conventionals"                        
## [12] "Solids/Conductivity"                  
## [13] "Major Ions"                           
## [14] "Nutrients"                            
## [15] "Metals"                               
## [16] "Bacteria/Pathogens"                   
## [17] "Oxygen"                               
## [18] "Sample Collection Methods"            
## [19] "Clarity"                              
## [20] "Chlorophyll/Pheophytin"               
## [21] "Environmental Conditions"             
## [22] "PAHs"                                 
## [23] "Calculated Metrics"                   
## [24] "Channel Assessment"                   
## [25] "Diatoms"                              
## [26] "Isotope"                              
## [27] "Carbon"                               
## [28] "Hydrocarbons"                         
## [29] "Organochlorines"                      
## [30] "PCBs"                                 
## [31] "Sample Descriptors"                   
## [32] "Bivalve"                              
## [33] "Alcohols"                             
## [34] "Amides"                               
## [35] "Benzoic acid"                         
## [36] "Carbamates"                           
## [37] "Chlorinated"                          
## [38] "Emerging Contaminants"                
## [39] "Fungicides"                           
## [40] "Insecticides"                         
## [41] "Organics"                             
## [42] "Organophosphates"                     
## [43] "Semi-Volatiles"                       
## [44] "Triazines"                            
## [45] "Uracils"                              
## [46] "Volatiles"                            
## [47] "Riparian Shrubby Tree"                
## [48] "Riparian Tree - Large"                
## [49] "Riparian Tree - Small"                
## [50] "Riparian Vegetation"                  
## [51] "Inorganics"                           
## [52] "Radiochemical"                        
## [53] "Troglophilic"                         
## [54] "Disinfectants/Disinfection Byproducts"
## [55] "Riparian Ferns"                       
## [56] "Riparian Forbs"                       
## [57] "Riparian Grasses"                     
## [58] "Riparian Tree"                        
## [59] "Riparian Vine"                        
## [60] "Riparian Sedges Rushes"               
## [61] "Biologicals"                          
## [62] "Algae (non-filamentous)"              
## [63] "Algae (filamentous)"                  
## [64] "Vascular Macrophytes"                 
## [65] "Chlorophyte"                          
## [66] "Xanthophyte"                          
## [67] "Cyanobacteria"                        
## [68] "Chrysophyte"                          
## [69] "Haptophyte"                           
## [70] "Charophyte"                           
## [71] "Troglobitic"                          
## [72] "Rhodophyte"                           
## [73] "Moss"                                 
## [74] "Aromatic Acids"                       
## [75] "Benzothiadiazoles"                    
## [76] "Diphenolics"                          
## [77] "Phenoxys"                             
## [78] "Phthalates"                           
## [79] "Surfactants"                          
## [80] "Survival/Toxicity Testing"            
## [81] "Defoliants"                           
## [82] "Phaeophyte"                           
## [83] "Amphibian"                            
## [84] "Fertilizers"                          
## [85] "Flagellate Algae"                     
## [86] "Riparian Succulents"                  
## [87] "Dinoflagellate"                       
## [88] "Euglenophyte"                         
## [89] "Detergent Metabolites"                
## [90] "Synurophyte"                          
## [91] "Sterols"                              
## [92] "Riparian Shrub"                       
## [93] "Riparian Graminoids"                  
## [94] "Cryptophyte"                          
## [95] "Ochrophyta"
filtered_water <- filter(water, (parameterType == 'Alkalinity/Hardness/pH') 
                                 | parameterType == 'Conventionals')

glimpse(filtered_water)  # The filtered data has a reduced the size of only 60,372 Observations
## Observations: 60,372
## Variables: 7
## $ siteName      <chr> "North Branch of Sycamore Creek @ Foster Ranch R...
## $ siteType      <chr> "Stream", "Stream", "Spring", "Spring", "Stream"...
## $ sampleTime    <chr> "01/21/2015 12:00:00 PM", "01/21/2015 12:00:00 P...
## $ parameterType <chr> "Alkalinity/Hardness/pH", "Conventionals", "Alka...
## $ parameter     <chr> "PH", "WATER TEMPERATURE", "PH", "WATER TEMPERAT...
## $ result        <dbl> 8.35, 11.73, 7.82, 18.29, 16.35, 8.21, 15.47, 7....
## $ unit          <chr> "Standard units", "Deg. Celsius", "Standard unit...
#Apply another filter on the column parameter
filtered_water <- subset(filtered_water, (parameter =='PH')|(parameter == 'WATER TEMPERATURE'))

glimpse(filtered_water) # Thus the number of observations reduced to 52,929
## Observations: 52,929
## Variables: 7
## $ siteName      <chr> "North Branch of Sycamore Creek @ Foster Ranch R...
## $ siteType      <chr> "Stream", "Stream", "Spring", "Spring", "Stream"...
## $ sampleTime    <chr> "01/21/2015 12:00:00 PM", "01/21/2015 12:00:00 P...
## $ parameterType <chr> "Alkalinity/Hardness/pH", "Conventionals", "Alka...
## $ parameter     <chr> "PH", "WATER TEMPERATURE", "PH", "WATER TEMPERAT...
## $ result        <dbl> 8.35, 11.73, 7.82, 18.29, 16.35, 8.21, 15.47, 7....
## $ unit          <chr> "Standard units", "Deg. Celsius", "Standard unit...
summary(filtered_water)
##    siteName           siteType          sampleTime       
##  Length:52929       Length:52929       Length:52929      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  parameterType       parameter             result         
##  Length:52929       Length:52929       Min.   :      0.1  
##  Class :character   Class :character   1st Qu.:      7.7  
##  Mode  :character   Mode  :character   Median :      9.8  
##                                        Mean   :     45.1  
##                                        3rd Qu.:     21.1  
##                                        Max.   :1112340.0  
##                                        NA's   :1          
##      unit          
##  Length:52929      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Change the data-type of the columns to reflect the nature of the data

# some of the columns should be factors. factor is a categorical variable which means it contains limited number of values.

unique(filtered_water$siteType)
##  [1] "Stream"                           
##  [2] "Spring"                           
##  [3] "Sediment"                         
##  [4] "Non-spatial or Protected Location"
##  [5] "Lake"                             
##  [6] "Soil"                             
##  [7] "Well"                             
##  [8] "BMP - Detention Pond"             
##  [9] "BMP - Oil Grit Separator"         
## [10] "Storm Drain"                      
## [11] "VIP Site"                         
## [12] "BMP - Wet Pond"                   
## [13] "BMP-Ret/Irrigation Pond"          
## [14] "BMP-Drainage Swale"
unique(filtered_water$unit)
## [1] "Standard units"  "Deg. Celsius"    "Deg. Fahrenheit" "MG/L"           
## [5] "Feet"
unique(filtered_water$parameterType)
## [1] "Alkalinity/Hardness/pH" "Conventionals"
unique(filtered_water$parameter)
## [1] "PH"                "WATER TEMPERATURE"
filtered_water$siteType <- as.factor(filtered_water$siteType)
filtered_water$unit <- as.factor(filtered_water$unit)
filtered_water$parameterType <- as.factor(filtered_water$parameterType)
filtered_water$parameterType <- as.factor(filtered_water$parameterType)

glimpse(filtered_water)
## Observations: 52,929
## Variables: 7
## $ siteName      <chr> "North Branch of Sycamore Creek @ Foster Ranch R...
## $ siteType      <fctr> Stream, Stream, Spring, Spring, Stream, Stream,...
## $ sampleTime    <chr> "01/21/2015 12:00:00 PM", "01/21/2015 12:00:00 P...
## $ parameterType <fctr> Alkalinity/Hardness/pH, Conventionals, Alkalini...
## $ parameter     <chr> "PH", "WATER TEMPERATURE", "PH", "WATER TEMPERAT...
## $ result        <dbl> 8.35, 11.73, 7.82, 18.29, 16.35, 8.21, 15.47, 7....
## $ unit          <fctr> Standard units, Deg. Celsius, Standard units, D...
# SampleTime is stored as Character. That should be convered to date format
filtered_water$sampleTime <- mdy_hms(filtered_water$sampleTime)

summary(filtered_water)
##    siteName                                      siteType    
##  Length:52929       Stream                           :24831  
##  Class :character   Lake                             :17172  
##  Mode  :character   Spring                           : 7320  
##                     Well                             : 1607  
##                     Soil                             : 1591  
##                     Non-spatial or Protected Location:  218  
##                     (Other)                          :  190  
##    sampleTime                                 parameterType  
##  Min.   :1986-05-01 13:35:00   Alkalinity/Hardness/pH:26190  
##  1st Qu.:1997-07-11 11:00:00   Conventionals         :26739  
##  Median :2002-07-23 15:58:00                                 
##  Mean   :2003-10-06 05:28:42                                 
##  3rd Qu.:2010-05-20 10:40:00                                 
##  Max.   :2017-09-12 15:44:00                                 
##                                                              
##   parameter             result                       unit      
##  Length:52929       Min.   :      0.1   Deg. Celsius   :26687  
##  Class :character   1st Qu.:      7.7   Deg. Fahrenheit:   47  
##  Mode  :character   Median :      9.8   Feet           :    1  
##                     Mean   :     45.1   MG/L           :    7  
##                     3rd Qu.:     21.1   Standard units :26187  
##                     Max.   :1112340.0                          
##                     NA's   :1

Tidy up the data

# There is 1 observation for which the unit is 'Feet'. 
#That should be a mistake. Lets see that observation using following command

subset(filtered_water,unit=='Feet')
## # A tibble: 1 × 7
##      siteName siteType          sampleTime parameterType         parameter
##         <chr>   <fctr>              <dttm>        <fctr>             <chr>
## 1 Lanier Well     Well 2012-09-25 10:25:00 Conventionals WATER TEMPERATURE
## # ... with 2 more variables: result <dbl>, unit <fctr>
# considering the parameter and result, the unit should be 'Fahrenheit'. 
#In order to change the value, lets find the id of the observation

convert<-which(filtered_water$unit == 'Feet')

filtered_water$unit[convert] <- 'Deg. Fahrenheit' # this is how Fahrenheit unit is mentioned for other observations

# There are 7 observations with unit 'MG/L'. That seems to be incorrect. Lets see those observations
subset(filtered_water,unit == 'MG/L')
## # A tibble: 7 × 7
##                                            siteName siteType
##                                               <chr>   <fctr>
## 1     Barton Creek Downstream of Barton Spring Pool   Stream
## 2                                     Barton Spring   Spring
## 3 Barton Creek @ Hwy 71 Downstream of Little Barton   Stream
## 4                                 Avery Deer Spring   Spring
## 5               Canyon Creek Spring 1 (Tubb Spring)   Spring
## 6                                 Fern Gully Spring   Spring
## 7                       Lady Bird Lake @ Holly Dock     Lake
## # ... with 5 more variables: sampleTime <dttm>, parameterType <fctr>,
## #   parameter <chr>, result <dbl>, unit <fctr>
# For the observations with 'result' around 70, the unit should be 'Deg. Fahrenheit' . There is one such observation
# For the observations with 'result' around 20, the unit should be 'Deg. Celsius'. There are 3 such observation
# For the observations with 'result' around 7, the unit should be 'Standard units' for PH. There are 3 such observation
# Following statements transform unit column to the correct values

convert <- which(filtered_water$unit == 'MG/L' & filtered_water$result > 70)
filtered_water$unit[convert] <- 'Deg. Fahrenheit'

convert <- which(filtered_water$unit == 'MG/L' & filtered_water$result > 19)
filtered_water$unit[convert] <- 'Deg. Celsius'

convert <- which(filtered_water$unit == 'MG/L')
filtered_water$unit[convert] <- 'Standard units'

summary(filtered_water)
##    siteName                                      siteType    
##  Length:52929       Stream                           :24831  
##  Class :character   Lake                             :17172  
##  Mode  :character   Spring                           : 7320  
##                     Well                             : 1607  
##                     Soil                             : 1591  
##                     Non-spatial or Protected Location:  218  
##                     (Other)                          :  190  
##    sampleTime                                 parameterType  
##  Min.   :1986-05-01 13:35:00   Alkalinity/Hardness/pH:26190  
##  1st Qu.:1997-07-11 11:00:00   Conventionals         :26739  
##  Median :2002-07-23 15:58:00                                 
##  Mean   :2003-10-06 05:28:42                                 
##  3rd Qu.:2010-05-20 10:40:00                                 
##  Max.   :2017-09-12 15:44:00                                 
##                                                              
##   parameter             result                       unit      
##  Length:52929       Min.   :      0.1   Deg. Celsius   :26690  
##  Class :character   1st Qu.:      7.7   Deg. Fahrenheit:   49  
##  Mode  :character   Median :      9.8   Feet           :    0  
##                     Mean   :     45.1   MG/L           :    0  
##                     3rd Qu.:     21.1   Standard units :26190  
##                     Max.   :1112340.0                          
##                     NA's   :1
# Looks like the result has some unusually high value. Lets see those observations

subset(filtered_water,result > 1000)
## # A tibble: 74 × 7
##            siteName siteType sampleTime parameterType         parameter
##               <chr>   <fctr>     <dttm>        <fctr>             <chr>
## 1  JTMEC Well B-10C     Well 2013-12-19 Conventionals WATER TEMPERATURE
## 2  JTMEC Well B-10C     Well 2013-09-10 Conventionals WATER TEMPERATURE
## 3  JTMEC Well B-10C     Well 2013-07-23 Conventionals WATER TEMPERATURE
## 4  JTMEC Well B-10C     Well 2013-06-25 Conventionals WATER TEMPERATURE
## 5  JTMEC Well B-10C     Well 2013-05-21 Conventionals WATER TEMPERATURE
## 6  JTMEC Well B-10C     Well 2013-04-16 Conventionals WATER TEMPERATURE
## 7  JTMEC Well B-10C     Well 2013-03-19 Conventionals WATER TEMPERATURE
## 8  JTMEC Well B-10C     Well 2013-02-26 Conventionals WATER TEMPERATURE
## 9  JTMEC Well B-10C     Well 2013-02-19 Conventionals WATER TEMPERATURE
## 10 JTMEC Well B-10C     Well 2013-02-12 Conventionals WATER TEMPERATURE
## # ... with 64 more rows, and 2 more variables: result <dbl>, unit <fctr>
# There are 74 observations with water temperature higher than 1000.
#That is clearly as mistake. I don't have a way to assume the correct 
# values. Also there is 1 observation with result = NA.  so I am removing those observations

remove <- which(filtered_water$result > 1000 | is.na(filtered_water$result))
filtered_water <- filtered_water[-remove,]

summary(filtered_water)
##    siteName                                      siteType    
##  Length:52854       Stream                           :24830  
##  Class :character   Lake                             :17172  
##  Mode  :character   Spring                           : 7320  
##                     Soil                             : 1591  
##                     Well                             : 1533  
##                     Non-spatial or Protected Location:  218  
##                     (Other)                          :  190  
##    sampleTime                                 parameterType  
##  Min.   :1986-05-01 13:35:00   Alkalinity/Hardness/pH:26189  
##  1st Qu.:1997-07-10 15:03:45   Conventionals         :26665  
##  Median :2002-06-20 12:10:00                                 
##  Mean   :2003-10-01 17:19:45                                 
##  3rd Qu.:2010-05-17 14:20:00                                 
##  Max.   :2017-09-12 15:44:00                                 
##                                                              
##   parameter             result                    unit      
##  Length:52854       Min.   : 0.114   Deg. Celsius   :26616  
##  Class :character   1st Qu.: 7.700   Deg. Fahrenheit:   49  
##  Mode  :character   Median : 9.575   Feet           :    0  
##                     Mean   :14.266   MG/L           :    0  
##                     3rd Qu.:21.100   Standard units :26189  
##                     Max.   :78.920                          
## 
# The unit is a factor. Eventhough there is no data against the values 'MG/L' and 'Feet' the summary still shows those levels
# Following command removes those levels

filtered_water$unit <- droplevels(filtered_water$unit)

summary(filtered_water)
##    siteName                                      siteType    
##  Length:52854       Stream                           :24830  
##  Class :character   Lake                             :17172  
##  Mode  :character   Spring                           : 7320  
##                     Soil                             : 1591  
##                     Well                             : 1533  
##                     Non-spatial or Protected Location:  218  
##                     (Other)                          :  190  
##    sampleTime                                 parameterType  
##  Min.   :1986-05-01 13:35:00   Alkalinity/Hardness/pH:26189  
##  1st Qu.:1997-07-10 15:03:45   Conventionals         :26665  
##  Median :2002-06-20 12:10:00                                 
##  Mean   :2003-10-01 17:19:45                                 
##  3rd Qu.:2010-05-17 14:20:00                                 
##  Max.   :2017-09-12 15:44:00                                 
##                                                              
##   parameter             result                    unit      
##  Length:52854       Min.   : 0.114   Deg. Celsius   :26616  
##  Class :character   1st Qu.: 7.700   Deg. Fahrenheit:   49  
##  Mode  :character   Median : 9.575   Standard units :26189  
##                     Mean   :14.266                          
##                     3rd Qu.:21.100                          
##                     Max.   :78.920                          
## 

Boxplot

ggplot(data=filtered_water, mapping = aes(x=unit, y=result))+
  geom_boxplot()

There are 2 observerations as outliers and those are above 60. Most probabilty the units for those observation should be ‘Deg. Fahrenheit’. So lets convert those into units into Deg. Fahrenheit

convert<- which(filtered_water$result>60 & filtered_water$unit == 'Deg. Celsius')
filtered_water$unit[convert] <- 'Deg. Fahrenheit'

#Boxplot again

ggplot(data=filtered_water, mapping = aes(x=unit, y= result))+
  geom_boxplot()

Convert Fahrenheit to Celcius

** Convert the observations in such a way that, unit of water temperature will be thesame for all the observations**

fahrenheit <- which(filtered_water$unit=='Deg. Fahrenheit')
filtered_water$result[fahrenheit] <- (filtered_water$result[fahrenheit] -32) *(5.0/9.0)
filtered_water$unit[fahrenheit] <- 'Deg. Celsius'

Boxplot after converting Fahrenheit into Celcius

ggplot(data=filtered_water, mapping = aes(x=unit, y= result))+
  geom_boxplot()

Now the data is clean and ready for analysis

Data Set 3 : NYPD Motor Vehicle Collisions.

Details of Motor Vehicle Collisions in New York City provided by the Police Department (NYPD)

The URL for the site is https://data.cityofnewyork.us/Public-Safety/NYPD-Motor-Vehicle-Collisions/h9gi-nx95 and the link for downloading the CSV file is https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD

As we see in the site, this file has about 1.13 Million of observations with 29 variables. The data has been updated on October 4th 2017.

Read the data from the URL and reduce the size by applying filters

url <- 'https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD'

mvc <- read.csv(url)

remove <- which(is.na(mvc$ZIP.CODE) == TRUE)
mvc <- mvc[-remove,]

remove <- which(is.na(mvc$LATITUDE) == TRUE)
mvc <- mvc[-remove,]

filterout <- which((mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Driver Inattention/Distraction'|
                 mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Failure to Yield Right-of-Way'|
                 mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Backing Unsafely'|
                 mvc$CONTRIBUTING.FACTOR.VEHICLE.1 == 'Fatigued/Drowsy'))
mvc <- mvc[filterout,]

#longitude =0.0 seems to be a mistake. Lets see the row

remove <- which(mvc$longitude == 0)
mvc <- mvc[-remove,]

remove <- which(mvc$VehType1=="OTHER")
mvc <- mvc[-remove,]

filterout <- which (mvc$VehType1 == "PASSENGER VEHICLE" |
                     mvc$VehType1 == "SPORT UTILITY / STATION WAGON" |
                     mvc$VehType1 == "TAXI" |
                     mvc$VehType1 == "VAN" |
                     mvc$VehType1 == "PICK-UP TRUCK")
mvc <- mvc[filterout,]


glimpse(mvc)
## Observations: 0
## Variables: 29
## $ DATE                          <fctr> 
## $ TIME                          <fctr> 
## $ BOROUGH                       <fctr> 
## $ ZIP.CODE                      <int> 
## $ LATITUDE                      <dbl> 
## $ LONGITUDE                     <dbl> 
## $ LOCATION                      <fctr> 
## $ ON.STREET.NAME                <fctr> 
## $ CROSS.STREET.NAME             <fctr> 
## $ OFF.STREET.NAME               <fctr> 
## $ NUMBER.OF.PERSONS.INJURED     <int> 
## $ NUMBER.OF.PERSONS.KILLED      <int> 
## $ NUMBER.OF.PEDESTRIANS.INJURED <int> 
## $ NUMBER.OF.PEDESTRIANS.KILLED  <int> 
## $ NUMBER.OF.CYCLIST.INJURED     <int> 
## $ NUMBER.OF.CYCLIST.KILLED      <int> 
## $ NUMBER.OF.MOTORIST.INJURED    <int> 
## $ NUMBER.OF.MOTORIST.KILLED     <int> 
## $ CONTRIBUTING.FACTOR.VEHICLE.1 <fctr> 
## $ CONTRIBUTING.FACTOR.VEHICLE.2 <fctr> 
## $ CONTRIBUTING.FACTOR.VEHICLE.3 <fctr> 
## $ CONTRIBUTING.FACTOR.VEHICLE.4 <fctr> 
## $ CONTRIBUTING.FACTOR.VEHICLE.5 <fctr> 
## $ UNIQUE.KEY                    <int> 
## $ VEHICLE.TYPE.CODE.1           <fctr> 
## $ VEHICLE.TYPE.CODE.2           <fctr> 
## $ VEHICLE.TYPE.CODE.3           <fctr> 
## $ VEHICLE.TYPE.CODE.4           <fctr> 
## $ VEHICLE.TYPE.CODE.5           <fctr>

Select 14 columns out of 29 and Tidy up the data

mvc <- tibble("date" = mvc$DATE,
              "borough"=mvc$BOROUGH,
              "lattitude"=mvc$LATITUDE,
              "longitude"=mvc$LONGITUDE,
              "NoOfInjured"=mvc$NUMBER.OF.PERSONS.INJURED,
              "NoOfKilled" = mvc$NUMBER.OF.PERSONS.KILLED,
              "NoOfPedInjured"= mvc$NUMBER.OF.PEDESTRIANS.INJURED,
              "NoOfPedKilled" = mvc$NUMBER.OF.PEDESTRIANS.KILLED,
              "NoOfCycInjured" = mvc$NUMBER.OF.CYCLIST.INJURED,
              "NoOfCycKilled"= mvc$NUMBER.OF.CYCLIST.KILLED,
              "NoOfMotInjured"=mvc$NUMBER.OF.MOTORIST.INJURED,
              "NoOfMotKilled"=mvc$NUMBER.OF.MOTORIST.KILLED,
              "ConFacVeh1"=mvc$CONTRIBUTING.FACTOR.VEHICLE.1,
              "VehType1"=mvc$VEHICLE.TYPE.CODE.1)

summary(mvc)
##          date            borough    lattitude     longitude  
##  01/01/2013:0                :0   Min.   : NA   Min.   : NA  
##  01/01/2014:0   BRONX        :0   1st Qu.: NA   1st Qu.: NA  
##  01/01/2015:0   BROOKLYN     :0   Median : NA   Median : NA  
##  01/01/2016:0   MANHATTAN    :0   Mean   :NaN   Mean   :NaN  
##  01/01/2017:0   QUEENS       :0   3rd Qu.: NA   3rd Qu.: NA  
##  01/02/2013:0   STATEN ISLAND:0   Max.   : NA   Max.   : NA  
##  (Other)   :0                                                
##   NoOfInjured    NoOfKilled  NoOfPedInjured NoOfPedKilled NoOfCycInjured
##  Min.   : NA   Min.   : NA   Min.   : NA    Min.   : NA   Min.   : NA   
##  1st Qu.: NA   1st Qu.: NA   1st Qu.: NA    1st Qu.: NA   1st Qu.: NA   
##  Median : NA   Median : NA   Median : NA    Median : NA   Median : NA   
##  Mean   :NaN   Mean   :NaN   Mean   :NaN    Mean   :NaN   Mean   :NaN   
##  3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA    3rd Qu.: NA   3rd Qu.: NA   
##  Max.   : NA   Max.   : NA   Max.   : NA    Max.   : NA   Max.   : NA   
##                                                                         
##  NoOfCycKilled NoOfMotInjured NoOfMotKilled
##  Min.   : NA   Min.   : NA    Min.   : NA  
##  1st Qu.: NA   1st Qu.: NA    1st Qu.: NA  
##  Median : NA   Median : NA    Median : NA  
##  Mean   :NaN   Mean   :NaN    Mean   :NaN  
##  3rd Qu.: NA   3rd Qu.: NA    3rd Qu.: NA  
##  Max.   : NA   Max.   : NA    Max.   : NA  
##                                            
##                         ConFacVeh1                           VehType1
##                              :0                                  :0  
##  Accelerator Defective       :0    AMBULANCE                     :0  
##  Aggressive Driving/Road Rage:0    BICYCLE                       :0  
##  Alcohol Involvement         :0    BUS                           :0  
##  Animals Action              :0    FIRE TRUCK                    :0  
##  Backing Unsafely            :0    LARGE COM VEH(6 OR MORE TIRES):0  
##  (Other)                     :0    (Other)                       :0
mvc$date <- mdy(mvc$date) # Converting data type of date column from factor to date
## Warning: All formats failed to parse. No formats found.
#Following command droplevels drop the empty levels in the factor variables
mvc$ConFacVeh1 <-  droplevels(mvc$ConFacVeh1)
mvc$VehType1 <-  droplevels(mvc$VehType1)
mvc$borough <-  droplevels(mvc$borough)
mvc_old<-mvc

glimpse(mvc)
## Observations: 0
## Variables: 14
## $ date           <date> 
## $ borough        <fctr> 
## $ lattitude      <dbl> 
## $ longitude      <dbl> 
## $ NoOfInjured    <int> 
## $ NoOfKilled     <int> 
## $ NoOfPedInjured <int> 
## $ NoOfPedKilled  <int> 
## $ NoOfCycInjured <int> 
## $ NoOfCycKilled  <int> 
## $ NoOfMotInjured <int> 
## $ NoOfMotKilled  <int> 
## $ ConFacVeh1     <fctr> 
## $ VehType1       <fctr>

Now the data is Tidy enough for further analysis

No. of accidents by borough

mvc_acc_borough <- mvc%>%
                   select (2,5,6,7,8,9,10,11,12) %>%
                   group_by (borough)  %>% summarise(totInjured = sum(NoOfInjured),totKilled=sum(NoOfKilled), totPedInjured = sum(NoOfPedKilled), totCycInjured = sum(NoOfCycInjured), totCycKilled = sum(NoOfCycKilled), totMotInjured = sum(NoOfMotInjured), totMotKilled = sum(NoOfMotKilled))

print(mvc_acc_borough)
## # A tibble: 0 × 8
## # ... with 8 variables: borough <fctr>, totInjured <int>, totKilled <int>,
## #   totPedInjured <int>, totCycInjured <int>, totCycKilled <int>,
## #   totMotInjured <int>, totMotKilled <int>

No. of accidents by Vehicle Type

mvc_acc_VehType <- mvc%>%
                   select (5,6,7,8,9,10,11,12,14) %>%
                   group_by (VehType1)  %>% summarise(totInjured = sum(NoOfInjured),totKilled=sum(NoOfKilled), totPedInjured = sum(NoOfPedKilled), totCycInjured = sum(NoOfCycInjured), totCycKilled = sum(NoOfCycKilled), totMotInjured = sum(NoOfMotInjured), totMotKilled = sum(NoOfMotKilled))

print(mvc_acc_VehType)
## # A tibble: 0 × 8
## # ... with 8 variables: VehType1 <fctr>, totInjured <int>,
## #   totKilled <int>, totPedInjured <int>, totCycInjured <int>,
## #   totCycKilled <int>, totMotInjured <int>, totMotKilled <int>

No. of accidents by reason

mvc_acc_Reason <- mvc%>%
                   select (2,5,6,7,8,9,10,11,12,13) %>%
                   group_by (borough,ConFacVeh1)  %>% summarise(totInjured = sum(NoOfInjured),totKilled=sum(NoOfKilled), totPedInjured = sum(NoOfPedKilled), totCycInjured = sum(NoOfCycInjured), totCycKilled = sum(NoOfCycKilled), totMotInjured = sum(NoOfMotInjured), totMotKilled = sum(NoOfMotKilled))

print(mvc_acc_Reason)
## Source: local data frame [0 x 9]
## Groups: borough [?]
## 
## # ... with 9 variables: borough <fctr>, ConFacVeh1 <fctr>,
## #   totInjured <int>, totKilled <int>, totPedInjured <int>,
## #   totCycInjured <int>, totCycKilled <int>, totMotInjured <int>,
## #   totMotKilled <int>
ggplot(data=mvc_acc_Reason, aes(x=ConFacVeh1, y = totInjured, fill = borough)) +
  geom_bar(stat="identity", position=position_dodge()) +
  geom_text(aes(label=totInjured), vjust=.9, hjust=1,position= position_dodge(width=0.9)) +
  geom_point(mapping = aes(color = borough  ), method = 'loess') + 
  xlab("Contributing Factor") + ylab("Total Number Of Injured") +
  coord_flip()