library('dplyr')##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('tidyr')
library('ggplot2')## Warning: package 'ggplot2' was built under R version 3.3.3
Dataset #1 looks at 1,000 car collisions in NYC from 9/30/2017 to 10/1/2017. The sample data has numerous blank values and needs some work before analyzing. I will look at which borough has the most collisions and the injuries per collision.
collision <- read.csv("Collision.csv",header=TRUE, sep=",", stringsAsFactors=FALSE)
head(collision,2)## X..computed_region_92fq_4b7q X..computed_region_efsh_h5xi
## 1 47 20529
## 2 37 17215
## X..computed_region_f5dn_yrer X..computed_region_sbqj_enih
## 1 51 59
## 2 45 47
## X..computed_region_yeji_bk3q borough contributing_factor_vehicle_1
## 1 3 QUEENS Driver Inattention/Distraction
## 2 2 View Obstructed/Limited
## contributing_factor_vehicle_2 contributing_factor_vehicle_3
## 1 Unspecified
## 2 Unspecified
## contributing_factor_vehicle_4 contributing_factor_vehicle_5
## 1
## 2
## cross_street_name date
## 1 51-45 ALMEDA AVENUE 2017-10-01T00:00:00.000
## 2 2017-10-01T00:00:00.000
## latitude location location_address location_city
## 1 40.59782 POINT (-73.78255 40.597824) NA NA
## 2 40.67884 POINT (-73.86609 40.678844) NA NA
## location_state location_zip longitude number_of_cyclist_injured
## 1 NA NA -73.78255 0
## 2 NA NA -73.86609 0
## number_of_cyclist_killed number_of_motorist_injured
## 1 0 0
## 2 0 0
## number_of_motorist_killed number_of_pedestrians_injured
## 1 0 0
## 2 0 0
## number_of_pedestrians_killed number_of_persons_injured
## 1 0 0
## 2 0 0
## number_of_persons_killed off_street_name
## 1 0
## 2 0 GRANT AVENUE
## on_street_name time unique_key vehicle_type_code1
## 1 13:00 3761053 PASSENGER VEHICLE
## 2 LIBERTY AVENUE 19:55 3762078 PASSENGER VEHICLE
## vehicle_type_code2 vehicle_type_code_3 vehicle_type_code_4
## 1
## 2
## vehicle_type_code_5 zip_code
## 1 11691
## 2 NA
Tidy up the data and look at some metrics by borough. Also make a new column to see the injuries per collision.
boro_group <- collision %>%
group_by(borough)
boro_group <- summarise(
boro_group,
collisions = n(),
injuries = sum(number_of_persons_injured),
mean_injured = mean(number_of_persons_injured),
max_injured = max(number_of_persons_injured)
) %>%
mutate(injury_per = round(injuries/collisions, 3))
boro_group$borough[1] <- "N/A"
boro_group ## # A tibble: 6 Ă— 6
## borough collisions injuries mean_injured max_injured injury_per
## <chr> <int> <int> <dbl> <int> <dbl>
## 1 N/A 394 135 0.3426396 6 0.343
## 2 BRONX 83 24 0.2891566 4 0.289
## 3 BROOKLYN 207 44 0.2125604 3 0.213
## 4 MANHATTAN 119 29 0.2436975 3 0.244
## 5 QUEENS 177 47 0.2655367 5 0.266
## 6 STATEN ISLAND 20 1 0.0500000 1 0.050
We see that Brooklyn has the most collisions and Staten Island has the least. For the number of injuries per collision, the Bronx has the most.
Dataset #2 looks at the population of catafish from 1992 to 2016.The original CSv was in a wide format with each year as it’s own column. I will tidy up the data and look at how the population of catfish has changed yearly.
catfish <- read.csv("CatfishFarm.csv",header=TRUE, sep=",",stringsAsFactors=FALSE,check.names = FALSE)
catfish## Size category 1992 1993 1994 1995 1996 1997 1998 1999
## 1 Broodfish 1491 1169 1183 1301 1171 1163 1187 1155
## 2 Fingerling/fry 849412 669491 648628 724693 823397 873457 975542 986368
## 3 Stockers 634353 571254 548207 554342 627834 754816 607878 678682
## 4 Small foodsize 166731 153600 134314 138160 156297 178448 178511 182251
## 5 Medium foodsize 70495 61894 48851 59159 64858 84725 62140 63049
## 6 Large foodsize 6769 6698 5196 4536 6644 7810 7295 9266
## 7 NA NA NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA NA
## 2000 2001 2002 2003 2004 2005 2006 2007 2008
## 1 1377 1327 1171 1303 1113 1053 1091 886 801
## 2 1053300 1023533 1066400 990163 745849 712144 1045266 985620 951910
## 3 790683 845287 676378 775226 890275 660000 781958 586320 688844
## 4 200032 239655 287591 254920 261323 243090 214848 210340 204750
## 5 77149 87926 106117 127908 109120 95240 103591 104080 107800
## 6 5812 6872 10746 11195 10947 10642 10823 8986 9290
## 7 NA NA NA NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA NA NA
## 2009 2010 2011 2012 2013 2014 2015 2016
## 1 704 536 495 562 540 650 577 520
## 2 728340 429590 568990 451100 398510 420060 449510 328570
## 3 586069 366090 380660 463485 339260 289080 248790 204800
## 4 193870 169030 115560 112970 103520 102190 96810 100850
## 5 105610 91790 54130 64740 58015 50600 48220 45775
## 6 9316 8570 6212 3595 5155 4500 5090 3520
## 7 NA NA NA NA NA NA NA NA
## 8 NA NA NA NA NA NA NA NA
Put the data in a long format and make a new column called NumFish which shows the number of fish by size and year. Remove any rows where the NumFish is NA or where the Size is blank.
fishdata <- catfish %>% gather(Years,NumFish, 2:26)
fishdata <- fishdata[!(fishdata$`Size category`=="" | is.na(fishdata$NumFish)), ]
head(fishdata,2)## Size category Years NumFish
## 1 Broodfish 1992 1491
## 2 Fingerling/fry 1992 849412
Looking at the chart we see that the catfish population has been decreasing since 2009.
fish_year <- fishdata %>%
group_by(Years) %>%
summarise(Pop = sum(NumFish))
ggplot(data=fish_year, aes(x=Years, y=Pop)) +
geom_bar(stat="identity") +
labs(x="Year",y="Catfish Pop") Dataset #3 looks at the amount of funding public tranist agencies recieved from 1991 to 2015. The original CSV was in a wide format with each year as it’s own column. I will format the data to make it ling and see how funding has changed YoY and by state.
transit <- read.csv("TransitFunding.csv",header=TRUE, sep=",",stringsAsFactors=FALSE,check.names = FALSE)
head(transit,2)## Last Report Year 5 Digit NTDID 4 Digit NTDID
## 1 2015 1 0001
## 2 2015 2 0002
## Reporter Name
## 1 King County Department of Transportation - Metro Transit Division(King County Metro)
## 2 Spokane Transit Authority(STA)
## Reporter Status Reporter Type City State Census Year Primary UZA Name
## 1 Active Full Reporter Seattle WA 2010 Seattle, WA
## 2 Active Full Reporter Spokane WA 2010 Spokane, WA
## UZA UZA Area SQ Miles UZA Population 2015 Status 1991 1992
## 1 14 1010 3059393 Existing 2015 383107583 329694739
## 2 96 164 387847 Existing 2015 27042832 27976993
## 1993 1994 1995 1996 1997 1998 1999
## 1 275639731 277424929 289759749 342032852 399974914 367127087 483918410
## 2 29374328 42316768 37303829 32192970 40737394 33591348 34253966
## 2000 2001 2002 2003 2004 2005 2006
## 1 416810718 400536050 493524290 506818105 651927092 509808277 535030832
## 2 35204499 37802003 39682730 45535339 40763480 48952703 56450544
## 2007 2008 2009 2010 2011 2012 2013
## 1 572783766 651636069 668646858 669733012 867772522 817791736 762388692
## 2 64963968 73982401 66417392 67478671 64728901 70077825 65295886
## 2014 2015
## 1 782670566 831702187
## 2 70699557 72960135
I made the data long and added a Funds column. I then grouped the data by State and Year for analysis on funding. The total funds is divided by 1,000,000 to see the amount funded in millions.
transitdata <- transit %>% gather(Years,Funds, 15:39)
head(transitdata)## Last Report Year 5 Digit NTDID 4 Digit NTDID
## 1 2015 1 0001
## 2 2015 2 0002
## 3 2015 3 0003
## 4 2015 5 0005
## 5 2015 6 0006
## 6 2015 7 0007
## Reporter Name
## 1 King County Department of Transportation - Metro Transit Division(King County Metro)
## 2 Spokane Transit Authority(STA)
## 3 Pierce County Transportation Benefit Area Authority(Pierce Transit)
## 4 Everett Transit(ET)
## 5 Yakima Transit(YT)
## 6 Lane Transit District(LTD)
## Reporter Status Reporter Type City State Census Year Primary UZA Name
## 1 Active Full Reporter Seattle WA 2010 Seattle, WA
## 2 Active Full Reporter Spokane WA 2010 Spokane, WA
## 3 Active Full Reporter Tacoma WA 2010 Seattle, WA
## 4 Active Full Reporter Everett WA 2010 Seattle, WA
## 5 Active Full Reporter Yakima WA 2010 Yakima, WA
## 6 Active Full Reporter Eugene OR 2010 Eugene, OR
## UZA UZA Area SQ Miles UZA Population 2015 Status Years Funds
## 1 14 1010 3059393 Existing 2015 1991 383107583
## 2 96 164 387847 Existing 2015 1991 27042832
## 3 14 1010 3059393 Existing 2015 1991 54776294
## 4 14 1010 3059393 Existing 2015 1991 6604377
## 5 248 60 129534 Existing 2015 1991 4226544
## 6 151 87 247421 Existing 2015 1991 17875132
transit_grp <- transitdata %>%
group_by(State,Years) %>%
summarise(TotalFunds = sum(Funds)/1000000)
head(transit_grp,2)## Source: local data frame [2 x 3]
## Groups: State [1]
##
## State Years TotalFunds
## <chr> <chr> <dbl>
## 1 1991 0
## 2 1992 0
Looking at the plot we can see that funding has increased over time. With major states like CA and NY receiving the most amount.
ggplot() + geom_bar(aes(y = TotalFunds, x = Years, fill = State), data = transit_grp,stat="identity")