This isn’t my best work. I found the datasets a bit too structured. Sorry.

NYC Crashes:

Import the data:

library(dplyr)
library(tidyr)
library(readxl)
library(ggplot2)
library(ggthemes)

nycDF <-  read_excel("/Users/kailukowiak/OneDrive - CUNY School of Professional Studies/DATA 607 Repository/Project 2/cityacc-en-us.xlsx")
head(nycDF)

## # A tibble: 6 x 14
##   `Motor Vehicle Collision Report Statistics Citywide`         X__1
##                                                  <chr>        <chr>
## 1                                           August2017         <NA>
## 2                                           Collisions         <NA>
## 3                                              GeoCode GeoCodeLabel
## 4                                                    C     CITYWIDE
## 5                                                    M    MANHATTAN
## 6                                                  001 1st Precinct
## # ... with 12 more variables: X__2 <chr>, X__3 <chr>, X__4 <chr>,
## #   X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>, X__9 <chr>,
## #   X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>

tail(nycDF)

## # A tibble: 6 x 14
##                          `Motor Vehicle Collision Report Statistics Citywide`
##                                                                         <chr>
## 1 The category of “Motorist” includes the owner of a parked vehicle. Contribu
## 2                          All figures are preliminary and subject to change.
## 3 A collision at an intersection located on a precinct border may appear in t
## 4 but will appear in only one precinct list. Collision location is captured a
## 5 A collision on a highway, bridge or tunnel which occurs at or near a precin
## 6 that shares that location, but will appear in only one precinct list. Colli
## # ... with 13 more variables: X__1 <chr>, X__2 <chr>, X__3 <chr>,
## #   X__4 <chr>, X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>,
## #   X__9 <chr>, X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>

nycDF <- nycDF %>% filter(!is.na(X__1))
head(nycDF)

## # A tibble: 6 x 14
##   `Motor Vehicle Collision Report Statistics Citywide`         X__1
##                                                  <chr>        <chr>
## 1                                              GeoCode GeoCodeLabel
## 2                                                    C     CITYWIDE
## 3                                                    M    MANHATTAN
## 4                                                  001 1st Precinct
## 5                                                  005 5th Precinct
## 6                                                  006 6th Precinct
## # ... with 12 more variables: X__2 <chr>, X__3 <chr>, X__4 <chr>,
## #   X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>, X__9 <chr>,
## #   X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>

tail(nycDF)

## # A tibble: 6 x 14
##   `Motor Vehicle Collision Report Statistics Citywide`           X__1
##                                                  <chr>          <chr>
## 1                                                  115 115th Precinct
## 2                                                    S  STATEN ISLAND
## 3                                                  120 120th Precinct
## 4                                                  121 121st Precinct
## 5                                                  122 122nd Precinct
## 6                                                  123 123rd Precinct
## # ... with 12 more variables: X__2 <chr>, X__3 <chr>, X__4 <chr>,
## #   X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>, X__9 <chr>,
## #   X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>

Now we need to add row names.

colnames(nycDF) <- nycDF[1,]
nycDF <- nycDF[-1,]
head(nycDF)

## # A tibble: 6 x 14
##   GeoCode GeoCodeLabel Number_of_Motor_Vehicle_Collisions
##     <chr>        <chr>                              <chr>
## 1       C     CITYWIDE                              18727
## 2       M    MANHATTAN                               3916
## 3     001 1st Precinct                                288
## 4     005 5th Precinct                                203
## 5     006 6th Precinct                                130
## 6     007 7th Precinct                                102
## # ... with 11 more variables: Vehicles_or_Motorists_Involved <chr>,
## #   Injury_or_Fatal_Collisions <chr>, MotoristsInjured <chr>,
## #   MotoristsKilled <chr>, PassengInjured <chr>, PassengKilled <chr>,
## #   CyclistsInjured <chr>, CyclistsKilled <chr>, PedestrInjured <chr>,
## #   PedestrKilled <chr>, Bicycle <chr>

Everthing is reletively pretty now, but we have citywide, borough, and precicnt data. This isn’t very tidy.

Seperate borough and precent data.

borDF <- nycDF %>% 
  filter( !grepl("\\d", GeoCode))
preDF <- nycDF %>% 
  filter( grepl("\\d", GeoCode))

Create new column.

nycDF <- nycDF %>% mutate(Borough = GeoCode)

Remove numerics from Borough

nycDF$Borough <- sub("\\d\\d\\d", NA , nycDF$Borough)
nycDF <- nycDF %>% fill(Borough, .direction = "down")# Add factor variables. 
precDF <- nycDF %>% 
  filter( grepl("\\d", GeoCode))
precDF %>%
  head()

## # A tibble: 6 x 15
##   GeoCode  GeoCodeLabel Number_of_Motor_Vehicle_Collisions
##     <chr>         <chr>                              <chr>
## 1     001  1st Precinct                                288
## 2     005  5th Precinct                                203
## 3     006  6th Precinct                                130
## 4     007  7th Precinct                                102
## 5     009  9th Precinct                                126
## 6     010 10th Precinct                                246
## # ... with 12 more variables: Vehicles_or_Motorists_Involved <chr>,
## #   Injury_or_Fatal_Collisions <chr>, MotoristsInjured <chr>,
## #   MotoristsKilled <chr>, PassengInjured <chr>, PassengKilled <chr>,
## #   CyclistsInjured <chr>, CyclistsKilled <chr>, PedestrInjured <chr>,
## #   PedestrKilled <chr>, Bicycle <chr>, Borough <chr>

precDF$Borough <-  precDF$Borough %>% recode(M = "Manhatten",
         B = "Bronx",
         K = "Brooklyn",
         Q = "Queens",
         S = "StattenIsland") # Adds names.
precDF$Borough <- as.factor(precDF$Borough)
cols <- colnames(precDF)
precDF[, cols[-c(2,15)]] <-  as.numeric(unlist(precDF[, cols[-c(2,15)]])) # Convert to numeric

Analysis of NYC Crashes

plt <-  precDF %>% 
  group_by(Borough) %>% 
  summarise(numAccidents = sum(Number_of_Motor_Vehicle_Collisions))

ggplot(plt, aes(x = Borough, y = numAccidents))+
  geom_bar(stat = 'identity') + 
  xlab("Borough") +
  ylab("Number of Accidents") +
  ggtitle("Number of Accidents by Borough") +
  theme_fivethirtyeight()

Special thanks to Rob Barry for nyc precincts.

# Grab the shapefile and extract it
# You may want to add some file paths here
download.file(
  # I saved a version on my server so this script should work
  # even if the NYC planning department changes things around
  "http://www.rob-barry.com/assets/data/mapping/nypp_15b.zip",
  destfile = "nypp_15b.zip"
)
unzip(zipfile = "nypp_15b.zip")

# Now, load package to read the shapefile
library("rgdal")

# Read it into an sp object:
nypp <- readOGR("nypp_15b", "nypp")

## OGR data source with driver: ESRI Shapefile 
## Source: "nypp_15b", layer: "nypp"
## with 77 features
## It has 3 fields

library("RColorBrewer")
library("classInt")

# Plot it
pal <- brewer.pal(5, "YlOrRd")
fill.clr <-
  findColours(
    classIntervals(as.numeric(precDF$Number_of_Motor_Vehicle_Collisions), style = "pretty", n = 5),
    pal
  )
plot(nypp, col = fill.clr)
legend(
  "topleft",
  fill=attr(fill.clr, "palette"),
  legend=names(attr(fill.clr, "table")),
  bty = "n"
)
title("Accidents by Precinct")

# Plot it
pal <- brewer.pal(5, "YlOrRd")
fill.clr <-
  findColours(
    classIntervals(as.numeric(precDF$Injury_or_Fatal_Collisions), style = "pretty", n = 5),
    pal
  )
plot(nypp, col = fill.clr)
legend(
  "topleft",
  fill=attr(fill.clr, "palette"),
  legend=names(attr(fill.clr, "table")),
  bty = "n"
)
title("Total Number of Casualties")

Hospital Data

Importing the data:

hosDF <- read.csv("/Users/kailukowiak/OneDrive - CUNY School of Professional Studies/DATA 607 Repository/Project 2/Hospital_Revised_Flatfiles/Complications and Deaths - Hospital.csv")

There are a lot of NA values. First we need to change them to NA

hosDF[ hosDF == "Not Available" ] <- NA

We can remove NA’s but this gives us an incomplete dataset. It might be better to remove the entire hospital if it is missing one value. We will deal with this later.

Right now we need to find the length of time that the hospitals studied

hosDF <- hosDF %>% mutate(timeDiff = difftime(as.POSIXct(as.Date(hosDF$Measure.End.Date,
                                               format = '%m/%d/%Y')), 
         as.POSIXct(as.Date(hosDF$Measure.Start.Date, 
                                               format = '%m/%d/%Y')), 
         units = 'days'))

hosDF$Score <-  as.numeric(hosDF$Score)
plt <- hosDF %>% 
  group_by(Measure.Name) %>% 
  summarise(facSum = sum(Score, na.rm = T))
 
ggplot(plt, aes(Measure.Name, facSum))+
  geom_bar(stat = 'identity') +
  theme(axis.text.x=element_text(angle=90, hjust=1))+ coord_flip()+
  ggtitle('Deaths by Cause')

#qplot(x = as.factor(hosDF$Measure.Name), y = hosDF$Score, geom = "boxplot")

Create dummy variables for Measure ID.

hosDF %>% 
  mutate(yesno = 1) %>% 
  distinct() %>% 
  spread(Measure.ID, yesno, fill = 0) %>% 
  glimpse()

## Observations: 81,804
## Variables: 35
## $ Provider.ID               <int> 10001, 10001, 10001, 10001, 10001, 1...
## $ Hospital.Name             <fctr> SOUTHEAST ALABAMA MEDICAL CENTER, S...
## $ Address                   <fctr> 1108 ROSS CLARK CIRCLE, 1108 ROSS C...
## $ City                      <fctr> DOTHAN, DOTHAN, DOTHAN, DOTHAN, DOT...
## $ State                     <fctr> AL, AL, AL, AL, AL, AL, AL, AL, AL,...
## $ ZIP.Code                  <int> 36301, 36301, 36301, 36301, 36301, 3...
## $ County.Name               <fctr> HOUSTON, HOUSTON, HOUSTON, HOUSTON,...
## $ Phone.Number              <dbl> 3347938701, 3347938701, 3347938701, ...
## $ Measure.Name              <fctr> Rate of complications for hip/knee ...
## $ Compared.to.National      <fctr> Worse than the National Rate, No Di...
## $ Denominator               <fctr> 337, 744, 280, 522, 858, 597, 566, ...
## $ Score                     <dbl> 2565, 1085, 2488, 3162, 442, 1806, 2...
## $ Lower.Estimate            <fctr> 2.9, 11.4, 2.1, 7.6, 9.5, 13.0, 13....
## $ Higher.Estimate           <fctr> 6.0, 15.8, 5.8, 12.3, 13.6, 18.3, 1...
## $ Footnote                  <fctr> , , , , , , , , , , , , , , , , , ,...
## $ Measure.Start.Date        <fctr> 04/01/2013, 07/01/2013, 07/01/2013,...
## $ Measure.End.Date          <fctr> 03/31/2016, 06/30/2016, 06/30/2016,...
## $ timeDiff                  <time> 1095 days, 1095 days, 1095 days, 10...
## $ COMP_HIP_KNEE             <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_AMI               <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_CABG              <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_COPD              <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_HF                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_PN                <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_STK               <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ PSI_12_POSTOP_PULMEMB_DVT <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ PSI_13_POST_SEPSIS        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ PSI_14_POSTOP_DEHIS       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ PSI_15_ACC_LAC            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ PSI_3_ULCER               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ PSI_4_SURG_COMP           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_6_IAT_PTX             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_7_CVCBI               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_8_POST_HIP            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_90_SAFETY             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

Chronic Disease

library(readr)
U_S_Chronic_Disease_Indicators_CDI_ <- read_csv("~/OneDrive - CUNY School of Professional Studies/DATA 607 Repository/Project 2/U.S._Chronic_Disease_Indicators__CDI_.csv")

chronDF <- U_S_Chronic_Disease_Indicators_CDI_

chronDF <- chronDF %>% select_if(~sum(!is.na(.)) > 0) # Removes all rows that only have NA values
chronDF$DataValueFootnoteSymbol <- NULL
chronDF$DatavalueFootnote <- NULL
chronDF$DataValueUnit <- NULL # Removing not very useful data
chronDF <- filter(chronDF, LocationDesc != c("Guam", "District of Columbia","Puerto Rico",
                                             "United States","Virgin Islands"))

## Warning in LocationDesc != c("Guam", "District of Columbia", "Puerto
## Rico", : longer object length is not a multiple of shorter object length

library(fiftystater)
library(mapproj)

## Loading required package: maps

alcohol <- filter(chronDF, Question == "Alcohol use among youth" & 
                    YearStart == 2015 & StratificationCategory1 == "Overall")
alcohol$LocationDesc <- tolower(alcohol$LocationDesc) # Needs to be lowercase to match the map.
ggplot(alcohol, aes(map_id = LocationDesc)) +
  geom_map(aes(fill = DataValue), map = fifty_states) +
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() +
  scale_x_continuous(breaks = NULL) + 
  scale_y_continuous(breaks = NULL) +
  labs(x = "", y = "") +
  theme(legend.position = "bottom", 
        panel.background = element_blank())+
  ggtitle("Alcohol use among youth")

Here we see the Prevelance of underaged drinking. The data is fairly sparse so it is difficult to draw exact conclusions.

alcoholFem <- filter(chronDF, Question == "Heavy drinking among women aged 18-44 years" & 
                       YearStart == 2015 & StratificationCategory1 == "Overall")
alcoholFem$LocationDesc <- tolower(alcoholFem$LocationDesc) # Needs to be lowercase to match the map.
ggplot(alcoholFem, aes(map_id = LocationDesc)) +
  geom_map(aes(fill = DataValue), map = fifty_states) +
  expand_limits(x = fifty_states$long, y = fifty_states$lat) +
  coord_map() +
  scale_x_continuous(breaks = NULL) + 
  scale_y_continuous(breaks = NULL) +
  labs(x = "", y = "") +
  theme(legend.position = "bottom", 
        panel.background = element_blank()) +
  ggtitle("Heavy drinking among women aged 18-44 years")

Nevada doesn’t make a lot of sense being so low, but maybe my sterotypes of Americans need updating. It’s interesting that there seems to be little correlation between youth alcohol consumption and heavy drinking in adulthood.

DATA 607 Project 2

Kai Lukowiak

2017-10-06

NYC Crashes:

Analysis of NYC Crashes

Hospital Data

Chronic Disease