This isn’t my best work. I found the datasets a bit too structured. Sorry.
Import the data:
library(dplyr)
library(tidyr)
library(readxl)
library(ggplot2)
library(ggthemes)
nycDF <- read_excel("/Users/kailukowiak/OneDrive - CUNY School of Professional Studies/DATA 607 Repository/Project 2/cityacc-en-us.xlsx")
head(nycDF)
## # A tibble: 6 x 14
## `Motor Vehicle Collision Report Statistics Citywide` X__1
## <chr> <chr>
## 1 August2017 <NA>
## 2 Collisions <NA>
## 3 GeoCode GeoCodeLabel
## 4 C CITYWIDE
## 5 M MANHATTAN
## 6 001 1st Precinct
## # ... with 12 more variables: X__2 <chr>, X__3 <chr>, X__4 <chr>,
## # X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>, X__9 <chr>,
## # X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>
tail(nycDF)
## # A tibble: 6 x 14
## `Motor Vehicle Collision Report Statistics Citywide`
## <chr>
## 1 The category of “Motorist” includes the owner of a parked vehicle. Contribu
## 2 All figures are preliminary and subject to change.
## 3 A collision at an intersection located on a precinct border may appear in t
## 4 but will appear in only one precinct list. Collision location is captured a
## 5 A collision on a highway, bridge or tunnel which occurs at or near a precin
## 6 that shares that location, but will appear in only one precinct list. Colli
## # ... with 13 more variables: X__1 <chr>, X__2 <chr>, X__3 <chr>,
## # X__4 <chr>, X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>,
## # X__9 <chr>, X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>
nycDF <- nycDF %>% filter(!is.na(X__1))
head(nycDF)
## # A tibble: 6 x 14
## `Motor Vehicle Collision Report Statistics Citywide` X__1
## <chr> <chr>
## 1 GeoCode GeoCodeLabel
## 2 C CITYWIDE
## 3 M MANHATTAN
## 4 001 1st Precinct
## 5 005 5th Precinct
## 6 006 6th Precinct
## # ... with 12 more variables: X__2 <chr>, X__3 <chr>, X__4 <chr>,
## # X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>, X__9 <chr>,
## # X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>
tail(nycDF)
## # A tibble: 6 x 14
## `Motor Vehicle Collision Report Statistics Citywide` X__1
## <chr> <chr>
## 1 115 115th Precinct
## 2 S STATEN ISLAND
## 3 120 120th Precinct
## 4 121 121st Precinct
## 5 122 122nd Precinct
## 6 123 123rd Precinct
## # ... with 12 more variables: X__2 <chr>, X__3 <chr>, X__4 <chr>,
## # X__5 <chr>, X__6 <chr>, X__7 <chr>, X__8 <chr>, X__9 <chr>,
## # X__10 <chr>, X__11 <chr>, X__12 <chr>, X__13 <chr>
Now we need to add row names.
colnames(nycDF) <- nycDF[1,]
nycDF <- nycDF[-1,]
head(nycDF)
## # A tibble: 6 x 14
## GeoCode GeoCodeLabel Number_of_Motor_Vehicle_Collisions
## <chr> <chr> <chr>
## 1 C CITYWIDE 18727
## 2 M MANHATTAN 3916
## 3 001 1st Precinct 288
## 4 005 5th Precinct 203
## 5 006 6th Precinct 130
## 6 007 7th Precinct 102
## # ... with 11 more variables: Vehicles_or_Motorists_Involved <chr>,
## # Injury_or_Fatal_Collisions <chr>, MotoristsInjured <chr>,
## # MotoristsKilled <chr>, PassengInjured <chr>, PassengKilled <chr>,
## # CyclistsInjured <chr>, CyclistsKilled <chr>, PedestrInjured <chr>,
## # PedestrKilled <chr>, Bicycle <chr>
Everthing is reletively pretty now, but we have citywide, borough, and precicnt data. This isn’t very tidy.
Seperate borough and precent data.
borDF <- nycDF %>%
filter( !grepl("\\d", GeoCode))
preDF <- nycDF %>%
filter( grepl("\\d", GeoCode))
Create new column.
nycDF <- nycDF %>% mutate(Borough = GeoCode)
Remove numerics from Borough
nycDF$Borough <- sub("\\d\\d\\d", NA , nycDF$Borough)
nycDF <- nycDF %>% fill(Borough, .direction = "down")# Add factor variables.
precDF <- nycDF %>%
filter( grepl("\\d", GeoCode))
precDF %>%
head()
## # A tibble: 6 x 15
## GeoCode GeoCodeLabel Number_of_Motor_Vehicle_Collisions
## <chr> <chr> <chr>
## 1 001 1st Precinct 288
## 2 005 5th Precinct 203
## 3 006 6th Precinct 130
## 4 007 7th Precinct 102
## 5 009 9th Precinct 126
## 6 010 10th Precinct 246
## # ... with 12 more variables: Vehicles_or_Motorists_Involved <chr>,
## # Injury_or_Fatal_Collisions <chr>, MotoristsInjured <chr>,
## # MotoristsKilled <chr>, PassengInjured <chr>, PassengKilled <chr>,
## # CyclistsInjured <chr>, CyclistsKilled <chr>, PedestrInjured <chr>,
## # PedestrKilled <chr>, Bicycle <chr>, Borough <chr>
precDF$Borough <- precDF$Borough %>% recode(M = "Manhatten",
B = "Bronx",
K = "Brooklyn",
Q = "Queens",
S = "StattenIsland") # Adds names.
precDF$Borough <- as.factor(precDF$Borough)
cols <- colnames(precDF)
precDF[, cols[-c(2,15)]] <- as.numeric(unlist(precDF[, cols[-c(2,15)]])) # Convert to numeric
plt <- precDF %>%
group_by(Borough) %>%
summarise(numAccidents = sum(Number_of_Motor_Vehicle_Collisions))
ggplot(plt, aes(x = Borough, y = numAccidents))+
geom_bar(stat = 'identity') +
xlab("Borough") +
ylab("Number of Accidents") +
ggtitle("Number of Accidents by Borough") +
theme_fivethirtyeight()
Special thanks to Rob Barry for nyc precincts.
# Grab the shapefile and extract it
# You may want to add some file paths here
download.file(
# I saved a version on my server so this script should work
# even if the NYC planning department changes things around
"http://www.rob-barry.com/assets/data/mapping/nypp_15b.zip",
destfile = "nypp_15b.zip"
)
unzip(zipfile = "nypp_15b.zip")
# Now, load package to read the shapefile
library("rgdal")
# Read it into an sp object:
nypp <- readOGR("nypp_15b", "nypp")
## OGR data source with driver: ESRI Shapefile
## Source: "nypp_15b", layer: "nypp"
## with 77 features
## It has 3 fields
library("RColorBrewer")
library("classInt")
# Plot it
pal <- brewer.pal(5, "YlOrRd")
fill.clr <-
findColours(
classIntervals(as.numeric(precDF$Number_of_Motor_Vehicle_Collisions), style = "pretty", n = 5),
pal
)
plot(nypp, col = fill.clr)
legend(
"topleft",
fill=attr(fill.clr, "palette"),
legend=names(attr(fill.clr, "table")),
bty = "n"
)
title("Accidents by Precinct")
# Plot it
pal <- brewer.pal(5, "YlOrRd")
fill.clr <-
findColours(
classIntervals(as.numeric(precDF$Injury_or_Fatal_Collisions), style = "pretty", n = 5),
pal
)
plot(nypp, col = fill.clr)
legend(
"topleft",
fill=attr(fill.clr, "palette"),
legend=names(attr(fill.clr, "table")),
bty = "n"
)
title("Total Number of Casualties")
Importing the data:
hosDF <- read.csv("/Users/kailukowiak/OneDrive - CUNY School of Professional Studies/DATA 607 Repository/Project 2/Hospital_Revised_Flatfiles/Complications and Deaths - Hospital.csv")
There are a lot of NA values. First we need to change them to NA
hosDF[ hosDF == "Not Available" ] <- NA
We can remove NA’s but this gives us an incomplete dataset. It might be better to remove the entire hospital if it is missing one value. We will deal with this later.
Right now we need to find the length of time that the hospitals studied
hosDF <- hosDF %>% mutate(timeDiff = difftime(as.POSIXct(as.Date(hosDF$Measure.End.Date,
format = '%m/%d/%Y')),
as.POSIXct(as.Date(hosDF$Measure.Start.Date,
format = '%m/%d/%Y')),
units = 'days'))
hosDF$Score <- as.numeric(hosDF$Score)
plt <- hosDF %>%
group_by(Measure.Name) %>%
summarise(facSum = sum(Score, na.rm = T))
ggplot(plt, aes(Measure.Name, facSum))+
geom_bar(stat = 'identity') +
theme(axis.text.x=element_text(angle=90, hjust=1))+ coord_flip()+
ggtitle('Deaths by Cause')
#qplot(x = as.factor(hosDF$Measure.Name), y = hosDF$Score, geom = "boxplot")
Create dummy variables for Measure ID.
hosDF %>%
mutate(yesno = 1) %>%
distinct() %>%
spread(Measure.ID, yesno, fill = 0) %>%
glimpse()
## Observations: 81,804
## Variables: 35
## $ Provider.ID <int> 10001, 10001, 10001, 10001, 10001, 1...
## $ Hospital.Name <fctr> SOUTHEAST ALABAMA MEDICAL CENTER, S...
## $ Address <fctr> 1108 ROSS CLARK CIRCLE, 1108 ROSS C...
## $ City <fctr> DOTHAN, DOTHAN, DOTHAN, DOTHAN, DOT...
## $ State <fctr> AL, AL, AL, AL, AL, AL, AL, AL, AL,...
## $ ZIP.Code <int> 36301, 36301, 36301, 36301, 36301, 3...
## $ County.Name <fctr> HOUSTON, HOUSTON, HOUSTON, HOUSTON,...
## $ Phone.Number <dbl> 3347938701, 3347938701, 3347938701, ...
## $ Measure.Name <fctr> Rate of complications for hip/knee ...
## $ Compared.to.National <fctr> Worse than the National Rate, No Di...
## $ Denominator <fctr> 337, 744, 280, 522, 858, 597, 566, ...
## $ Score <dbl> 2565, 1085, 2488, 3162, 442, 1806, 2...
## $ Lower.Estimate <fctr> 2.9, 11.4, 2.1, 7.6, 9.5, 13.0, 13....
## $ Higher.Estimate <fctr> 6.0, 15.8, 5.8, 12.3, 13.6, 18.3, 1...
## $ Footnote <fctr> , , , , , , , , , , , , , , , , , ,...
## $ Measure.Start.Date <fctr> 04/01/2013, 07/01/2013, 07/01/2013,...
## $ Measure.End.Date <fctr> 03/31/2016, 06/30/2016, 06/30/2016,...
## $ timeDiff <time> 1095 days, 1095 days, 1095 days, 10...
## $ COMP_HIP_KNEE <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_AMI <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_CABG <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_COPD <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_HF <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_PN <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ MORT_30_STK <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ PSI_12_POSTOP_PULMEMB_DVT <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ PSI_13_POST_SEPSIS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ PSI_14_POSTOP_DEHIS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ PSI_15_ACC_LAC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ PSI_3_ULCER <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ PSI_4_SURG_COMP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_6_IAT_PTX <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_7_CVCBI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_8_POST_HIP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ PSI_90_SAFETY <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
library(readr)
U_S_Chronic_Disease_Indicators_CDI_ <- read_csv("~/OneDrive - CUNY School of Professional Studies/DATA 607 Repository/Project 2/U.S._Chronic_Disease_Indicators__CDI_.csv")
chronDF <- U_S_Chronic_Disease_Indicators_CDI_
chronDF <- chronDF %>% select_if(~sum(!is.na(.)) > 0) # Removes all rows that only have NA values
chronDF$DataValueFootnoteSymbol <- NULL
chronDF$DatavalueFootnote <- NULL
chronDF$DataValueUnit <- NULL # Removing not very useful data
chronDF <- filter(chronDF, LocationDesc != c("Guam", "District of Columbia","Puerto Rico",
"United States","Virgin Islands"))
## Warning in LocationDesc != c("Guam", "District of Columbia", "Puerto
## Rico", : longer object length is not a multiple of shorter object length
library(fiftystater)
library(mapproj)
## Loading required package: maps
alcohol <- filter(chronDF, Question == "Alcohol use among youth" &
YearStart == 2015 & StratificationCategory1 == "Overall")
alcohol$LocationDesc <- tolower(alcohol$LocationDesc) # Needs to be lowercase to match the map.
ggplot(alcohol, aes(map_id = LocationDesc)) +
geom_map(aes(fill = DataValue), map = fifty_states) +
expand_limits(x = fifty_states$long, y = fifty_states$lat) +
coord_map() +
scale_x_continuous(breaks = NULL) +
scale_y_continuous(breaks = NULL) +
labs(x = "", y = "") +
theme(legend.position = "bottom",
panel.background = element_blank())+
ggtitle("Alcohol use among youth")
Here we see the Prevelance of underaged drinking. The data is fairly sparse so it is difficult to draw exact conclusions.
alcoholFem <- filter(chronDF, Question == "Heavy drinking among women aged 18-44 years" &
YearStart == 2015 & StratificationCategory1 == "Overall")
alcoholFem$LocationDesc <- tolower(alcoholFem$LocationDesc) # Needs to be lowercase to match the map.
ggplot(alcoholFem, aes(map_id = LocationDesc)) +
geom_map(aes(fill = DataValue), map = fifty_states) +
expand_limits(x = fifty_states$long, y = fifty_states$lat) +
coord_map() +
scale_x_continuous(breaks = NULL) +
scale_y_continuous(breaks = NULL) +
labs(x = "", y = "") +
theme(legend.position = "bottom",
panel.background = element_blank()) +
ggtitle("Heavy drinking among women aged 18-44 years")
Nevada doesn’t make a lot of sense being so low, but maybe my sterotypes of Americans need updating. It’s interesting that there seems to be little correlation between youth alcohol consumption and heavy drinking in adulthood.