We will be working with data about every tax lot in NYC, in a zip file. Please download it from http://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page. You will need to combine the data from each boro into one file called ‘all_PLUTO_data.R’ in order to complete this assignment. I encourage you to do this via code but it is not necessary to submit such code.
temp <- tempfile()
download.file(
"http://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_pluto_16v2%20.zip", temp)
# Brooklyn #
BK <- read.csv(unzip(temp, "BORO_zip_files_csv/BK.csv"))
# Bronx #
BX <- read.csv(unzip(temp, "BORO_zip_files_csv/BX.csv"))
# Manhattan #
MN <- read.csv(unzip(temp, "BORO_zip_files_csv/MN.csv"))
# Queens #
QN <- read.csv(unzip(temp, "BORO_zip_files_csv/QN.csv"))
# Staten Island
SI <- read.csv(unzip(temp, "BORO_zip_files_csv/SI.csv"))
# create the combined dataframe of all boros
all_pluto <- tbl_df(rbindlist(list(BK, BX, MN, QN, SI)))
# Noticing 0s in the YearBuilt variable #
summary(all_pluto$YearBuilt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1920 1930 1841 1955 2040
# Year Built:
# The year construction of the building was completed.
data <- all_pluto %>%
filter(YearBuilt > 1850, YearBuilt < 2016) %>%
select(YearBuilt)
pluto_5yr <- condense(bin(data$YearBuilt, 5))
## Summarising with count
pluto_5yr$pct <- with(pluto_5yr, .count/sum(.count))
pluto_5yr <- arrange(pluto_5yr, desc(data.YearBuilt))
pluto_5yr <- pluto_5yr %>% arrange(desc(data.YearBuilt)) %>% mutate(cum_pct = cumsum(pct))
pluto_5yr <- arrange(pluto_5yr, data.YearBuilt)
pluto_5yr <- pluto_5yr %>% mutate(group = ifelse(cum_pct > .50, "40%", "60%"))
pluto_5yr$cum_pct <- cumsum(pluto_5yr$pct)
# Build a graph of the cumulative sum of buildings from 1850 to 2015
ggplot(pluto_5yr, aes(x=data.YearBuilt, y=cumsum(.count), fill=group)) +
geom_bar(stat="identity") +
geom_vline(aes(xintercept = 1934), colour="black", linetype=5) +
scale_y_continuous(label = comma) +
scale_x_continuous(breaks = seq(1850, 2015, 20), limits = c(1850, 2015)) +
labs(x="Year Built", y = "Cumulative Number of Buildings", fill="Breakdown") +
ggtitle("New York - When Were Most Buildings Constructed?") +
annotate("text", x = 1934, y = 750000, label = "60% of Building Constructed \n After 1934", colour="black", size = 3)
## Warning: Removed 2 rows containing missing values (position_stack).
Looking at the resulting graph, the city inspectors should focus on buildings built in the early 1930s forward. By doing this, they’ll be able to cover approximately 60% of all buildings.
One issue seen with the pluto dataset is that there are 44180 entries with 0 for the YearBuilt value. This raises some questions about the accuracy of the data and any generated statistics or graphs based on the data.
qplot(all_pluto$YearBuilt, geom="histogram", binwidth = 100) + scale_y_continuous(label = comma)
df <- filter(all_pluto, NumFloors > 19 & YearBuilt > 0) %>% select(YearBuilt, NumFloors)
df$FloorRange <- cut(df$NumFloors ,
breaks = c(-Inf, 29, 39, 49 ),
labels = c("20", "30", "40"),
right = FALSE)
df2 <- df %>% na.omit() %>% group_by(YearBuilt, FloorRange) %>% summarise(Total = n()) %>% select(YearBuilt, FloorRange, Total) %>%
arrange(YearBuilt, FloorRange)
facet_names <- c(`20` = "20 - 29 Floor Buildings",
`30` = "30 - 39 Floor Buildings",
`40` = "40 - 49 Floor Buildings")
ggplot(data=df2, aes(x=YearBuilt, y=Total, colour = FloorRange)) +
geom_line() +
geom_point( size=1, shape=20, fill="white") +
facet_grid(FloorRange ~ ., labeller = as_labeller(facet_names)) +
labs(x = "Year Built", y = "Number of Buildings Constructed", colour = "Floor Range") +
theme(axis.line=element_line()) +
scale_x_continuous(breaks=seq(1850, 2015, 20)) +
scale_y_continuous(breaks=seq(0, 100, 10)) +
ggtitle("NYC Tall Buildings - \nYear of Construction Comparison")
summary(all_pluto$AssessTot)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 2.207e+04 2.903e+04 4.064e+05 4.446e+04 6.798e+09
summary(all_pluto$AssessLand)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 7.594e+03 1.044e+04 1.130e+05 1.562e+04 3.211e+09
df <- all_pluto %>%
filter(YearBuilt > 1850, NumFloors != 0, AssessTot > 0, AssessLand > 0) %>%
mutate(BldgAssessment = AssessTot - AssessLand,
Per_Floor_Assessment = BldgAssessment/NumFloors) %>%
select(YearBuilt, Per_Floor_Assessment, Borough)
# filter to the a 30 year timeframe overlapping WWII
df <- filter(df, YearBuilt > 1930, YearBuilt < 1960)
year_bin <- 1
# Analyze median floor values per year
# Create one set for all boroughs and then one for each individual
# All Boroughs - Median #
all_con <- with(df, condense(bin(YearBuilt, year_bin), z=Per_Floor_Assessment, summary="median"))
all_median <- smooth(all_con, 100, var=".median" , type="robust")
all_median$Borough <- "All"
# Brooklyn - Median #
BK_con <- with(filter(df, Borough == "BK"),
condense(bin(YearBuilt, year_bin), z=Per_Floor_Assessment, summary="median"))
BK_median <- smooth(BK_con, 100, var=".median", type="robust")
BK_median$Borough <- "BK"
# Bronx - Median #
BX_con <- with(filter(df, Borough == "BX"),
condense(bin(YearBuilt, year_bin), z=Per_Floor_Assessment, summary="median"))
BX_median <- smooth(BX_con, 100, var=".median", type="robust")
BX_median$Borough <- "BX"
# Manhattan - Median #
MN_con <- with(filter(df, Borough == "MN"),
condense(bin(YearBuilt, year_bin), z=Per_Floor_Assessment, summary="median"))
MN_median <- smooth(MN_con, 100, var=".median", type="robust")
MN_median$Borough <- "MN"
# Queens - Median #
QN_con <- with(filter(df, Borough == "QN"),
condense(bin(YearBuilt, year_bin), z=Per_Floor_Assessment, summary="median"))
QN_median <- smooth(QN_con, 100, var=".median", type="robust")
QN_median$Borough <- "QN"
# Staten Island - Median #
SI_con <- with(filter(df, Borough == "SI"),
condense(bin(YearBuilt, year_bin), z=Per_Floor_Assessment, summary="median"))
SI_median <- smooth(SI_con, 100, var=".median", type="robust")
SI_median$Borough <- "SI"
# Combine all into one dataframe for plotting
combined <- rbindlist(list(all_median, SI_median, BX_median, BK_median, MN_median, QN_median))
# drop cases with NA
combined <- combined[complete.cases(combined),]
facet_names <- c(`All` = "All",
`BK` = "Brooklyn",
`BX` = "Bronx",
`MN` = "Manhattan",
`QN` = "Queens",
`SI` = "Staten Island")
# plot the results
ggplot() +
geom_line(data=combined, aes(x=YearBuilt, y=.median, colour=Borough)) +
ggtitle("NYC Boroughs - Median Building Value Per Year (1930-1960) ") +
ylab("Median Floor Value") +
facet_grid(. ~ Borough, labeller = as_labeller(facet_names)) +
scale_y_continuous(labels=dollar, breaks = scales::pretty_breaks(n = 7)) +
scale_x_continuous(breaks=seq(1930, 1960, 10)) +
theme(axis.text.x = element_text(angle = 90))
I was curious if there may have been a geographical component to the observation that the per floor assessment value dropped around the time of WWII. Looking at the resulting graph of median assessment per floor (smoothed), we see that the assessment value per floor is rising for all boroughs combined from 1930 to 1960. We do see a noticeable decline in assessment value in the Bronx. Staten Island looks to be relatively flat with a possible small decline.
The remaining boroughs - Brooklyn, Manhattan, and Queens – all show significant increases in assessment value per floor over the 30 year period.