I have provided you with data about every tax lot in NYC, in a zip file. Please download it from http://www.nyc.gov/html/dcp/html/bytes/dwn_pluto_mappluto.shtml. You will need to combine the data from each boro into one file called ‘all_PLUTO_data.R’ in order to complete this assignment.
The strategy to combine the puto data is:
library("knitr")
base_project = "C:\\Users\\malarconba001\\Google Drive\\CUNY\\DATA608\\week2\\pluto_data\\"
files = list.files(path=base_project,pattern=".*\\.csv$",full.names=TRUE,recursive = TRUE)
all_PLUTO_data = data.frame()
for (file in files){
tmp_data = read.csv(file, stringsAsFactors = FALSE)
tmp_data = tmp_data[tmp_data$YearBuilt > 1850 &tmp_data$YearBuilt <=2014 & tmp_data$LotArea > 100 & tmp_data$AssessTot < 10000000 & tmp_data$NumFloors != 0,]
tmp_data$filename = file
all_PLUTO_data = rbind(all_PLUTO_data,tmp_data[,c("YearBuilt","NumFloors","AssessLand","AssessTot","BldgArea")])
}
summary(all_PLUTO_data)
## YearBuilt NumFloors AssessLand AssessTot
## Min. :1851 Min. : 1.000 Min. : 0 Min. : 0
## 1st Qu.:1920 1st Qu.: 2.000 1st Qu.: 8460 1st Qu.: 21841
## Median :1931 Median : 2.000 Median : 11567 Median : 28315
## Mean :1940 Mean : 2.348 Mean : 42364 Mean : 156397
## 3rd Qu.:1957 3rd Qu.: 2.500 3rd Qu.: 16869 3rd Qu.: 41935
## Max. :2014 Max. :119.000 Max. :9265500 Max. :9999000
## BldgArea
## Min. : 0
## 1st Qu.: 1548
## Median : 2160
## Mean : 4631
## 3rd Qu.: 3198
## Max. :1047240
kable(head(all_PLUTO_data))
| YearBuilt | NumFloors | AssessLand | AssessTot | BldgArea | |
|---|---|---|---|---|---|
| 2 | 2014 | 12 | 389700 | 992385 | 123199 |
| 18 | 1988 | 1 | 129600 | 276750 | 9585 |
| 21 | 1920 | 1 | 178200 | 195750 | 5000 |
| 23 | 1900 | 7 | 514350 | 5011200 | 211306 |
| 25 | 1977 | 1 | 214650 | 1309050 | 21735 |
| 26 | 1920 | 5 | 112500 | 2523150 | 47735 |
1. After a few building collapses, the City of New York is going to begin investigating older buildings for safety. However, the city has a limited number of inspectors, and wants to find a ‘cut-off’ date before most city buildings were constructed. Build a graph to help the city determine when most buildings were constructed. Is there anything in the results that causes you to question the accuracy of the data? (note: only look at buildings built since 1850)
Since we’re looking for the cut-off from which most buildings were constructed, this cut-off is the point where at least 50% of buildings
First, Let’s condense the data and look at the cummulative percentage of constructed buildings by year. The cut-off where most buildings were constructed ends where up to 50% of the buildings were built.
# install.packages("devtools")
# devtools::install_github("hadley/bigvis")
library("bigvis")
## Loading required package: Rcpp
##
## Attaching package: 'bigvis'
## The following object is masked from 'package:stats':
##
## smooth
library("ggplot2")
yr_built <- condense(bin(all_PLUTO_data$YearBuilt, 1))
## Summarising with count
yr_built$percent_built <- yr_built$.count/sum(yr_built$.count)
yr_built$cummulative_percent_built <- cumsum(yr_built$percent_built)
# remove rows with NA values
yr_built <- yr_built[complete.cases(yr_built),]
# set color of points to indicate various significant segments along the curve
yr_built$color <- "gold4"
# Use red3 to mark 2Q datapoints
yr_built[yr_built$all_PLUTO_data.YearBuilt<=1930,]$color <- "red3"
# this is 1Q
yr_built[yr_built$all_PLUTO_data.YearBuilt<=1919,]$color <- "red4"
kable(head(yr_built))
| all_PLUTO_data.YearBuilt | .count | percent_built | cummulative_percent_built | color | |
|---|---|---|---|---|---|
| 2 | 1851 | 8 | 9.90e-06 | 9.90e-06 | red4 |
| 3 | 1852 | 14 | 1.73e-05 | 2.72e-05 | red4 |
| 4 | 1853 | 10 | 1.24e-05 | 3.96e-05 | red4 |
| 5 | 1854 | 6 | 7.40e-06 | 4.70e-05 | red4 |
| 6 | 1855 | 13 | 1.61e-05 | 6.31e-05 | red4 |
| 7 | 1856 | 12 | 1.48e-05 | 7.79e-05 | red4 |
Let’s plot the data now:
p <- ggplot(yr_built, aes(x=all_PLUTO_data.YearBuilt, y=cummulative_percent_built)) +
geom_point(color = yr_built$color) +
ylab('Total of Buildings Constructed (%)') +
geom_vline(xintercept = 1930)+
geom_text(aes(1930,0.5,label = "<- 1930", hjust = -0.2)) +
geom_vline(xintercept = 1919)+
geom_text(aes(1919,0.25,label = "1919 ->", hjust = 1.2))+
xlim(1850, 2014) +
scale_y_continuous(labels = scales::percent)+
ggtitle("Cummulative Building Construction in NYC (%)") +
xlab ('Year Built')
p
Based on the chart, we can see that 50% of buildings were erected before 1930. Therefore, inspecting buildings constructed before 1930 will ensure that we are capturing the mayority.
Furthermore, we can say that a quarter of all structures were built before 1919. Depending on the amount of resources available to conduct the inspections, they may want to prioritize this group.
In regards to data quality, it almost appears as if data is recorded every 5 years up to around 1975. Below is a smoothed version of the same chart:
# Run the same thing, this time we will use smoothing.
yr_built <- smooth(condense(bin(all_PLUTO_data$YearBuilt, 1)),5)
## Summarising with count
yr_built$percent_built <- yr_built$.count/sum(yr_built$.count)
yr_built$cummulative_percent_built <- cumsum(yr_built$percent_built)
# remove rows with NA values
yr_built <- yr_built[complete.cases(yr_built),]
# set color of points to indicate various significant segments along the curve
yr_built$color <- "gold4"
# Use red3 to mark 2Q datapoints
yr_built[yr_built$all_PLUTO_data.YearBuilt<=1930,]$color <- "red3"
# this is 1Q
yr_built[yr_built$all_PLUTO_data.YearBuilt<=1919,]$color <- "red4"
p <- ggplot(yr_built, aes(x=all_PLUTO_data.YearBuilt, y=cummulative_percent_built))+
geom_point(color = yr_built$color) +
ylab('Total of Buildings Constructed (%)') +
geom_vline(xintercept = 1930)+
geom_text(aes(1930,0.5,label = "<- 1930", hjust = -0.2)) +
geom_vline(xintercept = 1919)+
geom_text(aes(1919,0.25,label = "1919 ->", hjust = 1.2))+
xlim(1849, 2014) +
scale_y_continuous(labels = scales::percent)+
ggtitle("Cummulative Building Construction in NYC (%)") +
xlab ('Year Built')
p
2. The city is particularly worried about buildings that were unusually tall when they were built, since best-practices for safety hadn’t yet been determined. Create a graph that shows how many buildings of a certain number of floors were built in each ear (note: you may want to use a log scale for the number of buildings). It should be clear when 20-story buildings, 30-story buildings, and 40-story buildings were first built in large numbers.
First, we need to figure out which are the buildings erected with a higher than normal number of floors for the year. The strategy will then be:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
hi_rise <- all_PLUTO_data
#####################################################
# * Condense the data by year and number of floors
#####################################################
# Condense the data by binning both year built and num floors with interval widths of 0.1
# Also, transform the number of floors
date_by_floor <- condense(c(bin(hi_rise$YearBuilt, 5),
bin(hi_rise$NumFloors, 10,origin=0))
)
## Summarising with count
# make the column names something user friendly
colnames(date_by_floor) = c("YearBuilt","NumFloors","Count")
#####################################################
#* Obtain the mean, sd and FirstBuiltYear of the construction year by the number of floors.
#####################################################
# mean and sd
floor_summary_by_date <- date_by_floor %>%
na.omit %>%
group_by(NumFloors) %>%
summarise(date_avg=mean(YearBuilt),date_sd=sd(YearBuilt))
# set those NA sd to 1
floor_summary_by_date[is.na(floor_summary_by_date$date_sd),]$date_sd = 1
# tget the firstbuiltyear
first_built <- date_by_floor %>%
na.omit %>%
group_by(NumFloors) %>%
slice(which.min(YearBuilt))
colnames(first_built) <- c("FirstBuiltYear","NumFloors","Count")
#merge the two into first_built_summary
first_built_summary <- merge(floor_summary_by_date,first_built)
#####################################################
# * Normalize the construction year based on the average and sd of the construction year for buildings with the same floors.
#####################################################
# create a field that contains the color of the datapoint
date_by_floor <- data.frame(merge(date_by_floor,first_built_summary,by="NumFloors"))
# Normalize the year built
date_by_floor$YearBuiltNorm <- (date_by_floor$YearBuilt - date_by_floor$date_avg)/date_by_floor$date_sd
#####################################################
# * Create a separate column indicating if it's the first time a building this tall has been erected in this year
#####################################################
# set column to indicate if it was first built
date_by_floor$FirstBuilt <- (date_by_floor$FirstBuiltYear==date_by_floor$YearBuilt)
# coerce the standardized years of those buildings first constructed to be the min standardized value so the first erected buildings start with the same color
date_by_floor[date_by_floor$FirstBuilt,]$YearBuiltNorm <-min(date_by_floor["YearBuiltNorm"])
# remove rows with NA values
date_by_floor <- date_by_floor[complete.cases(date_by_floor),]
#####################################################
# * Plot the data in a heatmap style.
#####################################################
ggplot(date_by_floor, aes(x = YearBuilt, y = NumFloors,fill=YearBuiltNorm)) +
scale_y_continuous(breaks = round(seq(min(date_by_floor$NumFloors),
max(date_by_floor$NumFloors),
by = 10),
1)
)+
geom_bin2d(show.legend=FALSE,stat="identity")+
xlim(1849, 2014) +
ggtitle("Building Height Trends by Year of Construction") +
xlab ('Year Built') +
ylab ('Number of Floors') +
scale_fill_gradientn( guide_legend(title = ""),
limits=c(-2,2),
breaks=seq(-2, 2, by=1),
colours=rainbow(5),
labels = c("Early Riser","","Along with\nthe Crowd","","Latecomer")
)
From the chart above, we can see that first construction is ahead of the pack by about 50 years. It appears that this trend is regardless of the number of floors.
3. Your boss suspects that buildings constructed during the US’s involvement in World War II (1941-1945) are more poorly constructed than those before and after the way due to the high cost of materials during those years. She thinks that, if you calculate assessed value per floor, you will see lower values for buildings at that time vs before or after. Construct a chart/graph to see if she’s right.
library(dplyr)
construction_quality <- all_PLUTO_data[all_PLUTO_data$AssessTot>0,]
construction_quality$during_wwii<- construction_quality$YearBuilt <=1945 && construction_quality$YearBuilt >=1941
# Calculate the assesed value of the construction (AssessTot - AssessLand)
construction_quality$AssessConstruction <- (construction_quality$AssessTot-construction_quality$AssessLand)
# pro-rate it per floor
construction_quality$value_per_floor <- construction_quality$AssessConstruction / construction_quality$NumFloors
# Given that we have data reported every 5 or so years, let's bin as such
# in a way so that 1943 is at the center of the range (1941-1945)
cq_condensed <- condense(bin(construction_quality$YearBuilt,5,origin=1856),
summary="mean",
z=construction_quality$value_per_floor
)
colnames(cq_condensed) <- c("YearBuilt","Count","Mean")
# let's only show the asessed values of buildings in the vicinity of WWII (1930 -> 1955)
cq_condensed <- cq_condensed[complete.cases(cq_condensed),]
cq_condensed<-cq_condensed[cq_condensed$YearBuilt>1930 & cq_condensed$YearBuilt<=1955,]
# plot it
ggplot(cq_condensed, aes(x = YearBuilt, y = Mean)) +
geom_bar(stat="identity")+
xlim(1930, 1956) +
ggtitle("Average Assessed Value Per Floor Around World War II") +
xlab("Construction Year") +
ylab("Average Assessed Value of Construction Per Floor") +
scale_x_continuous(breaks = round(seq(1931,
1955,
by = 2),
1)
)+
scale_y_continuous(labels = scales::dollar)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
From the chart above, we can clearly see a sharp decline in the assessed construction value per floor during World War II. The decline is followed by a restoration to pre-war levels.