Load Libraries

# Install dplyr if it is missing, then attach it. library() is called
# unconditionally so a fresh install is actually attached (and a failed
# install stops the script here instead of at the first dplyr call).
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Install ggplot2 if it is missing, then attach it. library() is called
# unconditionally so a fresh install is actually attached (and a failed
# install stops the script here instead of at the first plotting call).
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
library(ggplot2)
## Loading required package: ggplot2
# Install bigvis if it is missing, then attach it. library() is called
# unconditionally so a fresh install is actually attached (and a failed
# install stops the script here instead of at the first condense() call).
# NOTE(review): bigvis may not be available from the default CRAN repository;
# confirm the install source if install.packages() cannot find it.
if (!requireNamespace("bigvis", quietly = TRUE)) install.packages("bigvis")
library(bigvis)
## Loading required package: bigvis
## Loading required package: Rcpp
## 
## Attaching package: 'bigvis'
## The following object is masked from 'package:stats':
## 
##     smooth

Plot 1

1. After a few building collapses, the City of New York is going to begin investigating older buildings for safety. However, the city has a limited number of inspectors, and wants to find a ‘cut-off’ date before most city buildings were constructed. Build a graph to help the city determine when most buildings were constructed. Is there anything in the results that cause you to question the accuracy of the data? (note: only look at buildings built since 1850)

# Switch the working directory to `directory`, resolved relative to the
# current working directory. The result of setwd() is passed through as the
# function's value.
this_dir <- function(directory) {
  target <- file.path(getwd(), directory)
  setwd(target)
}

# Read one PLUTO extract per borough from the working directory.
bk <- read.csv(file = "BK.csv", header = TRUE) # Brooklyn
bx <- read.csv(file = "BX.csv", header = TRUE) # The Bronx
mn <- read.csv(file = "Mn.csv", header = TRUE) # Manhattan
qn <- read.csv(file = "QN.csv", header = TRUE) # Queens
si <- read.csv(file = "SI.csv", header = TRUE) # Staten Island

# Combine the five borough tables into a single data frame
all_PLUTO_data <- bind_rows(list(bk, bx, mn, qn, si))

# Write the combined table to disk
write.csv(x = all_PLUTO_data, file = "all_PLUTO_data")

# Get column names
colnames(all_PLUTO_data)
##  [1] "Borough"       "Block"         "Lot"           "CD"           
##  [5] "CT2010"        "CB2010"        "SchoolDist"    "Council"      
##  [9] "ZipCode"       "FireComp"      "PolicePrct"    "HealthArea"   
## [13] "SanitBoro"     "SanitDistrict" "SanitSub"      "Address"      
## [17] "ZoneDist1"     "ZoneDist2"     "ZoneDist3"     "ZoneDist4"    
## [21] "Overlay1"      "Overlay2"      "SPDist1"       "SPDist2"      
## [25] "SPDist3"       "LtdHeight"     "SplitZone"     "BldgClass"    
## [29] "LandUse"       "Easements"     "OwnerType"     "OwnerName"    
## [33] "LotArea"       "BldgArea"      "ComArea"       "ResArea"      
## [37] "OfficeArea"    "RetailArea"    "GarageArea"    "StrgeArea"    
## [41] "FactryArea"    "OtherArea"     "AreaSource"    "NumBldgs"     
## [45] "NumFloors"     "UnitsRes"      "UnitsTotal"    "LotFront"     
## [49] "LotDepth"      "BldgFront"     "BldgDepth"     "Ext"          
## [53] "ProxCode"      "IrrLotCode"    "LotType"       "BsmtCode"     
## [57] "AssessLand"    "AssessTot"     "ExemptLand"    "ExemptTot"    
## [61] "YearBuilt"     "YearAlter1"    "YearAlter2"    "HistDist"     
## [65] "Landmark"      "BuiltFAR"      "ResidFAR"      "CommFAR"      
## [69] "FacilFAR"      "BoroCode"      "BBL"           "CondoNo"      
## [73] "Tract2010"     "XCoord"        "YCoord"        "ZoneMap"      
## [77] "ZMCode"        "Sanborn"       "TaxMap"        "EDesigNum"    
## [81] "APPBBL"        "APPDate"       "PLUTOMapID"    "Version"
# Keep only the construction year, for lots with YearBuilt from 1850 up to
# (but not including) 2016.
built_since_1850 <- all_PLUTO_data %>%
  filter(YearBuilt >= 1850 & YearBuilt < 2016) %>%
  select(YearBuilt)

# Summary of the filtered construction years
summary(object = built_since_1850)
##    YearBuilt   
##  Min.   :1850  
##  1st Qu.:1920  
##  Median :1931  
##  Mean   :1941  
##  3rd Qu.:1960  
##  Max.   :2015

In the unfiltered data, YearBuilt starts from 0 and goes all the way up to 2040, which makes us question the integrity of the data. Since the project requires us to start from the year 1850, the filter eliminates the 0 values. We also remove any records whose YearBuilt is 2016 or later, which is why the summary above runs from 1850 to 2015.

# Bigvis condense

# Bin the construction years into 5-year bins and count the lots per bin.
c_built_since_1850 <- condense(bin(built_since_1850$YearBuilt, 5))
## Summarising with count
c_built_since_1850 <- na.omit(c_built_since_1850)

# Cut-off year: the median construction year of the individual records.
# The condensed frame holds one row per bin, so a median over its
# built_since_1850.YearBuilt column only yields the middle bin position and
# ignores how many buildings fall into each bin.
cut_off <- median(built_since_1850$YearBuilt)

# Histogram-style plot of the counts, with the cut-off marked by a dashed red
# line and labelled with its value. xintercept is a constant, so it is passed
# directly rather than through aes().
plot_1 <- autoplot(c_built_since_1850) +
  labs(title = "Numbers of Building Constructed by Year",
       x = "Years",
       y = "Count") +
  geom_vline(xintercept = cut_off,
             color = "red", linetype = "dashed") +
  annotate("text", x = 1942, y = 90000, label = cut_off, color = "red")

plot_1

# Name the plot explicitly instead of relying on the last plot displayed.
ggsave("plot_1.png", plot = plot_1)
## Saving 7 x 5 in image

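The cut-off year obtained here is 1934.5. Note that this is the median of the 5-year bin positions in the condensed data, not the median construction year of the individual buildings, which the summary above gives as 1931.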


Plot 2

2. The city is particularly worried about buildings that were unusually tall when they were built, since best-practices for safety hadn’t yet been determined. Create a graph that shows how many buildings of a certain number of floors were built in each year (note: you may want to use a log scale for the number of buildings). It should be clear when 20-story buildings, 30-story buildings, and 40-story buildings were first built in large numbers.

# Buildings of 20 or more floors built from 1850 up to (not including) 2016.
number_of_floors <- all_PLUTO_data %>%
  filter(YearBuilt >= 1850, YearBuilt < 2016, NumFloors >= 20) %>%
  select(YearBuilt, NumFloors)


# Count buildings per (year, 10-floor band) cell.
c_number_of_floors <- condense(bin(number_of_floors$YearBuilt, 1),
                               bin(number_of_floors$NumFloors, 10))
## Summarising with count
# The first binned variable (YearBuilt) is drawn on the x axis and the second
# (NumFloors) on the y axis, so the axis labels follow that order.
plot_2 <- autoplot(c_number_of_floors) +
  labs(title = "Numbers of Building by Numbers of Floors by Year Built",
       x = "Year Built",
       y = "Number of Floors")
plot_2

# Name the plot explicitly instead of relying on the last plot displayed.
ggsave("plot_2.png", plot = plot_2)
## Saving 7 x 5 in image

Plot 3

3. Your boss suspects that buildings constructed during the US’s involvement in World War II (1941-1945) are more poorly constructed than those before and after the war due to the high cost of materials during those years. She thinks that, if you calculate assessed value per floor, you will see lower values for buildings at that time vs before or after. Construct a chart/graph to see if she’s right.

# Assessed value per floor (rounded to whole units, then expressed in
# thousands) for lots with at least one floor, built from 1850 up to (not
# including) 2016.
value_floor <- all_PLUTO_data %>%
  filter(YearBuilt >= 1850 & YearBuilt < 2016 & NumFloors > 0) %>%
  transmute(YearBuilt, avpf = round(AssessTot / NumFloors) / 1000)

# Summarise the per-floor value over 5-year bins of the construction year.
c_value_floor <- condense(bin(value_floor$YearBuilt, 5), z = value_floor$avpf)
## Summarising with mean
plot_3 <- autoplot(c_value_floor) +
  labs(x = "Year",
       y = "Mean value in 1k",
       title = "Assesed Value per Floor")

plot_3

ggsave("plot_3.png", plot = plot_3)
## Saving 7 x 5 in image

Zoomed

# Same per-floor summary as above, but with 1-year bins so the years around
# World War II can be inspected individually.
c_value_floor_zoomed <- condense(bin(value_floor$YearBuilt, 1),
                                 z = value_floor$avpf)
## Summarising with mean
# Restrict the x axis to 1930-1965 to zoom in on the war years.
plot_3zoomed <- autoplot(c_value_floor_zoomed) +
  labs(x = "Year",
       y = "Mean value in 1k",
       title = "Assesed Value per Floor") +
  xlim(1930, 1965)
plot_3zoomed

ggsave("plot_3zoomed.png", plot = plot_3zoomed)
## Saving 7 x 5 in image

There doesn’t seem to be any evidence to support lower assessed values per floor for the buildings constructed during World War II.