Packages Used

Below are the packages used for the analysis.

  • Note: You may want to install some of the packages first if you dont have them already.
options(warn=-1)

suppressMessages(library(knitr))
suppressMessages(library(tidyr))
suppressMessages(library(ggplot2))
suppressMessages(require(dplyr))

suppressMessages(library(graphics))
suppressMessages(library(plotly))
suppressMessages(library(ggthemes))
suppressMessages(library(Amelia))
suppressMessages(library(scales))

Question 1:

After a few building collapses, the City of New York is going to begin investigating older buildings for safety. However, the city has a limited number of inspectors, and wants to find a ‘cut-off’ date before most city buildings were constructed. Build a graph to help the city determine when most buildings were constructed. Is there anything in the results that causes you to question the accuracy of the data? (note: only look at buildings built since 1850)

  • lets load the datasets.

A glimpse of some of the dataset.

kable(head(BK))
Borough Block Lot CD CT2010 CB2010 SchoolDist Council ZipCode FireComp PolicePrct HealthArea SanitBoro SanitDistrict SanitSub Address ZoneDist1 ZoneDist2 ZoneDist3 ZoneDist4 Overlay1 Overlay2 SPDist1 SPDist2 SPDist3 LtdHeight SplitZone BldgClass LandUse Easements OwnerType OwnerName LotArea BldgArea ComArea ResArea OfficeArea RetailArea GarageArea StrgeArea FactryArea OtherArea AreaSource NumBldgs NumFloors UnitsRes UnitsTotal LotFront LotDepth BldgFront BldgDepth Ext ProxCode IrrLotCode LotType BsmtCode AssessLand AssessTot ExemptLand ExemptTot YearBuilt YearAlter1 YearAlter2 HistDist Landmark BuiltFAR ResidFAR CommFAR FacilFAR BoroCode BBL CondoNo Tract2010 XCoord YCoord ZoneMap ZMCode Sanborn TaxMap EDesigNum APPBBL APPDate PLUTOMapID Version
BK 1 1 302 21 NA 13 33 11201 L118 84 1000 3 2 1B JOHN STREET M3-1 M1-4/R8A MX-2 Y V1 11 1 P BROOKLYN BRIDGE PARK 151930 0 0 0 0 0 0 0 0 0 4 0 0 0 0 206.25 337.42 0 0 0 Y 5 5 1164645 1164645 0 0 0 0 0 DUMBO Historic District 0.00 0.0 2 0.0 3 3000010001 0 21 NA NA 12d 302 007 30101 3000010001 11/26/2013 1 16V2
BK 1 50 302 21 2000 13 33 11201 L118 84 1000 3 2 1B 10 JAY STREET M1-4/R8A M3-1 MX-2 Y E9 6 0 SAFDI PLAZA REALTY IN 19682 154400 154400 0 17160 0 0 68620 0 68620 2 1 9 0 90 109.08 207.25 88 195 1 Y 3 0 834300 12019950 0 290250 1920 1994 2007 DUMBO Historic District 7.84 5.4 2 6.5 3 3000010050 0 21 987838 195989 12d 302 007 30101 E-231 0 1 16V2
BK 1 7501 302 21 2000 13 33 11201 L118 84 1000 1 2 1 JOHN STREET M3-1 N RM 4 1 0 102534 4686 97848 0 4686 0 0 0 0 7 1 12 42 44 0.00 0.00 0 0 0 N 0 5 389696 5864853 0 0 0 0 0 0.00 0.0 2 0.0 3 3000017501 3819 21 987624 195992 12d 302 007 30101 3000010002 3/4/2016 1 16V2
BK 3 1 302 21 3002 13 33 11201 L118 84 1000 3 2 1B JAY STREET M3-1 N U4 7 1 CONSOLIDED EDISON CO/ 387060 0 0 0 0 0 0 0 0 0 7 15 0 0 0 1051.00 469.00 0 0 0 Y 5 5 0 0 0 0 0 0 0 0.00 0.0 2 0.0 3 3000030001 0 21 988568 196151 12d 302 007 30101 0 1 16V2
BK 3 5 302 21 NA 13 33 11201 L118 84 1000 3 2 1B JAY STREET M3-1 N T2 7 0 C NYC DSBS 6384 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0.00 0.00 0 0 0 N 5 5 21150 21150 21150 21150 0 0 0 DUMBO Historic District 0.00 0.0 2 0.0 3 3000030005 0 21 NA NA 12d 302 007 30101 0 4 16V2
BK 3 35 302 21 NA 13 33 11201 L118 84 1000 3 2 1B GOLD STREET M3-1 N T2 7 0 C NYC DSBS 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0.00 0.00 0 0 0 N 5 5 1800 1800 1800 1800 0 0 0 0.00 0.0 2 0.0 3 3000030035 0 21 NA NA 302 007 30101 0 2 16V2
kable(head(BX))
Borough Block Lot CD CT2010 CB2010 SchoolDist Council ZipCode FireComp PolicePrct HealthArea SanitBoro SanitDistrict SanitSub Address ZoneDist1 ZoneDist2 ZoneDist3 ZoneDist4 Overlay1 Overlay2 SPDist1 SPDist2 SPDist3 LtdHeight SplitZone BldgClass LandUse Easements OwnerType OwnerName LotArea BldgArea ComArea ResArea OfficeArea RetailArea GarageArea StrgeArea FactryArea OtherArea AreaSource NumBldgs NumFloors UnitsRes UnitsTotal LotFront LotDepth BldgFront BldgDepth Ext ProxCode IrrLotCode LotType BsmtCode AssessLand AssessTot ExemptLand ExemptTot YearBuilt YearAlter1 YearAlter2 HistDist Landmark BuiltFAR ResidFAR CommFAR FacilFAR BoroCode BBL CondoNo Tract2010 XCoord YCoord ZoneMap ZMCode Sanborn TaxMap EDesigNum APPBBL APPDate PLUTOMapID Version
BX 2260 1 201 19 1022 7 8 10454 L029 40 4700 2 1 2A 122 BRUCKNER BOULEVARD M1-5/R8A MX-1 NA NA NA N Z9 NA 0 122 BRUCKNER PARTNERS 15000 0 0 0 0 0 0 0 0 0 7 1 0 0 0 75.00 200 0 0 0 N 3 5 130500 152100 0 0 0 0 0 0.00 6.02 5 6.5 2 2022600001 0 19 1005957 232162 6b 209S016 20901 E-143 0 1 16V2
BX 2260 4 201 19 1022 7 8 10454 L029 40 4700 2 1 2A 126 BRUCKNER BOULEVARD M1-5/R8A MX-1 NA NA NA N G5 7 0 24 INDIAN HEAD HOLDIN 13770 752 752 0 272 0 0 480 0 0 2 2 1 0 1 137.58 100 16 16 0 N 5 5 117000 293850 0 18225 1931 1994 0 0.05 6.02 5 6.5 2 2022600004 0 19 1006076 232156 6b 209S016 20901 E-143 0 1 16V2
BX 2260 10 201 19 1022 7 8 10454 L029 40 4700 2 1 2A 138 BRUCKNER BOULEVARD M1-5/R8A MX-1 NA NA NA N F9 6 0 P ANJOST CORP 35000 39375 39375 0 0 0 0 0 39375 0 2 1 2 0 1 175.00 200 175 200 0 N 4 5 153000 822150 0 0 1931 0 0 1.13 6.02 5 6.5 2 2022600010 0 19 1006187 232036 6b 209S016 20901 E-143 0 1 16V2
BX 2260 17 201 19 1022 7 8 10454 L029 40 4700 2 1 2A 144 BRUCKNER BOULEVARD M1-5/R8A MX-1 NA NA NA N E9 6 0 144 BRUCKNER LLC 2500 10625 10625 0 0 2125 0 0 8500 0 2 1 5 0 1 25.00 100 25 85 0 N 5 5 33750 227250 0 0 1931 2001 0 4.25 6.02 5 6.5 2 2022600017 0 19 1006299 232033 6b 209S016 20901 E-143 0 1 16V2
BX 2260 18 201 19 1022 7 8 10454 L029 40 4700 2 1 2A 148 BRUCKNER BOULEVARD M1-5/R8A MX-1 NA NA NA N C7 4 0 P 148 BRUCKNER LLC 1875 8595 1719 6876 0 1719 0 0 0 0 2 1 5 8 10 25.00 75 25 70 0 N 3 2 17316 108227 0 44548 1920 2009 0 4.58 6.02 5 6.5 2 2022600018 0 19 1006363 232040 6b 209S016 20901 E-143 0 1 16V2
BX 2260 19 201 19 1022 7 8 10454 L029 40 4700 2 1 2A ST ANNS AVENUE M1-5/R8A MX-1 NA NA NA N G7 10 0 138 BRUCKNER BLVD. AS 15625 0 0 0 0 0 0 0 0 0 7 0 0 0 0 175.00 100 0 0 0 Y 3 5 147150 147150 0 0 0 0 0 0.00 6.02 5 6.5 2 2022600019 0 19 1006303 231950 6b 209S016 20901 E-143 2022600019 12/31/1990 1 16V2

As instructed, we will combine the four datasets as one dataframe.

df <- rbind.data.frame(BK, BX, MN, QN, SI)
kable(head(df))
Borough Block Lot CD CT2010 CB2010 SchoolDist Council ZipCode FireComp PolicePrct HealthArea SanitBoro SanitDistrict SanitSub Address ZoneDist1 ZoneDist2 ZoneDist3 ZoneDist4 Overlay1 Overlay2 SPDist1 SPDist2 SPDist3 LtdHeight SplitZone BldgClass LandUse Easements OwnerType OwnerName LotArea BldgArea ComArea ResArea OfficeArea RetailArea GarageArea StrgeArea FactryArea OtherArea AreaSource NumBldgs NumFloors UnitsRes UnitsTotal LotFront LotDepth BldgFront BldgDepth Ext ProxCode IrrLotCode LotType BsmtCode AssessLand AssessTot ExemptLand ExemptTot YearBuilt YearAlter1 YearAlter2 HistDist Landmark BuiltFAR ResidFAR CommFAR FacilFAR BoroCode BBL CondoNo Tract2010 XCoord YCoord ZoneMap ZMCode Sanborn TaxMap EDesigNum APPBBL APPDate PLUTOMapID Version
BK 1 1 302 21 NA 13 33 11201 L118 84 1000 3 2 1B JOHN STREET M3-1 M1-4/R8A MX-2 Y V1 11 1 P BROOKLYN BRIDGE PARK 151930 0 0 0 0 0 0 0 0 0 4 0 0 0 0 206.25 337.42 0 0 0 Y 5 5 1164645 1164645 0 0 0 0 0 DUMBO Historic District 0.00 0.0 2 0.0 3 3000010001 0 21 NA NA 12d 302 007 30101 3000010001 11/26/2013 1 16V2
BK 1 50 302 21 2000 13 33 11201 L118 84 1000 3 2 1B 10 JAY STREET M1-4/R8A M3-1 MX-2 Y E9 6 0 SAFDI PLAZA REALTY IN 19682 154400 154400 0 17160 0 0 68620 0 68620 2 1 9 0 90 109.08 207.25 88 195 1 Y 3 0 834300 12019950 0 290250 1920 1994 2007 DUMBO Historic District 7.84 5.4 2 6.5 3 3000010050 0 21 987838 195989 12d 302 007 30101 E-231 0 1 16V2
BK 1 7501 302 21 2000 13 33 11201 L118 84 1000 1 2 1 JOHN STREET M3-1 N RM 4 1 0 102534 4686 97848 0 4686 0 0 0 0 7 1 12 42 44 0.00 0.00 0 0 0 N 0 5 389696 5864853 0 0 0 0 0 0.00 0.0 2 0.0 3 3000017501 3819 21 987624 195992 12d 302 007 30101 3000010002 3/4/2016 1 16V2
BK 3 1 302 21 3002 13 33 11201 L118 84 1000 3 2 1B JAY STREET M3-1 N U4 7 1 CONSOLIDED EDISON CO/ 387060 0 0 0 0 0 0 0 0 0 7 15 0 0 0 1051.00 469.00 0 0 0 Y 5 5 0 0 0 0 0 0 0 0.00 0.0 2 0.0 3 3000030001 0 21 988568 196151 12d 302 007 30101 0 1 16V2
BK 3 5 302 21 NA 13 33 11201 L118 84 1000 3 2 1B JAY STREET M3-1 N T2 7 0 C NYC DSBS 6384 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0.00 0.00 0 0 0 N 5 5 21150 21150 21150 21150 0 0 0 DUMBO Historic District 0.00 0.0 2 0.0 3 3000030005 0 21 NA NA 12d 302 007 30101 0 4 16V2
BK 3 35 302 21 NA 13 33 11201 L118 84 1000 3 2 1B GOLD STREET M3-1 N T2 7 0 C NYC DSBS 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0.00 0.00 0 0 0 N 5 5 1800 1800 1800 1800 0 0 0 0.00 0.0 2 0.0 3 3000030035 0 21 NA NA 302 007 30101 0 2 16V2
str(df)
## 'data.frame':    858370 obs. of  84 variables:
##  $ Borough      : chr  "BK" "BK" "BK" "BK" ...
##  $ Block        : int  1 1 1 3 3 3 5 5 6 6 ...
##  $ Lot          : int  1 50 7501 1 5 35 1 8 1 10 ...
##  $ CD           : int  302 302 302 302 302 302 302 302 302 302 ...
##  $ CT2010       : num  21 21 21 21 21 21 21 21 21 21 ...
##  $ CB2010       : int  NA 2000 2000 3002 NA NA 3001 NA 3000 NA ...
##  $ SchoolDist   : int  13 13 13 13 13 13 13 13 13 13 ...
##  $ Council      : int  33 33 33 33 33 33 33 33 33 33 ...
##  $ ZipCode      : int  11201 11201 11201 11201 11201 11201 11201 11201 11201 11201 ...
##  $ FireComp     : chr  "L118" "L118" "L118" "L118" ...
##  $ PolicePrct   : int  84 84 84 84 84 84 84 84 84 84 ...
##  $ HealthArea   : int  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ SanitBoro    : int  3 3 1 3 3 3 3 3 3 3 ...
##  $ SanitDistrict: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ SanitSub     : chr  "1B" "1B" "" "1B" ...
##  $ Address      : chr  "        JOHN STREET" "10 JAY STREET" "1 JOHN STREET" "        JAY STREET" ...
##  $ ZoneDist1    : chr  "M3-1" "M1-4/R8A" "M3-1" "M3-1" ...
##  $ ZoneDist2    : chr  "M1-4/R8A" "M3-1" "" "" ...
##  $ ZoneDist3    : chr  "" "" "" "" ...
##  $ ZoneDist4    : chr  "" "" "" "" ...
##  $ Overlay1     : chr  "" "" "" "" ...
##  $ Overlay2     : chr  "" "" "" "" ...
##  $ SPDist1      : chr  "MX-2" "MX-2" "" "" ...
##  $ SPDist2      : chr  "" "" "" "" ...
##  $ SPDist3      : chr  "" "" "" "" ...
##  $ LtdHeight    : chr  "" "" "" "" ...
##  $ SplitZone    : chr  "Y" "Y" "N" "N" ...
##  $ BldgClass    : chr  "V1" "E9" "RM" "U4" ...
##  $ LandUse      : int  11 6 4 7 7 7 7 7 7 7 ...
##  $ Easements    : int  1 0 1 1 0 0 0 0 1 0 ...
##  $ OwnerType    : chr  "P" "" "" "" ...
##  $ OwnerName    : chr  "BROOKLYN BRIDGE PARK" "SAFDI PLAZA REALTY IN" "" "CONSOLIDED EDISON CO/" ...
##  $ LotArea      : int  151930 19682 0 387060 6384 0 0 0 323307 66889 ...
##  $ BldgArea     : int  0 154400 102534 0 0 0 0 0 0 0 ...
##  $ ComArea      : int  0 154400 4686 0 0 0 0 0 0 0 ...
##  $ ResArea      : int  0 0 97848 0 0 0 0 0 0 0 ...
##  $ OfficeArea   : int  0 17160 0 0 0 0 0 0 0 0 ...
##  $ RetailArea   : int  0 0 4686 0 0 0 0 0 0 0 ...
##  $ GarageArea   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ StrgeArea    : int  0 68620 0 0 0 0 0 0 0 0 ...
##  $ FactryArea   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherArea    : int  0 68620 0 0 0 0 0 0 0 0 ...
##  $ AreaSource   : int  4 2 7 7 7 7 7 7 7 7 ...
##  $ NumBldgs     : int  0 1 1 15 0 0 4 0 15 0 ...
##  $ NumFloors    : num  0 9 12 0 0 0 0 0 0 0 ...
##  $ UnitsRes     : int  0 0 42 0 0 0 0 0 0 0 ...
##  $ UnitsTotal   : int  0 90 44 0 0 0 0 0 0 0 ...
##  $ LotFront     : num  206 109 0 1051 0 ...
##  $ LotDepth     : num  337 207 0 469 0 ...
##  $ BldgFront    : num  0 88 0 0 0 0 0 0 0 0 ...
##  $ BldgDepth    : num  0 195 0 0 0 0 0 0 0 0 ...
##  $ Ext          : chr  "" "" "" "" ...
##  $ ProxCode     : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ IrrLotCode   : chr  "Y" "Y" "N" "Y" ...
##  $ LotType      : int  5 3 0 5 5 5 3 5 3 5 ...
##  $ BsmtCode     : int  5 0 5 5 5 5 5 5 5 5 ...
##  $ AssessLand   : num  1164645 834300 389696 0 21150 ...
##  $ AssessTot    : num  1164645 12019950 5864853 0 21150 ...
##  $ ExemptLand   : num  0 0 0 0 21150 ...
##  $ ExemptTot    : num  0 290250 0 0 21150 ...
##  $ YearBuilt    : int  0 1920 0 0 0 0 0 0 0 0 ...
##  $ YearAlter1   : int  0 1994 0 0 0 0 0 0 0 0 ...
##  $ YearAlter2   : int  0 2007 0 0 0 0 0 0 0 0 ...
##  $ HistDist     : chr  "DUMBO Historic District" "DUMBO Historic District" "" "" ...
##  $ Landmark     : chr  "" "" "" "" ...
##  $ BuiltFAR     : num  0 7.84 0 0 0 0 0 0 0 0 ...
##  $ ResidFAR     : num  0 5.4 0 0 0 0 0 0 0 0 ...
##  $ CommFAR      : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ FacilFAR     : num  0 6.5 0 0 0 0 0 0 0 0 ...
##  $ BoroCode     : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ BBL          : num  3e+09 3e+09 3e+09 3e+09 3e+09 ...
##  $ CondoNo      : int  0 0 3819 0 0 0 0 0 0 0 ...
##  $ Tract2010    : int  21 21 21 21 21 21 21 21 21 21 ...
##  $ XCoord       : int  NA 987838 987624 988568 NA NA 989346 NA 989745 NA ...
##  $ YCoord       : int  NA 195989 195992 196151 NA NA 196266 NA 196040 NA ...
##  $ ZoneMap      : chr  "12d" "12d" "12d" "12d" ...
##  $ ZMCode       : chr  "" "" "" "" ...
##  $ Sanborn      : chr  "302 007" "302 007" "302 007" "302 007" ...
##  $ TaxMap       : int  30101 30101 30101 30101 30101 30101 30101 30101 30101 30101 ...
##  $ EDesigNum    : chr  "" "E-231" "" "" ...
##  $ APPBBL       : num  3e+09 0e+00 3e+09 0e+00 0e+00 ...
##  $ APPDate      : chr  "11/26/2013" "" "3/4/2016" "" ...
##  $ PLUTOMapID   : int  1 1 1 1 4 2 1 4 1 1 ...
##  $ Version      : chr  "16V2" "16V2" "16V2" "16V2" ...
  • Checking for any missing values.
missing_value <- missmap(df, main= "Plot Showing The Columns With Missing Values")

missing_value
## NULL
ggsave(filename = "Ganiyu_Musa_2_hw2.pdf", plot = missing_value, width = 6, height = 7)

The above plot shows that we have a lot of missing values. Lets deal with that first!

df2 <- df[complete.cases(df), ]
newdata <- na.omit(df2)

After the removal of the missing values, we will then subset the dataset from 1850 downward.

year_built <- filter(newdata, YearBuilt >=1850)
kable(head(year_built))
Borough Block Lot CD CT2010 CB2010 SchoolDist Council ZipCode FireComp PolicePrct HealthArea SanitBoro SanitDistrict SanitSub Address ZoneDist1 ZoneDist2 ZoneDist3 ZoneDist4 Overlay1 Overlay2 SPDist1 SPDist2 SPDist3 LtdHeight SplitZone BldgClass LandUse Easements OwnerType OwnerName LotArea BldgArea ComArea ResArea OfficeArea RetailArea GarageArea StrgeArea FactryArea OtherArea AreaSource NumBldgs NumFloors UnitsRes UnitsTotal LotFront LotDepth BldgFront BldgDepth Ext ProxCode IrrLotCode LotType BsmtCode AssessLand AssessTot ExemptLand ExemptTot YearBuilt YearAlter1 YearAlter2 HistDist Landmark BuiltFAR ResidFAR CommFAR FacilFAR BoroCode BBL CondoNo Tract2010 XCoord YCoord ZoneMap ZMCode Sanborn TaxMap EDesigNum APPBBL APPDate PLUTOMapID Version
BK 1 50 302 21 2000 13 33 11201 L118 84 1000 3 2 1B 10 JAY STREET M1-4/R8A M3-1 MX-2 Y E9 6 0 SAFDI PLAZA REALTY IN 19682 154400 154400 0 17160 0 0 68620 0 68620 2 1 9 0 90 109.08 207.25 88.00 195 1 Y 3 0 834300 12019950 0 290250 1920 1994 2007 DUMBO Historic District 7.84 5.4 2 6.5 3 3000010050 0 21 987838 195989 12d 302 007 30101 E-231 0 1 16V2
BK 7 21 302 21 2008 13 33 11201 E205 84 1000 3 2 1B 1 PLYMOUTH STREET M3-1 N G1 10 0 C DEPT OF ENVIRONMENTAL 10650 9585 9585 0 0 0 0 0 0 9585 2 2 1 0 1 102.17 213.17 69.00 113 0 Y 3 5 129600 309600 129600 309600 1988 1988 0 0.90 0.0 2 0.0 3 3000070021 0 21 987216 195803 12d 302 006 30101 0 1 16V2
BK 16 5 302 21 1001 13 33 11201 E205 84 1000 3 2 1B 65 PLYMOUTH STREET PARK N Q0 9 0 C PARKS AND RECREATION 44000 5000 5000 0 0 0 0 0 0 5000 2 2 1 0 1 100.00 460.00 50.00 165 0 Y 5 5 178200 201600 178200 201600 1920 0 0 0.11 0.0 0 0.0 3 3000160005 0 21 986856 195975 12d 302 006 30101 0 1 16V2
BK 18 1 302 21 2002 13 33 11201 L118 84 1000 3 2 1B 135 PLYMOUTH STREET M1-4/R8A MX-2 N D5 3 0 DON BEN R CO 41250 211386 37360 174026 0 37360 0 0 0 0 2 1 7 100 102 206.25 200.00 156.00 48 0 N 3 5 514350 6977700 0 0 1900 2014 0 DUMBO Historic District 5.12 5.4 2 6.5 3 3000180001 0 21 987577 195811 12d 302 006 30101 E-231 0 1 16V2
BK 19 1 302 21 2003 13 33 11201 L118 84 1000 3 2 1B 20 JAY STREET M1-4/R8A MX-2 N O6 5 0 P DUMBO PARTNERS B LLC 41400 460000 460000 0 368980 3421 41400 46199 0 0 2 1 11 0 42 200.00 207.00 200.00 207 0 Y 3 5 972000 26132850 0 7794360 1911 1999 0 DUMBO Historic District 11.11 5.4 2 6.5 3 3000190001 0 21 987835 195800 12d 302 007 30101 E-231 0 1 16V2
BK 20 1 302 21 3008 13 33 11201 L118 84 1000 3 2 1B 35 JAY STREET M1-4/R8A MX-2 N E9 6 0 FORMAN FERRY, LLC 18955 21735 21735 0 0 0 0 21735 0 0 2 1 1 0 2 99.33 150.00 99.33 150 0 Y 3 5 214650 1484550 0 0 1977 2012 0 DUMBO Historic District 1.15 5.4 2 6.5 3 3000200001 0 21 988077 195754 12d 302 007 30101 E-231 0 1 16V2

As instructed, here is the list of top 10 number of houses built in a particular year. Where 1920 had the most house built.

pp <- year_built %>%
  group_by(YearBuilt, Borough) %>%
  summarize(Year_Built_Count = n()) %>%
  arrange(desc(Year_Built_Count))


kable(head(pp))
YearBuilt Borough Year_Built_Count
1920 BK 30726
1930 BK 29400
1925 BK 28345
1910 BK 23033
1899 BK 19390
1931 BK 19155
top10 <- top_n(pp, 10, Year_Built_Count)
top10
## # A tibble: 164 x 3
## # Groups:   YearBuilt [164]
##    YearBuilt Borough Year_Built_Count
##        <int>   <chr>            <int>
##  1      1920      BK            30726
##  2      1930      BK            29400
##  3      1925      BK            28345
##  4      1910      BK            23033
##  5      1899      BK            19390
##  6      1931      BK            19155
##  7      1901      BK            14808
##  8      1960      BK            10952
##  9      1940      BK             8061
## 10      1915      BK             5922
## # ... with 154 more rows

A plot giving more explanation.

myplot <- qplot(YearBuilt, Year_Built_Count, data = top10, color = Borough)
ggplotly(myplot) %>% add_lines()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
p4 <- ggplot(pp, aes(x = YearBuilt, y = Year_Built_Count)) +
  geom_point() +
  ggtitle(" Year Against Count")

Using a Tufte plot.

p4 + geom_rangeframe() +
  theme_tufte() + 
  scale_x_continuous(breaks = extended_range_breaks()(pp$YearBuilt)) +
  scale_y_continuous(breaks = extended_range_breaks()(pp$YearBuilt))

ggsave(filename = "Ganiyu_Musa_5_hw2.pdf", plot = p4, width = 6, height = 7)

We want to check for any outliers in the plot.

outlier_values1 <- boxplot.stats(pp$Year_Built_Count)$out

outlier_values1
##  [1] 30726 29400 28345 23033 19390 19155 14808 10952  8061  5922  5911
## [12]  5574  5230  5212  4042  3765  2453  2445  1794  1670  1597
outlier_values2 <- boxplot.stats(pp$YearBuilt)$out
outlier_values2
## integer(0)

Above is the list of outliers.

p <- ggplot(top10, aes(x=YearBuilt, y=Year_Built_Count))
p + geom_boxplot(outlier.colour = "red", outlier.shape = 1)

ggsave(filename = "Ganiyu_Musa_4_hw2.pdf", plot = p, width = 6, height = 7)

More plot using box plot and Tufte boxplot.

p + geom_tufteboxplot()

gg <- ggplot(top10, aes(YearBuilt, Year_Built_Count)) +
  geom_line(aes(colour = Borough)) +
  geom_text(aes(label = YearBuilt), data = top10) +
  labs(title = "Graph showing top 10 houses built in a particular year") +
  geom_smooth(se = TRUE)
gg
## `geom_smooth()` using method = 'loess'

ggsave(filename = "Ganiyu_Musa_hw2_5.pdf", plot = gg, width = 6, height = 7)
## `geom_smooth()` using method = 'loess'
  • Is there anything in the results that causes you to question the accuracy of the data?

  • Answer: From the above graph, we can see that only the Borough BK (Brooklyn) building was built many years ago i.e all the top 10 are buildings are from Brooklyn.

m <- ggplot(top10, aes(YearBuilt, Year_Built_Count)) + coord_flip()
m + geom_bar(stat = "identity")

Question 2:

The city is particularly worried about buildings that were unusually tall when they were built, since best-practices for safety hadn’t yet been determined. Create a graph that shows how many buildings of a certain number of floors were built in each year (note: you may want to use a log scale for the number of buildings). It should be clear when 20-story buildings, 30-story buildings, and 40-story buildings were first built in large numbers.

num_floor <- year_built %>%
  group_by(NumFloors, YearBuilt, Borough) %>%
  summarize(Year_Built_Count = n()) %>%
  subset(NumFloors==20 | NumFloors==30 | NumFloors==40) %>%
  arrange(desc(NumFloors))

kable(num_floor)
NumFloors YearBuilt Borough Year_Built_Count
40 2005 BK 1
30 1910 BK 2
30 1929 BK 1
30 2007 BK 1
20 1930 BK 1
20 1948 BK 1
20 1960 BK 2
20 1962 BK 1
20 1964 BK 1
20 1965 BK 1
20 1968 BK 1
20 1970 BK 1
20 1971 BK 2
20 1972 BK 1
20 1979 BK 1
20 2003 BK 1
20 2007 BK 1

The above table shows the number of floors in group and their respective year built.

rr <-ggplot(num_floor, aes(NumFloors, YearBuilt)) +
  geom_col(aes(colour = Borough)) +
  geom_text(aes(label = YearBuilt), data = num_floor, position = position_stack(vjust = 0.5)) +
  labs(title = "Graph Showing The Year When 20, 30 & 40 Floors Was Built") + coord_flip()

rr

ggsave(filename = "Ganiyu_Musa_6_hw2.pdf", plot = rr, width = 6, height = 7)

Question 3:

Your boss suspects that buildings constructed during the US’s involvement in World War II (1941-1945) are more poorly constructed than those before and after the way due to the high cost of materials during those years. She thinks that, if you calculate assessed value per floor, you will see lower values for buildings at that time vs before or after. Construct a chart/graph to see if she’s right.

  • We are going to use dplyr to extract the data for World War II era, pre/before world war and post/after World War II.
world_war2 <- df %>%
  group_by(AssessTot, YearBuilt, NumFloors) %>%
  summarize(Year_Built_Count = n()) %>%
  mutate(AssessTot_per_floor = AssessTot / NumFloors) %>%
  subset(YearBuilt >= 1941 & YearBuilt <= 1945) %>%
  arrange(desc(AssessTot))

kable(head(world_war2))
AssessTot YearBuilt NumFloors Year_Built_Count AssessTot_per_floor
612072900 1945 13 1 47082531
102214800 1943 1 1 102214800
82368297 1941 9 1 9152033
60231600 1942 4 1 15057900
56928600 1941 11 1 5175327
46424448 1941 9 1 5158272
pre_world_war2 <- df %>%
  group_by(AssessTot, YearBuilt, NumFloors) %>%
  summarize(Year_Built_Count = n()) %>%
  mutate(AssessTot_per_floor = AssessTot / NumFloors) %>%
  subset(YearBuilt < 1941) %>%
  arrange(desc(AssessTot))

kable(head(pre_world_war2))
AssessTot YearBuilt NumFloors Year_Built_Count AssessTot_per_floor
3302640000 0 0 1 Inf
2719492200 1933 1 1 2719492200
2539648350 0 0 1 Inf
1356546600 0 0 1 Inf
946134000 0 0 1 Inf
648205652 1937 70 1 9260081
post_world_war2 <- df %>%
  group_by(AssessTot, YearBuilt, NumFloors) %>%
  summarize(Year_Built_Count = n()) %>%
  mutate(AssessTot_per_floor = AssessTot / NumFloors) %>%
  subset(YearBuilt > 1945) %>%
  arrange(desc(AssessTot))

kable(head(post_world_war2))
AssessTot YearBuilt NumFloors Year_Built_Count AssessTot_per_floor
6798347325 1994 3 1 2266115775
1299184650 2009 104 1 12492160
927978750 2006 6 1 154663125
771809400 2005 54 1 14292767
766783350 1968 50 1 15335667
717062052 2004 54 1 13278927

Data cleansing and manipulation for plotting (Note tha we reduced the dataset to 9000 observation for easier combination)

world_war2_2 <- world_war2[sample(1:nrow(world_war2), 9000, replace=FALSE),] 
pre_world_war2_2 <- pre_world_war2[sample(1:nrow(pre_world_war2), 9000, replace=FALSE),] 
Post_world_war2_2 <- post_world_war2[sample(1:nrow(post_world_war2), 9000, replace=FALSE),] 



combined <- cbind.data.frame(log(world_war2_2$AssessTot_per_floor), log(pre_world_war2_2$AssessTot_per_floor), log(Post_world_war2_2$AssessTot_per_floor), world_war2_2$YearBuilt)

colnames(combined) <- c("WorldWarII", "Pre_WorldWar", "Post_WorldWar", "YearBuilt")
  • Plot showing
dfplot <- combined %>% gather(key, value, -YearBuilt)
plt <- ggplot(dfplot, mapping = aes(x = YearBuilt, y = value, color = key) ) + geom_point() + labs(title = "Graph showing the relationship between World War II houses \n price per floor against Pre & Post War", subtitle = "Converted to Logarithms")

plt + coord_cartesian(xlim = c(1800, 1945), ylim = c(0, 40)) + theme_classic() + coord_flip()

ggsave(filename = "Ganiyu_Musa_hw2.pdf", plot = plt, width = 6, height = 7)