We will be working with data about every tax lot in NYC, in a zip file. Please download it from http://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page. You will need to combine the data from each boro into one file called ‘all_PLUTO_data.R’ in order to complete this assignment. I encourage you to do this via code but it is not necessary to submit such code.
if (file.exists("HWData")) {
unzip("HWData")
BK = read.csv("BORO_zip_files_csv\\BK.csv")
BX = read.csv("BORO_zip_files_csv\\BX.csv")
MN = read.csv("BORO_zip_files_csv\\MN.csv")
QN = read.csv("BORO_zip_files_csv\\QN.csv")
SI = read.csv("BORO_zip_files_csv\\SI.csv")
all_pluto = rbind(BK,BX,MN,QN,SI)
} else{
download.file("http://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_pluto_16v2%20.zip", destfile="HWData")
unzip("HWData")
BK = read.csv("BORO_zip_files_csv\\BK.csv")
BX = read.csv("BORO_zip_files_csv\\BX.csv")
MN = read.csv("BORO_zip_files_csv\\MN.csv")
QN = read.csv("BORO_zip_files_csv\\QN.csv")
SI = read.csv("BORO_zip_files_csv\\SI.csv")
all_pluto = rbind(BK,BX,MN,QN,SI)
}
#Verify number of Columns in data set
ncol(all_pluto)
## [1] 84
# a quick summary identifies min being 0 no null present and a max year of 2040 in data set.
summary(all_pluto$YearBuilt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1920 1930 1841 1955 2040
# Quick plot to verify data distribution
hist(all_pluto$YearBuilt,25)
# I will subset the data to remove data with 0,2040.
q1data <- all_pluto %>%
filter(YearBuilt >= 1850, YearBuilt != 2040) %>%
select(YearBuilt)
q1data$Count = 1
summary(q1data)
## YearBuilt Count
## Min. :1850 Min. :1
## 1st Qu.:1920 1st Qu.:1
## Median :1931 Median :1
## Mean :1941 Mean :1
## 3rd Qu.:1960 3rd Qu.:1
## Max. :2016 Max. :1
plotq1 = condense(bin(q1data$YearBuilt,10))
#autoplot(plotq1)+ ylab('YearBuilt')+theme_stata() theme_tufte()
#label = unit_format(unit = "K")
The majority of the buildings build after 1850 occurred between 1920 and 1939. Missing values did not raise concern regarding the accuracy of data because of the large sample size.
ggplot(plotq1, aes(x=plotq1$q1data.YearBuilt,y=plotq1$.count)) +
geom_bar(stat="identity",fill="sky blue")+
scale_x_continuous(breaks = seq(min(1850), max(2016), by = 10))+
scale_y_continuous(label = comma,breaks = seq(min(0), max(200000), by = 25000))+
labs(title = "Buildings Constructed 1850-2016", x= "Year", y= "Count")+
theme(plot.title = element_text(hjust = 0.5,size=20))+theme_pander()
ggsave('Question1.png')
q2data = all_pluto %>%
filter(YearBuilt >= 1850, YearBuilt != 2040,NumFloors >= 20) %>%
mutate(NumFloors = round(NumFloors,0)) %>%
select(YearBuilt,NumFloors)
plotq2 = condense(bin(q2data$YearBuilt, 1), bin(q2data$NumFloors, 1))
The chart below identifies that 1895 was the start of high rise buildings. This trend pick up from 1920-1935, slowed down between the great depression and World War 2.
autoplot(plotq2)+
scale_fill_gradient("Count", low = "sky blue", high = "red")+
scale_x_continuous(breaks = seq(min(1850), max(2016), by = 10))+
scale_y_continuous(label = comma,breaks = seq(min(20), max(120), by = 10))+
labs(title = "Buildings Constructed by Number of Floors", x= "Year", y= "Floors")+
theme(plot.title = element_text(hjust = 0.5,size=20))
ggsave('Question2.png')
q3data = all_pluto %>%
filter(YearBuilt >= 1932,YearBuilt <= 1954,NumFloors > 1,AssessTot > 0) %>%
mutate(NumFloors = round(NumFloors,0),FloorValN = round(AssessTot/NumFloors,0),FloorValM = (AssessTot/NumFloors)/1000000 )%>%
select(YearBuilt,FloorValN,FloorValM)
#write.csv(q3data,file = "mydata.csv")
#plotq2 = condense(bin(q2data$YearBuilt, 1), bin(q2data$NumFloors, 1))
Running a cumulative plot of value per floor indicates that the suspicion was true, and the values surrounding years 1941-1945 were lower with the exception of 1945.
ggplot(q3data, aes(x=q3data$YearBuilt,y=q3data$FloorValM)) +
geom_bar(stat="identity",fill="sky blue")+
scale_x_continuous(breaks = seq(min(1932), max(1954), by = 2))+
scale_y_continuous(label = comma,breaks = seq(min(0), max(1300), by = 50))+
labs(title = "Cumulative Value Per Floor by Year", x= "Year", y= "Value Millions")+
theme(plot.title = element_text(hjust = 0.5,size=20))+theme_pander()
ggsave('Question3a.png')
#gc() 1941-1945
q3bdata = all_pluto %>%
filter(YearBuilt >= 1937,YearBuilt <= 1949,NumFloors > 1,AssessTot > 0) %>%
mutate(NumFloors = round(NumFloors,0),FloorValM = (AssessTot/NumFloors)/1000000 )%>%
select(YearBuilt,FloorValM)
plotq4 = condense(bin(q3bdata$YearBuilt, 1), bin(q3bdata$FloorValM, 1))
autoplot(plotq4)+
scale_fill_gradient("Count", low = "sky blue", high = "red")+
scale_x_continuous(breaks = seq(min(1937), max(1949), by = 1))+
scale_y_continuous(label = comma,breaks = seq(min(0), max(50), by = 5))+
labs(title = "Value Per Building", x= "Year", y= "Value Millions")+
theme(plot.title = element_text(hjust = 0.5,size=20))+theme_pander()
ggsave('Question3b.png')