#using system() to run bash commands in R
#used the csvkit by Christopher Groskopf
#my first bash scripting experience!
system(paste("cd ", getwd(), "/nyc_pluto_15v1", sep=""))
#list of boros
boros = c("BK","BX","Mn","QN","SI")
#create empty data frame to fill with data
build_data = data_frame()
#same procedure for each borough file
for(each in boros){
#select the columns I want to use for this homework
columns = "Borough,ZipCode,YearBuilt,NumFloors,AssessLand,AssessTot"
bash_script = sprintf("csvcut -c %s nyc_pluto_15v1/%s.csv > temp%s.csv",
columns, each, each)
#run the bash script
system(bash_script)
#create full file name
file_name = paste("temp", each, ".csv", sep="")
#using data.table's fread function
temp_data = fread(file_name, data.table=FALSE)
#combine the data with all boroughs, only years after 1850 and none from the future
build_data = rbind(build_data, filter(temp_data,
YearBuilt > 1850,
YearBuilt < 2016))
}
#sample data to see if this worked
kable(build_data[sample(nrow(build_data), 5),])
| Borough | ZipCode | YearBuilt | NumFloors | AssessLand | AssessTot | |
|---|---|---|---|---|---|---|
| 779813 | SI | 10312 | 1950 | 1.5 | 7844 | 22314 |
| 625148 | QN | 11412 | 1945 | 2.0 | 10200 | 19440 |
| 156441 | BK | 11220 | 1905 | 2.0 | 10576 | 32271 |
| 129059 | BK | 11226 | 1920 | 2.0 | 7920 | 12265 |
| 101191 | BK | 11208 | 1930 | 2.0 | 18815 | 23288 |
#remove bad data
build_data = build_data[complete.cases(build_data),]
build_data = build_data %>% filter(NumFloors > 0,
AssessLand > 0,
AssessLand > 0)
#use bigvis to condense the data
year_built = condense(bin(build_data$YearBuilt, 1))
#use ggplot2 to make a chart
pa = autoplot(year_built)
pa = pa + xlab("Year") + ylab("Number of Buildings")
pa = pa + ggtitle("Building Construction by Date")
ggsave("Capofari_HW2_1a.jpeg")
pa
This seems odd. Due to the spikes in the chart, I believe the year’s these buildings were built is not accurate. If we zoom in to see a 20 year span, we can clearly see that the year’s used are dominated by multiples of 5s. A smoother curve is generated when the building data is maintained yearly, like it is over the last 20 years. Also, it seems that an overwhelming amount of buildings were built in 1925, which seems incorrect.
pb = autoplot(year_built[which(year_built > 1909 &
year_built < 1941),])
pb = pb + ylab("Number of Buildings") + xlab("Year")
pb = pb + ggtitle("Building Construction:\n1930 to 1950")
ggsave("Capofari_HW2_1b.jpeg")
pc = autoplot(year_built[year_built > 1984,])
pc = pc + ylab("") + xlab("Year")
pc = pc + ggtitle("Building Construction:\n1985 to the Present")
ggsave("Capofari_HW2_1c.jpeg")
#create a side by side plot
grid.arrange(pb,pc,ncol=2)
By increasing the size of the bins using the condense function from the bigvis package, we can remove the spikes that were present in our oriinal chart.
#to find the 'cut off' date, find the total houses built / 2
total = max(cumsum(year_built$.count))
cutoff = ceiling(total / 2)
#find the year where the cumsum is > than the cutoff total
cutoff_year = year_built[cumsum(year_built$.count) > cutoff,][1,1][[1]]
year_built_10 = condense(bin(build_data$YearBuilt, 10))
pd = autoplot(year_built_10)
pd = pd + xlab("Number of Buildings") + ylab("Year")
pd = pd + ggtitle("Building Construction by Date")
pd = pd + geom_vline(xintercept=cutoff_year,
color="red", linetype="longdash")
pd = pd + annotate("text", x=1933, y=182000,
color="red",
hjust=0,
label=paste(
"Half of all buildings\nwere built after",
cutoff_year))
ggsave("Capofari_HW2_1d.jpeg")
pd
#too much empty space from 1850-1900
#I am going to maximize ink on the page
floor_df = build_data %>%
filter(YearBuilt > 1900) %>%
#look at building floors grouped by 20
mutate(floor_group=floor(NumFloors / 10)) %>%
group_by(YearBuilt, floor_group) %>%
summarize(n=length(YearBuilt))
#group all buildings over 49 floors together
floor_df$floor_group[floor_df$floor_group > 4] = 5
#column for labeling
floor_df$labels = character(nrow(floor_df))
floor_df$labels[floor_df$floor_group == 0] = "10 and less"
floor_df$labels[floor_df$floor_group == 1] = "10-19"
floor_df$labels[floor_df$floor_group == 2] = "20-29"
floor_df$labels[floor_df$floor_group == 3] = "30-39"
floor_df$labels[floor_df$floor_group == 4] = "40-49"
floor_df$labels[floor_df$floor_group == 5] = "50 and more"
floor_chart = ggplot(data=floor_df, aes(YearBuilt, n, fill=labels))
floor_chart = floor_chart + geom_bar(stat="identity")
floor_chart = floor_chart + facet_grid(labels~., scales="free")
floor_chart = floor_chart + ylab("Number of Buildings") + xlab("")
floor_chart = floor_chart + ggtitle("Floors by Year")
floor_chart = floor_chart + scale_y_discrete(breaks=pretty_breaks())
floor_chart = floor_chart + scale_fill_discrete(guide=FALSE)
ggsave("Capofari_HW2_2.jpeg")
floor_chart
price_floor = floor(build_data$AssessTot/build_data$NumFloors)
price_floor_condense = condense(bin(build_data$YearBuilt, 10),
z=price_floor)
P1 = autoplot(price_floor_condense)
P1 = P1 + xlab("Year") + ylab("Average Price per Floor")
P1 = P1 + ggtitle("Un-Smooth Data")
P1 = P1 + scale_y_continuous(labels=dollar)
P1 = P1 + theme(legend.position="none")
ggsave("Capofari_HW2_3a.jpeg")
#smooth
price_floor_smooth = smooth(price_floor_condense,
20, var=".mean", type="mean")
P2 = autoplot(price_floor_smooth)
P2 = P2 + xlab("Year") + ylab("Average Price per Floor")
P2 = P2 + ggtitle("Smooth Data")
P2 = P2 + scale_y_continuous(labels=dollar)
P2 = P2 + theme(legend.position="none")
ggsave("Capofari_HW2_3b.jpeg")
#zoom in
price_floor_condense = condense(bin(build_data$YearBuilt, 1),
z=price_floor)
price_floor_smooth = smooth(price_floor_condense,
20, var=".mean", type="mean")
P3 = autoplot(price_floor_smooth[which(
price_floor_smooth > 1909 & price_floor_smooth < 1960),])
P3 = P3 + xlab("Year") + ylab("Average Price per Floor")
P3 = P3 + ggtitle("Building Data\n1925 - 1960")
P3 = P3 + scale_y_continuous(labels=dollar)
P3 = P3 + theme(legend.position="none")
ggsave("Capofari_HW2_3c.jpeg")
grid.arrange(arrangeGrob(P1,P2),P3,ncol=2)
P4 = autoplot(price_floor_condense[which(
price_floor_smooth > 1909 & price_floor_smooth < 1960),])
P4 = P4 + xlab("Year") + ylab("Average Price per Floor")
P4 = P4 + ggtitle("Building Data\n1925 - 1960")
P4 = P4 + scale_y_continuous(labels=dollar)
P4 = P4 + theme(legend.position="none")
ggsave("Capofari_HW2_3d.jpeg")
P4
Looks like there is a dip during WWII. But wait, if we look at the data before the smoothing process, this dip could be attributed to the large spikes that we can see in 1933.