library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(bit64)
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
## 
## Attaching package: 'bit'
## The following object is masked from 'package:data.table':
## 
##     setattr
## The following object is masked from 'package:base':
## 
##     xor
## Attaching package bit64
## package:bit64 (c) 2011-2012 Jens Oehlschlaegel
## creators: integer64 seq :
## coercion: as.integer64 as.vector as.logical as.integer as.double as.character as.bin
## logical operator: ! & | xor != == < <= >= >
## arithmetic operator: + - * / %/% %% ^
## math: sign abs sqrt log log2 log10
## math: floor ceiling trunc round
## querying: is.integer64 is.vector [is.atomic} [length] format print str
## values: is.na is.nan is.finite is.infinite
## aggregation: any all min max range sum prod
## cumulation: diff cummin cummax cumsum cumprod
## access: length<- [ [<- [[ [[<-
## combine: c rep cbind rbind as.data.frame
## WARNING don't use as subscripts
## WARNING semantics differ from integer
## for more help type ?bit64
## 
## Attaching package: 'bit64'
## The following object is masked from 'package:bit':
## 
##     still.identical
## The following objects are masked from 'package:base':
## 
##     %in%, :, is.double, match, order, rank
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(class)
## Warning: package 'class' was built under R version 3.5.3
library(OneR)
## Warning: package 'OneR' was built under R version 3.5.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(proj4)
## Warning: package 'proj4' was built under R version 3.5.3
library(sf)
## Warning: package 'sf' was built under R version 3.5.3
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
## 
##     wind
library(reticulate)
## Warning: package 'reticulate' was built under R version 3.5.3

Data Collection

# Folder that holds the per-borough PLUTO CSV files.
path <- "C:/Users/Gurpreet/Documents/DATA608/nyc_pluto/PLUTO17v1.1/"

# Two-letter borough prefixes used in the CSV file names.
borough_list <- c("BK", "BX", "MN", "QN", "SI")

# Load every borough file into its own object named pluto_<prefix>.
for (i in borough_list) {
  borough_csv <- paste0(path, i, "2017V11.csv")
  assign(paste0("pluto_", i), fread(borough_csv))
  rm(borough_csv)
}


library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# Stack the five borough tables into one data frame, filling columns that are
# absent from a given table with NA (plyr::rbind.fill).
# The tables are fetched by their exact names rather than with
# ls(pattern = "pluto_"): the pattern would also match pluto_all itself once
# it exists, so re-running this step would append the combined table to itself.
pluto_all <- do.call(
  "rbind.fill",
  unname(mget(paste0("pluto_", borough_list)))
)
#unloadNamespace("plyr")

Data Exploration

# Number of rows and columns in the combined table.
c(nrow(pluto_all), ncol(pluto_all))
## [1] 859223     89
# Frequency of every distinct YearBuilt value.
table(pluto_all[["YearBuilt"]])
## 
##     0  1661  1665  1706  1729  1765  1779  1780  1785  1798  1799  1800 
## 43699     1     1     1     1     1     1     1     2     1     1   165 
##  1801  1802  1804  1805  1807  1812  1814  1816  1821  1822  1823  1824 
##     1     2     1     1     1     1     1     2     2     3     1     6 
##  1825  1826  1827  1829  1830  1831  1832  1833  1834  1835  1836  1837 
##     4     6     4    20    14     6     5     1     7     6     7     2 
##  1838  1839  1840  1841  1842  1843  1844  1845  1846  1847  1848  1849 
##     5     7    21    10    11    11    19    19     9     9    15    20 
##  1850  1851  1852  1853  1854  1855  1856  1857  1858  1859  1860  1861 
##    45     8    14    12     6    13    11     8     7     9    36     3 
##  1862  1863  1864  1865  1866  1867  1868  1869  1870  1871  1872  1873 
##     2     3     3     8     6     5     8    12    58    20     5     5 
##  1874  1875  1876  1877  1878  1879  1880  1881  1882  1883  1884  1885 
##     4    15     2     1     3     3   179    49    11     8    12    25 
##  1886  1887  1888  1889  1890  1891  1892  1893  1894  1895  1896  1897 
##    18    16    18    17   440    49    12    14    13    50    42    19 
##  1898  1899  1900  1901  1902  1903  1904  1905  1906  1907  1908  1909 
##    33 24870  9586 25619   135   181   273  7620   752   647   351   877 
##  1910  1911  1912  1913  1914  1915  1916  1917  1918  1919  1920  1921 
## 46675   704   722   578   598 16398   646   515   254   309 91340  1158 
##  1922  1923  1924  1925  1926  1927  1928  1929  1930  1931  1932  1933 
##  1160  1632  2132 70699  3146  3274  4352  2127 77128 32530  1236   964 
##  1934  1935  1936  1937  1938  1939  1940  1941  1942  1943  1944  1945 
##   371 25433   641   667   761   964 38322   745   323   198   177 24952 
##  1946  1947  1948  1949  1950  1951  1952  1953  1954  1955  1956  1957 
##   345   379   603   754 47803   763   692   570   646 26804   764   944 
##  1958  1959  1960  1961  1962  1963  1964  1965  1966  1967  1968  1969 
##   811   931 37181   819   957  1089   947 19641   611   617   629   508 
##  1970  1971  1972  1973  1974  1975  1976  1977  1978  1979  1980  1981 
## 17139   498   631   852   854  9739   660   728  1009   625  5454   551 
##  1982  1983  1984  1985  1986  1987  1988  1989  1990  1991  1992  1993 
##   778  1161  1333  3473  3738  3800  3597  3607  3507  2261  3169  2458 
##  1994  1995  1996  1997  1998  1999  2000  2001  2002  2003  2004  2005 
##  2130  2557  3464  3162  2761  3724  4190  4534  4180  4431  6529  6012 
##  2006  2007  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017 
##  4857  4085  2907  1809  1198  1403  1488  1362  1637  1675  1443    17 
##  2040 
##     1
# Extremes of YearBuilt.
min(pluto_all[["YearBuilt"]])
## [1] 0
max(pluto_all[["YearBuilt"]])
## [1] 2040
# Extremes of NumFloors.
min(pluto_all[["NumFloors"]])
## [1] 0
max(pluto_all[["NumFloors"]])
## [1] 119

YearBuilt values of 0 and 2040 are data errors or missing data, so we will remove those records.

# Keep only rows whose YearBuilt is not 0, 2040 or missing.
pluto_all <- filter(pluto_all, !(YearBuilt %in% c(0, 2040, NA)))

1. After a few building collapses, the City of New York is going to begin investigating older buildings for safety. The city is particularly worried about buildings that were unusually tall when they were built, since best-practices for safety hadn’t yet been determined. Create a graph that shows how many buildings of a certain number of floors were built in each year (note: you may want to use a log scale for the number of buildings). Find a strategy to bin buildings (It should be clear 20-29-story buildings, 30-39-story buildings, and 40-49-story buildings were first built in large numbers, but does it make sense to continue in this way as you get taller?)

As stated in data dictionary: If the NUMBER OF FLOORS is zero and the NUMBER OF BUILDINGS is greater than zero, then NUMBER OF FLOORS is not available for the tax lot. If the NUMBER OF FLOORS is zero and the NUMBER OF BUILDINGS is zero, then NUMBER OF FLOORS is not applicable for the tax lot.

Since we are focusing on the number of floors, we want to filter out those categories (tax lots whose number of floors is not available or not applicable) for this part of the analysis.

# Count buildings for every (YearBuilt, NumFloors) pair.
# Per the data dictionary, NumFloors == 0 means the floor count is either not
# available or not applicable, so those lots and missing values are dropped.
# The two conditions must both hold (AND); OR-ing them keeps every non-missing
# row and removes nothing.
q1 <- pluto_all %>%
  dplyr::filter(!is.na(NumFloors), NumFloors > 0) %>%
  group_by(YearBuilt, NumFloors) %>%
  dplyr::summarize(num_build = n()) %>%
  ungroup() %>%
  # dplyr:: is explicit because plyr is attached later and masks mutate().
  dplyr::mutate(l_numbuild = log(num_build)) # natural log of the count

# Buildings per year, drawn on a log10 y-axis.
p0 <- ggplot(q1, aes(x = YearBuilt, y = num_build)) +
  geom_line() +
  scale_y_log10()
# Pass the plot explicitly instead of relying on the implicit last plot.
ggplotly(p0)

A look at the plot reveals that from around 1830 and earlier there are not many buildings reported. That might be because the data are missing information due to data entry errors, because only some buildings were recorded in the system, or because the buildings may have been demolished and replaced by newer ones. Based on that inference, we will look at the data from 1830 onward.

We will bin the years into decades and the floors into intervals of 10, then plot the result to find the number of buildings in each decade for each floor range.

# Round a year (or a vector of years) to the nearest multiple of 10.
#
# value: numeric vector of years.
# Returns a numeric vector of the same length; years ending in 5 always round
# up (1985 -> 1990, 1995 -> 2000).
#
# floor(x + 0.5) is used instead of round(): base round() sends exact .5 ties
# to the even digit, which maps 1985 to 1980 but 1995 to 2000 and so produces
# decade bins of unequal width.
round_to_decade <- function(value) {
  floor(value / 10 + 0.5) * 10
}
#https://stackoverflow.com/questions/35352914/floor-a-year-to-the-decade-in-r

# Keep 1830 onward (inclusive, as the text states), then add two bin columns:
#   floorbin - left-closed 10-floor intervals [0,10), [10,20), ..., [110,120)
#   yrbin    - the decade assigned by round_to_decade()
q1 <- q1 %>%
  dplyr::filter(YearBuilt >= 1830) %>%
  dplyr::mutate(
    floorbin = cut(NumFloors, seq(0, 120, 10), right = FALSE),
    yrbin = round_to_decade(YearBuilt)
  )

# Total the raw counts per decade and floor bin before plotting. Stacking the
# per-row log counts would add logarithms together, which is not a count.
q1_binned <- q1 %>%
  group_by(yrbin, floorbin) %>%
  dplyr::summarize(num_build = sum(num_build)) %>%
  ungroup()

# One bar per floor bin within each decade, on a log10 y-axis.
# fill (not colour) is the aesthetic that shades a bar; colour only outlines it.
p1 <- ggplot(q1_binned, aes(x = yrbin, y = num_build, fill = floorbin)) +
  geom_col(position = "dodge") +
  scale_y_log10() +
  theme_minimal()
# Pass the plot explicitly instead of relying on the implicit last plot.
ggplotly(p1)

The early decades of the 1900s seem to be the start of those buildings, with a small dip in the middle.

2. You work for a real estate developer and are researching underbuilt areas of the city. After looking in the Pluto data dictionary, you’ve discovered that all tax assessments consist of two parts: The assessment of the land and assessment of the structure. You reason that there should be a correlation between these two values: more valuable land will have more valuable structures on them (more valuable in this case refers not just to a mansion vs a bungalow, but an apartment tower vs a single family home). Deviations from the norm could represent underbuilt or overbuilt areas of the city. You also recently read a really cool blog post about bivariate choropleth maps, and think the technique could be used for this problem.

We will work with the columns AssessTot, AssessLand, Latitude and Longitude for this part. To analyze developed and underdeveloped areas, a number of factors should be considered, including the facilities available in the neighbourhood: school districts, shopping areas, and crime statistics. In our situation we can look at the value of the land and the property. In addition, the number of floors could also be considered. However, we will drop the number-of-floors column, as the total value of the land is a more important factor than the number of floors. For example, in a specific area a 20-floor building will have more value than a 5-floor building. We will focus on the assessed land value and the assessed total value only for this part. In addition, NYC has different property classes based on the tax and property assessment. We will not go into those details and will focus on the visualization. We assume that if the building value is less than 70% of the total assessment, the lot is flagged as underdeveloped.

# Read every column as text; the assessment columns are converted below.
ny <- fread("C:/Users/Gurpreet/Documents/DATA608/q2.csv", colClasses = "character")

# Derive the building value (total assessment minus land assessment) and keep
# only lots flagged as underdeveloped: building value below 70% of the total.
# as.numeric is used instead of as.integer because assessments above the
# 32-bit integer limit (2147483647) become NA under as.integer, and those lots
# would then be silently dropped by the filter.
q2 <- ny %>%
  dplyr::mutate(
    AssessTot = as.numeric(AssessTot),
    AssessLand = as.numeric(AssessLand),
    build_val = AssessTot - AssessLand
  ) %>%
  # Rows where the comparison is NA are dropped by filter().
  dplyr::filter(build_val < AssessTot * 0.7)
## Warning: NAs introduced by coercion to integer range

## Warning: NAs introduced by coercion to integer range
https://smallbusiness.chron.com/calculate-land-value-tax-purposes-10165.html

import datashader as ds
## C:\Users\Gurpreet\AppData\Local\Programs\Python\Python37\lib\site-packages\datashader\transfer_functions.py:21: FutureWarning: xarray subclass Image should explicitly define __slots__
##   class Image(xr.DataArray):
import datashader.glyphs
#import plotly.plotly as py
#import plotly.graph_objs as go
import datashader.transfer_functions as tf
from plotly import tools
from functools import partial
from datashader import reductions
from pyproj import Proj, transform
from datashader.core import bypixel

References: https://community.rstudio.com/t/how-to-add-my-api-key-into-get-map/15992/5 https://rpubs.com/jhofman/nycmaps http://rforpublichealth.blogspot.com/2015/10/mapping-with-ggplot-create-nice.html