First reading data from files and combining it to one file. Read data only after 1850 and before 2015 and LotArea > 100 and number of floors greater than one.
files <- list.files(path="/home/chirag/MyDocs/R/rmds/",pattern=".*/*.csv$",full.names=TRUE,recursive = TRUE)
all_PLUTO_data = data.frame()
for (file in files)
{
ny_building_data = read.csv(file, stringsAsFactors = FALSE)
# read data only after 1850 and LotArea > 100 and number floor greater than one
ny_building_data = ny_building_data[ny_building_data$YearBuilt > 1850 & ny_building_data$YearBuilt < 2015 & ny_building_data$LotArea > 100 & ny_building_data$NumFloors >1,]
ny_building_data$filename = file
#limiting columall_PLUTO_data.RData
all_PLUTO_data = rbind(all_PLUTO_data,ny_building_data[,c("YearBuilt","NumFloors","AssessLand","AssessTot","BldgArea")])
}
save(all_PLUTO_data, file="all_PLUTO_data.RData")
Reading from all_PLUTO_data.RData
load("all_PLUTO_data.RData")
summary(all_PLUTO_data)
## YearBuilt NumFloors AssessLand AssessTot
## Min. :1851 Min. : 1.010 Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.:1920 1st Qu.: 2.000 1st Qu.:7.650e+03 1st Qu.:2.256e+04
## Median :1930 Median : 2.000 Median :1.030e+04 Median :2.911e+04
## Mean :1940 Mean : 2.592 Mean :9.234e+04 Mean :4.175e+05
## 3rd Qu.:1955 3rd Qu.: 3.000 3rd Qu.:1.472e+04 3rd Qu.:4.280e+04
## Max. :2040 Max. :119.000 Max. :3.211e+09 Max. :6.798e+09
## BldgArea
## Min. : 0
## 1st Qu.: 1492
## Median : 2096
## Mean : 6950
## 3rd Qu.: 3090
## Max. :49547830
library("knitr")
library("bigvis")
library("ggplot2")
library(plotly)
library(dplyr)
library("plyr")
Get a compact summary of the data
compact_data <- condense(bin(all_PLUTO_data$YearBuilt, 1))
## Summarising with count
count(subset(all_PLUTO_data,all_PLUTO_data$YearBuilt>1969 & all_PLUTO_data$YearBuilt<1981),"YearBuilt")
## YearBuilt freq
## 1 1970 14265
## 2 1971 346
## 3 1972 443
## 4 1973 682
## 5 1974 628
## 6 1975 8594
## 7 1976 507
## 8 1977 532
## 9 1978 820
## 10 1979 484
## 11 1980 4839
Now showing summary of data
summary(compact_data)
## all_PLUTO_data.YearBuilt .count
## Min. :1851 Min. : 0
## 1st Qu.:1898 1st Qu.: 11
## Median :1946 Median : 443
## Mean :1946 Mean : 3851
## 3rd Qu.:1993 3rd Qu.: 1964
## Max. :2040 Max. :88227
## NA's :1
Showing cumulative graph
compact_data$percent_built <- compact_data$.count/sum(compact_data$.count)
compact_data$cumulative_percent_built <- cumsum(compact_data$percent_built)
mydata<-data.frame(compact_data$all_PLUTO_data.YearBuilt,compact_data$cumulative_percent_built)
colnames(mydata)[1]<-"year"
colnames(mydata)[2]<-"cumulative"
qplot(mydata$year,mydata$cumulative,geom='step')
by looking at this graph we can say 50% percent of buildings were built around 1930. so we can start looking at those buildings first.
Considering small data size and looking for building with more than 20 floors.
small<-subset(all_PLUTO_data,all_PLUTO_data$NumFloors>20)
plot_ly(small, x = ~small$YearBuilt, y = ~small$NumFloors, color = ~small$NumFloors,size = ~small$NumFloors)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
qualityOfConstruction <- all_PLUTO_data[all_PLUTO_data$AssessTot>0,]
# Calculating value of the construction
qualityOfConstruction$AssessConstruction <- (qualityOfConstruction$AssessTot-qualityOfConstruction$AssessLand)
# rate per floor
qualityOfConstruction$value_per_floor <- qualityOfConstruction$AssessConstruction / qualityOfConstruction$NumFloors
qualityOfConstruction_condensed <- condense(bin(qualityOfConstruction$YearBuilt,5,origin=1856),summary="mean",z=qualityOfConstruction$value_per_floor)
#changing column names
colnames(qualityOfConstruction_condensed) <- c("YearBuilt","Count","Mean")
qualityOfConstruction_condensed <- qualityOfConstruction_condensed[complete.cases(qualityOfConstruction_condensed),]
#printing values around world war 2
qualityOfConstruction_condensed<-qualityOfConstruction_condensed[qualityOfConstruction_condensed$YearBuilt>1930 & qualityOfConstruction_condensed$YearBuilt<=1955,]
plot_ly(
x = qualityOfConstruction_condensed$YearBuilt,
y = qualityOfConstruction_condensed$Mean,
type = "bar"
)
Looks like she is right as price around world war 2 year is less.