library(plyr)
require(gdata)
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
NYC <- read.xls("rollingsales_manhattan.xls",pattern="BOROUGH")
head(NYC)
## BOROUGH NEIGHBORHOOD
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## BUILDING.CLASS.CATEGORY TAX.CLASS.AT.PRESENT BLOCK
## 1 13 CONDOS - ELEVATOR APARTMENTS 738
## 2 13 CONDOS - ELEVATOR APARTMENTS 738
## 3 13 CONDOS - ELEVATOR APARTMENTS 738
## 4 13 CONDOS - ELEVATOR APARTMENTS 738
## 5 13 CONDOS - ELEVATOR APARTMENTS 738
## 6 13 CONDOS - ELEVATOR APARTMENTS 738
## LOT EASE.MENT BUILDING.CLASS.AT.PRESENT
## 1 1306 NA
## 2 1307 NA
## 3 1308 NA
## 4 1309 NA
## 5 1310 NA
## 6 1311 NA
## ADDRESS APART.MENT.NUMBER ZIP.CODE
## 1 345 WEST 14TH STREET 10014
## 2 345 WEST 14TH STREET 10014
## 3 345 WEST 14TH STREET 10014
## 4 345 WEST 14TH STREET 10014
## 5 345 WEST 14TH STREET 10014
## 6 345 WEST 14TH STREET 10014
## RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE
## 1 0 0 2
## 2 0 0 2
## 3 0 0 2
## 4 0 0 2
## 5 0 0 2
## 6 0 0 2
## BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE
## 1 R4 $2,214,693 2013-05-20
## 2 R4 $1,654,656 2013-05-16
## 3 R4 $1,069,162 2013-05-23
## 4 R4 $1,374,637 2013-05-15
## 5 R4 $1,649,565 2013-05-13
## 6 R4 $1,705,568 2013-05-21
summary(NYC)
## BOROUGH NEIGHBORHOOD
## Min. :1 MIDTOWN WEST : 6264
## 1st Qu.:1 UPPER EAST SIDE (59-79) : 2569
## Median :1 UPPER EAST SIDE (79-96) : 2117
## Mean :1 UPPER WEST SIDE (59-79) : 2053
## 3rd Qu.:1 MIDTOWN EAST : 1357
## Max. :1 UPPER WEST SIDE (79-96) : 1133
## (Other) :11902
## BUILDING.CLASS.CATEGORY
## 13 CONDOS - ELEVATOR APARTMENTS :8480
## 10 COOPS - ELEVATOR APARTMENTS :7530
## :3454
## 25 LUXURY HOTELS :1644
## 17 CONDOPS :1083
## 07 RENTALS - WALKUP APARTMENTS :1020
## (Other) :4184
## TAX.CLASS.AT.PRESENT BLOCK LOT EASE.MENT
## 2 :18394 Min. : 7 Min. : 1.0 Mode:logical
## 4 : 6920 1st Qu.: 877 1st Qu.: 37.0 NA's:27395
## 2C : 789 Median :1047 Median :1007.0
## 1 : 505 Mean :1110 Mean : 741.8
## 2B : 362 3rd Qu.:1411 3rd Qu.:1233.0
## 2A : 264 Max. :2250 Max. :9117.0
## (Other): 161
## BUILDING.CLASS.AT.PRESENT
## R4 :8384
## D4 :7227
## RH :3515
## H2 :1641
## R9 :1082
## C6 : 804
## (Other):4742
## ADDRESS APART.MENT.NUMBER
## 870 7 AVENUE : 2087 :14570
## 102 WEST 57TH STREET : 1322 TIMES : 599
## 200 WEST 56TH STREET : 608 5B : 85
## 1335 AVENUE OF THE AMERIC : 405 3A : 77
## 102 WEST 57TH ST : 262 6A : 77
## 1335 AVENUE OF THE AMER : 191 4A : 76
## (Other) :22520 (Other) :11911
## ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS
## Min. : 0 0 :16372 Min. : 0.000 1 :13772
## 1st Qu.:10016 1 : 9132 1st Qu.: 0.000 0 : 9704
## Median :10019 2 : 229 Median : 0.000 2 : 1909
## Mean :10029 3 : 182 Mean : 0.375 3 : 189
## 3rd Qu.:10027 8 : 125 3rd Qu.: 0.000 4 : 187
## Max. :10463 4 : 123 Max. :604.000 10 : 159
## (Other): 1232 (Other): 1475
## LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT
## 0 :22906 0 :23069 Min. : 0
## 7,532 : 1635 112,850: 1634 1st Qu.:1900
## 2,500 : 65 3,600 : 15 Median :1928
## 2,008 : 45 5,400 : 14 Mean :1494
## 2,554 : 42 4,000 : 12 3rd Qu.:1973
## 2,523 : 39 4,976 : 12 Max. :2013
## (Other): 2663 (Other): 2639
## TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE
## Min. :1.000 R4 :8480
## 1st Qu.:2.000 D4 :7227
## Median :2.000 RH :2983
## Mean :2.488 H2 :1641
## 3rd Qu.:4.000 R9 :1083
## Max. :4.000 R5 : 995
## (Other):4986
## SALE.PRICE SALE.DATE
## $0 : 7593 2012-12-17: 485
## $57,900 : 272 2012-12-27: 324
## $100 : 217 2012-12-20: 319
## $10 : 143 2012-12-28: 308
## $18,850 : 143 2012-12-19: 293
## $1,100,000: 97 2012-08-23: 274
## (Other) :18930 (Other) :25392
colnames(NYC)
## [1] "BOROUGH" "NEIGHBORHOOD"
## [3] "BUILDING.CLASS.CATEGORY" "TAX.CLASS.AT.PRESENT"
## [5] "BLOCK" "LOT"
## [7] "EASE.MENT" "BUILDING.CLASS.AT.PRESENT"
## [9] "ADDRESS" "APART.MENT.NUMBER"
## [11] "ZIP.CODE" "RESIDENTIAL.UNITS"
## [13] "COMMERCIAL.UNITS" "TOTAL.UNITS"
## [15] "LAND.SQUARE.FEET" "GROSS.SQUARE.FEET"
## [17] "YEAR.BUILT" "TAX.CLASS.AT.TIME.OF.SALE"
## [19] "BUILDING.CLASS.AT.TIME.OF.SALE" "SALE.PRICE"
## [21] "SALE.DATE"
NYC <- NYC[,-1]
NYC$SALE.PRICE<-as.numeric(gsub("[^[:digit:]]","",NYC$SALE.PRICE))
count(is.na(NYC$SALE.PRICE))
## x freq
## 1 FALSE 27395
names(NYC)<-tolower(names(NYC))
NYC$gross.square.feet<-as.numeric(gsub("[^[:digit:]]","",NYC$gross.square.feet))
NYC$land.square.feet<-as.numeric(gsub("[^[:digit:]]","",NYC$land.square.feet))
NYC$sale.date<-as.Date(NYC$sale.date)
NYC$built<-as.numeric(as.character(NYC$year.built))
Exploratory Graphs
Retain Actual Sales Only
Limit Inquiry to Family Homes (1-3)
NYC.homes<-NYC.sale[which(grepl("FAMILY",
NYC.sale$building.class.category)),]
plot(log(NYC.homes$gross.square.feet),log(NYC.homes$sale.price))
NYC.homes$outlier<-(log(NYC.homes$sale.price)<=5) + 0
NYC.homes<-NYC.homes[which(NYC.homes$outlier==0),]
plot(log(NYC.homes$gross.square.feet),log(NYC.homes$sale.price))