Choose any three of the “wide” datasets identified in the Week 6 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!) For each of the three chosen datasets: ??? (1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. You’re encouraged to use a “wide” structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below. ??? (2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. [Most of your grade will be based on this step!] ??? (3) Perform the analysis requested in the discussion item. ??? (4) Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions.
The data source is from NYC.gov about rolling sales records of each county from September 2016 to July 2017. There are five cities in New York, including Manhattan, Broxn, Queens, Brooklyn and Statenisland.Please see the original data in the link http://www1.nyc.gov/site/finance/taxes/property-rolling-sales-data.page .
First import all packages I need in the projec.
require(tidyr)
## Loading required package: tidyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(stringr)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## arrange(): dplyr, plyr
## compact(): purrr, plyr
## count(): dplyr, plyr
## failwith(): dplyr, plyr
## filter(): dplyr, stats
## id(): dplyr, plyr
## lag(): dplyr, stats
## mutate(): dplyr, plyr
## rename(): dplyr, plyr
## summarise(): dplyr, plyr
## summarize(): dplyr, plyr
# Import data
manhattan <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/Project2/NYCRollingSales9.2016_8.2017/rollingsales_manhattan.csv', stringsAsFactors = FALSE))
colnames(manhattan) <- as.character(unlist(manhattan[4,]))
manhattan = manhattan[-4, ]
df<- manhattan[4:nrow(manhattan),]
tail(df,50)
## # A tibble: 50 x 21
## BOROUGH NEIGHBORHOOD
## <chr> <chr>
## 1 1 WASHINGTON HEIGHTS UPPER
## 2 1 WASHINGTON HEIGHTS UPPER
## 3 1 WASHINGTON HEIGHTS UPPER
## 4 1 WASHINGTON HEIGHTS UPPER
## 5 1 WASHINGTON HEIGHTS UPPER
## 6 1 WASHINGTON HEIGHTS UPPER
## 7 1 WASHINGTON HEIGHTS UPPER
## 8 1 WASHINGTON HEIGHTS UPPER
## 9 1 WASHINGTON HEIGHTS UPPER
## 10 1 WASHINGTON HEIGHTS UPPER
## # ... with 40 more rows, and 19 more variables: `BUILDING CLASS
## # CATEGORY` <chr>, `TAX CLASS AT PRESENT` <chr>, BLOCK <chr>, LOT <chr>,
## # `EASE-MENT` <chr>, `BUILDING CLASS AT PRESENT` <chr>, ADDRESS <chr>,
## # `APARTMENT NUMBER` <chr>, `ZIP CODE` <chr>, `RESIDENTIAL UNITS` <chr>,
## # `COMMERCIAL UNITS` <chr>, `TOTAL UNITS` <chr>, `LAND SQUARE
## # FEET` <chr>, `GROSS SQUARE FEET` <chr>, `YEAR BUILT` <chr>, `TAX CLASS
## # AT TIME OF SALE` <chr>, `BUILDING CLASS AT TIME OF SALE` <chr>, ` SALE
## # PRICE ` <chr>, `SALE DATE` <chr>
dim(df)
## [1] 18306 21
# Original raw data types
#str(df)
# rename the table
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
colnms <- c("BOROUGH", "NEIGHBORHOOD", "BUILDING_CLASS_CATEGORY", "TAX_CLASS_AT_PRESENT","BLOCK","LOT","EASE_MENT", "BUILDING_CLASS_AT_PRESENT","ADDRESS", "APARTMENT_NUMBER", "ZIP_CODE","RESIDENTIAL_UNITS","COMMERCIAL_UNITS", "TOTAL_UNITS", "LAND_SQUARE_FEET", "GROSS_SQUARE_FEET","YEAR_BUILT", "TAX_CLASS_AT_TIME_OF_SALE","BUILDING_CLASS_AT_TIME_OF_SALE","SALE_PRICE", "SALE_DATE" )
setnames(df, colnms)
#correct data types
df$BOROUGH <-as.numeric(df$BOROUGH)
df$BLOCK <-as.numeric(df$BLOCK)
df$LOT <-as.numeric(df$LOT)
df$BLOCK <-as.numeric(df$BLOCK)
df$RESIDENTIAL_UNITS <-as.numeric(df$RESIDENTIAL_UNITS)
df$COMMERCIAL_UNITS <-as.numeric(df$COMMERCIAL_UNITS)
df$TOTAL_UNITS <-as.numeric(df$TOTAL_UNITS)
df$LAND_SQUARE_FEET <-as.numeric(gsub(",","",df$LAND_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$GROSS_SQUARE_FEET <-as.numeric(gsub(",","",df$GROSS_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$YEAR_BUILT <-as.numeric(df$YEAR_BUILT)
df$TAX_CLASS_AT_TIME_OF_SALE <-as.numeric(df$TAX_CLASS_AT_TIME_OF_SALE)
df$SALE_PRICE <-as.numeric(gsub(",","",df$SALE_PRICE))
## Warning: NAs introduced by coercion
#check data types
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 18306 obs. of 21 variables:
## $ BOROUGH : num 1 1 1 1 1 1 1 1 1 1 ...
## $ NEIGHBORHOOD : chr "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" ...
## $ BUILDING_CLASS_CATEGORY : chr "07 RENTALS - WALKUP APARTMENTS " "07 RENTALS - WALKUP APARTMENTS " "07 RENTALS - WALKUP APARTMENTS " "07 RENTALS - WALKUP APARTMENTS " ...
## $ TAX_CLASS_AT_PRESENT : chr "2A" "2" "2" "2B" ...
## $ BLOCK : num 392 399 399 402 404 405 406 407 379 387 ...
## $ LOT : num 6 26 39 21 55 16 32 18 34 153 ...
## $ EASE_MENT : chr " " " " " " " " ...
## $ BUILDING_CLASS_AT_PRESENT : chr "C2" "C7" "C7" "C4" ...
## $ ADDRESS : chr "153 AVENUE B" "234 EAST 4TH STREET" "197 EAST 3RD STREET" "154 EAST 7TH STREET" ...
## $ APARTMENT_NUMBER : chr " " " " " " " " ...
## $ ZIP_CODE : chr "10009" "10009" "10009" "10009" ...
## $ RESIDENTIAL_UNITS : num 5 28 16 10 6 20 8 44 15 24 ...
## $ COMMERCIAL_UNITS : num 0 3 1 0 0 0 0 2 0 0 ...
## $ TOTAL_UNITS : num 5 31 17 10 6 20 8 46 15 24 ...
## $ LAND_SQUARE_FEET : num 1633 4616 2212 2272 2369 ...
## $ GROSS_SQUARE_FEET : num 6440 18690 7803 6794 4615 ...
## $ YEAR_BUILT : num 1900 1900 1900 1913 1900 ...
## $ TAX_CLASS_AT_TIME_OF_SALE : num 2 2 2 2 2 2 2 2 2 2 ...
## $ BUILDING_CLASS_AT_TIME_OF_SALE: chr "C2" "C7" "C7" "C4" ...
## $ SALE_PRICE : num 6625000 NA NA 3936272 8000000 ...
## $ SALE_DATE : chr "7/19/2017" "12/14/2016" "12/9/2016" "9/23/2016" ...
tail(df,50)
## # A tibble: 50 x 21
## BOROUGH NEIGHBORHOOD
## <dbl> <chr>
## 1 1 WASHINGTON HEIGHTS UPPER
## 2 1 WASHINGTON HEIGHTS UPPER
## 3 1 WASHINGTON HEIGHTS UPPER
## 4 1 WASHINGTON HEIGHTS UPPER
## 5 1 WASHINGTON HEIGHTS UPPER
## 6 1 WASHINGTON HEIGHTS UPPER
## 7 1 WASHINGTON HEIGHTS UPPER
## 8 1 WASHINGTON HEIGHTS UPPER
## 9 1 WASHINGTON HEIGHTS UPPER
## 10 1 WASHINGTON HEIGHTS UPPER
## # ... with 40 more rows, and 19 more variables:
## # BUILDING_CLASS_CATEGORY <chr>, TAX_CLASS_AT_PRESENT <chr>,
## # BLOCK <dbl>, LOT <dbl>, EASE_MENT <chr>,
## # BUILDING_CLASS_AT_PRESENT <chr>, ADDRESS <chr>,
## # APARTMENT_NUMBER <chr>, ZIP_CODE <chr>, RESIDENTIAL_UNITS <dbl>,
## # COMMERCIAL_UNITS <dbl>, TOTAL_UNITS <dbl>, LAND_SQUARE_FEET <dbl>,
## # GROSS_SQUARE_FEET <dbl>, YEAR_BUILT <dbl>,
## # TAX_CLASS_AT_TIME_OF_SALE <dbl>, BUILDING_CLASS_AT_TIME_OF_SALE <chr>,
## # SALE_PRICE <dbl>, SALE_DATE <chr>
mydat<- df%>% separate(BUILDING_CLASS_CATEGORY, into=c("BUILDING_CLASS_Number", "BUILDING_CLASS_CATEGORY"))
## Warning: Too many values at 18306 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
#calculate unit pice per feet for each sale
mydat2<- na.omit(mydat)%>%
select(NEIGHBORHOOD,BUILDING_CLASS_Number, BUILDING_CLASS_CATEGORY, GROSS_SQUARE_FEET, SALE_PRICE)%>%
mutate(PRICE_PerFeet = round(SALE_PRICE/GROSS_SQUARE_FEET))%>%
group_by(BUILDING_CLASS_CATEGORY)
mydat2
## # A tibble: 1,006 x 6
## # Groups: BUILDING_CLASS_CATEGORY [21]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 ALPHABET CITY 07 RENTALS
## 2 ALPHABET CITY 07 RENTALS
## 3 ALPHABET CITY 07 RENTALS
## 4 ALPHABET CITY 07 RENTALS
## 5 ALPHABET CITY 08 RENTALS
## 6 ALPHABET CITY 08 RENTALS
## 7 ALPHABET CITY 09 COOPS
## 8 ALPHABET CITY 14 RENTALS
## 9 ALPHABET CITY 14 RENTALS
## 10 ALPHABET CITY 14 RENTALS
## # ... with 996 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
unique(unlist(df$BUILDING_CLASS_CATEGORY))
## [1] "07 RENTALS - WALKUP APARTMENTS "
## [2] "08 RENTALS - ELEVATOR APARTMENTS "
## [3] "09 COOPS - WALKUP APARTMENTS "
## [4] "10 COOPS - ELEVATOR APARTMENTS "
## [5] "11A CONDO-RENTALS "
## [6] "12 CONDOS - WALKUP APARTMENTS "
## [7] "13 CONDOS - ELEVATOR APARTMENTS "
## [8] "14 RENTALS - 4-10 UNIT "
## [9] "15 CONDOS - 2-10 UNIT RESIDENTIAL "
## [10] "16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT "
## [11] "17 CONDO COOPS "
## [12] "22 STORE BUILDINGS "
## [13] "37 RELIGIOUS FACILITIES "
## [14] "42 CONDO CULTURAL/MEDICAL/EDUCATIONAL/ETC "
## [15] "46 CONDO STORE BUILDINGS "
## [16] "47 CONDO NON-BUSINESS STORAGE "
## [17] "01 ONE FAMILY DWELLINGS "
## [18] "02 TWO FAMILY DWELLINGS "
## [19] "03 THREE FAMILY DWELLINGS "
## [20] "04 TAX CLASS 1 CONDOS "
## [21] "21 OFFICE BUILDINGS "
## [22] "23 LOFT BUILDINGS "
## [23] "25 LUXURY HOTELS "
## [24] "26 OTHER HOTELS "
## [25] "28 COMMERCIAL CONDOS "
## [26] "29 COMMERCIAL GARAGES "
## [27] "35 INDOOR PUBLIC AND CULTURAL FACILITIES "
## [28] "38 ASYLUMS AND HOMES "
## [29] "43 CONDO OFFICE BUILDINGS "
## [30] "44 CONDO PARKING "
## [31] "48 CONDO TERRACES/GARDENS/CABANAS "
## [32] "31 COMMERCIAL VACANT LAND "
## [33] "32 HOSPITAL AND HEALTH FACILITIES "
## [34] "41 TAX CLASS 4 - OTHER "
## [35] "18 TAX CLASS 3 - UNTILITY PROPERTIES "
## [36] "30 WAREHOUSES "
## [37] "36 OUTDOOR RECREATIONAL FACILITIES "
## [38] "49 CONDO WAREHOUSES/FACTORY/INDUS "
## [39] "34 THEATRES "
## [40] "27 FACTORIES "
## [41] "40 SELECTED GOVERNMENTAL FACILITIES "
## [42] "45 CONDO HOTELS "
## [43] "33 EDUCATIONAL FACILITIES "
## [44] "11 SPECIAL CONDO BILLING LOTS "
condo <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="CONDO")
condo
## # A tibble: 3 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 ROOSEVELT ISLAND 11A CONDO
## 2 SOHO 11A CONDO
## 3 UPPER WEST SIDE (79-96) 11A CONDO
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## # PRICE_PerFeet <dbl>
COOP <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="COOPS")
COOP
## # A tibble: 14 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 ALPHABET CITY 09 COOPS
## 2 CHELSEA 09 COOPS
## 3 MIDTOWN EAST 10 COOPS
## 4 MURRAY HILL 10 COOPS
## 5 UPPER EAST SIDE (59-79) 09 COOPS
## 6 UPPER EAST SIDE (59-79) 10 COOPS
## 7 UPPER EAST SIDE (59-79) 10 COOPS
## 8 UPPER EAST SIDE (79-96) 10 COOPS
## 9 UPPER WEST SIDE (59-79) 10 COOPS
## 10 UPPER WEST SIDE (59-79) 10 COOPS
## 11 UPPER WEST SIDE (79-96) 10 COOPS
## 12 UPPER WEST SIDE (79-96) 10 COOPS
## 13 UPPER WEST SIDE (79-96) 10 COOPS
## 14 WASHINGTON HEIGHTS LOWER 09 COOPS
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## # PRICE_PerFeet <dbl>
onefamily <-mydat2%>% filter(BUILDING_CLASS_CATEGORY=="ONE")
onefamily
## # A tibble: 83 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 CHELSEA 01 ONE
## 2 CHELSEA 01 ONE
## 3 CHELSEA 01 ONE
## 4 CHELSEA 01 ONE
## 5 GREENWICH VILLAGE-CENTRAL 01 ONE
## 6 GREENWICH VILLAGE-CENTRAL 01 ONE
## 7 GREENWICH VILLAGE-CENTRAL 01 ONE
## 8 GREENWICH VILLAGE-CENTRAL 01 ONE
## 9 GREENWICH VILLAGE-CENTRAL 01 ONE
## 10 GREENWICH VILLAGE-CENTRAL 01 ONE
## # ... with 73 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
twofamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="TWO")
twofamily
## # A tibble: 84 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 CHELSEA 02 TWO
## 2 EAST VILLAGE 02 TWO
## 3 EAST VILLAGE 02 TWO
## 4 EAST VILLAGE 02 TWO
## 5 GRAMERCY 02 TWO
## 6 GRAMERCY 02 TWO
## 7 GRAMERCY 02 TWO
## 8 GRAMERCY 02 TWO
## 9 GREENWICH VILLAGE-CENTRAL 02 TWO
## 10 GREENWICH VILLAGE-CENTRAL 02 TWO
## # ... with 74 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
threefamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="THREE")
threefamily
## # A tibble: 38 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 CHELSEA 03 THREE
## 2 GRAMERCY 03 THREE
## 3 GREENWICH VILLAGE-CENTRAL 03 THREE
## 4 GREENWICH VILLAGE-WEST 03 THREE
## 5 GREENWICH VILLAGE-WEST 03 THREE
## 6 HARLEM-CENTRAL 03 THREE
## 7 HARLEM-CENTRAL 03 THREE
## 8 HARLEM-CENTRAL 03 THREE
## 9 HARLEM-CENTRAL 03 THREE
## 10 HARLEM-CENTRAL 03 THREE
## # ... with 28 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
AptRent <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="RENTALS")
AptRent
## # A tibble: 512 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 ALPHABET CITY 07 RENTALS
## 2 ALPHABET CITY 07 RENTALS
## 3 ALPHABET CITY 07 RENTALS
## 4 ALPHABET CITY 07 RENTALS
## 5 ALPHABET CITY 08 RENTALS
## 6 ALPHABET CITY 08 RENTALS
## 7 ALPHABET CITY 14 RENTALS
## 8 ALPHABET CITY 14 RENTALS
## 9 ALPHABET CITY 14 RENTALS
## 10 CHELSEA 07 RENTALS
## # ... with 502 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
popType <- c("COOP","CONDO","OneFamily", "TwoFamily","ThreFamily", "AptRent")
mean_price <- c(mean(COOP$PRICE_PerFeet), mean(condo$PRICE_PerFeet),mean(onefamily$PRICE_PerFeet), mean(twofamily$PRICE_PerFeet),mean(threefamily$PRICE_PerFeet), mean(AptRent$PRICE_PerFeet))
library ('ggplot2')
barplot(mean_price, names.arg=popType, ylim=c(0,2500), main="Manhattan Residentail Property Sale Price (ft)", ylab="Price", xlab="Poperty Types", col="skyblue")
The data source is from census.gov about population in New York from 2008 to 2016. Please see the original data in the link https://www.census.gov/data/datasets/2016/demo/popest/counties-detail.html
# Import data
population <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/Project2/NYC_Population_2016.csv', stringsAsFactors = FALSE))
df<- population[, 5:8]
head(df,5)
## # A tibble: 5 x 4
## CTYNAME YEAR AGEGRP TOT_POP
## <chr> <int> <int> <int>
## 1 Albany County 1 0 304204
## 2 Albany County 1 1 15286
## 3 Albany County 1 2 16131
## 4 Albany County 1 3 17639
## 5 Albany County 1 4 23752
unique(unlist(df$AGEGRP))
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
df3 <- df %>% spread(AGEGRP,TOT_POP)
df3 <- rename(df3,c("0"="g0" , "1"="g1","2"= "g2","3"= "g3","4"= "g4","5"= "g5","6"= "g6","7"= "g7","8"= "g8","9"= "g9","10"="g10", "11"="g11","12"= "g12","13"= "g13","14"= "g14","15"= "g15","16"= "g16","17"= "g17","18"= "g18" ))
ageG <- df3 %>% filter(YEAR==9)
curAge <-c(sum(ageG$g1),sum(ageG$g2),sum(ageG$g3),sum(ageG$g4),sum(ageG$g5),sum(ageG$g6),sum(ageG$g7),sum(ageG$g8),sum(ageG$g9),sum(ageG$g10),sum(ageG$g11),sum(ageG$g12),sum(ageG$g13),sum(ageG$g14),sum(ageG$g15),sum(ageG$g16),sum(ageG$g17),sum(ageG$g18))
gName <-c("g1","g2","g3","g4","g5","g6","g7","g8","g9","g10","g11","g12","g13","g14","g15","g16","g17","g18")
ageStruc<-as.data.frame(gName,curAge)
## Warning in as.data.frame.vector(x, ..., nm = nm): 'row.names' is not a
## character vector of length 18 -- omitting it. Will be an error!
dim(ageStruc)
## [1] 18 1
# group number bigger, the age older
ggplot(ageStruc, aes(gName,curAge ))+
geom_bar(stat = 'identity') +
theme(axis.text.x=element_text(angle=90, hjust=1))+ coord_flip()+
ggtitle('New York Age Grops')
#If I can get more than 30 years population data,I can grow rates then can estemate the future age structures by simulation.
The data source is from census.gov about household income in New York. Please see the original data in the link https://www.census.gov/data/tables/time-series/demo/income-poverty/cps-hinc/hinc-01.html.
# Import data
brooklyn <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/Project2/NYCRollingSales9.2016_8.2017/rollingsales_brooklyn.csv', stringsAsFactors = FALSE))
colnames(brooklyn) <- as.character(unlist(brooklyn[4,]))
brooklyn = brooklyn[-4, ]
df<- brooklyn[4:nrow(brooklyn),]
tail(df,50)
## # A tibble: 50 x 21
## BOROUGH NEIGHBORHOOD `BUILDING CLASS CATEGORY`
## <chr> <chr> <chr>
## 1 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 2 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 3 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 4 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 5 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 6 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 7 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 8 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 9 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 10 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## # ... with 40 more rows, and 18 more variables: `TAX CLASS AT
## # PRESENT` <chr>, BLOCK <chr>, LOT <chr>, `EASE-MENT` <chr>, `BUILDING
## # CLASS AT PRESENT` <chr>, ADDRESS <chr>, `APARTMENT NUMBER` <chr>, `ZIP
## # CODE` <chr>, `RESIDENTIAL UNITS` <chr>, `COMMERCIAL UNITS` <chr>,
## # `TOTAL UNITS` <chr>, `LAND SQUARE FEET` <chr>, `GROSS SQUARE
## # FEET` <chr>, `YEAR BUILT` <chr>, `TAX CLASS AT TIME OF SALE` <chr>,
## # `BUILDING CLASS AT TIME OF SALE` <chr>, ` SALE PRICE ` <chr>, `SALE
## # DATE` <chr>
dim(df)
## [1] 24047 21
# Original raw data types
#str(df)
# rename the table
library(data.table)
colnms <- c("BOROUGH", "NEIGHBORHOOD", "BUILDING_CLASS_CATEGORY", "TAX_CLASS_AT_PRESENT","BLOCK","LOT","EASE_MENT", "BUILDING_CLASS_AT_PRESENT","ADDRESS", "APARTMENT_NUMBER", "ZIP_CODE","RESIDENTIAL_UNITS","COMMERCIAL_UNITS", "TOTAL_UNITS", "LAND_SQUARE_FEET", "GROSS_SQUARE_FEET","YEAR_BUILT", "TAX_CLASS_AT_TIME_OF_SALE","BUILDING_CLASS_AT_TIME_OF_SALE","SALE_PRICE", "SALE_DATE" )
setnames(df, colnms)
#correct data types
df$BOROUGH <-as.numeric(df$BOROUGH)
df$BLOCK <-as.numeric(df$BLOCK)
df$LOT <-as.numeric(df$LOT)
df$BLOCK <-as.numeric(df$BLOCK)
df$RESIDENTIAL_UNITS <-as.numeric(df$RESIDENTIAL_UNITS)
df$COMMERCIAL_UNITS <-as.numeric(df$COMMERCIAL_UNITS)
df$TOTAL_UNITS <-as.numeric(df$TOTAL_UNITS)
df$LAND_SQUARE_FEET <-as.numeric(gsub(",","",df$LAND_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$GROSS_SQUARE_FEET <-as.numeric(gsub(",","",df$GROSS_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$YEAR_BUILT <-as.numeric(df$YEAR_BUILT)
df$TAX_CLASS_AT_TIME_OF_SALE <-as.numeric(df$TAX_CLASS_AT_TIME_OF_SALE)
df$SALE_PRICE <-as.numeric(gsub(",","",df$SALE_PRICE))
## Warning: NAs introduced by coercion
#check data types
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 24047 obs. of 21 variables:
## $ BOROUGH : num 3 3 3 3 3 3 3 3 3 3 ...
## $ NEIGHBORHOOD : chr "BATH BEACH" "BATH BEACH" "BATH BEACH" "BATH BEACH" ...
## $ BUILDING_CLASS_CATEGORY : chr "01 ONE FAMILY DWELLINGS " "01 ONE FAMILY DWELLINGS " "01 ONE FAMILY DWELLINGS " "01 ONE FAMILY DWELLINGS " ...
## $ TAX_CLASS_AT_PRESENT : chr "1" "1" "1" "1" ...
## $ BLOCK : num 6360 6361 6364 6367 6374 ...
## $ LOT : num 157 20 2 42 67 13 49 25 36 57 ...
## $ EASE_MENT : chr " " " " " " " " ...
## $ BUILDING_CLASS_AT_PRESENT : chr "A5" "A5" "A5" "S1" ...
## $ ADDRESS : chr "36 BAY 10TH STREET" "47 BAY 10TH STREET" "1649 BENSON AVENUE" "1730 86TH STREET" ...
## $ APARTMENT_NUMBER : chr " " " " " " " " ...
## $ ZIP_CODE : chr "11228" "11228" "11214" "11214" ...
## $ RESIDENTIAL_UNITS : num 1 1 1 1 1 1 1 1 1 1 ...
## $ COMMERCIAL_UNITS : num 0 0 0 1 1 0 1 0 0 0 ...
## $ TOTAL_UNITS : num 1 1 1 2 2 1 2 1 1 1 ...
## $ LAND_SQUARE_FEET : num 1547 1933 1638 1342 1740 ...
## $ GROSS_SQUARE_FEET : num 1428 1660 972 1920 3240 ...
## $ YEAR_BUILT : num 1930 1930 1930 1926 1925 ...
## $ TAX_CLASS_AT_TIME_OF_SALE : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BUILDING_CLASS_AT_TIME_OF_SALE: chr "A5" "A5" "A5" "S1" ...
## $ SALE_PRICE : num 800000 NA NA 1200000 1090000 NA 1060000 2050000 950000 NA ...
## $ SALE_DATE : chr "2/27/2017" "3/4/2017" "10/6/2016" "3/31/2017" ...
tail(df,50)
## # A tibble: 50 x 21
## BOROUGH NEIGHBORHOOD BUILDING_CLASS_CATEGORY
## <dbl> <chr> <chr>
## 1 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 2 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 3 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 4 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 5 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 6 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 7 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 8 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 9 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## 10 3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS
## # ... with 40 more rows, and 18 more variables:
## # TAX_CLASS_AT_PRESENT <chr>, BLOCK <dbl>, LOT <dbl>, EASE_MENT <chr>,
## # BUILDING_CLASS_AT_PRESENT <chr>, ADDRESS <chr>,
## # APARTMENT_NUMBER <chr>, ZIP_CODE <chr>, RESIDENTIAL_UNITS <dbl>,
## # COMMERCIAL_UNITS <dbl>, TOTAL_UNITS <dbl>, LAND_SQUARE_FEET <dbl>,
## # GROSS_SQUARE_FEET <dbl>, YEAR_BUILT <dbl>,
## # TAX_CLASS_AT_TIME_OF_SALE <dbl>, BUILDING_CLASS_AT_TIME_OF_SALE <chr>,
## # SALE_PRICE <dbl>, SALE_DATE <chr>
mydat<- df%>% separate(BUILDING_CLASS_CATEGORY, into=c("BUILDING_CLASS_Number", "BUILDING_CLASS_CATEGORY"))
## Warning: Too many values at 24047 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
#calculate unit pice per feet for each sale
mydat2<- na.omit(mydat)%>%
select(NEIGHBORHOOD,BUILDING_CLASS_Number, BUILDING_CLASS_CATEGORY, GROSS_SQUARE_FEET, SALE_PRICE)%>%
mutate(PRICE_PerFeet = round(SALE_PRICE/GROSS_SQUARE_FEET))%>%
group_by(BUILDING_CLASS_CATEGORY)
mydat2
## # A tibble: 8,669 x 6
## # Groups: BUILDING_CLASS_CATEGORY [20]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 BATH BEACH 01 ONE
## 2 BATH BEACH 01 ONE
## 3 BATH BEACH 01 ONE
## 4 BATH BEACH 01 ONE
## 5 BATH BEACH 01 ONE
## 6 BATH BEACH 01 ONE
## 7 BATH BEACH 01 ONE
## 8 BATH BEACH 01 ONE
## 9 BATH BEACH 01 ONE
## 10 BATH BEACH 01 ONE
## # ... with 8,659 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
unique(unlist(df$BUILDING_CLASS_CATEGORY))
## [1] "01 ONE FAMILY DWELLINGS "
## [2] "02 TWO FAMILY DWELLINGS "
## [3] "03 THREE FAMILY DWELLINGS "
## [4] "04 TAX CLASS 1 CONDOS "
## [5] "05 TAX CLASS 1 VACANT LAND "
## [6] "07 RENTALS - WALKUP APARTMENTS "
## [7] "09 COOPS - WALKUP APARTMENTS "
## [8] "10 COOPS - ELEVATOR APARTMENTS "
## [9] "12 CONDOS - WALKUP APARTMENTS "
## [10] "13 CONDOS - ELEVATOR APARTMENTS "
## [11] "14 RENTALS - 4-10 UNIT "
## [12] "15 CONDOS - 2-10 UNIT RESIDENTIAL "
## [13] "16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT "
## [14] "21 OFFICE BUILDINGS "
## [15] "22 STORE BUILDINGS "
## [16] "43 CONDO OFFICE BUILDINGS "
## [17] "44 CONDO PARKING "
## [18] "06 TAX CLASS 1 - OTHER "
## [19] "08 RENTALS - ELEVATOR APARTMENTS "
## [20] "17 CONDO COOPS "
## [21] "29 COMMERCIAL GARAGES "
## [22] "30 WAREHOUSES "
## [23] "46 CONDO STORE BUILDINGS "
## [24] "47 CONDO NON-BUSINESS STORAGE "
## [25] "11A CONDO-RENTALS "
## [26] "23 LOFT BUILDINGS "
## [27] "26 OTHER HOTELS "
## [28] "27 FACTORIES "
## [29] "28 COMMERCIAL CONDOS "
## [30] "31 COMMERCIAL VACANT LAND "
## [31] "32 HOSPITAL AND HEALTH FACILITIES "
## [32] "37 RELIGIOUS FACILITIES "
## [33] "38 ASYLUMS AND HOMES "
## [34] "41 TAX CLASS 4 - OTHER "
## [35] "48 CONDO TERRACES/GARDENS/CABANAS "
## [36] "49 CONDO WAREHOUSES/FACTORY/INDUS "
## [37] "35 INDOOR PUBLIC AND CULTURAL FACILITIES "
## [38] "33 EDUCATIONAL FACILITIES "
## [39] "42 CONDO CULTURAL/MEDICAL/EDUCATIONAL/ETC "
## [40] "45 CONDO HOTELS "
## [41] "11 SPECIAL CONDO BILLING LOTS "
## [42] "39 TRANSPORTATION FACILITIES "
## [43] "36 OUTDOOR RECREATIONAL FACILITIES "
#This is a wrong value and won't be in the analysis.
condo <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="CONDO")
condo
## # A tibble: 1 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 DOWNTOWN-FULTON MALL 11A CONDO
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## # PRICE_PerFeet <dbl>
COOP <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="COOPS")
COOP
## # A tibble: 5 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 BUSHWICK 09 COOPS
## 2 FLATLANDS 10 COOPS
## 3 KENSINGTON 10 COOPS
## 4 KENSINGTON 10 COOPS
## 5 SHEEPSHEAD BAY 10 COOPS
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## # PRICE_PerFeet <dbl>
onefamily <-mydat2%>% filter(BUILDING_CLASS_CATEGORY=="ONE")
onefamily
## # A tibble: 2,345 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 BATH BEACH 01 ONE
## 2 BATH BEACH 01 ONE
## 3 BATH BEACH 01 ONE
## 4 BATH BEACH 01 ONE
## 5 BATH BEACH 01 ONE
## 6 BATH BEACH 01 ONE
## 7 BATH BEACH 01 ONE
## 8 BATH BEACH 01 ONE
## 9 BATH BEACH 01 ONE
## 10 BATH BEACH 01 ONE
## # ... with 2,335 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
twofamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="TWO")
twofamily
## # A tibble: 3,632 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 BATH BEACH 02 TWO
## 2 BATH BEACH 02 TWO
## 3 BATH BEACH 02 TWO
## 4 BATH BEACH 02 TWO
## 5 BATH BEACH 02 TWO
## 6 BATH BEACH 02 TWO
## 7 BATH BEACH 02 TWO
## 8 BATH BEACH 02 TWO
## 9 BATH BEACH 02 TWO
## 10 BATH BEACH 02 TWO
## # ... with 3,622 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
threefamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="THREE")
threefamily
## # A tibble: 1,143 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 BATH BEACH 03 THREE
## 2 BATH BEACH 03 THREE
## 3 BATH BEACH 03 THREE
## 4 BATH BEACH 03 THREE
## 5 BATH BEACH 03 THREE
## 6 BATH BEACH 03 THREE
## 7 BATH BEACH 03 THREE
## 8 BATH BEACH 03 THREE
## 9 BATH BEACH 03 THREE
## 10 BATH BEACH 03 THREE
## # ... with 1,133 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
AptRent <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="RENTALS")
AptRent
## # A tibble: 1,045 x 6
## # Groups: BUILDING_CLASS_CATEGORY [1]
## NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
## <chr> <chr> <chr>
## 1 BATH BEACH 07 RENTALS
## 2 BATH BEACH 07 RENTALS
## 3 BATH BEACH 07 RENTALS
## 4 BAY RIDGE 07 RENTALS
## 5 BAY RIDGE 07 RENTALS
## 6 BAY RIDGE 07 RENTALS
## 7 BAY RIDGE 07 RENTALS
## 8 BAY RIDGE 07 RENTALS
## 9 BAY RIDGE 07 RENTALS
## 10 BAY RIDGE 07 RENTALS
## # ... with 1,035 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## # SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
popType <- c("COOP","OneFamily", "TwoFamily","ThreFamily", "AptRent")
mean_price <- c(mean(COOP$PRICE_PerFeet),mean(onefamily$PRICE_PerFeet), mean(twofamily$PRICE_PerFeet),mean(threefamily$PRICE_PerFeet), mean(AptRent$PRICE_PerFeet))
library ('ggplot2')
barplot(mean_price, names.arg=popType, ylim=c(0,1000), main="Brooklyn Residentail Property Sale Price (ft)", ylab="Price", xlab="Poperty Types", col="red")