Your task is to:

Choose any three of the “wide” datasets identified in the Week 6 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!) For each of the three chosen datasets: ??? (1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. You’re encouraged to use a “wide” structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below. ??? (2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. [Most of your grade will be based on this step!] ??? (3) Perform the analysis requested in the discussion item. ??? (4) Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions.

NYC Rolling Sales - Manhattan

The data source is from NYC.gov about rolling sales records of each county from September 2016 to July 2017. There are five cities in New York, including Manhattan, Broxn, Queens, Brooklyn and Statenisland.Please see the original data in the link http://www1.nyc.gov/site/finance/taxes/property-rolling-sales-data.page .

First import all packages I need in the projec.

require(tidyr)
## Loading required package: tidyr
require(dplyr) 
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(stringr)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## arrange():   dplyr, plyr
## compact():   purrr, plyr
## count():     dplyr, plyr
## failwith():  dplyr, plyr
## filter():    dplyr, stats
## id():        dplyr, plyr
## lag():       dplyr, stats
## mutate():    dplyr, plyr
## rename():    dplyr, plyr
## summarise(): dplyr, plyr
## summarize(): dplyr, plyr

Import Data from a .csv file to a data frame “df”

# Import data
manhattan <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/Project2/NYCRollingSales9.2016_8.2017/rollingsales_manhattan.csv', stringsAsFactors = FALSE)) 

colnames(manhattan) <- as.character(unlist(manhattan[4,]))
manhattan = manhattan[-4, ]

df<- manhattan[4:nrow(manhattan),]
tail(df,50) 
## # A tibble: 50 x 21
##    BOROUGH             NEIGHBORHOOD
##      <chr>                    <chr>
##  1       1 WASHINGTON HEIGHTS UPPER
##  2       1 WASHINGTON HEIGHTS UPPER
##  3       1 WASHINGTON HEIGHTS UPPER
##  4       1 WASHINGTON HEIGHTS UPPER
##  5       1 WASHINGTON HEIGHTS UPPER
##  6       1 WASHINGTON HEIGHTS UPPER
##  7       1 WASHINGTON HEIGHTS UPPER
##  8       1 WASHINGTON HEIGHTS UPPER
##  9       1 WASHINGTON HEIGHTS UPPER
## 10       1 WASHINGTON HEIGHTS UPPER
## # ... with 40 more rows, and 19 more variables: `BUILDING CLASS
## #   CATEGORY` <chr>, `TAX CLASS AT PRESENT` <chr>, BLOCK <chr>, LOT <chr>,
## #   `EASE-MENT` <chr>, `BUILDING CLASS AT PRESENT` <chr>, ADDRESS <chr>,
## #   `APARTMENT NUMBER` <chr>, `ZIP CODE` <chr>, `RESIDENTIAL UNITS` <chr>,
## #   `COMMERCIAL UNITS` <chr>, `TOTAL UNITS` <chr>, `LAND SQUARE
## #   FEET` <chr>, `GROSS SQUARE FEET` <chr>, `YEAR BUILT` <chr>, `TAX CLASS
## #   AT TIME OF SALE` <chr>, `BUILDING CLASS AT TIME OF SALE` <chr>, ` SALE
## #   PRICE ` <chr>, `SALE DATE` <chr>
dim(df)
## [1] 18306    21

Data Cleaning

# Original raw data types 
#str(df)

# rename the table
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
colnms <- c("BOROUGH", "NEIGHBORHOOD", "BUILDING_CLASS_CATEGORY", "TAX_CLASS_AT_PRESENT","BLOCK","LOT","EASE_MENT", "BUILDING_CLASS_AT_PRESENT","ADDRESS", "APARTMENT_NUMBER", "ZIP_CODE","RESIDENTIAL_UNITS","COMMERCIAL_UNITS", "TOTAL_UNITS", "LAND_SQUARE_FEET", "GROSS_SQUARE_FEET","YEAR_BUILT", "TAX_CLASS_AT_TIME_OF_SALE","BUILDING_CLASS_AT_TIME_OF_SALE","SALE_PRICE", "SALE_DATE" )
setnames(df, colnms)

#correct data types
df$BOROUGH <-as.numeric(df$BOROUGH) 
df$BLOCK <-as.numeric(df$BLOCK)
df$LOT <-as.numeric(df$LOT)
df$BLOCK <-as.numeric(df$BLOCK)
df$RESIDENTIAL_UNITS <-as.numeric(df$RESIDENTIAL_UNITS)
df$COMMERCIAL_UNITS <-as.numeric(df$COMMERCIAL_UNITS)
df$TOTAL_UNITS <-as.numeric(df$TOTAL_UNITS)
df$LAND_SQUARE_FEET <-as.numeric(gsub(",","",df$LAND_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$GROSS_SQUARE_FEET <-as.numeric(gsub(",","",df$GROSS_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$YEAR_BUILT <-as.numeric(df$YEAR_BUILT)
df$TAX_CLASS_AT_TIME_OF_SALE <-as.numeric(df$TAX_CLASS_AT_TIME_OF_SALE)
df$SALE_PRICE <-as.numeric(gsub(",","",df$SALE_PRICE))
## Warning: NAs introduced by coercion
#check data types
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    18306 obs. of  21 variables:
##  $ BOROUGH                       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ NEIGHBORHOOD                  : chr  "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" ...
##  $ BUILDING_CLASS_CATEGORY       : chr  "07 RENTALS - WALKUP APARTMENTS             " "07 RENTALS - WALKUP APARTMENTS             " "07 RENTALS - WALKUP APARTMENTS             " "07 RENTALS - WALKUP APARTMENTS             " ...
##  $ TAX_CLASS_AT_PRESENT          : chr  "2A" "2" "2" "2B" ...
##  $ BLOCK                         : num  392 399 399 402 404 405 406 407 379 387 ...
##  $ LOT                           : num  6 26 39 21 55 16 32 18 34 153 ...
##  $ EASE_MENT                     : chr  " " " " " " " " ...
##  $ BUILDING_CLASS_AT_PRESENT     : chr  "C2" "C7" "C7" "C4" ...
##  $ ADDRESS                       : chr  "153 AVENUE B" "234 EAST 4TH   STREET" "197 EAST 3RD   STREET" "154 EAST 7TH STREET" ...
##  $ APARTMENT_NUMBER              : chr  " " " " " " " " ...
##  $ ZIP_CODE                      : chr  "10009" "10009" "10009" "10009" ...
##  $ RESIDENTIAL_UNITS             : num  5 28 16 10 6 20 8 44 15 24 ...
##  $ COMMERCIAL_UNITS              : num  0 3 1 0 0 0 0 2 0 0 ...
##  $ TOTAL_UNITS                   : num  5 31 17 10 6 20 8 46 15 24 ...
##  $ LAND_SQUARE_FEET              : num  1633 4616 2212 2272 2369 ...
##  $ GROSS_SQUARE_FEET             : num  6440 18690 7803 6794 4615 ...
##  $ YEAR_BUILT                    : num  1900 1900 1900 1913 1900 ...
##  $ TAX_CLASS_AT_TIME_OF_SALE     : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ BUILDING_CLASS_AT_TIME_OF_SALE: chr  "C2" "C7" "C7" "C4" ...
##  $ SALE_PRICE                    : num  6625000 NA NA 3936272 8000000 ...
##  $ SALE_DATE                     : chr  "7/19/2017" "12/14/2016" "12/9/2016" "9/23/2016" ...
tail(df,50)
## # A tibble: 50 x 21
##    BOROUGH             NEIGHBORHOOD
##      <dbl>                    <chr>
##  1       1 WASHINGTON HEIGHTS UPPER
##  2       1 WASHINGTON HEIGHTS UPPER
##  3       1 WASHINGTON HEIGHTS UPPER
##  4       1 WASHINGTON HEIGHTS UPPER
##  5       1 WASHINGTON HEIGHTS UPPER
##  6       1 WASHINGTON HEIGHTS UPPER
##  7       1 WASHINGTON HEIGHTS UPPER
##  8       1 WASHINGTON HEIGHTS UPPER
##  9       1 WASHINGTON HEIGHTS UPPER
## 10       1 WASHINGTON HEIGHTS UPPER
## # ... with 40 more rows, and 19 more variables:
## #   BUILDING_CLASS_CATEGORY <chr>, TAX_CLASS_AT_PRESENT <chr>,
## #   BLOCK <dbl>, LOT <dbl>, EASE_MENT <chr>,
## #   BUILDING_CLASS_AT_PRESENT <chr>, ADDRESS <chr>,
## #   APARTMENT_NUMBER <chr>, ZIP_CODE <chr>, RESIDENTIAL_UNITS <dbl>,
## #   COMMERCIAL_UNITS <dbl>, TOTAL_UNITS <dbl>, LAND_SQUARE_FEET <dbl>,
## #   GROSS_SQUARE_FEET <dbl>, YEAR_BUILT <dbl>,
## #   TAX_CLASS_AT_TIME_OF_SALE <dbl>, BUILDING_CLASS_AT_TIME_OF_SALE <chr>,
## #   SALE_PRICE <dbl>, SALE_DATE <chr>

Data Processing with dplyr & tidyr

mydat<- df%>% separate(BUILDING_CLASS_CATEGORY, into=c("BUILDING_CLASS_Number", "BUILDING_CLASS_CATEGORY"))
## Warning: Too many values at 18306 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
#calculate unit pice per feet for each sale
mydat2<- na.omit(mydat)%>% 
  select(NEIGHBORHOOD,BUILDING_CLASS_Number, BUILDING_CLASS_CATEGORY, GROSS_SQUARE_FEET, SALE_PRICE)%>%
  mutate(PRICE_PerFeet = round(SALE_PRICE/GROSS_SQUARE_FEET))%>% 
  group_by(BUILDING_CLASS_CATEGORY)
mydat2
## # A tibble: 1,006 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [21]
##     NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##            <chr>                 <chr>                   <chr>
##  1 ALPHABET CITY                    07                 RENTALS
##  2 ALPHABET CITY                    07                 RENTALS
##  3 ALPHABET CITY                    07                 RENTALS
##  4 ALPHABET CITY                    07                 RENTALS
##  5 ALPHABET CITY                    08                 RENTALS
##  6 ALPHABET CITY                    08                 RENTALS
##  7 ALPHABET CITY                    09                   COOPS
##  8 ALPHABET CITY                    14                 RENTALS
##  9 ALPHABET CITY                    14                 RENTALS
## 10 ALPHABET CITY                    14                 RENTALS
## # ... with 996 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
unique(unlist(df$BUILDING_CLASS_CATEGORY))
##  [1] "07 RENTALS - WALKUP APARTMENTS             " 
##  [2] "08 RENTALS - ELEVATOR APARTMENTS           " 
##  [3] "09 COOPS - WALKUP APARTMENTS               " 
##  [4] "10 COOPS - ELEVATOR APARTMENTS             " 
##  [5] "11A CONDO-RENTALS                           "
##  [6] "12 CONDOS - WALKUP APARTMENTS              " 
##  [7] "13 CONDOS - ELEVATOR APARTMENTS            " 
##  [8] "14 RENTALS - 4-10 UNIT                     " 
##  [9] "15 CONDOS - 2-10 UNIT RESIDENTIAL          " 
## [10] "16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT " 
## [11] "17 CONDO COOPS                             " 
## [12] "22 STORE BUILDINGS                         " 
## [13] "37 RELIGIOUS FACILITIES                    " 
## [14] "42 CONDO CULTURAL/MEDICAL/EDUCATIONAL/ETC  " 
## [15] "46 CONDO STORE BUILDINGS                   " 
## [16] "47 CONDO NON-BUSINESS STORAGE              " 
## [17] "01 ONE FAMILY DWELLINGS                    " 
## [18] "02 TWO FAMILY DWELLINGS                    " 
## [19] "03 THREE FAMILY DWELLINGS                  " 
## [20] "04 TAX CLASS 1 CONDOS                      " 
## [21] "21 OFFICE BUILDINGS                        " 
## [22] "23 LOFT BUILDINGS                          " 
## [23] "25 LUXURY HOTELS                           " 
## [24] "26 OTHER HOTELS                            " 
## [25] "28 COMMERCIAL CONDOS                       " 
## [26] "29 COMMERCIAL GARAGES                      " 
## [27] "35 INDOOR PUBLIC AND CULTURAL FACILITIES   " 
## [28] "38 ASYLUMS AND HOMES                       " 
## [29] "43 CONDO OFFICE BUILDINGS                  " 
## [30] "44 CONDO PARKING                           " 
## [31] "48 CONDO TERRACES/GARDENS/CABANAS          " 
## [32] "31 COMMERCIAL VACANT LAND                  " 
## [33] "32 HOSPITAL AND HEALTH FACILITIES          " 
## [34] "41 TAX CLASS 4 - OTHER                     " 
## [35] "18 TAX CLASS 3 - UNTILITY PROPERTIES       " 
## [36] "30 WAREHOUSES                              " 
## [37] "36 OUTDOOR RECREATIONAL FACILITIES         " 
## [38] "49 CONDO WAREHOUSES/FACTORY/INDUS          " 
## [39] "34 THEATRES                                " 
## [40] "27 FACTORIES                               " 
## [41] "40 SELECTED GOVERNMENTAL FACILITIES        " 
## [42] "45 CONDO HOTELS                            " 
## [43] "33 EDUCATIONAL FACILITIES                  " 
## [44] "11 SPECIAL CONDO BILLING LOTS              "
condo <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="CONDO")
condo
## # A tibble: 3 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##              NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##                     <chr>                 <chr>                   <chr>
## 1        ROOSEVELT ISLAND                   11A                   CONDO
## 2                    SOHO                   11A                   CONDO
## 3 UPPER WEST SIDE (79-96)                   11A                   CONDO
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## #   PRICE_PerFeet <dbl>
COOP <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="COOPS")
COOP
## # A tibble: 14 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##                NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##                       <chr>                 <chr>                   <chr>
##  1            ALPHABET CITY                    09                   COOPS
##  2                  CHELSEA                    09                   COOPS
##  3             MIDTOWN EAST                    10                   COOPS
##  4              MURRAY HILL                    10                   COOPS
##  5  UPPER EAST SIDE (59-79)                    09                   COOPS
##  6  UPPER EAST SIDE (59-79)                    10                   COOPS
##  7  UPPER EAST SIDE (59-79)                    10                   COOPS
##  8  UPPER EAST SIDE (79-96)                    10                   COOPS
##  9  UPPER WEST SIDE (59-79)                    10                   COOPS
## 10  UPPER WEST SIDE (59-79)                    10                   COOPS
## 11  UPPER WEST SIDE (79-96)                    10                   COOPS
## 12  UPPER WEST SIDE (79-96)                    10                   COOPS
## 13  UPPER WEST SIDE (79-96)                    10                   COOPS
## 14 WASHINGTON HEIGHTS LOWER                    09                   COOPS
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## #   PRICE_PerFeet <dbl>
onefamily <-mydat2%>% filter(BUILDING_CLASS_CATEGORY=="ONE")
onefamily
## # A tibble: 83 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##                 NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##                        <chr>                 <chr>                   <chr>
##  1                   CHELSEA                    01                     ONE
##  2                   CHELSEA                    01                     ONE
##  3                   CHELSEA                    01                     ONE
##  4                   CHELSEA                    01                     ONE
##  5 GREENWICH VILLAGE-CENTRAL                    01                     ONE
##  6 GREENWICH VILLAGE-CENTRAL                    01                     ONE
##  7 GREENWICH VILLAGE-CENTRAL                    01                     ONE
##  8 GREENWICH VILLAGE-CENTRAL                    01                     ONE
##  9 GREENWICH VILLAGE-CENTRAL                    01                     ONE
## 10 GREENWICH VILLAGE-CENTRAL                    01                     ONE
## # ... with 73 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
twofamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="TWO")
twofamily
## # A tibble: 84 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##                 NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##                        <chr>                 <chr>                   <chr>
##  1                   CHELSEA                    02                     TWO
##  2              EAST VILLAGE                    02                     TWO
##  3              EAST VILLAGE                    02                     TWO
##  4              EAST VILLAGE                    02                     TWO
##  5                  GRAMERCY                    02                     TWO
##  6                  GRAMERCY                    02                     TWO
##  7                  GRAMERCY                    02                     TWO
##  8                  GRAMERCY                    02                     TWO
##  9 GREENWICH VILLAGE-CENTRAL                    02                     TWO
## 10 GREENWICH VILLAGE-CENTRAL                    02                     TWO
## # ... with 74 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
threefamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="THREE")
threefamily
## # A tibble: 38 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##                 NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##                        <chr>                 <chr>                   <chr>
##  1                   CHELSEA                    03                   THREE
##  2                  GRAMERCY                    03                   THREE
##  3 GREENWICH VILLAGE-CENTRAL                    03                   THREE
##  4    GREENWICH VILLAGE-WEST                    03                   THREE
##  5    GREENWICH VILLAGE-WEST                    03                   THREE
##  6            HARLEM-CENTRAL                    03                   THREE
##  7            HARLEM-CENTRAL                    03                   THREE
##  8            HARLEM-CENTRAL                    03                   THREE
##  9            HARLEM-CENTRAL                    03                   THREE
## 10            HARLEM-CENTRAL                    03                   THREE
## # ... with 28 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
AptRent <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="RENTALS")
AptRent
## # A tibble: 512 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##     NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##            <chr>                 <chr>                   <chr>
##  1 ALPHABET CITY                    07                 RENTALS
##  2 ALPHABET CITY                    07                 RENTALS
##  3 ALPHABET CITY                    07                 RENTALS
##  4 ALPHABET CITY                    07                 RENTALS
##  5 ALPHABET CITY                    08                 RENTALS
##  6 ALPHABET CITY                    08                 RENTALS
##  7 ALPHABET CITY                    14                 RENTALS
##  8 ALPHABET CITY                    14                 RENTALS
##  9 ALPHABET CITY                    14                 RENTALS
## 10       CHELSEA                    07                 RENTALS
## # ... with 502 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>

Analysis residentail property price per feet

popType <- c("COOP","CONDO","OneFamily", "TwoFamily","ThreFamily", "AptRent")

mean_price <- c(mean(COOP$PRICE_PerFeet), mean(condo$PRICE_PerFeet),mean(onefamily$PRICE_PerFeet), mean(twofamily$PRICE_PerFeet),mean(threefamily$PRICE_PerFeet), mean(AptRent$PRICE_PerFeet))

library ('ggplot2')
barplot(mean_price, names.arg=popType, ylim=c(0,2500), main="Manhattan Residentail Property Sale Price (ft)", ylab="Price", xlab="Poperty Types", col="skyblue")

NYC Population 2016

The data source is from census.gov about population in New York from 2008 to 2016. Please see the original data in the link https://www.census.gov/data/datasets/2016/demo/popest/counties-detail.html

Import Data from a .csv file to a data frame “df”

# Import data
population <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/Project2/NYC_Population_2016.csv', stringsAsFactors = FALSE)) 

df<- population[, 5:8]
head(df,5) 
## # A tibble: 5 x 4
##         CTYNAME  YEAR AGEGRP TOT_POP
##           <chr> <int>  <int>   <int>
## 1 Albany County     1      0  304204
## 2 Albany County     1      1   15286
## 3 Albany County     1      2   16131
## 4 Albany County     1      3   17639
## 5 Albany County     1      4   23752

Data Processing with dplyr & tidyr

unique(unlist(df$AGEGRP))
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18
 df3 <- df %>% spread(AGEGRP,TOT_POP)
 df3 <- rename(df3,c("0"="g0" , "1"="g1","2"= "g2","3"= "g3","4"= "g4","5"= "g5","6"= "g6","7"= "g7","8"= "g8","9"= "g9","10"="g10", "11"="g11","12"= "g12","13"= "g13","14"= "g14","15"= "g15","16"= "g16","17"= "g17","18"= "g18" ))
 
 ageG <- df3 %>% filter(YEAR==9)
 
curAge <-c(sum(ageG$g1),sum(ageG$g2),sum(ageG$g3),sum(ageG$g4),sum(ageG$g5),sum(ageG$g6),sum(ageG$g7),sum(ageG$g8),sum(ageG$g9),sum(ageG$g10),sum(ageG$g11),sum(ageG$g12),sum(ageG$g13),sum(ageG$g14),sum(ageG$g15),sum(ageG$g16),sum(ageG$g17),sum(ageG$g18))

gName <-c("g1","g2","g3","g4","g5","g6","g7","g8","g9","g10","g11","g12","g13","g14","g15","g16","g17","g18")

ageStruc<-as.data.frame(gName,curAge)
## Warning in as.data.frame.vector(x, ..., nm = nm): 'row.names' is not a
## character vector of length 18 -- omitting it. Will be an error!
dim(ageStruc)
## [1] 18  1

Analysis New York Age Grops Size

# group number bigger, the age older
ggplot(ageStruc, aes(gName,curAge ))+
  geom_bar(stat = 'identity') +
  theme(axis.text.x=element_text(angle=90, hjust=1))+ coord_flip()+
  ggtitle('New York Age Grops')

#If I can get more than 30 years population data,I can grow rates then can estemate the future age structures by simulation.

NYC Rolling Sales - Brooklyn

The data source is from census.gov about household income in New York. Please see the original data in the link https://www.census.gov/data/tables/time-series/demo/income-poverty/cps-hinc/hinc-01.html.

Import Data from a .csv file to a data frame “df”

# Import data
brooklyn <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/Project2/NYCRollingSales9.2016_8.2017/rollingsales_brooklyn.csv', stringsAsFactors = FALSE)) 

colnames(brooklyn) <- as.character(unlist(brooklyn[4,]))
brooklyn = brooklyn[-4, ]

df<- brooklyn[4:nrow(brooklyn),]
tail(df,50) 
## # A tibble: 50 x 21
##    BOROUGH    NEIGHBORHOOD                   `BUILDING CLASS CATEGORY`
##      <chr>           <chr>                                       <chr>
##  1       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  2       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  3       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  4       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  5       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  6       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  7       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  8       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  9       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
## 10       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
## # ... with 40 more rows, and 18 more variables: `TAX CLASS AT
## #   PRESENT` <chr>, BLOCK <chr>, LOT <chr>, `EASE-MENT` <chr>, `BUILDING
## #   CLASS AT PRESENT` <chr>, ADDRESS <chr>, `APARTMENT NUMBER` <chr>, `ZIP
## #   CODE` <chr>, `RESIDENTIAL UNITS` <chr>, `COMMERCIAL UNITS` <chr>,
## #   `TOTAL UNITS` <chr>, `LAND SQUARE FEET` <chr>, `GROSS SQUARE
## #   FEET` <chr>, `YEAR BUILT` <chr>, `TAX CLASS AT TIME OF SALE` <chr>,
## #   `BUILDING CLASS AT TIME OF SALE` <chr>, ` SALE PRICE ` <chr>, `SALE
## #   DATE` <chr>
dim(df)
## [1] 24047    21

Data Cleaning

# Original raw data types 
#str(df)

# rename the table
library(data.table)
colnms <- c("BOROUGH", "NEIGHBORHOOD", "BUILDING_CLASS_CATEGORY", "TAX_CLASS_AT_PRESENT","BLOCK","LOT","EASE_MENT", "BUILDING_CLASS_AT_PRESENT","ADDRESS", "APARTMENT_NUMBER", "ZIP_CODE","RESIDENTIAL_UNITS","COMMERCIAL_UNITS", "TOTAL_UNITS", "LAND_SQUARE_FEET", "GROSS_SQUARE_FEET","YEAR_BUILT", "TAX_CLASS_AT_TIME_OF_SALE","BUILDING_CLASS_AT_TIME_OF_SALE","SALE_PRICE", "SALE_DATE" )
setnames(df, colnms)

#correct data types
df$BOROUGH <-as.numeric(df$BOROUGH) 
df$BLOCK <-as.numeric(df$BLOCK)
df$LOT <-as.numeric(df$LOT)
df$BLOCK <-as.numeric(df$BLOCK)
df$RESIDENTIAL_UNITS <-as.numeric(df$RESIDENTIAL_UNITS)
df$COMMERCIAL_UNITS <-as.numeric(df$COMMERCIAL_UNITS)
df$TOTAL_UNITS <-as.numeric(df$TOTAL_UNITS)
df$LAND_SQUARE_FEET <-as.numeric(gsub(",","",df$LAND_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$GROSS_SQUARE_FEET <-as.numeric(gsub(",","",df$GROSS_SQUARE_FEET))
## Warning: NAs introduced by coercion
df$YEAR_BUILT <-as.numeric(df$YEAR_BUILT)
df$TAX_CLASS_AT_TIME_OF_SALE <-as.numeric(df$TAX_CLASS_AT_TIME_OF_SALE)
df$SALE_PRICE <-as.numeric(gsub(",","",df$SALE_PRICE))
## Warning: NAs introduced by coercion
#check data types
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    24047 obs. of  21 variables:
##  $ BOROUGH                       : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ NEIGHBORHOOD                  : chr  "BATH BEACH" "BATH BEACH" "BATH BEACH" "BATH BEACH" ...
##  $ BUILDING_CLASS_CATEGORY       : chr  "01 ONE FAMILY DWELLINGS                    " "01 ONE FAMILY DWELLINGS                    " "01 ONE FAMILY DWELLINGS                    " "01 ONE FAMILY DWELLINGS                    " ...
##  $ TAX_CLASS_AT_PRESENT          : chr  "1" "1" "1" "1" ...
##  $ BLOCK                         : num  6360 6361 6364 6367 6374 ...
##  $ LOT                           : num  157 20 2 42 67 13 49 25 36 57 ...
##  $ EASE_MENT                     : chr  " " " " " " " " ...
##  $ BUILDING_CLASS_AT_PRESENT     : chr  "A5" "A5" "A5" "S1" ...
##  $ ADDRESS                       : chr  "36 BAY 10TH   STREET" "47 BAY 10TH STREET" "1649 BENSON AVENUE" "1730 86TH STREET" ...
##  $ APARTMENT_NUMBER              : chr  " " " " " " " " ...
##  $ ZIP_CODE                      : chr  "11228" "11228" "11214" "11214" ...
##  $ RESIDENTIAL_UNITS             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ COMMERCIAL_UNITS              : num  0 0 0 1 1 0 1 0 0 0 ...
##  $ TOTAL_UNITS                   : num  1 1 1 2 2 1 2 1 1 1 ...
##  $ LAND_SQUARE_FEET              : num  1547 1933 1638 1342 1740 ...
##  $ GROSS_SQUARE_FEET             : num  1428 1660 972 1920 3240 ...
##  $ YEAR_BUILT                    : num  1930 1930 1930 1926 1925 ...
##  $ TAX_CLASS_AT_TIME_OF_SALE     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BUILDING_CLASS_AT_TIME_OF_SALE: chr  "A5" "A5" "A5" "S1" ...
##  $ SALE_PRICE                    : num  800000 NA NA 1200000 1090000 NA 1060000 2050000 950000 NA ...
##  $ SALE_DATE                     : chr  "2/27/2017" "3/4/2017" "10/6/2016" "3/31/2017" ...
tail(df,50)
## # A tibble: 50 x 21
##    BOROUGH    NEIGHBORHOOD                     BUILDING_CLASS_CATEGORY
##      <dbl>           <chr>                                       <chr>
##  1       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  2       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  3       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  4       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  5       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  6       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  7       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  8       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
##  9       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
## 10       3 WYCKOFF HEIGHTS 07 RENTALS - WALKUP APARTMENTS             
## # ... with 40 more rows, and 18 more variables:
## #   TAX_CLASS_AT_PRESENT <chr>, BLOCK <dbl>, LOT <dbl>, EASE_MENT <chr>,
## #   BUILDING_CLASS_AT_PRESENT <chr>, ADDRESS <chr>,
## #   APARTMENT_NUMBER <chr>, ZIP_CODE <chr>, RESIDENTIAL_UNITS <dbl>,
## #   COMMERCIAL_UNITS <dbl>, TOTAL_UNITS <dbl>, LAND_SQUARE_FEET <dbl>,
## #   GROSS_SQUARE_FEET <dbl>, YEAR_BUILT <dbl>,
## #   TAX_CLASS_AT_TIME_OF_SALE <dbl>, BUILDING_CLASS_AT_TIME_OF_SALE <chr>,
## #   SALE_PRICE <dbl>, SALE_DATE <chr>

Data Processing with dplyr & tidyr

mydat<- df%>% separate(BUILDING_CLASS_CATEGORY, into=c("BUILDING_CLASS_Number", "BUILDING_CLASS_CATEGORY"))
## Warning: Too many values at 24047 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
#calculate unit pice per feet for each sale
mydat2<- na.omit(mydat)%>% 
  select(NEIGHBORHOOD,BUILDING_CLASS_Number, BUILDING_CLASS_CATEGORY, GROSS_SQUARE_FEET, SALE_PRICE)%>%
  mutate(PRICE_PerFeet = round(SALE_PRICE/GROSS_SQUARE_FEET))%>% 
  group_by(BUILDING_CLASS_CATEGORY)
mydat2
## # A tibble: 8,669 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [20]
##    NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##           <chr>                 <chr>                   <chr>
##  1   BATH BEACH                    01                     ONE
##  2   BATH BEACH                    01                     ONE
##  3   BATH BEACH                    01                     ONE
##  4   BATH BEACH                    01                     ONE
##  5   BATH BEACH                    01                     ONE
##  6   BATH BEACH                    01                     ONE
##  7   BATH BEACH                    01                     ONE
##  8   BATH BEACH                    01                     ONE
##  9   BATH BEACH                    01                     ONE
## 10   BATH BEACH                    01                     ONE
## # ... with 8,659 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
unique(unlist(df$BUILDING_CLASS_CATEGORY))
##  [1] "01 ONE FAMILY DWELLINGS                    " 
##  [2] "02 TWO FAMILY DWELLINGS                    " 
##  [3] "03 THREE FAMILY DWELLINGS                  " 
##  [4] "04 TAX CLASS 1 CONDOS                      " 
##  [5] "05 TAX CLASS 1 VACANT LAND                 " 
##  [6] "07 RENTALS - WALKUP APARTMENTS             " 
##  [7] "09 COOPS - WALKUP APARTMENTS               " 
##  [8] "10 COOPS - ELEVATOR APARTMENTS             " 
##  [9] "12 CONDOS - WALKUP APARTMENTS              " 
## [10] "13 CONDOS - ELEVATOR APARTMENTS            " 
## [11] "14 RENTALS - 4-10 UNIT                     " 
## [12] "15 CONDOS - 2-10 UNIT RESIDENTIAL          " 
## [13] "16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT " 
## [14] "21 OFFICE BUILDINGS                        " 
## [15] "22 STORE BUILDINGS                         " 
## [16] "43 CONDO OFFICE BUILDINGS                  " 
## [17] "44 CONDO PARKING                           " 
## [18] "06 TAX CLASS 1 - OTHER                     " 
## [19] "08 RENTALS - ELEVATOR APARTMENTS           " 
## [20] "17 CONDO COOPS                             " 
## [21] "29 COMMERCIAL GARAGES                      " 
## [22] "30 WAREHOUSES                              " 
## [23] "46 CONDO STORE BUILDINGS                   " 
## [24] "47 CONDO NON-BUSINESS STORAGE              " 
## [25] "11A CONDO-RENTALS                           "
## [26] "23 LOFT BUILDINGS                          " 
## [27] "26 OTHER HOTELS                            " 
## [28] "27 FACTORIES                               " 
## [29] "28 COMMERCIAL CONDOS                       " 
## [30] "31 COMMERCIAL VACANT LAND                  " 
## [31] "32 HOSPITAL AND HEALTH FACILITIES          " 
## [32] "37 RELIGIOUS FACILITIES                    " 
## [33] "38 ASYLUMS AND HOMES                       " 
## [34] "41 TAX CLASS 4 - OTHER                     " 
## [35] "48 CONDO TERRACES/GARDENS/CABANAS          " 
## [36] "49 CONDO WAREHOUSES/FACTORY/INDUS          " 
## [37] "35 INDOOR PUBLIC AND CULTURAL FACILITIES   " 
## [38] "33 EDUCATIONAL FACILITIES                  " 
## [39] "42 CONDO CULTURAL/MEDICAL/EDUCATIONAL/ETC  " 
## [40] "45 CONDO HOTELS                            " 
## [41] "11 SPECIAL CONDO BILLING LOTS              " 
## [42] "39 TRANSPORTATION FACILITIES               " 
## [43] "36 OUTDOOR RECREATIONAL FACILITIES         "
#This is a wrong value and won't be in the analysis.
condo <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="CONDO")
condo
## # A tibble: 1 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##           NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##                  <chr>                 <chr>                   <chr>
## 1 DOWNTOWN-FULTON MALL                   11A                   CONDO
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## #   PRICE_PerFeet <dbl>
COOP <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="COOPS")
COOP
## # A tibble: 5 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##     NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##            <chr>                 <chr>                   <chr>
## 1       BUSHWICK                    09                   COOPS
## 2      FLATLANDS                    10                   COOPS
## 3     KENSINGTON                    10                   COOPS
## 4     KENSINGTON                    10                   COOPS
## 5 SHEEPSHEAD BAY                    10                   COOPS
## # ... with 3 more variables: GROSS_SQUARE_FEET <dbl>, SALE_PRICE <dbl>,
## #   PRICE_PerFeet <dbl>
onefamily <-mydat2%>% filter(BUILDING_CLASS_CATEGORY=="ONE")
onefamily
## # A tibble: 2,345 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##    NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##           <chr>                 <chr>                   <chr>
##  1   BATH BEACH                    01                     ONE
##  2   BATH BEACH                    01                     ONE
##  3   BATH BEACH                    01                     ONE
##  4   BATH BEACH                    01                     ONE
##  5   BATH BEACH                    01                     ONE
##  6   BATH BEACH                    01                     ONE
##  7   BATH BEACH                    01                     ONE
##  8   BATH BEACH                    01                     ONE
##  9   BATH BEACH                    01                     ONE
## 10   BATH BEACH                    01                     ONE
## # ... with 2,335 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
twofamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="TWO")
twofamily
## # A tibble: 3,632 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##    NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##           <chr>                 <chr>                   <chr>
##  1   BATH BEACH                    02                     TWO
##  2   BATH BEACH                    02                     TWO
##  3   BATH BEACH                    02                     TWO
##  4   BATH BEACH                    02                     TWO
##  5   BATH BEACH                    02                     TWO
##  6   BATH BEACH                    02                     TWO
##  7   BATH BEACH                    02                     TWO
##  8   BATH BEACH                    02                     TWO
##  9   BATH BEACH                    02                     TWO
## 10   BATH BEACH                    02                     TWO
## # ... with 3,622 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
threefamily <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="THREE")
threefamily
## # A tibble: 1,143 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##    NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##           <chr>                 <chr>                   <chr>
##  1   BATH BEACH                    03                   THREE
##  2   BATH BEACH                    03                   THREE
##  3   BATH BEACH                    03                   THREE
##  4   BATH BEACH                    03                   THREE
##  5   BATH BEACH                    03                   THREE
##  6   BATH BEACH                    03                   THREE
##  7   BATH BEACH                    03                   THREE
##  8   BATH BEACH                    03                   THREE
##  9   BATH BEACH                    03                   THREE
## 10   BATH BEACH                    03                   THREE
## # ... with 1,133 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>
AptRent <- mydat2%>% filter(BUILDING_CLASS_CATEGORY=="RENTALS")
AptRent
## # A tibble: 1,045 x 6
## # Groups:   BUILDING_CLASS_CATEGORY [1]
##    NEIGHBORHOOD BUILDING_CLASS_Number BUILDING_CLASS_CATEGORY
##           <chr>                 <chr>                   <chr>
##  1   BATH BEACH                    07                 RENTALS
##  2   BATH BEACH                    07                 RENTALS
##  3   BATH BEACH                    07                 RENTALS
##  4    BAY RIDGE                    07                 RENTALS
##  5    BAY RIDGE                    07                 RENTALS
##  6    BAY RIDGE                    07                 RENTALS
##  7    BAY RIDGE                    07                 RENTALS
##  8    BAY RIDGE                    07                 RENTALS
##  9    BAY RIDGE                    07                 RENTALS
## 10    BAY RIDGE                    07                 RENTALS
## # ... with 1,035 more rows, and 3 more variables: GROSS_SQUARE_FEET <dbl>,
## #   SALE_PRICE <dbl>, PRICE_PerFeet <dbl>

Analysis residentail property price per feet

popType <- c("COOP","OneFamily", "TwoFamily","ThreFamily", "AptRent")

mean_price <- c(mean(COOP$PRICE_PerFeet),mean(onefamily$PRICE_PerFeet), mean(twofamily$PRICE_PerFeet),mean(threefamily$PRICE_PerFeet), mean(AptRent$PRICE_PerFeet))

library ('ggplot2')
barplot(mean_price, names.arg=popType, ylim=c(0,1000), main="Brooklyn Residentail Property Sale Price (ft)", ylab="Price", xlab="Poperty Types", col="red")