For this project, I will be analyizing the sales of proeprties for Brooklyn from 2017-2018
csv_file = "https://raw.githubusercontent.com/dapolloxp/R-Projects/master/rollingsales_brooklyn.csv"
pullCSVFile <- function (url)
{
if(url.exists(url))
{
expr <- tryCatch(
{
message(paste("CSV File ", url, " exists"))
message("Downloading CSV File from github...\nThis may take a while....")
out_file <- getURL(url)
csv_file <- read.csv(text = out_file)
},
error = function(err)
{
print(paste("Couldn't download file: ", err))
},
finally = {
return(csv_file)
}
)
}
else
{
print(paste("Could not location CSV file at: ", url))
break()
}
}
result <- pullCSVFile(csv_file)
## CSV File https://raw.githubusercontent.com/dapolloxp/R-Projects/master/rollingsales_brooklyn.csv exists
## Downloading CSV File from github...
## This may take a while....
## The initial data has columns that we are not interested in for the purposes of this report
## This is the list of columns within the CSV file
names(result)
## [1] "BOROUGH" "NEIGHBORHOOD"
## [3] "BUILDING.CLASS.CATEGORY" "TAX.CLASS.AT.PRESENT"
## [5] "BLOCK" "LOT"
## [7] "EASE.MENT" "BUILDING.CLASS.AT.PRESENT"
## [9] "ADDRESS" "APARTMENT.NUMBER"
## [11] "ZIP.CODE" "RESIDENTIAL.UNITS"
## [13] "COMMERCIAL.UNITS" "TOTAL.UNITS"
## [15] "LAND.SQUARE.FEET" "GROSS.SQUARE.FEET"
## [17] "YEAR.BUILT" "TAX.CLASS.AT.TIME.OF.SALE"
## [19] "BUILDING.CLASS.AT.TIME.OF.SALE" "SALE.PRICE"
## [21] "SALE.DATE"
## I will select a smaller subset of this data as there are certain fields that I am not interested in
reduced_result <- result %>% select("NEIGHBORHOOD","BUILDING.CLASS.CATEGORY", "ADDRESS", "APARTMENT.NUMBER","ZIP.CODE","RESIDENTIAL.UNITS", "COMMERCIAL.UNITS", "TOTAL.UNITS", "LAND.SQUARE.FEET", "GROSS.SQUARE.FEET", "SALE.PRICE", "SALE.DATE")
## convert sales price to numeric
reduced_result[["SALE.PRICE"]] <- as.numeric((gsub("[^0-9]","", reduced_result[["SALE.PRICE"]])))
## convert sales date from character to date
reduced_result[["SALE.DATE"]] <- as.Date(reduced_result[["SALE.DATE"]], "%m/%d/%y")
##
reduced_result[["LAND.SQUARE.FEET"]] <- as.numeric((gsub("[^0-9]","", reduced_result[["LAND.SQUARE.FEET"]])))
##
reduced_result[["GROSS.SQUARE.FEET"]] <- as.numeric((gsub("[^0-9]","", reduced_result[["GROSS.SQUARE.FEET"]])))
reduced_result[["TOTAL.UNITS"]] <- as.integer(reduced_result[["TOTAL.UNITS"]])
reduced_result[["RESIDENTIAL.UNITS"]] <- as.integer(reduced_result[["RESIDENTIAL.UNITS"]])
reduced_result[["ADDRESS"]] <- as.character(reduced_result[["ADDRESS"]])
##leaflet() %>% addTiles() %>% setView(-73.994, 40.6782, zoom = 11)
### Removing any entries that are for 2017
sales_2018 <- subset(reduced_result, reduced_result$SALE.DATE > "2017-12-31")
#reduced_result
## Since I want to do computation on price, I am filtering any values that do not contain price sales
recorded_sales_2018 <- sales_2018 %>% filter(SALE.PRICE != "NA")
## Below is a summary of sale prices
## The medium price is $790,800 for Brooklyn and the average sale price for 2018 was $1,356,196 million.
summary(recorded_sales_2018)
## NEIGHBORHOOD
## BEDFORD STUYVESANT: 773
## EAST NEW YORK : 617
## FLATBUSH-EAST : 526
## BOROUGH PARK : 524
## SHEEPSHEAD BAY : 524
## BAY RIDGE : 522
## (Other) :9915
## BUILDING.CLASS.CATEGORY ADDRESS
## 02 TWO FAMILY DWELLINGS :3194 Length:13401
## 13 CONDOS - ELEVATOR APARTMENTS :1996 Class :character
## 01 ONE FAMILY DWELLINGS :1985 Mode :character
## 10 COOPS - ELEVATOR APARTMENTS :1653
## 03 THREE FAMILY DWELLINGS :1041
## 15 CONDOS - 2-10 UNIT RESIDENTIAL : 720
## (Other) :2812
## APARTMENT.NUMBER ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS
## :9560 Min. : 0 Min. : 1.00 Min. : 0.0000
## 4 : 145 1st Qu.:11210 1st Qu.: 2.00 1st Qu.: 0.0000
## 3A : 116 Median :11219 Median : 2.00 Median : 0.0000
## 2 : 115 Mean :11186 Mean : 15.31 Mean : 0.1322
## 2B : 115 3rd Qu.:11231 3rd Qu.: 29.00 3rd Qu.: 0.0000
## 3 : 112 Max. :11249 Max. :102.00 Max. :26.0000
## (Other):3238
## TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET SALE.PRICE
## Min. : 1.00 Min. : 0 Min. : 0 Min. : 1
## 1st Qu.: 2.00 1st Qu.: 0 1st Qu.: 0 1st Qu.: 485000
## Median : 2.00 Median : 1992 Median : 1400 Median : 790800
## Mean : 17.14 Mean : 3790 Mean : 2324 Mean : 1356196
## 3rd Qu.: 31.00 3rd Qu.: 2517 3rd Qu.: 2391 3rd Qu.: 1300000
## Max. :101.00 Max. :970000 Max. :997720 Max. :869612895
##
## SALE.DATE
## Min. :2018-01-01
## 1st Qu.:2018-03-27
## Median :2018-06-18
## Mean :2018-06-15
## 3rd Qu.:2018-08-30
## Max. :2018-11-30
##
neighborhood_avgsp <- recorded_sales_2018 %>% group_by(NEIGHBORHOOD) %>% summarize(AvgSalesPrice=mean(SALE.PRICE)) %>% arrange(desc(AvgSalesPrice))
neighborhood_avgsp_no_spring_creek <- recorded_sales_2018 %>% filter(NEIGHBORHOOD != 'SPRING CREEK') %>% group_by(NEIGHBORHOOD) %>% summarize(AvgSalesPrice=mean(SALE.PRICE)) %>% arrange(desc(AvgSalesPrice))
class(neighborhood_avgsp)
## [1] "tbl_df" "tbl" "data.frame"
g <- ggplot(neighborhood_avgsp, aes(x = neighborhood_avgsp$NEIGHBORHOOD, y = neighborhood_avgsp$AvgSalesPrice/1000))
g + geom_col()+ xlab("Neighborhood") + ylab("Price in Millions per thousand") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
g <- ggplot(neighborhood_avgsp_no_spring_creek, aes(x = neighborhood_avgsp_no_spring_creek$NEIGHBORHOOD, y = neighborhood_avgsp_no_spring_creek$AvgSalesPrice/1000))
g + geom_col()+ xlab("Neighborhood") + ylab("Price in Millions per thousand") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
## For the next comparsion, I would like to know if square footage has a linear path to selling price. I assume this will be the case, but it may be exponential. ### Due to several properties not having recorded square footage, I will need to filter these properties out.
recorded_sales_with_gross_sqft <- recorded_sales_2018 %>% filter(recorded_sales_2018$GROSS.SQUARE.FEET > 0)
ggplot(recorded_sales_with_gross_sqft, aes(x=recorded_sales_with_gross_sqft$GROSS.SQUARE.FEET, y=recorded_sales_with_gross_sqft$SALE.PRICE/1000)) +geom_point() + xlab("Square Footage") + ylab("Price: Millions per 1000")
require(scales)
## Loading required package: scales
recorded_sales_with_gross_sqft <- recorded_sales_2018 %>% filter(recorded_sales_2018$GROSS.SQUARE.FEET > 0 & recorded_sales_2018$NEIGHBORHOOD != 'SPRING CREEK')
ggplot(recorded_sales_with_gross_sqft, aes(x=recorded_sales_with_gross_sqft$GROSS.SQUARE.FEET, y=recorded_sales_with_gross_sqft$SALE.PRICE/1000)) +geom_point(aes(color=recorded_sales_with_gross_sqft$SALE.PRICE)) + xlab("Square Footage") + ylab("Price: Millions per 1000") + scale_x_continuous(labels = comma) +theme(legend.position = "none") + geom_smooth(method = "lm")
## neighborhood_avgsp_no_spring_creek <- recorded_sales_2018 %>% filter(NEIGHBORHOOD != 'SPRING CREEK')
Lets count the number of sales per neighborhood. We see that Bedford Stuyvesant, East New York, and East Flat Bush recorded the most number of sales.
neighborhoods_with_most_sales <- recorded_sales_2018 %>% group_by(NEIGHBORHOOD) %>% count() %>% arrange(desc(n))
neighborhoods_with_most_sales
## # A tibble: 61 x 2
## # Groups: NEIGHBORHOOD [61]
## NEIGHBORHOOD n
## <fct> <int>
## 1 BEDFORD STUYVESANT 773
## 2 EAST NEW YORK 617
## 3 FLATBUSH-EAST 526
## 4 BOROUGH PARK 524
## 5 SHEEPSHEAD BAY 524
## 6 BAY RIDGE 522
## 7 PARK SLOPE 501
## 8 CANARSIE 500
## 9 CROWN HEIGHTS 433
## 10 FLATBUSH-CENTRAL 400
## # ... with 51 more rows
g <- ggplot(neighborhoods_with_most_sales, aes(x = neighborhoods_with_most_sales$NEIGHBORHOOD, y = neighborhoods_with_most_sales$n))
g + geom_col()+ xlab("Neighborhood") + ylab("Total Sales") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
summary(recorded_sales_2018)
## NEIGHBORHOOD
## BEDFORD STUYVESANT: 773
## EAST NEW YORK : 617
## FLATBUSH-EAST : 526
## BOROUGH PARK : 524
## SHEEPSHEAD BAY : 524
## BAY RIDGE : 522
## (Other) :9915
## BUILDING.CLASS.CATEGORY ADDRESS
## 02 TWO FAMILY DWELLINGS :3194 Length:13401
## 13 CONDOS - ELEVATOR APARTMENTS :1996 Class :character
## 01 ONE FAMILY DWELLINGS :1985 Mode :character
## 10 COOPS - ELEVATOR APARTMENTS :1653
## 03 THREE FAMILY DWELLINGS :1041
## 15 CONDOS - 2-10 UNIT RESIDENTIAL : 720
## (Other) :2812
## APARTMENT.NUMBER ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS
## :9560 Min. : 0 Min. : 1.00 Min. : 0.0000
## 4 : 145 1st Qu.:11210 1st Qu.: 2.00 1st Qu.: 0.0000
## 3A : 116 Median :11219 Median : 2.00 Median : 0.0000
## 2 : 115 Mean :11186 Mean : 15.31 Mean : 0.1322
## 2B : 115 3rd Qu.:11231 3rd Qu.: 29.00 3rd Qu.: 0.0000
## 3 : 112 Max. :11249 Max. :102.00 Max. :26.0000
## (Other):3238
## TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET SALE.PRICE
## Min. : 1.00 Min. : 0 Min. : 0 Min. : 1
## 1st Qu.: 2.00 1st Qu.: 0 1st Qu.: 0 1st Qu.: 485000
## Median : 2.00 Median : 1992 Median : 1400 Median : 790800
## Mean : 17.14 Mean : 3790 Mean : 2324 Mean : 1356196
## 3rd Qu.: 31.00 3rd Qu.: 2517 3rd Qu.: 2391 3rd Qu.: 1300000
## Max. :101.00 Max. :970000 Max. :997720 Max. :869612895
##
## SALE.DATE
## Min. :2018-01-01
## 1st Qu.:2018-03-27
## Median :2018-06-18
## Mean :2018-06-15
## 3rd Qu.:2018-08-30
## Max. :2018-11-30
##
Originally we sought out to answer the following questions:
Based on our analysis above, we were able to determine that the following neighorboods experienced the most sales:
We also were able to determine the average sales prices per neighborhood
Based on our histogram above, we were also able to see what months during the year most sales took place. As expected, July was active, but to my surprise, January had a relatively higher number of recorded sales, which is generally atypical in most areas. That being said, New York City isn’t a typical place.
Finally, based on our summary statistics of our data frame, we were able to determine that the mean and median square footage were as follows: