# load data
library(httr)
library(RCurl)
library(dplyr)
# Download the Brooklyn rolling-sales CSV as one string, parse it into a data
# frame, and keep only the columns the rest of the script works with.
csv_file <- "https://raw.githubusercontent.com/dapolloxp/R-Projects/master/rollingsales_brooklyn.csv"
out_file <- getURL(csv_file)
property.data <- read.csv(text = out_file)
reduced_result <- select(
  property.data,
  "NEIGHBORHOOD", "BUILDING.CLASS.CATEGORY", "YEAR.BUILT", "ADDRESS",
  "APARTMENT.NUMBER", "ZIP.CODE", "RESIDENTIAL.UNITS", "COMMERCIAL.UNITS",
  "TOTAL.UNITS", "LAND.SQUARE.FEET", "GROSS.SQUARE.FEET", "SALE.PRICE",
  "SALE.DATE"
)
## Convert the raw columns to the types the analysis needs.

## Remove every non-digit character (currency symbols, thousands separators,
## blanks, ...) and convert what is left to numeric. A value that contains no
## digits at all becomes NA.
## NOTE(review): the pattern also removes "." and "-", so decimal or negative
## amounts would be mangled -- confirm the file never contains them.
digits_to_numeric <- function(x) {
  as.numeric(gsub("[^0-9]", "", x))
}

## Convert a count column to integer *by value*.
## Calling as.integer() directly on a factor returns the internal level codes
## rather than the numbers that were read from the file, and calling it on
## text such as "1,234" returns NA. Going through as.character() and stripping
## non-digits first avoids both problems. Columns that read.csv() already
## parsed as numbers are converted unchanged.
values_to_integer <- function(x) {
  if (is.numeric(x)) {
    return(as.integer(x))
  }
  as.integer(gsub("[^0-9]", "", as.character(x)))
}

## convert sales price to numeric
reduced_result[["SALE.PRICE"]] <-
  digits_to_numeric(reduced_result[["SALE.PRICE"]])
## convert sales date from character to date
## NOTE(review): "%y" expects a two-digit year -- confirm this matches the file.
reduced_result[["SALE.DATE"]] <-
  as.Date(reduced_result[["SALE.DATE"]], format = "%m/%d/%y")
## ZIP codes are labels rather than quantities, so store them as a factor
reduced_result[["ZIP.CODE"]] <-
  as.factor(as.character(reduced_result[["ZIP.CODE"]]))
## convert square footage to numeric
reduced_result[["LAND.SQUARE.FEET"]] <-
  digits_to_numeric(reduced_result[["LAND.SQUARE.FEET"]])
reduced_result[["GROSS.SQUARE.FEET"]] <-
  digits_to_numeric(reduced_result[["GROSS.SQUARE.FEET"]])
## convert unit counts to integer (by value, see values_to_integer above)
reduced_result[["TOTAL.UNITS"]] <-
  values_to_integer(reduced_result[["TOTAL.UNITS"]])
reduced_result[["RESIDENTIAL.UNITS"]] <-
  values_to_integer(reduced_result[["RESIDENTIAL.UNITS"]])
## addresses are free text, not categories
reduced_result[["ADDRESS"]] <- as.character(reduced_result[["ADDRESS"]])
##leaflet() %>% addTiles() %>% setView(-73.994, 40.6782, zoom = 11)
### Removing any entries that are for 2017: keep only sales dated after
### 2017-12-31. Rows whose date is NA are dropped by subset() as well.
sales_2018 <- subset(reduced_result, SALE.DATE > as.Date("2017-12-31"))
## Since I want to do computation on price, I am filtering out any rows that
## have no recorded sale price. Missing values have to be tested with is.na();
## comparing a numeric column against the string "NA" only worked through
## implicit coercion and filter() dropping NA conditions.
recorded_sales_2018 <- sales_2018 %>% filter(!is.na(SALE.PRICE))
## Below is a summary of sale prices
## The median sale price for Brooklyn is $790,800 and the average sale price for 2018 was $1,356,196.
I am trying to determine if all properties sold in 2018 (across different unit types) increased in price at a similar rate across different zip codes within Brooklyn, NY. Ideally, I would like to know if this is uniform across all five boroughs, but that would require collecting data for all five boroughs.
What are the cases, and how many are there?
The cases are the total number of sales in Kings County (Brooklyn, NY). In this dataset there are 23207 observations.
Describe the method of data collection.
I pulled the XLS file from the NYC.gov site. I then converted it to a CSV file and uploaded it to GitHub.
What type of study is this (observational/experiment)?
This is an observational study since all of these are past recorded events.
If you collected the data, state self-collected. If not, provide a citation/link.
https://www1.nyc.gov/site/finance/taxes/property-rolling-sales-data.page
What is the response variable? Is it quantitative or qualitative?
The response variable is the sales price and it happens to be quantitative in this case.
You should have two independent variables, one quantitative and one qualitative.
The independent variables are land.square.feet (quantitative), and neighborhood (qualitative). I can also use zip code in place of neighborhood.
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
The summary function is used below. For some of the variables, it doesn’t make sense to use the summary statistics, but I have included them for completeness.
For the land square feet boxplot, I removed the outliers, as they would skew the results significantly.
# Extra packages for this section: randomcoloR supplies randomColor(), which is
# used further down to build the colour palette for the pairs plot.
# NOTE(review): no lattice function is called in the visible part of this
# script -- confirm whether this library is still needed.
library(lattice)
library(randomcoloR)
# Print per-column summary statistics for the 2018 sales that have a recorded
# price; the console output is reproduced in the comment block that follows.
summary(recorded_sales_2018)
## NEIGHBORHOOD
## BEDFORD STUYVESANT: 773
## EAST NEW YORK : 617
## FLATBUSH-EAST : 526
## BOROUGH PARK : 524
## SHEEPSHEAD BAY : 524
## BAY RIDGE : 522
## (Other) :9915
## BUILDING.CLASS.CATEGORY YEAR.BUILT
## 02 TWO FAMILY DWELLINGS :3194 Min. : 0
## 13 CONDOS - ELEVATOR APARTMENTS :1996 1st Qu.:1910
## 01 ONE FAMILY DWELLINGS :1985 Median :1930
## 10 COOPS - ELEVATOR APARTMENTS :1653 Mean :1814
## 03 THREE FAMILY DWELLINGS :1041 3rd Qu.:1963
## 15 CONDOS - 2-10 UNIT RESIDENTIAL : 720 Max. :2018
## (Other) :2812
## ADDRESS APARTMENT.NUMBER ZIP.CODE RESIDENTIAL.UNITS
## Length:13401 :9560 11235 : 680 Min. : 1.00
## Class :character 4 : 145 11201 : 676 1st Qu.: 2.00
## Mode :character 3A : 116 11234 : 676 Median : 2.00
## 2 : 115 11229 : 635 Mean : 15.31
## 2B : 115 11215 : 625 3rd Qu.: 29.00
## 3 : 112 11207 : 496 Max. :102.00
## (Other):3238 (Other):9613
## COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET
## Min. : 0.0000 Min. : 1.00 Min. : 0 Min. : 0
## 1st Qu.: 0.0000 1st Qu.: 2.00 1st Qu.: 0 1st Qu.: 0
## Median : 0.0000 Median : 2.00 Median : 1992 Median : 1400
## Mean : 0.1322 Mean : 17.14 Mean : 3790 Mean : 2324
## 3rd Qu.: 0.0000 3rd Qu.: 31.00 3rd Qu.: 2517 3rd Qu.: 2391
## Max. :26.0000 Max. :101.00 Max. :970000 Max. :997720
##
## SALE.PRICE SALE.DATE
## Min. : 1 Min. :2018-01-01
## 1st Qu.: 485000 1st Qu.:2018-03-27
## Median : 790800 Median :2018-06-18
## Mean : 1356196 Mean :2018-06-15
## 3rd Qu.: 1300000 3rd Qu.:2018-08-30
## Max. :869612895 Max. :2018-11-30
##
## Exploratory plots.
## NOTE(review): these plots use reduced_result (every row, all dates), whereas
## the summary above is computed on recorded_sales_2018 -- confirm which data
## set is intended here.

## Distribution of lot size; outline = FALSE hides the outlier points so the
## box itself stays readable.
boxplot(reduced_result$LAND.SQUARE.FEET, outline = FALSE)
## Distribution of sale price (outliers are drawn for this one).
boxplot(reduced_result$SALE.PRICE)
## Sale price against lot size.
plot(reduced_result$LAND.SQUARE.FEET, reduced_result$SALE.PRICE,
     xlab = "Square Feet ", ylab = "Sales Price")

## One random colour per ZIP code level. The palette size is taken from the
## factor itself rather than hard-coded, so every level gets a colour even if
## the number of ZIP codes in the data changes (indexing past the end of the
## palette would otherwise produce NA colours).
## NOTE(review): the hue and luminosity vectors look like the complete choice
## lists of randomColor(); they are passed through unchanged -- confirm whether
## they can simply be omitted.
zip_level_count <- nlevels(reduced_result$ZIP.CODE)
my_cols <- randomColor(
  count = zip_level_count,
  hue = c(" ", "random", "red", "orange", "yellow",
          "green", "blue", "purple", "pink", "monochrome"),
  luminosity = c(" ", "random", "light", "bright", "dark")
)
## `cols` was assigned alongside `my_cols` originally; the alias is kept in
## case code outside this section refers to it.
cols <- my_cols

## Scatterplot matrix of the unit counts and lot size (the columns previously
## selected by position 7:10), coloured by ZIP code. Indexing the palette with
## the factor uses its integer level codes.
pair_columns <- c("RESIDENTIAL.UNITS", "COMMERCIAL.UNITS",
                  "TOTAL.UNITS", "LAND.SQUARE.FEET")
pairs(reduced_result[, pair_columns], pch = 19, cex = 0.5,
      col = my_cols[reduced_result$ZIP.CODE], lower.panel = NULL)