library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
property_sales <- read.csv("/Users/blessinga/Desktop/nyc-property-sales.csv", header=TRUE, sep=",")
str(property_sales)
## 'data.frame': 1048575 obs. of 21 variables:
## $ BOROUGH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ NEIGHBORHOOD : chr "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" ...
## $ BUILDING.CLASS.CATEGORY : chr "01 ONE FAMILY DWELLINGS" "02 TWO FAMILY DWELLINGS" "07 RENTALS - WALKUP APARTMENTS" "07 RENTALS - WALKUP APARTMENTS" ...
## $ TAX.CLASS.AT.PRESENT : chr "1" "1" "2B" "2B" ...
## $ BLOCK : int 374 377 373 373 376 377 377 379 389 389 ...
## $ LOT : int 46 1 16 17 54 52 52 25 45 47 ...
## $ EASE.MENT : chr "" "" "" "" ...
## $ BUILDING.CLASS.AT.PRESENT : chr "A4" "S2" "C1" "C1" ...
## $ ADDRESS : chr "347 EAST 4TH STREET" "110 AVENUE C" "326 EAST 4TH STREET" "328 EAST 4TH STREET" ...
## $ APARTMENT.NUMBER : chr "" "" "" "" ...
## $ ZIP.CODE : int 10009 10009 10009 10009 10009 10009 10009 10009 10009 10009 ...
## $ RESIDENTIAL.UNITS : int 1 2 10 10 20 5 5 7 10 10 ...
## $ COMMERCIAL.UNITS : int 0 1 0 0 0 0 0 1 0 0 ...
## $ TOTAL.UNITS : int 1 3 10 10 20 5 5 8 10 10 ...
## $ LAND.SQUARE.FEET : int 2116 1502 2204 2204 2302 2168 2169 4337 2271 2248 ...
## $ GROSS.SQUARE.FEET : int 4400 2790 8625 8625 9750 3728 3728 14347 6985 6985 ...
## $ YEAR.BUILT : int 1900 1901 1899 1900 1900 1900 1900 1920 1900 1910 ...
## $ TAX.CLASS.AT.TIME.OF.SALE : int 1 1 2 2 2 2 2 2 2 2 ...
## $ BUILDING.CLASS.AT.TIME.OF.SALE: chr "A4" "S2" "C1" "C1" ...
## $ SALE.PRICE : num 399000 2999999 16800000 16800000 158822 ...
## $ SALE.DATE : chr "Sep-22" "Sep-22" "Aug-22" "Aug-22" ...
summary(property_sales)
## BOROUGH NEIGHBORHOOD BUILDING.CLASS.CATEGORY TAX.CLASS.AT.PRESENT
## Min. :1.00 Length:1048575 Length:1048575 Length:1048575
## 1st Qu.:1.00 Class :character Class :character Class :character
## Median :3.00 Mode :character Mode :character Mode :character
## Mean :2.89
## 3rd Qu.:4.00
## Max. :5.00
##
## BLOCK LOT EASE.MENT BUILDING.CLASS.AT.PRESENT
## Min. : 1 Min. : 1.0 Length:1048575 Length:1048575
## 1st Qu.: 1217 1st Qu.: 22.0 Class :character Class :character
## Median : 2997 Median : 51.0 Mode :character Mode :character
## Mean : 4060 Mean : 391.6
## 3rd Qu.: 6019 3rd Qu.:1003.0
## Max. :16350 Max. :9139.0
##
## ADDRESS APARTMENT.NUMBER ZIP.CODE RESIDENTIAL.UNITS
## Length:1048575 Length:1048575 Min. : 0 Min. : 0.00
## Class :character Class :character 1st Qu.:10128 1st Qu.: 0.00
## Mode :character Mode :character Median :11207 Median : 1.00
## Mean :10791 Mean : 2.33
## 3rd Qu.:11355 3rd Qu.: 2.00
## Max. :11697 Max. :13480.00
## NA's :35 NA's :76752
## COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET
## Min. : 0.00 Min. : 0.00 Min. : 0 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 1.00 1st Qu.: 0 1st Qu.: 0
## Median : 0.00 Median : 1.00 Median : 1995 Median : 1313
## Mean : 0.25 Mean : 2.59 Mean : 3669 Mean : 4194
## 3rd Qu.: 0.00 3rd Qu.: 2.00 3rd Qu.: 3000 3rd Qu.: 2352
## Max. :2261.00 Max. :13495.00 Max. :29305534 Max. :8942176
## NA's :113608 NA's :70732 NA's :119629 NA's :119628
## YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE
## Min. : 0 Min. :1.000 Length:1048575
## 1st Qu.:1920 1st Qu.:1.000 Class :character
## Median :1940 Median :2.000 Mode :character
## Mean :1810 Mean :1.719
## 3rd Qu.:1967 3rd Qu.:2.000
## Max. :2022 Max. :4.000
## NA's :24023
## SALE.PRICE SALE.DATE
## Min. :0.000e+00 Length:1048575
## 1st Qu.:0.000e+00 Class :character
## Median :3.853e+05 Mode :character
## Mean :1.183e+06
## 3rd Qu.:8.100e+05
## Max. :4.111e+09
##
## Remove columns
nyc_sales <- property_sales %>%
select(-c(BUILDING.CLASS.CATEGORY, TAX.CLASS.AT.PRESENT, BLOCK,LOT,EASE.MENT, BUILDING.CLASS.AT.PRESENT,ADDRESS, APARTMENT.NUMBER,ZIP.CODE,RESIDENTIAL.UNITS, COMMERCIAL.UNITS,TOTAL.UNITS,LAND.SQUARE.FEET,GROSS.SQUARE.FEET,TAX.CLASS.AT.TIME.OF.SALE,BUILDING.CLASS.AT.TIME.OF.SALE))
nyc_sales <- nyc_sales %>%
na.omit()
head(nyc_sales)
## BOROUGH NEIGHBORHOOD YEAR.BUILT SALE.PRICE SALE.DATE
## 1 1 ALPHABET CITY 1900 399000 Sep-22
## 2 1 ALPHABET CITY 1901 2999999 Sep-22
## 3 1 ALPHABET CITY 1899 16800000 Aug-22
## 4 1 ALPHABET CITY 1900 16800000 Aug-22
## 5 1 ALPHABET CITY 1900 158822 Sep-22
## 6 1 ALPHABET CITY 1900 0 Aug-22
Analysis Part 1
nyc_sales2 <-rename(nyc_sales, Borough = BOROUGH, Neighborhood = NEIGHBORHOOD, SalePrice = SALE.PRICE, SaleDate=SALE.DATE, YearBuilt = YEAR.BUILT)
sales_summary <- nyc_sales2 %>%
group_by(Borough) %>%
summarise(mean_sale_price = mean(SalePrice),
median_sale_price = median(SalePrice),
min_sale_price = min(SalePrice),
max_sale_price = max(SalePrice))
print(sales_summary)
## # A tibble: 5 × 5
## Borough mean_sale_price median_sale_price min_sale_price max_sale_price
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 2641497. 695000 0 4111111766
## 2 2 695843. 250000 0 137298279
## 3 3 796102. 370000 0 869612895
## 4 4 601479. 295000 0 481093581
## 5 5 381611. 349900 0 122000000
Analysis Part 2
## line plot by borough it was meant to have colors by borough
library(ggplot2)
ggplot(nyc_sales2, aes(x = SaleDate, y = SalePrice, color = Borough)) +
geom_line() +
labs(title = "Sale Price Trends Over Time",
x = "Sale Date",
y = "Sale Price") +
theme_minimal()
Final Analysis based on Analysis Part 1 :
Borough 1 and 3 have the highest mean, median and max sales prices but most especially borough 1. Borough 1 most likely might be a borough where things are high in cost, from rent, food, groceries, and much more. It most likely might also be a borough near tourists areas and where many visitors tend to go to. Example Manhattan. Borough 1 properties have a higher value compared to the other boroughs.