title: “NYC property sales in 2020” author: “Tahsin Ifnoor Sayeed” date: “2023-08-17” output: html_document ———————-
nyc2020 <- read_csv("nyc2020.csv")
head(nyc2020, n=10)
## # A tibble: 10 × 21
## borough neighborhood building_class_category tax_class_at_present block lot
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2 BATHGATE 01 ONE FAMILY DWELLINGS 1 3029 36
## 2 2 BATHGATE 01 ONE FAMILY DWELLINGS 1 3030 65
## 3 2 BATHGATE 01 ONE FAMILY DWELLINGS 1 3030 67
## 4 2 BATHGATE 01 ONE FAMILY DWELLINGS 1 3030 67
## 5 2 BATHGATE 01 ONE FAMILY DWELLINGS 1 3039 29
## 6 2 BATHGATE 01 ONE FAMILY DWELLINGS 1 3045 12
## 7 2 BATHGATE 01 ONE FAMILY DWELLINGS 1 3046 52
## 8 2 BATHGATE 02 TWO FAMILY DWELLINGS 1 2912 118
## 9 2 BATHGATE 02 TWO FAMILY DWELLINGS 1 2912 130
## 10 2 BATHGATE 02 TWO FAMILY DWELLINGS 1 2927 131
## # ℹ 15 more variables: ease_ment <lgl>, building_class_at_present <chr>,
## # address <chr>, apartment_number <chr>, zip_code <dbl>,
## # residential_units <dbl>, commercial_units <dbl>, total_units <dbl>,
## # land_square_feet <dbl>, gross_square_feet <dbl>, year_built <dbl>,
## # tax_class_at_time_of_sale <dbl>, building_class_at_time_of_sale <chr>,
## # sale_price <dbl>, sale_date <chr>
df_manhattan <- nyc2020 %>%
filter(borough==1) %>%
group_by(borough) %>%
summarise(n(),mean_price = mean(sale_price),
median_price = median((sale_price)),
max_price = max(sale_price),
min_price = min(sale_price)
)
df_bronx <- nyc2020 %>%
filter(borough==2) %>%
group_by(borough) %>%
summarise(n(), mean_price = mean(sale_price),
median_price = median((sale_price)),
max_price = max(sale_price),
min_price = min(sale_price)
)
df_brooklyn <- nyc2020 %>%
filter(borough==3) %>%
group_by(borough) %>%
summarise(n(),mean_price = mean(sale_price),
median_price = median((sale_price)),
max_price = max(sale_price),
min_price = min(sale_price)
)
df_queens <- nyc2020 %>%
filter(borough==4) %>%
group_by(borough) %>%
summarise(n(),mean_price = mean(sale_price),
median_price = median((sale_price)),
max_price = max(sale_price),
min_price = min(sale_price)
)
df_statenisl <- nyc2020 %>%
filter(borough==5) %>%
group_by(borough) %>%
summarise(n(), mean_price = mean(sale_price),
median_price = median((sale_price)),
max_price = max(sale_price),
min_price = min(sale_price)
)
all_boroughs <- rbind(df_brooklyn, df_manhattan, df_bronx, df_queens, df_statenisl)
all_boroughs %>% arrange(borough)
## # A tibble: 5 × 6
## borough `n()` mean_price median_price max_price min_price
## <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 12880 3285516. 775000 978090439 0
## 2 2 6645 1575194. 300000 87400000 0
## 3 3 19899 944205. 405600 137750000 0
## 4 4 21701 687600. 332000 369300000 0
## 5 5 7559 460579. 425000 85000000 0
val_lab(nyc2020$borough) <- make_labels("1 Manhattan
2 Bronx
3 Brooklyn
4 Queens
5 Staten Island")
val_lab(nyc2020$borough)
## Manhattan Bronx Brooklyn Queens Staten Island
## 1 2 3 4 5
ggplot(nyc2020, aes(x=sale_price)) +
geom_histogram(bins=50) +
scale_x_log10() +
facet_wrap(~borough) +
theme_clean() +
labs(x="Sale Price", y="No. of Transactions", title="Distribution of Sale Price in NYC Boroughs")
by_borough_trans <- nyc2020 %>%
group_by(borough) %>%
summarize(trans = n())
val_lab(by_borough_trans$borough) <- make_labels("1 Manhattan
2 Bronx
3 Brooklyn
4 Queens
5 Staten Island")
ggplot(by_borough_trans, aes(x= reorder(borough, +trans), y=trans)) +
geom_col(fill="#1b98e0") +
labs(title="Transactions in each NYC Boroughs", x="Boroughs", y="Number of Transactions") +
theme_clean()
nyc_build_100 <- nyc2020 %>%
mutate(building_age = (2020 - year_built)) %>%
filter(building_age <100)
ggplot(nyc_build_100, aes(x=building_age, y=sale_price, alpha=1/10)) +
geom_point() +
geom_smooth() +
scale_y_log10() +
labs(x= "Building Age", y="Sale Price", title="Is Sale Price Related to Building Age?") +
theme_clean()
class(nyc2020$sale_date)
## [1] "character"
# install.packages("lubridate")
nyc2020$sale_date2 = mdy(nyc2020$sale_date)
class(nyc2020$sale_date2)
## [1] "Date"
daily_mean_price <- nyc2020 %>%
filter(borough == 1, sale_date2 >= '2020-01-01' & sale_date2 <= '2020-12-31') %>%
group_by(sale_date2) %>%
summarize(daily_mean = mean(sale_price)) %>%
arrange(sale_date2)
ggplot(daily_mean_price, aes(x=sale_date2, y=daily_mean)) +
geom_line(color="#1b98e0") +
scale_y_log10() +
labs(x= "Date", y="Daily Mean Sale Price", title="Daily Average Property Sale Prices in Manhattan") +
theme_clean()