library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
DOF <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/Week6/main/DOF__Summary_of_Neighborhood_Sales_by_Neighborhood_Citywide_by_Borough_20240302.csv")
head(DOF)
##     BOROUGH  NEIGHBORHOOD          TYPE.OF.HOME NUMBER.OF.SALES
## 1 MANHATTAN ALPHABET CITY   01 ONE FAMILY HOMES               1
## 2 MANHATTAN ALPHABET CITY   02 TWO FAMILY HOMES               1
## 3 MANHATTAN ALPHABET CITY 03 THREE FAMILY HOMES               1
## 4 MANHATTAN       CHELSEA   01 ONE FAMILY HOMES               2
## 5 MANHATTAN       CHELSEA   02 TWO FAMILY HOMES               2
## 6 MANHATTAN       CHELSEA 03 THREE FAMILY HOMES               1
##   LOWEST.SALE.PRICE AVERAGE.SALE.PRICE MEDIAN.SALE.PRICE HIGHEST.SALE.PRICE
## 1            593362             593362            593362             593362
## 2           1320000            1320000           1320000            1320000
## 3            900000             900000            900000             900000
## 4            500000            2875000           2875000            5250000
## 5           1306213            2603107           2603107            3900000
## 6           6400000            6400000           6400000            6400000
##   YEAR
## 1 2010
## 2 2010
## 3 2010
## 4 2010
## 5 2010
## 6 2010

adjust the “TYPE.OF.HOME” in uniform

str(DOF)
## 'data.frame':    7788 obs. of  9 variables:
##  $ BOROUGH           : chr  "MANHATTAN" "MANHATTAN" "MANHATTAN" "MANHATTAN" ...
##  $ NEIGHBORHOOD      : chr  "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" "CHELSEA" ...
##  $ TYPE.OF.HOME      : chr  "01 ONE FAMILY HOMES" "02 TWO FAMILY HOMES" "03 THREE FAMILY HOMES" "01 ONE FAMILY HOMES" ...
##  $ NUMBER.OF.SALES   : int  1 1 1 2 2 1 1 2 2 1 ...
##  $ LOWEST.SALE.PRICE : int  593362 1320000 900000 500000 1306213 6400000 3850000 3100000 477500 3290000 ...
##  $ AVERAGE.SALE.PRICE: int  593362 1320000 900000 2875000 2603107 6400000 3850000 5800000 2738750 3290000 ...
##  $ MEDIAN.SALE.PRICE : int  593362 1320000 900000 2875000 2603107 6400000 3850000 5800000 2738750 3290000 ...
##  $ HIGHEST.SALE.PRICE: int  593362 1320000 900000 5250000 3900000 6400000 3850000 8500000 5000000 3290000 ...
##  $ YEAR              : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
DOF <- DOF %>%
  mutate(TYPE.OF.HOME = gsub("\\D", "", TYPE.OF.HOME))

DOF <- DOF %>%
  mutate(TYPE.OF.HOME = paste0(TYPE.OF.HOME, " Family homes"))

Replace the integers in the BOROUGH column with the corroect borough name

borough_mapping <- c("1" = "MANHATTAN", "2" = "BRONX", "3" = "BROOKLYN", "4" = "QUEENS", "5" = "STATEN ISLAND")

# Replace integers in the BOROUGH column with correct borough names
DOF$BOROUGH <- ifelse(DOF$BOROUGH %in% names(borough_mapping), 
                      borough_mapping[as.character(DOF$BOROUGH)], 
                      DOF$BOROUGH)

#Filter by 1 family homes in Queens

filtered_data <- DOF %>%
  filter(TYPE.OF.HOME== "01 Family homes" & BOROUGH == "QUEENS")

#Bar chart to show the neighborhoods in queens with the most sales 2010-2022, one family home

# Calculate total number of sales for each neighborhood
neighborhood_sales <- filtered_data %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(total_sales = sum(NUMBER.OF.SALES)) %>%
  arrange(desc(total_sales)) %>%
  slice_head(n = 5)

# Create a bar chart
ggplot(neighborhood_sales, aes(x = reorder(NEIGHBORHOOD, -total_sales), y = total_sales)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Neighborhood", y = "Total Number of Sales", title = "Top 5 Neighborhoods with the Most Number of Sales 20") +
  coord_flip()  # Horizontal bars for better readability

2021 included a big jumb in home sales in the North-Flushing Area, however house price did not increas dramatically.

neighborhood_sales <- filtered_data %>%
  filter(NEIGHBORHOOD == "FLUSHING-NORTH") %>%
  group_by(YEAR)
print(neighborhood_sales)
## # A tibble: 13 × 9
## # Groups:   YEAR [13]
##    BOROUGH NEIGHBORHOOD   TYPE.OF.HOME    NUMBER.OF.SALES LOWEST.SALE.PRICE
##    <chr>   <chr>          <chr>                     <int>             <int>
##  1 QUEENS  FLUSHING-NORTH 01 Family homes             413            180446
##  2 QUEENS  FLUSHING-NORTH 01 Family homes             345            155000
##  3 QUEENS  FLUSHING-NORTH 01 Family homes             401            200000
##  4 QUEENS  FLUSHING-NORTH 01 Family homes             519            200000
##  5 QUEENS  FLUSHING-NORTH 01 Family homes             446            200000
##  6 QUEENS  FLUSHING-NORTH 01 Family homes             490            257978
##  7 QUEENS  FLUSHING-NORTH 01 Family homes             484            200000
##  8 QUEENS  FLUSHING-NORTH 01 Family homes             472            215000
##  9 QUEENS  FLUSHING-NORTH 01 Family homes             442            250000
## 10 QUEENS  FLUSHING-NORTH 01 Family homes             396            226667
## 11 QUEENS  FLUSHING-NORTH 01 Family homes             296            250000
## 12 QUEENS  FLUSHING-NORTH 01 Family homes             533            200000
## 13 QUEENS  FLUSHING-NORTH 01 Family homes             461            400000
## # ℹ 4 more variables: AVERAGE.SALE.PRICE <int>, MEDIAN.SALE.PRICE <int>,
## #   HIGHEST.SALE.PRICE <int>, YEAR <int>
plot_sales <- ggplot(neighborhood_sales, aes(x = YEAR, y = NUMBER.OF.SALES)) +
  geom_line(color = "blue") +
  labs(x = "Year", y = "Total Number of Sales", title = "Total Number of Sales per Year for FLUSHING-NORTH") +
  theme_minimal()

# Create plot for Average Sale Price
plot_price <- ggplot(neighborhood_sales, aes(x = YEAR, y = AVERAGE.SALE.PRICE/1000000)) +
  geom_line(color = "red") +
  labs(x = "Year", y = "Average Sale Price (MM)", title = "Average Sale Price per Year for FLUSHING-NORTH 2010-2022") +
  theme_minimal()

# Display both plots
plot_sales

plot_price

LIC had a large decline in average home sale price price starting in 2021

top5_data <- filtered_data %>%
  group_by(NEIGHBORHOOD) %>%
  summarise(avg_sale_price = mean(AVERAGE.SALE.PRICE)) %>%
  top_n(5, avg_sale_price) %>%
  inner_join(filtered_data, by = "NEIGHBORHOOD")  # Join with original data to get all years



# Create a line plot for top 5 neighborhoods
ggplot(top5_data, aes(x = YEAR, y = AVERAGE.SALE.PRICE, color = NEIGHBORHOOD)) +
  geom_line() +
  scale_x_continuous(breaks = unique(top5_data$YEAR), labels = unique(top5_data$YEAR)) +
   scale_y_continuous(labels = scales::dollar_format(suffix = "MM", scale = 1e-6))+
  theme_minimal() +
  labs(x = "Year", y = "Average Sale Price", title = "Average Single Home Sale Price by Year for Top 5 Neighborhoods") +
  theme(legend.position = "right")