library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
DOF <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/Week6/main/DOF__Summary_of_Neighborhood_Sales_by_Neighborhood_Citywide_by_Borough_20240302.csv")
head(DOF)
## BOROUGH NEIGHBORHOOD TYPE.OF.HOME NUMBER.OF.SALES
## 1 MANHATTAN ALPHABET CITY 01 ONE FAMILY HOMES 1
## 2 MANHATTAN ALPHABET CITY 02 TWO FAMILY HOMES 1
## 3 MANHATTAN ALPHABET CITY 03 THREE FAMILY HOMES 1
## 4 MANHATTAN CHELSEA 01 ONE FAMILY HOMES 2
## 5 MANHATTAN CHELSEA 02 TWO FAMILY HOMES 2
## 6 MANHATTAN CHELSEA 03 THREE FAMILY HOMES 1
## LOWEST.SALE.PRICE AVERAGE.SALE.PRICE MEDIAN.SALE.PRICE HIGHEST.SALE.PRICE
## 1 593362 593362 593362 593362
## 2 1320000 1320000 1320000 1320000
## 3 900000 900000 900000 900000
## 4 500000 2875000 2875000 5250000
## 5 1306213 2603107 2603107 3900000
## 6 6400000 6400000 6400000 6400000
## YEAR
## 1 2010
## 2 2010
## 3 2010
## 4 2010
## 5 2010
## 6 2010
str(DOF)
## 'data.frame': 7788 obs. of 9 variables:
## $ BOROUGH : chr "MANHATTAN" "MANHATTAN" "MANHATTAN" "MANHATTAN" ...
## $ NEIGHBORHOOD : chr "ALPHABET CITY" "ALPHABET CITY" "ALPHABET CITY" "CHELSEA" ...
## $ TYPE.OF.HOME : chr "01 ONE FAMILY HOMES" "02 TWO FAMILY HOMES" "03 THREE FAMILY HOMES" "01 ONE FAMILY HOMES" ...
## $ NUMBER.OF.SALES : int 1 1 1 2 2 1 1 2 2 1 ...
## $ LOWEST.SALE.PRICE : int 593362 1320000 900000 500000 1306213 6400000 3850000 3100000 477500 3290000 ...
## $ AVERAGE.SALE.PRICE: int 593362 1320000 900000 2875000 2603107 6400000 3850000 5800000 2738750 3290000 ...
## $ MEDIAN.SALE.PRICE : int 593362 1320000 900000 2875000 2603107 6400000 3850000 5800000 2738750 3290000 ...
## $ HIGHEST.SALE.PRICE: int 593362 1320000 900000 5250000 3900000 6400000 3850000 8500000 5000000 3290000 ...
## $ YEAR : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
DOF <- DOF %>%
mutate(TYPE.OF.HOME = gsub("\\D", "", TYPE.OF.HOME))
DOF <- DOF %>%
mutate(TYPE.OF.HOME = paste0(TYPE.OF.HOME, " Family homes"))
borough_mapping <- c("1" = "MANHATTAN", "2" = "BRONX", "3" = "BROOKLYN", "4" = "QUEENS", "5" = "STATEN ISLAND")
# Replace integers in the BOROUGH column with correct borough names
DOF$BOROUGH <- ifelse(DOF$BOROUGH %in% names(borough_mapping),
borough_mapping[as.character(DOF$BOROUGH)],
DOF$BOROUGH)
#Filter by 1 family homes in Queens
filtered_data <- DOF %>%
filter(TYPE.OF.HOME== "01 Family homes" & BOROUGH == "QUEENS")
#Bar chart to show the neighborhoods in queens with the most sales 2010-2022, one family home
# Calculate total number of sales for each neighborhood
neighborhood_sales <- filtered_data %>%
group_by(NEIGHBORHOOD) %>%
summarise(total_sales = sum(NUMBER.OF.SALES)) %>%
arrange(desc(total_sales)) %>%
slice_head(n = 5)
# Create a bar chart
ggplot(neighborhood_sales, aes(x = reorder(NEIGHBORHOOD, -total_sales), y = total_sales)) +
geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(x = "Neighborhood", y = "Total Number of Sales", title = "Top 5 Neighborhoods with the Most Number of Sales 20") +
coord_flip() # Horizontal bars for better readability
neighborhood_sales <- filtered_data %>%
filter(NEIGHBORHOOD == "FLUSHING-NORTH") %>%
group_by(YEAR)
print(neighborhood_sales)
## # A tibble: 13 × 9
## # Groups: YEAR [13]
## BOROUGH NEIGHBORHOOD TYPE.OF.HOME NUMBER.OF.SALES LOWEST.SALE.PRICE
## <chr> <chr> <chr> <int> <int>
## 1 QUEENS FLUSHING-NORTH 01 Family homes 413 180446
## 2 QUEENS FLUSHING-NORTH 01 Family homes 345 155000
## 3 QUEENS FLUSHING-NORTH 01 Family homes 401 200000
## 4 QUEENS FLUSHING-NORTH 01 Family homes 519 200000
## 5 QUEENS FLUSHING-NORTH 01 Family homes 446 200000
## 6 QUEENS FLUSHING-NORTH 01 Family homes 490 257978
## 7 QUEENS FLUSHING-NORTH 01 Family homes 484 200000
## 8 QUEENS FLUSHING-NORTH 01 Family homes 472 215000
## 9 QUEENS FLUSHING-NORTH 01 Family homes 442 250000
## 10 QUEENS FLUSHING-NORTH 01 Family homes 396 226667
## 11 QUEENS FLUSHING-NORTH 01 Family homes 296 250000
## 12 QUEENS FLUSHING-NORTH 01 Family homes 533 200000
## 13 QUEENS FLUSHING-NORTH 01 Family homes 461 400000
## # ℹ 4 more variables: AVERAGE.SALE.PRICE <int>, MEDIAN.SALE.PRICE <int>,
## # HIGHEST.SALE.PRICE <int>, YEAR <int>
plot_sales <- ggplot(neighborhood_sales, aes(x = YEAR, y = NUMBER.OF.SALES)) +
geom_line(color = "blue") +
labs(x = "Year", y = "Total Number of Sales", title = "Total Number of Sales per Year for FLUSHING-NORTH") +
theme_minimal()
# Create plot for Average Sale Price
plot_price <- ggplot(neighborhood_sales, aes(x = YEAR, y = AVERAGE.SALE.PRICE/1000000)) +
geom_line(color = "red") +
labs(x = "Year", y = "Average Sale Price (MM)", title = "Average Sale Price per Year for FLUSHING-NORTH 2010-2022") +
theme_minimal()
# Display both plots
plot_sales
plot_price
top5_data <- filtered_data %>%
group_by(NEIGHBORHOOD) %>%
summarise(avg_sale_price = mean(AVERAGE.SALE.PRICE)) %>%
top_n(5, avg_sale_price) %>%
inner_join(filtered_data, by = "NEIGHBORHOOD") # Join with original data to get all years
# Create a line plot for top 5 neighborhoods
ggplot(top5_data, aes(x = YEAR, y = AVERAGE.SALE.PRICE, color = NEIGHBORHOOD)) +
geom_line() +
scale_x_continuous(breaks = unique(top5_data$YEAR), labels = unique(top5_data$YEAR)) +
scale_y_continuous(labels = scales::dollar_format(suffix = "MM", scale = 1e-6))+
theme_minimal() +
labs(x = "Year", y = "Average Sale Price", title = "Average Single Home Sale Price by Year for Top 5 Neighborhoods") +
theme(legend.position = "right")