The first data set is USA real estate dataset from Kaggle. https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset
Which sate has the highest average house price and which city in this state has the biggest house?
library(tidyverse)
realestate <- read_csv("~/data_607_project_2/realtor-data.csv")
head(realestate)
## # A tibble: 6 × 10
## status bed bath acre_lot city state zip_code house_size prev_sold_date
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <date>
## 1 for_sale 3 2 0.12 Adjunt… Puer… 601 920 NA
## 2 for_sale 4 2 0.08 Adjunt… Puer… 601 1527 NA
## 3 for_sale 2 1 0.15 Juana … Puer… 795 748 NA
## 4 for_sale 4 2 0.1 Ponce Puer… 731 1800 NA
## 5 for_sale 6 2 0.05 Mayagu… Puer… 680 NA NA
## 6 for_sale 4 3 0.46 San Se… Puer… 612 2520 NA
## # ℹ 1 more variable: price <dbl>
realestate
## # A tibble: 904,966 × 10
## status bed bath acre_lot city state zip_code house_size prev_sold_date
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <date>
## 1 for_sale 3 2 0.12 Adjun… Puer… 601 920 NA
## 2 for_sale 4 2 0.08 Adjun… Puer… 601 1527 NA
## 3 for_sale 2 1 0.15 Juana… Puer… 795 748 NA
## 4 for_sale 4 2 0.1 Ponce Puer… 731 1800 NA
## 5 for_sale 6 2 0.05 Mayag… Puer… 680 NA NA
## 6 for_sale 4 3 0.46 San S… Puer… 612 2520 NA
## 7 for_sale 3 1 0.2 Ciales Puer… 639 2040 NA
## 8 for_sale 3 2 0.08 Ponce Puer… 731 1050 NA
## 9 for_sale 2 1 0.09 Ponce Puer… 730 1092 NA
## 10 for_sale 5 3 7.46 Las M… Puer… 670 5403 NA
## # ℹ 904,956 more rows
## # ℹ 1 more variable: price <dbl>
# remove duplicate rows
# drop all rows with NA value
# remove prev_sold_date, status and zip code column
# rename bed to bedroom
# relocate State, city to the front
realestate2 <- realestate |>
distinct() |>
drop_na() |>
select(-c(status, prev_sold_date, zip_code)) |>
rename(bedroom = 'bed') |>
relocate(state, city)
realestate2
## # A tibble: 31,837 × 7
## state city bedroom bath acre_lot house_size price
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Puerto Rico Dorado 7 3 0.09 1192 110000
## 2 Virgin Islands Saint Thomas 5 4 0.99 5000 950000
## 3 Virgin Islands Saint Thomas 4 6 0.83 4600 6899000
## 4 Massachusetts Agawam 3 3 0.45 2314 525000
## 5 Massachusetts Agawam 3 2 0.36 1276 289900
## 6 Massachusetts Agawam 3 2 0.46 1476 384900
## 7 Massachusetts Agawam 3 2 1.76 1968 199999
## 8 Massachusetts Pelham 4 2 2 1607 419000
## 9 Massachusetts Amherst 4 3 0.56 2847 745000
## 10 Massachusetts Pelham 4 4 1.5 4366 875000
## # ℹ 31,827 more rows
#
# Summary to see number of bedroom, bath and house size in the narrowed data set
sum_bath <- realestate2 |>
group_by(bath) |>
summarise(n = n())
sum_bath
## # A tibble: 21 × 2
## bath n
## <dbl> <int>
## 1 1 5352
## 2 2 11802
## 3 3 8946
## 4 4 3200
## 5 5 1258
## 6 6 578
## 7 7 299
## 8 8 157
## 9 9 106
## 10 10 61
## # ℹ 11 more rows
ggplot(sum_bath, aes(x = bath, y = n)) +
geom_bar(stat = "identity") +
scale_x_continuous(limits = c(0, 15)) +
labs(title = "Bathroom count",
x = "Num of Bathroom",
y = "Count")
## Warning: Removed 6 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`geom_bar()`).
sum_bed <- realestate2 |>
group_by(bedroom) |>
summarise(n = n())
sum_bed
## # A tibble: 25 × 2
## bedroom n
## <dbl> <int>
## 1 1 493
## 2 2 3899
## 3 3 13300
## 4 4 8596
## 5 5 3061
## 6 6 1473
## 7 7 422
## 8 8 264
## 9 9 169
## 10 10 55
## # ℹ 15 more rows
ggplot(sum_bed, aes(x = bedroom, y = n)) +
geom_bar(stat = "identity") +
scale_x_continuous(limits = c(0, 15)) +
labs(title = "Bedroom count",
x = "Num of Bedroom",
y = "Count")
## Warning: Removed 10 rows containing missing values (`position_stack()`).
## Removed 1 rows containing missing values (`geom_bar()`).
#
# From the Bathroom count summary and plot, any house with bathroom more than 5 can be removed from the data as they are not common and can be considered as extreme observation.
# From the Bedroom count summary and plot, any house with bedroom more than 6 can be removed from the data as they are not common and can be considered as extreme observation.
# When the x axis scale is limited between 0 to 15, both plots show some rows contain missing value as well.
realestate3 <- realestate2 |>
filter(bath < 6) |>
filter(bedroom < 7)
realestate3 |>
group_by(bath) |>
summarise(n = n()) |>
ggplot(aes(x = bath, y = n)) +
geom_bar(stat = "identity") +
scale_x_continuous(limits = c(0, 6)) +
labs(title = "Bathroom count",
x = "Num of Bathroom",
y = "Count")
realestate3 |>
group_by(bedroom) |>
summarise(n = n()) |>
ggplot(aes(x = bedroom, y = n)) +
geom_bar(stat = "identity") +
scale_x_continuous(limits = c(0, 8)) +
labs(title = "Bedroom count",
x = "Num of Bedroom",
y = "Count")
New York has the highest Average house price!! Glen Cove has the biggest house!!
# Average price by Sate
avg_price_by_state <- realestate3 |>
group_by(state) |>
summarise(avg_price = mean(price))
ggplot(avg_price_by_state, aes(x = reorder(state, avg_price), y = avg_price)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Average price by Sate",
x = "State",
y = "Average price")
# Top 10 expensive city in New York
avg_price_by_city <- realestate3 |>
filter(state == 'New York') |>
group_by(city) |>
summarise(avg_price = mean(price))
avg_price_by_city |>
top_n(10, avg_price) |>
ggplot(aes(x = reorder(city, avg_price), y = avg_price)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 expensive city in New York",
x = "City in New York State",
y = "Average price")
# Top 10 city with biggest house in New York
avg_housesize_by_city <- realestate3 |>
filter(state == 'New York') |>
group_by(city) |>
summarise(avg_size = mean(house_size))
avg_housesize_by_city |>
top_n(10, avg_size) |>
ggplot(aes(x = reorder(city, avg_size), y = avg_size)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Top 10 city with biggest house in New York",
x = "City in New York State",
y = "Average House size")