Data_607_Project_2_Dataset

The first data set is USA real estate dataset from Kaggle. https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset

Which sate has the highest average house price and which city in this state has the biggest house?

library(tidyverse)

realestate <- read_csv("~/data_607_project_2/realtor-data.csv")

head(realestate)

## # A tibble: 6 × 10
##   status     bed  bath acre_lot city    state zip_code house_size prev_sold_date
##   <chr>    <dbl> <dbl>    <dbl> <chr>   <chr>    <dbl>      <dbl> <date>        
## 1 for_sale     3     2     0.12 Adjunt… Puer…      601        920 NA            
## 2 for_sale     4     2     0.08 Adjunt… Puer…      601       1527 NA            
## 3 for_sale     2     1     0.15 Juana … Puer…      795        748 NA            
## 4 for_sale     4     2     0.1  Ponce   Puer…      731       1800 NA            
## 5 for_sale     6     2     0.05 Mayagu… Puer…      680         NA NA            
## 6 for_sale     4     3     0.46 San Se… Puer…      612       2520 NA            
## # ℹ 1 more variable: price <dbl>

realestate

## # A tibble: 904,966 × 10
##    status     bed  bath acre_lot city   state zip_code house_size prev_sold_date
##    <chr>    <dbl> <dbl>    <dbl> <chr>  <chr>    <dbl>      <dbl> <date>        
##  1 for_sale     3     2     0.12 Adjun… Puer…      601        920 NA            
##  2 for_sale     4     2     0.08 Adjun… Puer…      601       1527 NA            
##  3 for_sale     2     1     0.15 Juana… Puer…      795        748 NA            
##  4 for_sale     4     2     0.1  Ponce  Puer…      731       1800 NA            
##  5 for_sale     6     2     0.05 Mayag… Puer…      680         NA NA            
##  6 for_sale     4     3     0.46 San S… Puer…      612       2520 NA            
##  7 for_sale     3     1     0.2  Ciales Puer…      639       2040 NA            
##  8 for_sale     3     2     0.08 Ponce  Puer…      731       1050 NA            
##  9 for_sale     2     1     0.09 Ponce  Puer…      730       1092 NA            
## 10 for_sale     5     3     7.46 Las M… Puer…      670       5403 NA            
## # ℹ 904,956 more rows
## # ℹ 1 more variable: price <dbl>

Data cleaning and transformation

# remove duplicate rows 
# drop all rows with NA value
# remove prev_sold_date, status and zip code column
# rename bed to bedroom
# relocate State, city to the front
realestate2 <- realestate |> 
                    distinct() |> 
                    drop_na() |> 
                    select(-c(status, prev_sold_date, zip_code)) |>
                    rename(bedroom = 'bed') |>
                    relocate(state, city)
realestate2

## # A tibble: 31,837 × 7
##    state          city         bedroom  bath acre_lot house_size   price
##    <chr>          <chr>          <dbl> <dbl>    <dbl>      <dbl>   <dbl>
##  1 Puerto Rico    Dorado             7     3     0.09       1192  110000
##  2 Virgin Islands Saint Thomas       5     4     0.99       5000  950000
##  3 Virgin Islands Saint Thomas       4     6     0.83       4600 6899000
##  4 Massachusetts  Agawam             3     3     0.45       2314  525000
##  5 Massachusetts  Agawam             3     2     0.36       1276  289900
##  6 Massachusetts  Agawam             3     2     0.46       1476  384900
##  7 Massachusetts  Agawam             3     2     1.76       1968  199999
##  8 Massachusetts  Pelham             4     2     2          1607  419000
##  9 Massachusetts  Amherst            4     3     0.56       2847  745000
## 10 Massachusetts  Pelham             4     4     1.5        4366  875000
## # ℹ 31,827 more rows

Examining what range of data is more common.

# Summary to see number of bedroom, bath and house size in the narrowed data set

sum_bath <- realestate2 |> 
              group_by(bath) |> 
              summarise(n = n()) 
sum_bath

## # A tibble: 21 × 2
##     bath     n
##    <dbl> <int>
##  1     1  5352
##  2     2 11802
##  3     3  8946
##  4     4  3200
##  5     5  1258
##  6     6   578
##  7     7   299
##  8     8   157
##  9     9   106
## 10    10    61
## # ℹ 11 more rows

ggplot(sum_bath, aes(x = bath, y = n)) +
geom_bar(stat = "identity") +
scale_x_continuous(limits = c(0, 15)) +
labs(title = "Bathroom count",
     x = "Num of Bathroom",
     y = "Count")

## Warning: Removed 6 rows containing missing values (`position_stack()`).

## Warning: Removed 1 rows containing missing values (`geom_bar()`).

sum_bed <- realestate2 |> 
              group_by(bedroom) |> 
              summarise(n = n())
sum_bed

## # A tibble: 25 × 2
##    bedroom     n
##      <dbl> <int>
##  1       1   493
##  2       2  3899
##  3       3 13300
##  4       4  8596
##  5       5  3061
##  6       6  1473
##  7       7   422
##  8       8   264
##  9       9   169
## 10      10    55
## # ℹ 15 more rows

ggplot(sum_bed, aes(x = bedroom, y = n)) +
geom_bar(stat = "identity") +
scale_x_continuous(limits = c(0, 15)) +
labs(title = "Bedroom count",
     x = "Num of Bedroom",
     y = "Count")

## Warning: Removed 10 rows containing missing values (`position_stack()`).
## Removed 1 rows containing missing values (`geom_bar()`).

Further transforming the data base.

# From the Bathroom count summary and plot, any house with bathroom more than 5 can be removed from the data as they are not common and can be considered as extreme observation.
# From the Bedroom count summary and plot, any house with bedroom more than 6 can be removed from the data as they are not common and can be considered as extreme observation.

# When the x axis scale is limited between 0 to 15, both plots show some rows contain missing value as well.

realestate3 <- realestate2 |> 
                  filter(bath < 6) |> 
                  filter(bedroom < 7)

realestate3 |> 
    group_by(bath) |> 
    summarise(n = n()) |> 
    ggplot(aes(x = bath, y = n)) +
    geom_bar(stat = "identity") +
    scale_x_continuous(limits = c(0, 6)) +
    labs(title = "Bathroom count",
         x = "Num of Bathroom",
         y = "Count")

realestate3 |> 
    group_by(bedroom) |> 
    summarise(n = n()) |> 
    ggplot(aes(x = bedroom, y = n)) +
    geom_bar(stat = "identity") +
    scale_x_continuous(limits = c(0, 8)) +
    labs(title = "Bedroom count",
         x = "Num of Bedroom",
         y = "Count")

Visualize the data

New York has the highest Average house price!! Glen Cove has the biggest house!!

# Average price by Sate
avg_price_by_state <- realestate3 |> 
                        group_by(state) |> 
                        summarise(avg_price = mean(price))

ggplot(avg_price_by_state, aes(x = reorder(state, avg_price), y = avg_price)) +
  geom_bar(stat = "identity") + 
  coord_flip() +
  labs(title = "Average price by Sate",
       x = "State",
       y = "Average price")

# Top 10 expensive city in New York
avg_price_by_city <- realestate3 |>
                        filter(state == 'New York') |> 
                        group_by(city) |>
                        summarise(avg_price = mean(price))
                        

avg_price_by_city |> 
  top_n(10, avg_price) |> 
  ggplot(aes(x = reorder(city, avg_price), y = avg_price)) +
  geom_bar(stat = "identity") + 
  coord_flip() +
  labs(title = "Top 10 expensive city in New York",
       x = "City in New York State",
       y = "Average price")

# Top 10 city with biggest house in New York

avg_housesize_by_city <- realestate3 |>
                        filter(state == 'New York') |> 
                        group_by(city) |>
                        summarise(avg_size = mean(house_size))
                        

avg_housesize_by_city |> 
  top_n(10, avg_size) |> 
  ggplot(aes(x = reorder(city, avg_size), y = avg_size)) +
  geom_bar(stat = "identity") + 
  coord_flip() +
  labs(title = "Top 10 city with biggest house in New York",
       x = "City in New York State",
       y = "Average House size")

Data_607_Project_2_Dataset_1

CHUN SHING LEUNG

2023-10-14

Data cleaning and transformation

Examining what range of data is more common.

Further transforming the data base.

Visualize the data