Data5002 Project

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggmap)

## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.

## 
## Attaching package: 'ggmap'

## The following object is masked from 'package:plotly':
## 
##     wind

library(sf)

## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(leaflet)
library(htmlwidgets)
library(tmap)

## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')

library(sfheaders)
library(tidyr)

# Read the shapefile of NSW localities (suburb boundaries).
# The project folder defaults to the original location but can be overridden
# with the DATA5002_PROJECT_DIR environment variable, so the script can run on
# another machine without editing the code.
project_dir <- Sys.getenv(
  "DATA5002_PROJECT_DIR",
  unset = "/Users/LauraWu/Desktop/DATA5002 24T3/DATA5002 Project"
)
shp_path <- file.path(project_dir, "GDA94", "nsw_localities.shp")

# `syd` is an sf object of locality polygons; `LOC_NAME` is used further down
# as the suburb key.
syd <- st_read(shp_path)

## Reading layer `nsw_localities' from data source 
##   `/Users/LauraWu/Desktop/DATA5002 24T3/DATA5002 Project/GDA94/nsw_localities.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 4610 features and 6 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 140.9993 ymin: -37.50534 xmax: 159.1054 ymax: -28.15702
## Geodetic CRS:  GDA94

# View column names and attribute data
# colnames(syd)
# head(syd)

Loading the dataset

# Load the Airbnb listings summary CSV.
# The project folder defaults to the original location but can be overridden
# with the DATA5002_PROJECT_DIR environment variable.
project_dir <- Sys.getenv(
  "DATA5002_PROJECT_DIR",
  unset = "/Users/LauraWu/Desktop/DATA5002 24T3/DATA5002 Project"
)
airbnb <- read.csv(file.path(project_dir, "listings_summary_dec18.csv"))

General picture

Top 10 Suburbs by Number of Listings

# General picture: number of listings per neighbourhood
suburb_listing_counts <- airbnb %>%
  group_by(neighbourhood) %>%
  summarise(listings_count = n())

# Keep the ten neighbourhoods with the most listings, largest first
top_10_suburbs <- suburb_listing_counts %>%
  arrange(desc(listings_count)) %>%
  slice_head(n = 10)

# View the data; print all ten rows (head() would stop at six)
print(top_10_suburbs, n = 10)

# Order the neighbourhoods by listing count so the bars appear ranked.
# Done on a plotting copy so `top_10_suburbs` itself keeps its original types.
top_10_plot_data <- top_10_suburbs %>%
  mutate(neighbourhood = reorder(neighbourhood, listings_count))

# Bar plot of the number of listings in each suburb.
# theme_minimal() is a complete theme, so it must come BEFORE the theme()
# tweak; in the opposite order it silently resets the axis-text rotation.
bar_plot <- ggplot(top_10_plot_data, aes(x = neighbourhood, y = listings_count)) +
  geom_col(fill = "#E69F00", width = 0.5) +
  labs(title = "Top 10 the Number of Listings Suburbs", x = "Suburb", y = "Number of Listings") +
  coord_flip() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Interactive version of the plot
interactive_bar_plot <- ggplotly(bar_plot)
interactive_bar_plot

The chart illustrates the top ten suburbs in Sydney with the highest number of Airbnb listings.

Sydney (CBD): Unsurprisingly, the city center (CBD) tops the list with the largest number of Airbnb listings. This is expected due to its iconic landmarks, such as the Opera House and the Harbour Bridge, which attract significant tourist traffic. Additionally, Sydney’s central location and excellent public transportation make it a convenient base for visitors to explore other parts of the city.
Waverley, Randwick, and Manly: These suburbs rank second, third, and fourth respectively in the number of listings. Their proximity to Sydney’s famous beaches — Bondi (Waverley), Coogee (Randwick), and Manly — makes them highly sought after by travelers. These areas offer not only scenic coastal views but also vibrant local services, increasing their appeal as short-term rental hotspots.
Warringah: This suburb stands out for its proximity to a national park, which attracts nature enthusiasts and outdoor adventurers. The unique combination of natural beauty and recreational opportunities contributes to its popularity among Airbnb hosts.

Suburbs near major tourist attractions, scenic coastal areas, or natural landmarks tend to attract more Airbnb listings due to their high demand among travelers. For property managers and hosts, investing in properties in these strategic locations can maximize occupancy rates and profitability.

# Drop listings that have no price
syd_airbnb <- airbnb %>%
  filter(!is.na(price))

# Convert the listings to point features (longitude/latitude, EPSG:4326)
airbnb_sf <- st_as_sf(syd_airbnb, coords = c("longitude", "latitude"), crs = 4326)

# Listings are matched back to their attributes by `id` below, so every id
# must identify exactly one listing.
stopifnot("listing ids must be unique" = anyDuplicated(syd_airbnb$id) == 0)

# One point geometry per listing id. With unique ids, a per-id st_union()
# followed by st_convex_hull() returns the very same point, so that costly
# grouped step is replaced by a plain column selection (sorted by id, as the
# grouped summary was). The object name is kept for compatibility.
airbnb_polygon <- airbnb_sf %>%
  select(id) %>%
  arrange(id)

# Bring the listing points into the CRS of the suburb polygons, then attach
# each listing id to the suburb polygon it intersects.
airbnb_polygon_tranformed <- st_transform(airbnb_polygon, crs = st_crs(syd))

airbnb_polygon_joined <- st_join(syd, airbnb_polygon_tranformed, join = st_intersects)

# st_join() is a left join: suburbs without any listing carry id = NA
airbnb_with_suburbs <- airbnb_polygon_joined %>%
  filter(!is.na(id))

# Re-attach the listing attributes (price, room type, ...) by listing id
airbnb_with_suburbs_with_price <- merge(syd_airbnb, airbnb_with_suburbs, by = "id")

# Price statistics per suburb
price_stats <- airbnb_with_suburbs_with_price %>%
  group_by(LOC_NAME) %>%
  summarise(
    avg_price = mean(price, na.rm = TRUE),
    min_price = min(price, na.rm = TRUE),
    max_price = max(price, na.rm = TRUE),
    median_price = median(price, na.rm = TRUE)
  )

# Suburbs that received at least one listing; each gets its own dropdown entry
suburbs <- unique(airbnb_with_suburbs_with_price$LOC_NAME)
n_suburbs <- length(suburbs)

# Start from an empty figure and fold in one hidden box trace per suburb
price_suburbs <- Reduce(
  function(fig, suburb_name) {
    add_trace(
      fig,
      data = filter(airbnb_with_suburbs_with_price, LOC_NAME == suburb_name),
      x = ~LOC_NAME,
      y = ~price,
      type = "box",
      name = suburb_name,
      visible = FALSE
    )
  },
  suburbs,
  plot_ly()
)

# Final trace: every suburb at once; this is the only trace shown initially
price_suburbs <- add_trace(
  price_suburbs,
  data = airbnb_with_suburbs_with_price,
  x = ~LOC_NAME,
  y = ~price,
  type = "box",
  name = "All Suburbs",
  visible = TRUE
)

# Dropdown filter.
# Each button carries a visibility mask over the n_suburbs + 1 traces:
# positions 1..n_suburbs are the per-suburb traces, the last is "All Suburbs".
all_suburbs_button <- list(
  label = "All Suburbs",
  method = "update",
  args = list(
    list(visible = c(rep(FALSE, n_suburbs), TRUE)),
    list(title = "Price Distribution: All Suburbs")
  )
)

single_suburb_buttons <- lapply(seq_along(suburbs), function(pos) {
  mask <- rep(FALSE, n_suburbs + 1)
  mask[pos] <- TRUE
  list(
    label = suburbs[pos],
    method = "update",
    args = list(
      list(visible = mask),
      list(title = paste("Price Distribution:", suburbs[pos]))
    )
  )
})

buttons <- c(list(all_suburbs_button), single_suburb_buttons)

# Titles, axes and the dropdown menu itself
suburb_menu <- list(
  type = "dropdown",
  x = 0.1, y = 1.2,
  buttons = buttons
)

price_suburbs <- layout(
  price_suburbs,
  title = "Price Distribution Across Suburbs",
  xaxis = list(title = "Suburb", tickangle = 45),
  yaxis = list(title = "Price ($)"),
  updatemenus = list(suburb_menu)
)

price_suburbs

# Average listing price per suburb (suburbs with no usable price are dropped)
price_summary <- airbnb_with_suburbs_with_price %>%
  group_by(LOC_NAME) %>%
  summarise(avg_price = mean(price, na.rm = TRUE)) %>%
  filter(!is.na(avg_price))

# Keep only suburbs that have price data (inner join), and reproject to
# WGS84 (EPSG:4326): leaflet expects longitude/latitude on the WGS84 datum and
# warns about an inconsistent datum when handed the GDA94 layer directly.
syd_with_prices <- syd %>%
  inner_join(price_summary, by = "LOC_NAME") %>%
  st_transform(crs = 4326)

# Continuous colour scale over the observed average prices
palette <- colorNumeric(
  palette = "YlOrRd",
  domain = syd_with_prices$avg_price
)

# Create the interactive choropleth map
price_suburbs_map <- leaflet(syd_with_prices) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~palette(avg_price),
    weight = 1,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    highlightOptions = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    # Single-line hover label "Suburb: $price". The previous "\n " separator
    # is not rendered as a line break in a plain-text label and only left a
    # stray space before the colon.
    label = ~paste0(LOC_NAME, ": $", round(avg_price, 2)),
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto"
    )
  ) %>%
  addLegend(
    pal = palette,
    values = ~avg_price,
    title = "Average Price ($)",
    position = "bottomright"
  )

price_suburbs_map

The map highlights the average Airbnb prices in each suburb, revealing notable patterns and trends:

High-Listing Suburbs: Suburbs with a high number of Airbnb listings often do not have the highest average prices. The increased competition in these areas typically results in more pricing flexibility and a broader range of options for travelers. For hosts and property managers, this means standing out in these competitive markets requires strategic pricing and offering unique features or value-added services to attract bookings.
Coastal Suburbs: Suburbs near the beaches, such as Palm Beach and Watsons Bay, generally have higher average prices. These suburbs are popular tourist destinations with their breathtaking scenic views and tranquil coastal lifestyle. Their desirability often allows hosts to charge premium rates, making these areas lucrative for short-term rental investments.
Castlereagh and Kenthurst: These suburbs record the highest average Airbnb prices. However, it’s important to recognize that the number of listings in these areas is minimal. As a result, their averages are heavily influenced by a small number of high-end properties, which may skew the data. For hosts and property managers, this highlights the importance of considering the volume of listings alongside pricing trends when evaluating investment opportunities.

**Price vs. room type**

#airbnb_with_suburbs_with_price_ <- airbnb_with_suburbs_with_price %>%
  # select(id, room_type, price, LOC_NAME)

# Mean listing price for each room type
avg_price_by_room <- summarise(
  group_by(airbnb_with_suburbs_with_price, room_type),
  avg_price = mean(price, na.rm = TRUE)
)

# Static bar chart: one bar per room type, coloured by room type
room_type_bars <- ggplot(
  data = avg_price_by_room,
  mapping = aes(x = room_type, y = avg_price, fill = room_type)
) +
  geom_col() +
  labs(
    title = "Average Price by Room Type",
    x = "Room Type",
    y = "Average Price"
  ) +
  theme_minimal()

# Interactive version of the chart
price_roomtype <- ggplotly(room_type_bars)
price_roomtype

**Availability**

# Availability ranges in their natural order (shortest to longest).
# Used as factor levels so the bars and legend follow this order instead of
# the alphabetical order a plain character column would get.
availability_levels <- c(
  "Not Available",
  "Available for <1 Month",
  "Available for 1-3 Months",
  "Available for 3-6 Months",
  "Available for >6 Months"
)

# Bucket each listing by 'availability_365' and count listings per bucket.
# case_when() takes the first matching condition, so each `<=` test only
# catches values above the previous threshold; values matching no condition
# (NA or above 365) stay NA.
availability_distribution <- airbnb_with_suburbs_with_price %>%
  mutate(
    availability_range = case_when(
      availability_365 == 0 ~ "Not Available",
      availability_365 <= 30 ~ "Available for <1 Month",
      availability_365 <= 90 ~ "Available for 1-3 Months",
      availability_365 <= 180 ~ "Available for 3-6 Months",
      availability_365 <= 365 ~ "Available for >6 Months"
    ),
    availability_range = factor(availability_range, levels = availability_levels)
  ) %>%
  group_by(availability_range) %>%
  summarise(count = n())

# Bar chart of the number of listings in each availability range
p_bar <- ggplot(availability_distribution, aes(x = availability_range, y = count, fill = availability_range)) +
  geom_col() +
  labs(title = "Distribution of Listing Availability",
       x = "Availability Range", y = "Number of Listings") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Interactive version of the chart
p_interactive_bar <- ggplotly(p_bar)
p_interactive_bar

# Step 1: label every listing with its availability range
listings_by_range <- mutate(
  airbnb_with_suburbs_with_price,
  availability_range = case_when(
    availability_365 == 0 ~ "Not Available",
    availability_365 <= 30 ~ "Available for <1 Month",
    availability_365 <= 90 ~ "Available for 1-3 Months",
    availability_365 <= 180 ~ "Available for 3-6 Months",
    availability_365 <= 365 ~ "Available for >6 Months"
  )
)

# Step 2: count listings per room type and availability range
range_counts <- listings_by_range %>%
  group_by(room_type, availability_range) %>%
  summarise(count = n(), .groups = "drop")

# Step 3: one row per room type, one column per range, plus the two totals.
# Short-term = under 1 month + 1-3 months; long-term = more than 6 months.
availability_summary <- range_counts %>%
  pivot_wider(
    names_from = availability_range,
    values_from = count,
    values_fill = 0
  ) %>%
  mutate(
    Short_Term_Availability = `Available for <1 Month` + `Available for 1-3 Months`,
    Long_Term_Availability = `Available for >6 Months`
  )

# Rows of `tbl` whose value in `column` equals that column's maximum
rows_at_max <- function(tbl, column) {
  filter(tbl, .data[[column]] == max(.data[[column]]))
}

# The room types with the highest short-term and long-term availability
short_term_max <- rows_at_max(availability_summary, "Short_Term_Availability")
long_term_max <- rows_at_max(availability_summary, "Long_Term_Availability")

list(
  Short_Term_Highest = short_term_max,
  Long_Term_Highest = long_term_max
)

## $Short_Term_Highest
## # A tibble: 1 × 8
##   room_type Available for 1-3 Mo…¹ Available for 3-6 Mo…² Available for <1 Mon…³
##   <chr>                      <int>                  <int>                  <int>
## 1 Entire h…                   4006                   3013                   3706
## # ℹ abbreviated names: ¹`Available for 1-3 Months`,
## #   ²`Available for 3-6 Months`, ³`Available for <1 Month`
## # ℹ 4 more variables: `Available for >6 Months` <int>, `Not Available` <int>,
## #   Short_Term_Availability <int>, Long_Term_Availability <int>
## 
## $Long_Term_Highest
## # A tibble: 1 × 8
##   room_type Available for 1-3 Mo…¹ Available for 3-6 Mo…² Available for <1 Mon…³
##   <chr>                      <int>                  <int>                  <int>
## 1 Entire h…                   4006                   3013                   3706
## # ℹ abbreviated names: ¹`Available for 1-3 Months`,
## #   ²`Available for 3-6 Months`, ³`Available for <1 Month`
## # ℹ 4 more variables: `Available for >6 Months` <int>, `Not Available` <int>,
## #   Short_Term_Availability <int>, Long_Term_Availability <int>