Airbnb - Melbourne

Author

Tony Wang

Airbnb - Melbourne

The project is to look at the Airbnb Melbourne data and see what unique features could be found

Load library

# install.packages("leaflet")
# install.packages("geosphere")
# install.packages("hexbin")
# install.packages("tidytext")
# install.packages("tidyverse")
# install.packages("wordcloud")
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)
library(wordcloud)

Loading required package: RColorBrewer

library(dplyr)
library(readr)
library(leaflet)
library(dplyr)
library(geosphere)
library(hexbin)
library(readr)
library(geosphere)

Load data

df <- readr::read_csv("Airbnb dataset - Melbourne.csv")

New names:
• `` -> `...19`
• `` -> `...20`

Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)

Rows: 25587 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): name, host_id, host_name, neighbourhood_group, neighbourhood, lati...
dbl  (6): id, reviews_per_month, calculated_host_listings_count, availabilit...
lgl  (2): ...19, ...20

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(df)

# A tibble: 6 × 20
     id name        host_id host_name neighbourhood_group neighbourhood latitude
  <dbl> <chr>       <chr>   <chr>     <chr>               <chr>         <chr>   
1 10803 Room in De… 38901   Lindsay   <NA>                Moreland      -37.766…
2 12936 St Kilda 1… 50121   The A2C … <NA>                Port Phillip  -37.859…
3 41836 CLOSE TO C… 182833  Diana     <NA>                Darebin       -37.697…
4 43429 Tranquil J… 189684  Allan     <NA>                Monash        -37.899…
5 44082 Queen Room… 193031  Vicki     <NA>                Frankston     -38.147…
6 47100 Cosy, cute… 212071  Loren     <NA>                Yarra         -37.818…
# ℹ 13 more variables: longitude <chr>, room_type <chr>, price <chr>,
#   minimum_nights <chr>, number_of_reviews <chr>, last_review <chr>,
#   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
#   availability_365 <dbl>, number_of_reviews_ltm <dbl>, license <dbl>,
#   ...19 <lgl>, ...20 <lgl>

# Observe the structure of the dataframe
glimpse(df)

Rows: 25,587
Columns: 20
$ id                             <dbl> 10803, 12936, 41836, 43429, 44082, 4710…
$ name                           <chr> "Room in Deco Apartment, Brunswick East…
$ host_id                        <chr> "38901", "50121", "182833", "189684", "…
$ host_name                      <chr> "Lindsay", "The A2C Team", "Diana", "Al…
$ neighbourhood_group            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ neighbourhood                  <chr> "Moreland", "Port Phillip", "Darebin", …
$ latitude                       <chr> "-37.76606", "-37.85999", "-37.69761", …
$ longitude                      <chr> "144.97951", "144.97662", "145.00066", …
$ room_type                      <chr> "Private room", "Entire home/apt", "Pri…
$ price                          <chr> "54", NA, NA, "128", "79", "116", "257"…
$ minimum_nights                 <chr> "5", "3", "7", "2", "5", "4", "2", "3",…
$ number_of_reviews              <chr> "204", "42", "157", "269", "65", "180",…
$ last_review                    <chr> "23/04/2025", "15/03/2020", "22/08/2018…
$ reviews_per_month              <dbl> 1.35, 0.23, 0.89, 1.52, 0.37, 1.00, 2.8…
$ calculated_host_listings_count <dbl> 1, 10, 2, 3, 8, 1, 1, 10, 1, 8, 1, 10, …
$ availability_365               <dbl> 148, 0, 0, 165, 127, 16, 193, 0, 364, 1…
$ number_of_reviews_ltm          <dbl> 14, 0, 0, 10, 6, 1, 52, 0, 23, 4, 12, 0…
$ license                        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ ...19                          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ ...20                          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

Find duplicate records

# Find duplicate records
duplicates <- df %>% 
  filter(duplicated(.))

# Count how many duplicates there are
nrow(duplicates)

[1] 0

# Remove exact duplicates and save it back to 'df'
df <- df %>% 
  distinct()

# Check your new total count
total_rooms_clean <- nrow(df)
print(total_rooms_clean)

[1] 25587

There is no duplicate

Find records with NA

# This keeps the rows as long as the essential info is there
df <- df %>% 
  drop_na(price, neighbourhood, latitude, longitude, room_type, price, reviews_per_month, calculated_host_listings_count, availability_365)

# Now check the count again
nrow(df)

[1] 15702

Now 15,702 rows left

Rooms analysis

Total rooms

total_rooms <- nrow(df)
print(total_rooms)

[1] 15702

Room types

# Look at the unique room types
unique(df$room_type)

 [1] "Private room"    "Entire home/apt" "93"              "Hotel room"     
 [5] "Shared room"     "194"             "53"              "63"             
 [9] "51"              "147"             "514"             "139"            
[13] "808"             "122"             "80"              "90"             
[17] "107"             "188"             "195"             "73"             
[21] "62"              "56"              "114"             "99"             
[25] "270"             "126"             "54"              "109"            
[29] "174"             "141"             "134"             "50"             
[33] "115"             "138"             "137"             "120"            
[37] "112"             "220"             "117"             "160"            
[41] "618"             "128"             "169"             "102"            
[45] "45"              "60"              "97"              "32"             
[49] "125"             "168"             "118"             "960"            
[53] "113"             "357"             "41"              "47"             
[57] "44"              "39"              "55"              "86"             
[61] "394"             "178"             "260"             "145"            
[65] "43"              "64"              "71"              "156"            
[69] "267"             "111"             "100"             "79"             
[73] "123"             "548"             "88"              "69"             
[77] "105"             "87"              "74"              "94"             
[81] "186"             "61"              "205"             "573"

There are basically four types of rooms available, with the rest represented by numbers that are not relevant.

Clean the room type & Count the number of rooms

# 1. Define the only valid room types allowed
valid_types <- c("Private room", "Entire home/apt", "Hotel room", "Shared room")

# 2. Filter the dataframe
df_clean <- df %>%
  filter(room_type %in% valid_types)

# 3. Now try your count again
room_type_counts <- df_clean %>%
  group_by(room_type) %>%
  summarise(num_rooms = n())

print(room_type_counts)

# A tibble: 4 × 2
  room_type       num_rooms
  <chr>               <int>
1 Entire home/apt     12505
2 Hotel room             27
3 Private room         2978
4 Shared room            83

# Plot the rooms by type
ggplot(room_type_counts, aes(x = room_type, y = num_rooms, fill = room_type)) +
  geom_col() +
  geom_text(aes(label = paste0(num_rooms)), 
            size = 3, vjust = -1, check_overlap = TRUE) +
  labs(
    title = "Number of rooms by room types",
    x = "Room Type",
    y = "Number of rooms"
  )

Rooms by neighbourhood

# 1. Clean the data types
airbnb_clean <- df %>%
  mutate(
    # Remove any non-numeric characters from price (like $ or ,)
    price_num = as.numeric(gsub("[^0-9.]", "", price)),
    # Convert coordinates to numeric
    lat_num = as.numeric(latitude),
    long_num = as.numeric(longitude)
  ) %>%
  # Filter out rows where coordinates or price are missing
  filter(!is.na(lat_num), !is.na(long_num))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `long_num = as.numeric(longitude)`.
Caused by warning:
! NAs introduced by coercion

# 2. Aggregate by Neighbourhood
map_summary <- airbnb_clean %>%
  group_by(neighbourhood) %>%
  summarize(
    num_rooms = n(),
    avg_price = mean(price_num, na.rm = TRUE),
    lat = mean(lat_num),
    long = mean(long_num)
  )

# 3. Plot again
ggplot(map_summary, aes(x = long, y = lat)) +
  geom_point(aes(size = num_rooms, color = avg_price), alpha = 0.7) +
  geom_text(aes(label = paste0(neighbourhood, "\n", num_rooms, " rms")), 
            size = 3, vjust = -1, check_overlap = TRUE) +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "Melbourne Airbnb Distribution",
       size = "Room Count",
       color = "Avg Price ($)")

View on the interactive map

# 1. Clean data
map_summary <- df %>%
  mutate(
    price_num = as.numeric(gsub("[^0-9.]", "", price)),
    lat_num = as.numeric(latitude),
    long_num = as.numeric(longitude)
  ) %>%
  group_by(neighbourhood) %>%
  summarize(
    num_rooms = n(),
    avg_price = mean(price_num, na.rm = TRUE),
    lat = mean(lat_num, na.rm = TRUE),
    long = mean(long_num, na.rm = TRUE)
  ) %>%
  filter(!is.na(lat))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `long_num = as.numeric(longitude)`.
Caused by warning:
! NAs introduced by coercion

# 2. Create the Leaflet Map
leaflet(map_summary) %>%
  addTiles() %>%  # Adds the standard OpenStreetMap background
  addCircleMarkers(
    lng = ~long, lat = ~lat,
    radius = ~sqrt(num_rooms) * 2, # Scale bubble size
    color = ~ifelse(avg_price > 200, "red", "blue"), # Simple color logic
    stroke = FALSE, fillOpacity = 0.6,
    label = ~paste0(neighbourhood, ": ", num_rooms, " rooms, avg $", round(avg_price))
  )

Warning in validateCoords(lng, lat, funcName): Data contains 107 rows with
either missing or invalid lat/lon values and will be ignored

Distance from CBD

# 1. Ensure data is clean and distance is calculated
airbnb_density <- df %>%
  mutate(
    lat_num = as.numeric(as.character(latitude)),
    long_num = as.numeric(as.character(longitude)),
    price_num = as.numeric(as.character(price)),
    mini_nights = as.numeric(as.character(minimum_nights)),
  ) %>%
  filter(!is.na(lat_num), !is.na(long_num), !is.na(price_num)) %>%
  rowwise() %>%
  mutate(
    # Calculate distance from CBD
    dist_cbd = distHaversine(c(long_num, lat_num), c(144.9631, -37.8136)) / 1000
  ) %>% 
  ungroup()

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `long_num = as.numeric(as.character(longitude))`.
Caused by warning:
! NAs introduced by coercion

# 2. Plot: Number of Rooms vs. Distance
ggplot(airbnb_density, aes(x = dist_cbd)) +
  # 1. The Histogram
  geom_histogram(binwidth = 5, fill = "steelblue", color = "white") +
  # 2. The Text Labels
  stat_bin(binwidth = 5, geom = "text", 
           aes(label = paste0(after_stat(count), 
                              " (", 
                              round(after_stat(count) / sum(after_stat(count)) * 100, 1), 
                              "%)")), 
           vjust = -0.5, 
           size = 3) +
  # 3. The Density Line (using linewidth instead of size)
  geom_density(aes(y = after_stat(count)), color = "red", linewidth = 1) +
  # 4. Labels and Titles
  labs(
    title = "Concentration of Rooms by Distance from Melbourne CBD",
    subtitle = paste("Analysis of", nrow(airbnb_density), "listings"),
    x = "Distance from CBD (km)",
    y = "Number of Rooms (Listings)"
  ) +
  # 5. The Window
  coord_cartesian(xlim = c(0, 40)) + 
  theme_minimal()

Price Analysis

Price summary

price_summary <- airbnb_clean %>%
  # Clean price to numeric
  mutate(price_num = as.numeric(gsub("[^0-9.]", "", as.character(price)))) %>%
  filter(!is.na(price_num)) %>%
  summarise(
    Min = min(price_num),
    Q1 = quantile(price_num, 0.25),
    Median = median(price_num),
    Mean = mean(price_num),
    Q3 = quantile(price_num, 0.75),
    Max = max(price_num),
    Total_Listings = n()
  )

print(price_summary)

# A tibble: 1 × 7
    Min    Q1 Median  Mean    Q3    Max Total_Listings
  <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl>          <int>
1    17   108    153  217.   224 118611          15593

The median price is much lower than the average price, suggesting the average price is distorted by a few high-price listings. The maximum price is way higher than Q3 price, which is clearly an outlier

Plot the price distribution by room types

# This compares prices across the different Room Types
ggplot(airbnb_clean, aes(x = room_type, y = price_num, fill = room_type)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 500)) +
  labs(title = "Price Distribution by Room Type", x = "Room Category")

Price by distance

# 1. Prepare the plotting groups
# We create 'dist_group' to turn the continuous distance into 5km categories
plot_data <- airbnb_density %>%
  mutate(dist_group = cut(dist_cbd, 
                          breaks = seq(0, 40, by = 5), 
                          include.lowest = TRUE,
                          labels = c("0-5km", "5-10km", "10-15km", "15-20km", 
                                     "20-25km", "25-30km", "30-35km", "35-40km"))) %>%
  filter(!is.na(dist_group))

# 2. Plot Mean Price against these Distance Groups
ggplot(plot_data, aes(x = dist_group, y = price_num)) +
  # Calculate the MEAN price for each bar
  stat_summary(fun = mean, geom = "col", fill = "steelblue", color = "white") +
  
  # Add the Dollar Label on top of each bar
  stat_summary(fun = mean, geom = "text", 
               aes(label = paste0("$", round(after_stat(y), 0))),
               vjust = -0.5, size = 4, fontface = "bold") +
  
  labs(
    title = "Average Airbnb Price by Distance from Melbourne CBD",
    subtitle = "Calculated using mean price per 5km distance bracket",
    x = "Distance from CBD",
    y = "Mean Price ($)"
  ) +
  # Adjust y-axis to give room for labels
  coord_cartesian(ylim = c(0, 500)) + 
  theme_minimal()

The average price is the highest between 5km and10m away from CBD, and generally trending down further away from the CBD. However price picks up from 30km to 40km

Neighbourhoods that are 35km to 40km from CBD

airbnb_density %>%
  filter(dist_cbd > 35 & dist_cbd <= 40) %>%
  group_by(neighbourhood) %>%
  summarise(
    count = n(),
    avg_price = mean(price_num, na.rm = TRUE)
  ) %>%
  arrange(desc(avg_price))

# A tibble: 8 × 3
  neighbourhood count avg_price
  <chr>         <int>     <dbl>
1 Cardinia          1      657 
2 Yarra Ranges    220      358.
3 Whittlesea        4      289 
4 Hume              7      253 
5 Casey            50      213.
6 Melton           43      201.
7 Nillumbik        10      198.
8 Frankston       106      190.

Median vs Mean price

# Using the same plot_data we created earlier
ggplot(plot_data, aes(x = dist_group, y = price_num)) +
  # Change 'fun' to median
  stat_summary(fun = median, geom = "col", fill = "darkorange", color = "white") +
  
  # Update text label to show the Median
  stat_summary(fun = median, geom = "text", 
               aes(label = paste0("$", round(after_stat(y), 0))),
               vjust = -0.5, size = 4, fontface = "bold") +
  
  labs(
    title = "Median Airbnb Price by Distance from Melbourne CBD",
    subtitle = "Median is more robust against luxury outliers",
    x = "Distance from CBD",
    y = "Median Price ($)"
  ) +
  coord_cartesian(ylim = c(0, 400)) + 
  theme_minimal()

The median prices are lower than the average price across all the neighbourhoods. However, the trend that the price is picking up from 30km to 40km remains the same

Availability

Distirbution of Annual Availability

ggplot(airbnb_density, aes(x = availability_365)) +
  # 1. Histogram: Binning by 30 days (roughly one month)
  geom_histogram(binwidth = 30, fill = "steelblue", color = "white", boundary = 0) +
  
  # 2. Labels: Calculate count and percentage for each 30-day month block
  stat_bin(binwidth = 30, geom = "text", boundary = 0,
           aes(label = paste0(after_stat(count), 
                            " (", 
                            round(after_stat(count) / sum(after_stat(count)) * 100, 1), 
                            "%)")), 
           vjust = -0.5, 
           size = 3) +
  
  # 3. Density Line: Adjusted to match the count scale
  geom_density(aes(y = after_stat(density) * nrow(airbnb_density) * 30), 
               color = "red", size = 1) +
  
  labs(
    title = "Distribution of Annual Availability",
    subtitle = paste("Analysis of", nrow(airbnb_density), "listings in Melbourne"),
    x = "Days Available (out of 365)",
    y = "Number of Listings"
  ) +
  # 4. Zoom: Set X-axis to the full year (0-365 days)
  coord_cartesian(xlim = c(0, 365)) + 
  theme_minimal()

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

The availability looks like evenly distributed from less than 30 days to a full year (365 days). Can the conclusion be made that most of the hosts are not purchasing the properties purely for short rent (airbnb), but rather compensate the income by letting a short period of the year.

Distribution of Minimum Night Requirements

ggplot(airbnb_density, aes(x = as.numeric(minimum_nights))) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white", boundary = 0.5) +
  
  stat_bin(binwidth = 1, geom = "text", boundary = 0.5,
           aes(label = ifelse(after_stat(count) > 500, after_stat(count), "")), 
           vjust = -0.7, size = 3) +
  
  # Density line (scaled for binwidth = 1)
  geom_density(aes(y = after_stat(density) * nrow(airbnb_density)), 
               color = "red", size = 1) +
  
  labs(
    title = "Distribution of Minimum Night Requirements",
    x = "Minimum Nights",
    y = "Number of Listings"
  ) +
  # Zooming in to see the 'action' (1-30 nights)
  coord_cartesian(xlim = c(0, 10)) + 
  theme_minimal()

The minimum nights requirements are largely within 3 days with few goes to over a week (7 days).

Suburbs where the minimum nights are more than 6 days

airbnb_density %>%
  filter(minimum_nights > 6) %>%
  group_by(neighbourhood) %>%
  summarise(
    count = n(),
    avg_price = mean(price_num, na.rm = TRUE)
  ) %>%
  arrange(desc(neighbourhood))

# A tibble: 30 × 3
   neighbourhood count avg_price
   <chr>         <int>     <dbl>
 1 Yarra Ranges      2      878.
 2 Yarra            23      214.
 3 Wyndham           7      155.
 4 Whittlesea        7      245.
 5 Whitehorse       10      144.
 6 Stonnington      44      447.
 7 Port Phillip     51      198.
 8 Nillumbik         1      198 
 9 Moreland         21      195.
10 Moonee Valley     9      105 
# ℹ 20 more rows

Distribution of Host Listings

# 1. Clean the data
airbnb_density <- airbnb_density %>%
  mutate(calculated_host_listings_count = as.numeric(as.character(calculated_host_listings_count))) %>%
  filter(!is.na(calculated_host_listings_count))

# 2. Plot the Distribution
ggplot(airbnb_density, aes(x = calculated_host_listings_count)) +
  # Use binwidth of 1 to see individual listing counts (1, 2, 3...)
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white", boundary = 0.5) +
  
  # 3. Add Labels (Only for bars with > 30 listings to avoid clutter)
  stat_bin(binwidth = 1, geom = "text", boundary = 0.5,
           aes(label = ifelse(after_stat(count) > 30, 
                              paste0(after_stat(count), "\n(", 
                                     round(after_stat(count) / sum(after_stat(count)) * 100, 1), "%)"), 
                              "")), 
           vjust = -0.5, size = 3, fontface = "bold") +
  
  # 4. Add Density Line (Scaled to match count)
  geom_density(aes(y = after_stat(density) * nrow(airbnb_density)), 
               color = "red", size = 1) +
  
  labs(
    title = "Distribution of Host Listings",
    subtitle = paste("Analysis of", nrow(airbnb_density), "listings in Melbourne"),
    x = "Number of Listings Managed by the Host",
    y = "Frequency (Number of Listings)"
  ) +
  
  # 5. The Zoom: Focus on the 1-30 range where most hosts sit
  coord_cartesian(xlim = c(0, 30)) + 
  
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 14)
  )

It is a long tail for the number of listings from the same host. There are 37.1% single listings, with the rest scattered up to 30 listings or more, which could be managed by professional businesses rather than individual hosts.

Monthly Reviews

# 1. Clean the data
airbnb_density <- airbnb_density %>%
  mutate(calculated_host_listings_count = as.numeric(as.character(reviews_per_month))) %>%
  filter(!is.na(reviews_per_month))

# 2. Plot the Distribution
ggplot(airbnb_density, aes(x = reviews_per_month)) +
  # Use binwidth of 1 to see individual listing counts (1, 2, 3...)
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white", boundary = 0.5) +
  # 4. Add Density Line (Scaled to match count)
  geom_density(aes(y = after_stat(density) * nrow(airbnb_density)), 
               color = "red", size = 1) +
# 2. Labels: 
  stat_bin(binwidth = 1, geom = "text", boundary = 0,
           aes(label = paste0(after_stat(count), 
                            " (", 
                            round(after_stat(count) / sum(after_stat(count)) * 100, 1), 
                            "%)")), 
           vjust = -3,
           hjust = 1.25,
           size = 3) +
  
  labs(
    title = "Monthly Review Distribution",
    subtitle = paste("Analysis of", nrow(airbnb_density), "listings in Melbourne"),
    x = "Number of Reviews per Month",
    y = "Frequency (Number of Listings)"
  ) +
  
  # 5. The Zoom: Focus on the 1-30 range where most hosts sit
  coord_cartesian(xlim = c(0, 10)) + 
  
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 14)
  )

The monthly review per unit is largely less than 3.