Naples, Italy

Datasets for the following project can be found at the following link http://insideairbnb.com/.

First things first, let’s visualize how popular Airbnb is in Naples. We will do this by counting the total number of reviews per year.

# Tag every review with the calendar year and month of its date.
reviews_NA <- mutate(
  reviews_NA,
  year = strftime(date, format = "%Y"),
  month = strftime(date, format = "%m")
)

# Count how many reviews were written in each year.
reviews_NA_year <- summarize(
  group_by(reviews_NA, year),
  volume = n()
)

# Yearly review volume for Naples: one line with a labelled point per year.
ggplot(data = reviews_NA_year, mapping = aes(x = year, y = volume)) +
  # year is text, so group = 1 is needed to connect the points with one line
  geom_line(mapping = aes(group = 1), size = 1.5) +
  geom_point(size = 3, fill = "red", shape = 21) +
  geom_label(
    mapping = aes(label = volume),
    nudge_y = 3000,
    nudge_x = -0.3
  ) +
  theme_bw() +
  theme(
    text = element_text(family = "Georgia"),
    axis.text.y = element_text(size = 8),
    plot.title.position = "plot",
    plot.title = element_text(size = 20, hjust = 0, margin = margin(b = 10)),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 10, l = -25)
    )
  ) +
  labs(
    title = "Number of reviews through years on Airbnb",
    subtitle = "Naples",
    caption = "insideairbnb.com",
    x = "Year",
    y = "Number of reviews"
  )

So, key takeaways here:

-Airbnb started becoming popular around 2014 with an exponential increase in the number of reviews until 2020.

-When the pandemic hit, the number of reviews dropped by roughly 70% with respect to the pre-pandemic peak.

-It seems that now the situation has improved and, at the end of 2022, we have surpassed the pre-pandemic peak.

What if I want to know how the situation has evolved in other major Italian cities as well?

Let’s see.

# Label every yearly row with its city. A length-one value is recycled to all
# rows, so this does not depend on the table holding exactly 13 years.
reviews_NA_year$City <- "Naples"

# Tag every Rome review with the calendar year of its date.
reviews_RM <- reviews_RM %>%
  mutate(year = strftime(date, format = "%Y"))

# Count the Rome reviews per year and label them with the city.
# Years 2009 and 2010 are dropped since we don't have data for those years in
# Naples and Milan. They are excluded by value rather than by row position, so
# the wrong rows cannot be removed if the set of years in the data changes.
reviews_RM_year <- reviews_RM %>%
  group_by(year) %>%
  summarize(volume = n()) %>%
  filter(!year %in% c("2009", "2010")) %>%
  mutate(City = "Rome")


# Tag every Milan review with the calendar year of its date.
reviews_MI <- reviews_MI %>%
  mutate(year = strftime(date, format = "%Y"))

# Count the Milan reviews per year and label them with the city. The label is
# added with mutate() so it adapts to however many years the data contains.
reviews_MI_year <- reviews_MI %>%
  group_by(year) %>%
  summarize(volume = n()) %>%
  mutate(City = "Milan")

# Stack the three per-city yearly tables into one long table.
reviews_year <- rbind(reviews_NA_year, reviews_RM_year, reviews_MI_year)

# Stacked area chart of yearly review volume, one band per city.
ggplot(reviews_year, aes(x = year, y = volume, fill = City)) +
  geom_area(aes(group = City), alpha = 0.9, color = "Black") +
  theme_bw() +
  labs(
    title = "Number of reviews through years on Airbnb",
    subtitle = "Rome, Naples, Milan",
    x = "Year",
    y = "Volume",
    caption = "insideairbnb.com"
  ) +
  theme(
    plot.title.position = "plot",
    text = element_text(family = "Georgia"),
    axis.text.y = element_text(size = 8),
    plot.title = element_text(size = 20, margin = margin(b = 10), hjust = 0),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 10, l = -25)
    )
  ) +
  # Spell out `labels` in full instead of relying on partial argument matching.
  scale_y_continuous(labels = comma)

It’s clear that Rome has always been the city in Italy with the most visits, at least on Airbnb.

What if I want to know which are the municipalities in Naples with the most listings?

# Manually filling the neighbourhood_group column since it is missing.
# Each municipality name maps to the neighbourhoods that belong to it.
municipality_neighbourhoods <- list(
  "Municipalità I" = c("Chiaia", "Posillipo", "San Ferdinando"),
  "Municipalità II" = c("Avvocata", "Montecalvario", "Pendino", "Porto",
                        "Mercato", "San Giuseppe"),
  "Municipalità III" = c("Stella", "San Carlo all'Arena"),
  "Municipalità IV" = c("San Lorenzo", "Vicaria", "Poggioreale",
                        "Zona Industriale"),
  "Municipalità V" = c("Vomero", "Arenella"),
  "Municipalità VI" = c("Ponticelli", "Barra", "San Giovanni a Teduccio"),
  "Municipalità VII" = c("Miano", "Secondigliano", "San Pietro a Patierno"),
  "Municipalità VIII" = c("Piscinola", "Marianella", "Scampia", "Chiaiano"),
  "Municipalità IX" = c("Soccavo", "Pianura"),
  "Municipalità X" = c("Bagnoli", "Fuorigrotta")
)

# Write the municipality name into every row whose neighbourhood belongs to it;
# rows whose neighbourhood is not listed above are left untouched.
for (municipality in names(municipality_neighbourhoods)) {
  in_municipality <- listings_NA$neighbourhood %in%
    municipality_neighbourhoods[[municipality]]
  listings_NA$neighbourhood_group[in_municipality] <- municipality
}
# Number of listings in each municipality.
listings_NA_neighbourhood_composition <- summarize(
  group_by(listings_NA, neighbourhood_group),
  volume = n()
)

# Horizontal bar chart of listings per municipality, ordered by volume.
ggplot(
  data = listings_NA_neighbourhood_composition,
  mapping = aes(x = reorder(neighbourhood_group, volume), y = volume)
) +
  geom_col(fill = "steelblue") +
  geom_label(mapping = aes(label = volume)) +
  coord_flip() +
  theme_bw() +
  theme(
    text = element_text(family = "Georgia"),
    axis.text.y = element_text(size = 8),
    plot.title.position = "plot",
    plot.title = element_text(size = 20, hjust = 0, margin = margin(b = 10)),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 10, l = -25)
    )
  ) +
  labs(
    title = "Number of listings per Municipality",
    subtitle = "The second municipality of Naples, composed by the neighbourhoods of Avvocata,\nMontecalvario, Mercato, Pendino, Porto and  S. Giuseppe, is the most popular on Airbnb.
",
    x = "Municipality",
    y = "Volume of listings"
  )

Now that it is clear that the second municipality is the most popular one in Naples, I would like to take a look at the composition of the number of listings per neighbourhood within each municipality.

Let’s build a treemap.

# Number of listings per neighbourhood, nested within its municipality.
listings_NA_treemap <- summarize(
  group_by(listings_NA, neighbourhood_group, neighbourhood),
  volume = n()
)
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
# Two-level treemap: municipalities split into their neighbourhoods, with
# rectangle area given by the number of listings.
treemap(
  listings_NA_treemap,
  title = "Treemap of number of listings by municipality and neighbourhood",
  index = c("neighbourhood_group", "neighbourhood"),
  vSize = "volume",
  type = "index",
  palette = "Set2",
  bg.labels = "White",
  # one alignment pair per index level: municipality first, then neighbourhood
  align.labels = list(c("center", "top"), c("left", "bottom")),
  fontfamily.title = "Georgia",
  fontfamily.labels = "Georgia"
)

Now that we know which are the most “popular” neighbourhoods on Airbnb, I want to extract another piece of information: which are the most expensive?

Let’s see.

# Listing count and median nightly price per neighbourhood, keeping only
# neighbourhoods with more than 10 listings, sorted by descending median price.
# NOTE(review): summarize() leaves the result grouped by neighbourhood_group
# (see the message printed below), so top_n(10) appears to pick the top 10
# within each municipality rather than the top 10 overall — confirm the intent.
# The later join into leaflet_data uses this table as a per-neighbourhood
# lookup, so changing the row set here also changes that map.
listings_NA_price_neighbourhood <- listings_NA %>%
  group_by(neighbourhood_group, neighbourhood)%>%
  summarize(volume = dplyr::n(), median_price = median(price))%>%
  filter(volume > 10)%>%
  arrange(desc(median_price))%>%
  top_n(10)
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
## Selecting by median_price
# Turn neighbourhood into a factor whose levels run from the cheapest to the
# most expensive median price, so the plot axis follows that order.
listings_NA_price_neighbourhood$neighbourhood <- with(
  listings_NA_price_neighbourhood,
  factor(neighbourhood, levels = neighbourhood[order(median_price)])
)

  
# Lollipop chart: one segment from zero to the median price per neighbourhood,
# capped with a red point and annotated with the value.
ggplot(
  data = listings_NA_price_neighbourhood,
  mapping = aes(x = median_price, y = neighbourhood, label = median_price)
) +
  geom_segment(
    mapping = aes(
      x = 0, xend = median_price,
      y = neighbourhood, yend = neighbourhood
    )
  ) +
  geom_point(fill = "red", shape = 21) +
  geom_text(nudge_x = 5) +
  theme_bw() +
  theme(
    text = element_text(family = "Georgia"),
    axis.title = element_blank(),
    axis.text.y = element_text(size = 8),
    panel.grid.minor = element_blank(),
    legend.position = "none",
    plot.title.position = "plot",
    plot.title = element_text(size = 20, hjust = 0, margin = margin(b = 10)),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 15, l = -25)
    )
  ) +
  labs(
    title = "Median price(in €) per night in Naples neighbourhoods",
    subtitle = "Posillipo leads the way with a median of 100 euro/night, \nfollowed by the neighbourhoods of San Giuseppe and San Ferdinando.",
    caption = "insideairbnb.com"
  )

But… how can we use this information? I decided to use leaflet to build a map to recommend listings!

First, we construct our dataset and take a brief look at the summary.

# Map dataset: one row per listing with its position, price and the median
# price of its neighbourhood (looked up from the per-neighbourhood table).
leaflet_data <- listings_NA %>%
  select(name, latitude, longitude, neighbourhood, price) %>%
  left_join(listings_NA_price_neighbourhood, by = "neighbourhood") %>%
  # the join also brings in columns the map does not need
  select(-c(neighbourhood_group, volume))

library(summarytools, warn.conflicts = FALSE)

# A little summary of our map data frame, rendered as an HTML table.
# NOTE(review): the rendered header shows "leaflet_data", so dfSummary()
# presumably picks the name up from this call — keep the call in this form.
print(dfSummary(leaflet_data), method = "render")

Data Frame Summary

leaflet_data

Dimensions: 7520 x 6
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 name [character]
1. Ad Maiora Boutique Suite
2. Apartment and Rooms in Na
3. B&B Isa Guest Rooms
4. B&B SAN FERDINANDO VACANZ
5. B&B Spaccanapoli
6. Camera Matrimoniale
7. One room apartment comple
8. Super Central Flat in the
9. A Casa di Mamma B&B
10. B & B Sweet Night
[ 7398 others ]
4(0.1%)
4(0.1%)
4(0.1%)
4(0.1%)
4(0.1%)
4(0.1%)
4(0.1%)
4(0.1%)
3(0.0%)
3(0.0%)
7482(99.5%)
7520 (100.0%) 0 (0.0%)
2 latitude [numeric]
Mean (sd) : 40.8 (0)
min ≤ med ≤ max:
40.8 ≤ 40.8 ≤ 40.9
IQR (CV) : 0 (0)
3845 distinct values 7520 (100.0%) 0 (0.0%)
3 longitude [numeric]
Mean (sd) : 14.3 (0)
min ≤ med ≤ max:
14.1 ≤ 14.3 ≤ 14.4
IQR (CV) : 0 (0)
4481 distinct values 7520 (100.0%) 0 (0.0%)
4 neighbourhood [character]
1. San Lorenzo
2. Pendino
3. San Ferdinando
4. Chiaia
5. Montecalvario
6. San Giuseppe
7. Avvocata
8. Porto
9. Stella
10. San Carlo all'Arena
[ 20 others ]
1316(17.5%)
763(10.1%)
730(9.7%)
595(7.9%)
508(6.8%)
507(6.7%)
489(6.5%)
433(5.8%)
338(4.5%)
309(4.1%)
1532(20.4%)
7520 (100.0%) 0 (0.0%)
5 price [numeric]
Mean (sd) : 115 (301.2)
min ≤ med ≤ max:
0 ≤ 80 ≤ 10000
IQR (CV) : 56 (2.6)
383 distinct values 7520 (100.0%) 0 (0.0%)
6 median_price [numeric]
Mean (sd) : 80.5 (10.6)
min ≤ med ≤ max:
40 ≤ 79 ≤ 100
IQR (CV) : 19 (0.1)
21 distinct values 7492 (99.6%) 28 (0.4%)

Generated by summarytools 1.0.0 (R version 4.1.2)
2023-03-10

# Impute missing neighbourhood medians with the median of the observed values.
# median(!is.na(x)) would take the median of a TRUE/FALSE vector rather than of
# the prices, so the NAs are skipped with na.rm = TRUE instead.
leaflet_data$median_price[is.na(leaflet_data$median_price)] <-
  median(leaflet_data$median_price, na.rm = TRUE)
# Pick the marker colour for a listing.
#
# price:        nightly price of the listing (a single value or a vector).
# median_price: median nightly price of the listing's neighbourhood, of the
#               same length as price (or length one).
#
# Returns "green" where price <= median_price and "red" otherwise. The
# comparison is vectorised, so the function accepts whole columns as well as
# single values; an NA in either input yields NA.
which_color <- function(price, median_price) {
  ifelse(price <= median_price, "green", "red")
}
  
# Compute one marker colour per row of the map data.
#
# leaflet_data: data frame with `price` and `median_price` columns.
#
# Returns the result of applying which_color() to each pair of listing price
# and neighbourhood median price, in row order.
getColor <- function(leaflet_data) {
  listing_prices <- leaflet_data$price
  neighbourhood_medians <- leaflet_data$median_price
  mapply(which_color, listing_prices, neighbourhood_medians)
}

And now we can proceed with the map.

# Marker icons: the marker colour encodes whether the listing is priced at or
# below (green) or above (red) the median of its neighbourhood.
icons <- awesomeIcons(
  markerColor = getColor(leaflet_data),
  iconColor = "black",
  icon = "whatever"
)

# Interactive map of the listings, clustered, with the listing name as label.
leaflet(data = leaflet_data) %>%
  setView(lng = 14.2767, lat = 40.863, zoom = 12) %>%
  addTiles() %>%
  addAwesomeMarkers(
    icon = icons,
    label = ~htmlEscape(name),
    clusterOptions = markerClusterOptions()
  )
## Assuming "longitude" and "latitude" are longitude and latitude, respectively

The way this works is very simple: if the price of the property for one night is lower than or equal to the median of its neighbourhood, the marker is green. On the other hand, if the price is greater, the marker is red.

Lastly, looking at the price, we can ask ourselves another question: do tourists prefer expensive or less expensive properties? To answer this question, I decided to look at the correlation between the price variable and the number of reviews per month.

Firstly, we need to run a summary to check for outliers and NAs.

# Five-number summary (plus mean) of the nightly price.
summary(listings_NA[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      58      80     115     114   10000

Some entries in our price variable are likely to be entry errors, so we will simply restrict the axes in the chart to avoid them and improve the visualization.

# Five-number summary (plus mean and NA count) of the monthly review rate.
summary(listings_NA[["reviews_per_month"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.010   0.250   0.860   1.426   2.040  14.240    1322

Imputing the NA’s, which are present in this column, will be sufficient to run our chart without problems.

# Impute missing review rates with the median of the observed values.
# median(!is.na(x)) would take the median of a TRUE/FALSE vector rather than of
# the review rates, so the NAs are skipped with na.rm = TRUE instead.
# Results computed from this column need to be re-rendered after this change.
listings_NA$reviews_per_month[is.na(listings_NA$reviews_per_month)] <-
  median(listings_NA$reviews_per_month, na.rm = TRUE)

Now, to understand if our variables are correlated, we can look at the linear correlation between them.

# Pearson (linear) correlation between nightly price and monthly review rate.
with(listings_NA, cor(price, reviews_per_month, method = "pearson"))
## [1] -0.05483768

There seems to be a slightly negative correlation between the two variables. However, as we know, linear correlation can fail to detect underlying patterns in the data, so let’s take a look at the chart.

# Hexagonal density plot of price against monthly review rate. The axis limits
# cut off the extreme values so the bulk of the listings stays readable.
ggplot(data = listings_NA, mapping = aes(x = price, y = reviews_per_month)) +
  geom_hex(bins = 30) +
  scale_fill_viridis_c(option = "viridis") +
  scale_x_continuous(limits = c(0, 400)) +
  scale_y_continuous(limits = c(0, 10)) +
  theme_bw() +
  theme(
    text = element_text(family = "Georgia"),
    plot.title.position = "plot",
    plot.title = element_text(size = 20, hjust = 0, margin = margin(b = 10)),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 10, l = -25)
    )
  ) +
  labs(
    title = "Correlation of price per night and reviews per month",
    x = "Price per night",
    y = "Number of reviews per month"
  )

So the chart confirms what we’ve seen: there is no relationship between the price of a rental and its popularity.

Now, since we are dealing with rents, it is quite obvious that there will be some seasonality in the number of listings. How do we check this?

I decided to build a radar plot.

Firstly, we can prepare our data.

# Monthly review counts for the years 2017-2021.
# For clarity reasons, the number of years is limited to five.
# year is stored as text, so it is converted to a number before the range
# check instead of relying on an implicit string comparison.
reviews_NA_radar_plot <- reviews_NA %>%
  filter(between(as.integer(year), 2017L, 2021L)) %>%
  group_by(year, month) %>%
  summarize(volume = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One row per year, one column per month.
reviews_NA_radar_plot <- reviews_NA_radar_plot %>%
  pivot_wider(names_from = month, values_from = volume)

# Rename each month column after the month number it actually holds (e.g.
# "01" -> "January") rather than by position, so a missing or out-of-order
# month cannot shift the labels onto the wrong columns.
radar_columns <- colnames(reviews_NA_radar_plot)
is_month_column <- radar_columns != "year"
radar_columns[is_month_column] <-
  month.name[as.integer(radar_columns[is_month_column])]
radar_columns[!is_month_column] <- "Year"
colnames(reviews_NA_radar_plot) <- radar_columns

# Put the year first and the months in calendar order.
reviews_NA_radar_plot <- reviews_NA_radar_plot %>%
  select(Year, any_of(month.name))
# Radar chart with one polygon per year and one axis per month.
ggradar(
  reviews_NA_radar_plot,
  values.radar = c("0", "4000", "8000"),
  grid.min = 0,
  grid.mid = 4050,
  grid.max = 8000,
  group.point.size = 3
) +
  theme(
    text = element_text(family = "Georgia"),
    plot.title.position = "plot",
    plot.title = element_text(size = 20, hjust = 0, margin = margin(b = 10)),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 10, l = -25)
    )
  ) +
  labs(title = "Seasonality of reviews on AirBnb")

As it was expected, being a city located in Southern Italy, Naples receives the most visits in summer months, with August being the peak.

Lastly, let’s see if we can extract some information from the names of the listings. A word cloud can be useful.

library(tidytext)
library(wordcloud2)

# Word frequencies in the listing names: split the names into words, count
# them, and keep words that occur more than 10 times and have at least
# 3 characters.
wordcloud_data <- listings_NA %>%
  select(name) %>%
  unnest_tokens(output = word, input = name, token = "words") %>%
  count(word, sort = TRUE) %>%
  filter(n > 10, nchar(word) >= 3)


# Word cloud of the most frequent words in the listing names.
wordcloud2(
  wordcloud_data,
  fontFamily = "Georgia",
  color = "White",
  backgroundColor = "steelblue",
  shape = "diamond"
)

Judging from the wordcloud the impression is that the majority of the most frequent words are used to describe the kind of property that is for rent. (e.g. Casa, Room, House, Suite,Camera, Apartment and so on.)

Let’s see what our data tells us regarding the property type that is for rent.

# Number of listings per room type and each type's share of the total (in %,
# rounded to one decimal).
room_type_data <- listings_NA %>%
  group_by(room_type) %>%
  summarize(volume = n()) %>%
  mutate(perc = round(100 * volume / sum(volume), 1))

# Order the room types from most to least common so the charts follow that
# order.
room_type_data$room_type <- factor(
  room_type_data$room_type,
  levels = room_type_data$room_type[order(room_type_data$volume,
                                          decreasing = TRUE)]
)


# Bar chart of the number of listings per room type, labelled with the counts.
ggplot(data = room_type_data, mapping = aes(x = room_type, y = volume)) +
  geom_col(fill = "steelblue") +
  geom_label(mapping = aes(label = volume), fill = "White") +
  theme_bw() +
  theme(
    text = element_text(family = "Georgia"),
    plot.title.position = "plot",
    plot.title = element_text(size = 20, hjust = 0, margin = margin(b = 10)),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 10, l = -25)
    )
  ) +
  labs(
    title = "Distribution of properties for rent",
    subtitle = "Shared rooms seems to be really uncommon",
    x = "Type of property for rent",
    y = "Volume"
  )

# Positions of the percentage labels along the pie: each label sits at the
# middle of its slice, i.e. half the slice plus the total of the slices after
# it. The last slice has nothing after it, so lead() gives NA there and the
# label is placed at half the slice.
positions <- room_type_data %>%
  mutate(csum = rev(cumsum(rev(perc))),
         pos = perc / 2 + lead(csum, 1),
         pos = if_else(is.na(pos), perc / 2, pos))

# Manual override that moves the label computed at 35.40 to 2.5.
# near() compares with a tolerance, because an exact == on a value built from
# floating-point sums can miss the row silently.
# NOTE(review): 35.40 is tied to the current data — confirm which slice this
# is meant to move.
positions$pos[near(positions$pos, 35.40)] <- 2.5



# Pie chart of the share of each room type: a single stacked bar bent into
# polar coordinates, with repelled percentage labels.
ggplot(room_type_data, aes(x = "", y = perc, fill = room_type)) +
  geom_bar(stat = "identity") +
  # Name the argument: passed positionally, "Blues" is taken as the scale
  # title rather than as the palette.
  scale_fill_brewer(palette = "Blues") +
  coord_polar(theta = "y", start = 0) +
  theme_void() +
  ggrepel::geom_label_repel(
    data = positions,
    aes(y = pos, label = paste0(perc, "%")),
    size = 4.5,
    nudge_x = 1,
    show.legend = FALSE
  ) +
  labs(title = "Percentages of property tipes for rent") +
  guides(fill = guide_legend(title = "Property type")) +
  theme(
    plot.title.position = "plot",
    text = element_text(family = "Georgia"),
    plot.title = element_text(size = 20, margin = margin(b = 10), hjust = 0),
    plot.subtitle = element_text(
      size = 12,
      color = "darkslategrey",
      margin = margin(b = 10, l = -25)
    )
  )

THE END