NOTE: this tutorial uses R, RStudio, and several R packages to show the potential of data visualization for inspecting and analyzing a data set. We strongly recommend that you explore the following links:

  1. RStudio: https://posit.co/downloads/
  2. ggplot2: https://ggplot2.tidyverse.org/
  3. extensions: https://exts.ggplot2.tidyverse.org/gallery/
  4. ggmosaic: this package has been removed from CRAN, so it is necessary to install an older version:

Download and install RTools from https://cran.rstudio.com/bin/windows/Rtools/rtools45/rtools.html

Install ggmosaic by running: install.packages("ggmosaic", repos = c("https://haleyjeppson.r-universe.dev", "https://cloud.r-project.org"))

Load packages

library("ggmosaic")
## Warning: package 'ggmosaic' was built under R version 4.5.2
## Cargando paquete requerido: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
library("ggplot2")
library("fitdistrplus")
## Warning: package 'fitdistrplus' was built under R version 4.5.2
## Cargando paquete requerido: MASS
## Cargando paquete requerido: survival
library("MASS")
library("survival")
library("ggstatsplot")
## Warning: package 'ggstatsplot' was built under R version 4.5.2
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.1
## Warning: package 'tidyr' was built under R version 4.5.1
## Warning: package 'readr' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.1
## Warning: package 'stringr' was built under R version 4.5.1
## Warning: package 'forcats' was built under R version 4.5.1
## Warning: package 'lubridate' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Data loading and dimensions (N x M)

We read the dataset in CSV format, with 119,390 rows and 32 columns:

# Read the hotel bookings CSV from the working directory. Text columns are
# converted to factors (stringsAsFactors = TRUE).
x <- read.csv("hotel_bookings.csv", stringsAsFactors = TRUE)
# Print the number of rows and columns of the loaded data frame.
dim(x)
## [1] 119390     32

Data cleansing

##           hotel        is_canceled       lead_time   arrival_date_year
##  City Hotel  :79330   Min.   :0.0000   Min.   :  0   Min.   :2015     
##  Resort Hotel:40060   1st Qu.:0.0000   1st Qu.: 18   1st Qu.:2016     
##                       Median :0.0000   Median : 69   Median :2016     
##                       Mean   :0.3704   Mean   :104   Mean   :2016     
##                       3rd Qu.:1.0000   3rd Qu.:160   3rd Qu.:2017     
##                       Max.   :1.0000   Max.   :737   Max.   :2017     
##                                                                       
##  arrival_date_month arrival_date_week_number arrival_date_day_of_month
##  August :13877      Min.   : 1.00            Min.   : 1.0             
##  July   :12661      1st Qu.:16.00            1st Qu.: 8.0             
##  May    :11791      Median :28.00            Median :16.0             
##  October:11160      Mean   :27.17            Mean   :15.8             
##  April  :11089      3rd Qu.:38.00            3rd Qu.:23.0             
##  June   :10939      Max.   :53.00            Max.   :31.0             
##  (Other):47873                                                        
##  stays_in_weekend_nights stays_in_week_nights     adults      
##  Min.   : 0.0000         Min.   : 0.0         Min.   : 0.000  
##  1st Qu.: 0.0000         1st Qu.: 1.0         1st Qu.: 2.000  
##  Median : 1.0000         Median : 2.0         Median : 2.000  
##  Mean   : 0.9276         Mean   : 2.5         Mean   : 1.856  
##  3rd Qu.: 2.0000         3rd Qu.: 3.0         3rd Qu.: 2.000  
##  Max.   :19.0000         Max.   :50.0         Max.   :55.000  
##                                                               
##     children           babies                 meal          country     
##  Min.   : 0.0000   Min.   : 0.000000   BB       :92310   PRT    :48590  
##  1st Qu.: 0.0000   1st Qu.: 0.000000   FB       :  798   GBR    :12129  
##  Median : 0.0000   Median : 0.000000   HB       :14463   FRA    :10415  
##  Mean   : 0.1039   Mean   : 0.007949   SC       :10650   ESP    : 8568  
##  3rd Qu.: 0.0000   3rd Qu.: 0.000000   Undefined: 1169   DEU    : 7287  
##  Max.   :10.0000   Max.   :10.000000                     ITA    : 3766  
##  NA's   :4                                               (Other):28635  
##        market_segment  distribution_channel is_repeated_guest
##  Online TA    :56477   Corporate: 6677      Min.   :0.00000  
##  Offline TA/TO:24219   Direct   :14645      1st Qu.:0.00000  
##  Groups       :19811   GDS      :  193      Median :0.00000  
##  Direct       :12606   TA/TO    :97870      Mean   :0.03191  
##  Corporate    : 5295   Undefined:    5      3rd Qu.:0.00000  
##  Complementary:  743                        Max.   :1.00000  
##  (Other)      :  239                                         
##  previous_cancellations previous_bookings_not_canceled reserved_room_type
##  Min.   : 0.00000       Min.   : 0.0000                A      :85994     
##  1st Qu.: 0.00000       1st Qu.: 0.0000                D      :19201     
##  Median : 0.00000       Median : 0.0000                E      : 6535     
##  Mean   : 0.08712       Mean   : 0.1371                F      : 2897     
##  3rd Qu.: 0.00000       3rd Qu.: 0.0000                G      : 2094     
##  Max.   :26.00000       Max.   :72.0000                B      : 1118     
##                                                        (Other): 1551     
##  assigned_room_type booking_changes       deposit_type        agent      
##  A      :74053      Min.   : 0.0000   No Deposit:104641   9      :31961  
##  D      :25322      1st Qu.: 0.0000   Non Refund: 14587   NULL   :16340  
##  E      : 7806      Median : 0.0000   Refundable:   162   240    :13922  
##  F      : 3751      Mean   : 0.2211                       1      : 7191  
##  G      : 2553      3rd Qu.: 0.0000                       14     : 3640  
##  C      : 2375      Max.   :21.0000                       7      : 3539  
##  (Other): 3530                                            (Other):42797  
##     company       days_in_waiting_list         customer_type  
##  NULL   :112593   Min.   :  0.000      Contract       : 4076  
##  40     :   927   1st Qu.:  0.000      Group          :  577  
##  223    :   784   Median :  0.000      Transient      :89613  
##  67     :   267   Mean   :  2.321      Transient-Party:25124  
##  45     :   250   3rd Qu.:  0.000                             
##  153    :   215   Max.   :391.000                             
##  (Other):  4354                                               
##       adr          required_car_parking_spaces total_of_special_requests
##  Min.   :  -6.38   Min.   :0.00000             Min.   :0.0000           
##  1st Qu.:  69.29   1st Qu.:0.00000             1st Qu.:0.0000           
##  Median :  94.58   Median :0.00000             Median :0.0000           
##  Mean   : 101.83   Mean   :0.06252             Mean   :0.5714           
##  3rd Qu.: 126.00   3rd Qu.:0.00000             3rd Qu.:1.0000           
##  Max.   :5400.00   Max.   :8.00000             Max.   :5.0000           
##                                                                         
##  reservation_status reservation_status_date
##  Canceled :43017    2015-10-21:  1461      
##  Check-Out:75166    2015-07-06:   805      
##  No-Show  : 1207    2016-11-25:   790      
##                     2015-01-01:   763      
##                     2016-01-18:   625      
##                     2015-07-02:   469      
##                     (Other)   :114477
library(ggplot2)

# Bar chart: number of bookings for each value of `hotel`.
ggplot(data = x) +
  geom_bar(mapping = aes(x = hotel), fill = "steelblue") +
  theme_minimal() +
  ggtitle("Distribución de reservas por tipo de hotel") +
  xlab("Tipo de hotel") +
  ylab("Número de reservas")

# Histogram of the average daily rate (ADR), zoomed to the 0-500 window.
#
# The bin width is set explicitly. coord_cartesian() only zooms the view and
# does not change how bins are computed, so `bins = 50` would spread the 50
# bins over the full ADR range of the data (which includes extreme values far
# above 500) and leave only a handful of very wide bars inside the window.
ggplot(x, aes(x = adr)) +
  geom_histogram(
    binwidth = 10, # 50 bins across the visible 0-500 window
    boundary = 0,  # align bin edges with the left edge of the window
    fill = "#4C72B0",
    color = "white"
  ) +
  labs(
    title = "Distribución del precio medio por noche (ADR)",
    x = "Precio medio por noche (ADR)",
    y = "Número de reservas"
  ) +
  # Zoom without dropping rows, so out-of-window values are not removed.
  coord_cartesian(xlim = c(0, 500)) +
  theme_minimal(base_size = 12)

library(ggmosaic)

# Convert the cancellation flag to a factor so it is treated as a categorical
# variable. This modifies `x` in place, so every later plot that uses `x`
# sees `is_canceled` as a factor.
x$is_canceled <- as.factor(x$is_canceled)

# Mosaic plot of cancellation status against hotel type, with tiles coloured
# by hotel type.
ggplot(data = x) +
  geom_mosaic(aes(x = product(is_canceled, hotel), fill = hotel)) +
  theme_light()

# Box plot of ADR for each cancellation status; the y axis is zoomed to 0-450
# via coord_cartesian(), which does not drop any observations.
ggplot(data = x, mapping = aes(x = factor(is_canceled), y = adr)) +
  geom_boxplot(mapping = aes(fill = factor(is_canceled))) +
  scale_fill_manual(
    name = "Cancelada",
    values = c("#4C72B0", "#DD8452")
  ) +
  ggtitle("Precio medio por noche según cancelación") +
  xlab("Reserva cancelada (0 = No, 1 = Sí)") +
  ylab("Precio medio por noche (ADR)") +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 450))

# Box plot of booking lead time for each cancellation status.
ggplot(data = x) +
  geom_boxplot(
    mapping = aes(
      x = factor(is_canceled),
      y = lead_time,
      fill = factor(is_canceled)
    )
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("#4C72B0", "#DD8452")) +
  ggtitle("Antelación de la reserva según cancelación") +
  xlab("Reserva cancelada (0 = No, 1 = Sí)") +
  ylab("Días de antelación") +
  labs(fill = "Cancelada")