NOTE: this tutorial uses R, RStudio, and some R packages to show the potential of data visualization for inspecting and analyzing a data set. We strongly recommend that you explore the following links:
Download and install RTools from https://cran.rstudio.com/bin/windows/Rtools/rtools45/rtools.html
Install ggmosaic by running install.packages("ggmosaic", repos = c("https://haleyjeppson.r-universe.dev", "https://cloud.r-project.org"))
library("ggmosaic")
## Warning: package 'ggmosaic' was built under R version 4.5.2
## Cargando paquete requerido: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.2
library("ggplot2")
library("fitdistrplus")
## Warning: package 'fitdistrplus' was built under R version 4.5.2
## Cargando paquete requerido: MASS
## Cargando paquete requerido: survival
library("MASS")
library("survival")
library("ggstatsplot")
## Warning: package 'ggstatsplot' was built under R version 4.5.2
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.1
## Warning: package 'tidyr' was built under R version 4.5.1
## Warning: package 'readr' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.1
## Warning: package 'stringr' was built under R version 4.5.1
## Warning: package 'forcats' was built under R version 4.5.1
## Warning: package 'lubridate' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
We read the dataset in CSV format, with 119,390 rows and 32 columns:
# Load the hotel bookings data set from the working directory; character
# columns are read as factors. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
x <- read.csv("hotel_bookings.csv", stringsAsFactors = TRUE)
# Report the number of rows and columns.
dim(x)
## [1] 119390 32
## hotel is_canceled lead_time arrival_date_year
## City Hotel :79330 Min. :0.0000 Min. : 0 Min. :2015
## Resort Hotel:40060 1st Qu.:0.0000 1st Qu.: 18 1st Qu.:2016
## Median :0.0000 Median : 69 Median :2016
## Mean :0.3704 Mean :104 Mean :2016
## 3rd Qu.:1.0000 3rd Qu.:160 3rd Qu.:2017
## Max. :1.0000 Max. :737 Max. :2017
##
## arrival_date_month arrival_date_week_number arrival_date_day_of_month
## August :13877 Min. : 1.00 Min. : 1.0
## July :12661 1st Qu.:16.00 1st Qu.: 8.0
## May :11791 Median :28.00 Median :16.0
## October:11160 Mean :27.17 Mean :15.8
## April :11089 3rd Qu.:38.00 3rd Qu.:23.0
## June :10939 Max. :53.00 Max. :31.0
## (Other):47873
## stays_in_weekend_nights stays_in_week_nights adults
## Min. : 0.0000 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 1.0 1st Qu.: 2.000
## Median : 1.0000 Median : 2.0 Median : 2.000
## Mean : 0.9276 Mean : 2.5 Mean : 1.856
## 3rd Qu.: 2.0000 3rd Qu.: 3.0 3rd Qu.: 2.000
## Max. :19.0000 Max. :50.0 Max. :55.000
##
## children babies meal country
## Min. : 0.0000 Min. : 0.000000 BB :92310 PRT :48590
## 1st Qu.: 0.0000 1st Qu.: 0.000000 FB : 798 GBR :12129
## Median : 0.0000 Median : 0.000000 HB :14463 FRA :10415
## Mean : 0.1039 Mean : 0.007949 SC :10650 ESP : 8568
## 3rd Qu.: 0.0000 3rd Qu.: 0.000000 Undefined: 1169 DEU : 7287
## Max. :10.0000 Max. :10.000000 ITA : 3766
## NA's :4 (Other):28635
## market_segment distribution_channel is_repeated_guest
## Online TA :56477 Corporate: 6677 Min. :0.00000
## Offline TA/TO:24219 Direct :14645 1st Qu.:0.00000
## Groups :19811 GDS : 193 Median :0.00000
## Direct :12606 TA/TO :97870 Mean :0.03191
## Corporate : 5295 Undefined: 5 3rd Qu.:0.00000
## Complementary: 743 Max. :1.00000
## (Other) : 239
## previous_cancellations previous_bookings_not_canceled reserved_room_type
## Min. : 0.00000 Min. : 0.0000 A :85994
## 1st Qu.: 0.00000 1st Qu.: 0.0000 D :19201
## Median : 0.00000 Median : 0.0000 E : 6535
## Mean : 0.08712 Mean : 0.1371 F : 2897
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000 G : 2094
## Max. :26.00000 Max. :72.0000 B : 1118
## (Other): 1551
## assigned_room_type booking_changes deposit_type agent
## A :74053 Min. : 0.0000 No Deposit:104641 9 :31961
## D :25322 1st Qu.: 0.0000 Non Refund: 14587 NULL :16340
## E : 7806 Median : 0.0000 Refundable: 162 240 :13922
## F : 3751 Mean : 0.2211 1 : 7191
## G : 2553 3rd Qu.: 0.0000 14 : 3640
## C : 2375 Max. :21.0000 7 : 3539
## (Other): 3530 (Other):42797
## company days_in_waiting_list customer_type
## NULL :112593 Min. : 0.000 Contract : 4076
## 40 : 927 1st Qu.: 0.000 Group : 577
## 223 : 784 Median : 0.000 Transient :89613
## 67 : 267 Mean : 2.321 Transient-Party:25124
## 45 : 250 3rd Qu.: 0.000
## 153 : 215 Max. :391.000
## (Other): 4354
## adr required_car_parking_spaces total_of_special_requests
## Min. : -6.38 Min. :0.00000 Min. :0.0000
## 1st Qu.: 69.29 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 94.58 Median :0.00000 Median :0.0000
## Mean : 101.83 Mean :0.06252 Mean :0.5714
## 3rd Qu.: 126.00 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :5400.00 Max. :8.00000 Max. :5.0000
##
## reservation_status reservation_status_date
## Canceled :43017 2015-10-21: 1461
## Check-Out:75166 2015-07-06: 805
## No-Show : 1207 2016-11-25: 790
## 2015-01-01: 763
## 2016-01-18: 625
## 2015-07-02: 469
## (Other) :114477
library(ggplot2)
# Bar chart counting the bookings recorded for each hotel type.
ggplot(data = x) +
  geom_bar(mapping = aes(x = hotel), fill = "steelblue") +
  ggtitle("Distribución de reservas por tipo de hotel") +
  xlab("Tipo de hotel") +
  ylab("Número de reservas") +
  theme_minimal()
# Histogram of the average daily rate (ADR), zoomed to the 0-500 range.
# A fixed bin width replaces `bins = 50`: bins are computed over the full data
# range, and ADR reaches 5400, so 50 bins would each be roughly 110 units wide
# and only a handful of bars would fall inside the displayed window.
# `binwidth = 10` with `boundary = 0` yields 50 bins between 0 and 500.
ggplot(x, aes(x = adr)) +
  geom_histogram(
    binwidth = 10,
    boundary = 0,
    fill = "#4C72B0",
    color = "white"
  ) +
  labs(
    title = "Distribución del precio medio por noche (ADR)",
    x = "Precio medio por noche (ADR)",
    y = "Número de reservas"
  ) +
  # Zoom the x axis without discarding the rows outside the limits.
  coord_cartesian(xlim = c(0, 500)) +
  theme_minimal(base_size = 12)
library(ggmosaic)
# Store the cancellation flag as a factor so it is treated as categorical.
x[["is_canceled"]] <- as.factor(x[["is_canceled"]])
# Mosaic plot crossing cancellation status with hotel type; tiles are
# coloured by hotel.
ggplot(x) +
  geom_mosaic(
    mapping = aes(x = product(is_canceled, hotel), fill = hotel)
  ) +
  theme_light()
# Box plots of the average daily rate (ADR), one box per cancellation status.
ggplot(
  data = x,
  mapping = aes(
    x = factor(is_canceled),
    y = adr,
    fill = factor(is_canceled)
  )
) +
  geom_boxplot() +
  scale_fill_manual(values = c("#4C72B0", "#DD8452")) +
  ggtitle("Precio medio por noche según cancelación") +
  xlab("Reserva cancelada (0 = No, 1 = Sí)") +
  ylab("Precio medio por noche (ADR)") +
  labs(fill = "Cancelada") +
  # Zoom the y axis; rows outside the limits still feed the box statistics.
  coord_cartesian(ylim = c(0, 450)) +
  theme_minimal()
# Box plots of the booking lead time, one box per cancellation status.
ggplot(
  data = x,
  mapping = aes(
    x = factor(is_canceled),
    y = lead_time,
    fill = factor(is_canceled)
  )
) +
  geom_boxplot() +
  scale_fill_manual(values = c("#4C72B0", "#DD8452")) +
  ggtitle("Antelación de la reserva según cancelación") +
  xlab("Reserva cancelada (0 = No, 1 = Sí)") +
  ylab("Días de antelación") +
  labs(fill = "Cancelada") +
  theme_minimal()