Early detection of too many NA’s (missing data points) - therefore, can not produce a meaningful dashboard
## NA SUMMARY
## Can use .. filter(!complete.cases(.)) this filters all rows with an NA
column_na_cnt <- covid_ds1 %>%
summarise_all(~ sum(is.na(.))) ## counts all NAs in columns
t1 <- column_na_cnt %>%
rotate_df() %>% ## needs library(sjmisc) .. rotates columns to rows or visa versa
arrange((V1))
t1 <- t1 %>%
mutate(tot_row_count = nrow(covid_ds1)) %>%
mutate(perc_na = (V1/tot_row_count)*100) %>%
mutate(in_out_na = case_when(V1/tot_row_count > 0.2 ~ 'dismiss',
TRUE ~ 'keep' )) %>%
filter(in_out_na == 'keep')
t1 <-plyr::rename(
t1,
replace = c(V1="number_of_nas", tot_row_count="total_rows", perc_na = "NA%",
in_out_na = "keep_or_discard"),
warn_missing = FALSE
)
t1 <- rownames_to_column(t1, var = "column_name") ## 1st column had no name
# **Below is the table now showing columns kept for further analysis**
t1 %>%
kable %>%
kable_styling("striped", full_width = F, font_size = 10) %>% ## needs library(kableExtra)
scroll_box(width = "800px", height = "400px") ## needs library(kableExtra)| column_name | number_of_nas | total_rows | NA% | keep_or_discard |
|---|---|---|---|---|
| iso_code | 0 | 252742 | 0.0000000 | keep |
| location | 0 | 252742 | 0.0000000 | keep |
| date | 0 | 252742 | 0.0000000 | keep |
| year_no | 0 | 252742 | 0.0000000 | keep |
| month_no | 0 | 252742 | 0.0000000 | keep |
| day_no | 0 | 252742 | 0.0000000 | keep |
| total_row_cnt | 0 | 252742 | 0.0000000 | keep |
| population | 1084 | 252742 | 0.4288959 | keep |
| continent | 14194 | 252742 | 5.6160037 | keep |
| total_cases | 14338 | 252742 | 5.6729788 | keep |
| new_cases | 14600 | 252742 | 5.7766418 | keep |
| total_cases_per_million | 15422 | 252742 | 6.1018746 | keep |
| new_cases_per_million | 15684 | 252742 | 6.2055377 | keep |
| new_cases_smoothed | 15804 | 252742 | 6.2530169 | keep |
| new_cases_smoothed_per_million | 16883 | 252742 | 6.6799345 | keep |
| life_expectancy | 20859 | 252742 | 8.2530802 | keep |
| population_density | 32533 | 252742 | 12.8720197 | keep |
| total_deaths | 33911 | 252742 | 13.4172397 | keep |
| new_deaths | 34014 | 252742 | 13.4579927 | keep |
| total_deaths_per_million | 34982 | 252742 | 13.8409920 | keep |
| new_deaths_per_million | 35085 | 252742 | 13.8817450 | keep |
| new_deaths_smoothed | 35200 | 252742 | 13.9272460 | keep |
| new_deaths_smoothed_per_million | 36266 | 252742 | 14.3490199 | keep |
| diabetes_prevalence | 39004 | 252742 | 15.4323381 | keep |
| median_age | 49061 | 252742 | 19.4114947 | keep |
| gdp_per_capita | 49438 | 252742 | 19.5606587 | keep |
| cardiovasc_death_rate | 49657 | 252742 | 19.6473083 | keep |
| aged_70_older | 50126 | 252742 | 19.8328730 | keep |
## type_of_meal_plan .. room_type_reserved .. market_segment_type .. booking_status
f1 <- hotel_ds1 %>%
count(type_of_meal_plan)
f2 <- hotel_ds1 %>%
count(room_type_reserved)
f3 <- hotel_ds1 %>%
count(market_segment_type)
f4 <- hotel_ds1 %>%
count(booking_status)
## count rows in dataset
nr <- nrow(hotel_ds1)
## now get percentage proportion for each factor
f1_p <- f1 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)
##
f2_p <- f2 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)
##
f3_p <- f3 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)
##
f4_p <- f4 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)## type_of_meal_plan
f1_p# A tibble: 4 × 4
type_of_meal_plan n t_rows perc
<fct> <int> <int> <dbl>
1 Meal Plan 3 5 36275 0.0138
2 Meal Plan 2 3305 36275 9.11
3 Not Selected 5130 36275 14.1
4 Meal Plan 1 27835 36275 76.7
## room_type_reserved
f2_p# A tibble: 7 × 4
room_type_reserved n t_rows perc
<fct> <int> <int> <dbl>
1 Room_Type 3 7 36275 0.0193
2 Room_Type 7 158 36275 0.436
3 Room_Type 5 265 36275 0.731
4 Room_Type 2 692 36275 1.91
5 Room_Type 6 966 36275 2.66
6 Room_Type 4 6057 36275 16.7
7 Room_Type 1 28130 36275 77.5
## market_segment_type
f3_p# A tibble: 5 × 4
market_segment_type n t_rows perc
<fct> <int> <int> <dbl>
1 Aviation 125 36275 0.345
2 Complementary 391 36275 1.08
3 Corporate 2017 36275 5.56
4 Offline 10528 36275 29.0
5 Online 23214 36275 64.0
## booking_status
f4_p# A tibble: 2 × 4
booking_status n t_rows perc
<fct> <int> <int> <dbl>
1 Canceled 11885 36275 32.8
2 Not_Canceled 24390 36275 67.2
vars n mean sd median trimmed
no_of_adults 1 36275 1.84 0.52 2.00 1.86
no_of_children 2 36275 0.11 0.40 0.00 0.00
no_of_weekend_nights 3 36275 0.81 0.87 1.00 0.74
no_of_week_nights 4 36275 2.20 1.41 2.00 2.09
required_car_parking_space 5 36275 0.03 0.17 0.00 0.00
lead_time 6 36275 85.23 85.93 57.00 71.42
arrival_year 7 36275 2017.82 0.38 2018.00 2017.90
arrival_month 8 36275 7.42 3.07 8.00 7.55
arrival_date 9 36275 15.60 8.74 16.00 15.56
repeated_guest 10 36275 0.03 0.16 0.00 0.00
no_of_previous_cancellations 11 36275 0.02 0.37 0.00 0.00
no_of_previous_bookings_not_canceled 12 36275 0.15 1.75 0.00 0.00
avg_price_per_room 13 36275 103.42 35.09 99.45 101.32
no_of_special_requests 14 36275 0.62 0.79 0.00 0.50
date 15 36238 NaN NA NA NaN
mad min max range skew kurtosis se
no_of_adults 0.00 0 4 4 -0.33 0.81 0.00
no_of_children 0.00 0 10 10 4.71 36.97 0.00
no_of_weekend_nights 1.48 0 7 7 0.74 0.30 0.00
no_of_week_nights 1.48 0 17 17 1.60 7.80 0.01
required_car_parking_space 0.00 0 1 1 5.41 27.30 0.00
lead_time 69.68 0 443 443 1.29 1.18 0.45
arrival_year 0.00 2017 2018 1 -1.67 0.79 0.00
arrival_month 2.97 1 12 11 -0.35 -0.93 0.02
arrival_date 11.86 1 31 30 0.03 -1.16 0.05
repeated_guest 0.00 0 1 1 6.00 34.03 0.00
no_of_previous_cancellations 0.00 0 13 13 25.20 732.59 0.00
no_of_previous_bookings_not_canceled 0.00 0 58 58 19.25 457.29 0.01
avg_price_per_room 30.02 0 540 540 0.67 3.15 0.18
no_of_special_requests 0.00 0 5 5 1.14 0.88 0.00
date NA Inf -Inf -Inf NA NA NA
market_segment_type Meal Plan 1 Meal Plan 2 Meal Plan 3 Not Selected
Aviation 125 (0.34%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Complementary 370 (1.02%) 11 (0.03%) 4 (0.01%) 6 (0.02%)
Corporate 1996 (5.50%) 6 (0.02%) 0 (0.00%) 15 (0.04%)
Offline 7988 (22.02%) 2365 (6.52%) 1 (0.00%) 174 (0.48%)
Online 17356 (47.85%) 923 (2.54%) 0 (0.00%) 4935 (13.60%)
market_segment_type Meal Plan 1 Meal Plan 2 Meal Plan 3 Not Selected
Aviation 125 (0.45%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Complementary 370 (1.33%) 11 (0.33%) 4 (80.00%) 6 (0.12%)
Corporate 1996 (7.17%) 6 (0.18%) 0 (0.00%) 15 (0.29%)
Offline 7988 (28.70%) 2365 (71.56%) 1 (20.00%) 174 (3.39%)
Online 17356 (62.35%) 923 (27.93%) 0 (0.00%) 4935 (96.20%)
market_segment_type Meal Plan 1 Meal Plan 2 Meal Plan 3 Not Selected
Aviation 125 (100.00%) 0 (0.00%) 0 (0.00%) 0 (0.00%)
Complementary 370 (94.63%) 11 (2.81%) 4 (1.02%) 6 (1.53%)
Corporate 1996 (98.96%) 6 (0.30%) 0 (0.00%) 15 (0.74%)
Offline 7988 (75.87%) 2365 (22.46%) 1 (0.01%) 174 (1.65%)
Online 17356 (74.77%) 923 (3.98%) 0 (0.00%) 4935 (21.26%)
---
title: "Covid/Hotel Data (HD)"
output:
flexdashboard::flex_dashboard:
orientation: columns
vertical_layout: fill
#theme: sandstone
css: fd.css
storyboard: false
social: menu
source: embed
keep_tex: yes
---
```{r setup, include=FALSE, eval=TRUE}
library(flexdashboard)
library(tidyverse)
library(plotly)
library(dplyr)
library(lubridate)
library(plotly)
library(ggplot2)
library(modelr)
library(corrplot)
library(corrgram)
library(janitor)
library(DT)
library(knitr)
library(formattable)
library(rmarkdown)
library(sqldf)
library(party)
library(partykit)
library(caret)
library(packrat)
library(rsconnect)
library(skimr)
library(sjmisc)
library(kableExtra)
library(highcharter)
library(psych)
library(ggcorrplot)
covid_ds1<- read_csv(file = "C:/Reference Material/Machine_Learning/covid-data.csv")
hotel_ds1 <-read_csv(file = "C:/Reference Material/Machine_Learning/Hotel_Reservations.csv")
#########################################################################################
nc <- ncol(covid_ds1) # number of columns in dataset
## add columns year_no,month_no,day_no
covid_ds1 <- covid_ds1 %>%
mutate(year_no = factor(year(date))) %>%
mutate(month_no = factor(month(date))) %>%
mutate(day_no = factor(day(date))) %>%
mutate(total_row_cnt = nrow(.)) %>%
mutate_if(is_character,factor) ##converts all character colmuns to facto
hotel_ds1 <- hotel_ds1 %>%
mutate(date = make_date(arrival_year,arrival_month,arrival_date)) %>%
mutate(wday = wday(date,label = TRUE)) %>%
mutate_if(is.character,factor) ##converts all character colmuns to factors
```
# **Covid -NA's analysis**
Column {data-width=550}
-----------------------------------------------------------------------
### **NA summary in data set**
###### We need to clarify if columns with too many NA's are of any use. So, the calculation is as follows .. any columns with more than 20% total NA's per total rows will be discarded for now - see code below for logic. For reference - the totalnumber of columns in the original data set is `r nc`. The case for bringing back columns is up for discussion and how these NAs should be replaced & with what methodology - "Imputation". One possibility is to take the mean for each column and replace NA with the mean - but will need careful consideration. If there is a need to use these columns that have high counts of NA then we can analyse each column on it's merit by only using the rows that have no NA.
**Early detection of too many NA's (missing data points) - therefore, can not produce a meaningful dashboard**
```{r, echo = TRUE,class.source = "small"}
## NA SUMMARY
## Can use .. filter(!complete.cases(.)) this filters all rows with an NA
column_na_cnt <- covid_ds1 %>%
summarise_all(~ sum(is.na(.))) ## counts all NAs in columns
t1 <- column_na_cnt %>%
rotate_df() %>% ## needs library(sjmisc) .. rotates columns to rows or visa versa
arrange((V1))
t1 <- t1 %>%
mutate(tot_row_count = nrow(covid_ds1)) %>%
mutate(perc_na = (V1/tot_row_count)*100) %>%
mutate(in_out_na = case_when(V1/tot_row_count > 0.2 ~ 'dismiss',
TRUE ~ 'keep' )) %>%
filter(in_out_na == 'keep')
t1 <-plyr::rename(
t1,
replace = c(V1="number_of_nas", tot_row_count="total_rows", perc_na = "NA%",
in_out_na = "keep_or_discard"),
warn_missing = FALSE
)
t1 <- rownames_to_column(t1, var = "column_name") ## 1st column had no name
# **Below is the table now showing columns kept for further analysis**
t1 %>%
kable %>%
kable_styling("striped", full_width = F, font_size = 10) %>% ## needs library(kableExtra)
scroll_box(width = "800px", height = "400px") ## needs library(kableExtra)
```
Column {data-width=450}
-----------------------------------------------------------------------
### Histogram of NA's - kept columns
```{r}
## Histogram of nas
t1a <- t1 %>%
select(everything()) %>%
filter(t1$number_of_nas >=0)
hist1 <-ggplot(t1a, aes(x=t1a$number_of_nas)) + geom_histogram(binwidth=500,color="black", fill="red")+
labs(x = "x-axis -> Bins = 500", y = "y-axis -> Count")
ggplotly(hist1)%>%
config(displayModeBar = FALSE)
```
### Tree Map showing kept columns with NA totals descending order
```{r}
## Tree map
t1%>%
arrange(number_of_nas)%>%
hchart(type = "treemap", hcaes(x = column_name, value = number_of_nas, color = number_of_nas))%>%
hc_colorAxis(stops = color_stops(colors = viridis::inferno(20)))
```
# **HD - Categorical Summary**
##Column {data-width=650}
-----------------------------------------------------------------------
### HD - Summary of Category factors in dataset
```{r, echo = TRUE,class.source = "small"}
## type_of_meal_plan .. room_type_reserved .. market_segment_type .. booking_status
f1 <- hotel_ds1 %>%
count(type_of_meal_plan)
f2 <- hotel_ds1 %>%
count(room_type_reserved)
f3 <- hotel_ds1 %>%
count(market_segment_type)
f4 <- hotel_ds1 %>%
count(booking_status)
## count rows in dataset
nr <- nrow(hotel_ds1)
## now get percentage proportion for each factor
f1_p <- f1 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)
##
f2_p <- f2 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)
##
f3_p <- f3 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)
##
f4_p <- f4 %>%
mutate(t_rows = nr) %>%
mutate(perc = (n/t_rows)*100) %>%
arrange(perc)
```
##Column {data-width=350}
-----------------------------------------------------------------------
### HD - Categorical proportion in percentages
``` {r, echo = TRUE,class.source = "small"}
## type_of_meal_plan
f1_p
## room_type_reserved
f2_p
```
##Column {data-width=350}
-----------------------------------------------------------------------
### HD - Categorical proportion in percentages
``` {r, echo = TRUE,class.source = "small"}
## market_segment_type
f3_p
## booking_status
f4_p
```
# **HD - Corr and Mkt Seg Analysis**
Inputs {.sidebar}
---------------------
LEGEND:
a_p_r -> Average Price per Room
a_y -> Arrival Year
a_m -> Arrival Month
a_d -> Arrival Date
l_time -> Lead Time
n_ad -> Number of Adults
n_ch -> Number of Children
n_wen -> Number of Weekend Nights
n_wn -> Number of Week Nights
n_pc -> Number of Previous Cancellations
n_pbnc -> Number of Previous Bookings Not Cancelled
n_sr -> Number of Special Requests
r_cps -> Required Car Parking Space
r_g -> Repeated Guest
```{r, echo = TRUE}
```
## Row {.tabset .tabset-fade}
-----------------------------------------------------------------------
### HD - Correlation Chart
```{r}
cor_plot1 <- hotel_ds1 %>%
select(market_segment_type,lead_time,avg_price_per_room,starts_with(c("no","arr","re"))) %>%
group_by(market_segment_type) %>%
summarise_all(~ mean(.,na.rm = TRUE))
cor_plot2 <- cor_plot1 %>%
select(everything(),-market_segment_type)
colnames(cor_plot2)<- c("l_time","a_p_r","n_ad","n_ch","n_wen","n_wn","n_pc","n_pbnc","n_sr",
"a_y", "a_m","a_d","r_cps","r_g")
cor_plot3 <- cor(cor_plot2)
##corrplot(cor_plot3,method = "pie", type = "upper", tl.cex=1.2)
##preferred plot type below
```
```{r}
ggcorrplot(cor_plot3,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 2)
```
### HD - Summary of factors in dataset
```{css}
.pre {
max-height: 800px;
float: left;
width: 910px;
overflow-y: auto;
overflow-x: auto;
}
```
``` {r, class.output = "pre",echo = FALSE,class.source = "small"}
## select all columns with numeric data
nd <- hotel_ds1 %>%
select_if(is_double)
describe(nd)
```
## Column {data-width=400}
-----------------------------------------------------------------------
### HD - Market segment percentages by Meal Plan - all %
```{r}
mat1 <- hotel_ds1 %>%
select(market_segment_type,type_of_meal_plan,arrival_year)
##
## GENDER RACE %
mat1a <-mat1 %>%
tabyl(market_segment_type,type_of_meal_plan) %>%
adorn_percentages("all") %>% ## CAN BE col, row, all
adorn_pct_formatting(digits = 2) %>%
adorn_ns("front")
print(mat1a)
```
### HD - Market segment percentages by Meal Plan - column %
```{r}
mat1b <-mat1 %>%
tabyl(market_segment_type,type_of_meal_plan) %>%
adorn_percentages("col") %>% ## CAN BE col, row, all
adorn_pct_formatting(digits = 2) %>%
adorn_ns("front")
print(mat1b)
```
### HD - Market segment percentages by Meal Plan - row %
```{r, echo=FALSE}
mat1c <-mat1 %>%
tabyl(market_segment_type,type_of_meal_plan) %>%
adorn_percentages("row") %>% ## CAN BE col, row, all
adorn_pct_formatting(digits = 2) %>%
adorn_ns("front")
print(mat1c)
```
# **HD - Facet grid**
```{r}
facet1 <- hotel_ds1 %>%
filter(wday != 'NA')
sp <- ggplot(facet1, aes(x=avg_price_per_room, y=lead_time,colour = wday)) + geom_point(shape=1)+
facet_grid(room_type_reserved ~ type_of_meal_plan) +
theme(strip.text.x = element_text(size=6, angle=0),
strip.text.y = element_text(size=6, face="bold"),
strip.background = element_rect(colour="red", fill="#CCCCFF"))
ggplotly(sp)
```