Covid -NA’s analysis

Column

NA summary in data set

We need to clarify if columns with too many NA’s are of any use. So, the calculation is as follows .. any columns with more than 20% total NA’s per total rows will be discarded for now - see code below for logic. For reference - the totalnumber of columns in the original data set is 67. The case for bringing back columns is up for discussion and how these NAs should be replaced & with what methodology - “Imputation”. One possibility is to take the mean for each column and replace NA with the mean - but will need careful consideration. If there is a need to use these columns that have high counts of NA then we can analyse each column on it’s merit by only using the rows that have no NA.

Early detection of too many NA’s (missing data points) - therefore, can not produce a meaningful dashboard

## NA SUMMARY
## Can use .. filter(!complete.cases(.)) this filters all rows with an NA
column_na_cnt <- covid_ds1 %>% 
  summarise_all(~ sum(is.na(.))) ## counts all NAs in columns

t1 <- column_na_cnt %>% 
  rotate_df() %>%  ## needs library(sjmisc) .. rotates columns to rows or visa versa
  arrange((V1))

t1 <- t1 %>% 
  mutate(tot_row_count = nrow(covid_ds1)) %>% 
  mutate(perc_na = (V1/tot_row_count)*100) %>% 
  mutate(in_out_na = case_when(V1/tot_row_count > 0.2 ~ 'dismiss',
                                TRUE ~ 'keep' )) %>% 
  filter(in_out_na == 'keep') 

t1 <-plyr::rename(
  t1,
  replace      = c(V1="number_of_nas", tot_row_count="total_rows", perc_na = "NA%",
                   in_out_na = "keep_or_discard"),
  warn_missing = FALSE
)

t1 <- rownames_to_column(t1, var = "column_name") ## 1st column had no name

# **Below is the table now showing columns kept for further analysis**

t1 %>% 
  kable %>%
  kable_styling("striped", full_width = F, font_size = 10) %>% ## needs library(kableExtra)
 scroll_box(width = "800px", height = "400px") ## needs library(kableExtra)

column_name	number_of_nas	total_rows	NA%	keep_or_discard
iso_code	0	252742	0.0000000	keep
location	0	252742	0.0000000	keep
date	0	252742	0.0000000	keep
year_no	0	252742	0.0000000	keep
month_no	0	252742	0.0000000	keep
day_no	0	252742	0.0000000	keep
total_row_cnt	0	252742	0.0000000	keep
population	1084	252742	0.4288959	keep
continent	14194	252742	5.6160037	keep
total_cases	14338	252742	5.6729788	keep
new_cases	14600	252742	5.7766418	keep
total_cases_per_million	15422	252742	6.1018746	keep
new_cases_per_million	15684	252742	6.2055377	keep
new_cases_smoothed	15804	252742	6.2530169	keep
new_cases_smoothed_per_million	16883	252742	6.6799345	keep
life_expectancy	20859	252742	8.2530802	keep
population_density	32533	252742	12.8720197	keep
total_deaths	33911	252742	13.4172397	keep
new_deaths	34014	252742	13.4579927	keep
total_deaths_per_million	34982	252742	13.8409920	keep
new_deaths_per_million	35085	252742	13.8817450	keep
new_deaths_smoothed	35200	252742	13.9272460	keep
new_deaths_smoothed_per_million	36266	252742	14.3490199	keep
diabetes_prevalence	39004	252742	15.4323381	keep
median_age	49061	252742	19.4114947	keep
gdp_per_capita	49438	252742	19.5606587	keep
cardiovasc_death_rate	49657	252742	19.6473083	keep
aged_70_older	50126	252742	19.8328730	keep

Column

Histogram of NA’s - kept columns

Tree Map showing kept columns with NA totals descending order

HD - Categorical Summary

##Column

HD - Summary of Category factors in dataset

## type_of_meal_plan .. room_type_reserved .. market_segment_type .. booking_status

f1 <- hotel_ds1 %>% 
  count(type_of_meal_plan)
f2 <- hotel_ds1 %>% 
  count(room_type_reserved)
f3 <- hotel_ds1 %>% 
  count(market_segment_type)
f4 <- hotel_ds1 %>% 
  count(booking_status)

## count rows in dataset
nr <- nrow(hotel_ds1)
## now get percentage proportion for each factor
f1_p <- f1 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)
##
f2_p <- f2 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)
##
f3_p <- f3 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)
##
f4_p <- f4 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)

##Column

HD - Categorical proportion in percentages

## type_of_meal_plan 
f1_p

# A tibble: 4 × 4
  type_of_meal_plan     n t_rows    perc
  <fct>             <int>  <int>   <dbl>
1 Meal Plan 3           5  36275  0.0138
2 Meal Plan 2        3305  36275  9.11  
3 Not Selected       5130  36275 14.1   
4 Meal Plan 1       27835  36275 76.7

## room_type_reserved 
f2_p

# A tibble: 7 × 4
  room_type_reserved     n t_rows    perc
  <fct>              <int>  <int>   <dbl>
1 Room_Type 3            7  36275  0.0193
2 Room_Type 7          158  36275  0.436 
3 Room_Type 5          265  36275  0.731 
4 Room_Type 2          692  36275  1.91  
5 Room_Type 6          966  36275  2.66  
6 Room_Type 4         6057  36275 16.7   
7 Room_Type 1        28130  36275 77.5

##Column

HD - Categorical proportion in percentages

## market_segment_type 
f3_p

# A tibble: 5 × 4
  market_segment_type     n t_rows   perc
  <fct>               <int>  <int>  <dbl>
1 Aviation              125  36275  0.345
2 Complementary         391  36275  1.08 
3 Corporate            2017  36275  5.56 
4 Offline             10528  36275 29.0  
5 Online              23214  36275 64.0

## booking_status
f4_p

# A tibble: 2 × 4
  booking_status     n t_rows  perc
  <fct>          <int>  <int> <dbl>
1 Canceled       11885  36275  32.8
2 Not_Canceled   24390  36275  67.2

HD - Corr and Mkt Seg Analysis

## Row

HD - Correlation Chart

HD - Summary of factors in dataset

                                     vars     n    mean    sd  median trimmed
no_of_adults                            1 36275    1.84  0.52    2.00    1.86
no_of_children                          2 36275    0.11  0.40    0.00    0.00
no_of_weekend_nights                    3 36275    0.81  0.87    1.00    0.74
no_of_week_nights                       4 36275    2.20  1.41    2.00    2.09
required_car_parking_space              5 36275    0.03  0.17    0.00    0.00
lead_time                               6 36275   85.23 85.93   57.00   71.42
arrival_year                            7 36275 2017.82  0.38 2018.00 2017.90
arrival_month                           8 36275    7.42  3.07    8.00    7.55
arrival_date                            9 36275   15.60  8.74   16.00   15.56
repeated_guest                         10 36275    0.03  0.16    0.00    0.00
no_of_previous_cancellations           11 36275    0.02  0.37    0.00    0.00
no_of_previous_bookings_not_canceled   12 36275    0.15  1.75    0.00    0.00
avg_price_per_room                     13 36275  103.42 35.09   99.45  101.32
no_of_special_requests                 14 36275    0.62  0.79    0.00    0.50
date                                   15 36238     NaN    NA      NA     NaN
                                       mad  min  max range  skew kurtosis   se
no_of_adults                          0.00    0    4     4 -0.33     0.81 0.00
no_of_children                        0.00    0   10    10  4.71    36.97 0.00
no_of_weekend_nights                  1.48    0    7     7  0.74     0.30 0.00
no_of_week_nights                     1.48    0   17    17  1.60     7.80 0.01
required_car_parking_space            0.00    0    1     1  5.41    27.30 0.00
lead_time                            69.68    0  443   443  1.29     1.18 0.45
arrival_year                          0.00 2017 2018     1 -1.67     0.79 0.00
arrival_month                         2.97    1   12    11 -0.35    -0.93 0.02
arrival_date                         11.86    1   31    30  0.03    -1.16 0.05
repeated_guest                        0.00    0    1     1  6.00    34.03 0.00
no_of_previous_cancellations          0.00    0   13    13 25.20   732.59 0.00
no_of_previous_bookings_not_canceled  0.00    0   58    58 19.25   457.29 0.01
avg_price_per_room                   30.02    0  540   540  0.67     3.15 0.18
no_of_special_requests                0.00    0    5     5  1.14     0.88 0.00
date                                    NA  Inf -Inf  -Inf    NA       NA   NA

## Column

HD - Market segment percentages by Meal Plan - all %

 market_segment_type    Meal Plan 1  Meal Plan 2 Meal Plan 3  Not Selected
            Aviation   125  (0.34%)    0 (0.00%)   0 (0.00%)    0  (0.00%)
       Complementary   370  (1.02%)   11 (0.03%)   4 (0.01%)    6  (0.02%)
           Corporate  1996  (5.50%)    6 (0.02%)   0 (0.00%)   15  (0.04%)
             Offline  7988 (22.02%) 2365 (6.52%)   1 (0.00%)  174  (0.48%)
              Online 17356 (47.85%)  923 (2.54%)   0 (0.00%) 4935 (13.60%)

HD - Market segment percentages by Meal Plan - column %

 market_segment_type    Meal Plan 1   Meal Plan 2 Meal Plan 3  Not Selected
            Aviation   125  (0.45%)    0  (0.00%)  0  (0.00%)    0  (0.00%)
       Complementary   370  (1.33%)   11  (0.33%)  4 (80.00%)    6  (0.12%)
           Corporate  1996  (7.17%)    6  (0.18%)  0  (0.00%)   15  (0.29%)
             Offline  7988 (28.70%) 2365 (71.56%)  1 (20.00%)  174  (3.39%)
              Online 17356 (62.35%)  923 (27.93%)  0  (0.00%) 4935 (96.20%)

HD - Market segment percentages by Meal Plan - row %

 market_segment_type     Meal Plan 1   Meal Plan 2 Meal Plan 3  Not Selected
            Aviation   125 (100.00%)    0  (0.00%)   0 (0.00%)    0  (0.00%)
       Complementary   370  (94.63%)   11  (2.81%)   4 (1.02%)    6  (1.53%)
           Corporate  1996  (98.96%)    6  (0.30%)   0 (0.00%)   15  (0.74%)
             Offline  7988  (75.87%) 2365 (22.46%)   1 (0.01%)  174  (1.65%)
              Online 17356  (74.77%)  923  (3.98%)   0 (0.00%) 4935 (21.26%)

HD - Facet grid

---
title: "Covid/Hotel Data (HD)"
output: 
  flexdashboard::flex_dashboard:
    orientation: columns
    vertical_layout: fill
    #theme: sandstone
    css: fd.css
    storyboard: false
    social: menu
    source: embed
    keep_tex: yes
---

```{r setup, include=FALSE, eval=TRUE}
library(flexdashboard)
library(tidyverse)
library(plotly)
library(dplyr)
library(lubridate)
library(plotly)
library(ggplot2)
library(modelr)
library(corrplot)
library(corrgram)
library(janitor)
library(DT)
library(knitr)
library(formattable)
library(rmarkdown)
library(sqldf)
library(party)
library(partykit)
library(caret)
library(packrat)
library(rsconnect)
library(skimr)
library(sjmisc)
library(kableExtra)
library(highcharter)
library(psych)
library(ggcorrplot)


covid_ds1<- read_csv(file = "C:/Reference Material/Machine_Learning/covid-data.csv")
hotel_ds1 <-read_csv(file = "C:/Reference Material/Machine_Learning/Hotel_Reservations.csv")

#########################################################################################
nc <- ncol(covid_ds1) # number of columns in dataset

## add columns year_no,month_no,day_no
covid_ds1 <- covid_ds1 %>% 
  mutate(year_no  = factor(year(date))) %>% 
  mutate(month_no = factor(month(date))) %>% 
  mutate(day_no   = factor(day(date))) %>% 
  mutate(total_row_cnt = nrow(.)) %>% 
  mutate_if(is_character,factor) ##converts all character colmuns to facto

hotel_ds1 <- hotel_ds1 %>% 
   mutate(date = make_date(arrival_year,arrival_month,arrival_date)) %>% 
   mutate(wday = wday(date,label = TRUE)) %>% 
   mutate_if(is.character,factor) ##converts all character colmuns to factors
  

  
```

# **Covid -NA's analysis**


Column {data-width=550}
-----------------------------------------------------------------------

### **NA summary in data set**
###### We need to clarify if columns with too many NA's are of any use. So, the calculation is as follows .. any columns with more than 20% total NA's per total rows will be discarded for now - see code below for logic. For reference - the totalnumber of columns in the original data set is `r nc`. The case for bringing back columns is up for discussion and how these NAs should be replaced & with what methodology - "Imputation". One possibility is to take the mean for each column and replace NA with the mean - but will need careful consideration. If there is a need to use these columns that have high counts of NA then we can analyse each column on it's merit by only using the rows that have no NA.
**Early detection of too many NA's (missing data points) - therefore, can not produce a meaningful dashboard**

```{r, echo = TRUE,class.source = "small"}
## NA SUMMARY
## Can use .. filter(!complete.cases(.)) this filters all rows with an NA
column_na_cnt <- covid_ds1 %>% 
  summarise_all(~ sum(is.na(.))) ## counts all NAs in columns

t1 <- column_na_cnt %>% 
  rotate_df() %>%  ## needs library(sjmisc) .. rotates columns to rows or visa versa
  arrange((V1))

t1 <- t1 %>% 
  mutate(tot_row_count = nrow(covid_ds1)) %>% 
  mutate(perc_na = (V1/tot_row_count)*100) %>% 
  mutate(in_out_na = case_when(V1/tot_row_count > 0.2 ~ 'dismiss',
                                TRUE ~ 'keep' )) %>% 
  filter(in_out_na == 'keep') 

t1 <-plyr::rename(
  t1,
  replace      = c(V1="number_of_nas", tot_row_count="total_rows", perc_na = "NA%",
                   in_out_na = "keep_or_discard"),
  warn_missing = FALSE
)

t1 <- rownames_to_column(t1, var = "column_name") ## 1st column had no name

# **Below is the table now showing columns kept for further analysis**

t1 %>% 
  kable %>%
  kable_styling("striped", full_width = F, font_size = 10) %>% ## needs library(kableExtra)
 scroll_box(width = "800px", height = "400px") ## needs library(kableExtra)


```

Column {data-width=450}
-----------------------------------------------------------------------

### Histogram of NA's - kept columns

```{r}
## Histogram of nas
t1a <- t1 %>% 
  select(everything()) %>% 
  filter(t1$number_of_nas >=0)
  
  
hist1 <-ggplot(t1a, aes(x=t1a$number_of_nas)) + geom_histogram(binwidth=500,color="black", fill="red")+
  labs(x = "x-axis -> Bins = 500", y = "y-axis -> Count")

ggplotly(hist1)%>%
  config(displayModeBar = FALSE)
```

### Tree Map showing kept columns with NA totals descending order

```{r}
## Tree map

t1%>%
  arrange(number_of_nas)%>%
  hchart(type = "treemap", hcaes(x = column_name, value = number_of_nas, color = number_of_nas))%>%
  hc_colorAxis(stops = color_stops(colors = viridis::inferno(20)))
 
```

# **HD - Categorical Summary**

##Column {data-width=650}
-----------------------------------------------------------------------

### HD - Summary of  Category factors in dataset

```{r, echo = TRUE,class.source = "small"}

## type_of_meal_plan .. room_type_reserved .. market_segment_type .. booking_status

f1 <- hotel_ds1 %>% 
  count(type_of_meal_plan)
f2 <- hotel_ds1 %>% 
  count(room_type_reserved)
f3 <- hotel_ds1 %>% 
  count(market_segment_type)
f4 <- hotel_ds1 %>% 
  count(booking_status)

## count rows in dataset
nr <- nrow(hotel_ds1)
## now get percentage proportion for each factor
f1_p <- f1 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)
##
f2_p <- f2 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)
##
f3_p <- f3 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)
##
f4_p <- f4 %>% 
  mutate(t_rows = nr) %>% 
  mutate(perc = (n/t_rows)*100) %>% 
  arrange(perc)


```

##Column {data-width=350}
-----------------------------------------------------------------------

### HD - Categorical proportion in percentages

``` {r, echo = TRUE,class.source = "small"}
## type_of_meal_plan 
f1_p

## room_type_reserved 
f2_p


```

##Column {data-width=350}
-----------------------------------------------------------------------

### HD - Categorical proportion in percentages

``` {r, echo = TRUE,class.source = "small"}

## market_segment_type 
f3_p

## booking_status
f4_p

```


# **HD - Corr and Mkt Seg Analysis**

Inputs {.sidebar}
---------------------
LEGEND:

a_p_r   -> Average Price per Room  
a_y     -> Arrival Year
a_m     -> Arrival Month
a_d     -> Arrival Date
l_time  -> Lead Time
n_ad    -> Number of Adults          
n_ch    -> Number of Children        
n_wen   -> Number of Weekend Nights          
n_wn    -> Number of Week Nights         
n_pc    -> Number of Previous Cancellations        
n_pbnc  -> Number of Previous Bookings Not Cancelled
n_sr    -> Number of Special Requests
r_cps   -> Required Car Parking Space
r_g     -> Repeated Guest


```{r, echo = TRUE}


```

## Row {.tabset .tabset-fade}
-----------------------------------------------------------------------

### HD - Correlation Chart

```{r}
cor_plot1 <- hotel_ds1 %>% 
  select(market_segment_type,lead_time,avg_price_per_room,starts_with(c("no","arr","re"))) %>% 
  group_by(market_segment_type) %>%
  summarise_all(~ mean(.,na.rm = TRUE))
  

cor_plot2 <- cor_plot1 %>% 
  select(everything(),-market_segment_type) 

colnames(cor_plot2)<- c("l_time","a_p_r","n_ad","n_ch","n_wen","n_wn","n_pc","n_pbnc","n_sr",
                        "a_y", "a_m","a_d","r_cps","r_g")

cor_plot3 <- cor(cor_plot2)
##corrplot(cor_plot3,method = "pie", type = "upper", tl.cex=1.2)

##preferred plot type below
```



```{r}

ggcorrplot(cor_plot3, 
           hc.order = TRUE, 
           type = "lower",
           lab = TRUE,
           lab_size = 2)

```

### HD - Summary of  factors in dataset

```{css}
.pre {
  max-height: 800px;
  float: left;
  width: 910px;
  overflow-y: auto;
  overflow-x: auto;
}
```
``` {r, class.output = "pre",echo = FALSE,class.source = "small"}
## select all columns with numeric data

nd <- hotel_ds1 %>% 
  select_if(is_double)


describe(nd)
             
```

## Column {data-width=400}
-----------------------------------------------------------------------

### HD - Market segment percentages by Meal Plan - all %

```{r}
mat1 <- hotel_ds1 %>% 
  select(market_segment_type,type_of_meal_plan,arrival_year)

##
## GENDER RACE %
mat1a <-mat1 %>%
  tabyl(market_segment_type,type_of_meal_plan) %>%
  adorn_percentages("all") %>% ## CAN BE col, row, all
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns("front")
print(mat1a)
```

### HD - Market segment percentages by Meal Plan - column %

```{r}
mat1b <-mat1 %>%
  tabyl(market_segment_type,type_of_meal_plan) %>%
  adorn_percentages("col") %>% ## CAN BE col, row, all
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns("front")
print(mat1b)
```

### HD - Market segment percentages by Meal Plan - row %

```{r, echo=FALSE}
mat1c <-mat1 %>%
  tabyl(market_segment_type,type_of_meal_plan) %>%
  adorn_percentages("row") %>% ## CAN BE col, row, all
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns("front")
print(mat1c)
```

# **HD - Facet grid**

```{r}

facet1 <- hotel_ds1 %>% 
  filter(wday != 'NA')

sp <- ggplot(facet1, aes(x=avg_price_per_room, y=lead_time,colour = wday)) + geom_point(shape=1)+
facet_grid(room_type_reserved ~ type_of_meal_plan) +
  theme(strip.text.x = element_text(size=6, angle=0),
        strip.text.y = element_text(size=6, face="bold"),
        strip.background = element_rect(colour="red", fill="#CCCCFF"))
ggplotly(sp)

```