Data Description

Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(summarytools)
# Load the hotel bookings data set and preview its first ten rows.
# The CSV location defaults to the original lab path but can be overridden by
# setting the HOTEL_BOOKINGS_CSV environment variable, so the script is not
# tied to one machine's drive layout.
bookings_csv <- Sys.getenv(
  "HOTEL_BOOKINGS_CSV",
  unset = "G:/semester_1/4_Statistics_R/syllabus/lab/week8/hotel_bookings.csv"
)
# Fail early with a clear message instead of read.csv()'s connection error.
if (!file.exists(bookings_csv)) {
  stop("Hotel bookings CSV not found: ", bookings_csv, call. = FALSE)
}
hotel_data <- read.csv(bookings_csv)
head(hotel_data, n = 10)
##           hotel is_canceled lead_time arrival_date_year arrival_date_month
## 1  Resort Hotel           0       342              2015               July
## 2  Resort Hotel           0       737              2015               July
## 3  Resort Hotel           0         7              2015               July
## 4  Resort Hotel           0        13              2015               July
## 5  Resort Hotel           0        14              2015               July
## 6  Resort Hotel           0        14              2015               July
## 7  Resort Hotel           0         0              2015               July
## 8  Resort Hotel           0         9              2015               July
## 9  Resort Hotel           1        85              2015               July
## 10 Resort Hotel           1        75              2015               July
##    arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
## 1                        27                         1                       0
## 2                        27                         1                       0
## 3                        27                         1                       0
## 4                        27                         1                       0
## 5                        27                         1                       0
## 6                        27                         1                       0
## 7                        27                         1                       0
## 8                        27                         1                       0
## 9                        27                         1                       0
## 10                       27                         1                       0
##    stays_in_week_nights adults children babies meal country market_segment
## 1                     0      2        0      0   BB     PRT         Direct
## 2                     0      2        0      0   BB     PRT         Direct
## 3                     1      1        0      0   BB     GBR         Direct
## 4                     1      1        0      0   BB     GBR      Corporate
## 5                     2      2        0      0   BB     GBR      Online TA
## 6                     2      2        0      0   BB     GBR      Online TA
## 7                     2      2        0      0   BB     PRT         Direct
## 8                     2      2        0      0   FB     PRT         Direct
## 9                     3      2        0      0   BB     PRT      Online TA
## 10                    3      2        0      0   HB     PRT  Offline TA/TO
##    distribution_channel is_repeated_guest previous_cancellations
## 1                Direct                 0                      0
## 2                Direct                 0                      0
## 3                Direct                 0                      0
## 4             Corporate                 0                      0
## 5                 TA/TO                 0                      0
## 6                 TA/TO                 0                      0
## 7                Direct                 0                      0
## 8                Direct                 0                      0
## 9                 TA/TO                 0                      0
## 10                TA/TO                 0                      0
##    previous_bookings_not_canceled reserved_room_type assigned_room_type
## 1                               0                  C                  C
## 2                               0                  C                  C
## 3                               0                  A                  C
## 4                               0                  A                  A
## 5                               0                  A                  A
## 6                               0                  A                  A
## 7                               0                  C                  C
## 8                               0                  C                  C
## 9                               0                  A                  A
## 10                              0                  D                  D
##    booking_changes deposit_type agent company days_in_waiting_list
## 1                3   No Deposit  NULL    NULL                    0
## 2                4   No Deposit  NULL    NULL                    0
## 3                0   No Deposit  NULL    NULL                    0
## 4                0   No Deposit   304    NULL                    0
## 5                0   No Deposit   240    NULL                    0
## 6                0   No Deposit   240    NULL                    0
## 7                0   No Deposit  NULL    NULL                    0
## 8                0   No Deposit   303    NULL                    0
## 9                0   No Deposit   240    NULL                    0
## 10               0   No Deposit    15    NULL                    0
##    customer_type   adr required_car_parking_spaces total_of_special_requests
## 1      Transient   0.0                           0                         0
## 2      Transient   0.0                           0                         0
## 3      Transient  75.0                           0                         0
## 4      Transient  75.0                           0                         0
## 5      Transient  98.0                           0                         1
## 6      Transient  98.0                           0                         1
## 7      Transient 107.0                           0                         0
## 8      Transient 103.0                           0                         1
## 9      Transient  82.0                           0                         1
## 10     Transient 105.5                           0                         0
##    reservation_status reservation_status_date
## 1           Check-Out              2015-07-01
## 2           Check-Out              2015-07-01
## 3           Check-Out              2015-07-02
## 4           Check-Out              2015-07-02
## 5           Check-Out              2015-07-03
## 6           Check-Out              2015-07-03
## 7           Check-Out              2015-07-03
## 8           Check-Out              2015-07-03
## 9            Canceled              2015-05-06
## 10           Canceled              2015-04-22

Select a Response Variable:

# Response for the analyses below: the adr column of hotel_data.
response_variable <- hotel_data[["adr"]]

Select an Explanatory Variable:

# Grouping variable for the ANOVA: the arrival_date_month column of hotel_data.
explanatory_variable <- hotel_data[["arrival_date_month"]]

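Null Hypothesis for ANOVA:

H0: the mean adr is the same for every arrival_date_month. H1: the mean adr differs for at least one arrival_date_month.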

Perform ANOVA Test:

# One-way ANOVA of response_variable across the groups in explanatory_variable.
# Both formula terms are the free-standing vectors assigned earlier in this
# script, not columns of hotel_data, so they are resolved from the formula's
# environment rather than from the `data` argument.
anova_result <- aov(
  formula = response_variable ~ explanatory_variable,
  data = hotel_data
)
summary(object = anova_result)
##                          Df    Sum Sq Mean Sq F value Pr(>F)    
## explanatory_variable     11  58433791 5312163    2573 <2e-16 ***
## Residuals            119378 246469729    2065                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

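Summarize ANOVA Results:

The one-way ANOVA gives F(11, 119378) = 2573 with p < 2e-16, so the hypothesis that mean adr is equal across all arrival months is rejected at any conventional significance level.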

Consolidate Categories for “arrival_date_month”:

# Collapse the twelve arrival months into four seasons. Any value that is not
# one of the nine months listed below (including a missing month) falls through
# to "Winter", exactly as in a nested ifelse() chain ending in "Winter".
arrival_month <- hotel_data$arrival_date_month
hotel_data$season <- dplyr::case_when(
  arrival_month %in% c("March", "April", "May") ~ "Spring",
  arrival_month %in% c("June", "July", "August") ~ "Summer",
  arrival_month %in% c("September", "October", "November") ~ "Autumn",
  TRUE ~ "Winter"
)

# List the distinct season labels that were produced.
unique(hotel_data[["season"]])
## [1] "Summer" "Autumn" "Winter" "Spring"

Explain Implications:

Find Another Continuous Variable:

# Continuous predictor for the regressions below: the lead_time column.
continuous_variable <- hotel_data[["lead_time"]]

Build a Linear Regression Model:

# Simple linear regression of response_variable on continuous_variable.
# Both terms are the stand-alone vectors assigned earlier in this script, so
# they are found in the formula's environment rather than in hotel_data.
model_lead_time <- lm(
  formula = response_variable ~ continuous_variable,
  data = hotel_data
)
summary(object = model_lead_time)
## 
## Call:
## lm(formula = response_variable ~ continuous_variable, data = hotel_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -105.5  -31.4   -7.2   23.9 5296.1 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         104.933697   0.203692  515.16   <2e-16 ***
## continuous_variable  -0.029829   0.001366  -21.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.44 on 119388 degrees of freedom
## Multiple R-squared:  0.003979,   Adjusted R-squared:  0.00397 
## F-statistic: 476.9 on 1 and 119388 DF,  p-value: < 2.2e-16

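Summarize Linear Regression Results:

The estimated slope for lead_time is -0.0298 (p < 2e-16), i.e. each additional day of lead time is associated with an adr about 0.03 lower on average. The fit is weak: R-squared is 0.003979, so lead_time alone explains roughly 0.4% of the variance in adr.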

Include at least one other variable:

# Additive model: continuous_variable plus the season column of hotel_data.
# `season` is taken from hotel_data via the `data` argument; the other two
# terms are the stand-alone vectors assigned earlier in this script.
model_with_season <- lm(
  formula = response_variable ~ continuous_variable + season,
  data = hotel_data
)
summary(object = model_with_season)
## 
## Call:
## lm(formula = response_variable ~ continuous_variable + season, 
##     data = hotel_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -137.7  -27.3   -3.9   21.3 5298.3 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          98.995021   0.311242  318.06   <2e-16 ***
## continuous_variable  -0.069481   0.001283  -54.15   <2e-16 ***
## seasonSpring          5.127029   0.373976   13.71   <2e-16 ***
## seasonSummer         38.691142   0.361971  106.89   <2e-16 ***
## seasonWinter        -20.028772   0.427095  -46.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46 on 119385 degrees of freedom
## Multiple R-squared:  0.1715, Adjusted R-squared:  0.1714 
## F-statistic:  6176 on 4 and 119385 DF,  p-value: < 2.2e-16

Consider Interaction Terms:

# Interaction model: `continuous_variable * season` fits both main effects and
# their interaction, so continuous_variable gets a separate slope per season.
model_with_interaction <- lm(
  formula = response_variable ~ continuous_variable * season,
  data = hotel_data
)
summary(object = model_with_interaction)
## 
## Call:
## lm(formula = response_variable ~ continuous_variable * season, 
##     data = hotel_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -145.3  -26.8   -4.4   21.4 5299.7 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       97.075053   0.375092 258.803   <2e-16 ***
## continuous_variable               -0.053066   0.002216 -23.947   <2e-16 ***
## seasonSpring                       4.873523   0.521387   9.347   <2e-16 ***
## seasonSummer                      48.265427   0.525845  91.786   <2e-16 ***
## seasonWinter                     -22.851461   0.535238 -42.694   <2e-16 ***
## continuous_variable:seasonSpring   0.006307   0.003499   1.802   0.0715 .  
## continuous_variable:seasonSummer  -0.076054   0.003125 -24.336   <2e-16 ***
## continuous_variable:seasonWinter   0.069009   0.004420  15.613   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.74 on 119382 degrees of freedom
## Multiple R-squared:  0.1808, Adjusted R-squared:  0.1807 
## F-statistic:  3763 on 7 and 119382 DF,  p-value: < 2.2e-16