HW_3

Identifying the dataset for final project

MANI KANTA GOGULA
2022-03-02

#1. Identify the dataset you will be using for the final project

I will be using the hotel bookings dataset from the course datasets.

#Importing the dataset

library(readr)
hotel_bookings_ <- read_csv("hotel_bookings .csv")

#Preview of the Dataset

head(hotel_bookings_)
# A tibble: 6 x 32
  hotel        is_canceled lead_time arrival_date_ye~ arrival_date_mo~
  <chr>              <dbl>     <dbl>            <dbl> <chr>           
1 Resort Hotel           0       342             2015 July            
2 Resort Hotel           0       737             2015 July            
3 Resort Hotel           0         7             2015 July            
4 Resort Hotel           0        13             2015 July            
5 Resort Hotel           0        14             2015 July            
6 Resort Hotel           0        14             2015 July            
# ... with 27 more variables: arrival_date_week_number <dbl>,
#   arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
#   stays_in_week_nights <dbl>, adults <dbl>, children <dbl>,
#   babies <dbl>, meal <chr>, country <chr>, market_segment <chr>,
#   distribution_channel <chr>, is_repeated_guest <dbl>,
#   previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>, ...

#Using the dim() function to get the dimensions of the dataset

dim(hotel_bookings_)
[1] 119390     32

Selecting the country and is_repeated_guest columns from the dataset using the select() function to see which countries' customers are repeat guests

select(hotel_bookings_,country,is_repeated_guest)
# A tibble: 119,390 x 2
   country is_repeated_guest
   <chr>               <dbl>
 1 PRT                     0
 2 PRT                     0
 3 GBR                     0
 4 GBR                     0
 5 GBR                     0
 6 GBR                     0
 7 PRT                     0
 8 PRT                     0
 9 PRT                     0
10 PRT                     0
# ... with 119,380 more rows

#Applying the filter() function to the dataset to see how many customers need a car parking space.

filter(hotel_bookings_,required_car_parking_spaces==1)
# A tibble: 7,383 x 32
   hotel       is_canceled lead_time arrival_date_ye~ arrival_date_mo~
   <chr>             <dbl>     <dbl>            <dbl> <chr>           
 1 Resort Hot~           0        78             2015 July            
 2 Resort Hot~           0        99             2015 July            
 3 Resort Hot~           0         3             2015 July            
 4 Resort Hot~           0         1             2015 July            
 5 Resort Hot~           0         1             2015 July            
 6 Resort Hot~           0         5             2015 July            
 7 Resort Hot~           0        10             2015 July            
 8 Resort Hot~           0         3             2015 July            
 9 Resort Hot~           0        72             2015 July            
10 Resort Hot~           0         9             2015 July            
# ... with 7,373 more rows, and 27 more variables:
#   arrival_date_week_number <dbl>, arrival_date_day_of_month <dbl>,
#   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>,
#   adults <dbl>, children <dbl>, babies <dbl>, meal <chr>,
#   country <chr>, market_segment <chr>, distribution_channel <chr>,
#   is_repeated_guest <dbl>, previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>, ...

#Arranging the dataset by reservation status date using the arrange() function

arrange(hotel_bookings_,reservation_status_date)
# A tibble: 119,390 x 32
   hotel       is_canceled lead_time arrival_date_ye~ arrival_date_mo~
   <chr>             <dbl>     <dbl>            <dbl> <chr>           
 1 Resort Hot~           1       344             2015 September       
 2 Resort Hot~           1       399             2015 December        
 3 Resort Hot~           1       202             2015 July            
 4 City Hotel            1       258             2015 July            
 5 City Hotel            1       258             2015 July            
 6 City Hotel            1       258             2015 July            
 7 City Hotel            1       258             2015 July            
 8 City Hotel            1       258             2015 July            
 9 City Hotel            1       258             2015 July            
10 City Hotel            1       258             2015 July            
# ... with 119,380 more rows, and 27 more variables:
#   arrival_date_week_number <dbl>, arrival_date_day_of_month <dbl>,
#   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>,
#   adults <dbl>, children <dbl>, babies <dbl>, meal <chr>,
#   country <chr>, market_segment <chr>, distribution_channel <chr>,
#   is_repeated_guest <dbl>, previous_cancellations <dbl>,
#   previous_bookings_not_canceled <dbl>, ...

#Identify potential research questions that your dataset can help answer.

1. Which countries' customers are most often repeat guests? 2. How many customers are checking in through the corporate distribution channel?