Use the hotel prices data set (Cities42.csv).
1. Analysis of hotel price data for hotels in 42 cities (summaries, plots, etc.).
2. Visualization of Data
3. Some t-tests and chi-square tests on the data
4. Correlation between the dependent and independent variables
5. Find out which columns / features impact the price of a hotel room
6. Predict the hotel prices with some dummy values.
The data was collected from www.hotels.in in October 2016.
Size: 2523KB 13232 observations of 19 variables:
Attributes:
Notice that the dataset tracks hotel prices on 8 different dates at different hotels across different cities. Please browse the dataset.
Dependent Variable
RoomRent <- Rent for the cheapest room, double occupancy, in Indian Rupees.
Independent Variables
External Factors
Date <- We have hotel room rent data for the following 8 dates for each hotel: {Dec 31, Dec 25, Dec 24, Dec 18, Dec 21, Dec 28, Jan 4, Jan 8}
IsWeekend <- We use ‘0’ to indicate week days, ‘1’ to indicate weekend dates (Sat / Sun)
IsNewYearEve <- ‘1’ for Dec 31, ‘0’ otherwise
CityName <- Name of the City where the Hotel is located e.g. Mumbai
Population <- Population of the City in 2011
CityRank <- Rank order of City by Population (e.g. Mumbai = 0, Delhi = 1, and so on)
IsMetroCity <- ‘1’ if CityName is {Mumbai, Delhi, Kolkatta, Chennai}, ‘0’ otherwise
IsTouristDestination <- We use ‘1’ if the city is primarily a tourist destination, ‘0’ otherwise.
Internal Factors
Many hotel features can influence the RoomRent. The dataset captures some of these internal factors, as explained below.
HotelName <- e.g. Park Hyatt Goa Resort and Spa
StarRating <- e.g. 5
Airport <- Distance between Hotel and closest major Airport
HotelAddress <- e.g. Arrossim Beach, Cansaulim, Goa
HotelPincode <- 403712
HotelDescription <- e.g. 5-star beachfront resort with spa, near Arossim Beach
FreeWifi <- ‘1’ if the hotel offers Free Wifi, ‘0’ otherwise
FreeBreakfast <- ‘1’ if the hotel offers Free Breakfast, ‘0’ otherwise
HotelCapacity <- e.g. 242. (enter ‘0’ if not available)
HasSwimmingPool <- ‘1’ if they have a swimming pool, ‘0’ otherwise
Setup
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(vcd)
## Loading required package: grid
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(corrplot)
Functions
# Count the values of `inp` that lie outside the Tukey fences
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
#
# inp   : numeric vector to inspect.
# na.rm : forwarded to quantile() and IQR(); when TRUE (the default) missing
#         values are ignored while the fences are computed.
#
# Returns a single integer: the number of values below the lower fence or
# above the upper fence. Missing values in `inp` are never counted.
detect_outliers <- function(inp, na.rm = TRUE) {
  i.qnt <- quantile(inp, probs = c(.25, .75), na.rm = na.rm)
  i.max <- 1.5 * IQR(inp, na.rm = na.rm)
  is_outlier <- inp < (i.qnt[1] - i.max) | inp > (i.qnt[2] + i.max)
  # Comparisons against NA yield NA; drop them so that values that were
  # already missing in the input are not reported as outliers.
  sum(is_outlier, na.rm = TRUE)
}
# Blank out the outliers of `x`.
#
# x     : numeric vector.
# na.rm : forwarded to quantile() and IQR().
# ...   : further arguments forwarded to quantile() only.
#
# Returns a copy of `x` in which every value below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR has been replaced by NA; all other elements are unchanged.
Non_outliers <- function(x, na.rm = TRUE, ...) {
  quartiles <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
  fence <- 1.5 * IQR(x, na.rm = na.rm)
  lower <- quartiles[1] - fence
  upper <- quartiles[2] + fence

  cleaned <- x
  # which() skips the NA comparisons produced by missing inputs.
  cleaned[which(x < lower | x > upper)] <- NA
  cleaned
}
# Replace the outliers of `z` with NA.
#
# z     : numeric vector.
# na.rm : forwarded to Non_outliers() (and from there to quantile() / IQR()).
#         Previously this argument was accepted but silently ignored.
#
# Returns an unnamed vector of the same length as `z` in which every value
# outside the 1.5 * IQR fences is NA. This is the same result the earlier
# data.frame + match() round trip produced, obtained directly.
Remove_Outliers <- function(z, na.rm = TRUE) {
  # unname() keeps the historical behaviour of returning a plain vector.
  unname(Non_outliers(z, na.rm = na.rm))
}
# Draw a single box plot of `input`, titled "Outliers".
#
# input : vector of values to plot.
# na.rm : forwarded to geom_boxplot(); when TRUE (the default) missing values
#         are dropped silently instead of with a warning.
#
# Returns the ggplot object (it is drawn when printed).
Graph_Boxplot <- function(input, na.rm = TRUE) {
  # Build the plotting data from the argument itself so the function no longer
  # depends on a global `dfrModel` having the same number of rows as `input`.
  # The column keeps the name `input` so axis / legend labels are unchanged.
  dfrPlot <- data.frame(input = input)
  ggplot(dfrPlot, aes(x = "", y = input)) +
    geom_boxplot(aes(fill = input), color = "green", na.rm = na.rm) +
    labs(title = "Outliers")
}
Dataset
# NOTE(review): setwd() ties the script to one machine's directory layout.
# It is kept because later steps may rely on the working directory; consider
# running the script from the data directory (or a project root) instead.
setwd("D:/Welingkar/My/IL/Project/Hotel Industry/Data")
# Load the hotel price records. TRUE / FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
dfrModel <- read.csv("./Cities42.csv", header = TRUE, stringsAsFactors = FALSE)
# Number of records read, and a preview of the first rows.
intRowCount <- nrow(dfrModel)
head(dfrModel)
## CityName Population CityRank IsMetroCity IsTouristDestination IsWeekend
## 1 Mumbai 12442373 0 1 1 1
## 2 Mumbai 12442373 0 1 1 0
## 3 Mumbai 12442373 0 1 1 1
## 4 Mumbai 12442373 0 1 1 1
## 5 Mumbai 12442373 0 1 1 0
## 6 Mumbai 12442373 0 1 1 1
## IsNewYearEve Date HotelName RoomRent StarRating Airport
## 1 0 Dec 18 2016 Vivanta by Taj 12375 5 21
## 2 0 Dec 21 2016 Vivanta by Taj 10250 5 21
## 3 0 Dec 24 2016 Vivanta by Taj 9900 5 21
## 4 0 Dec 25 2016 Vivanta by Taj 10350 5 21
## 5 0 Dec 28 2016 Vivanta by Taj 12000 5 21
## 6 1 Dec 31 2016 Vivanta by Taj 11475 5 21
## HotelAddress HotelPincode
## 1 90 Cuffe Parade, Colaba, Mumbai, Maharashtra 400005
## 2 91 Cuffe Parade, Colaba, Mumbai, Maharashtra 400006
## 3 92 Cuffe Parade, Colaba, Mumbai, Maharashtra 400007
## 4 93 Cuffe Parade, Colaba, Mumbai, Maharashtra 400008
## 5 94 Cuffe Parade, Colaba, Mumbai, Maharashtra 400009
## 6 95 Cuffe Parade, Colaba, Mumbai, Maharashtra 400010
## HotelDescription FreeWifi FreeBreakfast
## 1 Luxury hotel with spa, near Gateway of India 1 0
## 2 Luxury hotel with spa, near Gateway of India 1 0
## 3 Luxury hotel with spa, near Gateway of India 1 0
## 4 Luxury hotel with spa, near Gateway of India 1 0
## 5 Luxury hotel with spa, near Gateway of India 1 0
## 6 Luxury hotel with spa, near Gateway of India 1 0
## HotelCapacity HasSwimmingPool
## 1 287 1
## 2 287 1
## 3 287 1
## 4 287 1
## 5 287 1
## 6 287 1
Observation 1. There are a total of 13232 data records (intRowCount) in the file.
Because the dataset also contains non-numeric columns, we remove them before the analysis.
Data_Cleaning
# Drop the name, date, address, pincode and description columns; only the
# remaining columns are used in the analysis that follows.
dfrModel <- dfrModel %>%
  select(
    -CityName,
    -Date,
    -HotelName,
    -HotelAddress,
    -HotelDescription,
    -HotelPincode
  )
Summary
# Descriptive statistics (n, mean, sd, median, min, max) for the continuous
# columns. The statistics are picked by name from the describe() result.
#describe(dfrModel$CityName)
describe(dfrModel$Population)[, c("n", "mean", "sd", "median", "min", "max")]
## n mean sd median min max
## X1 13232 4416837 4258386 3046163 8096 12442373
#describe(dfrModel$CityRank)[,c(2,3,4,5,8,9)]
#describe(dfrModel$IsMetroCity)[,c(2,3,4,5,8,9)]
#describe(dfrModel$IsTouristDestination)[,c(2,3,4,5,8,9)]
#describe(dfrModel$IsWeekend)[,c(2,3,4,5,8,9)]
#describe(dfrModel$IsNewYearEve)[,c(2,3,4,5,8,9)]
#describe(dfrModel$Date)[,c(2,3,4,5,8,9)]
#describe(dfrModel$HotelName)[,c(2,3,4,5,8,9)]
describe(dfrModel$RoomRent)[, c("n", "mean", "sd", "median", "min", "max")]
## n mean sd median min max
## X1 13232 5473.99 7333.12 4000 299 322500
describe(dfrModel$StarRating)[, c("n", "mean", "sd", "median", "min", "max")]
## n mean sd median min max
## X1 13232 3.46 0.76 3 0 5
describe(dfrModel$Airport)[, c("n", "mean", "sd", "median", "min", "max")]
## n mean sd median min max
## X1 13232 21.16 22.76 15 0.2 124
#describe(dfrModel$HotelAddress)[,c(2,3,4,5,8,9)]
#describe(dfrModel$HotelPincode)[,c(2,3,4,5,8,9)]
#describe(dfrModel$HotelDescription)[,c(2,3,4,5,8,9)]
#describe(dfrModel$FreeWifi)[,c(2,3,4,5,8,9)]
#describe(dfrModel$FreeBreakfast)[,c(2,3,4,5,8,9)]
describe(dfrModel$HotelCapacity)[, c("n", "mean", "sd", "median", "min", "max")]
## n mean sd median min max
## X1 13232 62.51 76.66 34 0 600
#describe(dfrModel$HasSwimmingPool)[,c(2,3,4,5,8,9)]
Observations
Dependent Variable is
Y = Hotel Rent
Independent Variable is
X1 = Star Rating
X2 = IsTouristDestination
X3 = Airport Distance
X4 = Hotel Capacity
Box Plot
# Box plots for the continuous predictors; the calls for the remaining
# columns are left commented out.
#Graph_Boxplot(dfrModel$CityName)
#Graph_Boxplot(dfrModel$Population)
#Graph_Boxplot(dfrModel$CityRank)
#Graph_Boxplot(dfrModel$IsMetroCity)
#Graph_Boxplot(dfrModel$IsTouristDestination)
#Graph_Boxplot(dfrModel$IsWeekend)
#Graph_Boxplot(dfrModel$IsNewYearEve)
#Graph_Boxplot(dfrModel$Date)
#Graph_Boxplot(dfrModel$HotelName)
#Graph_Boxplot(dfrModel$RoomRent)
Graph_Boxplot(dfrModel[["StarRating"]])
Graph_Boxplot(dfrModel[["Airport"]])
#Graph_Boxplot(dfrModel$HotelAddress)
#Graph_Boxplot(dfrModel$HotelPincode)
#Graph_Boxplot(dfrModel$HotelDescription)
#Graph_Boxplot(dfrModel$FreeWifi)
#Graph_Boxplot(dfrModel$FreeBreakfast)
Graph_Boxplot(dfrModel[["HotelCapacity"]])
#Graph_Boxplot(dfrModel$HasSwimmingPool)
Observation
There are a few outliers in the dataset.
Tables
# Number of records per value of IsTouristDestination.
TouristDestination <- table(dfrModel[["IsTouristDestination"]])
print(TouristDestination)
##
## 0 1
## 4007 9225
# The same counts expressed as a share of all records.
prop.table(TouristDestination)
##
## 0 1
## 0.3028265 0.6971735
Observations
Here
1 Implies Tourist Destination
0 Implies not a tourist destination
Scatter Plot
# Scatter plots of room rent against each candidate predictor.

# Room rent vs. distance to the closest airport.
plot(y = dfrModel$RoomRent, x = dfrModel$Airport,
     col = "green",
     ylim = c(0, 350000), xlim = c(0, 150),
     main = "Relationship Btw Room Rent and Airport Distance",
     ylab = "Hotel Rent", xlab = "Airport Distance")
scatterplot(dfrModel$Airport, dfrModel$RoomRent,
            main = "Relationship Btw Room Rent and Airport Distance",
            xlab = "Airport Distance", ylab = "Hotel Rent")

# Room rent vs. the tourist-destination flag.
plot((dfrModel$IsTouristDestination), jitter(dfrModel$RoomRent),
     col = "green",
     ylim = c(0, 350000), xlim = c(0, 5),
     main = "Relationship Btw Room Rent and Tourist Destination",
     ylab = "Hotel Rent", xlab = "Tourist Destination")

# Room rent vs. star rating.
plot(y = dfrModel$RoomRent, x = dfrModel$StarRating,
     col = "blue",
     ylim = c(0, 350000), xlim = c(0, 10),
     main = "Relationship Btw Room Rent and Star Rating of Hotel",
     ylab = "Hotel Rent", xlab = "Star Rating")

# Room rent vs. hotel capacity. The x-axis limit is taken from the data: a
# fixed upper limit of 150 hid every hotel with a larger capacity (the
# summary statistics report capacities up to 600).
plot(y = dfrModel$RoomRent, x = dfrModel$HotelCapacity,
     col = "green",
     ylim = c(0, 350000),
     xlim = c(0, max(dfrModel$HotelCapacity, na.rm = TRUE)),
     main = "Relationship Btw Room Rent and Hotel Capacity",
     ylab = "Hotel Rent", xlab = "Hotel Capacity")
scatterplot(dfrModel$HotelCapacity, dfrModel$RoomRent,
            main = "Relationship Btw Room Rent and Hotel Capacity",
            xlab = "Hotel Capacity", ylab = "Hotel Rent")
Observations
1. The scatter plots above show some relationship between hotel rent and the independent variables.
Correlation Plot
# Correlation plot of room rent and the four candidate predictors; the
# columns are selected by name.
#pairs(dfrModel)
corrplot(
  corr = cor(
    dfrModel[, c("IsTouristDestination", "RoomRent", "StarRating",
                 "Airport", "HotelCapacity")],
    use = "complete.obs"
  ),
  method = "ellipse"
)
Correlation Matrix
# Pairwise correlations between room rent and the four candidate predictors;
# the columns are selected by name.
cor(dfrModel[, c("IsTouristDestination", "RoomRent", "StarRating",
                 "Airport", "HotelCapacity")])
## IsTouristDestination RoomRent StarRating
## IsTouristDestination 1.00000000 0.12250296 -0.04055500
## RoomRent 0.12250296 1.00000000 0.36937343
## StarRating -0.04055500 0.36937343 1.00000000
## Airport 0.19442205 0.04965324 -0.06091918
## HotelCapacity -0.09435609 0.15787331 0.63743034
## Airport HotelCapacity
## IsTouristDestination 0.19442205 -0.09435609
## RoomRent 0.04965324 0.15787331
## StarRating -0.06091918 0.63743034
## Airport 1.00000000 -0.11767207
## HotelCapacity -0.11767207 1.00000000
Independent Variable is
X1 = Star Rating
X2 = IsTouristDestination
X3 = Airport Distance
X4 Can also be Hotel Capacity