library(psych)
library(corrplot)
## corrplot 0.84 loaded
# Load the hotel listings. The file path is built explicitly instead of
# calling setwd(), so the session's working directory is left untouched.
# Any other relative-path reads or writes should be built from data_dir too.
data_dir <- "d:/IIML/Term 5/DAM"

# stringsAsFactors = TRUE pins text columns to factors, matching the Factor
# columns shown in the recorded str() output; R >= 4.0 would otherwise read
# them as character vectors.
df <- read.csv(
  file.path(data_dir, "USAHotelsTransformedData.csv"),
  stringsAsFactors = TRUE
)
# Number of rows and columns in the loaded data.
c(nrow(df), ncol(df))
## [1] 15432 21
# Column names of the data frame.
names(df)
## [1] "CityName" "Population" "IsTourist"
## [4] "Day" "Date" "IsWeekend"
## [7] "HotelName" "Available" "RackRate"
## [10] "RentUSD" "Discount" "StarRating"
## [13] "GuestRating" "HotelAddress" "HotelPincode"
## [16] "HotelDescription" "FreeWifi" "FreeBreakfast"
## [19] "HotelCapacity" "HasSwimmingPool" "HasDiscount"
# attach(df) was removed: every visible statement in this script already
# qualifies columns with `df$`, and attaching only places a separate copy of
# the columns on the search path, where it can mask other objects.

# Column types and a preview of the first values of each column.
str(df)
## 'data.frame': 15432 obs. of 21 variables:
## $ CityName : Factor w/ 34 levels "Albuquerque",..: 33 33 33 33 33 33 33 33 33 33 ...
## $ Population : int 377165 377165 377165 377165 377165 377165 377165 377165 377165 377165 ...
## $ IsTourist : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Day : Factor w/ 7 levels "Friday","Monday",..: 7 5 1 3 4 2 6 7 5 1 ...
## $ Date : Factor w/ 10 levels "Dec 1 2017","Dec 2 2017",..: 9 10 1 2 3 4 5 6 7 8 ...
## $ IsWeekend : int 0 0 1 1 0 0 0 0 0 1 ...
## $ HotelName : Factor w/ 1642 levels "1840s Carrollton Inn, Baltimore",..: 683 683 683 683 683 683 683 683 683 683 ...
## $ Available : int 1 1 1 1 1 1 1 1 1 1 ...
## $ RackRate : num 244 244 244 244 244 244 244 244 244 244 ...
## $ RentUSD : num 119 119 153 244 119 102 139 119 123 157 ...
## $ Discount : num 51.2 51.2 37.3 0 51.2 ...
## $ StarRating : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ GuestRating : num 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 ...
## $ HotelAddress : Factor w/ 2398 levels "0570 W 151st Street, Olathe, KS",..: 2241 2241 2241 2241 2241 2241 2241 2241 2241 2241 ...
## $ HotelPincode : int 33610 33610 33610 33610 33610 33610 33610 33610 33610 33610 ...
## $ HotelDescription: Factor w/ 1429 levels " Motel with outdoor pool, near Heavenly Ski Resort",..: 831 831 831 831 831 831 831 831 831 831 ...
## $ FreeWifi : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FreeBreakfast : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HotelCapacity : int 76 76 76 76 76 76 76 76 76 76 ...
## $ HasSwimmingPool : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HasDiscount : int 1 1 1 0 1 1 1 1 1 1 ...
# Row counts per IsTourist level.
table(df[["IsTourist"]])
##
## No Yes
## 9417 6015
# Cross-tabulation of IsTourist (rows) against FreeBreakfast (columns).
table(df[["IsTourist"]], df[["FreeBreakfast"]])
##
## 0 1
## No 3907 5510
## Yes 4194 1821
# Same cross-tabulation with row and column totals appended.
addmargins(
  table(df[["IsTourist"]], df[["FreeBreakfast"]]),
  margin = c(1, 2)
)
##
## 0 1 Sum
## No 3907 5510 9417
## Yes 4194 1821 6015
## Sum 8101 7331 15432
# Share of rows per IsTourist level, as a percentage rounded to 2 decimals.
round(100 * table(df[["IsTourist"]]) / nrow(df), digits = 2)
##
## No Yes
## 61.02 38.98
# Mean, standard deviation and variance of RentUSD.
mean(df[["RentUSD"]])
## [1] 145.0503
sd(df[["RentUSD"]])
## [1] 127.7639
var(df[["RentUSD"]])
## [1] 16323.62
# Largest and smallest RentUSD, rounded to 2 decimals.
round(max(df[["RentUSD"]]), digits = 2)
## [1] 2995
round(min(df[["RentUSD"]]), digits = 2)
## [1] 24
# psych summary of every column, keeping only the core statistics. The
# columns are picked by name; these are the same ones the positional
# selection c(1:5, 8:9) returned (see the recorded output header).
describe(df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min
## CityName* 1 15432 17.50 9.66 17.00 1
## Population 2 15432 30142834.42 154188929.37 595047.00 21717
## IsTourist* 3 15432 1.39 0.49 1.00 1
## Day* 4 15432 4.12 2.16 5.00 1
## Date* 5 15432 5.54 2.87 6.00 1
## IsWeekend 6 15432 0.29 0.45 0.00 0
## HotelName* 7 15432 821.72 474.28 824.00 1
## Available 8 15432 1.00 0.00 1.00 1
## RackRate 9 15432 190.53 185.89 149.00 30
## RentUSD 10 15432 145.05 127.76 113.00 24
## Discount 11 15432 18.95 17.69 15.08 0
## StarRating 12 15432 3.03 0.80 3.00 1
## GuestRating 13 15432 5.15 2.03 4.30 1
## HotelAddress* 14 15432 1223.00 708.43 1235.00 1
## HotelPincode 15 15432 59814.54 30245.39 64161.00 2108
## HotelDescription* 16 15432 722.54 411.23 740.00 1
## FreeWifi 17 15432 0.93 0.25 1.00 0
## FreeBreakfast 18 15432 0.48 0.50 0.00 0
## HotelCapacity 19 15432 179.87 265.79 120.00 10
## HasSwimmingPool 20 15432 0.59 0.49 1.00 0
## HasDiscount 21 15432 0.75 0.43 1.00 0
## max
## CityName* 3.4000e+01
## Population 8.4060e+08
## IsTourist* 2.0000e+00
## Day* 7.0000e+00
## Date* 1.0000e+01
## IsWeekend 1.0000e+00
## HotelName* 1.6420e+03
## Available 1.0000e+00
## RackRate 2.9950e+03
## RentUSD 2.9950e+03
## Discount 9.2710e+01
## StarRating 5.0000e+00
## GuestRating 1.0000e+01
## HotelAddress* 2.3980e+03
## HotelPincode 9.8198e+04
## HotelDescription* 1.4290e+03
## FreeWifi 1.0000e+00
## FreeBreakfast 1.0000e+00
## HotelCapacity 4.0280e+03
## HasSwimmingPool 1.0000e+00
## HasDiscount 1.0000e+00
# Mean RentUSD per IsTourist level.
aggregate(df[["RentUSD"]], by = list(df[["IsTourist"]]), FUN = mean)
## Group.1 x
## 1 No 119.1239
## 2 Yes 185.6404
# Histogram of RentUSD. The argument is kept as `df$RentUSD` because hist()
# builds its default title and axis label from the argument expression.
hist(x = df$RentUSD, col = "skyblue")

# Mean RentUSD per combination of IsTourist (Group.1) and
# FreeBreakfast (Group.2).
aggregate(
  df[["RentUSD"]],
  by = list(df[["IsTourist"]], df[["FreeBreakfast"]]),
  FUN = mean
)
## Group.1 Group.2 x
## 1 No 0 128.1158
## 2 Yes 0 210.8255
## 3 No 1 112.7479
## 4 Yes 1 127.6359
# Bar chart of the percentage of rows per IsTourist level.
barplot(
  round(table(df$IsTourist) * 100 / nrow(df), 2),
  col = c("skyblue", "red")
)

# Horizontal box plot of RentUSD. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned, which would silently change the
# plot orientation.
boxplot(df$RentUSD, horizontal = TRUE, col = "skyblue")

# Box plots of RentUSD split by IsTourist level.
boxplot(df$RentUSD ~ df$IsTourist, col = c("grey", "skyblue"))

# Pearson correlation between RentUSD and RackRate.
cor(df$RentUSD, df$RackRate, method = "pearson")
## [1] 0.8681018
# Spearman rank correlation test between RackRate and RentUSD.
# exact = FALSE requests the asymptotic p-value directly: an exact p-value
# cannot be computed when the data contain ties, and asking for one (the
# default) only produced a "Cannot compute exact p-value with ties" warning
# before falling back to the same asymptotic result.
cor.test(df$RackRate, df$RentUSD, method = "spearman", exact = FALSE)
##
## Spearman's rank correlation rho
##
## data: df$RackRate and df$RentUSD
## S = 6.2659e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8977018
# Correlation matrix (cor() default method) of Population, RackRate, RentUSD,
# StarRating and HotelCapacity. Columns are passed as `df$...` expressions so
# the resulting row/column labels keep their `df.` prefix.
cor(
  x = data.frame(
    df$Population, df$RackRate, df$RentUSD, df$StarRating, df$HotelCapacity
  )
)
## df.Population df.RackRate df.RentUSD df.StarRating
## df.Population 1.00000000 0.2911148 0.3359792 0.06258655
## df.RackRate 0.29111480 1.0000000 0.8681018 0.55628073
## df.RentUSD 0.33597918 0.8681018 1.0000000 0.57996120
## df.StarRating 0.06258655 0.5562807 0.5799612 1.00000000
## df.HotelCapacity 0.04069457 0.1691987 0.1711987 0.39625192
## df.HotelCapacity
## df.Population 0.04069457
## df.RackRate 0.16919866
## df.RentUSD 0.17119872
## df.StarRating 0.39625192
## df.HotelCapacity 1.00000000
# psych::corr.test on the same five columns: prints the correlation matrix,
# the sample size and the p-values (adjusted above the diagonal).
corr.test(
  x = data.frame(
    df$Population, df$RackRate, df$RentUSD, df$StarRating, df$HotelCapacity
  )
)
## Call:corr.test(x = data.frame(df$Population, df$RackRate, df$RentUSD,
## df$StarRating, df$HotelCapacity))
## Correlation matrix
## df.Population df.RackRate df.RentUSD df.StarRating
## df.Population 1.00 0.29 0.34 0.06
## df.RackRate 0.29 1.00 0.87 0.56
## df.RentUSD 0.34 0.87 1.00 0.58
## df.StarRating 0.06 0.56 0.58 1.00
## df.HotelCapacity 0.04 0.17 0.17 0.40
## df.HotelCapacity
## df.Population 0.04
## df.RackRate 0.17
## df.RentUSD 0.17
## df.StarRating 0.40
## df.HotelCapacity 1.00
## Sample Size
## [1] 15432
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## df.Population df.RackRate df.RentUSD df.StarRating
## df.Population 0 0 0 0
## df.RackRate 0 0 0 0
## df.RentUSD 0 0 0 0
## df.StarRating 0 0 0 0
## df.HotelCapacity 0 0 0 0
## df.HotelCapacity
## df.Population 0
## df.RackRate 0
## df.RentUSD 0
## df.StarRating 0
## df.HotelCapacity 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
# Visualise the correlation matrix of the five numeric columns with corrplot.
corrplot(
  corr = cor(
    data.frame(
      df$Population, df$RackRate, df$RentUSD, df$StarRating, df$HotelCapacity
    )
  )
)
