# Load the hotel data twice: as a base data.frame (df.df) and as a
# data.table (dt.dt).
library(data.table)

# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# strings to factors by default in R 4.0.0; the factor-based steps further
# down (levels(), the factor columns shown by str()) need factors.
df.df <- read.csv(
  "../../Mradul Verma/Downloads/USAHotelsTransformedData.csv",
  stringsAsFactors = TRUE
)
# NOTE(review): this reads the same file name from the working directory
# rather than from the Downloads path used above -- confirm both locations
# hold the same data.
dt.dt <- fread("USAHotelsTransformedData.csv", stringsAsFactors = TRUE)
# Size of the data set: number of rows, then number of columns.
c(nrow(df.df), ncol(df.df))
## [1] 15432 21
# Column names, in file order.
names(df.df)
## [1] "CityName" "Population" "IsTourist"
## [4] "Day" "Date" "IsWeekend"
## [7] "HotelName" "Available" "RackRate"
## [10] "RentUSD" "Discount" "StarRating"
## [13] "GuestRating" "HotelAddress" "HotelPincode"
## [16] "HotelDescription" "FreeWifi" "FreeBreakfast"
## [19] "HotelCapacity" "HasSwimmingPool" "HasDiscount"
# Put the columns of df.df on the search path so they can be referred to by
# bare name.
attach(df.df)
# Compact per-column overview: type and first few values.
str(df.df)
## 'data.frame': 15432 obs. of 21 variables:
## $ CityName : Factor w/ 34 levels "Albuquerque",..: 33 33 33 33 33 33 33 33 33 33 ...
## $ Population : int 377165 377165 377165 377165 377165 377165 377165 377165 377165 377165 ...
## $ IsTourist : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Day : Factor w/ 7 levels "Friday","Monday",..: 7 5 1 3 4 2 6 7 5 1 ...
## $ Date : Factor w/ 10 levels "Dec 1 2017","Dec 2 2017",..: 9 10 1 2 3 4 5 6 7 8 ...
## $ IsWeekend : int 0 0 1 1 0 0 0 0 0 1 ...
## $ HotelName : Factor w/ 1642 levels "1840s Carrollton Inn, Baltimore",..: 683 683 683 683 683 683 683 683 683 683 ...
## $ Available : int 1 1 1 1 1 1 1 1 1 1 ...
## $ RackRate : num 244 244 244 244 244 244 244 244 244 244 ...
## $ RentUSD : num 119 119 153 244 119 102 139 119 123 157 ...
## $ Discount : num 51.2 51.2 37.3 0 51.2 ...
## $ StarRating : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ GuestRating : num 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 ...
## $ HotelAddress : Factor w/ 2398 levels "0570 W 151st Street, Olathe, KS",..: 2241 2241 2241 2241 2241 2241 2241 2241 2241 2241 ...
## $ HotelPincode : int 33610 33610 33610 33610 33610 33610 33610 33610 33610 33610 ...
## $ HotelDescription: Factor w/ 1429 levels " Motel with outdoor pool, near Heavenly Ski Resort",..: 831 831 831 831 831 831 831 831 831 831 ...
## $ FreeWifi : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FreeBreakfast : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HotelCapacity : int 76 76 76 76 76 76 76 76 76 76 ...
## $ HasSwimmingPool : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HasDiscount : int 1 1 1 0 1 1 1 1 1 1 ...
# One-way frequency table of the tourist-destination flag.
with(df.df, table(IsTourist))
## IsTourist
## No Yes
## 9417 6015
# Two-way table: tourist-destination flag (rows) by free breakfast (columns).
with(df.df, table(IsTourist, FreeBreakfast))
## FreeBreakfast
## IsTourist 0 1
## No 3907 5510
## Yes 4194 1821
# The same two-way table with row and column totals appended.
with(df.df, addmargins(table(IsTourist, FreeBreakfast)))
## FreeBreakfast
## IsTourist 0 1 Sum
## No 3907 5510 9417
## Yes 4194 1821 6015
## Sum 8101 7331 15432
# Share of each IsTourist level, kept as proportions and printed as percentages.
myprop <- with(df.df, prop.table(table(IsTourist)))
round(100 * myprop, digits = 2)
## IsTourist
## No Yes
## 61.02 38.98
# Mean, variance and standard deviation of the rent column.
with(df.df, mean(RentUSD))
## [1] 145.0503
with(df.df, var(RentUSD))
## [1] 16323.62
with(df.df, sd(RentUSD))
## [1] 127.7639
# Smallest and largest rent, stored and then printed to two decimals.
minrent <- with(df.df, min(RentUSD))
maxrent <- with(df.df, max(RentUSD))
round(minrent, digits = 2)
## [1] 24
round(maxrent, digits = 2)
## [1] 2995
library(psych)
# Descriptive statistics for every column of df.df, keeping only the listed
# summary columns (selected by name rather than by position).
describe(df.df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min
## CityName* 1 15432 17.50 9.66 17.00 1
## Population 2 15432 30142834.42 154188929.37 595047.00 21717
## IsTourist* 3 15432 1.39 0.49 1.00 1
## Day* 4 15432 4.12 2.16 5.00 1
## Date* 5 15432 5.54 2.87 6.00 1
## IsWeekend 6 15432 0.29 0.45 0.00 0
## HotelName* 7 15432 821.72 474.28 824.00 1
## Available 8 15432 1.00 0.00 1.00 1
## RackRate 9 15432 190.53 185.89 149.00 30
## RentUSD 10 15432 145.05 127.76 113.00 24
## Discount 11 15432 18.95 17.69 15.08 0
## StarRating 12 15432 3.03 0.80 3.00 1
## GuestRating 13 15432 5.15 2.03 4.30 1
## HotelAddress* 14 15432 1223.00 708.43 1235.00 1
## HotelPincode 15 15432 59814.54 30245.39 64161.00 2108
## HotelDescription* 16 15432 722.54 411.23 740.00 1
## FreeWifi 17 15432 0.93 0.25 1.00 0
## FreeBreakfast 18 15432 0.48 0.50 0.00 0
## HotelCapacity 19 15432 179.87 265.79 120.00 10
## HasSwimmingPool 20 15432 0.59 0.49 1.00 0
## HasDiscount 21 15432 0.75 0.43 1.00 0
## max
## CityName* 3.4000e+01
## Population 8.4060e+08
## IsTourist* 2.0000e+00
## Day* 7.0000e+00
## Date* 1.0000e+01
## IsWeekend 1.0000e+00
## HotelName* 1.6420e+03
## Available 1.0000e+00
## RackRate 2.9950e+03
## RentUSD 2.9950e+03
## Discount 9.2710e+01
## StarRating 5.0000e+00
## GuestRating 1.0000e+01
## HotelAddress* 2.3980e+03
## HotelPincode 9.8198e+04
## HotelDescription* 1.4290e+03
## FreeWifi 1.0000e+00
## FreeBreakfast 1.0000e+00
## HotelCapacity 4.0280e+03
## HasSwimmingPool 1.0000e+00
## HasDiscount 1.0000e+00
# Level labels of the IsTourist factor, in their internal order.
levels(df.df$IsTourist)
## [1] "No" "Yes"
The factor levels are coded numerically: “No” = 1 and “Yes” = 2. If both levels were equally frequent, the mean of the coded IsTourist column would be 1.5. The observed mean is 1.39 < 1.5, so there are more 1’s than 2’s; that is, more observations come from locations that are not tourist destinations (“No”) than from tourist destinations (“Yes”).
# Mean rent within each IsTourist level.
with(df.df, aggregate(RentUSD, by = list(IsTourist), FUN = mean))
## Group.1 x
## 1 No 119.1239
## 2 Yes 185.6404
# Histogram of the rent column.
with(
  df.df,
  hist(RentUSD, col = "blue", xlab = "Rent(in USD)", ylab = "Frequency")
)
# Row count, mean rent and rent standard deviation for every
# IsTourist x FreeBreakfast combination ...
tab1 <- dt.dt[
  ,
  .(N = .N, mean = mean(RentUSD), sd = sd(RentUSD)),
  by = .(IsTourist, FreeBreakfast)
]
# ... with the rows then sorted by IsTourist.
tab1 <- tab1[order(IsTourist)]
print(tab1)
## IsTourist FreeBreakfast N mean sd
## 1: No 1 5510 112.7479 71.87267
## 2: No 0 3907 128.1158 75.84106
## 3: Yes 0 4194 210.8255 200.04628
## 4: Yes 1 1821 127.6359 64.62385
# Bar chart of the percentage of rows at each IsTourist level; barplot()
# returns the bar midpoints, which are used to place the value labels.
bp <- barplot(
  height = round(100 * myprop, digits = 2),
  main = "% Percentage of hotels are at tourist destinations",
  xlab = "IsTourist",
  ylab = "Percentage(%)",
  col = c("blue", "red"),
  legend.text = TRUE
)
# Write each percentage just above the baseline of its bar.
text(x = bp, y = 0, labels = round(100 * myprop, digits = 2), pos = 3, cex = 1)
# Percentage of all rows in each FreeBreakfast x IsTourist cell
# (rows = FreeBreakfast, columns = IsTourist).
counts <- round(with(df.df, prop.table(table(FreeBreakfast, IsTourist))) * 100, 2)

# With beside = TRUE, barplot() makes one group of bars per COLUMN of the
# matrix and builds the legend from its ROW names. Plotting `counts` directly
# therefore grouped the bars by IsTourist and listed FreeBreakfast in the
# legend, the opposite of what the axis label and legend title say. Plotting
# the transpose puts FreeBreakfast on the x axis and IsTourist in the legend,
# so the labels match the picture. `counts` itself is left unchanged.
bp2 <- barplot(
  t(counts),
  beside = TRUE,
  col = c("blue", "pink"),
  xlab = "FreeBreakfast",
  ylab = "Percentage",
  main = "Bar Chart for IsTourist by FreeBreakfast",
  legend.text = TRUE,
  args.legend = list(title = "IsTourist")
)
# Value labels at the base of each bar; t(counts) is in the same order as the
# bar midpoints returned in bp2.
text(bp2, 0, t(counts), cex = 1, pos = 3)
# Horizontal boxplot of the rent column across all rows.
with(
  df.df,
  boxplot(
    RentUSD,
    horizontal = TRUE,
    col = "lightblue",
    main = "Boxplot for Rent (in USD)",
    xlab = "Rent (in USD)"
  )
)
# Rent boxplots side by side, one per IsTourist level.
boxplot(
  RentUSD ~ IsTourist,
  data = df.df,
  col = c("pink", "lightblue"),
  main = "Box Plot of Rent in USD grouped by IsTourist",
  ylab = "Rent (in USD)"
)
# Scatter plot of rent against rack rate.
with(
  df.df,
  plot(
    x = RackRate,
    y = RentUSD,
    main = "Scatter Plot",
    xlab = "Rack Rate",
    ylab = "Rent (in USD)"
  )
)
# Pearson correlation between the two columns ...
with(df.df, cor(RackRate, RentUSD, method = "pearson"))
## [1] 0.8681018
# ... and the corresponding significance test.
with(df.df, cor.test(RackRate, RentUSD, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: RackRate and RentUSD
## t = 217.24, df = 15430, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8641602 0.8719369
## sample estimates:
## cor
## 0.8681018
# Spearman rank correlation test between rack rate and rent.
# exact = FALSE asks for the asymptotic p-value up front. The data contain
# ties, so an exact p-value cannot be computed; leaving `exact` at its default
# only produced the "Cannot compute exact p-value with ties" warning before
# falling back to the same asymptotic result.
with(df.df, cor.test(RackRate, RentUSD, method = "spearman", exact = FALSE))
##
## Spearman's rank correlation rho
##
## data: RackRate and RentUSD
## S = 6.2659e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8977018
# Numeric columns used for the correlation analysis. They are selected by
# name rather than by position so the selection still picks the intended
# variables if the column order of the input file changes.
mydata <- df.df[
  ,
  c("Population", "RackRate", "RentUSD", "StarRating", "HotelCapacity")
]
# Pairwise Pearson correlations, printed to three decimals.
corres <- cor(mydata)
round(corres, 3)
## Population RackRate RentUSD StarRating HotelCapacity
## Population 1.000 0.291 0.336 0.063 0.041
## RackRate 0.291 1.000 0.868 0.556 0.169
## RentUSD 0.336 0.868 1.000 0.580 0.171
## StarRating 0.063 0.556 0.580 1.000 0.396
## HotelCapacity 0.041 0.169 0.171 0.396 1.000
# Correlations together with sample size and p-values (psych::corr.test).
# The call is left exactly as written because it is echoed in the printed
# "Call:" line.
corr.test(mydata, use = "complete")
## Call:corr.test(x = mydata, use = "complete")
## Correlation matrix
## Population RackRate RentUSD StarRating HotelCapacity
## Population 1.00 0.29 0.34 0.06 0.04
## RackRate 0.29 1.00 0.87 0.56 0.17
## RentUSD 0.34 0.87 1.00 0.58 0.17
## StarRating 0.06 0.56 0.58 1.00 0.40
## HotelCapacity 0.04 0.17 0.17 0.40 1.00
## Sample Size
## [1] 15432
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## Population RackRate RentUSD StarRating HotelCapacity
## Population 0 0 0 0 0
## RackRate 0 0 0 0 0
## RentUSD 0 0 0 0 0
## StarRating 0 0 0 0 0
## HotelCapacity 0 0 0 0 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.1
## corrplot 0.84 loaded
# Circle-style plot of the correlation matrix computed earlier
# (corres is cor(mydata)).
corrplot(corr = corres, method = "circle")