library(psych)
library(corrplot)
## corrplot 0.84 loaded
# Load the hotel listings data and inspect its shape, column names and types.
# NOTE(review): setwd() pins the script to one machine's directory layout; it
# is kept so any later relative paths keep resolving - confirm, then prefer a
# project-relative path.
setwd("d:/IIML/Term 5/DAM/")
# stringsAsFactors = TRUE is explicit because the recorded str() output below
# shows text columns as factors; since R 4.0.0 read.csv() would otherwise
# return them as character vectors.
df <- read.csv("USAHotelsTransformedData.csv", stringsAsFactors = TRUE)
dim(df)
## [1] 15432    21
colnames(df)
##  [1] "CityName"         "Population"       "IsTourist"       
##  [4] "Day"              "Date"             "IsWeekend"       
##  [7] "HotelName"        "Available"        "RackRate"        
## [10] "RentUSD"          "Discount"         "StarRating"      
## [13] "GuestRating"      "HotelAddress"     "HotelPincode"    
## [16] "HotelDescription" "FreeWifi"         "FreeBreakfast"   
## [19] "HotelCapacity"    "HasSwimmingPool"  "HasDiscount"
# NOTE(review): every visible statement accesses columns as df$<name>, so this
# attach() looks unnecessary - confirm nothing else relies on bare column
# names, then remove it.
attach(df)
str(df)
## 'data.frame':    15432 obs. of  21 variables:
##  $ CityName        : Factor w/ 34 levels "Albuquerque",..: 33 33 33 33 33 33 33 33 33 33 ...
##  $ Population      : int  377165 377165 377165 377165 377165 377165 377165 377165 377165 377165 ...
##  $ IsTourist       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Day             : Factor w/ 7 levels "Friday","Monday",..: 7 5 1 3 4 2 6 7 5 1 ...
##  $ Date            : Factor w/ 10 levels "Dec 1 2017","Dec 2 2017",..: 9 10 1 2 3 4 5 6 7 8 ...
##  $ IsWeekend       : int  0 0 1 1 0 0 0 0 0 1 ...
##  $ HotelName       : Factor w/ 1642 levels "1840s Carrollton Inn, Baltimore",..: 683 683 683 683 683 683 683 683 683 683 ...
##  $ Available       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ RackRate        : num  244 244 244 244 244 244 244 244 244 244 ...
##  $ RentUSD         : num  119 119 153 244 119 102 139 119 123 157 ...
##  $ Discount        : num  51.2 51.2 37.3 0 51.2 ...
##  $ StarRating      : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ GuestRating     : num  4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 ...
##  $ HotelAddress    : Factor w/ 2398 levels "0570 W 151st Street, Olathe, KS",..: 2241 2241 2241 2241 2241 2241 2241 2241 2241 2241 ...
##  $ HotelPincode    : int  33610 33610 33610 33610 33610 33610 33610 33610 33610 33610 ...
##  $ HotelDescription: Factor w/ 1429 levels " Motel with outdoor pool, near Heavenly Ski Resort",..: 831 831 831 831 831 831 831 831 831 831 ...
##  $ FreeWifi        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FreeBreakfast   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HotelCapacity   : int  76 76 76 76 76 76 76 76 76 76 ...
##  $ HasSwimmingPool : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HasDiscount     : int  1 1 1 0 1 1 1 1 1 1 ...
# Count the rows that fall in each IsTourist level.
tourist_counts <- table(df[["IsTourist"]])
tourist_counts
## 
##   No  Yes 
## 9417 6015
# Cross-tabulate IsTourist (rows) against FreeBreakfast (columns).
tourist_by_breakfast <- table(df[["IsTourist"]], df[["FreeBreakfast"]])
tourist_by_breakfast
##      
##          0    1
##   No  3907 5510
##   Yes 4194 1821
# The same cross-tabulation with row and column totals appended.
addmargins(tourist_by_breakfast, margin = 1:2)
##      
##           0     1   Sum
##   No   3907  5510  9417
##   Yes  4194  1821  6015
##   Sum  8101  7331 15432
# Each IsTourist level as a percentage of all rows, to two decimals.
round(100 * tourist_counts / nrow(df), 2)
## 
##    No   Yes 
## 61.02 38.98
# Extract the rent column once, then report its basic summary statistics.
rent_usd <- df[["RentUSD"]]
# Centre and spread.
mean(rent_usd)
## [1] 145.0503
sd(rent_usd)
## [1] 127.7639
var(rent_usd)
## [1] 16323.62
# Extremes, rounded to two decimals.
round(max(rent_usd), 2)
## [1] 2995
round(min(rent_usd), 2)
## [1] 24
# Per-column descriptive statistics from psych::describe(), trimmed to the
# columns of interest. They are selected by name; these are the columns at
# positions 1-5 and 8-9 of describe()'s default output.
# Rows flagged with "*" in the output are the factor columns of df.
summary_cols <- c("vars", "n", "mean", "sd", "median", "min", "max")
describe(df)[, summary_cols]
##                   vars     n        mean           sd    median   min
## CityName*            1 15432       17.50         9.66     17.00     1
## Population           2 15432 30142834.42 154188929.37 595047.00 21717
## IsTourist*           3 15432        1.39         0.49      1.00     1
## Day*                 4 15432        4.12         2.16      5.00     1
## Date*                5 15432        5.54         2.87      6.00     1
## IsWeekend            6 15432        0.29         0.45      0.00     0
## HotelName*           7 15432      821.72       474.28    824.00     1
## Available            8 15432        1.00         0.00      1.00     1
## RackRate             9 15432      190.53       185.89    149.00    30
## RentUSD             10 15432      145.05       127.76    113.00    24
## Discount            11 15432       18.95        17.69     15.08     0
## StarRating          12 15432        3.03         0.80      3.00     1
## GuestRating         13 15432        5.15         2.03      4.30     1
## HotelAddress*       14 15432     1223.00       708.43   1235.00     1
## HotelPincode        15 15432    59814.54     30245.39  64161.00  2108
## HotelDescription*   16 15432      722.54       411.23    740.00     1
## FreeWifi            17 15432        0.93         0.25      1.00     0
## FreeBreakfast       18 15432        0.48         0.50      0.00     0
## HotelCapacity       19 15432      179.87       265.79    120.00    10
## HasSwimmingPool     20 15432        0.59         0.49      1.00     0
## HasDiscount         21 15432        0.75         0.43      1.00     0
##                          max
## CityName*         3.4000e+01
## Population        8.4060e+08
## IsTourist*        2.0000e+00
## Day*              7.0000e+00
## Date*             1.0000e+01
## IsWeekend         1.0000e+00
## HotelName*        1.6420e+03
## Available         1.0000e+00
## RackRate          2.9950e+03
## RentUSD           2.9950e+03
## Discount          9.2710e+01
## StarRating        5.0000e+00
## GuestRating       1.0000e+01
## HotelAddress*     2.3980e+03
## HotelPincode      9.8198e+04
## HotelDescription* 1.4290e+03
## FreeWifi          1.0000e+00
## FreeBreakfast     1.0000e+00
## HotelCapacity     4.0280e+03
## HasSwimmingPool   1.0000e+00
## HasDiscount       1.0000e+00
# Mean RentUSD within each IsTourist level.
rent_usd <- df[["RentUSD"]]
is_tourist <- df[["IsTourist"]]
aggregate(x = rent_usd, by = list(is_tourist), FUN = mean)
##   Group.1        x
## 1      No 119.1239
## 2     Yes 185.6404
# Histogram of RentUSD. The argument is written as df$RentUSD because hist()
# derives the plot title and x-axis label from that expression.
hist(df$RentUSD, col = "skyblue")

# Mean RentUSD for every IsTourist (Group.1) x FreeBreakfast (Group.2) pair.
aggregate(
  x = rent_usd,
  by = list(is_tourist, df[["FreeBreakfast"]]),
  FUN = mean
)
##   Group.1 Group.2        x
## 1      No       0 128.1158
## 2     Yes       0 210.8255
## 3      No       1 112.7479
## 4     Yes       1 127.6359
# Bar chart of the percentage of rows in each IsTourist level.
barplot(
  round(table(df$IsTourist) * 100 / nrow(df), 2),
  col = c("skyblue", "red")
)

# Horizontal box plot of RentUSD. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
boxplot(df$RentUSD, horizontal = TRUE, col = "skyblue")

# Box plot of RentUSD split by IsTourist. Supplying data = df keeps the formula
# free of df$ so the axis labels are the plain column names.
boxplot(RentUSD ~ IsTourist, data = df, col = c("grey", "skyblue"))

# Pearson correlation between the charged rent and the rack rate.
cor(df$RentUSD, df$RackRate, method = "pearson")
## [1] 0.8681018
# Spearman rank correlation test for the same pair. Both columns contain tied
# values, so an exact p-value cannot be computed; exact = FALSE requests the
# asymptotic p-value directly instead of triggering the "Cannot compute exact
# p-value with ties" warning on every run. The reported statistics are the same.
cor.test(df$RackRate, df$RentUSD, method = "spearman", exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  df$RackRate and df$RentUSD
## S = 6.2659e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8977018
# Pairwise correlations among the numeric variables examined alongside rent.
# The column subset is built once, by name, so the three analyses below cannot
# drift apart and the labels read as plain column names instead of "df.<name>".
rent_driver_cols <- c(
  "Population", "RackRate", "RentUSD", "StarRating", "HotelCapacity"
)
rent_drivers <- df[, rent_driver_cols]
# Correlation matrix (cor() default method), computed once and reused for the
# plot at the end of this section.
rent_cor <- cor(rent_drivers)
rent_cor
##               Population  RackRate   RentUSD StarRating HotelCapacity
## Population    1.00000000 0.2911148 0.3359792 0.06258655    0.04069457
## RackRate      0.29111480 1.0000000 0.8681018 0.55628073    0.16919866
## RentUSD       0.33597918 0.8681018 1.0000000 0.57996120    0.17119872
## StarRating    0.06258655 0.5562807 0.5799612 1.00000000    0.39625192
## HotelCapacity 0.04069457 0.1691987 0.1711987 0.39625192    1.00000000
# psych::corr.test() adds the sample size and p-values for each pair.
corr.test(rent_drivers)
## Call:corr.test(x = rent_drivers)
## Correlation matrix 
##               Population RackRate RentUSD StarRating HotelCapacity
## Population          1.00     0.29    0.34       0.06          0.04
## RackRate            0.29     1.00    0.87       0.56          0.17
## RentUSD             0.34     0.87    1.00       0.58          0.17
## StarRating          0.06     0.56    0.58       1.00          0.40
## HotelCapacity       0.04     0.17    0.17       0.40          1.00
## Sample Size 
## [1] 15432
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##               Population RackRate RentUSD StarRating HotelCapacity
## Population             0        0       0          0             0
## RackRate               0        0       0          0             0
## RentUSD                0        0       0          0             0
## StarRating             0        0       0          0             0
## HotelCapacity          0        0       0          0             0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# Visualise the correlation matrix computed above.
corrplot(rent_cor)