Submission Session 4

Task 1a

# Load the hotel data from the working directory, i.e. the same relative path
# that fread() uses for Task 1b, instead of a user-specific directory.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 (where the
# default became FALSE); str() and levels() further down expect factors.
df.df <- read.csv("USAHotelsTransformedData.csv", stringsAsFactors = TRUE)

Task 1b

library(data.table)
# Same CSV read as a data.table, with text columns converted to factors.
dt.dt <- fread(
  input = "USAHotelsTransformedData.csv",
  stringsAsFactors = TRUE
)

Task 2 - Dimension of Dataframe df

# Number of rows followed by number of columns.
c(nrow(df.df), ncol(df.df))

## [1] 15432    21

Task 3 - List the column names of the dataframe “df”

# Column names of the data frame.
names(df.df)

##  [1] "CityName"         "Population"       "IsTourist"       
##  [4] "Day"              "Date"             "IsWeekend"       
##  [7] "HotelName"        "Available"        "RackRate"        
## [10] "RentUSD"          "Discount"         "StarRating"      
## [13] "GuestRating"      "HotelAddress"     "HotelPincode"    
## [16] "HotelDescription" "FreeWifi"         "FreeBreakfast"   
## [19] "HotelCapacity"    "HasSwimmingPool"  "HasDiscount"

Task 4 - attach the dataframe “df”

# Put the columns of df.df on the search path; later chunks refer to them by
# bare name (e.g. IsTourist, RentUSD, RackRate).
attach(df.df)

Task 5 - list the data structures of the columns in the dataframe “df”

# Compact overview of each column's type and first values.
utils::str(df.df)

## 'data.frame':    15432 obs. of  21 variables:
##  $ CityName        : Factor w/ 34 levels "Albuquerque",..: 33 33 33 33 33 33 33 33 33 33 ...
##  $ Population      : int  377165 377165 377165 377165 377165 377165 377165 377165 377165 377165 ...
##  $ IsTourist       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Day             : Factor w/ 7 levels "Friday","Monday",..: 7 5 1 3 4 2 6 7 5 1 ...
##  $ Date            : Factor w/ 10 levels "Dec 1 2017","Dec 2 2017",..: 9 10 1 2 3 4 5 6 7 8 ...
##  $ IsWeekend       : int  0 0 1 1 0 0 0 0 0 1 ...
##  $ HotelName       : Factor w/ 1642 levels "1840s Carrollton Inn, Baltimore",..: 683 683 683 683 683 683 683 683 683 683 ...
##  $ Available       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ RackRate        : num  244 244 244 244 244 244 244 244 244 244 ...
##  $ RentUSD         : num  119 119 153 244 119 102 139 119 123 157 ...
##  $ Discount        : num  51.2 51.2 37.3 0 51.2 ...
##  $ StarRating      : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ GuestRating     : num  4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 ...
##  $ HotelAddress    : Factor w/ 2398 levels "0570 W 151st Street, Olathe, KS",..: 2241 2241 2241 2241 2241 2241 2241 2241 2241 2241 ...
##  $ HotelPincode    : int  33610 33610 33610 33610 33610 33610 33610 33610 33610 33610 ...
##  $ HotelDescription: Factor w/ 1429 levels " Motel with outdoor pool, near Heavenly Ski Resort",..: 831 831 831 831 831 831 831 831 831 831 ...
##  $ FreeWifi        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FreeBreakfast   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HotelCapacity   : int  76 76 76 76 76 76 76 76 76 76 ...
##  $ HasSwimmingPool : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HasDiscount     : int  1 1 1 0 1 1 1 1 1 1 ...

Task 6 - count how many rows of data are for hotels located at tourist destinations

# Row counts per IsTourist level, read straight from df.df.
with(df.df, table(IsTourist))

## IsTourist
##   No  Yes 
## 9417 6015

Task 7 - count how many rows of data are for hotels located at tourist / non-tourist locations and whether or not the hotels offer Free Breakfast bundled with room rent

# Two-way counts: IsTourist in rows, FreeBreakfast in columns.
with(df.df, table(IsTourist, FreeBreakfast))

##          FreeBreakfast
## IsTourist    0    1
##       No  3907 5510
##       Yes 4194 1821

Task 8 - add row and column totals to the table from Task 7

# Same two-way table with row and column totals appended.
addmargins(
  with(df.df, table(IsTourist, FreeBreakfast))
)

##          FreeBreakfast
## IsTourist     0     1   Sum
##       No   3907  5510  9417
##       Yes  4194  1821  6015
##       Sum  8101  7331 15432

Task 9 - calculate the percentage of rows of data for hotels located at tourist / non-tourist destinations, rounded to 2 decimal places

# Share of rows per IsTourist level, shown as a percentage with 2 decimals.
myprop <- prop.table(with(df.df, table(IsTourist)))
round(myprop * 100, digits = 2)

## IsTourist
##    No   Yes 
## 61.02 38.98

Task 10 - get Mean, Standard Deviation and Variance Of the variable RentUSD

# Average rent over all rows.
mean(df.df$RentUSD)

## [1] 145.0503

# Sample variance of rent.
var(df.df$RentUSD)

## [1] 16323.62

# Sample standard deviation of rent.
sd(df.df$RentUSD)

## [1] 127.7639

Task 11 - calculate the Minimum and Maximum of the variable RentUSD, rounding it to 2 decimal places

# range() returns c(min, max); keep both values under their original names.
rent_range <- range(df.df$RentUSD)
minrent <- rent_range[1]
maxrent <- rent_range[2]
round(minrent, digits = 2)

## [1] 24

# Maximum rent (computed above as maxrent), rounded to 2 decimals.
round(maxrent, digits = 2)

## [1] 2995

Task 12a - print the following Descriptive Statistics

library(psych)
# Keep only the requested statistics, selected by name rather than position.
stat_cols <- c("vars", "n", "mean", "sd", "median", "min", "max")
describe(df.df)[, stat_cols]

##                   vars     n        mean           sd    median   min
## CityName*            1 15432       17.50         9.66     17.00     1
## Population           2 15432 30142834.42 154188929.37 595047.00 21717
## IsTourist*           3 15432        1.39         0.49      1.00     1
## Day*                 4 15432        4.12         2.16      5.00     1
## Date*                5 15432        5.54         2.87      6.00     1
## IsWeekend            6 15432        0.29         0.45      0.00     0
## HotelName*           7 15432      821.72       474.28    824.00     1
## Available            8 15432        1.00         0.00      1.00     1
## RackRate             9 15432      190.53       185.89    149.00    30
## RentUSD             10 15432      145.05       127.76    113.00    24
## Discount            11 15432       18.95        17.69     15.08     0
## StarRating          12 15432        3.03         0.80      3.00     1
## GuestRating         13 15432        5.15         2.03      4.30     1
## HotelAddress*       14 15432     1223.00       708.43   1235.00     1
## HotelPincode        15 15432    59814.54     30245.39  64161.00  2108
## HotelDescription*   16 15432      722.54       411.23    740.00     1
## FreeWifi            17 15432        0.93         0.25      1.00     0
## FreeBreakfast       18 15432        0.48         0.50      0.00     0
## HotelCapacity       19 15432      179.87       265.79    120.00    10
## HasSwimmingPool     20 15432        0.59         0.49      1.00     0
## HasDiscount         21 15432        0.75         0.43      1.00     0
##                          max
## CityName*         3.4000e+01
## Population        8.4060e+08
## IsTourist*        2.0000e+00
## Day*              7.0000e+00
## Date*             1.0000e+01
## IsWeekend         1.0000e+00
## HotelName*        1.6420e+03
## Available         1.0000e+00
## RackRate          2.9950e+03
## RentUSD           2.9950e+03
## Discount          9.2710e+01
## StarRating        5.0000e+00
## GuestRating       1.0000e+01
## HotelAddress*     2.3980e+03
## HotelPincode      9.8198e+04
## HotelDescription* 1.4290e+03
## FreeWifi          1.0000e+00
## FreeBreakfast     1.0000e+00
## HotelCapacity     4.0280e+03
## HasSwimmingPool   1.0000e+00
## HasDiscount       1.0000e+00

Task 12b - Interpret 1.39 mean of IsTourist

# Factor levels in coding order: the first level is coded 1, the second 2.
levels(df.df$IsTourist)

## [1] "No"  "Yes"

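The factor levels are coded No = 1 and Yes = 2, so the mean of the codes equals 1 plus the proportion of “Yes” rows. If both levels were equally frequent, the mean would be 1.5. Since 1.39 < 1.5, there are more 1’s than 2’s: about 39% of the rows are “Yes”, which agrees with the 38.98% found in Task 9. Therefore there are more rows for “Not a tourist destination” locations than for “Tourist Destinations”.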

Task 13a - measure the average rent i.e. RentUSD , for data rows corresponding to hotels located at tourist destinations and non-tourist destinations

# Mean rent for each IsTourist level.
aggregate(
  x = df.df$RentUSD,
  by = list(df.df$IsTourist),
  FUN = mean
)

##   Group.1        x
## 1      No 119.1239
## 2     Yes 185.6404

Task 13b - create a Histogram of RentUSD

# Histogram of rent; the title is spelled out to match hist()'s default
# "Histogram of <variable name>".
hist(
  df.df$RentUSD,
  main = "Histogram of RentUSD",
  xlab = "Rent(in USD)",
  ylab = "Frequency",
  col = "blue"
)

Task 14 - For all the rows of data, write R code to get a breakdown of the mean and standard deviation of the hotel rent (i.e RentUSD), with respect to variables IsTourist & FreeBreakfast, as shown in the following output

# Count, mean and sd of rent for every IsTourist x FreeBreakfast combination.
tab1 <- dt.dt[
  ,
  .(N = .N, mean = mean(RentUSD), sd = sd(RentUSD)),
  by = .(IsTourist, FreeBreakfast)
]
# Sort by IsTourist only; rows within a level keep their order of appearance.
tab1 <- tab1[order(IsTourist)]
tab1

##    IsTourist FreeBreakfast    N     mean        sd
## 1:        No             1 5510 112.7479  71.87267
## 2:        No             0 3907 128.1158  75.84106
## 3:       Yes             0 4194 210.8255 200.04628
## 4:       Yes             1 1821 127.6359  64.62385

Task 15 - R code to create the following Bar-Chart

# Bar chart of the IsTourist percentages (myprop comes from Task 9).
pct_tourist <- round(myprop * 100, 2)
bp <- barplot(
  pct_tourist,
  xlab = "IsTourist",
  ylab = "Percentage(%)",
  col = c("blue", "red"),
  main = "% Percentage of hotels are at tourist destinations",
  legend.text = TRUE
)
# barplot() returns the bar midpoints; print each value just above the axis.
text(bp, 0, pct_tourist, cex = 1, pos = 3)

Task 16 - create the following Grouped Bar-Chart

# Percentage of all rows in each IsTourist x FreeBreakfast cell.
# IsTourist must be the row variable: with beside = TRUE, barplot() draws one
# group per table column (x-axis) and one bar per table row (legend). Rows =
# IsTourist and columns = FreeBreakfast therefore match the x-axis label and
# the legend title used below.
counts <- round(
  prop.table(with(df.df, table(IsTourist, FreeBreakfast))) * 100,
  2
)
bp2 <- barplot(
  counts,
  col = c("blue", "pink"),
  xlab = "FreeBreakfast",
  ylab = "Percentage",
  main = "Bar Chart for IsTourist by FreeBreakfast",
  beside = TRUE,
  legend.text = TRUE,
  args.legend = list(title = "IsTourist")
)
# bp2 holds the bar midpoints in the same layout as counts, so the labels
# line up with their bars.
text(bp2, 0, counts, cex = 1, pos = 3)

Task 17 - create a Box-Plot for hotel rents

# Horizontal box plot of rent over all rows.
boxplot(
  df.df$RentUSD,
  main = "Boxplot for Rent (in USD)",
  xlab = "Rent (in USD)",
  col = "lightblue",
  horizontal = TRUE
)

Task 18 - boxplots for hotel rent by IsTourist

# One box per IsTourist level; the formula variables are looked up in df.df.
boxplot(
  RentUSD ~ IsTourist,
  data = df.df,
  main = "Box Plot of Rent in USD grouped by IsTourist",
  ylab = "Rent (in USD)",
  col = c("pink", "lightblue")
)

Task 19 - scatter plot between Room rent & Rack Rate

# Scatter plot with rack rate on the x-axis and rent on the y-axis.
plot(
  x = df.df$RackRate,
  y = df.df$RentUSD,
  main = "Scatter Plot",
  xlab = "Rack Rate",
  ylab = "Rent (in USD)"
)

Task 20a - Pearson correlation coefficient between the Variable RentUSD & RackRate using cor()

# Pearson correlation between rack rate and rent.
cor(df.df$RackRate, df.df$RentUSD, method = "pearson")

## [1] 0.8681018

Task 20b - Pearson correlation coefficient & significance value between Variable RentUSD & RackRate

# Pearson correlation with its significance test; with() keeps the bare
# variable names that appear in the printed "data:" line.
with(df.df, cor.test(RackRate, RentUSD, method = "pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  RackRate and RentUSD
## t = 217.24, df = 15430, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8641602 0.8719369
## sample estimates:
##       cor 
## 0.8681018

Task 21 - Spearman Rank correlation coefficient & significance value between Variable RentUSD & RackRate

# Spearman rank correlation with its significance test. The data contain
# ties, so R warns that an exact p-value cannot be computed.
with(df.df, cor.test(RackRate, RentUSD, method = "spearman"))

## Warning in cor.test.default(RackRate, RentUSD, method = "spearman"): Cannot
## compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  RackRate and RentUSD
## S = 6.2659e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8977018

Task 22 - draw correlation matrix for all the continuous variables in the dataframe df

# Select the numeric columns by name instead of by position.
continuous_cols <- c(
  "Population", "RackRate", "RentUSD", "StarRating", "HotelCapacity"
)
mydata <- df.df[, continuous_cols]
# Pairwise Pearson correlations, rounded for display.
corres <- cor(mydata)
round(corres, digits = 3)

##               Population RackRate RentUSD StarRating HotelCapacity
## Population         1.000    0.291   0.336      0.063         0.041
## RackRate           0.291    1.000   0.868      0.556         0.169
## RentUSD            0.336    0.868   1.000      0.580         0.171
## StarRating         0.063    0.556   0.580      1.000         0.396
## HotelCapacity      0.041    0.169   0.171      0.396         1.000

Task 23 - correlation matrix with significance value for all the continuous variables in the dataframe df

# Correlation matrix together with p-values (psych::corr.test).
corr.test(x = mydata, use = "complete")

## Call:corr.test(x = mydata, use = "complete")
## Correlation matrix 
##               Population RackRate RentUSD StarRating HotelCapacity
## Population          1.00     0.29    0.34       0.06          0.04
## RackRate            0.29     1.00    0.87       0.56          0.17
## RentUSD             0.34     0.87    1.00       0.58          0.17
## StarRating          0.06     0.56    0.58       1.00          0.40
## HotelCapacity       0.04     0.17    0.17       0.40          1.00
## Sample Size 
## [1] 15432
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##               Population RackRate RentUSD StarRating HotelCapacity
## Population             0        0       0          0             0
## RackRate               0        0       0          0             0
## RentUSD                0        0       0          0             0
## StarRating             0        0       0          0             0
## HotelCapacity          0        0       0          0             0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

Task 24 - draw a correlogram to display a correlation matrix for all the continuous variables in df dataframe

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.6.1

## corrplot 0.84 loaded

# Correlogram of the same correlation matrix, drawn with circles.
corrplot(corr = cor(mydata), method = "circle")