#Q.1(a) Write R code to read the data into a dataframe called “df”

# Read the hotel data set into a base data.frame.
# stringsAsFactors = TRUE is spelled out because its default changed to FALSE
# in R 4.0.0; the str() and describe() results recorded further down (Factor
# columns, starred factor rows) come from factor-typed text columns.
df <- read.csv("USAHotelsTransformedData.csv", stringsAsFactors = TRUE)
# Put the columns of df on the search path under their bare names.
attach(df)

#Q.1b Also write R code to read the data into a data table called “dt”

library(data.table)
# Load the same CSV a second time, now as a data.table via fread().
dt <- fread(input = "USAHotelsTransformedData.csv")
# Expose the columns of dt on the search path as well.
attach(what = dt)

## The following objects are masked from df:
## 
##     Available, CityName, Date, Day, Discount, FreeBreakfast,
##     FreeWifi, GuestRating, HasDiscount, HasSwimmingPool,
##     HotelAddress, HotelCapacity, HotelDescription, HotelName,
##     HotelPincode, IsTourist, IsWeekend, Population, RackRate,
##     RentUSD, StarRating

#Q.2 Write R code to get the dimensions of the dataframe “df”

# Number of rows and number of columns of df, in that order.
c(nrow(df), ncol(df))

## [1] 15432    21

#Q.3 Write R code to list the column names of the dataframe “df”

# Column names of df.
names(df)

##  [1] "CityName"         "Population"       "IsTourist"       
##  [4] "Day"              "Date"             "IsWeekend"       
##  [7] "HotelName"        "Available"        "RackRate"        
## [10] "RentUSD"          "Discount"         "StarRating"      
## [13] "GuestRating"      "HotelAddress"     "HotelPincode"    
## [16] "HotelDescription" "FreeWifi"         "FreeBreakfast"   
## [19] "HotelCapacity"    "HasSwimmingPool"  "HasDiscount"

#Q.4 Write R code to attach the dataframe “df”

# Attach df so that its columns can be referred to by bare name.
attach(what = df)

## The following objects are masked from dt:
## 
##     Available, CityName, Date, Day, Discount, FreeBreakfast,
##     FreeWifi, GuestRating, HasDiscount, HasSwimmingPool,
##     HotelAddress, HotelCapacity, HotelDescription, HotelName,
##     HotelPincode, IsTourist, IsWeekend, Population, RackRate,
##     RentUSD, StarRating

## The following objects are masked from df (pos = 5):
## 
##     Available, CityName, Date, Day, Discount, FreeBreakfast,
##     FreeWifi, GuestRating, HasDiscount, HasSwimmingPool,
##     HotelAddress, HotelCapacity, HotelDescription, HotelName,
##     HotelPincode, IsTourist, IsWeekend, Population, RackRate,
##     RentUSD, StarRating

#Q.5 Write R code to list the data structures of the columns in the dataframe “df”

# Compact listing of every column of df with its type and first values.
str(object = df)

## 'data.frame':    15432 obs. of  21 variables:
##  $ CityName        : Factor w/ 34 levels "Albuquerque",..: 33 33 33 33 33 33 33 33 33 33 ...
##  $ Population      : int  377165 377165 377165 377165 377165 377165 377165 377165 377165 377165 ...
##  $ IsTourist       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Day             : Factor w/ 7 levels "Friday","Monday",..: 7 5 1 3 4 2 6 7 5 1 ...
##  $ Date            : Factor w/ 10 levels "Dec 1 2017","Dec 2 2017",..: 9 10 1 2 3 4 5 6 7 8 ...
##  $ IsWeekend       : int  0 0 1 1 0 0 0 0 0 1 ...
##  $ HotelName       : Factor w/ 1642 levels "1840s Carrollton Inn, Baltimore",..: 683 683 683 683 683 683 683 683 683 683 ...
##  $ Available       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ RackRate        : num  244 244 244 244 244 244 244 244 244 244 ...
##  $ RentUSD         : num  119 119 153 244 119 102 139 119 123 157 ...
##  $ Discount        : num  51.2 51.2 37.3 0 51.2 ...
##  $ StarRating      : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ GuestRating     : num  4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 ...
##  $ HotelAddress    : Factor w/ 2398 levels "0570 W 151st Street, Olathe, KS",..: 2241 2241 2241 2241 2241 2241 2241 2241 2241 2241 ...
##  $ HotelPincode    : int  33610 33610 33610 33610 33610 33610 33610 33610 33610 33610 ...
##  $ HotelDescription: Factor w/ 1429 levels " Motel with outdoor pool, near Heavenly Ski Resort",..: 831 831 831 831 831 831 831 831 831 831 ...
##  $ FreeWifi        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ FreeBreakfast   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HotelCapacity   : int  76 76 76 76 76 76 76 76 76 76 ...
##  $ HasSwimmingPool : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HasDiscount     : int  1 1 1 0 1 1 1 1 1 1 ...

#Q.6 Write R code to count how many rows of data are for hotels located at tourist destinations

# Row counts per IsTourist level, computed with data.table grouping.
library(data.table)
# From here on df is a data.table rather than a plain data.frame.
df <- data.table(df)
# .N is the group size; data.table names the resulting column "N".
tab1 <- df[, .N, by = IsTourist]
tab1

##    IsTourist    N
## 1:        No 9417
## 2:       Yes 6015

#Q.7 Write R code to count how many rows of data are for hotels located at tourist / non-tourist locations and whether or not the hotels offer Free Breakfast bundled with room rent

# Row counts for every (IsTourist, FreeBreakfast) combination.
library(data.table)
df <- data.table(df)
# .N is the group size; data.table names the resulting column "N".
tab1 <- df[, .N, by = c("IsTourist", "FreeBreakfast")]
tab1

##    IsTourist FreeBreakfast    N
## 1:        No             1 5510
## 2:        No             0 3907
## 3:       Yes             0 4194
## 4:       Yes             1 1821

#Q.8 Write R code to give the following output

# Cross-tabulate IsTourist (rows) against FreeBreakfast (columns).
# The columns are taken from df through with() rather than from whichever copy
# attach() has left on the search path; with() keeps the bare names, so the
# dimension labels of the table are unchanged.
tab1 <- with(df, table(IsTourist, FreeBreakfast))
# Append a "Sum" row and a "Sum" column (margins 1 and 2).
addmargins(tab1, c(1, 2))

##          FreeBreakfast
## IsTourist     0     1   Sum
##       No   3907  5510  9417
##       Yes  4194  1821  6015
##       Sum  8101  7331 15432

#Q9. Write R code to calculate the percentage of rows of data for hotels located at tourist / non-tourist destinations, rounded to 2 decimal places

# Frequency of each IsTourist level, read from df explicitly instead of from
# the attach()'d search path; with() keeps the "IsTourist" label on the table.
tab1 <- with(df, table(IsTourist))
# Convert the counts to shares of the total.
protable <- prop.table(tab1)
# Express the shares as percentages rounded to 2 decimal places.
# The misspelled name PercentPropotion is kept so existing references still work.
PercentPropotion <- round(protable * 100, 2)
PercentPropotion

## IsTourist
##    No   Yes 
## 61.02 38.98

#Q 10 Write R code to get Mean, Standard Deviation and Variance Of the variable RentUSD

# Arithmetic mean of RentUSD.
mean(df[["RentUSD"]])

## [1] 145.0503

# Sample standard deviation of RentUSD.
sd(df[["RentUSD"]])

## [1] 127.7639

# Sample variance of RentUSD.
var(df[["RentUSD"]])

## [1] 16323.62

#Q 11 Write R code to calculate the Minimum and Maximum of the variable RentUSD, rounding it to 2 decimal places

# Smallest RentUSD value, rounded to 2 decimal places.
round(min(df[["RentUSD"]]), digits = 2)

## [1] 24

# Largest RentUSD value, rounded to 2 decimal places.
round(max(df[["RentUSD"]]), digits = 2)

## [1] 2995

#Q 12a Write R code to print Descriptive Statistics:

library(psych)
# Descriptive statistics for every column of df, keeping only the
# vars, n, mean, sd, median, min and max columns of the summary.
describe(df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]

##                   vars     n        mean           sd    median   min
## CityName*            1 15432       17.50         9.66     17.00     1
## Population           2 15432 30142834.42 154188929.37 595047.00 21717
## IsTourist*           3 15432        1.39         0.49      1.00     1
## Day*                 4 15432        4.12         2.16      5.00     1
## Date*                5 15432        5.54         2.87      6.00     1
## IsWeekend            6 15432        0.29         0.45      0.00     0
## HotelName*           7 15432      821.72       474.28    824.00     1
## Available            8 15432        1.00         0.00      1.00     1
## RackRate             9 15432      190.53       185.89    149.00    30
## RentUSD             10 15432      145.05       127.76    113.00    24
## Discount            11 15432       18.95        17.69     15.08     0
## StarRating          12 15432        3.03         0.80      3.00     1
## GuestRating         13 15432        5.15         2.03      4.30     1
## HotelAddress*       14 15432     1223.00       708.43   1235.00     1
## HotelPincode        15 15432    59814.54     30245.39  64161.00  2108
## HotelDescription*   16 15432      722.54       411.23    740.00     1
## FreeWifi            17 15432        0.93         0.25      1.00     0
## FreeBreakfast       18 15432        0.48         0.50      0.00     0
## HotelCapacity       19 15432      179.87       265.79    120.00    10
## HasSwimmingPool     20 15432        0.59         0.49      1.00     0
## HasDiscount         21 15432        0.75         0.43      1.00     0
##                          max
## CityName*         3.4000e+01
## Population        8.4060e+08
## IsTourist*        2.0000e+00
## Day*              7.0000e+00
## Date*             1.0000e+01
## IsWeekend         1.0000e+00
## HotelName*        1.6420e+03
## Available         1.0000e+00
## RackRate          2.9950e+03
## RentUSD           2.9950e+03
## Discount          9.2710e+01
## StarRating        5.0000e+00
## GuestRating       1.0000e+01
## HotelAddress*     2.3980e+03
## HotelPincode      9.8198e+04
## HotelDescription* 1.4290e+03
## FreeWifi          1.0000e+00
## FreeBreakfast     1.0000e+00
## HotelCapacity     4.0280e+03
## HasSwimmingPool   1.0000e+00
## HasDiscount       1.0000e+00

#Q 12b In the above output, Interpret the meaning of the 1.39 written as the mean of the IsTourist column

#R stores IsTourist as a factor whose levels are sorted alphabetically, so "No" is coded as 1 and "Yes" as 2, and describe() takes the mean of those codes. A mean of 1.39 therefore means that 0.39 (about 39%) of the rows are for hotels located at tourist destinations.

#Q 13a Write R code to measure the average rent i.e. RentUSD , for data rows corresponding to hotels located at tourist destinations and non-tourist destinations

# Mean RentUSD for each IsTourist level ...
tab3 <- dt[, .(RentUSD = mean(RentUSD)), by = IsTourist]
# ... with the rows sorted by IsTourist.
tab3 <- tab3[order(IsTourist)]
tab3

##    IsTourist  RentUSD
## 1:        No 119.1239
## 2:       Yes 185.6404

#Q 13b Write R code to create a Histogram of RentUSD

# Histogram of the RentUSD column, bars filled in cyan.
hist(
  df$RentUSD,
  col = "cyan",
  xlab = "Rent(in USD)",
  main = "Histogram of Rent in USD"
)

#Q 14 Write R code to get a breakdown of the mean and standard deviation of the hotel rent (i.e RentUSD), with respect to variables IsTourist & FreeBreakfast

# Per (IsTourist, FreeBreakfast) group: row count plus mean and standard
# deviation of RentUSD. The result columns are named MeanBalance / SDBalance
# even though they summarise RentUSD.
tab2 <- dt[,
  .(N = .N, MeanBalance = mean(RentUSD), SDBalance = sd(RentUSD)),
  by = c("IsTourist", "FreeBreakfast")
]
# Sort the groups by IsTourist.
tab2 <- tab2[order(IsTourist)]
tab2

##    IsTourist FreeBreakfast    N MeanBalance SDBalance
## 1:        No             1 5510    112.7479  71.87267
## 2:        No             0 3907    128.1158  75.84106
## 3:       Yes             0 4194    210.8255 200.04628
## 4:       Yes             1 1821    127.6359  64.62385

#Q15: Is Tourist Bar Plot

# Bar plot of the percentage of rows per IsTourist level.
Tourist_Subset <- table(df$IsTourist)
protable_tourist <- prop.table(Tourist_Subset)
percent_tourist <- round(protable_tourist * 100, 2)
# The stray empty positional argument of the earlier call (", ,") is dropped;
# every remaining argument is passed by name.
barplot(
  percent_tourist,
  col = c("blue", "red"),
  legend.text = c("No", "Yes"),
  ylab = "Percentage",
  xlab = "IsTourist",
  main = "% of hotels are at tourist destinations"
)

#Q16: Barchart Is Tourist By FreeBreakfast

# Grouped bar plot: percentage of all rows in each IsTourist x FreeBreakfast
# cell. Table rows are IsTourist and columns are FreeBreakfast, so bars are
# grouped by FreeBreakfast and coloured by IsTourist.
Tourist_Breakfast_Subset <- table(df$IsTourist, df$FreeBreakfast)
protable_tourist_breakfast <- prop.table(Tourist_Breakfast_Subset)
percent_tourist_breakfast <- round(protable_tourist_breakfast * 100, 2)
# The stray empty positional argument of the earlier call (", ,") is dropped;
# every remaining argument is passed by name.
barplot(
  percent_tourist_breakfast,
  col = c("blue", "red"),
  legend.text = c("No", "Yes"),
  ylab = "Percentage",
  xlab = "FreeBreakfast",
  beside = TRUE,
  main = "Bar chart for IsTourist by free breakfast"
)

#Q17 Boxplot for rent

# Horizontal box plot of RentUSD.
# The column is read from df directly. The extra attach(df) that used to sit
# here only stacked one more copy of df on the search path and printed a run
# of "objects are masked" messages, so it has been removed together with that
# echoed output.
boxplot(
  df$RentUSD,
  xlab = "Rent(in USD)",
  horizontal = TRUE,
  main = "Box-Plot for hotel rents (USD)"
)

#Q18 Boxplot for rent grouped by tourist

# Box plot of RentUSD split by IsTourist.
# - The title argument is now spelled main; it was previously misspelled as
#   "ain", which is not a plotting parameter, so no title was drawn.
# - The columns are looked up in df through data = df. The repeated attach(df)
#   and its echoed "objects are masked" messages have been removed.
boxplot(
  RentUSD ~ IsTourist,
  data = df,
  xlab = "IsTourist",
  horizontal = FALSE,
  main = "Box-Plot for hotel rent grouped by IsTourist (USD)"
)

#Q19: Scatter plot room rent and rack rates

# RentUSD (y axis) against RackRate (x axis); both columns are looked up in df.
plot(RentUSD ~ RackRate, data = df, main = "Scatter Plot")

#Q.20a Find Pearson correlation coefficient between the Variable RentUSD & RackRate using cor()

# Pearson correlation between RentUSD and RackRate, evaluated inside df so the
# result does not depend on which attach()'d copy is first on the search path.
with(df, cor(RentUSD, RackRate, method = "pearson"))

## [1] 0.8681018

#Q.20b Write R code to find out Pearson correlation coefficient & significance value between Variable RentUSD & RackRate as shown below.

# Pearson correlation test between RentUSD and RackRate, evaluated inside df
# so the result does not depend on which attach()'d copy is first on the
# search path. The bare column names are kept inside with(), so the printed
# "data:" line is unchanged.
with(df, cor.test(RentUSD, RackRate, method = "pearson"))

## 
##  Pearson's product-moment correlation
## 
## data:  RentUSD and RackRate
## t = 217.24, df = 15430, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8641602 0.8719369
## sample estimates:
##       cor 
## 0.8681018

#Q.21 Write R code to find out Spearman Rank correlation coefficient & significance value between Variable RentUSD & RackRate as shown below.

# Spearman rank correlation test between RentUSD and RackRate, evaluated
# inside df so the result does not depend on which attach()'d copy is first on
# the search path. The bare column names are kept inside with(), so the
# printed "data:" line is unchanged.
with(df, cor.test(RentUSD, RackRate, method = "spearman"))

## Warning in cor.test.default(RentUSD, RackRate, method = "spearman"): Cannot
## compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  RentUSD and RackRate
## S = 6.2659e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8977018

#Q.22 Write R code draw correlation matrix for all the continuous variables in the dataframe df as shown below

# Restrict df to the columns treated as continuous.
subset.df <- df[, c("Population", "RackRate", "RentUSD",
                    "StarRating", "HotelCapacity")]
# Correlation matrix of those columns.
corMat <- cor(subset.df, use = "complete")
# Display the matrix rounded to 3 decimal places.
round(corMat, digits = 3)

##               Population RackRate RentUSD StarRating HotelCapacity
## Population         1.000    0.291   0.336      0.063         0.041
## RackRate           0.291    1.000   0.868      0.556         0.169
## RentUSD            0.336    0.868   1.000      0.580         0.171
## StarRating         0.063    0.556   0.580      1.000         0.396
## HotelCapacity      0.041    0.169   0.171      0.396         1.000

#Q.23 Write R code draw a correlation matrix with significance value for all the continuous variables in the dataframe df as shown below

# Correlation matrix together with the significance (p) values.
library(psych)
corr.test(x = subset.df, use = "complete")

## Call:corr.test(x = subset.df, use = "complete")
## Correlation matrix 
##               Population RackRate RentUSD StarRating HotelCapacity
## Population          1.00     0.29    0.34       0.06          0.04
## RackRate            0.29     1.00    0.87       0.56          0.17
## RentUSD             0.34     0.87    1.00       0.58          0.17
## StarRating          0.06     0.56    0.58       1.00          0.40
## HotelCapacity       0.04     0.17    0.17       0.40          1.00
## Sample Size 
## [1] 15432
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##               Population RackRate RentUSD StarRating HotelCapacity
## Population             0        0       0          0             0
## RackRate               0        0       0          0             0
## RentUSD                0        0       0          0             0
## StarRating             0        0       0          0             0
## HotelCapacity          0        0       0          0             0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

#Q.24 Write R code draw a correlogram to display a correlation matrix for all the continuous variables in df dataframe as shown below

# correlogram
library(corrplot)

## Warning: package 'corrplot' was built under R version 3.6.1

## corrplot 0.84 loaded

# Correlogram: each pairwise correlation of subset.df is drawn as a circle.
corrplot(corr = cor(x = subset.df), method = "circle")

# DAM EDA Assignment
#
# Team 6
#
# 23 Sep
#
# Q19: Scatter plot room rent and rack rates