#Q.1(a) Write R code to read the data into a dataframe called “df”
# Read the hotel data into a data.frame. stringsAsFactors = TRUE is set
# explicitly: R >= 4.0.0 no longer converts strings to factors by default,
# and the str()/describe() output recorded further down shows the text
# columns as factors.
df <- read.csv("USAHotelsTransformedData.csv", stringsAsFactors = TRUE)
# Put the columns of df on the search path; later statements in this script
# refer to columns such as IsTourist and RentUSD by bare name.
attach(df)
#Q.1b Also write R code to read the data into a data table called “dt”
library(data.table)
# Load the same CSV a second time, now through data.table's reader.
dt <- fread(input = "USAHotelsTransformedData.csv")
# Expose the columns of dt on the search path as well.
attach(what = dt)
## The following objects are masked from df:
##
## Available, CityName, Date, Day, Discount, FreeBreakfast,
## FreeWifi, GuestRating, HasDiscount, HasSwimmingPool,
## HotelAddress, HotelCapacity, HotelDescription, HotelName,
## HotelPincode, IsTourist, IsWeekend, Population, RackRate,
## RentUSD, StarRating
#Q.2 Write R code to get the dimensions of the dataframe “df”
# Row and column counts of df, reported as c(rows, columns).
c(nrow(df), ncol(df))
## [1] 15432 21
#Q.3 Write R code to list the column names of the dataframe “df”
# For a data.frame the column names are the names of the object.
names(df)
## [1] "CityName" "Population" "IsTourist"
## [4] "Day" "Date" "IsWeekend"
## [7] "HotelName" "Available" "RackRate"
## [10] "RentUSD" "Discount" "StarRating"
## [13] "GuestRating" "HotelAddress" "HotelPincode"
## [16] "HotelDescription" "FreeWifi" "FreeBreakfast"
## [19] "HotelCapacity" "HasSwimmingPool" "HasDiscount"
#Q.4 Write R code to attach the dataframe “df”
# Attach df so that its columns can be referenced by bare name.
attach(what = df)
## The following objects are masked from dt:
##
## Available, CityName, Date, Day, Discount, FreeBreakfast,
## FreeWifi, GuestRating, HasDiscount, HasSwimmingPool,
## HotelAddress, HotelCapacity, HotelDescription, HotelName,
## HotelPincode, IsTourist, IsWeekend, Population, RackRate,
## RentUSD, StarRating
## The following objects are masked from df (pos = 5):
##
## Available, CityName, Date, Day, Discount, FreeBreakfast,
## FreeWifi, GuestRating, HasDiscount, HasSwimmingPool,
## HotelAddress, HotelCapacity, HotelDescription, HotelName,
## HotelPincode, IsTourist, IsWeekend, Population, RackRate,
## RentUSD, StarRating
#Q.5 Write R code to list the data structures of the columns in the dataframe “df”
# Compact listing of every column of df with its storage type.
str(object = df)
## 'data.frame': 15432 obs. of 21 variables:
## $ CityName : Factor w/ 34 levels "Albuquerque",..: 33 33 33 33 33 33 33 33 33 33 ...
## $ Population : int 377165 377165 377165 377165 377165 377165 377165 377165 377165 377165 ...
## $ IsTourist : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Day : Factor w/ 7 levels "Friday","Monday",..: 7 5 1 3 4 2 6 7 5 1 ...
## $ Date : Factor w/ 10 levels "Dec 1 2017","Dec 2 2017",..: 9 10 1 2 3 4 5 6 7 8 ...
## $ IsWeekend : int 0 0 1 1 0 0 0 0 0 1 ...
## $ HotelName : Factor w/ 1642 levels "1840s Carrollton Inn, Baltimore",..: 683 683 683 683 683 683 683 683 683 683 ...
## $ Available : int 1 1 1 1 1 1 1 1 1 1 ...
## $ RackRate : num 244 244 244 244 244 244 244 244 244 244 ...
## $ RentUSD : num 119 119 153 244 119 102 139 119 123 157 ...
## $ Discount : num 51.2 51.2 37.3 0 51.2 ...
## $ StarRating : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ GuestRating : num 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 ...
## $ HotelAddress : Factor w/ 2398 levels "0570 W 151st Street, Olathe, KS",..: 2241 2241 2241 2241 2241 2241 2241 2241 2241 2241 ...
## $ HotelPincode : int 33610 33610 33610 33610 33610 33610 33610 33610 33610 33610 ...
## $ HotelDescription: Factor w/ 1429 levels " Motel with outdoor pool, near Heavenly Ski Resort",..: 831 831 831 831 831 831 831 831 831 831 ...
## $ FreeWifi : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FreeBreakfast : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HotelCapacity : int 76 76 76 76 76 76 76 76 76 76 ...
## $ HasSwimmingPool : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HasDiscount : int 1 1 1 0 1 1 1 1 1 1 ...
#Q.6 Write R code to count how many rows of data are for hotels located at tourist destinations
library(data.table)
# Count rows per IsTourist level. The data.table view of df exists only
# inside this expression, so `df` itself is not overwritten by the query.
tab1 <- as.data.table(df)[, .(N = .N), by = .(IsTourist)]
tab1
## IsTourist N
## 1: No 9417
## 2: Yes 6015
#Q.7 Write R code to count how many rows of data are for hotels located at tourist / non-tourist locations and whether or not the hotels offer Free Breakfast bundled with room rent
library(data.table)
# Count rows per IsTourist x FreeBreakfast combination. The data.table view
# of df exists only inside this expression, so `df` itself is not
# overwritten by the query.
tab1 <- as.data.table(df)[, .(N = .N), by = .(IsTourist, FreeBreakfast)]
tab1
## IsTourist FreeBreakfast N
## 1: No 1 5510
## 2: No 0 3907
## 3: Yes 0 4194
## 4: Yes 1 1821
#Q.8 Write R code to give the following output
# Cross-tabulate IsTourist against FreeBreakfast. Both columns are taken from
# df explicitly, so the result does not depend on which attached copy
# happens to be first on the search path.
tab1 <- with(df, table(IsTourist, FreeBreakfast))
# Append a "Sum" row and a "Sum" column (margins 1 and 2).
addmargins(tab1, c(1, 2))
## FreeBreakfast
## IsTourist 0 1 Sum
## No 3907 5510 9417
## Yes 4194 1821 6015
## Sum 8101 7331 15432
#Q9. Write R code to calculate the percentage of rows of data for hotels located at tourist / non-tourist destinations, rounded to 2 decimal places
# Frequency table of IsTourist, read from df explicitly rather than from the
# search path.
tab1 <- with(df, table(IsTourist))
protable <- prop.table(tab1)
# Express the proportions as percentages rounded to 2 decimal places.
PercentPropotion <- round(protable * 100, 2)
PercentPropotion
## IsTourist
## No Yes
## 61.02 38.98
#Q 10 Write R code to get Mean, Standard Deviation and Variance Of the variable RentUSD
# Mean of RentUSD.
mean(df[["RentUSD"]])
## [1] 145.0503
# Standard deviation of RentUSD.
sd(df[["RentUSD"]])
## [1] 127.7639
# Variance of RentUSD.
var(df[["RentUSD"]])
## [1] 16323.62
#Q 11 Write R code to calculate the Minimum and Maximum of the variable RentUSD, rounding it to 2 decimal places
# Smallest RentUSD value, rounded to 2 decimal places.
round(min(df[["RentUSD"]]), digits = 2)
## [1] 24
# Largest RentUSD value, rounded to 2 decimal places.
round(max(df[["RentUSD"]]), digits = 2)
## [1] 2995
#Q 12a Write R code to print Descriptive Statistics:
library(psych)
# Keep the vars, n, mean, sd, median, min and max columns of the summary.
describe(df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
## vars n mean sd median min
## CityName* 1 15432 17.50 9.66 17.00 1
## Population 2 15432 30142834.42 154188929.37 595047.00 21717
## IsTourist* 3 15432 1.39 0.49 1.00 1
## Day* 4 15432 4.12 2.16 5.00 1
## Date* 5 15432 5.54 2.87 6.00 1
## IsWeekend 6 15432 0.29 0.45 0.00 0
## HotelName* 7 15432 821.72 474.28 824.00 1
## Available 8 15432 1.00 0.00 1.00 1
## RackRate 9 15432 190.53 185.89 149.00 30
## RentUSD 10 15432 145.05 127.76 113.00 24
## Discount 11 15432 18.95 17.69 15.08 0
## StarRating 12 15432 3.03 0.80 3.00 1
## GuestRating 13 15432 5.15 2.03 4.30 1
## HotelAddress* 14 15432 1223.00 708.43 1235.00 1
## HotelPincode 15 15432 59814.54 30245.39 64161.00 2108
## HotelDescription* 16 15432 722.54 411.23 740.00 1
## FreeWifi 17 15432 0.93 0.25 1.00 0
## FreeBreakfast 18 15432 0.48 0.50 0.00 0
## HotelCapacity 19 15432 179.87 265.79 120.00 10
## HasSwimmingPool 20 15432 0.59 0.49 1.00 0
## HasDiscount 21 15432 0.75 0.43 1.00 0
## max
## CityName* 3.4000e+01
## Population 8.4060e+08
## IsTourist* 2.0000e+00
## Day* 7.0000e+00
## Date* 1.0000e+01
## IsWeekend 1.0000e+00
## HotelName* 1.6420e+03
## Available 1.0000e+00
## RackRate 2.9950e+03
## RentUSD 2.9950e+03
## Discount 9.2710e+01
## StarRating 5.0000e+00
## GuestRating 1.0000e+01
## HotelAddress* 2.3980e+03
## HotelPincode 9.8198e+04
## HotelDescription* 1.4290e+03
## FreeWifi 1.0000e+00
## FreeBreakfast 1.0000e+00
## HotelCapacity 4.0280e+03
## HasSwimmingPool 1.0000e+00
## HasDiscount 1.0000e+00
#Q 12b In the above output, Interpret the meaning of the 1.39 written as the mean of the IsTourist column
#R stores the factor "IsTourist" as integer codes assigned to its levels in alphabetical order, so "No" = 1 and "Yes" = 2, and the mean is computed on these codes. A mean of 1.39 therefore means that 0.39 (i.e. 39%) of the rows are for hotels located at tourist destinations.
#Q 13a Write R code to measure the average rent i.e. RentUSD , for data rows corresponding to hotels located at tourist destinations and non-tourist destinations
# Mean RentUSD for each IsTourist group, sorted by IsTourist.
tab3 <- dt[, .(RentUSD = mean(RentUSD)), by = IsTourist][order(IsTourist)]
print(tab3)
## IsTourist RentUSD
## 1: No 119.1239
## 2: Yes 185.6404
#Q 13b Write R code to create a Histogram of RentUSD
# Histogram of the RentUSD column, drawn in cyan.
hist(
  df[["RentUSD"]],
  main = "Histogram of Rent in USD",
  xlab = "Rent(in USD)",
  col = "cyan"
)
#Q 14 Write R code to get a breakdown of the mean and standard deviation of the hotel rent (i.e RentUSD), with respect to variables IsTourist & FreeBreakfast
# Row count, mean and standard deviation of RentUSD for every
# IsTourist x FreeBreakfast combination, sorted by IsTourist.
# NOTE(review): the result columns are named MeanBalance / SDBalance although
# they hold RentUSD statistics; the names are kept so the output is unchanged.
tab2 <- dt[
  ,
  .(N = .N, MeanBalance = mean(RentUSD), SDBalance = sd(RentUSD)),
  by = c("IsTourist", "FreeBreakfast")
][order(IsTourist)]
print(tab2)
## IsTourist FreeBreakfast N MeanBalance SDBalance
## 1: No 1 5510 112.7479 71.87267
## 2: No 0 3907 128.1158 75.84106
## 3: Yes 0 4194 210.8255 200.04628
## 4: Yes 1 1821 127.6359 64.62385
#Q15: Is Tourist Bar Plot
# Bar plot of the percentage of rows per IsTourist level.
Tourist_Subset <- table(df$IsTourist)
protable_tourist <- prop.table(Tourist_Subset)
percent_tourist <- round(protable_tourist * 100, 2)
# Legend labels come from the table's own level names so they cannot drift
# away from the bars they describe.
barplot(
  percent_tourist,
  col = c("blue", "red"),
  legend.text = names(percent_tourist),
  ylab = "Percentage",
  xlab = "IsTourist",
  main = "% of hotels are at tourist destinations"
)
#Q16: Barchart Is Tourist By FreeBreakfast
# Grouped bar plot: percentage of all rows in each
# IsTourist (rows of the table) x FreeBreakfast (columns) cell.
Tourist_Breakfast_Subset <- table(df$IsTourist, df$FreeBreakfast)
protable_tourist_breakfast <- prop.table(Tourist_Breakfast_Subset)
percent_tourist_breakfast <- round(protable_tourist_breakfast * 100, 2)
# With beside = TRUE each group on the x axis is one FreeBreakfast value and
# the bars inside a group are the IsTourist levels, so the legend labels are
# the table's row names.
barplot(
  percent_tourist_breakfast,
  col = c("blue", "red"),
  legend.text = rownames(percent_tourist_breakfast),
  ylab = "Percentage",
  xlab = "FreeBreakfast",
  beside = TRUE,
  main = "Bar chart for IsTourist by free breakfast"
)
#Q17 Boxplot for rent
# Horizontal box plot of RentUSD. The column is taken from df directly, so df
# does not need to be attached again (which would only stack another copy on
# the search path and print masking messages).
boxplot(
  df$RentUSD,
  xlab = "Rent(in USD)",
  horizontal = TRUE,
  main = "Box-Plot for hotel rents (USD)"
)
#Q18 Boxplot for rent grouped by tourist
# Box plots of RentUSD, one per IsTourist level. Columns are resolved through
# `data = df`, so df does not need to be attached again.
boxplot(
  RentUSD ~ IsTourist,
  data = df,
  xlab = "IsTourist",
  horizontal = FALSE,
  # `main` supplies the plot title.
  main = "Box-Plot for hotel rent grouped by IsTourist (USD)"
)
# Scatter plot of RentUSD against RackRate (presumably Q.19; the question
# header is missing here). Columns are taken from df via `data`.
plot(RentUSD ~ RackRate, data = df, main = "Scatter Plot")
#Q.20a Find Pearson correlation coefficient between the Variable RentUSD & RackRate using cor()
# Pearson correlation of RentUSD and RackRate, evaluated inside df so the
# columns are not looked up on the search path.
with(df, cor(RentUSD, RackRate, method = "pearson"))
## [1] 0.8681018
#Q.20b Write R code to find out Pearson correlation coefficient & significance value between Variable RentUSD & RackRate as shown below.
# with() keeps the bare column names, so the "data:" line of the test output
# still reads "RentUSD and RackRate".
with(df, cor.test(RentUSD, RackRate, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: RentUSD and RackRate
## t = 217.24, df = 15430, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8641602 0.8719369
## sample estimates:
## cor
## 0.8681018
#Q.21 Write R code to find out Spearman Rank correlation coefficient & significance value between Variable RentUSD & RackRate as shown below.
# Spearman rank correlation test of RentUSD and RackRate, evaluated inside df
# so the columns are not looked up on the search path.
with(df, cor.test(RentUSD, RackRate, method = "spearman"))
## Warning in cor.test.default(RentUSD, RackRate, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: RentUSD and RackRate
## S = 6.2659e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8977018
#Q.22 Write R code draw correlation matrix for all the continuous variables in the dataframe df as shown below
# Select the continuous variables used for the correlation analysis.
subset.df <- df[
  , c("Population", "RackRate", "RentUSD", "StarRating", "HotelCapacity")]
# Correlation matrix of the selected columns; the `use` option is spelled out
# in full instead of relying on partial matching of "complete".
corMat <- cor(subset.df, use = "complete.obs")
# Round to 3 decimal places for display.
round(corMat, 3)
## Population RackRate RentUSD StarRating HotelCapacity
## Population 1.000 0.291 0.336 0.063 0.041
## RackRate 0.291 1.000 0.868 0.556 0.169
## RentUSD 0.336 0.868 1.000 0.580 0.171
## StarRating 0.063 0.556 0.580 1.000 0.396
## HotelCapacity 0.041 0.169 0.171 0.396 1.000
#Q.23 Write R code draw a correlation matrix with significance value for all the continuous variables in the dataframe df as shown below
# correlation significance value
library(psych)
# Correlations of the subset.df columns together with their p-values.
print(corr.test(x = subset.df, use = "complete"))
## Call:corr.test(x = subset.df, use = "complete")
## Correlation matrix
## Population RackRate RentUSD StarRating HotelCapacity
## Population 1.00 0.29 0.34 0.06 0.04
## RackRate 0.29 1.00 0.87 0.56 0.17
## RentUSD 0.34 0.87 1.00 0.58 0.17
## StarRating 0.06 0.56 0.58 1.00 0.40
## HotelCapacity 0.04 0.17 0.17 0.40 1.00
## Sample Size
## [1] 15432
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## Population RackRate RentUSD StarRating HotelCapacity
## Population 0 0 0 0 0
## RackRate 0 0 0 0 0
## RentUSD 0 0 0 0 0
## StarRating 0 0 0 0 0
## HotelCapacity 0 0 0 0 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
#Q.24 Write R code draw a correlogram to display a correlation matrix for all the continuous variables in df dataframe as shown below
# correlogram
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.1
## corrplot 0.84 loaded
# Correlogram of the continuous variables. The correlation matrix uses the
# same complete-case handling as the matrices computed for Q.22 and Q.23.
corrplot(cor(subset.df, use = "complete.obs"), method = "circle")