# Load the hotel pricing data set.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() keeps text
# columns as character by default, while the summaries and plots in this script
# (level counts in summary(), table()/histogram() of Date, boxplot by Date)
# were produced with CityName, Date, HotelName, ... stored as factors.
city <- read.csv(
  "~/Desktop/My Project IIM/city.csv",
  stringsAsFactors = TRUE
)
# View() opens a spreadsheet-style viewer; only do that in an interactive
# session so the script can also be run with Rscript or knitted.
if (interactive()) {
  View(city)
}

# Summarizing the variables of the data set

library(psych)
# Column-by-column overview of the whole data frame
summary(object = city)
##        X              CityName      Population          CityRank    
##  Min.   :    1   Delhi    :2048   Min.   :    8096   Min.   : 0.00  
##  1st Qu.: 3309   Jaipur   : 768   1st Qu.:  744983   1st Qu.: 2.00  
##  Median : 6616   Mumbai   : 712   Median : 3046163   Median : 9.00  
##  Mean   : 6616   Bangalore: 656   Mean   : 4416837   Mean   :14.83  
##  3rd Qu.: 9924   Goa      : 624   3rd Qu.: 8443675   3rd Qu.:24.00  
##  Max.   :13232   Kochi    : 608   Max.   :12442373   Max.   :44.00  
##                  (Other)  :7816                                     
##   IsMetroCity     IsTouristDestination   IsWeekend       IsNewYearEve   
##  Min.   :0.0000   Min.   :0.0000       Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000       1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000       Median :1.0000   Median :0.0000  
##  Mean   :0.2842   Mean   :0.6972       Mean   :0.6228   Mean   :0.1244  
##  3rd Qu.:1.0000   3rd Qu.:1.0000       3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000       Max.   :1.0000   Max.   :1.0000  
##                                                                         
##           Date                       HotelName        RoomRent     
##  Dec 21 2016:1611   Vivanta by Taj        :   32   Min.   :   299  
##  Dec 24 2016:1611   Goldfinch Hotel       :   24   1st Qu.:  2436  
##  Dec 25 2016:1611   OYO Rooms             :   24   Median :  4000  
##  Dec 28 2016:1611   The Gordon House Hotel:   24   Mean   :  5474  
##  Dec 31 2016:1611   Apnayt Villa          :   16   3rd Qu.:  6299  
##  Dec 18 2016:1608   Bentleys Hotel Colaba :   16   Max.   :322500  
##  (Other)    :3569   (Other)               :13096                   
##    StarRating       Airport      
##  Min.   :0.000   Min.   :  0.20  
##  1st Qu.:3.000   1st Qu.:  8.40  
##  Median :3.000   Median : 15.00  
##  Mean   :3.459   Mean   : 21.16  
##  3rd Qu.:4.000   3rd Qu.: 24.00  
##  Max.   :5.000   Max.   :124.00  
##                                  
##                                                                    HotelAddress  
##  The Mall, Shimla                                                        :   32  
##  #2-91/14/8, White Fields, Kondapur, Hitech City, Hyderabad, 500084 India:   16  
##  121, City Terrace, Walchand Hirachand Marg, Mumbai, Maharashtra         :   16  
##  14-4507/9, Balmatta Road, Near Jyothi Circle, Hampankatta               :   16  
##  144/7, Rajiv Gandi Salai (OMR), Kottivakkam, Chennai, Tamil Nadu        :   16  
##  17, Oliver Road, Colaba, Mumbai, Maharashtra                            :   16  
##  (Other)                                                                 :13120  
##   HotelPincode         HotelDescription    FreeWifi      FreeBreakfast   
##  Min.   : 100025   3           :  120   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 221001   Abc         :  112   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 395003   3-star hotel:  104   Median :1.0000   Median :1.0000  
##  Mean   : 397430   3.5         :   88   Mean   :0.9259   Mean   :0.6491  
##  3rd Qu.: 570001   4           :   72   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :7000157   (Other)     :12728   Max.   :1.0000   Max.   :1.0000  
##                    NA's        :    8                                    
##  HotelCapacity    HasSwimmingPool 
##  Min.   :  0.00   Min.   :0.0000  
##  1st Qu.: 16.00   1st Qu.:0.0000  
##  Median : 34.00   Median :0.0000  
##  Mean   : 62.51   Mean   :0.3558  
##  3rd Qu.: 75.00   3rd Qu.:1.0000  
##  Max.   :600.00   Max.   :1.0000  
## 
# Descriptive statistics for every column, using describe() from psych
psych::describe(x = city)
##                      vars     n       mean         sd    median    trimmed
## X                       1 13232    6616.50    3819.89    6616.5    6616.50
## CityName*               2 13232      18.07      11.72      16.0      17.29
## Population              3 13232 4416836.87 4258386.00 3046163.0 4040816.22
## CityRank                4 13232      14.83      13.51       9.0      13.30
## IsMetroCity             5 13232       0.28       0.45       0.0       0.23
## IsTouristDestination    6 13232       0.70       0.46       1.0       0.75
## IsWeekend               7 13232       0.62       0.48       1.0       0.65
## IsNewYearEve            8 13232       0.12       0.33       0.0       0.03
## Date*                   9 13232      14.26       2.82      14.0      14.39
## HotelName*             10 13232     841.84     488.14     834.0     842.05
## RoomRent               11 13232    5473.99    7333.12    4000.0    4383.33
## StarRating             12 13232       3.46       0.76       3.0       3.40
## Airport                13 13232      21.16      22.76      15.0      16.39
## HotelAddress*          14 13232    1202.69     581.98    1261.0    1233.43
## HotelPincode           15 13232  397430.26  259837.50  395003.0  388540.47
## HotelDescription*      16 13224     581.40     363.01     570.0     575.79
## FreeWifi               17 13232       0.93       0.26       1.0       1.00
## FreeBreakfast          18 13232       0.65       0.48       1.0       0.69
## HotelCapacity          19 13232      62.51      76.66      34.0      46.03
## HasSwimmingPool        20 13232       0.36       0.48       0.0       0.32
##                             mad      min      max      range  skew
## X                       4904.44      1.0    13232    13231.0  0.00
## CityName*                 11.86      1.0       42       41.0  0.48
## Population           3846498.95   8096.0 12442373 12434277.0  0.68
## CityRank                  11.86      0.0       44       44.0  0.69
## IsMetroCity                0.00      0.0        1        1.0  0.96
## IsTouristDestination       0.00      0.0        1        1.0 -0.86
## IsWeekend                  0.00      0.0        1        1.0 -0.51
## IsNewYearEve               0.00      0.0        1        1.0  2.28
## Date*                      2.97      1.0       20       19.0 -1.05
## HotelName*               644.93      1.0     1670     1669.0  0.00
## RoomRent                2653.85    299.0   322500   322201.0 16.75
## StarRating                 0.74      0.0        5        5.0  0.48
## Airport                   11.12      0.2      124      123.8  2.73
## HotelAddress*            668.65      1.0     2108     2107.0 -0.37
## HotelPincode          257975.37 100025.0  7000157  6900132.0  9.99
## HotelDescription*        465.54      1.0     1226     1225.0  0.10
## FreeWifi                   0.00      0.0        1        1.0 -3.25
## FreeBreakfast              0.00      0.0        1        1.0 -0.62
## HotelCapacity             28.17      0.0      600      600.0  2.95
## HasSwimmingPool            0.00      0.0        1        1.0  0.60
##                      kurtosis       se
## X                       -1.20    33.21
## CityName*               -0.88     0.10
## Population              -1.08 37019.65
## CityRank                -0.76     0.12
## IsMetroCity             -1.08     0.00
## IsTouristDestination    -1.26     0.00
## IsWeekend               -1.74     0.00
## IsNewYearEve             3.18     0.00
## Date*                    2.93     0.02
## HotelName*              -1.26     4.24
## RoomRent               582.06    63.75
## StarRating               0.25     0.01
## Airport                  7.89     0.20
## HotelAddress*           -0.88     5.06
## HotelPincode           249.76  2258.86
## HotelDescription*       -1.25     3.16
## FreeWifi                 8.57     0.00
## FreeBreakfast           -1.61     0.00
## HotelCapacity           11.39     0.67
## HasSwimmingPool         -1.64     0.00
# Horizontal box plot of the HotelCapacity column
boxplot(
  city[["HotelCapacity"]],
  main = "Boxplot for Hotel Capacity",
  horizontal = TRUE
)

#Scatterplot pair wise for predictor variable
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Star rating (x axis) against room rent (y axis)
scatterplot(
  city[["StarRating"]],
  city[["RoomRent"]],
  main = "RoomRent of Hotels  with StarRating",
  xlab = "Star rating out of 5",
  ylab = "RoomRent in INR",
  cex = 1.1
)

# Room rent (x axis) against hotel capacity (y axis)
scatterplot(
  city[["RoomRent"]],
  city[["HotelCapacity"]],
  main = "RoomRent of Hotels  with Hotel capacity",
  xlab = "RoomRent in INR",
  ylab = "Hotel Capacity in rooms",
  cex = 1.1
)

# Room rent against the swimming-pool flag; both are jittered so that
# overlapping points become visible
plot(
  jitter(city$RoomRent),
  jitter(city$HasSwimmingPool),
  main = "RoomRent of Hotels  with HasSwimmingPool",
  xlab = "RoomRent",
  ylab = "Has Swimmng Pool ",
  cex = 1.1
)

 library(lattice)
# Lattice box-and-whisker plot of room rent for each swimming-pool flag value
bwplot(
  HasSwimmingPool ~ RoomRent,
  data = city,
  main = "RoomRent of Hotels  with HasSwimmingPool",
  xlab = "RoomRent",
  ylab = "Has Swimmng Pool "
)

# Scatterplot matrix of room rent and the three candidate predictors,
# with histograms on the diagonal and dashed smoother lines
scatterplotMatrix(
  city[, c("RoomRent", "HasSwimmingPool", "StarRating", "HotelCapacity")],
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter Plot Matrix",
  diagonal = "histogram"
)
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

#Corrgram of Y, x1, x2, x3
 
 library(corrgram)
 
 # Correlogram restricted to room rent and the three candidate predictors
 xyz <- data.frame(
   city$RoomRent,
   city$HasSwimmingPool,
   city$HotelCapacity,
   city$StarRating
 )
 corrgram(
   xyz,
   order = TRUE,
   lower.panel = panel.shade,
   upper.panel = panel.pie,
   text.panel = panel.txt,
   main = "Corrgram of Hotel Prices In India"
 )

 library(corrgram)
 
 # Correlogram over the full data frame
 corrgram(
   city,
   order = TRUE,
   lower.panel = panel.shade,
   upper.panel = panel.pie,
   text.panel = panel.txt,
   main = "Corrgram of Hotel  data"
 )

 ## The corrgram shows that HasSwimmingPool, StarRating and HotelCapacity are well correlated with RoomRent,
 ## so we can take them as predictors.
# Comparing RoomRent on different dates: number of rows per Date level
table(city[["Date"]])
## 
##   18-Dec-16   21-Dec-16   24-Dec-16   25-Dec-16   28-Dec-16   31-Dec-16 
##          44          44          44          44          44          44 
##    4-Jan-16    4-Jan-17    8-Jan-16    8-Jan-17 Dec 18 2016 Dec 21 2016 
##          31          13          31          13        1608        1611 
## Dec 24 2016 Dec 25 2016 Dec 28 2016 Dec 31 2016 Jan 04 2017 Jan 08 2017 
##        1611        1611        1611        1611        1548        1542 
##  Jan 4 2017  Jan 8 2017 
##          60          67
library(lattice)
# Lattice histogram of how the rows are distributed over the Date levels
histogram(
  ~ Date,
  data = city,
  main = "Distribution of Dates",
  xlab = "Differnt of Dates",
  col = "Blue"
)

 #Effect of different dates on RoomRent
 # The Date column mixes several spellings of the same day ("Dec 18 2016",
 # "18-Dec-16", "Jan 4 2017", "Jan 04 2017"), so grouping on the raw text
 # splits one day into several groups. Every spelling is parsed to a real date
 # before averaging.
 # NOTE(review): %b matches month abbreviations in the current locale; the
 # parsing assumes an English locale.
 # NOTE(review): the data also contains "4-Jan-16" / "8-Jan-16", which parse to
 # January 2016; presumably a typo for 2017 -- confirm before merging them.
 d <- local({
   date_text <- as.character(city$Date)
   stay_date <- as.Date(date_text, format = "%b %d %Y")
   dashed <- is.na(stay_date)
   stay_date[dashed] <- as.Date(date_text[dashed], format = "%d-%b-%y")
   if (anyNA(stay_date)) {
     warning(
       "Some Date values could not be parsed and are left out of the averages",
       call. = FALSE
     )
   }
   # Keep Date as a factor (ISO labels sort chronologically) so that `d` has
   # the same column names and column types as before.
   aggregate(
     RoomRent ~ Date,
     data = data.frame(
       Date = factor(format(stay_date, "%Y-%m-%d")),
       RoomRent = city$RoomRent
     ),
     FUN = mean
   )
 })
 d
 # NOTE(review): the printed table that follows in this file was produced from
 # the raw Date text and must be regenerated.
##           Date RoomRent
## 1    18-Dec-16 3366.795
## 2    21-Dec-16 3437.545
## 3    24-Dec-16 3510.795
## 4    25-Dec-16 3349.591
## 5    28-Dec-16 3450.045
## 6    31-Dec-16 3570.318
## 7     4-Jan-16 4738.548
## 8     4-Jan-17 3829.615
## 9     8-Jan-16 4907.419
## 10    8-Jan-17 3843.077
## 11 Dec 18 2016 4938.257
## 12 Dec 21 2016 5130.320
## 13 Dec 24 2016 5598.746
## 14 Dec 25 2016 5521.896
## 15 Dec 28 2016 5652.478
## 16 Dec 31 2016 6263.374
## 17 Jan 04 2017 5754.513
## 18 Jan 08 2017 5406.821
## 19  Jan 4 2017 4481.400
## 20  Jan 8 2017 4347.821
 # Mean room rent for each Date level of the aggregated table `d`
 scatterplot(
   d[["Date"]],
   d[["RoomRent"]],
   main = "Scatterplot between Date and RoomRent",
   xlab = "Date",
   ylab = "Room Rent in Rupees"
 )

 # Distribution of room rent per Date level; the four colours are recycled
 # across the boxes
 boxplot(
   RoomRent ~ Date,
   data = city,
   main = "Room rent vs. Date",
   xlab = "Different Dates",
   ylab = "Room Rent in rupees ",
   col = c("red", "blue", "green", "yellow")
 )

 # Analyzing the IsWeekend effect on RoomRent: row counts per flag value
 table(city[["IsWeekend"]])
## 
##    0    1 
## 4991 8241
 # Bar chart of the weekend / non-weekend row counts
 table1 <- table(city[["IsWeekend"]])
 barplot(
   table1,
   main = "Distribution of Weekend",
   xlab = "Not weekend(0)         Weekend(1)",
   col = "orange"
 )

# Comparing RoomRent on different dates: number of rows per Date level
table(city[["Date"]])
## 
##   18-Dec-16   21-Dec-16   24-Dec-16   25-Dec-16   28-Dec-16   31-Dec-16 
##          44          44          44          44          44          44 
##    4-Jan-16    4-Jan-17    8-Jan-16    8-Jan-17 Dec 18 2016 Dec 21 2016 
##          31          13          31          13        1608        1611 
## Dec 24 2016 Dec 25 2016 Dec 28 2016 Dec 31 2016 Jan 04 2017 Jan 08 2017 
##        1611        1611        1611        1611        1548        1542 
##  Jan 4 2017  Jan 8 2017 
##          60          67
library(lattice)
 # Lattice histogram of how the rows are distributed over the Date levels
 histogram(
   ~ Date,
   data = city,
   main = "Distribution of Dates",
   xlab = "Differnt of Dates",
   col = "Blue"
 )

# How the hotel's distance from the airport relates to RoomRent:
# start with the five-number summary of the Airport column
summary(city[["Airport"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.20    8.40   15.00   21.16   24.00  124.00
# Horizontal box plot of the Airport column
boxplot(
  city[["Airport"]],
  main = "Boxplot of Airport",
  xlab = "Distance of airport from hotel(Km)",
  col = "green",
  horizontal = TRUE
)

# 1. Hotels with a swimming pool have a higher average RoomRent than hotels
#    without one. alternative = "less" tests mean(group 0) - mean(group 1) < 0.
t.test(
  RoomRent ~ HasSwimmingPool,
  data = city,
  alternative = "less"
)
## 
##  Welch Two Sample t-test
## 
## data:  RoomRent by HasSwimmingPool
## t = -29.013, df = 5011.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -4502.814
## sample estimates:
## mean in group 0 mean in group 1 
##        3775.566        8549.052
 #2.Average RoomRent in hotels with high star rating is high as compared to one which has less star rating.
 # A two-sample t-test of RoomRent against StarRating only compares the mean
 # rent with the mean rating, which says nothing about whether rent rises with
 # rating. Test for a positive association between the two columns instead.
 # NOTE(review): the printed t-test output that follows in this file predates
 # this change and must be regenerated.
 cor.test(city$RoomRent, city$StarRating, alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  city$RoomRent and city$StarRating
## t = 85.813, df = 13231, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5345.575 5595.491
## sample estimates:
##   mean of x   mean of y 
## 5473.991838    3.458933
 #3.Average RoomRent in hotels having more hotel capacity is more compared to one with less capacity.
 # A two-sample t-test of RoomRent against HotelCapacity only compares the mean
 # rent with the mean capacity, which says nothing about whether rent rises
 # with capacity. Test for a positive association between the columns instead.
 # NOTE(review): the printed t-test output that follows in this file predates
 # this change and must be regenerated.
 cor.test(city$RoomRent, city$HotelCapacity, alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  city$RoomRent and city$HotelCapacity
## t = 84.882, df = 13234, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5286.515 5536.445
## sample estimates:
##  mean of x  mean of y 
## 5473.99184   62.51164
# Multiple linear regression models for RoomRent
# Model 1: star rating, swimming pool and capacity as predictors.
# The "- 1" term removes the intercept, so the fit goes through the origin.
fit1 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity - 1,
  data = city
)
summary(fit1)
## 
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity - 
##     1, data = city)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8039  -2448  -1249    461 312401 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## StarRating      1396.8746    26.1320  53.455  < 2e-16 ***
## HasSwimmingPool 3719.6943   148.7835  25.001  < 2e-16 ***
## HotelCapacity     -7.6598     0.9415  -8.136 4.44e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6813 on 13229 degrees of freedom
## Multiple R-squared:  0.4457, Adjusted R-squared:  0.4456 
## F-statistic:  3546 on 3 and 13229 DF,  p-value: < 2.2e-16
# Coefficients of model 1
coef(fit1)
##      StarRating HasSwimmingPool   HotelCapacity 
##     1396.874562     3719.694300       -7.659814
#Fitted residuals and values  are checked and the deviation was around 1000 , because of 
 #large data points it's not suitable to show those in the output file.
 
 ###.  Model1:    RoomRent = b1*StarRating + b2*HasSwimmingPool + b3*HotelCapacity
 #   The "-1" in the formula removes the intercept (b0 = 0);  b1 = 1396.874562, b2 = 3719.6943, b3 = -7.659814
 #  Model:    RoomRent = 1396.874562*StarRating + 3719.6943*HasSwimmingPool - 7.659814*HotelCapacity
# Model 2: model 1 plus the weekend and tourist-destination flags.
# The "- 1" term again removes the intercept.
fit2 <- lm(
  RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity +
    IsWeekend + IsTouristDestination - 1,
  data = city
)
summary(fit2)
## 
## Call:
## lm(formula = RoomRent ~ StarRating + HasSwimmingPool + HotelCapacity + 
##     IsWeekend + IsTouristDestination - 1, data = city)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8326  -2517  -1212    463 312480 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## StarRating           1258.9558    44.4985  28.292  < 2e-16 ***
## HasSwimmingPool      3670.2511   148.8411  24.659  < 2e-16 ***
## HotelCapacity          -6.1769     0.9658  -6.396 1.65e-10 ***
## IsWeekend            -509.6479   119.1618  -4.277 1.91e-05 ***
## IsTouristDestination 1053.0394   124.7325   8.442  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6792 on 13227 degrees of freedom
## Multiple R-squared:  0.4493, Adjusted R-squared:  0.4491 
## F-statistic:  2159 on 5 and 13227 DF,  p-value: < 2.2e-16
# Coefficients of model 2
coef(fit2)
##           StarRating      HasSwimmingPool        HotelCapacity 
##          1258.955786          3670.251057            -6.176913 
##            IsWeekend IsTouristDestination 
##          -509.647863          1053.039364
#Fitted residuals and values  are checked and the deviation was around 1000 , because of 
 #large data points it's not suitable to show those in the output file.
 ###.  Model2:    RoomRent = b1*StarRating + b2*HasSwimmingPool + b3*HotelCapacity + b4*IsWeekend + b5*IsTouristDestination
 #   The "-1" in the formula removes the intercept (b0 = 0);  b1 = 1258.955786, b2 = 3670.251057, b3 = -6.176913, b4 = -509.647863, b5 = 1053.039364
 #  Model:    RoomRent = 1258.955786*StarRating + 3670.251057*HasSwimmingPool - 6.176913*HotelCapacity
 # - 509.647863*IsWeekend + 1053.039364*IsTouristDestination
 # Set the knitr chunk option `echo` to TRUE, i.e. show the source code of
 # chunks in the rendered report.
 # NOTE(review): this is the last statement here, so it can only affect chunks
 # evaluated after it; presumably it belongs in a setup chunk at the top -- confirm.
 knitr::opts_chunk$set(echo = TRUE)