Reading the data

# Load the raw review data, drop every row that contains a missing value,
# and print the structure of what remains.
data <- na.omit(read.csv("study1data.csv"))
str(data)
## 'data.frame':    1594 obs. of  10 variables:
##  $ Review_Overall_Rating : int  5 5 3 5 5 5 5 5 2 5 ...
##  $ Review_Date           : chr  "6/22/2014" "6/7/2014" "6/6/2014" "6/1/2014" ...
##  $ Day_of_visit_tentative: chr  "4/15/2014" "3/15/2014" "3/15/2014" "5/15/2014" ...
##  $ Rating_Value          : int  4 5 2 5 5 5 5 5 2 5 ...
##  $ Rating_Location       : int  4 4 3 5 5 5 5 5 2 5 ...
##  $ Rating_Sleep_Quality  : int  5 5 1 5 5 5 5 5 3 4 ...
##  $ Rating_Rooms          : int  5 5 1 5 5 4 5 5 2 4 ...
##  $ Rating_Cleanliness    : int  5 5 4 5 5 5 5 5 2 5 ...
##  $ Rating_Service        : int  5 5 5 5 5 5 5 5 2 5 ...
##  $ VisitType             : chr  "as a couple" "as a couple" "as a couple" "as a couple" ...

Converting Visit and Review Dates to Date Format: YYYY-MM-DD

# Parse the "month/day/year" strings into calendar dates.
# Date is used instead of POSIXct because POSIXct without `tz` is parsed in
# the session's local time zone; across a daylight-saving change a gap of N
# calendar days then comes out an hour short or long (e.g. 99.96 instead of
# 100), which can push a row into the wrong delay bucket later on.
data$Review_Date <- as.Date(data$Review_Date, format = "%m/%d/%Y")
data$Day_of_visit_tentative <- as.Date(
  data$Day_of_visit_tentative,
  format = "%m/%d/%Y"
)

Adding Column for Delay in Review (in No. of Days)

# Days elapsed from the tentative visit date to the review date; the value
# is negative when the review is dated before the visit.
data[["Review.time.diff"]] <- difftime(
  time1 = data[["Review_Date"]],
  time2 = data[["Day_of_visit_tentative"]],
  units = "days"
)

# Distribution of the review delay (in days).
hist(as.numeric(data$Review.time.diff))

Now Adding a Categorical Variable for Delay in Review

# Bucket the delay (in days) into labelled ranges. Every range includes its
# lower bound and excludes its upper bound; negative delays are reviews
# dated before the visit.
delay_days <- as.numeric(data$Review.time.diff)
delay_labels <- c("Before Visiting", "0-100", "100-200", "200-400", "400+")
# findInterval() gives 0 below the first cut point and 4 at or above the
# last one, so adding 1 yields the matching position in delay_labels.
delay_bucket <- findInterval(delay_days, c(0, 100, 200, 400)) + 1
data$Review.delay <- delay_labels[delay_bucket]

# Store as a factor so lm() treats the buckets as categories.
data$Review.delay <- as.factor(data$Review.delay)

Following Function will return the Week No. of a Date

# Week-of-month index for each element of `Date`: days 1-7 give 1, days 8-14
# give 2, and so on, so days 29-31 give 5.
# `Date` must be something format() can render with "%d" (day of month).
whichWeek <- function(Date) {
  day_of_month <- as.numeric(format(Date, "%d"))
  ceiling(day_of_month / 7)
}

Analysing Week no. of Day of Visit

# Count how many tentative visit dates fall in each week of the month.
table(whichWeek(data$Day_of_visit_tentative))
## 
##    3 
## 1594
Conclusion: All "Day of visit" Dates are from 3rd week

Adding a Column Corresponding to Week No. of Review Date

# Store the week-of-month of every review date, then show the count per week.
data[["Review.Week.No"]] <- whichWeek(data[["Review_Date"]])
table(data$Review.Week.No)
## 
##   1   2   3   4   5 
## 353 371 365 382 123

Merging days 29-31 of a month (Week No. 5) into Week No. 4

# Cap the week number at 4 so that week 5 (days 29-31) joins week 4, then
# store it as a factor so it enters the models as a categorical predictor.
data$Review.Week.No <- as.factor(pmin(data$Review.Week.No, 4))

Adding a Column for "Vacation" and "Work" Type of Visit

# Collapse the visit types into two groups: "on business" becomes "Work" and
# every other visit type becomes "Vacation". Stored as a factor for lm().
data$VisitType2 <- as.factor(
  ifelse(data$VisitType == "on business", "Work", "Vacation")
)

Subsetting the data required for modeling

# List the column names to identify the columns that are dropped next.
names(data)
##  [1] "Review_Overall_Rating"  "Review_Date"            "Day_of_visit_tentative"
##  [4] "Rating_Value"           "Rating_Location"        "Rating_Sleep_Quality"  
##  [7] "Rating_Rooms"           "Rating_Cleanliness"     "Rating_Service"        
## [10] "VisitType"              "Review.time.diff"       "Review.delay"          
## [13] "Review.Week.No"         "VisitType2"
# Drop the two raw date columns and the numeric delay; the models use the
# derived categorical columns instead. Columns are removed by name rather
# than by position so the result stays correct if the column order of `data`
# ever changes.
drop_cols <- c("Review_Date", "Day_of_visit_tentative", "Review.time.diff")
data2 <- data[, !(names(data) %in% drop_cols), drop = FALSE]

Checking which attributes are highly correlated

# Pairwise correlations between the six attribute ratings.
# The columns are selected by name instead of position, and the matrix is
# printed (rounded to 2 decimals) rather than opened with View(), which
# needs an interactive session and leaves no output in a rendered report.
rating_cols <- c(
  "Rating_Value", "Rating_Location", "Rating_Sleep_Quality",
  "Rating_Rooms", "Rating_Cleanliness", "Rating_Service"
)
round(cor(data2[, rating_cols]), 2)

1. Analysis of individual attributes which impact the overall rating most

# Pooled model: regress the overall rating on the six attribute ratings.
lm1 <- lm(
  Review_Overall_Rating ~ Rating_Value + Rating_Location +
    Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness +
    Rating_Service,
  data = data2
)
# car provides vif(), used here to check the predictors for collinearity.
library(car)
## Loading required package: carData
vif(lm1)
##         Rating_Value      Rating_Location Rating_Sleep_Quality 
##             2.184989             1.325569             2.544741 
##         Rating_Rooms   Rating_Cleanliness       Rating_Service 
##             2.485250             2.754335             2.259928

There is no strong multicollinearity among the attributes (all VIF values are below 3)

# Coefficient estimates, standard errors, p-values and fit statistics for
# the pooled model.
summary(lm1)
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40040 -0.11665 -0.04069  0.23323  1.71900 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.14833    0.07302  -2.031   0.0424 *  
## Rating_Value          0.26864    0.01667  16.117  < 2e-16 ***
## Rating_Location       0.05254    0.01642   3.200   0.0014 ** 
## Rating_Sleep_Quality  0.16059    0.01823   8.809  < 2e-16 ***
## Rating_Rooms          0.17586    0.01869   9.409  < 2e-16 ***
## Rating_Cleanliness    0.10206    0.02020   5.052 4.87e-07 ***
## Rating_Service        0.28757    0.01707  16.846  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4383 on 1587 degrees of freedom
## Multiple R-squared:  0.7788, Adjusted R-squared:  0.778 
## F-statistic: 931.4 on 6 and 1587 DF,  p-value: < 2.2e-16

From the coefficients (estimates and p-values) in summary(lm1), the attributes ranked by their impact on the overall rating, in descending order, are:

Rating_Service > Rating_Value > Rating_Rooms > Rating_Sleep_Quality > Rating_Cleanliness > Rating_Location

2. Whether individuals rate a hotel higher when they visit with family/spouse/friends/solo (Vacation) vs business/colleagues (Work)

# Extend the pooled model with the Work/Vacation indicator. update() re-fits
# lm1's call with the extra term, so the six attribute ratings are carried
# over unchanged.
lm2 <- update(lm1, . ~ . + VisitType2)
vif(lm2)
##         Rating_Value      Rating_Location Rating_Sleep_Quality 
##             2.220949             1.393369             2.558190 
##         Rating_Rooms   Rating_Cleanliness       Rating_Service 
##             2.485946             2.787377             2.267515 
##           VisitType2 
##             1.107019
# Coefficient table for the model with the Work/Vacation indicator; the
# VisitType2Work row is the Work effect relative to Vacation.
summary(lm2)
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service + VisitType2, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.39800 -0.12064 -0.03805  0.23736  1.72235 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.152405   0.073942  -2.061  0.03945 *  
## Rating_Value          0.269397   0.016810  16.026  < 2e-16 ***
## Rating_Location       0.053852   0.016835   3.199  0.00141 ** 
## Rating_Sleep_Quality  0.160125   0.018284   8.758  < 2e-16 ***
## Rating_Rooms          0.175751   0.018699   9.399  < 2e-16 ***
## Rating_Cleanliness    0.101272   0.020326   4.982 6.96e-07 ***
## Rating_Service        0.287216   0.017104  16.793  < 2e-16 ***
## VisitType2Work        0.009258   0.026130   0.354  0.72315    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4385 on 1586 degrees of freedom
## Multiple R-squared:  0.7788, Adjusted R-squared:  0.7779 
## F-statistic: 797.9 on 7 and 1586 DF,  p-value: < 2.2e-16

The estimate for "VisitType2Work" (0.009258) is slightly greater than zero, so individuals on a "Work" visit rate slightly higher than individuals on a "Vacation" visit; however, the difference is not statistically significant (p = 0.723).

3. Which attribute is more important in which type of visit

# One data subset per visit type. %in% never returns NA, so a row with a
# missing VisitType is left out of every subset.
data2.couple <- data2[data2$VisitType %in% "as a couple", ]
data2.business <- data2[data2$VisitType %in% "on business", ]
data2.solo <- data2[data2$VisitType %in% "solo", ]
data2.family <- data2[data2$VisitType %in% "with family", ]
data2.friends <- data2[data2$VisitType %in% "with friends", ]

# Re-fit the pooled specification (lm1: overall rating on the six attribute
# ratings) on each subset; update() swaps only the `data` argument.
lm.couple <- update(lm1, data = data2.couple)

lm.business <- update(lm1, data = data2.business)

lm.solo <- update(lm1, data = data2.solo)

lm.family <- update(lm1, data = data2.family)

lm.friends <- update(lm1, data = data2.friends)
summary(lm.couple)
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service, data = data2.couple)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.27870 -0.08032 -0.03107  0.22852  1.41349 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.16590    0.13384  -1.240 0.215791    
## Rating_Value          0.19398    0.02984   6.501 2.13e-10 ***
## Rating_Location       0.13573    0.03226   4.207 3.12e-05 ***
## Rating_Sleep_Quality  0.15764    0.02892   5.451 8.29e-08 ***
## Rating_Rooms          0.15120    0.03222   4.693 3.58e-06 ***
## Rating_Cleanliness    0.13110    0.03455   3.794 0.000168 ***
## Rating_Service        0.27960    0.03109   8.994  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.391 on 449 degrees of freedom
## Multiple R-squared:  0.7786, Adjusted R-squared:  0.7756 
## F-statistic: 263.1 on 6 and 449 DF,  p-value: < 2.2e-16

Attributes which are more important are:

Rating_Service > Rating_Value > Rating_Sleep_Quality > Rating_Rooms > Rating_Location > Rating_Cleanliness :: ALL ARE HIGHLY SIGNIFICANT

# Coefficient table for the "on business" subset.
summary(lm.business) 
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service, data = data2.business)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76356 -0.17601 -0.04095  0.20431  1.44053 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.12738    0.14753  -0.863 0.388400    
## Rating_Value          0.22683    0.03078   7.368 9.31e-13 ***
## Rating_Location       0.01844    0.02812   0.656 0.512406    
## Rating_Sleep_Quality  0.15663    0.04209   3.721 0.000225 ***
## Rating_Rooms          0.17254    0.03572   4.830 1.92e-06 ***
## Rating_Cleanliness    0.11266    0.04049   2.783 0.005638 ** 
## Rating_Service        0.35039    0.03269  10.718  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4247 on 418 degrees of freedom
## Multiple R-squared:  0.7382, Adjusted R-squared:  0.7344 
## F-statistic: 196.4 on 6 and 418 DF,  p-value: < 2.2e-16

Rating_Service > Rating_Value > Rating_Rooms > Rating_Sleep_Quality > Rating_Cleanliness > Rating_Location (but Location is not significant)

# Coefficient table for the "solo" subset.
summary(lm.solo)   
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service, data = data2.solo)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81307 -0.11753 -0.02317  0.21690  0.68162 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.35428    0.22038  -1.608 0.113087    
## Rating_Value          0.22981    0.08153   2.819 0.006494 ** 
## Rating_Location       0.07465    0.06785   1.100 0.275528    
## Rating_Sleep_Quality  0.14465    0.10327   1.401 0.166374    
## Rating_Rooms          0.22056    0.06960   3.169 0.002391 ** 
## Rating_Cleanliness    0.09434    0.09604   0.982 0.329834    
## Rating_Service        0.33034    0.08027   4.115 0.000118 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3678 on 61 degrees of freedom
## Multiple R-squared:  0.9037, Adjusted R-squared:  0.8942 
## F-statistic: 95.39 on 6 and 61 DF,  p-value: < 2.2e-16

Rating_Service > Rating_Value > Rating_Rooms (REMAINING ATTRIBUTES ARE NOT SIGNIFICANT)

# Coefficient table for the "with family" subset.
summary(lm.family) 
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service, data = data2.family)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.43030 -0.23947 -0.01879  0.31318  1.36656 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.16411    0.15532  -1.057   0.2913    
## Rating_Value          0.38919    0.03426  11.361  < 2e-16 ***
## Rating_Location       0.03433    0.03667   0.936   0.3496    
## Rating_Sleep_Quality  0.16318    0.03831   4.259 2.52e-05 ***
## Rating_Rooms          0.16958    0.03979   4.262 2.49e-05 ***
## Rating_Cleanliness    0.07552    0.04093   1.845   0.0657 .  
## Rating_Service        0.21376    0.03305   6.467 2.70e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4983 on 432 degrees of freedom
## Multiple R-squared:  0.7885, Adjusted R-squared:  0.7856 
## F-statistic: 268.4 on 6 and 432 DF,  p-value: < 2.2e-16

Rating_Value > Rating_Service > Rating_Rooms > Rating_Sleep_Quality (REMAINING ATTRIBUTES ARE NOT SIGNIFICANT)

# Coefficient table for the "with friends" subset.
summary(lm.friends) 
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service, data = data2.friends)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.49539 -0.09348 -0.06782  0.20255  1.62695 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.03484    0.23291  -0.150 0.881260    
## Rating_Value          0.21912    0.05148   4.257 3.20e-05 ***
## Rating_Location       0.07762    0.05608   1.384 0.167907    
## Rating_Sleep_Quality  0.15644    0.04310   3.629 0.000361 ***
## Rating_Rooms          0.20078    0.05804   3.459 0.000663 ***
## Rating_Cleanliness    0.04081    0.05997   0.681 0.496936    
## Rating_Service        0.33090    0.05541   5.972 1.06e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4298 on 199 degrees of freedom
## Multiple R-squared:  0.7634, Adjusted R-squared:  0.7563 
## F-statistic:   107 on 6 and 199 DF,  p-value: < 2.2e-16

Rating_Service > Rating_Value > Rating_Rooms > Rating_Sleep_Quality (REMAINING ATTRIBUTES ARE NOT SIGNIFICANT)

4. Whether individuals rate hotels more when they "review" in the first week of a month

NOTE: Since "Visit Date" only contains dates from the 3rd week (day 15 of the month), we cannot compare the overall rating by the week no. of the "visit date". That's why I am doing this analysis for "Review Date" instead.

# Add the week-of-month of the review (a factor) to the pooled specification
# by re-fitting lm1's call with the extra term.
lm3 <- update(lm1, . ~ . + Review.Week.No)

# Each Review.Week.No row is that week's effect relative to week 1.
summary(lm3)
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service + Review.Week.No, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3650 -0.1150 -0.0456  0.2277  1.7527 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.12612    0.07678  -1.643  0.10065    
## Rating_Value          0.26841    0.01667  16.103  < 2e-16 ***
## Rating_Location       0.05172    0.01646   3.142  0.00171 ** 
## Rating_Sleep_Quality  0.16088    0.01824   8.820  < 2e-16 ***
## Rating_Rooms          0.17642    0.01869   9.437  < 2e-16 ***
## Rating_Cleanliness    0.10159    0.02024   5.019 5.77e-07 ***
## Rating_Service        0.28784    0.01710  16.832  < 2e-16 ***
## Review.Week.No2      -0.01869    0.03274  -0.571  0.56811    
## Review.Week.No3      -0.05502    0.03275  -1.680  0.09317 .  
## Review.Week.No4      -0.01090    0.03052  -0.357  0.72110    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4383 on 1584 degrees of freedom
## Multiple R-squared:  0.7793, Adjusted R-squared:  0.778 
## F-statistic: 621.4 on 9 and 1584 DF,  p-value: < 2.2e-16

The estimates for weeks 2, 3 and 4 are all negative, so individuals who review in week 1 rate slightly higher; however, none of these differences is statistically significant at the 5% level.

5. Whether individuals rate hotels more when they rate much later from the date of the visit

# Add the review-delay bucket (a factor) to the pooled specification by
# re-fitting lm1's call with the extra term.
lm4 <- update(lm1, . ~ . + Review.delay)

# Each Review.delay row is that bucket's effect relative to the "0-100"
# bucket, which is the first factor level.
summary(lm4)
## 
## Call:
## lm(formula = Review_Overall_Rating ~ Rating_Value + Rating_Location + 
##     Rating_Sleep_Quality + Rating_Rooms + Rating_Cleanliness + 
##     Rating_Service + Review.delay, data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3855 -0.1481 -0.0313  0.2425  1.6576 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -0.16208    0.07332  -2.211 0.027212 *  
## Rating_Value                 0.26515    0.01665  15.927  < 2e-16 ***
## Rating_Location              0.05705    0.01646   3.466 0.000543 ***
## Rating_Sleep_Quality         0.16289    0.01820   8.948  < 2e-16 ***
## Rating_Rooms                 0.16943    0.01873   9.045  < 2e-16 ***
## Rating_Cleanliness           0.10286    0.02015   5.105  3.7e-07 ***
## Rating_Service               0.29097    0.01709  17.026  < 2e-16 ***
## Review.delay100-200          0.06844    0.04179   1.638 0.101654    
## Review.delay200-400          0.03595    0.03923   0.916 0.359607    
## Review.delay400+             0.36339    0.11801   3.079 0.002110 ** 
## Review.delayBefore Visiting -0.03776    0.03412  -1.107 0.268643    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4369 on 1583 degrees of freedom
## Multiple R-squared:  0.7808, Adjusted R-squared:  0.7794 
## F-statistic: 563.9 on 10 and 1583 DF,  p-value: < 2.2e-16

Individuals who rate much later than the visit date (i.e. 400+ days) rate higher: the estimate for this group (0.363, p = 0.002, relative to the 0-100 day baseline) is much greater than the estimates for the other delay groups.