# Load the Expedia data set from a machine-specific absolute path.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
expedia <- read.csv('/Users/yanghaoying/Desktop/277 expedia/expedia.csv', header = TRUE)
# NOTE(review): attach() exposes the columns as they are at this point; if
# `expedia` is reassigned later, bare column names still resolve to this
# attached copy. Prefer `expedia$col` or `data = expedia`.
attach(expedia)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MatchIt)
library(ggplot2)
library('corrplot') 
## corrplot 0.84 loaded
library(tidyr)

Visualize the distribution of the variables using boxplots.

# Draw a single boxplot of a vector so that outliers stand out.
#
# Args:
#   input: Vector of values to plot, e.g. one column of a data frame.
#   na.rm: Forwarded to geom_boxplot(). If TRUE, missing values are removed
#     silently; if FALSE, they are removed with a warning.
#
# Returns:
#   A ggplot object titled "Outliers" whose y axis is labelled with the
#   expression that was passed as `input`.
Graph_Boxplot <- function (input, na.rm = TRUE){
  # Capture the caller's expression so each plot identifies its variable.
  input_label <- paste(deparse(substitute(input)), collapse = "")
  # Plot from a local data frame so the function does not depend on a global
  # data set having the same number of rows as `input`.
  plot_data <- data.frame(value = input)
  # No fill mapping: a continuous value cannot vary within a single box.
  ggplot(plot_data, aes(x = "", y = value)) +
    geom_boxplot(color = "green", na.rm = na.rm) +
    labs(title = "Outliers", y = input_label)
}

# One outlier boxplot per variable of interest.
Graph_Boxplot(expedia[["trip_roomN"]])

Graph_Boxplot(expedia[["trip_peopleN"]])

Graph_Boxplot(expedia[["hotel_price"]])

Graph_Boxplot(expedia[["book_before_time"]])

From the boxplots, we can see that there are some unrealistic outliers, so we delete those observations.

Delete outliers in hotel_price (keep only rows with hotel_price below 4000).

# Keep only the rows whose hotel_price is below 4000.
expedia <- expedia %>%
  filter(hotel_price < 4000)

Visualize the relationship between hotel_price and the other variables.

# hotel_price grouped by trip_roomN; y axis limited to 0-1000.
boxplot(hotel_price ~ trip_roomN, data = expedia,
        xlab = "trip_roomN", ylab = "hotel_price", ylim = c(0, 1000))

# hotel_price grouped by trip_peopleN; y axis limited to 0-1000.
boxplot(hotel_price ~ trip_peopleN, data = expedia,
        xlab = "trip_peopleN", ylab = "hotel_price", ylim = c(0, 1000))

# hotel_price grouped by hotel_star; y axis limited to 0-1000.
boxplot(hotel_price ~ hotel_star, data = expedia,
        xlab = "hotel_star", ylab = "hotel_price", ylim = c(0, 1000))

Scatter plot of hotel_price against book_before_time.

# Scatter of hotel_price (y) against book_before_time (x); y axis spans 0-5000.
plot(expedia$book_before_time, expedia$hotel_price,
     main = "Relationship Btw price and book_before_time",
     xlab = "book_before_time", ylab = "price",
     ylim = c(0, 5000),
     col = "brown")

Relationship of hotel price with the other features.

# Reshape to long format: every column except hotel_price is stacked into
# `variable` (column name) / `value` (cell value) pairs.
# NOTE(review): this also stacks `X`, which looks like the CSV row index
# rather than a feature -- confirm whether it should be excluded.
Graph <- expedia %>%
  gather(key = "variable", value = "value", -hotel_price)
head(Graph, n = 6)
##   hotel_price variable value
## 1         440        X     1
## 2         359        X     2
## 3         449        X     3
## 4          90        X     4
## 5         413        X     5
## 6         339        X     6
# One panel per variable: jittered points plus a linear fit of hotel_price on
# the variable's value. The aesthetics are shared by both layers, so they are
# declared once on the plot.
ggplot(Graph, aes(x = value, y = hotel_price, colour = variable)) +
  geom_jitter() +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(~variable, scales = "free_x") +
  labs(title = "Relation Of Price With Other Features")

Use the correlation function to find relationships between the variables, in order to reduce the number of variables.

# Select the candidate predictors, compute their pairwise correlation
# matrix, and draw it as a circle-style correlation plot.
s_data <- expedia[, c("trip_peopleN", "hotel_star",
                      "city1", "city2", "city3", "city4",
                      "book_before_time",
                      "Seasonality.Q1", "Seasonality.Q2",
                      "Seasonality.Q3", "Seasonality.Q4")]
M <- cor(s_data)
corrplot(M, method = "circle")

Use stepwise regression to select important variables.

# Stepwise selection starting from the model with every column as a predictor
# of hotel_price; per-step tracing is switched off.
# NOTE(review): `hotel_price ~ .` also offers `X` as a predictor, which looks
# like the CSV row index -- confirm whether it belongs in the candidate set.
stpModel <- step(lm(hotel_price ~ ., data = expedia), trace = 0, steps = 1000)
stpSummary <- summary(stpModel)
stpSummary
## 
## Call:
## lm(formula = hotel_price ~ X + trip_peopleN + hotel_star + city1 + 
##     city2 + city3 + book_before_time + Seasonality.Q1 + Seasonality.Q2 + 
##     Seasonality.Q3, data = expedia)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -407.16  -63.12  -16.75   42.06 2607.00 
## 
## Coefficients:
##                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)      -9.190e+01  1.948e+00  -47.176   <2e-16 ***
## X                -2.024e-04  1.291e-05  -15.680   <2e-16 ***
## trip_peopleN      1.201e+01  6.095e-01   19.708   <2e-16 ***
## hotel_star        7.996e+01  4.211e-01  189.908   <2e-16 ***
## city1            -1.566e+02  1.073e+00 -145.872   <2e-16 ***
## city2            -3.254e+01  1.052e+00  -30.929   <2e-16 ***
## city3             1.014e+02  9.829e-01  103.165   <2e-16 ***
## book_before_time  2.290e-01  1.358e-02   16.867   <2e-16 ***
## Seasonality.Q1   -1.781e+01  1.595e+00  -11.168   <2e-16 ***
## Seasonality.Q2   -2.851e+01  2.904e+00   -9.817   <2e-16 ***
## Seasonality.Q3   -7.760e+01  4.683e+00  -16.571   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 111 on 96847 degrees of freedom
## Multiple R-squared:  0.4458, Adjusted R-squared:  0.4457 
## F-statistic:  7790 on 10 and 96847 DF,  p-value: < 2.2e-16

Build a regression model based on the result of the stepwise regression.

# Linear model of hotel_price on the listed predictors. The formula is stored
# in `Model1` and passed by name so the fitted call reads
# `lm(formula = Model1, data = expedia)`.
Model1 <- reformulate(
  c("trip_peopleN", "hotel_star",
    "city1", "city2", "city3",
    "book_before_time",
    "Seasonality.Q1", "Seasonality.Q2", "Seasonality.Q3"),
  response = "hotel_price"
)
fit1 <- lm(formula = Model1, data = expedia)
summary(fit1)
## 
## Call:
## lm(formula = Model1, data = expedia)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -402.12  -63.12  -16.58   42.03 2615.21 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -101.80840    1.84484  -55.19   <2e-16 ***
## trip_peopleN       11.64335    0.60979   19.09   <2e-16 ***
## hotel_star         79.81511    0.42149  189.37   <2e-16 ***
## city1            -154.78386    1.06862 -144.84   <2e-16 ***
## city2             -31.45015    1.05112  -29.92   <2e-16 ***
## city3             103.31350    0.97652  105.80   <2e-16 ***
## book_before_time    0.23624    0.01359   17.39   <2e-16 ***
## Seasonality.Q1    -19.32017    1.59429  -12.12   <2e-16 ***
## Seasonality.Q2    -30.60961    2.90497  -10.54   <2e-16 ***
## Seasonality.Q3    -79.82501    4.68661  -17.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 111.1 on 96848 degrees of freedom
## Multiple R-squared:  0.4444, Adjusted R-squared:  0.4443 
## F-statistic:  8607 on 9 and 96848 DF,  p-value: < 2.2e-16