# Load the Expedia data set; the first row of the CSV holds the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
expedia <- read.csv("/Users/yanghaoying/Desktop/277 expedia/expedia.csv",
                    header = TRUE)
# NOTE(review): attach() places a copy of the columns on the search path as
# they are at this point. Later reassignments of `expedia` (such as the
# outlier filter) are NOT reflected in that copy, so always refer to columns
# through `expedia$col` or `data = expedia`. Kept only in case code elsewhere
# in the file relies on the attached names.
attach(expedia)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MatchIt)
library(ggplot2)
library('corrplot')
## corrplot 0.84 loaded
library(tidyr)
# Visualize the distribution of the variables using boxplots.
#' Draw a single boxplot of a vector of values to expose outliers.
#'
#' @param input Vector of values to plot on the y axis.
#' @param na.rm If `TRUE` (the default) missing values are removed silently;
#'   if `FALSE` ggplot2 warns when it removes them.
#' @return A ggplot object titled "Outliers"; it is drawn when printed.
Graph_Boxplot <- function(input, na.rm = TRUE) {
  # Build the plotting data from `input` itself instead of the global
  # `expedia`, so the function works for any vector regardless of how many
  # rows `expedia` currently has.
  plot_data <- data.frame(input = input)
  # The original also mapped `fill` to the continuous `input`; a single box
  # cannot show a per-observation fill, so that mapping is dropped here.
  ggplot(plot_data, aes(x = "", y = input)) +
    geom_boxplot(color = "green", na.rm = na.rm) +
    labs(title = "Outliers")
}
# Draw one outlier boxplot per variable of interest.
# print() is explicit because ggplot objects returned inside lapply() are not
# auto-printed the way top-level expressions are.
invisible(lapply(
  c("trip_roomN", "trip_peopleN", "hotel_price", "book_before_time"),
  function(column) print(Graph_Boxplot(expedia[[column]]))
))
# The boxplots reveal some unrealistic outliers, so we delete those observations.
# Delete the outliers in hotel_price.
# Keep only rows whose hotel_price is below 4000 (dplyr's filter also drops
# rows where the condition evaluates to NA).
expedia <- expedia %>%
  dplyr::filter(hotel_price < 4000)
# Visualize the relationship between hotel_price and the other variables.
# Boxplots of hotel_price grouped by room count, party size and star rating.
# The y axis is clipped to 0-1000 in every panel.
boxplot(
  hotel_price ~ trip_roomN,
  data = expedia,
  ylim = c(0, 1000),
  xlab = "trip_roomN", ylab = "hotel_price"
)
boxplot(
  hotel_price ~ trip_peopleN,
  data = expedia,
  ylim = c(0, 1000),
  xlab = "trip_peopleN", ylab = "hotel_price"
)
boxplot(
  hotel_price ~ hotel_star,
  data = expedia,
  ylim = c(0, 1000),
  xlab = "hotel_star", ylab = "hotel_price"
)
# Scatter plot of hotel_price against book_before_time.
# Scatter plot with book_before_time on the x axis and hotel_price on the
# y axis; the y axis is fixed to 0-5000.
with(expedia, plot(
  x = book_before_time,
  y = hotel_price,
  xlab = "book_before_time",
  ylab = "price",
  main = "Relationship Btw price and book_before_time",
  ylim = c(0, 5000),
  col = "brown"
))
# Relationship of hotel_price with the other features.
# Reshape to long format: hotel_price is kept as its own column and every
# other column is stacked into (variable, value) pairs.
# gather() is superseded by pivot_longer(); it is kept here so the row order
# of the result stays exactly as before.
Graph <- expedia %>%
  gather(key = "variable", value = "value", -hotel_price)
head(Graph)
## hotel_price variable value
## 1 440 X 1
## 2 359 X 2
## 3 449 X 3
## 4 90 X 4
## 5 413 X 5
## 6 339 X 6
# Jittered points plus a straight-line fit (no confidence band) of
# hotel_price against each variable; one facet per variable, each with its
# own x scale. The aesthetics are declared once and inherited by both layers.
ggplot(Graph, aes(x = value, y = hotel_price, colour = variable)) +
  geom_jitter() +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(~variable, scales = "free_x") +
  labs(title = "Relation Of Price With Other Features")
# Use the correlation matrix to examine the relationships between variables, in order to simplify the variable set.
# Select the candidate predictor columns, compute their pairwise correlation
# matrix and draw it with circles.
s_data <- expedia[
  c(
    "trip_peopleN", "hotel_star",
    "city1", "city2", "city3", "city4",
    "book_before_time",
    "Seasonality.Q1", "Seasonality.Q2", "Seasonality.Q3", "Seasonality.Q4"
  )
]
M <- cor(s_data)
corrplot(M, method = "circle")
# Use stepwise regression to select the important variables.
# Stepwise selection starting from the model with every other column as a
# predictor of hotel_price; progress output is silenced (trace = 0).
# NOTE(review): `.` also brings in `X`, which looks like a row index rather
# than a real feature - confirm whether it should be a candidate at all.
stpModel <- step(
  lm(hotel_price ~ ., data = expedia),
  steps = 1000,
  trace = 0
)
stpSummary <- summary(stpModel)
stpSummary
##
## Call:
## lm(formula = hotel_price ~ X + trip_peopleN + hotel_star + city1 +
## city2 + city3 + book_before_time + Seasonality.Q1 + Seasonality.Q2 +
## Seasonality.Q3, data = expedia)
##
## Residuals:
## Min 1Q Median 3Q Max
## -407.16 -63.12 -16.75 42.06 2607.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.190e+01 1.948e+00 -47.176 <2e-16 ***
## X -2.024e-04 1.291e-05 -15.680 <2e-16 ***
## trip_peopleN 1.201e+01 6.095e-01 19.708 <2e-16 ***
## hotel_star 7.996e+01 4.211e-01 189.908 <2e-16 ***
## city1 -1.566e+02 1.073e+00 -145.872 <2e-16 ***
## city2 -3.254e+01 1.052e+00 -30.929 <2e-16 ***
## city3 1.014e+02 9.829e-01 103.165 <2e-16 ***
## book_before_time 2.290e-01 1.358e-02 16.867 <2e-16 ***
## Seasonality.Q1 -1.781e+01 1.595e+00 -11.168 <2e-16 ***
## Seasonality.Q2 -2.851e+01 2.904e+00 -9.817 <2e-16 ***
## Seasonality.Q3 -7.760e+01 4.683e+00 -16.571 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 111 on 96847 degrees of freedom
## Multiple R-squared: 0.4458, Adjusted R-squared: 0.4457
## F-statistic: 7790 on 10 and 96847 DF, p-value: < 2.2e-16
# Build a regression model based on the result of the stepwise regression.
# Linear model of hotel_price on party size, star rating, three city
# indicators, book_before_time and three seasonality indicators.
Model1 <- hotel_price ~ trip_peopleN + hotel_star +
  city1 + city2 + city3 +
  book_before_time +
  Seasonality.Q1 + Seasonality.Q2 + Seasonality.Q3
fit1 <- lm(formula = Model1, data = expedia)
summary(fit1)
##
## Call:
## lm(formula = Model1, data = expedia)
##
## Residuals:
## Min 1Q Median 3Q Max
## -402.12 -63.12 -16.58 42.03 2615.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -101.80840 1.84484 -55.19 <2e-16 ***
## trip_peopleN 11.64335 0.60979 19.09 <2e-16 ***
## hotel_star 79.81511 0.42149 189.37 <2e-16 ***
## city1 -154.78386 1.06862 -144.84 <2e-16 ***
## city2 -31.45015 1.05112 -29.92 <2e-16 ***
## city3 103.31350 0.97652 105.80 <2e-16 ***
## book_before_time 0.23624 0.01359 17.39 <2e-16 ***
## Seasonality.Q1 -19.32017 1.59429 -12.12 <2e-16 ***
## Seasonality.Q2 -30.60961 2.90497 -10.54 <2e-16 ***
## Seasonality.Q3 -79.82501 4.68661 -17.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 111.1 on 96848 degrees of freedom
## Multiple R-squared: 0.4444, Adjusted R-squared: 0.4443
## F-statistic: 8607 on 9 and 96848 DF, p-value: < 2.2e-16