Purpose
The aim of this project is to mine the listing price information from Airbnb in San Francisco. The exploration results could refer useful knowledge for hosts who search for comparable price and make their listing stand out.
Data
bnb <- read.csv("train.csv")
bnb <- bnb[bnb$city == "SF", ]
names(bnb)## [1] "id" "log_price"
## [3] "property_type" "room_type"
## [5] "amenities" "accommodates"
## [7] "bathrooms" "bed_type"
## [9] "cancellation_policy" "cleaning_fee"
## [11] "city" "description"
## [13] "first_review" "host_has_profile_pic"
## [15] "host_identity_verified" "host_response_rate"
## [17] "host_since" "instant_bookable"
## [19] "last_review" "latitude"
## [21] "longitude" "name"
## [23] "neighbourhood" "number_of_reviews"
## [25] "review_scores_rating" "thumbnail_url"
## [27] "zipcode" "bedrooms"
## [29] "beds"
library(mice)
library(ggplot2)
library(dplyr)
library(sampling)
library(ggmap)
library(scales)
library(caret)
library(stringr)
library(ggalt)
library(FNN)
library(rpart)
library(rpart.plot)
library(car)
library(leaps)## Round log_price to 4th decimal
bnb$log_price <- round(bnb$log_price, 4)
## Identify missing value for each attribute
bnb[bnb == '' | bnb == 'NA'] <- NA
colSums(is.na(bnb))## id log_price property_type
## 0 0 0
## room_type amenities accommodates
## 0 0 0
## bathrooms bed_type cancellation_policy
## 17 0 0
## cleaning_fee city description
## 0 0 0
## first_review host_has_profile_pic host_identity_verified
## 1317 2 2
## host_response_rate host_since instant_bookable
## 1989 2 0
## last_review latitude longitude
## 1317 0 0
## name neighbourhood number_of_reviews
## 0 6 0
## review_scores_rating thumbnail_url zipcode
## 1387 480 80
## bedrooms beds
## 6 11
## Remove columns: amenities, description, first_review, host_since, last_review, name, neighbourhood, thumbnail_url, zipcode
new <- bnb[ , -c(5, 12, 13, 17, 19, 22, 23, 26, 27)]The mice package helps analyst to impute missing values with plausible data values. Those plausible values are drawn from a distribution specifically designed for each missing datapoint.
## Use CART to impute the missing data
mice.data <- mice(new,
m = 3,
maxit = 50,
method = "cart",
seed = 188)
## Since m = 3 means 3 imputed datasets. Select 2nd dataset to run our model.
new <- complete(mice.data, 2)
## Save new imputed data as "newdf.csv" file
write.csv(new, 'newdf.csv')
The missing values have been replaced with the imputed values in the three datasets.
new.df <- read.csv("newdf.csv")
new.df <- new.df[ , -1]# Boxplot
theme_set(theme_classic())
g <- ggplot(new.df, aes(x = cancellation_policy, y = log_price))
g <- g + geom_boxplot(varwidth = T, aes(fill = factor(cancellation_policy)))
g <- g + labs(title ="Box plot",
subtitle = "Log_Price Grouped by Cancellation Policy",
caption = "Source: Airbnb Data",
x = "Cancellation Policy",
y = "Log_Price"); g
There are several characteristics in the groups of cancellation policy. First, those hosts who choose strict cancellation policy, their listing is a little bit pricey than those who choose flexible and moderate. On the other hand, hosts who choose the flexible or moderate cancellation policy have similar listing price.
## Apply the median of log_price as parameter
new.df %>% group_by(bed_type) %>% summarise(Avg_log_price = median(log_price,na.rm=T)) %>%
ggplot(aes(x = bed_type, y = Avg_log_price, fill = bed_type))+geom_bar(stat="identity") +
labs(title = "Bar Chart",
subtitle = "Bed_Type and Avg_Log_Price",
caption = "Source = Airbnb Data",
x = "Bed_Type",
y = "Avg_Log_Price")
According to the bar chart which measured by the listing price, The average price of real bed is a little bit pricey than those other bed types.
## Slice data
price_geo <- data.frame("Longitude" = new.df$longitude,
"Latitude" = new.df$latitude,
"Room type" = new.df$room_type,
"Log Price" = new.df$log_price)
## Stratified sampling the data with sample size = 40 from each room type
set.seed(50)
st.1 <- strata(price_geo, stratanames = c("Room.type"),
size = rep(40, 3), method = "srswor",
description = TRUE)## Stratum 1
##
## Population total and number of selected units: 3818 40
## Stratum 2
##
## Population total and number of selected units: 2518 40
## Stratum 3
##
## Population total and number of selected units: 98 40
## Number of strata 3
## Total number of selected units 120
sample1 <- getdata(price_geo, st.1)
## Plot the distribution
gg <- ggplot(sample1, aes(x = Longitude, y = Latitude)) +
geom_point(aes(col = Room.type, size = Log.Price)) +
xlim(c(-122.35, -122.55)) +
ylim(c(37.70, 37.83)) +
labs(title = "Scatter Plot",
subtitle =" The Geographic Location of Log_Price & Room Type",
x = "Longitude",
y = "Latitude",
caption = "Source: Airbnb"); gg
The scatter plot is based on spatial attributes of the Airbnb hosts; the bigger dots represent the higher listing price. The price drop when homes’ location far from the center of San Francisco, especially for shared room type.
lombard <- qmap("Fay Park san francisco", zoom = 17, source = "google", maptype="roadmap")
lombard +
geom_point(aes(x = longitude, y = latitude, colour = bed_type), data = new.df)
Tthere are several Airbnb homes around the Lombard Street, giving you a leisurely walk on Lombard Street and more time to take in everything without worrying about terrible traffic.
r <- ggplot(new.df %>% count(instant_bookable, cancellation_policy) %>%
mutate(pct = n/sum(n),
ypos = cumsum(n) - 0.5*n),
aes(instant_bookable, n, fill = cancellation_policy))
r <- r + geom_bar(stat = "identity") +
geom_text(aes(label = paste0(sprintf("%1.1f", pct*100), "%")), position = position_stack(vjust = 0.5))
r <- r + labs(title = "Stacked Bar Chart",
subtitle = "Instant Bookable and Cancellation Policy",
caption = "Source = Airbnb Data",
x = "Instant Book",
y = "Number of Hosts"); r
The majority of Airbnb hosts would rather choose strict cancellation policy than apply instant book service, even though the instant book service is convenient and time-saving for reservation.
To predict and mine the pattern from the Airbnb data, in this section, the exploration would base on multiple linear regression model, which is used to predict the relationship between success metric, log_price, and tracking metrics.
## Exclude id, property_type, city and host_response_rate
reg.data <- new.df[, -c(1, 3, 10, 13)]
## Dummify the data
dmy3 <- dummyVars(" ~ .", data = reg.data, fullRank = T)
reg.data1 <- data.frame(predict(dmy3, newdata = reg.data))
names(reg.data1)## [1] "log_price"
## [2] "room_type.Private.room"
## [3] "room_type.Shared.room"
## [4] "accommodates"
## [5] "bathrooms"
## [6] "bed_type.Couch"
## [7] "bed_type.Futon"
## [8] "bed_type.Pull.out.Sofa"
## [9] "bed_type.Real.Bed"
## [10] "cancellation_policy.moderate"
## [11] "cancellation_policy.strict"
## [12] "cancellation_policy.super_strict_30"
## [13] "cancellation_policy.super_strict_60"
## [14] "cleaning_fee.True"
## [15] "host_has_profile_pic.t"
## [16] "host_identity_verified.t"
## [17] "instant_bookable.t"
## [18] "latitude"
## [19] "longitude"
## [20] "number_of_reviews"
## [21] "review_scores_rating"
## [22] "bedrooms"
## [23] "beds"
set.seed(444)
train_row123 <- sample(1:nrow(reg.data1), 0.7*nrow(reg.data1))
train1 <- reg.data1[train_row123, ]
valid1 <- reg.data1[-train_row123, ]The train() function provides easy workflow to perform stepwise selection
set.seed(123)
## Set up repeated K-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
## Train the regression model
stepwise.model <- train(log_price ~., data = train1,
method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:8),
trControl = train.control)## Warning in leaps.setup(x, y, wt = weights, nbest = nbest, nvmax = nvmax, :
## 1 linear dependencies found
## Reordering variables and trying again:
stepwise.model$results## nvmax RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 0.5693850 0.2894897 0.4329988 0.02309238 0.05236276 0.017127791
## 2 2 0.5065515 0.4373938 0.3746995 0.01151692 0.02524358 0.007769011
## 3 3 0.4918162 0.4697955 0.3604530 0.01175457 0.02054009 0.008300569
## 4 4 0.4719074 0.5120787 0.3481340 0.01483569 0.02832485 0.010149178
## 5 5 0.4572431 0.5417907 0.3371585 0.01803487 0.03301511 0.012593670
## 6 6 0.4505919 0.5555641 0.3303728 0.01540842 0.03020175 0.010401772
## 7 7 0.4473687 0.5619810 0.3300174 0.01687434 0.03295554 0.011991756
## 8 8 0.4462818 0.5637612 0.3289323 0.01717270 0.03405192 0.013822664
stepwise.model$bestTune## nvmax
## 8 8
coef(stepwise.model$finalModel, 8)## (Intercept) room_type.Private.room room_type.Shared.room
## -2.311549e+02 -4.747764e-01 -1.064908e+00
## accommodates bathrooms cleaning_fee.True
## 8.768883e-02 1.031460e-01 -1.325458e-01
## latitude review_scores_rating bedrooms
## 6.227619e+00 8.496271e-03 1.574902e-01
reg_lm <- lm(log_price ~ room_type.Private.room + room_type.Shared.room + accommodates +
bathrooms + cleaning_fee.True + latitude + review_scores_rating + bedrooms,
data = train1)
summary(reg_lm)##
## Call:
## lm(formula = log_price ~ room_type.Private.room + room_type.Shared.room +
## accommodates + bathrooms + cleaning_fee.True + latitude +
## review_scores_rating + bedrooms, data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2716 -0.2725 -0.0280 0.2189 2.6371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.312e+02 1.142e+01 -20.249 < 2e-16 ***
## room_type.Private.room -4.748e-01 1.594e-02 -29.784 < 2e-16 ***
## room_type.Shared.room -1.065e+00 5.626e-02 -18.930 < 2e-16 ***
## accommodates 8.769e-02 5.405e-03 16.224 < 2e-16 ***
## bathrooms 1.031e-01 1.427e-02 7.229 5.69e-13 ***
## cleaning_fee.True -1.325e-01 1.546e-02 -8.571 < 2e-16 ***
## latitude 6.228e+00 3.022e-01 20.608 < 2e-16 ***
## review_scores_rating 8.496e-03 8.301e-04 10.235 < 2e-16 ***
## bedrooms 1.575e-01 1.179e-02 13.353 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4418 on 4494 degrees of freedom
## Multiple R-squared: 0.5726, Adjusted R-squared: 0.5718
## F-statistic: 752.6 on 8 and 4494 DF, p-value: < 2.2e-16
## Evaluate collinearity
vif(reg_lm)## room_type.Private.room room_type.Shared.room accommodates
## 1.394746 1.070076 2.660912
## bathrooms cleaning_fee.True latitude
## 1.457249 1.058013 1.040871
## review_scores_rating bedrooms
## 1.006354 2.487914
## Evaluate the Model
layout(matrix(1:4, 2, 2))
plot(reg_lm)
The result of vif () shows the GVIF value of each attribute is less than 10, which means the multi-collinearity doesn’t exist.
The plot in the upper left shows the residual errors versus their fitted values. The residuals should be randomly distributed around the horizontal line representing a residual error of zero; that is, there should not be a distinct trend in the distribution of points.
The plot in the lower left is a standard Q-Q plot, which shows the residual errors are normally distributed.
The scale-location plot in the upper right shows the square root of the standardized residuals as a function of the fitted values. The plot shows no obvious trend in this plot.
At last, the plot in the lower right shows each points leverage, which is a measure of its importance in determining the regression result. The smaller distances means that removing the observation has little affect on the regression results.
## Data Prep
cart_data <- new.df
summary(cart_data$log_price)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.303 4.682 5.106 5.170 5.561 7.598
## Build Price Level
cart_data$price_lel <- ifelse(cart_data$log_price >= 5.561 & cart_data$log_price <= 7.598, "Pricey Digs",
ifelse(cart_data$log_price >= 5.106 & cart_data$log_price < 5.561, "Above Average",
ifelse(cart_data$log_price >= 4.682 & cart_data$log_price < 5.106, "Below Average",
"Student Budget")))
table(cart_data$price_lel)##
## Above Average Below Average Pricey Digs Student Budget
## 1592 1640 1597 1605
set.seed(213)
sample_row123 <- sample(1:nrow(cart_data), 0.8* nrow(cart_data))
train_cart <- cart_data[sample_row123, ]
valid_cart <- cart_data[-sample_row123, ]## X = bedrooms, bathrooms, accommodates, instant_bookable, cancellation policy.
## Y = price_seg
cart.model <- rpart(price_lel ~ bedrooms + bathrooms +
accommodates + instant_bookable + cancellation_policy, data = train_cart)
prp(cart.model,
faclen = 0,
fallen.leaves = TRUE,
shadow.col = "red",
extra = 2)
The Classification Results:
If bedrooms \(\ge\) 1.5, the model predicts that the listing price will fall into category Pricey Digs.
If bedrooms < 1.5, and accommodate \(\ge\) 2.5, the model predicts that the listing price will be classified as Above Average.
If bedrooms < 1.5, and accommodate \(\ge\) 1.5, and accommodate < 2.5, and bedrooms < 0.5, the model predicts that the listing price will be classified as Below Average.
If bedrooms < 1.5, and 1.5 \(\ge\) accommodate < 2.5, and bedrooms \(\ge\) 0.5, the model predicts that the listing price will be categorized as Student Budget.
If bedrooms < 1.5 and accommodate < 1.5, the model predicts that the listing price will be also classified as category Student Budget.
If a new host has 1 bedroom that could only accommodate 1 person in San Francisco, according to the model, the listing price has 66% would be classified as Student Budget. On the other hand, if a new host has 3 bedrooms at Marina District, it has 61% would be recognized as Pricey Digs level.
pred_cart <- predict(cart.model, newdata = valid_cart, type = "class")
table(real = valid_cart$price_lel, predict = pred_cart)## predict
## real Above Average Below Average Pricey Digs Student Budget
## Above Average 71 25 129 118
## Below Average 59 31 30 193
## Pricey Digs 35 3 238 39
## Student Budget 26 13 5 272
confus.matrix <- table(real = valid_cart$price_lel, predict = pred_cart)
sum(diag(confus.matrix))/sum(confus.matrix)## [1] 0.4755245
Although the accuracy of the classification tree model isn’t very high, it still illustrates the influential metrics while predicting and classifying the data.
The Airbnb data in San Francisco mined over location can provide information of the major place of the golden city. It can help us to understand the Airbnb development in various zones of the city such as residential areas, business zones, tourists attraction…etc.
Therefore, in this section, the k-means algorithm would cluster dataset into several fictional districts to analyze regional growth in San Francisco.
## Merge new.df with neighbourhood and host_since column
new.df1 <- mutate(new.df, neighbourhood = bnb$neighbourhood, host_since = bnb$host_since)
## Exclude missing value
new.df1[new.df1 == '' | new.df1 == 'NA'] <- NA
new.df1 <- na.omit(new.df1)## Compute and plot within from k = 2 to k = 10
set.seed(789)
wss <- sapply(1:10, function(k) {
kmeans(new.df1[, c(15,16)], k, nstart = 50, iter.max = 10)$tot.withinss}
)
wss## [1] 7.3390511 4.4800190 2.7830009 2.1222189 1.6542640 1.4045264 1.2178125
## [8] 1.0866226 0.9608950 0.8575679
plot(1:10, wss,
type = "b", pch = 19, frame = FALSE,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of squares")
abline(v = 5, col = "blue", lty = 2, lwd = 3)
The elbow line chart depicting the decline in cluster heterogeneity when adding more clusters.
set.seed(100)
clusters <- kmeans(new.df1[ , c(15,16)], 5)
new.df1$District <- as.factor(clusters$cluster)sf_map <- get_map("san francisco california", zoom = 12, source = "google", maptype="roadmap")
ggmap(sf_map, extent = "panel") + geom_point(aes(x = longitude, y = latitude, colour = as.factor(District)), data = new.df1) +
ggtitle("SF Districts using KMean")
The output of Airbnb SF boroughs visualization seems clear and explanatory, the density of Airbnb hosts in district 1 and 4 are sparse than 2, 3 and 5. Not surprisingly, the result corresponds with expectation because district 2, 3 and 5 are the major business districts and tourism spots with various public transportation in San Francisco.
However, the purpose of k- means not only segment data into groups, but also mine useful information which based on those groups.
Let’s see Airbnb annual growth in each district since 2008!
new.df1$host_since <- as.character(new.df1$host_since)
new.df1$host_since_year <- str_sub(new.df1$host_since, 1, 4)
new.df1$host_since_month <- str_sub(new.df1$host_since, 6, 7)
new.df1$host_since_day <- str_sub(new.df1$host_since, -2, -1)
new.df1$host_since_year <- as.integer(new.df1$host_since_year)
new.df1$host_since_month <- as.integer(new.df1$host_since_month)
new.df1$host_since_day <- as.integer(new.df1$host_since_day)year_borough <- count_(new.df1, vars = c('host_since_year', 'District'), sort = TRUE) %>%
arrange(host_since_year , District)
year_growth <- year_borough %>%
ggplot(aes(host_since_year, n, colour = District)) + geom_line() +
ggtitle("Airbnb Annual Growth in SF ") + ylim(0, 400) + labs(x = "Year", y = "# of Hosts"); year_growth
According to the line chart, Airbnb hosts has increased rapidly in district 2, 3 and 5 since 2010. However, after the year 2015, the growth trends became slower and fallen down. Compared with district 2, 3 and 5, district 1 and 4 still have space to gain more Airbnb community.
Mr. Fredricksen and Russell host an Airbnb “Ellie’s Loft” in San Francisco Marina District. As a new host in Airbnb community, they are considering whether to charge cleaning fee to their guests. The K nearest neighbors algorithm could help them to find out the homogeneous hosts in San Francisco, and they will leverage those hosts’ information to make the decision.
## Predictors: accommodate, bathrooms, cancellation policy_strict.
## Response: cleaning fee.
## Build the classifier's dataset
cls <- new.df[ ,c(1, 5, 6, 8)]
head(cls)## id accommodates bathrooms cancellation_policy
## 1 13418779 4 1.0 flexible
## 2 12422935 2 1.0 strict
## 3 180792 2 1.0 moderate
## 4 16904896 2 1.0 strict
## 5 3530517 6 2.0 flexible
## 6 11712092 9 2.5 flexible
## Dummify and reduce reference group cancellation policy = flexible
dmy <- dummyVars(" ~ cancellation_policy", data = cls, fullRank = T)
cls1 <- data.frame(predict(dmy, newdata = cls))
cls1$cleaning_fee <- new.df$cleaning_fee
cls1$accommodates <- new.df$accommodates
cls1$bathrooms <- new.df$bathroomsset.seed(600)
train_row <- sample(1:nrow(cls1), 0.8*nrow(cls1))
train.df <- cls1[train_row, ]
valid.df <- cls1[-train_row, ]## Measuring the accuracy of different k values
accuracy.df <- data.frame(k = seq(1, 15 ,1), accuracy = rep(0, 15))
for (i in 1:15) {
knn.pred <- knn(train.df[, c(1:4, 6:7)], valid.df[, c(1:4, 6:7)], cl = train.df[, 5], k=i)
accuracy.df[i, 2] <- confusionMatrix(knn.pred, valid.df[, 5])$overall[1]
}
accuracy.df## k accuracy
## 1 1 0.6177156
## 2 2 0.5361305
## 3 3 0.6775447
## 4 4 0.6697747
## 5 5 0.6845377
## 6 6 0.6860917
## 7 7 0.6860917
## 8 8 0.6930847
## 9 9 0.6946387
## 10 10 0.6969697
## 11 11 0.7731158
## 12 12 0.7715618
## 13 13 0.7692308
## 14 14 0.7707848
## 15 15 0.7684538
## Demonstrate the confusion matrix in line chart
plot(accuracy.df, type = "b", col = "dodgerblue", cex = 1, pch = 20,
xlab = "k, number of neighbors", ylab = "classification accuracy",
main = "Accuracy and K number of Neighbors")
## add lines to indicate k with best accuracy
abline(v = which(accuracy.df$accuracy == max(accuracy.df$accuracy)), col = "darkorange", lwd = 1.5)
## add line for max accuracy seen
abline(h = max(accuracy.df$accuracy), col = "grey", lty = 2)
According to the k number of neighbors’ line chart, the Knn classifier has the highest classification accuracy at k = 11.
## Randomly select one row to build Mr.Fredricksen's data
fred <- cls1[1, -5]
## Mr. Fredricksen host has accommodattion for 4 people, 2 bathrooms, and a strict cancellation policy.
fred$bathrooms <- 2
fred$accommodates <- 4
fred$cancellation_policy.strict <- 1
fred## cancellation_policy.moderate cancellation_policy.strict
## 1 0 1
## cancellation_policy.super_strict_30 cancellation_policy.super_strict_60
## 1 0 0
## accommodates bathrooms
## 1 4 2
## Use knn algorithm with k = 11 to predict whether Mr.Fredricksen should add cleaning fee.
nn <- knn(train.df[ , c(1:4, 6:7)], fred[ , c(1:6)], cl = train.df[ , 5], k = 11)
row.names(train.df)[attr(nn, "nn.index")]## [1] "1408" "3875" "6424" "5239" "821" "1221" "4254" "5830" "3566" "3717"
## [11] "1076"
Eleven nearest data are: “1408” “3875” “6424” “5239” “821” “1221” “4254” “5830” “3566” “3717” “1076”
nei <- new.df[c(1408, 3875, 6424, 5239, 821, 1221, 4254, 5830, 3566, 3717, 1076), ]
nei[, c(5, 6, 8, 9)]## accommodates bathrooms cancellation_policy cleaning_fee
## 1408 4 2 strict True
## 3875 4 2 strict True
## 6424 4 2 strict True
## 5239 4 2 strict True
## 821 4 2 strict True
## 1221 4 2 strict True
## 4254 4 2 strict True
## 5830 4 2 strict True
## 3566 4 2 strict True
## 3717 4 2 strict True
## 1076 4 2 strict True
Based on the classifier’s result, those Airbnb hosts who have 2 bathrooms, 4 accommodataon space and strict cancellation policy would add cleaning fee to their listing. Therefore, Mr. Fredricksen and Russell should also charge the cleaning fee to their listing.
sf <- geocode("lombard street california")
sf_osm_map <- qmap("san francisco california", zoom = 13, source = "google", maptype="roadmap")
sf_osm_map + geom_point(aes(x = longitude, y = latitude),
data = nei,
alpha = 0.7,
size = 3,
color = "tomato") +
geom_encircle(aes(x=longitude, y=latitude), data = nei, size = 4, color = "blue")
Visualize the 11 nearest neighbors’ spatial data on the Google roadmap.
12 Surprising Things Airbnb Guests Love And Hate https://www.thefrugalgene.com/airbnb-love-hate/
Airbnb Help Center https://www.airbnb.com/help