library(tidyverse)
library(ggthemes)
library(GGally)
library(ggExtra)
library(caret)
library(glmnet)
library(corrplot)
library(leaflet)
library(kableExtra)
library(RColorBrewer)
library(plotly)
library(readr)
th <- theme_fivethirtyeight() + theme(axis.title = element_text(), axis.title.x = element_text())
set.seed(252)
airbnb <- read.csv("AB_NYC_2019.csv", stringsAsFactors = FALSE, na.strings = c(""))
The first 6 rows of the data are shown:
head(airbnb)
## id name host_id host_name
## 1 2539 Clean & quiet apt home by the park 2787 John
## 2 2595 Skylit Midtown Castle 2845 Jennifer
## 3 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth
## 4 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura
## 6 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris
## neighbourhood_group neighbourhood latitude longitude room_type price
## 1 Brooklyn Kensington 40.64749 -73.97237 Private room 149
## 2 Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225
## 3 Manhattan Harlem 40.80902 -73.94190 Private room 150
## 4 Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89
## 5 Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80
## 6 Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200
## minimum_nights number_of_reviews last_review reviews_per_month
## 1 1 9 2018-10-19 0.21
## 2 1 45 2019-05-21 0.38
## 3 3 0 <NA> NA
## 4 1 270 2019-07-05 4.64
## 5 10 9 2018-11-19 0.10
## 6 3 74 2019-06-22 0.59
## calculated_host_listings_count availability_365
## 1 6 365
## 2 2 355
## 3 1 365
## 4 1 194
## 5 1 0
## 6 1 129
This dataset contains 16 features about Airbnb listings within New York City. Below are the features with their respective descriptions:
Check the number of data.
nrow(airbnb)
## [1] 48895
Inspect the data structures.
str(airbnb)
## 'data.frame': 48895 obs. of 16 variables:
## $ id : int 2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
## $ name : chr "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
## $ host_id : int 2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
## $ host_name : chr "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ neighbourhood_group : chr "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ neighbourhood : chr "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -74 -73.9 -74 -73.9 ...
## $ room_type : chr "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ last_review : chr "2018-10-19" "2019-05-21" NA "2019-07-05" ...
## $ reviews_per_month : num 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: int 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : int 365 355 365 194 0 129 0 220 0 188 ...
Get the summary of the data.
summary(airbnb)
## id name host_id host_name
## Min. : 2539 Length:48895 Min. : 2438 Length:48895
## 1st Qu.: 9471945 Class :character 1st Qu.: 7822033 Class :character
## Median :19677284 Mode :character Median : 30793816 Mode :character
## Mean :19017143 Mean : 67620011
## 3rd Qu.:29152178 3rd Qu.:107434423
## Max. :36487245 Max. :274321313
##
## neighbourhood_group neighbourhood latitude longitude
## Length:48895 Length:48895 Min. :40.50 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:48895 Min. : 0.0 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.: 69.0 1st Qu.: 1.00 1st Qu.: 1.00
## Mode :character Median : 106.0 Median : 3.00 Median : 5.00
## Mean : 152.7 Mean : 7.03 Mean : 23.27
## 3rd Qu.: 175.0 3rd Qu.: 5.00 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.00 Max. :629.00
##
## last_review reviews_per_month calculated_host_listings_count
## Length:48895 Min. : 0.010 Min. : 1.000
## Class :character 1st Qu.: 0.190 1st Qu.: 1.000
## Mode :character Median : 0.720 Median : 1.000
## Mean : 1.373 Mean : 7.144
## 3rd Qu.: 2.020 3rd Qu.: 2.000
## Max. :58.500 Max. :327.000
## NA's :10052
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 45.0
## Mean :112.8
## 3rd Qu.:227.0
## Max. :365.0
##
Describe the data. It is observed that the price has a minimum value of 0 USD which is not possible. We will remove it after wards.
psych::describe(airbnb)
## vars n mean sd median
## id 1 48895 19017143.24 10983108.39 19677284.00
## name* 2 48879 23969.07 13814.77 23938.00
## host_id 3 48895 67620010.65 78610967.03 30793816.00
## host_name* 4 48874 5460.42 3232.22 5333.00
## neighbourhood_group* 5 48895 2.68 0.74 3.00
## neighbourhood* 6 48895 108.11 68.74 95.00
## latitude 7 48895 40.73 0.05 40.72
## longitude 8 48895 -73.95 0.05 -73.96
## room_type* 9 48895 1.50 0.55 1.00
## price 10 48895 152.72 240.15 106.00
## minimum_nights 11 48895 7.03 20.51 3.00
## number_of_reviews 12 48895 23.27 44.55 5.00
## last_review* 13 38843 1491.10 400.67 1714.00
## reviews_per_month 14 38843 1.37 1.68 0.72
## calculated_host_listings_count 15 48895 7.14 32.95 1.00
## availability_365 16 48895 112.78 131.62 45.00
## trimmed mad min max
## id 19188061.30 14689959.59 2539.00 36487245.00
## name* 23967.98 17712.62 1.00 47905.00
## host_id 54170438.55 40836605.41 2438.00 274321313.00
## host_name* 5433.36 4209.84 1.00 11452.00
## neighbourhood_group* 2.61 1.48 1.00 5.00
## neighbourhood* 106.81 88.96 1.00 221.00
## latitude 40.73 0.05 40.50 40.91
## longitude -73.96 0.04 -74.24 -73.71
## room_type* 1.48 0.00 1.00 3.00
## price 121.43 68.20 0.00 10000.00
## minimum_nights 3.58 2.97 1.00 1250.00
## number_of_reviews 12.45 7.41 0.00 629.00
## last_review* 1579.38 65.23 1.00 1764.00
## reviews_per_month 1.06 0.92 0.01 58.50
## calculated_host_listings_count 1.50 0.00 1.00 327.00
## availability_365 96.50 66.72 0.00 365.00
## range skew kurtosis se
## id 36484706.00 -0.09 -1.23 49669.87
## name* 47904.00 0.00 -1.20 62.49
## host_id 274318875.00 1.21 0.17 355509.26
## host_name* 11451.00 0.05 -1.16 14.62
## neighbourhood_group* 4.00 0.37 -0.11 0.00
## neighbourhood* 220.00 0.26 -1.26 0.31
## latitude 0.41 0.24 0.15 0.00
## longitude 0.53 1.28 5.02 0.00
## room_type* 2.00 0.42 -0.97 0.00
## price 10000.00 19.12 585.59 1.09
## minimum_nights 1249.00 21.83 853.95 0.09
## number_of_reviews 629.00 3.69 19.53 0.20
## last_review* 1763.00 -1.66 1.71 2.03
## reviews_per_month 58.49 3.13 42.49 0.01
## calculated_host_listings_count 326.00 7.93 67.54 0.15
## availability_365 365.00 0.76 -1.00 0.60
Since id, host_id does not give any information, these columns are dropped. The last_review is the date for the last review, which does not affect the price. It is also dropped.
airbnb <- subset(airbnb, select = -c(id, host_id, last_review))
The percentage of missing values is examined.
sapply(airbnb, function(x)sum(is.na(x)/nrow(airbnb)))
## name host_name
## 0.0003272318 0.0004294918
## neighbourhood_group neighbourhood
## 0.0000000000 0.0000000000
## latitude longitude
## 0.0000000000 0.0000000000
## room_type price
## 0.0000000000 0.0000000000
## minimum_nights number_of_reviews
## 0.0000000000 0.0000000000
## reviews_per_month calculated_host_listings_count
## 0.2055833930 0.0000000000
## availability_365
## 0.0000000000
If there is no entry for review_per_month, it can be treated as no review, so the review_per_month is replaced with 0.
airbnb[["reviews_per_month"]][is.na(airbnb[["reviews_per_month"]])] <- 0
sapply(airbnb, function(x)sum(is.na(x)/nrow(airbnb)))
## name host_name
## 0.0003272318 0.0004294918
## neighbourhood_group neighbourhood
## 0.0000000000 0.0000000000
## latitude longitude
## 0.0000000000 0.0000000000
## room_type price
## 0.0000000000 0.0000000000
## minimum_nights number_of_reviews
## 0.0000000000 0.0000000000
## reviews_per_month calculated_host_listings_count
## 0.0000000000 0.0000000000
## availability_365
## 0.0000000000
Now, it is observed that the NULL values in the name and the host_name is less than 0.0005%. Therefore, it is safe ro directly omit these record without losing important information.
airbnb <- na.omit(airbnb)
sapply(airbnb, function(x)sum(is.na(x)/nrow(airbnb)))
## name host_name
## 0 0
## neighbourhood_group neighbourhood
## 0 0
## latitude longitude
## 0 0
## room_type price
## 0 0
## minimum_nights number_of_reviews
## 0 0
## reviews_per_month calculated_host_listings_count
## 0 0
## availability_365
## 0
Price cannot be 0 USD so we will omit the data with price of 0 USD.
airbnb <- airbnb %>% filter(price > 0)
The variables name, host_name, neighbourhood_group, neighbourbood and room_type is changed to factor.
nrow(airbnb)
## [1] 48847
names_to_factor <- c("name", "host_name", "neighbourhood_group", "neighbourhood", "room_type")
airbnb[names_to_factor] <- map(airbnb[names_to_factor], as.factor)
Again, the structure of the dataframe is examined.
str(airbnb)
## 'data.frame': 48847 obs. of 13 variables:
## $ name : Factor w/ 47873 levels "'Fan'tastic",..: 12651 38149 45138 15690 19351 24985 8328 25032 15585 17667 ...
## $ host_name : Factor w/ 11448 levels "'Cil","-TheQueensCornerLot",..: 5048 4843 2960 6261 5979 1968 3599 9694 6932 1262 ...
## $ neighbourhood_group : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 3 2 3 3 2 3 3 3 ...
## $ neighbourhood : Factor w/ 221 levels "Allerton","Arden Heights",..: 109 128 95 42 62 138 14 96 203 36 ...
## $ latitude : num 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num -74 -74 -73.9 -74 -73.9 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 2 1 2 1 1 1 2 2 2 1 ...
## $ price : int 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : int 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : int 9 45 0 270 9 74 49 430 118 160 ...
## $ reviews_per_month : num 0.21 0.38 0 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: int 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : int 365 355 365 194 0 129 0 220 0 188 ...
## - attr(*, "na.action")= 'omit' Named int [1:37] 361 2701 2855 3704 5746 5776 5976 6076 6270 6568 ...
## ..- attr(*, "names")= chr [1:37] "361" "2701" "2855" "3704" ...
Since the price is the output to be predicted, the distribution of the price will be interesting to examine. It is observed that the plot is very skewed.
ggplot(airbnb, aes(x = price)) +
geom_histogram(bins = 40, fill = "steelblue4", colour='black') +
ggtitle("Distribution of Price")
To view the distribution of the price clearly, the x-axis is transformed into log scale as shown in the histogram below.
ggplot(airbnb, aes(x = price)) +
geom_histogram(bins = 40, fill = "steelblue4", colour='black') +
ggtitle("Transformed Distribution of Price", subtitle = expression(~'log'[10] ~ "x-axis")) +
scale_x_log10()
The price is observed to be distributed in around USD 90 to USD 120 (Remember that the scale is log scale).
Next, the factor that affect the price the most is the main focus. The initial guess is that the number of reviews will be the main factor that affect the price. Therefore, the distribution of price according to number of reviews.
ggplot(data = airbnb, aes(x = number_of_reviews, y = price))+
geom_jitter(color="steelblue4") +
ggtitle("Distribution of Price According to Review")
Again, the distribution is too skewed. The log scale transformation in the x-axis is performed once again.
ggplot(data = airbnb, aes(x = number_of_reviews, y = price)) +
geom_jitter(color="steelblue4") +
ggtitle("Transformed Distribution of Price According to Review", subtitle = expression(~'log'[10] ~ "x-axis")) +
scale_x_log10()
From the Jitter Plot above, it is observed that the more the number of review, the price is lower as the homestay might be old as compared to those with lowest number of reviews. This makes the homestay with higher number of review to be cheaper.
Next, the average price according to the neighbourhood_group is calculated and plotted.
bronx <- subset(airbnb, neighbourhood_group == "Bronx")
bronx_avg <- sum(bronx$price)/nrow(bronx)
brooklyn <- subset(airbnb, neighbourhood_group == "Brooklyn")
brooklyn_avg <- sum(brooklyn$price)/nrow(brooklyn)
manhattan <- subset(airbnb, neighbourhood_group == "Manhattan")
manhattan_avg <- sum(manhattan$price)/nrow(manhattan)
queens <- subset(airbnb, neighbourhood_group == "Queens")
queens_avg <- sum(queens$price)/nrow(queens)
staten <- subset(airbnb, neighbourhood_group == "Staten Island")
staten_avg <- sum(staten$price)/nrow(staten)
price_vs_neighbourhooodGroup <- data.frame(
"Neighbourhood_Group" = c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"),
"Average_Price" = c(bronx_avg, brooklyn_avg, manhattan_avg, queens_avg, staten_avg)
)
price_vs_neighbourhooodGroup
## Neighbourhood_Group Average_Price
## 1 Bronx 87.54963
## 2 Brooklyn 124.46628
## 3 Manhattan 196.90657
## 4 Queens 99.53602
## 5 Staten Island 114.81233
ggplot(data = price_vs_neighbourhooodGroup, aes(x = Neighbourhood_Group, y = Average_Price)) +
geom_bar(stat="identity", fill = "steelblue4", colour = "black") +
ggtitle("Distribution of Price According to Neighbour Group")
It is observed that the average price per night is the highest in Manhattan, followed by Brooklyn, Staten Island, Queens and Bronx.
The average price per night according to room type is also calculated and plotted.
entire <- subset(airbnb, room_type == "Entire home/apt")
entire_avg <- sum(entire$price)/nrow(entire)
private <- subset(airbnb, room_type == "Private room")
private_avg <- sum(private$price)/nrow(private)
shared <- subset(airbnb, room_type == "Shared room")
shared_avg <- sum(shared$price)/nrow(shared)
price_vs_roomType <- data.frame(
"Room_Type" = c("Entire Home/Apt", "Private Room", "Shared Room"),
"Average_Price" = c(entire_avg, private_avg, shared_avg)
)
price_vs_roomType
## Room_Type Average_Price
## 1 Entire Home/Apt 211.82368
## 2 Private Room 89.82255
## 3 Shared Room 70.19706
ggplot(data = price_vs_roomType, aes(x = Room_Type, y = Average_Price)) +
geom_bar(stat="identity", fill = "steelblue4", colour = "black") +
ggtitle("Distribution of Price According to Room Type")
The entire home or apartment is observed to be the most expensive one, followed by the private room and shared room. This make sense as renting entire home or apartment means renting bigger space.
Now, the property type in each neighbourhood is also important to be visualized. It is observed that Manhattan has most entire home/apartment to be rented. This is also why the average price per night in Manhattan is high as compared to other neighbourhood.
ggplot(airbnb) +
geom_histogram(aes(neighbourhood_group, fill = room_type), stat = "count", position = 'fill') +
theme_minimal(base_size = 13)+ xlab("") + ylab("") +
ggtitle("The Proportion of Property Type in Each Area")
The distribution of price per night in each neighbourhood group is inspected.
airbnb_nh <- airbnb %>%
group_by(neighbourhood_group) %>%
summarise(price = round(mean(price), 2))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(airbnb, aes(price)) +
geom_histogram(bins = 30, aes(y = ..density..), fill = "steelblue4") +
geom_density(alpha = 0.2, fill = "blue") +
th +
ggtitle("Transformed distribution of price\n by neighbourhood groups",
subtitle = expression(~'log'[10] ~ "x-axis")) +
geom_vline(data = airbnb_nh, aes(xintercept = price), size = 2, linetype = 3) +
geom_text(data = airbnb_nh,y = 1.5, aes(x = price + 1400, label = paste("Mean = $",price)), color = "red", size = 3) +
facet_wrap(~neighbourhood_group) +
scale_x_log10()
The price distribution based on room type and neighbourhood group is shown as below. It is observed that the price per night for entire home/apt is the highest in Manhattan. In Brooklyn, Queens and Bronx, the rental for private room is higher as compared to other room types.
airbnb %>% arrange(desc(price)) %>% top_n(10) %>% select(- host_name, -name) %>%
ggplot(aes(x = price, fill = neighbourhood_group)) +
geom_histogram(bins = 50) +
scale_x_log10() +
ggtitle("Summary of Price Distributions") +
facet_wrap(~room_type + neighbourhood_group)
## Selecting by availability_365
To further visualize the data, a leaflet map is used.
pal <- colorFactor(palette = c("red", "green", "blue", "purple", "yellow"), domain = airbnb$neighbourhood_group)
leaflet(data = airbnb) %>% addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>% addCircleMarkers(~longitude, ~latitude, color = ~pal(neighbourhood_group), weight = 1, radius=1, fillOpacity = 0.1, opacity = 0.1,
label = paste("Name:", airbnb$name)) %>%
addLegend("bottomright", pal = pal, values = ~neighbourhood_group,
title = "Neighbourhood groups",
opacity = 1
)
A correlation heatmap is used to determine the correlation between the variables. 1 indicates strong positive correlation while -1 indicates strong negative correlation. It is observed that only number of review and review per month has a strong positive correlation as these two variables are fundamentally related to reviews.
airbnb_cor <- airbnb[, sapply(airbnb, is.numeric)]
airbnb_cor <- airbnb_cor[complete.cases(airbnb_cor), ]
correlation_matrix <- cor(airbnb_cor)
corrplot(correlation_matrix, method = "color")
# Train Test Split To perform predictive analysis, the data is split into train and test data with a ratio of 80% : 20%.
set.seed(45)
pd <- sample(2, nrow(airbnb), replace = TRUE, prob = c(0.8,0.2))
train <- airbnb[pd == 1,]#, here means all column
test <- airbnb[pd == 2,]
cat("Number of train data: ", nrow(train), "\n")
## Number of train data: 39132
cat("Number of test data: ", nrow(test), "\n")
## Number of test data: 9715
A linear regression model is generated.
lr1 <- train(price ~ latitude + longitude + room_type + minimum_nights + availability_365 + neighbourhood_group, data = train, method = "lm")
summary(lr1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -246.2 -63.6 -24.9 15.0 9859.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.849e+04 3.662e+03 -7.779 7.47e-15 ***
## latitude -1.957e+02 3.566e+01 -5.488 4.10e-08 ***
## longitude -4.956e+02 4.099e+01 -12.089 < 2e-16 ***
## `room_typePrivate room` -1.080e+02 2.459e+00 -43.915 < 2e-16 ***
## `room_typeShared room` -1.431e+02 7.901e+00 -18.116 < 2e-16 ***
## minimum_nights -4.650e-02 6.028e-02 -0.771 0.440500
## availability_365 1.738e-01 9.211e-03 18.865 < 2e-16 ***
## neighbourhood_groupBrooklyn -2.940e+01 9.966e+00 -2.950 0.003177 **
## neighbourhood_groupManhattan 3.172e+01 9.045e+00 3.506 0.000455 ***
## neighbourhood_groupQueens -5.244e+00 9.617e+00 -0.545 0.585566
## `neighbourhood_groupStaten Island` -1.442e+02 1.907e+01 -7.558 4.19e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 232.6 on 39121 degrees of freedom
## Multiple R-squared: 0.09587, Adjusted R-squared: 0.09564
## F-statistic: 414.8 on 10 and 39121 DF, p-value: < 2.2e-16
The performance of the linear regression model is inspected using the plots. The Normal Q-Q plot of a linear regression model should be a straight line. However, it is not straight in this model. Therefore, this model is not good to be used for predictive analysis.
plot(lr1$finalModel)
To improve the model more, the outliers in the data needs to be removed. The top 10% of the price and the bottom 10% of price is removed as they contains the most outliers. Then, the price is log so that the output is not that compact. Also, more variables is added for prediction.
filtered <- train %>% filter(price < quantile(train$price, 0.9) & price > quantile(train$price, 0.1))
lr2 <- lm(log(price) ~ room_type + neighbourhood_group + latitude + longitude
+ number_of_reviews + availability_365
+ reviews_per_month +
calculated_host_listings_count + minimum_nights, data = filtered)
summary(lr2)
##
## Call:
## lm(formula = log(price) ~ room_type + neighbourhood_group + latitude +
## longitude + number_of_reviews + availability_365 + reviews_per_month +
## calculated_host_listings_count + minimum_nights, data = filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19464 -0.23132 -0.01506 0.21387 1.62655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.206e+02 5.850e+00 -20.614 < 2e-16 ***
## room_typePrivate room -5.370e-01 3.830e-03 -140.220 < 2e-16 ***
## room_typeShared room -6.220e-01 1.666e-02 -37.334 < 2e-16 ***
## neighbourhood_groupBrooklyn -4.348e-02 1.653e-02 -2.631 0.00851 **
## neighbourhood_groupManhattan 1.532e-01 1.515e-02 10.111 < 2e-16 ***
## neighbourhood_groupQueens 4.349e-02 1.603e-02 2.714 0.00666 **
## neighbourhood_groupStaten Island -5.487e-01 3.161e-02 -17.360 < 2e-16 ***
## latitude -5.443e-01 5.620e-02 -9.686 < 2e-16 ***
## longitude -1.996e+00 6.570e-02 -30.384 < 2e-16 ***
## number_of_reviews -1.375e-04 5.125e-05 -2.683 0.00731 **
## availability_365 3.637e-04 1.564e-05 23.250 < 2e-16 ***
## reviews_per_month -7.620e-03 1.496e-03 -5.093 3.55e-07 ***
## calculated_host_listings_count 3.990e-04 6.548e-05 6.093 1.12e-09 ***
## minimum_nights -1.348e-03 9.489e-05 -14.203 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3274 on 31155 degrees of freedom
## Multiple R-squared: 0.4837, Adjusted R-squared: 0.4835
## F-statistic: 2245 on 13 and 31155 DF, p-value: < 2.2e-16
In the improved linear regression model. The Normal Q-Q plot is observed to have a straight line. This indicate that this model is far better than the first model.
plot(lr2)
To examine the performance of the model, predictions are made. It is observed that the predicted value is close to the actual value.
test <- test %>% filter(price < quantile(train$price, 0.9) & price > quantile(train$price, 0.1))
pred <- predict(lr2, newdata = test)
pred <- exp(pred)
RMSE_regression <- sqrt(mean( (test$price - pred)**2 ))
SSE <- sum((test$price - pred)**2)
SSR <- sum((pred - mean(test$price)) ** 2)
R2 <- 1 - SSE/(SSE + SSR)
actual <- test$price
lr_result <- data.frame(
"Actual" = actual,
"Predicted" = pred
)
head(lr_result, 20)
## Actual Predicted
## 1 85 92.07526
## 2 215 143.51928
## 3 120 134.13480
## 4 80 78.29981
## 5 50 91.49867
## 6 110 139.94887
## 7 110 107.13800
## 8 80 89.17728
## 9 51 95.87432
## 10 65 91.43803
## 11 190 176.27856
## 12 200 94.90440
## 13 70 77.19218
## 14 98 81.70424
## 15 250 139.95385
## 16 140 158.06902
## 17 89 95.54870
## 18 98 107.10967
## 19 125 131.11391
## 20 60 90.78271