Import Libraries and Load Data

library(tidyverse)
library(ggthemes)
library(GGally)
library(ggExtra)
library(caret)
library(glmnet)
library(corrplot)
library(leaflet)
library(kableExtra)
library(RColorBrewer)
library(plotly)
library(readr)
th <- theme_fivethirtyeight() + theme(axis.title = element_text(), axis.title.x = element_text())
set.seed(252)
airbnb <- read.csv("AB_NYC_2019.csv", stringsAsFactors = FALSE, na.strings = c(""))

Exploratory Data Analysis (EDA)

The first 6 rows of the data are shown:

head(airbnb)
##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  2018-10-19              0.21
## 2              1                45  2019-05-21              0.38
## 3              3                 0        <NA>                NA
## 4              1               270  2019-07-05              4.64
## 5             10                 9  2018-11-19              0.10
## 6              3                74  2019-06-22              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129

This dataset contains 16 features about Airbnb listings within New York City. Below are the features with their respective descriptions:

Check the number of data.

nrow(airbnb)
## [1] 48895

Inspect the data structures.

str(airbnb)
## 'data.frame':    48895 obs. of  16 variables:
##  $ id                            : int  2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
##  $ name                          : chr  "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
##  $ host_id                       : int  2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
##  $ host_name                     : chr  "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
##  $ neighbourhood_group           : chr  "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
##  $ neighbourhood                 : chr  "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
##  $ latitude                      : num  40.6 40.8 40.8 40.7 40.8 ...
##  $ longitude                     : num  -74 -74 -73.9 -74 -73.9 ...
##  $ room_type                     : chr  "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
##  $ price                         : int  149 225 150 89 80 200 60 79 79 150 ...
##  $ minimum_nights                : int  1 1 3 1 10 3 45 2 2 1 ...
##  $ number_of_reviews             : int  9 45 0 270 9 74 49 430 118 160 ...
##  $ last_review                   : chr  "2018-10-19" "2019-05-21" NA "2019-07-05" ...
##  $ reviews_per_month             : num  0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
##  $ calculated_host_listings_count: int  6 2 1 1 1 1 1 1 1 4 ...
##  $ availability_365              : int  365 355 365 194 0 129 0 220 0 188 ...

Get the summary of the data.

summary(airbnb)
##        id               name              host_id           host_name        
##  Min.   :    2539   Length:48895       Min.   :     2438   Length:48895      
##  1st Qu.: 9471945   Class :character   1st Qu.:  7822033   Class :character  
##  Median :19677284   Mode  :character   Median : 30793816   Mode  :character  
##  Mean   :19017143                      Mean   : 67620011                     
##  3rd Qu.:29152178                      3rd Qu.:107434423                     
##  Max.   :36487245                      Max.   :274321313                     
##                                                                              
##  neighbourhood_group neighbourhood         latitude       longitude     
##  Length:48895        Length:48895       Min.   :40.50   Min.   :-74.24  
##  Class :character    Class :character   1st Qu.:40.69   1st Qu.:-73.98  
##  Mode  :character    Mode  :character   Median :40.72   Median :-73.96  
##                                         Mean   :40.73   Mean   :-73.95  
##                                         3rd Qu.:40.76   3rd Qu.:-73.94  
##                                         Max.   :40.91   Max.   :-73.71  
##                                                                         
##   room_type             price         minimum_nights    number_of_reviews
##  Length:48895       Min.   :    0.0   Min.   :   1.00   Min.   :  0.00   
##  Class :character   1st Qu.:   69.0   1st Qu.:   1.00   1st Qu.:  1.00   
##  Mode  :character   Median :  106.0   Median :   3.00   Median :  5.00   
##                     Mean   :  152.7   Mean   :   7.03   Mean   : 23.27   
##                     3rd Qu.:  175.0   3rd Qu.:   5.00   3rd Qu.: 24.00   
##                     Max.   :10000.0   Max.   :1250.00   Max.   :629.00   
##                                                                          
##  last_review        reviews_per_month calculated_host_listings_count
##  Length:48895       Min.   : 0.010    Min.   :  1.000               
##  Class :character   1st Qu.: 0.190    1st Qu.:  1.000               
##  Mode  :character   Median : 0.720    Median :  1.000               
##                     Mean   : 1.373    Mean   :  7.144               
##                     3rd Qu.: 2.020    3rd Qu.:  2.000               
##                     Max.   :58.500    Max.   :327.000               
##                     NA's   :10052                                   
##  availability_365
##  Min.   :  0.0   
##  1st Qu.:  0.0   
##  Median : 45.0   
##  Mean   :112.8   
##  3rd Qu.:227.0   
##  Max.   :365.0   
## 

Describe the data. It is observed that the price has a minimum value of 0 USD which is not possible. We will remove it after wards.

psych::describe(airbnb)
##                                vars     n        mean          sd      median
## id                                1 48895 19017143.24 10983108.39 19677284.00
## name*                             2 48879    23969.07    13814.77    23938.00
## host_id                           3 48895 67620010.65 78610967.03 30793816.00
## host_name*                        4 48874     5460.42     3232.22     5333.00
## neighbourhood_group*              5 48895        2.68        0.74        3.00
## neighbourhood*                    6 48895      108.11       68.74       95.00
## latitude                          7 48895       40.73        0.05       40.72
## longitude                         8 48895      -73.95        0.05      -73.96
## room_type*                        9 48895        1.50        0.55        1.00
## price                            10 48895      152.72      240.15      106.00
## minimum_nights                   11 48895        7.03       20.51        3.00
## number_of_reviews                12 48895       23.27       44.55        5.00
## last_review*                     13 38843     1491.10      400.67     1714.00
## reviews_per_month                14 38843        1.37        1.68        0.72
## calculated_host_listings_count   15 48895        7.14       32.95        1.00
## availability_365                 16 48895      112.78      131.62       45.00
##                                    trimmed         mad     min          max
## id                             19188061.30 14689959.59 2539.00  36487245.00
## name*                             23967.98    17712.62    1.00     47905.00
## host_id                        54170438.55 40836605.41 2438.00 274321313.00
## host_name*                         5433.36     4209.84    1.00     11452.00
## neighbourhood_group*                  2.61        1.48    1.00         5.00
## neighbourhood*                      106.81       88.96    1.00       221.00
## latitude                             40.73        0.05   40.50        40.91
## longitude                           -73.96        0.04  -74.24       -73.71
## room_type*                            1.48        0.00    1.00         3.00
## price                               121.43       68.20    0.00     10000.00
## minimum_nights                        3.58        2.97    1.00      1250.00
## number_of_reviews                    12.45        7.41    0.00       629.00
## last_review*                       1579.38       65.23    1.00      1764.00
## reviews_per_month                     1.06        0.92    0.01        58.50
## calculated_host_listings_count        1.50        0.00    1.00       327.00
## availability_365                     96.50       66.72    0.00       365.00
##                                       range  skew kurtosis        se
## id                              36484706.00 -0.09    -1.23  49669.87
## name*                              47904.00  0.00    -1.20     62.49
## host_id                        274318875.00  1.21     0.17 355509.26
## host_name*                         11451.00  0.05    -1.16     14.62
## neighbourhood_group*                   4.00  0.37    -0.11      0.00
## neighbourhood*                       220.00  0.26    -1.26      0.31
## latitude                               0.41  0.24     0.15      0.00
## longitude                              0.53  1.28     5.02      0.00
## room_type*                             2.00  0.42    -0.97      0.00
## price                              10000.00 19.12   585.59      1.09
## minimum_nights                      1249.00 21.83   853.95      0.09
## number_of_reviews                    629.00  3.69    19.53      0.20
## last_review*                        1763.00 -1.66     1.71      2.03
## reviews_per_month                     58.49  3.13    42.49      0.01
## calculated_host_listings_count       326.00  7.93    67.54      0.15
## availability_365                     365.00  0.76    -1.00      0.60

Since id, host_id does not give any information, these columns are dropped. The last_review is the date for the last review, which does not affect the price. It is also dropped.

airbnb <- subset(airbnb, select = -c(id, host_id, last_review))

The percentage of missing values is examined.

sapply(airbnb, function(x)sum(is.na(x)/nrow(airbnb)))
##                           name                      host_name 
##                   0.0003272318                   0.0004294918 
##            neighbourhood_group                  neighbourhood 
##                   0.0000000000                   0.0000000000 
##                       latitude                      longitude 
##                   0.0000000000                   0.0000000000 
##                      room_type                          price 
##                   0.0000000000                   0.0000000000 
##                 minimum_nights              number_of_reviews 
##                   0.0000000000                   0.0000000000 
##              reviews_per_month calculated_host_listings_count 
##                   0.2055833930                   0.0000000000 
##               availability_365 
##                   0.0000000000

If there is no entry for review_per_month, it can be treated as no review, so the review_per_month is replaced with 0.

airbnb[["reviews_per_month"]][is.na(airbnb[["reviews_per_month"]])] <- 0
sapply(airbnb, function(x)sum(is.na(x)/nrow(airbnb)))
##                           name                      host_name 
##                   0.0003272318                   0.0004294918 
##            neighbourhood_group                  neighbourhood 
##                   0.0000000000                   0.0000000000 
##                       latitude                      longitude 
##                   0.0000000000                   0.0000000000 
##                      room_type                          price 
##                   0.0000000000                   0.0000000000 
##                 minimum_nights              number_of_reviews 
##                   0.0000000000                   0.0000000000 
##              reviews_per_month calculated_host_listings_count 
##                   0.0000000000                   0.0000000000 
##               availability_365 
##                   0.0000000000

Now, it is observed that the NULL values in the name and the host_name is less than 0.0005%. Therefore, it is safe ro directly omit these record without losing important information.

airbnb <- na.omit(airbnb)
sapply(airbnb, function(x)sum(is.na(x)/nrow(airbnb)))
##                           name                      host_name 
##                              0                              0 
##            neighbourhood_group                  neighbourhood 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                      room_type                          price 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
##              reviews_per_month calculated_host_listings_count 
##                              0                              0 
##               availability_365 
##                              0

Price cannot be 0 USD so we will omit the data with price of 0 USD.

airbnb <- airbnb %>% filter(price > 0)

The variables name, host_name, neighbourhood_group, neighbourbood and room_type is changed to factor.

nrow(airbnb)
## [1] 48847
names_to_factor <- c("name", "host_name", "neighbourhood_group", "neighbourhood", "room_type")
airbnb[names_to_factor] <- map(airbnb[names_to_factor], as.factor)

Again, the structure of the dataframe is examined.

str(airbnb)
## 'data.frame':    48847 obs. of  13 variables:
##  $ name                          : Factor w/ 47873 levels "'Fan'tastic",..: 12651 38149 45138 15690 19351 24985 8328 25032 15585 17667 ...
##  $ host_name                     : Factor w/ 11448 levels "'Cil","-TheQueensCornerLot",..: 5048 4843 2960 6261 5979 1968 3599 9694 6932 1262 ...
##  $ neighbourhood_group           : Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 3 2 3 3 2 3 3 3 ...
##  $ neighbourhood                 : Factor w/ 221 levels "Allerton","Arden Heights",..: 109 128 95 42 62 138 14 96 203 36 ...
##  $ latitude                      : num  40.6 40.8 40.8 40.7 40.8 ...
##  $ longitude                     : num  -74 -74 -73.9 -74 -73.9 ...
##  $ room_type                     : Factor w/ 3 levels "Entire home/apt",..: 2 1 2 1 1 1 2 2 2 1 ...
##  $ price                         : int  149 225 150 89 80 200 60 79 79 150 ...
##  $ minimum_nights                : int  1 1 3 1 10 3 45 2 2 1 ...
##  $ number_of_reviews             : int  9 45 0 270 9 74 49 430 118 160 ...
##  $ reviews_per_month             : num  0.21 0.38 0 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
##  $ calculated_host_listings_count: int  6 2 1 1 1 1 1 1 1 4 ...
##  $ availability_365              : int  365 355 365 194 0 129 0 220 0 188 ...
##  - attr(*, "na.action")= 'omit' Named int [1:37] 361 2701 2855 3704 5746 5776 5976 6076 6270 6568 ...
##   ..- attr(*, "names")= chr [1:37] "361" "2701" "2855" "3704" ...

Since the price is the output to be predicted, the distribution of the price will be interesting to examine. It is observed that the plot is very skewed.

ggplot(airbnb, aes(x = price)) + 
  geom_histogram(bins = 40, fill = "steelblue4", colour='black') + 
   ggtitle("Distribution of Price")

To view the distribution of the price clearly, the x-axis is transformed into log scale as shown in the histogram below.

ggplot(airbnb, aes(x = price)) + 
  geom_histogram(bins = 40, fill = "steelblue4", colour='black') + 
  ggtitle("Transformed Distribution of Price", subtitle = expression(~'log'[10] ~ "x-axis")) +
  scale_x_log10()

The price is observed to be distributed in around USD 90 to USD 120 (Remember that the scale is log scale).

Next, the factor that affect the price the most is the main focus. The initial guess is that the number of reviews will be the main factor that affect the price. Therefore, the distribution of price according to number of reviews.

ggplot(data = airbnb, aes(x = number_of_reviews, y = price))+ 
  geom_jitter(color="steelblue4") + 
  ggtitle("Distribution of Price According to Review")

Again, the distribution is too skewed. The log scale transformation in the x-axis is performed once again.

ggplot(data = airbnb, aes(x = number_of_reviews, y = price)) + 
  geom_jitter(color="steelblue4") + 
  ggtitle("Transformed Distribution of Price According to Review", subtitle = expression(~'log'[10] ~ "x-axis")) + 
  scale_x_log10()

From the Jitter Plot above, it is observed that the more the number of review, the price is lower as the homestay might be old as compared to those with lowest number of reviews. This makes the homestay with higher number of review to be cheaper.

Next, the average price according to the neighbourhood_group is calculated and plotted.

bronx <- subset(airbnb, neighbourhood_group == "Bronx")
bronx_avg <- sum(bronx$price)/nrow(bronx)

brooklyn <- subset(airbnb, neighbourhood_group == "Brooklyn")
brooklyn_avg <- sum(brooklyn$price)/nrow(brooklyn)

manhattan <- subset(airbnb, neighbourhood_group == "Manhattan")
manhattan_avg <- sum(manhattan$price)/nrow(manhattan)

queens <- subset(airbnb, neighbourhood_group == "Queens")
queens_avg <- sum(queens$price)/nrow(queens)

staten <- subset(airbnb, neighbourhood_group == "Staten Island")
staten_avg <- sum(staten$price)/nrow(staten)

price_vs_neighbourhooodGroup <- data.frame(
  "Neighbourhood_Group" = c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"),
  "Average_Price" = c(bronx_avg, brooklyn_avg, manhattan_avg, queens_avg, staten_avg)
)

price_vs_neighbourhooodGroup
##   Neighbourhood_Group Average_Price
## 1               Bronx      87.54963
## 2            Brooklyn     124.46628
## 3           Manhattan     196.90657
## 4              Queens      99.53602
## 5       Staten Island     114.81233
ggplot(data = price_vs_neighbourhooodGroup, aes(x = Neighbourhood_Group, y = Average_Price)) +
  geom_bar(stat="identity", fill = "steelblue4", colour = "black") + 
  ggtitle("Distribution of Price According to Neighbour Group")

It is observed that the average price per night is the highest in Manhattan, followed by Brooklyn, Staten Island, Queens and Bronx.

The average price per night according to room type is also calculated and plotted.

entire <- subset(airbnb, room_type == "Entire home/apt")
entire_avg <- sum(entire$price)/nrow(entire)

private <- subset(airbnb, room_type == "Private room")
private_avg <- sum(private$price)/nrow(private)

shared <- subset(airbnb, room_type == "Shared room")
shared_avg <- sum(shared$price)/nrow(shared)

price_vs_roomType <- data.frame(
  "Room_Type" = c("Entire Home/Apt", "Private Room", "Shared Room"),
  "Average_Price" = c(entire_avg, private_avg, shared_avg)
)

price_vs_roomType
##         Room_Type Average_Price
## 1 Entire Home/Apt     211.82368
## 2    Private Room      89.82255
## 3     Shared Room      70.19706
ggplot(data = price_vs_roomType, aes(x = Room_Type, y = Average_Price)) +
  geom_bar(stat="identity", fill = "steelblue4", colour = "black") + 
  ggtitle("Distribution of Price According to Room Type")

The entire home or apartment is observed to be the most expensive one, followed by the private room and shared room. This make sense as renting entire home or apartment means renting bigger space.

Now, the property type in each neighbourhood is also important to be visualized. It is observed that Manhattan has most entire home/apartment to be rented. This is also why the average price per night in Manhattan is high as compared to other neighbourhood.

ggplot(airbnb) + 
  geom_histogram(aes(neighbourhood_group, fill = room_type), stat = "count", position = 'fill') + 
  theme_minimal(base_size = 13)+ xlab("") + ylab("") + 
  ggtitle("The Proportion of Property Type in Each Area")

The distribution of price per night in each neighbourhood group is inspected.

airbnb_nh <- airbnb %>%
  group_by(neighbourhood_group) %>%
  summarise(price = round(mean(price), 2))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(airbnb, aes(price)) +
  geom_histogram(bins = 30, aes(y = ..density..), fill = "steelblue4") + 
  geom_density(alpha = 0.2, fill = "blue") +
  th +
  ggtitle("Transformed distribution of price\n by neighbourhood groups",
          subtitle = expression(~'log'[10] ~ "x-axis")) +
  geom_vline(data = airbnb_nh, aes(xintercept = price), size = 2, linetype = 3) +
  geom_text(data = airbnb_nh,y = 1.5, aes(x = price + 1400, label = paste("Mean  = $",price)), color = "red", size = 3) +
  facet_wrap(~neighbourhood_group) +
  scale_x_log10()

The price distribution based on room type and neighbourhood group is shown as below. It is observed that the price per night for entire home/apt is the highest in Manhattan. In Brooklyn, Queens and Bronx, the rental for private room is higher as compared to other room types.

airbnb %>% arrange(desc(price)) %>% top_n(10) %>% select(- host_name, -name) %>%  
  ggplot(aes(x = price, fill = neighbourhood_group)) +
  geom_histogram(bins = 50) +
  scale_x_log10() + 
  ggtitle("Summary of Price Distributions") +
  facet_wrap(~room_type + neighbourhood_group)
## Selecting by availability_365

Leaflet map

To further visualize the data, a leaflet map is used.

 pal <- colorFactor(palette = c("red", "green", "blue", "purple", "yellow"), domain = airbnb$neighbourhood_group)
 
 leaflet(data = airbnb) %>% addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>%  addCircleMarkers(~longitude, ~latitude, color = ~pal(neighbourhood_group), weight = 1, radius=1, fillOpacity = 0.1, opacity = 0.1,
                                                                                                        label = paste("Name:", airbnb$name)) %>% 
     addLegend("bottomright", pal = pal, values = ~neighbourhood_group,
     title = "Neighbourhood groups",
     opacity = 1
   )

Correlation Heatmap

A correlation heatmap is used to determine the correlation between the variables. 1 indicates strong positive correlation while -1 indicates strong negative correlation. It is observed that only number of review and review per month has a strong positive correlation as these two variables are fundamentally related to reviews.

airbnb_cor <- airbnb[, sapply(airbnb, is.numeric)]
airbnb_cor <- airbnb_cor[complete.cases(airbnb_cor), ]
correlation_matrix <- cor(airbnb_cor)
corrplot(correlation_matrix, method = "color")

# Train Test Split To perform predictive analysis, the data is split into train and test data with a ratio of 80% : 20%.

set.seed(45)
pd <- sample(2, nrow(airbnb), replace = TRUE, prob = c(0.8,0.2))
train <- airbnb[pd == 1,]#, here means all column 
test <- airbnb[pd == 2,]
cat("Number of train data: ", nrow(train), "\n")
## Number of train data:  39132
cat("Number of test data: ", nrow(test), "\n")
## Number of test data:  9715

A linear regression model is generated.

lr1 <- train(price ~ latitude + longitude + room_type + minimum_nights  + availability_365 + neighbourhood_group, data = train, method = "lm")
summary(lr1)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -246.2  -63.6  -24.9   15.0 9859.5 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -2.849e+04  3.662e+03  -7.779 7.47e-15 ***
## latitude                           -1.957e+02  3.566e+01  -5.488 4.10e-08 ***
## longitude                          -4.956e+02  4.099e+01 -12.089  < 2e-16 ***
## `room_typePrivate room`            -1.080e+02  2.459e+00 -43.915  < 2e-16 ***
## `room_typeShared room`             -1.431e+02  7.901e+00 -18.116  < 2e-16 ***
## minimum_nights                     -4.650e-02  6.028e-02  -0.771 0.440500    
## availability_365                    1.738e-01  9.211e-03  18.865  < 2e-16 ***
## neighbourhood_groupBrooklyn        -2.940e+01  9.966e+00  -2.950 0.003177 ** 
## neighbourhood_groupManhattan        3.172e+01  9.045e+00   3.506 0.000455 ***
## neighbourhood_groupQueens          -5.244e+00  9.617e+00  -0.545 0.585566    
## `neighbourhood_groupStaten Island` -1.442e+02  1.907e+01  -7.558 4.19e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 232.6 on 39121 degrees of freedom
## Multiple R-squared:  0.09587,    Adjusted R-squared:  0.09564 
## F-statistic: 414.8 on 10 and 39121 DF,  p-value: < 2.2e-16

The performance of the linear regression model is inspected using the plots. The Normal Q-Q plot of a linear regression model should be a straight line. However, it is not straight in this model. Therefore, this model is not good to be used for predictive analysis.

plot(lr1$finalModel)

To improve the model more, the outliers in the data needs to be removed. The top 10% of the price and the bottom 10% of price is removed as they contains the most outliers. Then, the price is log so that the output is not that compact. Also, more variables is added for prediction.

filtered <- train %>% filter(price < quantile(train$price, 0.9) & price > quantile(train$price, 0.1))
lr2 <- lm(log(price) ~ room_type + neighbourhood_group + latitude + longitude 
                        + number_of_reviews + availability_365
                       + reviews_per_month + 
                     calculated_host_listings_count + minimum_nights, data = filtered)
summary(lr2)
## 
## Call:
## lm(formula = log(price) ~ room_type + neighbourhood_group + latitude + 
##     longitude + number_of_reviews + availability_365 + reviews_per_month + 
##     calculated_host_listings_count + minimum_nights, data = filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19464 -0.23132 -0.01506  0.21387  1.62655 
## 
## Coefficients:
##                                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                      -1.206e+02  5.850e+00  -20.614  < 2e-16 ***
## room_typePrivate room            -5.370e-01  3.830e-03 -140.220  < 2e-16 ***
## room_typeShared room             -6.220e-01  1.666e-02  -37.334  < 2e-16 ***
## neighbourhood_groupBrooklyn      -4.348e-02  1.653e-02   -2.631  0.00851 ** 
## neighbourhood_groupManhattan      1.532e-01  1.515e-02   10.111  < 2e-16 ***
## neighbourhood_groupQueens         4.349e-02  1.603e-02    2.714  0.00666 ** 
## neighbourhood_groupStaten Island -5.487e-01  3.161e-02  -17.360  < 2e-16 ***
## latitude                         -5.443e-01  5.620e-02   -9.686  < 2e-16 ***
## longitude                        -1.996e+00  6.570e-02  -30.384  < 2e-16 ***
## number_of_reviews                -1.375e-04  5.125e-05   -2.683  0.00731 ** 
## availability_365                  3.637e-04  1.564e-05   23.250  < 2e-16 ***
## reviews_per_month                -7.620e-03  1.496e-03   -5.093 3.55e-07 ***
## calculated_host_listings_count    3.990e-04  6.548e-05    6.093 1.12e-09 ***
## minimum_nights                   -1.348e-03  9.489e-05  -14.203  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3274 on 31155 degrees of freedom
## Multiple R-squared:  0.4837, Adjusted R-squared:  0.4835 
## F-statistic:  2245 on 13 and 31155 DF,  p-value: < 2.2e-16

In the improved linear regression model. The Normal Q-Q plot is observed to have a straight line. This indicate that this model is far better than the first model.

plot(lr2)

To examine the performance of the model, predictions are made. It is observed that the predicted value is close to the actual value.

test <- test %>% filter(price < quantile(train$price, 0.9) & price > quantile(train$price, 0.1))
pred <- predict(lr2, newdata = test)
pred <- exp(pred)

RMSE_regression <- sqrt(mean( (test$price - pred)**2 ))

SSE <- sum((test$price - pred)**2)
SSR <- sum((pred - mean(test$price)) ** 2)
R2 <- 1 - SSE/(SSE + SSR)

actual <- test$price

lr_result <- data.frame(
  "Actual" = actual,
  "Predicted" = pred
)

head(lr_result, 20)
##    Actual Predicted
## 1      85  92.07526
## 2     215 143.51928
## 3     120 134.13480
## 4      80  78.29981
## 5      50  91.49867
## 6     110 139.94887
## 7     110 107.13800
## 8      80  89.17728
## 9      51  95.87432
## 10     65  91.43803
## 11    190 176.27856
## 12    200  94.90440
## 13     70  77.19218
## 14     98  81.70424
## 15    250 139.95385
## 16    140 158.06902
## 17     89  95.54870
## 18     98 107.10967
## 19    125 131.11391
## 20     60  90.78271