AirBnB House Predicting and exploratory analysis

knitr::opts_chunk$set(echo = TRUE)

## R Markdown
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)
library(sf)

## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE

library(tmap)

## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, were retired in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')

library(tmaptools)
library(leaflet)
library(leaflet.extras)

listing <- read_csv("Listings.csv")

## Rows: 1718 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): neighborhood, host_since, room_type, bathrooms
## dbl (8): id, host_acceptance_rate, host_total_listings, accommodates, bedroo...
## lgl (1): superhost
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Reviews <- read_csv("Reviews.csv")

## Rows: 2790 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): listing_id, total_reviews, avg_rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Neighborhood <- read_excel("Data Dictionary.xlsx",sheet="Neighborhoods Information")

lefted <- left_join(listing, Reviews, by = join_by(id == listing_id))
complete <- left_join(lefted, Neighborhood, by = join_by(neighborhood == Name))

anyNA(lefted)

## [1] TRUE

apply(lefted, 2, anyNA)

##                   id         neighborhood           host_since 
##                FALSE                FALSE                FALSE 
##            superhost host_acceptance_rate  host_total_listings 
##                FALSE                 TRUE                FALSE 
##            room_type         accommodates            bathrooms 
##                 TRUE                FALSE                FALSE 
##             bedrooms                 beds                price 
##                 TRUE                FALSE                FALSE 
##           min_nights        total_reviews           avg_rating 
##                FALSE                FALSE                 TRUE

missing_row <- which(is.na(lefted$avg_rating))
lefted$avg_rating[missing_row] <- mean(lefted$avg_rating, na.rm = TRUE)

neighborhoods_sf <- st_as_sf(Neighborhood, coords = c("Longitude (center)", "Latitude (center)"), crs = 4326)

listings_count <- lefted %>%
  group_by(neighborhood) %>%
  summarise(n_listings = n()) %>%
  rename(Name = neighborhood)

neighborhoods_sf <- left_join(neighborhoods_sf, listings_count, by = "Name")


leaflet(neighborhoods_sf) %>%
  addTiles() %>%
  addHeatmap(lng = ~st_coordinates(geometry)[,1], 
             lat = ~st_coordinates(geometry)[,2], 
             intensity = ~n_listings, 
             blur = 20, max = max(neighborhoods_sf$n_listings, na.rm = TRUE), radius = 15)

# Create a color palette for our listing counts
pal <- colorNumeric(palette = "viridis", domain = neighborhoods_sf$n_listings)

leaflet(neighborhoods_sf) %>%
  addTiles() %>%
  addCircleMarkers(popup = ~Name, 
                   color = ~pal(n_listings), 
                   radius = 50, 
                   stroke = FALSE, 
                   fillOpacity = 0.7)

Including Plots

ggplot(lefted, aes(price)) + geom_histogram(bins = 30)

ggplot(lefted,aes(avg_rating))+geom_histogram(bins = 30)

boxplot(price~room_type, data=lefted)

ggplot(lefted,aes(as.factor(avg_rating),price))+geom_point()+geom_smooth(method=lm,se=F)

## `geom_smooth()` using formula = 'y ~ x'

a = lefted %>% group_by(neighborhood) %>% summarize(avg_price =mean(price))

ggplot(a, aes(x = neighborhood, y = avg_price)) +
  geom_bar(stat="identity") +
  labs(title = "Count of Observations by Neighborhood", 
       x = "Neighborhood", 
       y = "average price ") +
  theme_minimal()

b = lefted %>% group_by(neighborhood) %>% summarize(count =n())

ggplot(b, aes(x = neighborhood, y = count)) +
  geom_bar(stat="identity") +
  labs(title = "Count of Observations by Neighborhood", 
       x = "Neighborhood", 
       y = "number of listing ") +
  theme_minimal()

lefted %>% 
  group_by(neighborhood, room_type) %>%
  summarise(avg_price = mean(price, na.rm=TRUE)) %>%
  ggplot(aes(x=neighborhood, y=room_type, fill=avg_price)) +
  geom_tile() +
  scale_fill_gradient(low="blue", high="red") +
  labs(title="Heatmap of Average Price by Neighborhood and Room Type", x="Neighborhood", y="Room Type", fill="Average Price") +
  theme_minimal()

## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.

lefted %>% group_by(neighborhood) %>% filter(room_type=="Shared room" & neighborhood =="Dupont Circle") #to make sure no shared room in Dupot circle

## # A tibble: 0 × 15
## # Groups:   neighborhood [0]
## # ℹ 15 variables: id <dbl>, neighborhood <chr>, host_since <chr>,
## #   superhost <lgl>, host_acceptance_rate <dbl>, host_total_listings <dbl>,
## #   room_type <chr>, accommodates <dbl>, bathrooms <chr>, bedrooms <dbl>,
## #   beds <dbl>, price <dbl>, min_nights <dbl>, total_reviews <dbl>,
## #   avg_rating <dbl>

price_summary <- lefted %>% 
  group_by(neighborhood, room_type) %>% 
  summarise(avg_price = mean(price, na.rm=TRUE),
            variability = sd(price, na.rm=TRUE))

## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.

highest_avg_price <- price_summary[which.max(price_summary$avg_price), ]
highest_avg_price

## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type    avg_price variability
##   <chr>        <chr>            <dbl>       <dbl>
## 1 Foggy Bottom Private room       334        234.

lowest_avg_price <- price_summary[which.min(price_summary$avg_price), ]
lowest_avg_price

## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type   avg_price variability
##   <chr>        <chr>           <dbl>       <dbl>
## 1 Capitol Hill Shared room      35.1        12.4

highest_variability <- price_summary[which.max(price_summary$variability), ]
highest_variability

## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type    avg_price variability
##   <chr>        <chr>            <dbl>       <dbl>
## 1 Foggy Bottom Private room       334        234.

lowest_variability <- price_summary[which.min(price_summary$variability), ]
lowest_variability

## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type   avg_price variability
##   <chr>        <chr>           <dbl>       <dbl>
## 1 Capitol Hill Shared room      35.1        12.4

n <- nrow(lefted)
p_hat <- mean(lefted$superhost, na.rm=TRUE)
z <- qnorm(0.95)
se <- sqrt(p_hat*(1-p_hat)/n)

CI_lower <- p_hat - z*se
CI_lower

## [1] 0.4070315

CI_upper <- p_hat + z*se
CI_upper

## [1] 0.4462863

t_test <- t.test(lefted$avg_rating, mu=4, alternative="greater", na.rm=TRUE)
t_test

## 
##  One Sample t-test
## 
## data:  lefted$avg_rating
## t = 56.679, df = 1717, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 4
## 95 percent confidence interval:
##  4.699772      Inf
## sample estimates:
## mean of x 
##  4.720698

ggplot(lefted, aes(x=price)) + 
  geom_histogram(binwidth=10, fill="blue", alpha=0.7) + 
  labs(title="Histogram of Prices", x="Price", y="Frequency") +
  theme_minimal()

numericVars <- which(sapply(lefted, is.numeric)) #index vector numeric variables
numericVarNames <- names(numericVars) #saving names vector for use later on
cat('There are', length(numericVars), 'numeric variables')

## There are 10 numeric variables

## There are 37 numeric variables
all_numVar <- lefted[, numericVars]
cor_numVar <- cor(all_numVar, use="pairwise.complete.obs") #correlations of all numeric variables

#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'price'], decreasing = TRUE))
 #select only high corelations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
library(corrplot)

## corrplot 0.92 loaded

corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")

model1 <- lm(price ~ avg_rating + accommodates + bathrooms + bedrooms, data=lefted)
model2 <- lm(price ~ avg_rating + total_reviews, data=lefted)

summary(model1)

## 
## Call:
## lm(formula = price ~ avg_rating + accommodates + bathrooms + 
##     bedrooms, data = lefted)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -302.49  -51.16  -14.14   34.77  947.65 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 56.137     25.950   2.163 0.030688 *  
## avg_rating                   3.627      5.257   0.690 0.490353    
## accommodates                14.010      2.460   5.694 1.50e-08 ***
## bathrooms1 private bath     45.505     28.748   1.583 0.113667    
## bathrooms1 shared bath     -28.341     75.327  -0.376 0.706795    
## bathrooms1.5 baths          44.487     11.654   3.817 0.000141 ***
## bathrooms2 baths            38.287     10.556   3.627 0.000297 ***
## bathrooms2.5 baths         111.968     13.578   8.246 3.63e-16 ***
## bathrooms2.5 shared baths  515.019    106.589   4.832 1.50e-06 ***
## bathrooms3 baths           176.682     27.426   6.442 1.60e-10 ***
## bathrooms3.5 baths         106.195     23.898   4.444 9.52e-06 ***
## bathrooms4 baths           194.500     33.725   5.767 9.84e-09 ***
## bathrooms4.5 baths          53.961     77.639   0.695 0.487154    
## bathrooms5 baths           203.638     63.972   3.183 0.001488 ** 
## bedrooms                    38.778      6.710   5.779 9.20e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 106.4 on 1442 degrees of freedom
##   (261 observations deleted due to missingness)
## Multiple R-squared:  0.4558, Adjusted R-squared:  0.4506 
## F-statistic: 86.28 on 14 and 1442 DF,  p-value: < 2.2e-16

summary(model2)

## 
## Call:
## lm(formula = price ~ avg_rating + total_reviews, data = lefted)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -206.99  -78.59  -33.88   32.95 1010.74 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   140.92751   29.76221   4.735 2.37e-06 ***
## avg_rating     15.80105    6.32495   2.498   0.0126 *  
## total_reviews  -0.13367    0.03111  -4.296 1.84e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 136.5 on 1715 degrees of freedom
## Multiple R-squared:  0.01262,    Adjusted R-squared:  0.01147 
## F-statistic: 10.96 on 2 and 1715 DF,  p-value: 1.862e-05

AirBnB House Predicting and exploratory analysis

2023-10-06

Including Plots