knitr::opts_chunk$set(echo = TRUE)
## R Markdown
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(tmap)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, were retired in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(tmaptools)
library(leaflet)
library(leaflet.extras)
listing <- read_csv("Listings.csv")
## Rows: 1718 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): neighborhood, host_since, room_type, bathrooms
## dbl (8): id, host_acceptance_rate, host_total_listings, accommodates, bedroo...
## lgl (1): superhost
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Reviews <- read_csv("Reviews.csv")
## Rows: 2790 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): listing_id, total_reviews, avg_rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Neighborhood <- read_excel("Data Dictionary.xlsx",sheet="Neighborhoods Information")
lefted <- left_join(listing, Reviews, by = join_by(id == listing_id))
complete <- left_join(lefted, Neighborhood, by = join_by(neighborhood == Name))
anyNA(lefted)
## [1] TRUE
apply(lefted, 2, anyNA)
## id neighborhood host_since
## FALSE FALSE FALSE
## superhost host_acceptance_rate host_total_listings
## FALSE TRUE FALSE
## room_type accommodates bathrooms
## TRUE FALSE FALSE
## bedrooms beds price
## TRUE FALSE FALSE
## min_nights total_reviews avg_rating
## FALSE FALSE TRUE
missing_row <- which(is.na(lefted$avg_rating))
lefted$avg_rating[missing_row] <- mean(lefted$avg_rating, na.rm = TRUE)
neighborhoods_sf <- st_as_sf(Neighborhood, coords = c("Longitude (center)", "Latitude (center)"), crs = 4326)
listings_count <- lefted %>%
group_by(neighborhood) %>%
summarise(n_listings = n()) %>%
rename(Name = neighborhood)
neighborhoods_sf <- left_join(neighborhoods_sf, listings_count, by = "Name")
leaflet(neighborhoods_sf) %>%
addTiles() %>%
addHeatmap(lng = ~st_coordinates(geometry)[,1],
lat = ~st_coordinates(geometry)[,2],
intensity = ~n_listings,
blur = 20, max = max(neighborhoods_sf$n_listings, na.rm = TRUE), radius = 15)
# Create a color palette for our listing counts
pal <- colorNumeric(palette = "viridis", domain = neighborhoods_sf$n_listings)
leaflet(neighborhoods_sf) %>%
addTiles() %>%
addCircleMarkers(popup = ~Name,
color = ~pal(n_listings),
radius = 50,
stroke = FALSE,
fillOpacity = 0.7)
Including Plots
ggplot(lefted, aes(price)) + geom_histogram(bins = 30)

ggplot(lefted,aes(avg_rating))+geom_histogram(bins = 30)

boxplot(price~room_type, data=lefted)

ggplot(lefted,aes(as.factor(avg_rating),price))+geom_point()+geom_smooth(method=lm,se=F)
## `geom_smooth()` using formula = 'y ~ x'

a = lefted %>% group_by(neighborhood) %>% summarize(avg_price =mean(price))
ggplot(a, aes(x = neighborhood, y = avg_price)) +
geom_bar(stat="identity") +
labs(title = "Count of Observations by Neighborhood",
x = "Neighborhood",
y = "average price ") +
theme_minimal()

b = lefted %>% group_by(neighborhood) %>% summarize(count =n())
ggplot(b, aes(x = neighborhood, y = count)) +
geom_bar(stat="identity") +
labs(title = "Count of Observations by Neighborhood",
x = "Neighborhood",
y = "number of listing ") +
theme_minimal()

lefted %>%
group_by(neighborhood, room_type) %>%
summarise(avg_price = mean(price, na.rm=TRUE)) %>%
ggplot(aes(x=neighborhood, y=room_type, fill=avg_price)) +
geom_tile() +
scale_fill_gradient(low="blue", high="red") +
labs(title="Heatmap of Average Price by Neighborhood and Room Type", x="Neighborhood", y="Room Type", fill="Average Price") +
theme_minimal()
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.

lefted %>% group_by(neighborhood) %>% filter(room_type=="Shared room" & neighborhood =="Dupont Circle") #to make sure no shared room in Dupot circle
## # A tibble: 0 × 15
## # Groups: neighborhood [0]
## # ℹ 15 variables: id <dbl>, neighborhood <chr>, host_since <chr>,
## # superhost <lgl>, host_acceptance_rate <dbl>, host_total_listings <dbl>,
## # room_type <chr>, accommodates <dbl>, bathrooms <chr>, bedrooms <dbl>,
## # beds <dbl>, price <dbl>, min_nights <dbl>, total_reviews <dbl>,
## # avg_rating <dbl>
price_summary <- lefted %>%
group_by(neighborhood, room_type) %>%
summarise(avg_price = mean(price, na.rm=TRUE),
variability = sd(price, na.rm=TRUE))
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.
highest_avg_price <- price_summary[which.max(price_summary$avg_price), ]
highest_avg_price
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Foggy Bottom Private room 334 234.
lowest_avg_price <- price_summary[which.min(price_summary$avg_price), ]
lowest_avg_price
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Capitol Hill Shared room 35.1 12.4
highest_variability <- price_summary[which.max(price_summary$variability), ]
highest_variability
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Foggy Bottom Private room 334 234.
lowest_variability <- price_summary[which.min(price_summary$variability), ]
lowest_variability
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Capitol Hill Shared room 35.1 12.4
n <- nrow(lefted)
p_hat <- mean(lefted$superhost, na.rm=TRUE)
z <- qnorm(0.95)
se <- sqrt(p_hat*(1-p_hat)/n)
CI_lower <- p_hat - z*se
CI_lower
## [1] 0.4070315
CI_upper <- p_hat + z*se
CI_upper
## [1] 0.4462863
t_test <- t.test(lefted$avg_rating, mu=4, alternative="greater", na.rm=TRUE)
t_test
##
## One Sample t-test
##
## data: lefted$avg_rating
## t = 56.679, df = 1717, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 4
## 95 percent confidence interval:
## 4.699772 Inf
## sample estimates:
## mean of x
## 4.720698
ggplot(lefted, aes(x=price)) +
geom_histogram(binwidth=10, fill="blue", alpha=0.7) +
labs(title="Histogram of Prices", x="Price", y="Frequency") +
theme_minimal()

numericVars <- which(sapply(lefted, is.numeric)) #index vector numeric variables
numericVarNames <- names(numericVars) #saving names vector for use later on
cat('There are', length(numericVars), 'numeric variables')
## There are 10 numeric variables
## There are 37 numeric variables
all_numVar <- lefted[, numericVars]
cor_numVar <- cor(all_numVar, use="pairwise.complete.obs") #correlations of all numeric variables
#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'price'], decreasing = TRUE))
#select only high corelations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")

model1 <- lm(price ~ avg_rating + accommodates + bathrooms + bedrooms, data=lefted)
model2 <- lm(price ~ avg_rating + total_reviews, data=lefted)
summary(model1)
##
## Call:
## lm(formula = price ~ avg_rating + accommodates + bathrooms +
## bedrooms, data = lefted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -302.49 -51.16 -14.14 34.77 947.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.137 25.950 2.163 0.030688 *
## avg_rating 3.627 5.257 0.690 0.490353
## accommodates 14.010 2.460 5.694 1.50e-08 ***
## bathrooms1 private bath 45.505 28.748 1.583 0.113667
## bathrooms1 shared bath -28.341 75.327 -0.376 0.706795
## bathrooms1.5 baths 44.487 11.654 3.817 0.000141 ***
## bathrooms2 baths 38.287 10.556 3.627 0.000297 ***
## bathrooms2.5 baths 111.968 13.578 8.246 3.63e-16 ***
## bathrooms2.5 shared baths 515.019 106.589 4.832 1.50e-06 ***
## bathrooms3 baths 176.682 27.426 6.442 1.60e-10 ***
## bathrooms3.5 baths 106.195 23.898 4.444 9.52e-06 ***
## bathrooms4 baths 194.500 33.725 5.767 9.84e-09 ***
## bathrooms4.5 baths 53.961 77.639 0.695 0.487154
## bathrooms5 baths 203.638 63.972 3.183 0.001488 **
## bedrooms 38.778 6.710 5.779 9.20e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 106.4 on 1442 degrees of freedom
## (261 observations deleted due to missingness)
## Multiple R-squared: 0.4558, Adjusted R-squared: 0.4506
## F-statistic: 86.28 on 14 and 1442 DF, p-value: < 2.2e-16
summary(model2)
##
## Call:
## lm(formula = price ~ avg_rating + total_reviews, data = lefted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -206.99 -78.59 -33.88 32.95 1010.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 140.92751 29.76221 4.735 2.37e-06 ***
## avg_rating 15.80105 6.32495 2.498 0.0126 *
## total_reviews -0.13367 0.03111 -4.296 1.84e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 136.5 on 1715 degrees of freedom
## Multiple R-squared: 0.01262, Adjusted R-squared: 0.01147
## F-statistic: 10.96 on 2 and 1715 DF, p-value: 1.862e-05