knitr::opts_chunk$set(echo = TRUE)
## R Markdown
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(rpart)

listing <- read_csv("Listings.csv")
## Rows: 1718 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): neighborhood, host_since, room_type, bathrooms
## dbl (8): id, host_acceptance_rate, host_total_listings, accommodates, bedroo...
## lgl (1): superhost
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Reviews <- read_csv("Reviews.csv")
## Rows: 2790 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): listing_id, total_reviews, avg_rating
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Neighborhood <- read_excel("Data Dictionary.xlsx",sheet="Neighborhoods Information")

Combined <- left_join(listing, Reviews, by = join_by(id == listing_id))
complete <- left_join(Combined, Neighborhood, by = join_by(neighborhood == Name))

anyNA( Combined)
## [1] TRUE
apply( Combined, 2, anyNA)
##                   id         neighborhood           host_since 
##                FALSE                FALSE                FALSE 
##            superhost host_acceptance_rate  host_total_listings 
##                FALSE                 TRUE                FALSE 
##            room_type         accommodates            bathrooms 
##                 TRUE                FALSE                FALSE 
##             bedrooms                 beds                price 
##                 TRUE                FALSE                FALSE 
##           min_nights        total_reviews           avg_rating 
##                FALSE                FALSE                 TRUE
missing_row <- which(is.na( Combined$avg_rating))
 Combined$avg_rating[missing_row] <- mean( Combined$avg_rating, na.rm = TRUE)
 Combined$avg_rating = as.numeric( Combined$avg_rating)

Exploratory Analysis Q1 Distribution of prices

ggplot( Combined, aes(price)) + geom_histogram(bins = 30)

boxplot of room_type and price to see find out outliers and anomalies

#room type with 
ggplot( Combined, aes(x = room_type, y = price, fill = room_type)) + 
  geom_boxplot() +
  labs(title = "Price Distribution by Room Type", 
       x = "Room Type", 
       y = "Price") +
  theme_minimal()

Barplot of neighborhood broken down by price

a =  Combined %>% group_by(neighborhood) %>% summarize(avg_price =mean(price))

ggplot(a, aes(x = neighborhood, y = avg_price,fill=neighborhood)) +
  geom_bar(stat="identity") +
  labs(title = "Average Price of Observations by Neighborhood", 
       x = "Neighborhood", 
       y = "average price ") +
  theme_minimal()

Barplot of neighborhood listing to show us which neighborhood has more

b =  Combined %>% group_by(neighborhood) %>% summarize(count =n())

ggplot(b, aes(x = neighborhood, y = count,fill=neighborhood)) +
  geom_bar(stat="identity") +
  labs(title = "Count of Observations by Neighborhood", 
       x = "Neighborhood", 
       y = "number of listing ") +
  theme_minimal()

neighborhood and roomtype breakdown by price to show us more insights on which neighborhood has mor sharedroom and which neighborhood has expensive private apartment

 Combined %>% 
  group_by(neighborhood, room_type) %>%
  summarise(avg_price = mean(price, na.rm=TRUE)) %>%
  ggplot(aes(x=neighborhood, y=room_type, fill=avg_price)) +
  geom_tile() +
  scale_fill_gradient(low="blue", high="red") +
  labs(title="Heatmap of Average Price by Neighborhood and Room Type", x="Neighborhood", y="Room Type", fill="Average Price") +
  theme_minimal()
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.

To conduct an analysis of the variability of both neighborhood and room type, we identified the standard deviation and mean of the grouped variables. Within our sample, private rooms in Foggy Bottom had the highest average price and variability, with a mean of $334.00 and standard deviation of $234.00. Alternatively, shared rooms in Capitol Hill had the lowest average price and variability, with a mean of $35.10 and standard deviation of $12.00, respectively. These values provide insights into which combinations of neighborhood and room type are more affordable and consistently priced.

price_summary <-  Combined %>% 
  group_by(neighborhood, room_type) %>% 
  summarise(avg_price = mean(price, na.rm=TRUE),
            variability = sd(price, na.rm=TRUE))
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.
highest_avg_price <- price_summary[which.max(price_summary$avg_price), ]
highest_avg_price
## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type    avg_price variability
##   <chr>        <chr>            <dbl>       <dbl>
## 1 Foggy Bottom Private room       334        234.
lowest_avg_price <- price_summary[which.min(price_summary$avg_price), ]
lowest_avg_price
## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type   avg_price variability
##   <chr>        <chr>           <dbl>       <dbl>
## 1 Capitol Hill Shared room      35.1        12.4
highest_variability <- price_summary[which.max(price_summary$variability), ]
highest_variability
## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type    avg_price variability
##   <chr>        <chr>            <dbl>       <dbl>
## 1 Foggy Bottom Private room       334        234.
lowest_variability <- price_summary[which.min(price_summary$variability), ]
lowest_variability
## # A tibble: 1 × 4
## # Groups:   neighborhood [1]
##   neighborhood room_type   avg_price variability
##   <chr>        <chr>           <dbl>       <dbl>
## 1 Capitol Hill Shared room      35.1        12.4

To make an approximation of the population proportion of superhosts in Washington D.C., I computed a confidence interval using an alpha of 10%. Within our sample, the proportion of being a superhost in Washington D.C. is 42%. Multiplying the sample proportion and its complement by the number of observations returns a value greater than five. Therefore, we determined that the sample size was large enough for us to assume that the sampling distribution for proportions follows a normal distribution. With this condition met, we continued to calculate the margin of error, and ultimately, the upper and lower bounds of the confidence interval. We found that, 90% of the time, an interval of .4070 and .4463 will contain the true proportion of superhosts in Washington D.C. In other words, if we were to collect another sample of Airbnbs in Washington D.C., 90% of the time, we would find that less than 50% of hosts have earned this title.

n <- nrow( Combined)
p_hat <- mean( Combined$superhost, na.rm=TRUE)
z <- qnorm(0.95)
se <- sqrt(p_hat*(1-p_hat)/n)

CI_lower <- p_hat - z*se
CI_lower
## [1] 0.4070315
CI_upper <- p_hat + z*se
CI_upper
## [1] 0.4462863

testing if the population mean is greater than 4 based on the data

t_test <- t.test( Combined$avg_rating, mu=4, alternative="greater", na.rm=TRUE)
t_test
## 
##  One Sample t-test
## 
## data:  Combined$avg_rating
## t = 56.679, df = 1717, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 4
## 95 percent confidence interval:
##  4.699772      Inf
## sample estimates:
## mean of x 
##  4.720698

visualizing of price using histogram

ggplot( Combined, aes(x=price)) + 
  geom_histogram(binwidth=10, fill="blue", alpha=0.7) + 
  labs(title="Histogram of Prices", x="Price", y="Frequency") +
  theme_bw()

linear regression model

model1 <- lm(price ~ avg_rating, data= Combined)
model2 <- lm(beds ~ bedrooms +  accommodates + beds, data= Combined)
## Warning in model.matrix.default(mt, mf, contrasts): the response appeared on
## the right-hand side and was dropped
## Warning in model.matrix.default(mt, mf, contrasts): problem with term 3 in
## model.matrix: no columns are assigned
summary(model1)
## 
## Call:
## lm(formula = price ~ avg_rating, data = Combined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -198.09  -80.37  -33.09   32.34 1019.91 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  149.909     29.839   5.024 5.59e-07 ***
## avg_rating    11.637      6.282   1.852   0.0641 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 137.2 on 1716 degrees of freedom
## Multiple R-squared:  0.001996,   Adjusted R-squared:  0.001414 
## F-statistic: 3.432 on 1 and 1716 DF,  p-value: 0.06413
summary(model2)
## 
## Call:
## lm(formula = beds ~ bedrooms + accommodates + beds, data = Combined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0068 -0.2457 -0.1883  0.4005  3.4024 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.13086    0.03660   3.575 0.000361 ***
## bedrooms      0.35376    0.03083  11.475  < 2e-16 ***
## accommodates  0.35184    0.01367  25.736  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6023 on 1454 degrees of freedom
##   (261 observations deleted due to missingness)
## Multiple R-squared:  0.7228, Adjusted R-squared:  0.7224 
## F-statistic:  1895 on 2 and 1454 DF,  p-value: < 2.2e-16

Visualizing our model

ggplot( Combined, aes(x = avg_rating, y = price,)) +
  geom_point(aes(color = avg_rating), alpha = 0.5) +  
  geom_smooth(method = "lm", se = FALSE, color = "blue") + 
  labs(title = "Price vs Avg Rating (Model 1)", x = "Avg Rating", y = "Price") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

more detail for our regression which shows our prediction and the observed values for the above two models

model1_residuals <- residuals(model1)
model1_fitted <- fitted(model1)

ggplot() +
  geom_point(aes(x = model1_fitted, y = model1_residuals), alpha = 0.5) +
  geom_hline(yintercept = 0, color = "red") +
  labs(title = "Residuals vs Fitted (Model 1)", x = "Fitted values", y = "Residuals") +
  theme_minimal()

to see and gain insight by plottting visualising

# Model 1
qqnorm(model1_residuals)
qqline(model1_residuals, col = "red")

# Model 2
model2_residuals <- residuals(model2)
model2_fitted <- fitted(model2)

qqnorm(model2_residuals)
qqline(model2_residuals, col = "red")

# Fit the linear model

plotting collinearity to choose for regression in the above

numericVars <- which(sapply( Combined, is.numeric)) #index vector numeric variables
numericVarNames <- names(numericVars) #saving names vector for use later on

all_numVar <-  Combined[, numericVars]
cor_numVar <- cor(all_numVar, use="pairwise.complete.obs") #correlations of all numeric variables

#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'price'], decreasing = TRUE))
 #select only high corelations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>=0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")

Question 8 and also this is the 3D representation of the model which i found it to be cool but we dont have to submit it but feel free to interact with it

model3 <- lm(price ~ avg_rating + accommodates, data =  Combined)

# Define the grid over which you'll compute the predicted values
grid_data <- expand.grid(avg_rating = seq(min( Combined$avg_rating, na.rm = TRUE), 
                                          max( Combined$avg_rating, na.rm = TRUE), length.out = 50),
                         accommodates = seq(min( Combined$accommodates, na.rm = TRUE), 
                                            max( Combined$accommodates, na.rm = TRUE), length.out = 50))

# Get predictions over the grid
grid_data$predicted_price <- predict(model3, newdata = grid_data)

# Create the scatterplot and add the regression plane
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly() %>%
  add_trace(data =  Combined, x = ~avg_rating, y = ~accommodates, z = ~price, type = "scatter3d", mode = "markers", 
            marker = list(size = 3, opacity = 0.5)) %>%
  add_trace(data = grid_data, x = ~avg_rating, y = ~accommodates, z = ~predicted_price, type = "mesh3d", opacity = 0.5, color = 'blue')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

logarithmic regression

log_model <- lm(log(price) ~ avg_rating, data=Combined)
summary(log_model)
## 
## Call:
## lm(formula = log(price) ~ avg_rating, data = Combined)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.87115 -0.33568 -0.02025  0.30272  1.93940 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.99489    0.11946  41.813   <2e-16 ***
## avg_rating   0.03577    0.02515   1.422    0.155    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5492 on 1716 degrees of freedom
## Multiple R-squared:  0.001178,   Adjusted R-squared:  0.0005955 
## F-statistic: 2.023 on 1 and 1716 DF,  p-value: 0.1551

predict price with multiple predictors like avg_rating and accommodates, then:

log_model_multivariate <- lm(log(price) ~ avg_rating + accommodates, data=Combined)
summary(log_model_multivariate)
## 
## Call:
## lm(formula = log(price) ~ avg_rating + accommodates, data = Combined)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.02922 -0.26436  0.01304  0.26428  2.04823 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.5404946  0.1004293   45.21   <2e-16 ***
## avg_rating   0.0008374  0.0209006    0.04    0.968    
## accommodates 0.1574602  0.0056436   27.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4556 on 1715 degrees of freedom
## Multiple R-squared:  0.313,  Adjusted R-squared:  0.3122 
## F-statistic: 390.7 on 2 and 1715 DF,  p-value: < 2.2e-16

Fitting Decision Tree

tree_model <- rpart(price ~ avg_rating + accommodates, data=Combined, method="anova")

visualizing our decision tree

library(rpart.plot)
rpart.plot(tree_model, main="Decision Tree for Price Prediction")