knitr::opts_chunk$set(echo = TRUE)
## R Markdown
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(rpart)
listing <- read_csv("Listings.csv")
## Rows: 1718 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): neighborhood, host_since, room_type, bathrooms
## dbl (8): id, host_acceptance_rate, host_total_listings, accommodates, bedroo...
## lgl (1): superhost
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Reviews <- read_csv("Reviews.csv")
## Rows: 2790 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): listing_id, total_reviews, avg_rating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Neighborhood <- read_excel("Data Dictionary.xlsx",sheet="Neighborhoods Information")
Combined <- left_join(listing, Reviews, by = join_by(id == listing_id))
complete <- left_join(Combined, Neighborhood, by = join_by(neighborhood == Name))
anyNA( Combined)
## [1] TRUE
apply( Combined, 2, anyNA)
## id neighborhood host_since
## FALSE FALSE FALSE
## superhost host_acceptance_rate host_total_listings
## FALSE TRUE FALSE
## room_type accommodates bathrooms
## TRUE FALSE FALSE
## bedrooms beds price
## TRUE FALSE FALSE
## min_nights total_reviews avg_rating
## FALSE FALSE TRUE
missing_row <- which(is.na( Combined$avg_rating))
Combined$avg_rating[missing_row] <- mean( Combined$avg_rating, na.rm = TRUE)
Combined$avg_rating = as.numeric( Combined$avg_rating)
Exploratory Analysis Q1 Distribution of prices
ggplot( Combined, aes(price)) + geom_histogram(bins = 30)
boxplot of room_type and price to see find out outliers and
anomalies
#room type with
ggplot( Combined, aes(x = room_type, y = price, fill = room_type)) +
geom_boxplot() +
labs(title = "Price Distribution by Room Type",
x = "Room Type",
y = "Price") +
theme_minimal()
Barplot of neighborhood broken down by price
a = Combined %>% group_by(neighborhood) %>% summarize(avg_price =mean(price))
ggplot(a, aes(x = neighborhood, y = avg_price,fill=neighborhood)) +
geom_bar(stat="identity") +
labs(title = "Average Price of Observations by Neighborhood",
x = "Neighborhood",
y = "average price ") +
theme_minimal()
Barplot of neighborhood listing to show us which neighborhood has more
b = Combined %>% group_by(neighborhood) %>% summarize(count =n())
ggplot(b, aes(x = neighborhood, y = count,fill=neighborhood)) +
geom_bar(stat="identity") +
labs(title = "Count of Observations by Neighborhood",
x = "Neighborhood",
y = "number of listing ") +
theme_minimal()
neighborhood and roomtype breakdown by price to show us more insights on
which neighborhood has mor sharedroom and which neighborhood has
expensive private apartment
Combined %>%
group_by(neighborhood, room_type) %>%
summarise(avg_price = mean(price, na.rm=TRUE)) %>%
ggplot(aes(x=neighborhood, y=room_type, fill=avg_price)) +
geom_tile() +
scale_fill_gradient(low="blue", high="red") +
labs(title="Heatmap of Average Price by Neighborhood and Room Type", x="Neighborhood", y="Room Type", fill="Average Price") +
theme_minimal()
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.
To conduct an analysis of the variability of both neighborhood and room
type, we identified the standard deviation and mean of the grouped
variables. Within our sample, private rooms in Foggy Bottom had the
highest average price and variability, with a mean of $334.00 and
standard deviation of $234.00. Alternatively, shared rooms in Capitol
Hill had the lowest average price and variability, with a mean of $35.10
and standard deviation of $12.00, respectively. These values provide
insights into which combinations of neighborhood and room type are more
affordable and consistently priced.
price_summary <- Combined %>%
group_by(neighborhood, room_type) %>%
summarise(avg_price = mean(price, na.rm=TRUE),
variability = sd(price, na.rm=TRUE))
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.
highest_avg_price <- price_summary[which.max(price_summary$avg_price), ]
highest_avg_price
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Foggy Bottom Private room 334 234.
lowest_avg_price <- price_summary[which.min(price_summary$avg_price), ]
lowest_avg_price
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Capitol Hill Shared room 35.1 12.4
highest_variability <- price_summary[which.max(price_summary$variability), ]
highest_variability
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Foggy Bottom Private room 334 234.
lowest_variability <- price_summary[which.min(price_summary$variability), ]
lowest_variability
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Capitol Hill Shared room 35.1 12.4
To make an approximation of the population proportion of superhosts in Washington D.C., I computed a confidence interval using an alpha of 10%. Within our sample, the proportion of being a superhost in Washington D.C. is 42%. Multiplying the sample proportion and its complement by the number of observations returns a value greater than five. Therefore, we determined that the sample size was large enough for us to assume that the sampling distribution for proportions follows a normal distribution. With this condition met, we continued to calculate the margin of error, and ultimately, the upper and lower bounds of the confidence interval. We found that, 90% of the time, an interval of .4070 and .4463 will contain the true proportion of superhosts in Washington D.C. In other words, if we were to collect another sample of Airbnbs in Washington D.C., 90% of the time, we would find that less than 50% of hosts have earned this title.
n <- nrow( Combined)
p_hat <- mean( Combined$superhost, na.rm=TRUE)
z <- qnorm(0.95)
se <- sqrt(p_hat*(1-p_hat)/n)
CI_lower <- p_hat - z*se
CI_lower
## [1] 0.4070315
CI_upper <- p_hat + z*se
CI_upper
## [1] 0.4462863
testing if the population mean is greater than 4 based on the data
t_test <- t.test( Combined$avg_rating, mu=4, alternative="greater", na.rm=TRUE)
t_test
##
## One Sample t-test
##
## data: Combined$avg_rating
## t = 56.679, df = 1717, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 4
## 95 percent confidence interval:
## 4.699772 Inf
## sample estimates:
## mean of x
## 4.720698
visualizing of price using histogram
ggplot( Combined, aes(x=price)) +
geom_histogram(binwidth=10, fill="blue", alpha=0.7) +
labs(title="Histogram of Prices", x="Price", y="Frequency") +
theme_bw()
linear regression model
model1 <- lm(price ~ avg_rating, data= Combined)
model2 <- lm(beds ~ bedrooms + accommodates + beds, data= Combined)
## Warning in model.matrix.default(mt, mf, contrasts): the response appeared on
## the right-hand side and was dropped
## Warning in model.matrix.default(mt, mf, contrasts): problem with term 3 in
## model.matrix: no columns are assigned
summary(model1)
##
## Call:
## lm(formula = price ~ avg_rating, data = Combined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -198.09 -80.37 -33.09 32.34 1019.91
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.909 29.839 5.024 5.59e-07 ***
## avg_rating 11.637 6.282 1.852 0.0641 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 137.2 on 1716 degrees of freedom
## Multiple R-squared: 0.001996, Adjusted R-squared: 0.001414
## F-statistic: 3.432 on 1 and 1716 DF, p-value: 0.06413
summary(model2)
##
## Call:
## lm(formula = beds ~ bedrooms + accommodates + beds, data = Combined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0068 -0.2457 -0.1883 0.4005 3.4024
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13086 0.03660 3.575 0.000361 ***
## bedrooms 0.35376 0.03083 11.475 < 2e-16 ***
## accommodates 0.35184 0.01367 25.736 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6023 on 1454 degrees of freedom
## (261 observations deleted due to missingness)
## Multiple R-squared: 0.7228, Adjusted R-squared: 0.7224
## F-statistic: 1895 on 2 and 1454 DF, p-value: < 2.2e-16
Visualizing our model
ggplot( Combined, aes(x = avg_rating, y = price,)) +
geom_point(aes(color = avg_rating), alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Price vs Avg Rating (Model 1)", x = "Avg Rating", y = "Price") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
more detail for our regression which shows our prediction and the observed values for the above two models
model1_residuals <- residuals(model1)
model1_fitted <- fitted(model1)
ggplot() +
geom_point(aes(x = model1_fitted, y = model1_residuals), alpha = 0.5) +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residuals vs Fitted (Model 1)", x = "Fitted values", y = "Residuals") +
theme_minimal()
to see and gain insight by plottting visualising
# Model 1
qqnorm(model1_residuals)
qqline(model1_residuals, col = "red")
# Model 2
model2_residuals <- residuals(model2)
model2_fitted <- fitted(model2)
qqnorm(model2_residuals)
qqline(model2_residuals, col = "red")
# Fit the linear model
plotting collinearity to choose for regression in the above
numericVars <- which(sapply( Combined, is.numeric)) #index vector numeric variables
numericVarNames <- names(numericVars) #saving names vector for use later on
all_numVar <- Combined[, numericVars]
cor_numVar <- cor(all_numVar, use="pairwise.complete.obs") #correlations of all numeric variables
#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'price'], decreasing = TRUE))
#select only high corelations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>=0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")
Question 8 and also this is the 3D representation of the model which i
found it to be cool but we dont have to submit it but feel free to
interact with it
model3 <- lm(price ~ avg_rating + accommodates, data = Combined)
# Define the grid over which you'll compute the predicted values
grid_data <- expand.grid(avg_rating = seq(min( Combined$avg_rating, na.rm = TRUE),
max( Combined$avg_rating, na.rm = TRUE), length.out = 50),
accommodates = seq(min( Combined$accommodates, na.rm = TRUE),
max( Combined$accommodates, na.rm = TRUE), length.out = 50))
# Get predictions over the grid
grid_data$predicted_price <- predict(model3, newdata = grid_data)
# Create the scatterplot and add the regression plane
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly() %>%
add_trace(data = Combined, x = ~avg_rating, y = ~accommodates, z = ~price, type = "scatter3d", mode = "markers",
marker = list(size = 3, opacity = 0.5)) %>%
add_trace(data = grid_data, x = ~avg_rating, y = ~accommodates, z = ~predicted_price, type = "mesh3d", opacity = 0.5, color = 'blue')
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
logarithmic regression
log_model <- lm(log(price) ~ avg_rating, data=Combined)
summary(log_model)
##
## Call:
## lm(formula = log(price) ~ avg_rating, data = Combined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.87115 -0.33568 -0.02025 0.30272 1.93940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.99489 0.11946 41.813 <2e-16 ***
## avg_rating 0.03577 0.02515 1.422 0.155
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5492 on 1716 degrees of freedom
## Multiple R-squared: 0.001178, Adjusted R-squared: 0.0005955
## F-statistic: 2.023 on 1 and 1716 DF, p-value: 0.1551
predict price with multiple predictors like avg_rating and accommodates, then:
log_model_multivariate <- lm(log(price) ~ avg_rating + accommodates, data=Combined)
summary(log_model_multivariate)
##
## Call:
## lm(formula = log(price) ~ avg_rating + accommodates, data = Combined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.02922 -0.26436 0.01304 0.26428 2.04823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5404946 0.1004293 45.21 <2e-16 ***
## avg_rating 0.0008374 0.0209006 0.04 0.968
## accommodates 0.1574602 0.0056436 27.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4556 on 1715 degrees of freedom
## Multiple R-squared: 0.313, Adjusted R-squared: 0.3122
## F-statistic: 390.7 on 2 and 1715 DF, p-value: < 2.2e-16
Fitting Decision Tree
tree_model <- rpart(price ~ avg_rating + accommodates, data=Combined, method="anova")
visualizing our decision tree
library(rpart.plot)
rpart.plot(tree_model, main="Decision Tree for Price Prediction")