Including Plots
# Histogram of listing prices, split into 30 bins.
ggplot(data = lefted) +
  geom_histogram(mapping = aes(x = price), bins = 30, fill = "violet") +
  labs(
    y = "Count",
    x = "Price",
    title = "Price Distribution"
  ) +
  theme_minimal()

# Box plot of price for each room type; every room type gets its own fill.
ggplot(data = lefted) +
  geom_boxplot(mapping = aes(x = room_type, y = price, fill = room_type)) +
  labs(
    y = "Price",
    x = "Room Type",
    title = "Price Distribution by Room Type"
  ) +
  theme_minimal()

# Scatter plot of price against average rating with one least-squares trend
# line (no confidence band).
# avg_rating stays numeric: wrapping it in as.factor() makes the x axis
# discrete, so geom_smooth() would treat every rating level as its own group
# instead of fitting a single line across all listings.
ggplot(lefted, aes(x = avg_rating, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Mean listing price per neighborhood, drawn as one bar per neighborhood.
# na.rm = TRUE matches the other price summaries in this script, so a missing
# price cannot turn a whole neighborhood's mean into NA.
a <- lefted %>%
  group_by(neighborhood) %>%
  summarize(avg_price = mean(price, na.rm = TRUE))
# The title names the plotted quantity (average price); it previously repeated
# the title of the listing-count chart.
ggplot(a, aes(x = neighborhood, y = avg_price, fill = neighborhood)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average Price by Neighborhood",
    x = "Neighborhood",
    y = "average price "
  ) +
  theme_minimal()

# Number of listings in each neighborhood, drawn as one bar per neighborhood.
b <- lefted %>%
  group_by(neighborhood) %>%
  summarize(count = n())
ggplot(data = b) +
  geom_bar(
    mapping = aes(x = neighborhood, y = count, fill = neighborhood),
    stat = "identity"
  ) +
  labs(
    y = "number of listing ",
    x = "Neighborhood",
    title = "Count of Observations by Neighborhood"
  ) +
  theme_minimal()

# Heatmap of the mean price for every neighborhood / room type pair.
# The fill gradient runs from blue (lowest mean) to red (highest mean).
lefted %>%
  group_by(neighborhood, room_type) %>%
  summarise(avg_price = mean(price, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = neighborhood, y = room_type, fill = avg_price)) +
  geom_tile() +
  scale_fill_gradient(high = "red", low = "blue") +
  labs(
    fill = "Average Price",
    y = "Room Type",
    x = "Neighborhood",
    title = "Heatmap of Average Price by Neighborhood and Room Type"
  ) +
  theme_minimal()
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.

# Look up shared-room listings in Dupont Circle; the empty tibble printed
# below shows that this combination does not occur in the data.
lefted %>%
  group_by(neighborhood) %>%
  filter(neighborhood == "Dupont Circle", room_type == "Shared room")
## # A tibble: 0 × 15
## # Groups: neighborhood [0]
## # ℹ 15 variables: id <dbl>, neighborhood <chr>, host_since <chr>,
## # superhost <lgl>, host_acceptance_rate <dbl>, host_total_listings <dbl>,
## # room_type <chr>, accommodates <dbl>, bathrooms <chr>, bedrooms <dbl>,
## # beds <dbl>, price <dbl>, min_nights <dbl>, total_reviews <dbl>,
## # avg_rating <dbl>
# Mean price and standard deviation of price for each neighborhood / room
# type combination. The result stays grouped by neighborhood.
price_summary <- summarise(
  group_by(lefted, neighborhood, room_type),
  avg_price = mean(price, na.rm = TRUE),
  variability = sd(price, na.rm = TRUE)
)
## `summarise()` has grouped output by 'neighborhood'. You can override using the
## `.groups` argument.
# Rows of price_summary holding the extreme values of each statistic.
# which.max() / which.min() skip NA entries and return the first match.

# Combination with the highest mean price
highest_avg_price <- price_summary[which.max(price_summary[["avg_price"]]), ]
print(highest_avg_price)
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Foggy Bottom Private room 334 234.

# Combination with the lowest mean price
lowest_avg_price <- price_summary[which.min(price_summary[["avg_price"]]), ]
print(lowest_avg_price)
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Capitol Hill Shared room 35.1 12.4

# Combination whose prices vary the most
highest_variability <- price_summary[which.max(price_summary[["variability"]]), ]
print(highest_variability)
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Foggy Bottom Private room 334 234.

# Combination whose prices vary the least
lowest_variability <- price_summary[which.min(price_summary[["variability"]]), ]
print(lowest_variability)
## # A tibble: 1 × 4
## # Groups: neighborhood [1]
## neighborhood room_type avg_price variability
## <chr> <chr> <dbl> <dbl>
## 1 Capitol Hill Shared room 35.1 12.4
# Normal-approximation confidence interval for the proportion of superhosts:
# p_hat +/- z * sqrt(p_hat * (1 - p_hat) / n).
# qnorm(0.95) leaves 5% in each tail, so the two-sided interval below has 90%
# coverage. NOTE(review): use qnorm(0.975) if a 95% interval was intended.
p_hat <- mean(lefted$superhost, na.rm = TRUE)
# Count only listings with a known superhost status so the sample size matches
# the observations that went into p_hat (nrow() would also count NA rows).
n <- sum(!is.na(lefted$superhost))
z <- qnorm(0.95)
se <- sqrt(p_hat * (1 - p_hat) / n)
CI_lower <- p_hat - z * se
CI_lower
## [1] 0.4070315
CI_upper <- p_hat + z * se
CI_upper
## [1] 0.4462863
# One-sample, one-sided t-test of H0: mean(avg_rating) = 4 against
# H1: mean(avg_rating) > 4.
# t.test() has no na.rm argument; the previous na.rm = TRUE was swallowed by
# `...` and had no effect, so it is omitted here.
t_test <- t.test(lefted$avg_rating, mu = 4, alternative = "greater")
t_test
##
## One Sample t-test
##
## data: lefted$avg_rating
## t = 56.679, df = 1717, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 4
## 95 percent confidence interval:
## 4.699772 Inf
## sample estimates:
## mean of x
## 4.720698
# Histogram of listing prices using fixed-width bins of 10 price units.
ggplot(data = lefted, mapping = aes(x = price)) +
  geom_histogram(alpha = 0.7, fill = "blue", binwidth = 10) +
  labs(
    y = "Frequency",
    x = "Price",
    title = "Histogram of Prices"
  ) +
  theme_minimal()

# Correlation plot of the numeric variables that correlate strongly with price.

# Index vector of the numeric columns. vapply() always returns a logical
# vector, whereas sapply() returns a list for a zero-column input.
numericVars <- which(vapply(lefted, is.numeric, logical(1)))
numericVarNames <- names(numericVars) # saving names vector for use later on
cat("There are", length(numericVars), "numeric variables")
## There are 10 numeric variables
all_numVar <- lefted[, numericVars]
# Pairwise-complete correlations between all numeric variables
cor_numVar <- cor(all_numVar, use = "pairwise.complete.obs")
# Sort on decreasing correlation with price
cor_sorted <- as.matrix(sort(cor_numVar[, "price"], decreasing = TRUE))
# Keep only variables whose absolute correlation with price exceeds 0.5
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.5)))
# drop = FALSE keeps a matrix even if price is the only variable selected
cor_numVar <- cor_numVar[CorHigh, CorHigh, drop = FALSE]
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt")

# Two ordinary least-squares models for listing price.
# model1: price explained by rating, capacity, bathrooms and bedrooms.
# NOTE(review): bathrooms is a character column, so lm() dummy-codes it as a
# categorical predictor (one coefficient per label, e.g. "2.5 shared baths")
# rather than using a numeric bathroom count - confirm this is intended.
model1 <- lm(price ~ avg_rating + accommodates + bathrooms + bedrooms, data=lefted)
# model2: price explained by rating and number of reviews only.
model2 <- lm(price ~ avg_rating + total_reviews, data=lefted)
# Coefficient table and fit statistics for model1. Rows with a missing value
# in any model variable are dropped before fitting (261 rows, per the output).
summary(model1)
##
## Call:
## lm(formula = price ~ avg_rating + accommodates + bathrooms +
## bedrooms, data = lefted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -302.49 -51.16 -14.14 34.77 947.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.137 25.950 2.163 0.030688 *
## avg_rating 3.627 5.257 0.690 0.490353
## accommodates 14.010 2.460 5.694 1.50e-08 ***
## bathrooms1 private bath 45.505 28.748 1.583 0.113667
## bathrooms1 shared bath -28.341 75.327 -0.376 0.706795
## bathrooms1.5 baths 44.487 11.654 3.817 0.000141 ***
## bathrooms2 baths 38.287 10.556 3.627 0.000297 ***
## bathrooms2.5 baths 111.968 13.578 8.246 3.63e-16 ***
## bathrooms2.5 shared baths 515.019 106.589 4.832 1.50e-06 ***
## bathrooms3 baths 176.682 27.426 6.442 1.60e-10 ***
## bathrooms3.5 baths 106.195 23.898 4.444 9.52e-06 ***
## bathrooms4 baths 194.500 33.725 5.767 9.84e-09 ***
## bathrooms4.5 baths 53.961 77.639 0.695 0.487154
## bathrooms5 baths 203.638 63.972 3.183 0.001488 **
## bedrooms 38.778 6.710 5.779 9.20e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 106.4 on 1442 degrees of freedom
## (261 observations deleted due to missingness)
## Multiple R-squared: 0.4558, Adjusted R-squared: 0.4506
## F-statistic: 86.28 on 14 and 1442 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for model2.
summary(model2)
##
## Call:
## lm(formula = price ~ avg_rating + total_reviews, data = lefted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -206.99 -78.59 -33.88 32.95 1010.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 140.92751 29.76221 4.735 2.37e-06 ***
## avg_rating 15.80105 6.32495 2.498 0.0126 *
## total_reviews -0.13367 0.03111 -4.296 1.84e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 136.5 on 1715 degrees of freedom
## Multiple R-squared: 0.01262, Adjusted R-squared: 0.01147
## F-statistic: 10.96 on 2 and 1715 DF, p-value: 1.862e-05
# Price against average rating, with points shaded by rating.
# The stray trailing comma inside aes() (an empty extra argument) is removed.
# NOTE(review): the blue line is geom_smooth()'s own least-squares fit of
# price on avg_rating alone; it is not drawn from model1, despite the title.
ggplot(lefted, aes(x = avg_rating, y = price)) +
  geom_point(aes(color = avg_rating), alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Price vs Avg Rating (Model 1)", x = "Avg Rating", y = "Price") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Price against average rating, points shaded by rating, with a red
# least-squares line and no confidence band.
ggplot(data = lefted, mapping = aes(y = price, x = avg_rating)) +
  geom_point(mapping = aes(color = avg_rating), alpha = 0.5) +
  geom_smooth(color = "red", se = FALSE, method = "lm") +
  labs(
    y = "Price",
    x = "Avg Rating",
    title = "Price vs Avg Rating (Model 2)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Interactive 3D scatter: average rating (x), capacity (y) and price (z).
plot_ly(
  data = lefted,
  x = ~avg_rating,
  y = ~accommodates,
  z = ~price,
  type = "scatter3d",
  mode = "markers"
)
# Fit price on average rating and capacity, then draw the fitted surface on
# top of the raw listings in an interactive 3D plot.
model3 <- lm(price ~ avg_rating + accommodates, data = lefted)
# Regular 50 x 50 grid spanning the observed range of both predictors
grid_data <- expand.grid(
  avg_rating = seq(
    min(lefted$avg_rating, na.rm = TRUE),
    max(lefted$avg_rating, na.rm = TRUE),
    length.out = 50
  ),
  accommodates = seq(
    min(lefted$accommodates, na.rm = TRUE),
    max(lefted$accommodates, na.rm = TRUE),
    length.out = 50
  )
)
# Model predictions at every grid point
grid_data$predicted_price <- predict(model3, newdata = grid_data)
# Scatter of the listings plus the predicted surface as a mesh.
# color = I("blue") passes the colour through as a literal. A bare "blue" is
# treated as a one-level variable to be mapped onto a palette, which triggered
# the RColorBrewer "minimal value for n is 3" warnings.
plot_ly() %>%
  add_trace(
    data = lefted, x = ~avg_rating, y = ~accommodates, z = ~price,
    type = "scatter3d", mode = "markers",
    marker = list(size = 3, opacity = 0.5)
  ) %>%
  add_trace(
    data = grid_data, x = ~avg_rating, y = ~accommodates, z = ~predicted_price,
    type = "mesh3d", opacity = 0.5, color = I("blue")
  )
# Multivariate linear regression plot: static 3D scatter of the listings with
# the fitted regression plane of price on average rating and capacity.
model3 <- lm(price ~ avg_rating + accommodates, data = lefted)
# Create the scatterplot
s3d <- with(lefted, scatterplot3d(avg_rating, accommodates, price,
  pch = 20,
  xlab = "Average Rating", ylab = "Accommodates",
  zlab = "Price", angle = 45, color = "blue",
  main = "3D Scatterplot with Regression Plane"
))
# Add the regression plane straight from the fitted model; no prediction grid
# is needed. `grid` and `shade` are not plane3d() arguments: they were passed
# on to segments() and only produced "is not a graphical parameter" warnings.
# NOTE(review): recent scatterplot3d releases appear to offer
# draw_polygon = TRUE for a shaded plane - confirm against the installed
# version before using it.
s3d$plane3d(model3)

# Residuals-versus-fitted diagnostic for model 1; the red horizontal line
# marks a residual of zero.
model1_fitted <- fitted(model1)
model1_residuals <- residuals(model1)
ggplot() +
  geom_point(
    mapping = aes(y = model1_residuals, x = model1_fitted),
    alpha = 0.5
  ) +
  geom_hline(color = "red", yintercept = 0) +
  labs(
    y = "Residuals",
    x = "Fitted values",
    title = "Residuals vs Fitted (Model 1)"
  ) +
  theme_minimal()

# Residuals-versus-fitted diagnostic for model 2; the red horizontal line
# marks a residual of zero.
model2_fitted <- fitted(model2)
model2_residuals <- residuals(model2)
ggplot() +
  geom_point(
    mapping = aes(y = model2_residuals, x = model2_fitted),
    alpha = 0.5
  ) +
  geom_hline(color = "red", yintercept = 0) +
  labs(
    y = "Residuals",
    x = "Fitted values",
    title = "Residuals vs Fitted (Model 2)"
  ) +
  theme_minimal()

# Normal Q-Q plots of the residuals, each with a red reference line.

# Residuals of model 1
qqnorm(y = model1_residuals)
qqline(y = model1_residuals, col = "red")

# Residuals of model 2
qqnorm(y = model2_residuals)
qqline(y = model2_residuals, col = "red")
