# Exploratory analysis and PCA of the NYC Airbnb 2019 listings data.
#
# Steps: install/load packages, read the CSV, print structural summaries,
# draw four descriptive plots, plot a correlation heatmap of the numeric
# columns, then run a scaled PCA and report the variance explained.

# Packages ----

# Install whichever packages are missing, then attach with library() so a
# failed load stops the script (require() would only return FALSE).
required_pkgs <- c("dplyr", "ggplot2", "corrplot")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(dplyr)
library(ggplot2)
library(corrplot)

# Data ----

df <- read.csv("AB_NYC_2019.csv", stringsAsFactors = FALSE)

dim(df)

# Show the structure of the data (variable types)
str(df)

head(df, 10)

# Select only numeric columns (excluding IDs)
numeric_vars <- df %>%
  select(
    price, minimum_nights, number_of_reviews, reviews_per_month,
    calculated_host_listings_count, availability_365
  )

summary(numeric_vars)

# Descriptive plots ----

# coord_cartesian() zooms the view without discarding rows; xlim()/ylim()
# would drop out-of-range observations before the statistics are computed.
ggplot(df, aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "steelblue", color = "white") +
  coord_cartesian(xlim = c(0, 1000)) + # Zooming in to make it readable
  labs(
    title = "Distribution of Airbnb Prices",
    x = "Price ($)",
    y = "Frequency"
  ) +
  theme_minimal()
# (Take a SCREENSHOT of the graph)

ggplot(df, aes(x = room_type, y = price, fill = room_type)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 500)) + # Limiting y-axis to see the boxes
  labs(
    title = "Price Distribution by Room Type",
    x = "Room Type",
    y = "Price"
  ) +
  theme_minimal()
# (Take a SCREENSHOT of the graph)

ggplot(df, aes(x = neighbourhood_group, fill = neighbourhood_group)) +
  geom_bar() +
  labs(
    title = "Listings Count by Neighbourhood Group",
    x = "Borough",
    y = "Count"
  ) +
  theme_minimal()
# (Take a SCREENSHOT of the graph)

ggplot(df, aes(x = number_of_reviews, y = price)) +
  geom_point(alpha = 0.4, color = "darkgreen") +
  coord_cartesian(ylim = c(0, 1000)) +
  labs(
    title = "Price vs. Number of Reviews",
    x = "Number of Reviews",
    y = "Price"
  ) +
  theme_minimal()
# (Take a SCREENSHOT of the graph)

# Correlation heatmap ----

# Calculate correlation matrix (remove missing values)
cor_matrix <- cor(numeric_vars, use = "complete.obs")

# The top margin leaves room for the title, which corrplot otherwise clips.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  title = "Correlation Heatmap",
  mar = c(0, 0, 2, 0)
)

# PCA ----

# Select numeric columns for PCA (including lat/long, excluding IDs)
pca_data <- df %>%
  select(
    latitude, longitude, price, minimum_nights, number_of_reviews,
    reviews_per_month, calculated_host_listings_count, availability_365
  ) %>%
  na.omit() # PCA cannot handle NAs

# scale. = TRUE standardises each column to unit variance before the PCA.
pca_result <- prcomp(pca_data, scale. = TRUE)

summary(pca_result)

# Calculate variance explained by each component and its share of the total
var_explained <- pca_result$sdev^2
prop_var <- var_explained / sum(var_explained)

plot(
  prop_var,
  type = "b",
  pch = 19,
  col = "blue",
  main = "Scree Plot",
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)

pca_table <- data.frame(
  PC = paste0("PC", seq_along(var_explained)),
  Variance_Percent = round(prop_var * 100, 2),
  Cumulative_Percent = round(cumsum(prop_var) * 100, 2)
)
print(head(pca_table, 10))

# Show the loadings for the first five PCs, or all of them if fewer exist
# (Take a SCREENSHOT of this output)
n_show <- min(5L, ncol(pca_result$rotation))
print(pca_result$rotation[, seq_len(n_show), drop = FALSE])