# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load datasets
# All three CSV files are read from a single directory. The directory defaults
# to the original author's folder and can be overridden by setting the
# HOUSE_PRICES_DATA_DIR environment variable, so the script can run on another
# machine without editing every path.
data_dir <- Sys.getenv(
  "HOUSE_PRICES_DATA_DIR",
  unset = "/Users/aribarazzaq/Desktop/house-prices-advanced-regression-techniques"
)
zillow_file <- "Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv"
zillow_data <- read.csv(file.path(data_dir, zillow_file))
train_data <- read.csv(file.path(data_dir, "train.csv"))
test_data <- read.csv(file.path(data_dir, "test.csv"))
# Step 1: Transform Zillow dataset
# Keep the region name, the state name, and the `X2024.10.31` index column,
# renaming them to `Neighborhood` and `ZHVI_2024` to align with Kaggle's schema
zillow_transformed <- zillow_data %>%
  select(RegionName, StateName, X2024.10.31) %>%
  rename(Neighborhood = RegionName, ZHVI_2024 = X2024.10.31)
# Step 2: Filter Zillow data for Ames, IA
# Keep only the rows whose region is exactly "Ames, IA"
ames_zillow <- zillow_transformed %>%
  filter(Neighborhood == "Ames, IA")
# Fail loudly when the region is absent: indexing an empty column with [1]
# would otherwise yield NA and silently fill every row of the Kaggle data.
if (nrow(ames_zillow) == 0) {
  stop("No row with RegionName \"Ames, IA\" found in the Zillow data.",
       call. = FALSE)
}
# Step 3: Extract ZHVI value for Ames
# Take the first matching row's index value as a single number for mapping
ames_zhvi_value <- ames_zillow$ZHVI_2024[1]
if (is.na(ames_zhvi_value)) {
  stop("The Zillow index value for \"Ames, IA\" is missing (NA).",
       call. = FALSE)
}
# Step 4: Add ZHVI to Kaggle Datasets
# Every row of both Kaggle data frames receives the same Ames-wide index
# value in a ZHVI_2024 column, regardless of its neighborhood.
train_data <- dplyr::mutate(train_data, ZHVI_2024 = ames_zhvi_value)
test_data <- dplyr::mutate(test_data, ZHVI_2024 = ames_zhvi_value)
# Step 5: Verify the Updated Datasets
# Keep the Neighborhood and ZHVI_2024 columns of the leading rows (at most
# six) of each data frame so the new column can be inspected.
train_data_updated <- head(
  dplyr::select(train_data, Neighborhood, ZHVI_2024),
  n = 6L
)
test_data_updated <- head(
  dplyr::select(test_data, Neighborhood, ZHVI_2024),
  n = 6L
)
# Step 6: Exploratory Visualizations
# Plot SalePrice against ZHVI_2024 for the training data
library(ggplot2)
# Scatter plot: ZHVI_2024 vs. SalePrice
# NOTE(review): ZHVI_2024 is filled with one Ames-wide value earlier in this
# script, so the points presumably all share a single x position - confirm
# that this plot is still wanted.
scatter_plot <- ggplot(
  data = train_data,
  mapping = aes(x = ZHVI_2024, y = SalePrice)
) +
  geom_point(color = "blue", alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Scatter Plot: ZHVI_2024 vs SalePrice",
    x = "ZHVI (Zillow Home Value Index)",
    y = "Sale Price"
  )
print(scatter_plot)

# Histogram of SalePrice
# Thirty bins, sky-blue bars with black outlines, drawn on the minimal theme.
histogram <- ggplot(data = train_data, mapping = aes(x = SalePrice)) +
  geom_histogram(bins = 30, color = "black", fill = "skyblue") +
  theme_minimal() +
  labs(
    title = "Distribution of Sale Prices",
    x = "Sale Price",
    y = "Frequency"
  )
print(histogram)

# Step 7: Statistical Analysis
# Correlation between ZHVI_2024 and SalePrice
# A correlation is only defined when the predictor varies. ZHVI_2024 is filled
# with a single value earlier in this script, which made cor() warn that "the
# standard deviation is zero" and return NA. Detect that case up front and
# report it explicitly instead of relying on the warning.
paired_rows <- complete.cases(train_data$ZHVI_2024, train_data$SalePrice)
zhvi_is_constant <- length(unique(train_data$ZHVI_2024[paired_rows])) < 2
if (zhvi_is_constant) {
  message(
    "ZHVI_2024 has fewer than two distinct values; its correlation with ",
    "SalePrice is undefined and its regression slope cannot be estimated."
  )
  correlation <- NA_real_
} else {
  correlation <- cor(
    train_data$ZHVI_2024,
    train_data$SalePrice,
    use = "complete.obs"
  )
}
print(paste("Correlation between ZHVI_2024 and SalePrice:", correlation))
## [1] "Correlation between ZHVI_2024 and SalePrice: NA"
# Linear model: SalePrice vs. ZHVI_2024
# The model is still fitted so that lm_model and summary_lm stay available;
# with a constant ZHVI_2024 the slope is reported as NA ("not defined because
# of singularities") and only the intercept is estimated.
lm_model <- lm(SalePrice ~ ZHVI_2024, data = train_data)
summary_lm <- summary(lm_model)
print(summary_lm)
##
## Call:
## lm(formula = SalePrice ~ ZHVI_2024, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146021 -50946 -17921 33079 574079
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180921 2079 87.02 <2e-16 ***
## ZHVI_2024 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79440 on 1459 degrees of freedom
# Step 8: Save Outputs for Reproducibility
# Write both augmented data frames to CSV files in the home directory,
# leaving out the row-name column.
write.csv(x = train_data, file = "~/updated_train_data.csv", row.names = FALSE)
write.csv(x = test_data, file = "~/updated_test_data.csv", row.names = FALSE)
# Display results
# Print a header line, then the training preview built in Step 5.
"Updated Train Data" %>% print()
## [1] "Updated Train Data"
train_data_updated %>% print()
## Neighborhood ZHVI_2024
## 1 CollgCr 240503.5
## 2 Veenker 240503.5
## 3 CollgCr 240503.5
## 4 Crawfor 240503.5
## 5 NoRidge 240503.5
## 6 Mitchel 240503.5
# Print a header line, then the test preview built in Step 5.
"Updated Test Data" %>% print()
## [1] "Updated Test Data"
test_data_updated %>% print()
## Neighborhood ZHVI_2024
## 1 NAmes 240503.5
## 2 NAmes 240503.5
## 3 Gilbert 240503.5
## 4 Gilbert 240503.5
## 5 StoneBr 240503.5
## 6 Gilbert 240503.5