# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load datasets
# All three CSV files are read from one directory. It defaults to the path
# used originally and can be overridden per machine by setting the
# HOUSE_PRICES_DATA_DIR environment variable, so the script is not tied to a
# single user's Desktop.
default_data_dir <-
  "/Users/aribarazzaq/Desktop/house-prices-advanced-regression-techniques"
data_dir <- Sys.getenv("HOUSE_PRICES_DATA_DIR", unset = default_data_dir)
if (!nzchar(data_dir)) {
  # A variable that is set but empty should not redirect reads to "/".
  data_dir <- default_data_dir
}

# read.csv() is kept on purpose: its default check.names = TRUE turns headers
# that are not valid R names into syntactic ones, which is presumably where
# the X2024.10.31 column name selected from zillow_data comes from.
zillow_data <- read.csv(
  file.path(data_dir, "Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
)
train_data <- read.csv(file.path(data_dir, "train.csv"))
test_data <- read.csv(file.path(data_dir, "test.csv"))

# Step 1: Transform Zillow dataset
# Keep the region name, the state name and the X2024.10.31 value column, and
# rename them to align with Kaggle's schema (RegionName -> Neighborhood,
# X2024.10.31 -> ZHVI_2024)
zillow_transformed <- zillow_data %>%
  select(RegionName, StateName, X2024.10.31) %>%
  rename(Neighborhood = RegionName, ZHVI_2024 = X2024.10.31)

# Step 2: Filter Zillow data for Ames, IA
# Identify data specific to "Ames, IA" for integration
ames_zillow <- zillow_transformed %>% filter(Neighborhood == "Ames, IA")

# Fail fast when the region is absent: indexing an empty column with [1]
# returns NA, which would otherwise be copied silently into every row that
# later receives this value.
if (nrow(ames_zillow) == 0) {
  stop(
    "No Zillow row found for region \"Ames, IA\"; cannot derive ZHVI_2024.",
    call. = FALSE
  )
}

# Step 3: Extract ZHVI value for Ames
# Assign the home value index for Ames to a variable for mapping; if several
# rows matched, the first one is used
ames_zhvi_value <- ames_zillow$ZHVI_2024[1]

# A matching row with no value is reported but not fatal, so the rest of the
# script can still run and show where the NA ends up.
if (is.na(ames_zhvi_value)) {
  warning(
    "ZHVI_2024 for \"Ames, IA\" is NA in the Zillow data.",
    call. = FALSE
  )
}

# Step 4: Add ZHVI to Kaggle Datasets
# Both Kaggle tables receive a ZHVI_2024 column holding the same Ames value
# on every row, regardless of the row's neighborhood
train_data <- mutate(train_data, ZHVI_2024 = ames_zhvi_value)
test_data <- mutate(test_data, ZHVI_2024 = ames_zhvi_value)

# Step 5: Verify the Updated Datasets
# Keep the leading rows (head()'s default of six) of the Neighborhood and
# ZHVI_2024 columns from each dataset, to confirm the new column was added
train_data_updated <- head(select(train_data, Neighborhood, ZHVI_2024))
test_data_updated <- head(select(test_data, Neighborhood, ZHVI_2024))

# Step 6: Exploratory Visualizations
# Two plots built from the training data: SalePrice against ZHVI_2024, and
# the distribution of SalePrice on its own
library(ggplot2)

# Scatter plot: ZHVI_2024 vs. SalePrice
scatter_plot <- ggplot(
  data = train_data,
  mapping = aes(x = ZHVI_2024, y = SalePrice)
) +
  geom_point(color = "blue", alpha = 0.5) +
  ggtitle("Scatter Plot: ZHVI_2024 vs SalePrice") +
  xlab("ZHVI (Zillow Home Value Index)") +
  ylab("Sale Price") +
  theme_minimal()

print(scatter_plot)

# Histogram of SalePrice (30 bins, black outlines on sky-blue bars)
histogram <- ggplot(data = train_data, mapping = aes(x = SalePrice)) +
  geom_histogram(bins = 30, color = "black", fill = "skyblue") +
  ggtitle("Distribution of Sale Prices") +
  xlab("Sale Price") +
  ylab("Frequency") +
  theme_minimal()

print(histogram)

# Step 7: Statistical Analysis
# Correlation between ZHVI_2024 and SalePrice
# ZHVI_2024 is a single value copied onto every training row, so it has zero
# variance and a Pearson correlation with SalePrice is undefined. Detect that
# case explicitly and say why the result is NA, instead of relying on cor()'s
# "the standard deviation is zero" warning (or its error when no complete
# pairs exist).
paired_rows <- complete.cases(train_data$ZHVI_2024, train_data$SalePrice)
zhvi_is_constant <- length(unique(train_data$ZHVI_2024[paired_rows])) < 2
if (zhvi_is_constant) {
  message(
    "ZHVI_2024 is constant across the training rows; ",
    "correlation with SalePrice is undefined."
  )
  correlation <- NA_real_
} else {
  correlation <- cor(
    train_data$ZHVI_2024, train_data$SalePrice,
    use = "complete.obs"
  )
}
## ZHVI_2024 is constant across the training rows; correlation with SalePrice is undefined.
print(paste("Correlation between ZHVI_2024 and SalePrice:", correlation))
## [1] "Correlation between ZHVI_2024 and SalePrice: NA"
# Linear model: SalePrice vs. ZHVI_2024
# With a constant predictor the slope cannot be estimated: lm() reports the
# ZHVI_2024 coefficient as NA ("not defined because of singularities") and
# the fit reduces to an intercept-only model, as the summary below shows.
lm_model <- lm(SalePrice ~ ZHVI_2024, data = train_data)
summary_lm <- summary(lm_model)
print(summary_lm)
## 
## Call:
## lm(formula = SalePrice ~ ZHVI_2024, data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -146021  -50946  -17921   33079  574079 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   180921       2079   87.02   <2e-16 ***
## ZHVI_2024         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79440 on 1459 degrees of freedom
# Step 8: Save Outputs for Reproducibility
# Write both augmented datasets to CSV files in the home directory, without
# a row-name column
write.csv(x = train_data, file = "~/updated_train_data.csv", row.names = FALSE)
write.csv(x = test_data, file = "~/updated_test_data.csv", row.names = FALSE)

# Display results
# Print each preview table under a heading naming the dataset it came from
print("Updated Train Data")
## [1] "Updated Train Data"
print(train_data_updated)
##   Neighborhood ZHVI_2024
## 1      CollgCr  240503.5
## 2      Veenker  240503.5
## 3      CollgCr  240503.5
## 4      Crawfor  240503.5
## 5      NoRidge  240503.5
## 6      Mitchel  240503.5
print("Updated Test Data")
## [1] "Updated Test Data"
print(test_data_updated)
##   Neighborhood ZHVI_2024
## 1        NAmes  240503.5
## 2        NAmes  240503.5
## 3      Gilbert  240503.5
## 4      Gilbert  240503.5
## 5      StoneBr  240503.5
## 6      Gilbert  240503.5