Introduction

This analysis aims to predict housing prices using various machine learning models and census data. We will compare the performance of Linear Regression, Random Forest, Mixed Effects, and Neural Network models to identify the best approach for predicting housing prices.

The dataset includes housing features such as the number of bedrooms, bathrooms, square footage, and location, as well as census data like median income, population, education levels, and employment rates by ZIP code.

Data Preprocessing

First, we read the housing dataset download from kaggle. We convert relevant columns to appropriate data types and create new features like house age, years since renovation, and living area to lot size ratio.

head(housing_data)

datatable(housing_data, options = list(pageLength = 5))
Warning: It seems your data is too big for client-side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/server.htmlWarning: It seems your data is too big for client-side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/server.html
skim(housing_data)
# Price Distribution
ggplot(housing_data, aes(x = price)) + 
  geom_histogram(bins = 30, fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Price Distribution")

# Bedrooms Distribution
ggplot(housing_data, aes(x = factor(bedrooms))) + 
  geom_bar(fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Bedrooms Distribution", x = "Bedrooms")

# Bathrooms Distribution
ggplot(housing_data, aes(x = factor(bathrooms))) + 
  geom_bar(fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Bathrooms Distribution", x = "Bathrooms")

# Living Area sqft Distribution
ggplot(housing_data, aes(x = sqft_living)) + 
  geom_histogram(bins = 30, fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Living Area sqft Distribution")

# Correlation Heatmap
numeric_data <- select(housing_data, where(is.numeric))
cor_matrix <- cor(numeric_data)
corrplot(cor_matrix, method = 'circle', type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

Incorperating Census Data

# Set the Census API key
census_api_key('bcbac889885cad20c1a787bc242d4aa4011251a2')

# Retrieve median income data by ZIP code
income_data <- get_acs(
  geography = "zcta",
  variables = "B19013_001",
  year = 2019,
  survey = "acs5"
)

f_income <- income_data %>%
  filter(GEOID %in% housing_data$ZIP) %>%
  rename(ZIP = GEOID, median_income = estimate)

# Retrieve population data for each ZIP code (ZCTA) from the latest ACS data
population_data <- get_acs(
  geography = "zcta",
  variables = "B01003_001",
  year = 2019,
  survey = "acs5"
)

f_pop <- population_data %>%
  filter(GEOID %in% housing_data$ZIP) %>%
  rename(ZIP = GEOID, population = estimate)

# Retrieve education data by ZIP code
education_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP02_0059P", # Percent high school graduate or higher
    "DP02_0065PE"  # Percent bachelor's degree or higher
  ),
  year = 2019,
  survey = "acs5"
)

education_data <- education_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_high_school_grad = mean(DP02_0059P, na.rm = TRUE),
    pct_bachelors_degree = mean(DP02_0065P, na.rm = TRUE)
  )

# Retrieve employment and occupation data by ZIP code
employment_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP03_0005PE", # Percent unemployed
    "DP03_0027PE", # Percent in management, business, science, and arts occupations
    "DP03_0028PE", # Percent in service occupations
    "DP03_0029PE", # Percent in sales and office occupations
    "DP03_0030PE"  # Percent in natural resources, construction, and maintenance occupations
  ),
  year = 2019,
  survey = "acs5"
)

employment_data <- employment_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_unemployed = mean(DP03_0005P, na.rm = TRUE),
    pct_mgmt_occupation = mean(DP03_0027P, na.rm = TRUE),
    pct_service_occupation = mean(DP03_0028P, na.rm = TRUE),
    pct_sales_office_occupation = mean(DP03_0029P, na.rm = TRUE),
    pct_construction_occupation = mean(DP03_0030P, na.rm = TRUE)
  )

# Retrieve housing characteristics data by ZIP code
housing_chars_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP04_0046PE", # Percent owner-occupied housing units
    "DP04_0039E", # Median year built for housing structures
    "DP04_0093PE"  # Percent of housing units with central air conditioning
  ),
  year = 2019,
  survey = "acs5"
)

housing_chars_data <- housing_chars_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_owner_occupied = mean(DP04_0046P, na.rm = TRUE),
    median_year_built = mean(DP04_0039, na.rm = TRUE),
    pct_central_air = mean(DP04_0093P, na.rm = TRUE)
  )

# Retrieve commuting patterns data by ZIP code
commuting_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP03_0025E", # Mean travel time to work (minutes)
    "DP03_0021PE", # Percent using public transportation (excluding taxicab)
    "DP03_0024PE"  # Percent working from home
  ),
  year = 2019,
  survey = "acs5"
)

commuting_data <- commuting_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    mean_travel_time = mean(DP03_0025, na.rm = TRUE),
    pct_public_transport = mean(DP03_0021P, na.rm = TRUE),
    pct_work_from_home = mean(DP03_0024P, na.rm = TRUE)
  )

# Retrieve household characteristics data by ZIP code
household_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP02_0016E", # Average household size
    "DP02_0007PE", # Percent of households with own children under 18 years
    "DP02_0009PE", # Percent of single-parent households
    "DP05_0018E"  # Median age
  ),
  year = 2019,
  survey = "acs5"
)

household_data <- household_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    avg_household_size = mean(DP02_0016, na.rm = TRUE),
    pct_households_with_children = mean(DP02_0007P, na.rm = TRUE),
    pct_single_parent_households = mean(DP02_0009P, na.rm = TRUE),
    median_age = mean(DP05_0018, na.rm = TRUE)
  )

# Merge all the Census data with the housing data
housing_data_with_census <- housing_data %>%
  left_join(f_income, by = "ZIP") %>%
  left_join(f_pop, by = "ZIP") %>%
  left_join(education_data, by = c("ZIP" = "GEOID")) %>%
  left_join(employment_data, by = c("ZIP" = "GEOID")) %>%
  left_join(housing_chars_data, by = c("ZIP" = "GEOID")) %>%
  left_join(commuting_data, by = c("ZIP" = "GEOID")) %>%
  left_join(household_data, by = c("ZIP" = "GEOID"))%>%
    select(
    date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition,
    sqft_above, sqft_basement, yr_built, yr_renovated, street, city, country, ZIP,
    house_age, years_since_renovation, total_rooms, living_lot_ratio, has_basement, renovated,
    floor_area_ratio, outdoor_space, season,
    median_income, population,
    pct_high_school_grad, pct_bachelors_degree,
    pct_unemployed, pct_mgmt_occupation, pct_service_occupation, pct_sales_office_occupation, pct_construction_occupation,
    pct_owner_occupied, median_year_built, pct_central_air,
    mean_travel_time, pct_public_transport, pct_work_from_home,
    avg_household_size, pct_households_with_children, pct_single_parent_households, median_age
  )%>%
  #write_csv('Data_with_census.csv')

Preprocessing

housing_data_with_census <- read_csv('Data_with_census.csv')
Rows: 4412 Columns: 47── Column specification ───────────────────────────────────────────
Delimiter: ","
chr   (5): street, city, country, State, season
dbl  (41): price, bedrooms, bathrooms, sqft_living, sqft_lot, f...
date  (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

We decided to perform a log transformation on the price for make the distribution more normal

print(vif_values)
                    bedrooms                    bathrooms 
                    1.708079                     2.840310 
                 sqft_living                       floors 
                    3.061817                     2.763125 
                   condition             living_lot_ratio 
                    1.271854                     2.283708 
      pct_service_occupation  pct_sales_office_occupation 
                    3.574640                     2.379895 
 pct_construction_occupation            median_year_built 
                    4.023819                     4.305020 
            mean_travel_time           pct_work_from_home 
                    2.076649                     3.530957 
          avg_household_size pct_households_with_children 
                    4.812068                     1.362034 
pct_single_parent_households                   median_age 
                    2.331042                     3.079090 
               waterfront_X1                      view_X1 
                    1.326975                     1.034925 
                     view_X2                      view_X3 
                    1.061864                     1.082554 
                     view_X4              has_basement_X1 
                    1.303335                     1.705690 
                renovated_X1                season_Summer 
                    1.235108                     1.005978 

VIF is bellow 5 for all predictors so we have addressed colinearity problems.

library(multilevelmod)
set.seed(333)

## Create cross-validation folds for each model type
cv_folds_lm_rf <- vfold_cv(lm_rf_data, v = 10)
cv_folds_me <- vfold_cv(me_data_selected, v = 10)
cv_folds_nn <- vfold_cv(housing_data_for_model, v = 10)

# Create recipes for models
rec_lm_rf <- recipe(log_price ~ ., data = lm_rf_data)
rec_me <- recipe(log_price ~ ., data = me_data_selected)
rec_nn <- recipe(log_price ~ ., data = housing_data_for_model) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes())

# Define the models
lm_model <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")
rf_model <- rand_forest(trees = 1000) %>% set_mode("regression") %>% set_engine("ranger", importance = "impurity")
me_model <- linear_reg() %>% set_engine("lmer") %>% set_mode("regression")
nn_model <- mlp(hidden_units = 10) %>% set_engine("nnet", linout = TRUE, trace = T) %>% set_mode("regression")

# Create workflows
lm_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(lm_model)
rf_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(rf_model)
me_workflow <- workflow() %>% add_recipe(rec_me) %>% add_model(me_model, formula = log_price ~ . + (1 | ZIP))
nn_workflow <- workflow() %>% add_recipe(rec_nn) %>% add_model(nn_model)

# Perform cross-validation
lm_cv_results <- fit_resamples(lm_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
rf_cv_results <- fit_resamples(rf_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
me_cv_results <- fit_resamples(me_workflow, resamples = cv_folds_me, metrics = metric_set(rmse, mae, rsq))
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
→ A | warning: unable to evaluate scaled gradient, Model failed to converge: degenerate  Hessian with 1 negative eigenvalues

There were issues with some computations   A: x1

There were issues with some computations   A: x1
nn_cv_results <- fit_resamples(nn_workflow, resamples = cv_folds_nn, metrics = metric_set(rmse, mae, rsq))

# Summarize and compare the cross-validation results
cv_results <- bind_rows(
  collect_metrics(lm_cv_results) %>% mutate(model = "Linear Regression"),
  collect_metrics(rf_cv_results) %>% mutate(model = "Random Forest"),
  collect_metrics(me_cv_results) %>% mutate(model = "Mixed Effects"),
  collect_metrics(nn_cv_results) %>% mutate(model = "Neural Network")
)

cv_summary <- cv_results %>%
  group_by(model, .metric) %>%
  summarize(
    mean_value = mean(mean, na.rm = TRUE),
    std_error = mean(std_err, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  pivot_wider(names_from = .metric, values_from = c(mean_value, std_error))

print(cv_summary)
# Split the data into training and testing sets for each model type
data_split_lm_rf <- initial_split(lm_rf_data, prop = 0.8)
train_data_lm_rf <- training(data_split_lm_rf)
test_data_lm_rf <- testing(data_split_lm_rf)

data_split_me <- initial_split(me_data_selected, prop = 0.8)
train_data_me <- training(data_split_me)
test_data_me <- testing(data_split_me)

data_split_nn <- initial_split(housing_data_for_model, prop = 0.8)
train_data_nn <- training(data_split_nn)
test_data_nn <- testing(data_split_nn)

# Fit and evaluate each model using last_fit()
lm_last_fit <- last_fit(lm_workflow, data_split_lm_rf)
rf_last_fit <- last_fit(rf_workflow, data_split_lm_rf)
me_last_fit <- last_fit(me_workflow, data_split_me)
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
boundary (singular) fit: see help('isSingular')
nn_last_fit <- last_fit(nn_workflow, data_split_nn)

# Extract predictions and metrics for each model
lm_results <- lm_last_fit %>% collect_predictions() %>% mutate(model = "Linear Regression")
rf_results <- rf_last_fit %>% collect_predictions() %>% mutate(model = "Random Forest")
me_results <- me_last_fit %>% collect_predictions() %>% mutate(model = "Mixed Effects")
nn_results <- nn_last_fit %>% collect_predictions() %>% mutate(model = "Neural Network")

# Combine the results
results <- bind_rows(lm_results, rf_results, me_results, nn_results)

# Plot predicted vs. actual for each model with R^2
results %>%
  ggplot(aes(x = log_price, y = .pred, color = model)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  facet_wrap(~ model, scales = "free") +
  labs(
    title = "Predicted vs. Actual House Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  geom_text(
    data = results %>%
      group_by(model) %>%
      summarise(rsq = cor(log_price, .pred)^2),
    aes(x = Inf, y = -Inf, label = sprintf("R^2 = %.3f", rsq)),
    hjust = 1.1,
    vjust = -1.1
  )

The model with the best predictive accuracy was the random forest model, followed by the linear regression model, neural network model, and mixed effects model. Issues with model fitting are clear with the mixed effects model due to the strong left skew in the error.

# Extract the trained Random Forest model from the last_fit object
rf_model <- rf_last_fit %>% 
  extract_fit_engine()

# Extract variable importance from the Random Forest model
rf_importance <- rf_model %>% 
  ranger::importance()

# Plot variable importance
ggplot(data.frame(variable = names(rf_importance), importance = rf_importance), 
       aes(x = reorder(variable, importance), y = importance, fill = importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Variable", y = "Importance", title = "Random Forest Variable Importance") +
  theme_minimal()

SQFT of living space followed by occupation prevalence appear to be most predictive of housing price. Things like view, and commuting time are suprisingly much lower by comparison!

Conversly in the linear model, having an outswtanding view seems to be the most important predictor. Very different apporaches!

---
title: "EDA Housing"
output:
  html_notebook: default
  pdf_document: default
---

```{r, echo=FALSE, include=FALSE}
library(tidyverse, warn.conflicts = F)
library(GGally)
library(corrplot)
library(lme4)
library(lubridate)  # Load lubridate for date manipulation
library(tidycensus)  # Load tidycensus for census data retrieval
library(car)  # Load car for VIF calculation
library(caret)  # Load caret for finding high correlations
library(MASS)  # Load MASS for stepAIC function
library(tidymodels, warn.conflicts = F)  # Load tidymodels for modeling
library(ranger)  # Load ranger for Random Forest
library(skimr)
library(DT)

tidymodels_prefer()
```

## Introduction

This analysis aims to predict housing prices using various machine learning models and census data. We will compare the performance of Linear Regression, Random Forest, Mixed Effects, and Neural Network models to identify the best approach for predicting housing prices.

The dataset includes housing features such as the number of bedrooms, bathrooms, square footage, and location, as well as census data like median income, population, education levels, and employment rates by ZIP code.

## Data Preprocessing

First, we read the housing dataset download from [kaggle](https://www.kaggle.com/datasets/shree1992/housedata/data "Dataset Download"). We convert relevant columns to appropriate data types and create new features like house age, years since renovation, and living area to lot size ratio.

head(housing_data)

```{r}
housing_data <- read_csv('data.csv')

# Convert relevant columns to appropriate data types
housing_data$view <- as.factor(housing_data$view)
housing_data$waterfront <- as.factor(housing_data$waterfront)
housing_data$condition <- as.factor(housing_data$condition)
housing_data$date <- as.Date(housing_data$date)
housing_data$price <- as.numeric(housing_data$price)
housing_data$bedrooms <- as.numeric(housing_data$bedrooms)
housing_data$bathrooms <- as.numeric(housing_data$bathrooms)
housing_data$sqft_living <- as.numeric(housing_data$sqft_living)
housing_data$sqft_lot <- as.numeric(housing_data$sqft_lot)
housing_data$floors <- as.numeric(housing_data$floors)
housing_data$sqft_above <- as.numeric(housing_data$sqft_above)
housing_data$sqft_basement <- as.numeric(housing_data$sqft_basement)
housing_data$yr_built <- as.integer(housing_data$yr_built)
housing_data$yr_renovated <- as.integer(housing_data$yr_renovated)
housing_data$street <- as.character(housing_data$street)
housing_data$city <- as.character(housing_data$city)
housing_data$statezip <- as.character(housing_data$statezip)
housing_data$country <- as.character(housing_data$country)

summary(housing_data)

housing_data <- housing_data %>%
  mutate(ZIP = str_extract(statezip, "\\d{5}"))



housing_data <- housing_data %>%
  mutate(
    house_age = as.integer(format(Sys.Date(), "%Y")) - yr_built,
    years_since_renovation = if_else(yr_renovated > 0, as.integer(format(Sys.Date(), "%Y")) - yr_renovated, house_age),
    total_rooms = bedrooms + bathrooms,
    living_lot_ratio = sqft_living / sqft_lot,
    has_basement = as.integer(sqft_basement > 0),
    renovated = as.integer(yr_renovated > 0),
    floor_area_ratio = sqft_above / sqft_lot,
    outdoor_space = sqft_lot - sqft_living,
    season = case_when(
      month(date) %in% 3:5 ~ "Spring",
      month(date) %in% 6:8 ~ "Summer",
      month(date) %in% 9:11 ~ "Autumn",
      TRUE ~ "Winter"
    )
  )
datatable(housing_data, options = list(pageLength = 5))
```

-   price: Ranges from \$0 (which may indicate missing or placeholder values) to \$26,590,000 with a mean of \$551,963.

-   bedrooms: Ranges from 0 to 9, with an average of about 3.4 bedrooms per house.

-   bathrooms: Ranges from 0 to 8, with a mean of approximately 2.16 bathrooms per house.

-   sqft_living: Living area square footage ranges from 370 to 13,540 sqft, with a mean of 2,139 sqft.

-   sqft_lot: Lot size square footage varies widely from 638 to 1,074,218 sqft, indicating a mix of urban to very large rural properties.

-   floors, waterfront, view, condition: These features are categorical or binary, with floors ranging from 1 to 3.5, indicating multi-level houses are included. Waterfront has a binary indicator, with very few waterfront properties present. View and condition are ordinal, with their own range of values.

```{r}
skim(housing_data)
# Price Distribution
ggplot(housing_data, aes(x = price)) + 
  geom_histogram(bins = 30, fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Price Distribution")

# Bedrooms Distribution
ggplot(housing_data, aes(x = factor(bedrooms))) + 
  geom_bar(fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Bedrooms Distribution", x = "Bedrooms")

# Bathrooms Distribution
ggplot(housing_data, aes(x = factor(bathrooms))) + 
  geom_bar(fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Bathrooms Distribution", x = "Bathrooms")

# Living Area sqft Distribution
ggplot(housing_data, aes(x = sqft_living)) + 
  geom_histogram(bins = 30, fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Living Area sqft Distribution")

# Correlation Heatmap
numeric_data <- select(housing_data, where(is.numeric))
cor_matrix <- cor(numeric_data)
corrplot(cor_matrix, method = 'circle', type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)
```

## Incorperating Census Data
```{r}
# Set the Census API key
census_api_key('bcbac889885cad20c1a787bc242d4aa4011251a2')

# Retrieve median income data by ZIP code
income_data <- get_acs(
  geography = "zcta",
  variables = "B19013_001",
  year = 2019,
  survey = "acs5"
)

f_income <- income_data %>%
  filter(GEOID %in% housing_data$ZIP) %>%
  rename(ZIP = GEOID, median_income = estimate)

# Retrieve population data for each ZIP code (ZCTA) from the latest ACS data
population_data <- get_acs(
  geography = "zcta",
  variables = "B01003_001",
  year = 2019,
  survey = "acs5"
)

f_pop <- population_data %>%
  filter(GEOID %in% housing_data$ZIP) %>%
  rename(ZIP = GEOID, population = estimate)

# Retrieve education data by ZIP code
education_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP02_0059P", # Percent high school graduate or higher
    "DP02_0065PE"  # Percent bachelor's degree or higher
  ),
  year = 2019,
  survey = "acs5"
)

education_data <- education_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_high_school_grad = mean(DP02_0059P, na.rm = TRUE),
    pct_bachelors_degree = mean(DP02_0065P, na.rm = TRUE)
  )

# Retrieve employment and occupation data by ZIP code
employment_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP03_0005PE", # Percent unemployed
    "DP03_0027PE", # Percent in management, business, science, and arts occupations
    "DP03_0028PE", # Percent in service occupations
    "DP03_0029PE", # Percent in sales and office occupations
    "DP03_0030PE"  # Percent in natural resources, construction, and maintenance occupations
  ),
  year = 2019,
  survey = "acs5"
)

employment_data <- employment_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_unemployed = mean(DP03_0005P, na.rm = TRUE),
    pct_mgmt_occupation = mean(DP03_0027P, na.rm = TRUE),
    pct_service_occupation = mean(DP03_0028P, na.rm = TRUE),
    pct_sales_office_occupation = mean(DP03_0029P, na.rm = TRUE),
    pct_construction_occupation = mean(DP03_0030P, na.rm = TRUE)
  )

# Retrieve housing characteristics data by ZIP code
housing_chars_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP04_0046PE", # Percent owner-occupied housing units
    "DP04_0039E", # Median year built for housing structures
    "DP04_0093PE"  # Percent of housing units with central air conditioning
  ),
  year = 2019,
  survey = "acs5"
)

housing_chars_data <- housing_chars_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_owner_occupied = mean(DP04_0046P, na.rm = TRUE),
    median_year_built = mean(DP04_0039, na.rm = TRUE),
    pct_central_air = mean(DP04_0093P, na.rm = TRUE)
  )

# Retrieve commuting patterns data by ZIP code
commuting_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP03_0025E", # Mean travel time to work (minutes)
    "DP03_0021PE", # Percent using public transportation (excluding taxicab)
    "DP03_0024PE"  # Percent working from home
  ),
  year = 2019,
  survey = "acs5"
)

commuting_data <- commuting_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    mean_travel_time = mean(DP03_0025, na.rm = TRUE),
    pct_public_transport = mean(DP03_0021P, na.rm = TRUE),
    pct_work_from_home = mean(DP03_0024P, na.rm = TRUE)
  )

# Retrieve household characteristics data by ZIP code
household_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP02_0016E", # Average household size
    "DP02_0007PE", # Percent of households with own children under 18 years
    "DP02_0009PE", # Percent of single-parent households
    "DP05_0018E"  # Median age
  ),
  year = 2019,
  survey = "acs5"
)

household_data <- household_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    avg_household_size = mean(DP02_0016, na.rm = TRUE),
    pct_households_with_children = mean(DP02_0007P, na.rm = TRUE),
    pct_single_parent_households = mean(DP02_0009P, na.rm = TRUE),
    median_age = mean(DP05_0018, na.rm = TRUE)
  )

# Merge all the Census data with the housing data
housing_data_with_census <- housing_data %>%
  left_join(f_income, by = "ZIP") %>%
  left_join(f_pop, by = "ZIP") %>%
  left_join(education_data, by = c("ZIP" = "GEOID")) %>%
  left_join(employment_data, by = c("ZIP" = "GEOID")) %>%
  left_join(housing_chars_data, by = c("ZIP" = "GEOID")) %>%
  left_join(commuting_data, by = c("ZIP" = "GEOID")) %>%
  left_join(household_data, by = c("ZIP" = "GEOID"))%>%
    select(
    date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition,
    sqft_above, sqft_basement, yr_built, yr_renovated, street, city, country, ZIP,
    house_age, years_since_renovation, total_rooms, living_lot_ratio, has_basement, renovated,
    floor_area_ratio, outdoor_space, season,
    median_income, population,
    pct_high_school_grad, pct_bachelors_degree,
    pct_unemployed, pct_mgmt_occupation, pct_service_occupation, pct_sales_office_occupation, pct_construction_occupation,
    pct_owner_occupied, median_year_built, pct_central_air,
    mean_travel_time, pct_public_transport, pct_work_from_home,
    avg_household_size, pct_households_with_children, pct_single_parent_households, median_age
  )%>%
  #write_csv('Data_with_census.csv')



```

# Preprocessing
```{r}
housing_data_with_census <- read_csv('Data_with_census.csv')

# Check for multicollinearity using VIF
housing_data_final_num <- housing_data_for_model %>%
  select_if(is.numeric)

vif_model <- lm(price ~ ., data = housing_data_final_num)
vif_values <- vif(vif_model)
print(vif_values)

# Correlation matrix
cor_matrix <- cor(housing_data_final_num)
#write.csv(cor_matrix, file = "correlation_matrix.csv", row.names = TRUE)
corrplot::corrplot(cor_matrix, method = "circle")

# Find highly correlated variables
high_correlation <- findCorrelation(cor_matrix, cutoff = 0.75)
high_correlation_names <- names(housing_data_final_num)[high_correlation]
print(high_correlation_names)

# Remove highly correlated variables (one from each pair)
housing_data_reduced <- housing_data_final_num[, -high_correlation]
```

```{r}
# Prepare data for modeling
housing_data_for_model <- housing_data_with_census %>%
  select(
    -yr_built, -yr_renovated, -floor_area_ratio, -country,
    -State, -street, -date, -total_rooms, -outdoor_space,
    -sqft_basement, -sqft_above, -pct_mgmt_occupation, -pct_high_school_grad, -pct_public_transport, -median_income, -pct_owner_occupied, -pct_central_air, -pct_bachelors_degree, -city, -country)

# Transform price to log scale
housing_data_for_model <- housing_data_for_model %>%
  filter(price != 0, !is.na(mean_travel_time))%>%
  mutate(log_price = log(price))

ggplot(housing_data_for_model, aes(x=log_price)) +
  geom_histogram(bins=30, fill="lightblue", color='black', alpha=0.7) +
  ggtitle("Histogram of Log-transformed Prices") +
  xlab("Log of Price") +
  ylab("Frequency") # much better shape than earlier

```
We decided to perform a log transformation on the price for make the distribution more normal


```{r}
# Basic preprocessing applicable to all models
housing_data_for_model <- housing_data_with_census %>%
  select(-yr_built, -yr_renovated, -floor_area_ratio, -country, 
         -State, -street, -date, -total_rooms, -outdoor_space,
         -sqft_basement, -sqft_above, -pct_mgmt_occupation, 
         -pct_high_school_grad, -pct_public_transport, -median_income, 
         -pct_owner_occupied, -pct_central_air, -pct_bachelors_degree, -city, -country) %>%
  filter(price != 0, !is.na(mean_travel_time)) %>%
  mutate(log_price = log(price)) %>%
  select(-price)%>%
  mutate(
    season = as.factor(season),
    waterfront = as.factor(waterfront),
    view = as.factor(view),
    has_basement = as.factor(has_basement),
    renovated = as.factor(renovated)
  )
# Separate data for mixed-effects model to include ZIP
me_data <- housing_data_for_model %>%
  mutate(ZIP = as.factor(ZIP))

# Data for other models where ZIP is not required
other_models_data <- select(housing_data_for_model, -ZIP)

# Create recipes for other models and mixed-effects model
rec_other <- recipe(log_price ~ ., data = other_models_data) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes())

rec_me <- recipe(log_price ~ ., data = me_data) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes(), -ZIP)

# Preprocess the data for stepwise AIC
preprocessed_data <- prep(rec_other) %>% bake(new_data = NULL)
preprocessed_me_data <- prep(rec_me) %>% bake(new_data = NULL)

# Perform stepwise AIC feature selection
full_model <- lm(log_price ~ ., data = preprocessed_data)
reduced_model <- stepAIC(full_model, direction = "both", trace = T)

# Print the selected variables
selected_vars <- names(reduced_model$coefficients)[-1]
print(paste("Selected variables:", paste(selected_vars, collapse = ", ")))

# Update the data for models (except neural network) with the selected variables
lm_rf_data <- preprocessed_data %>%
  select(log_price, all_of(selected_vars))

me_data_selected <- preprocessed_me_data %>%
  select(log_price, ZIP, all_of(selected_vars))

# Check for multicollinearity using VIF
vif_model <- lm(log_price ~ ., data = lm_rf_data)
vif_values <- vif(vif_model)
print(vif_values)
```
VIF is bellow 5 for all predictors so we have addressed colinearity problems.

```{r}
library(multilevelmod)
set.seed(333)

## Create cross-validation folds for each model type
cv_folds_lm_rf <- vfold_cv(lm_rf_data, v = 10)
cv_folds_me <- vfold_cv(me_data_selected, v = 10)
cv_folds_nn <- vfold_cv(housing_data_for_model, v = 10)

# Create recipes for models
rec_lm_rf <- recipe(log_price ~ ., data = lm_rf_data)
rec_me <- recipe(log_price ~ ., data = me_data_selected)
rec_nn <- recipe(log_price ~ ., data = housing_data_for_model) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes())

# Define the models
lm_model <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")
rf_model <- rand_forest(trees = 1000) %>% set_mode("regression") %>% set_engine("ranger", importance = "impurity")
me_model <- linear_reg() %>% set_engine("lmer") %>% set_mode("regression")
nn_model <- mlp(hidden_units = 10) %>% set_engine("nnet", linout = TRUE, trace = T) %>% set_mode("regression")

# Create workflows
lm_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(lm_model)
rf_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(rf_model)
me_workflow <- workflow() %>% add_recipe(rec_me) %>% add_model(me_model, formula = log_price ~ . + (1 | ZIP))
nn_workflow <- workflow() %>% add_recipe(rec_nn) %>% add_model(nn_model)

# Perform cross-validation
lm_cv_results <- fit_resamples(lm_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
rf_cv_results <- fit_resamples(rf_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
me_cv_results <- fit_resamples(me_workflow, resamples = cv_folds_me, metrics = metric_set(rmse, mae, rsq))
nn_cv_results <- fit_resamples(nn_workflow, resamples = cv_folds_nn, metrics = metric_set(rmse, mae, rsq))

# Summarize and compare the cross-validation results
cv_results <- bind_rows(
  collect_metrics(lm_cv_results) %>% mutate(model = "Linear Regression"),
  collect_metrics(rf_cv_results) %>% mutate(model = "Random Forest"),
  collect_metrics(me_cv_results) %>% mutate(model = "Mixed Effects"),
  collect_metrics(nn_cv_results) %>% mutate(model = "Neural Network")
)

cv_summary <- cv_results %>%
  group_by(model, .metric) %>%
  summarize(
    mean_value = mean(mean, na.rm = TRUE),
    std_error = mean(std_err, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  pivot_wider(names_from = .metric, values_from = c(mean_value, std_error))

print(cv_summary)
# Split the data into training and testing sets for each model type
data_split_lm_rf <- initial_split(lm_rf_data, prop = 0.8)
train_data_lm_rf <- training(data_split_lm_rf)
test_data_lm_rf <- testing(data_split_lm_rf)

data_split_me <- initial_split(me_data_selected, prop = 0.8)
train_data_me <- training(data_split_me)
test_data_me <- testing(data_split_me)

data_split_nn <- initial_split(housing_data_for_model, prop = 0.8)
train_data_nn <- training(data_split_nn)
test_data_nn <- testing(data_split_nn)

# Fit and evaluate each model using last_fit()
lm_last_fit <- last_fit(lm_workflow, data_split_lm_rf)
rf_last_fit <- last_fit(rf_workflow, data_split_lm_rf)
me_last_fit <- last_fit(me_workflow, data_split_me)
nn_last_fit <- last_fit(nn_workflow, data_split_nn)

# Extract predictions and metrics for each model
lm_results <- lm_last_fit %>% collect_predictions() %>% mutate(model = "Linear Regression")
rf_results <- rf_last_fit %>% collect_predictions() %>% mutate(model = "Random Forest")
me_results <- me_last_fit %>% collect_predictions() %>% mutate(model = "Mixed Effects")
nn_results <- nn_last_fit %>% collect_predictions() %>% mutate(model = "Neural Network")

# Combine the results
results <- bind_rows(lm_results, rf_results, me_results, nn_results)

# Plot predicted vs. actual for each model with R^2
results %>%
  ggplot(aes(x = log_price, y = .pred, color = model)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  facet_wrap(~ model, scales = "free") +
  labs(
    title = "Predicted vs. Actual House Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  geom_text(
    data = results %>%
      group_by(model) %>%
      summarise(rsq = cor(log_price, .pred)^2),
    aes(x = Inf, y = -Inf, label = sprintf("R^2 = %.3f", rsq)),
    hjust = 1.1,
    vjust = -1.1
  )
```
The model with the best predictive accuracy was the random forest model, followed by the linear regression model, neural network model, and mixed effects model. Issues with model fitting are clear with the mixed effects model due to the strong left skew in the error.

```{r}
# Extract the trained Random Forest model from the last_fit object
rf_model <- rf_last_fit %>% 
  extract_fit_engine()

# Extract variable importance from the Random Forest model
rf_importance <- rf_model %>% 
  ranger::importance()

# Plot variable importance
ggplot(data.frame(variable = names(rf_importance), importance = rf_importance), 
       aes(x = reorder(variable, importance), y = importance, fill = importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Variable", y = "Importance", title = "Random Forest Variable Importance") +
  theme_minimal()

```
SQFT of living space followed by occupation prevalence appear to be most predictive of housing price. Things like view, and commuting time are suprisingly much lower by comparison!
```{r}
# Extract the trained models from the last_fit objects
lm_model <- lm_last_fit %>% extract_fit_engine()
rf_model <- rf_last_fit %>% extract_fit_engine()
me_model <- me_last_fit %>% extract_fit_engine()
nn_model <- nn_last_fit %>% extract_fit_engine()

# Extract coefficients from the Linear Regression model
lm_coef <- coef(lm_model)
lm_coef_df <- data.frame(variable = names(lm_coef), coefficient = lm_coef)

# Plot coefficients for the Linear Regression model
ggplot(lm_coef_df, aes(x = reorder(variable, coefficient), y = coefficient, fill = coefficient)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Variable", y = "Coefficient", title = "Linear Regression Coefficients") +
  theme_minimal()
```
Conversly in the linear model, having an outswtanding view seems to be the most important predictor. Very different apporaches!

