Introduction
This analysis aims to predict housing prices using various machine
learning models and census data. We will compare the performance of
Linear Regression, Random Forest, Mixed Effects, and Neural Network
models to identify the best approach for predicting housing prices.
The dataset includes housing features such as the number of bedrooms,
bathrooms, square footage, and location, as well as census data like
median income, population, education levels, and employment rates by ZIP
code.
Data Preprocessing
First, we read the housing dataset download from kaggle. We convert relevant columns to
appropriate data types and create new features like house age, years
since renovation, and living area to lot size ratio.
head(housing_data)
datatable(housing_data, options = list(pageLength = 5))
Warning: It seems your data is too big for client-side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/server.htmlWarning: It seems your data is too big for client-side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/server.html
price: Ranges from $0 (which may indicate missing or placeholder
values) to $26,590,000 with a mean of $551,963.
bedrooms: Ranges from 0 to 9, with an average of about 3.4
bedrooms per house.
bathrooms: Ranges from 0 to 8, with a mean of approximately 2.16
bathrooms per house.
sqft_living: Living area square footage ranges from 370 to 13,540
sqft, with a mean of 2,139 sqft.
sqft_lot: Lot size square footage varies widely from 638 to
1,074,218 sqft, indicating a mix of urban to very large rural
properties.
floors, waterfront, view, condition: These features are
categorical or binary, with floors ranging from 1 to 3.5, indicating
multi-level houses are included. Waterfront has a binary indicator, with
very few waterfront properties present. View and condition are ordinal,
with their own range of values.
skim(housing_data)
# Price Distribution
ggplot(housing_data, aes(x = price)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
theme_minimal() +
labs(title = "Price Distribution")
# Bedrooms Distribution
ggplot(housing_data, aes(x = factor(bedrooms))) +
geom_bar(fill = "lightblue", color = "black") +
theme_minimal() +
labs(title = "Bedrooms Distribution", x = "Bedrooms")
# Bathrooms Distribution
ggplot(housing_data, aes(x = factor(bathrooms))) +
geom_bar(fill = "lightblue", color = "black") +
theme_minimal() +
labs(title = "Bathrooms Distribution", x = "Bathrooms")
# Living Area sqft Distribution
ggplot(housing_data, aes(x = sqft_living)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
theme_minimal() +
labs(title = "Living Area sqft Distribution")
# Correlation Heatmap
numeric_data <- select(housing_data, where(is.numeric))
cor_matrix <- cor(numeric_data)
corrplot(cor_matrix, method = 'circle', type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
Incorperating Census Data
# Set the Census API key
census_api_key('bcbac889885cad20c1a787bc242d4aa4011251a2')
# Retrieve median income data by ZIP code
income_data <- get_acs(
geography = "zcta",
variables = "B19013_001",
year = 2019,
survey = "acs5"
)
f_income <- income_data %>%
filter(GEOID %in% housing_data$ZIP) %>%
rename(ZIP = GEOID, median_income = estimate)
# Retrieve population data for each ZIP code (ZCTA) from the latest ACS data
population_data <- get_acs(
geography = "zcta",
variables = "B01003_001",
year = 2019,
survey = "acs5"
)
f_pop <- population_data %>%
filter(GEOID %in% housing_data$ZIP) %>%
rename(ZIP = GEOID, population = estimate)
# Retrieve education data by ZIP code
education_data <- get_acs(
geography = "zcta",
variables = c(
"DP02_0059P", # Percent high school graduate or higher
"DP02_0065PE" # Percent bachelor's degree or higher
),
year = 2019,
survey = "acs5"
)
education_data <- education_data %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
group_by(GEOID) %>%
summarize(
pct_high_school_grad = mean(DP02_0059P, na.rm = TRUE),
pct_bachelors_degree = mean(DP02_0065P, na.rm = TRUE)
)
# Retrieve employment and occupation data by ZIP code
employment_data <- get_acs(
geography = "zcta",
variables = c(
"DP03_0005PE", # Percent unemployed
"DP03_0027PE", # Percent in management, business, science, and arts occupations
"DP03_0028PE", # Percent in service occupations
"DP03_0029PE", # Percent in sales and office occupations
"DP03_0030PE" # Percent in natural resources, construction, and maintenance occupations
),
year = 2019,
survey = "acs5"
)
employment_data <- employment_data %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
group_by(GEOID) %>%
summarize(
pct_unemployed = mean(DP03_0005P, na.rm = TRUE),
pct_mgmt_occupation = mean(DP03_0027P, na.rm = TRUE),
pct_service_occupation = mean(DP03_0028P, na.rm = TRUE),
pct_sales_office_occupation = mean(DP03_0029P, na.rm = TRUE),
pct_construction_occupation = mean(DP03_0030P, na.rm = TRUE)
)
# Retrieve housing characteristics data by ZIP code
housing_chars_data <- get_acs(
geography = "zcta",
variables = c(
"DP04_0046PE", # Percent owner-occupied housing units
"DP04_0039E", # Median year built for housing structures
"DP04_0093PE" # Percent of housing units with central air conditioning
),
year = 2019,
survey = "acs5"
)
housing_chars_data <- housing_chars_data %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
group_by(GEOID) %>%
summarize(
pct_owner_occupied = mean(DP04_0046P, na.rm = TRUE),
median_year_built = mean(DP04_0039, na.rm = TRUE),
pct_central_air = mean(DP04_0093P, na.rm = TRUE)
)
# Retrieve commuting patterns data by ZIP code
commuting_data <- get_acs(
geography = "zcta",
variables = c(
"DP03_0025E", # Mean travel time to work (minutes)
"DP03_0021PE", # Percent using public transportation (excluding taxicab)
"DP03_0024PE" # Percent working from home
),
year = 2019,
survey = "acs5"
)
commuting_data <- commuting_data %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
group_by(GEOID) %>%
summarize(
mean_travel_time = mean(DP03_0025, na.rm = TRUE),
pct_public_transport = mean(DP03_0021P, na.rm = TRUE),
pct_work_from_home = mean(DP03_0024P, na.rm = TRUE)
)
# Retrieve household characteristics data by ZIP code
household_data <- get_acs(
geography = "zcta",
variables = c(
"DP02_0016E", # Average household size
"DP02_0007PE", # Percent of households with own children under 18 years
"DP02_0009PE", # Percent of single-parent households
"DP05_0018E" # Median age
),
year = 2019,
survey = "acs5"
)
household_data <- household_data %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
group_by(GEOID) %>%
summarize(
avg_household_size = mean(DP02_0016, na.rm = TRUE),
pct_households_with_children = mean(DP02_0007P, na.rm = TRUE),
pct_single_parent_households = mean(DP02_0009P, na.rm = TRUE),
median_age = mean(DP05_0018, na.rm = TRUE)
)
# Merge all the Census data with the housing data
housing_data_with_census <- housing_data %>%
left_join(f_income, by = "ZIP") %>%
left_join(f_pop, by = "ZIP") %>%
left_join(education_data, by = c("ZIP" = "GEOID")) %>%
left_join(employment_data, by = c("ZIP" = "GEOID")) %>%
left_join(housing_chars_data, by = c("ZIP" = "GEOID")) %>%
left_join(commuting_data, by = c("ZIP" = "GEOID")) %>%
left_join(household_data, by = c("ZIP" = "GEOID"))%>%
select(
date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition,
sqft_above, sqft_basement, yr_built, yr_renovated, street, city, country, ZIP,
house_age, years_since_renovation, total_rooms, living_lot_ratio, has_basement, renovated,
floor_area_ratio, outdoor_space, season,
median_income, population,
pct_high_school_grad, pct_bachelors_degree,
pct_unemployed, pct_mgmt_occupation, pct_service_occupation, pct_sales_office_occupation, pct_construction_occupation,
pct_owner_occupied, median_year_built, pct_central_air,
mean_travel_time, pct_public_transport, pct_work_from_home,
avg_household_size, pct_households_with_children, pct_single_parent_households, median_age
)%>%
#write_csv('Data_with_census.csv')
Preprocessing
housing_data_with_census <- read_csv('Data_with_census.csv')
Rows: 4412 Columns: 47── Column specification ───────────────────────────────────────────
Delimiter: ","
chr (5): street, city, country, State, season
dbl (41): price, bedrooms, bathrooms, sqft_living, sqft_lot, f...
date (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

We decided to perform a log transformation on the price for make the
distribution more normal
print(vif_values)
bedrooms bathrooms
1.708079 2.840310
sqft_living floors
3.061817 2.763125
condition living_lot_ratio
1.271854 2.283708
pct_service_occupation pct_sales_office_occupation
3.574640 2.379895
pct_construction_occupation median_year_built
4.023819 4.305020
mean_travel_time pct_work_from_home
2.076649 3.530957
avg_household_size pct_households_with_children
4.812068 1.362034
pct_single_parent_households median_age
2.331042 3.079090
waterfront_X1 view_X1
1.326975 1.034925
view_X2 view_X3
1.061864 1.082554
view_X4 has_basement_X1
1.303335 1.705690
renovated_X1 season_Summer
1.235108 1.005978
VIF is bellow 5 for all predictors so we have addressed colinearity
problems.
library(multilevelmod)
set.seed(333)
## Create cross-validation folds for each model type
cv_folds_lm_rf <- vfold_cv(lm_rf_data, v = 10)
cv_folds_me <- vfold_cv(me_data_selected, v = 10)
cv_folds_nn <- vfold_cv(housing_data_for_model, v = 10)
# Create recipes for models
rec_lm_rf <- recipe(log_price ~ ., data = lm_rf_data)
rec_me <- recipe(log_price ~ ., data = me_data_selected)
rec_nn <- recipe(log_price ~ ., data = housing_data_for_model) %>%
step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
step_dummy(all_nominal_predictors(), -all_outcomes())
# Define the models
lm_model <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")
rf_model <- rand_forest(trees = 1000) %>% set_mode("regression") %>% set_engine("ranger", importance = "impurity")
me_model <- linear_reg() %>% set_engine("lmer") %>% set_mode("regression")
nn_model <- mlp(hidden_units = 10) %>% set_engine("nnet", linout = TRUE, trace = T) %>% set_mode("regression")
# Create workflows
lm_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(lm_model)
rf_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(rf_model)
me_workflow <- workflow() %>% add_recipe(rec_me) %>% add_model(me_model, formula = log_price ~ . + (1 | ZIP))
nn_workflow <- workflow() %>% add_recipe(rec_nn) %>% add_model(nn_model)
# Perform cross-validation
lm_cv_results <- fit_resamples(lm_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
rf_cv_results <- fit_resamples(rf_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
me_cv_results <- fit_resamples(me_workflow, resamples = cv_folds_me, metrics = metric_set(rmse, mae, rsq))
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
→ A | warning: unable to evaluate scaled gradient, Model failed to converge: degenerate Hessian with 1 negative eigenvalues
There were issues with some computations A: x1
There were issues with some computations A: x1
nn_cv_results <- fit_resamples(nn_workflow, resamples = cv_folds_nn, metrics = metric_set(rmse, mae, rsq))
# Summarize and compare the cross-validation results
cv_results <- bind_rows(
collect_metrics(lm_cv_results) %>% mutate(model = "Linear Regression"),
collect_metrics(rf_cv_results) %>% mutate(model = "Random Forest"),
collect_metrics(me_cv_results) %>% mutate(model = "Mixed Effects"),
collect_metrics(nn_cv_results) %>% mutate(model = "Neural Network")
)
cv_summary <- cv_results %>%
group_by(model, .metric) %>%
summarize(
mean_value = mean(mean, na.rm = TRUE),
std_error = mean(std_err, na.rm = TRUE),
.groups = 'drop'
) %>%
pivot_wider(names_from = .metric, values_from = c(mean_value, std_error))
print(cv_summary)
# Split the data into training and testing sets for each model type
data_split_lm_rf <- initial_split(lm_rf_data, prop = 0.8)
train_data_lm_rf <- training(data_split_lm_rf)
test_data_lm_rf <- testing(data_split_lm_rf)
data_split_me <- initial_split(me_data_selected, prop = 0.8)
train_data_me <- training(data_split_me)
test_data_me <- testing(data_split_me)
data_split_nn <- initial_split(housing_data_for_model, prop = 0.8)
train_data_nn <- training(data_split_nn)
test_data_nn <- testing(data_split_nn)
# Fit and evaluate each model using last_fit()
lm_last_fit <- last_fit(lm_workflow, data_split_lm_rf)
rf_last_fit <- last_fit(rf_workflow, data_split_lm_rf)
me_last_fit <- last_fit(me_workflow, data_split_me)
fixed-effect model matrix is rank deficient so dropping 10 columns / coefficients
boundary (singular) fit: see help('isSingular')
nn_last_fit <- last_fit(nn_workflow, data_split_nn)
# Extract predictions and metrics for each model
lm_results <- lm_last_fit %>% collect_predictions() %>% mutate(model = "Linear Regression")
rf_results <- rf_last_fit %>% collect_predictions() %>% mutate(model = "Random Forest")
me_results <- me_last_fit %>% collect_predictions() %>% mutate(model = "Mixed Effects")
nn_results <- nn_last_fit %>% collect_predictions() %>% mutate(model = "Neural Network")
# Combine the results
results <- bind_rows(lm_results, rf_results, me_results, nn_results)
# Plot predicted vs. actual for each model with R^2
results %>%
ggplot(aes(x = log_price, y = .pred, color = model)) +
geom_point(alpha = 0.5) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
facet_wrap(~ model, scales = "free") +
labs(
title = "Predicted vs. Actual House Prices",
x = "Actual Price",
y = "Predicted Price"
) +
theme_minimal() +
theme(legend.position = "none") +
geom_text(
data = results %>%
group_by(model) %>%
summarise(rsq = cor(log_price, .pred)^2),
aes(x = Inf, y = -Inf, label = sprintf("R^2 = %.3f", rsq)),
hjust = 1.1,
vjust = -1.1
)

The model with the best predictive accuracy was the random forest
model, followed by the linear regression model, neural network model,
and mixed effects model. Issues with model fitting are clear with the
mixed effects model due to the strong left skew in the error.
# Extract the trained Random Forest model from the last_fit object
rf_model <- rf_last_fit %>%
extract_fit_engine()
# Extract variable importance from the Random Forest model
rf_importance <- rf_model %>%
ranger::importance()
# Plot variable importance
ggplot(data.frame(variable = names(rf_importance), importance = rf_importance),
aes(x = reorder(variable, importance), y = importance, fill = importance)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(x = "Variable", y = "Importance", title = "Random Forest Variable Importance") +
theme_minimal()

SQFT of living space followed by occupation prevalence appear to be
most predictive of housing price. Things like view, and commuting time
are suprisingly much lower by comparison!

Conversly in the linear model, having an outswtanding view seems to
be the most important predictor. Very different apporaches!
---
title: "EDA Housing"
output:
  html_notebook: default
  pdf_document: default
---

```{r, echo=FALSE, include=FALSE}
library(tidyverse, warn.conflicts = F)
library(GGally)
library(corrplot)
library(lme4)
library(lubridate)  # Load lubridate for date manipulation
library(tidycensus)  # Load tidycensus for census data retrieval
library(car)  # Load car for VIF calculation
library(caret)  # Load caret for finding high correlations
library(MASS)  # Load MASS for stepAIC function
library(tidymodels, warn.conflicts = F)  # Load tidymodels for modeling
library(ranger)  # Load ranger for Random Forest
library(skimr)
library(DT)

tidymodels_prefer()
```

## Introduction

This analysis aims to predict housing prices using various machine learning models and census data. We will compare the performance of Linear Regression, Random Forest, Mixed Effects, and Neural Network models to identify the best approach for predicting housing prices.

The dataset includes housing features such as the number of bedrooms, bathrooms, square footage, and location, as well as census data like median income, population, education levels, and employment rates by ZIP code.

## Data Preprocessing

First, we read the housing dataset download from [kaggle](https://www.kaggle.com/datasets/shree1992/housedata/data "Dataset Download"). We convert relevant columns to appropriate data types and create new features like house age, years since renovation, and living area to lot size ratio.

head(housing_data)

```{r}
housing_data <- read_csv('data.csv')

# Convert relevant columns to appropriate data types
housing_data$view <- as.factor(housing_data$view)
housing_data$waterfront <- as.factor(housing_data$waterfront)
housing_data$condition <- as.factor(housing_data$condition)
housing_data$date <- as.Date(housing_data$date)
housing_data$price <- as.numeric(housing_data$price)
housing_data$bedrooms <- as.numeric(housing_data$bedrooms)
housing_data$bathrooms <- as.numeric(housing_data$bathrooms)
housing_data$sqft_living <- as.numeric(housing_data$sqft_living)
housing_data$sqft_lot <- as.numeric(housing_data$sqft_lot)
housing_data$floors <- as.numeric(housing_data$floors)
housing_data$sqft_above <- as.numeric(housing_data$sqft_above)
housing_data$sqft_basement <- as.numeric(housing_data$sqft_basement)
housing_data$yr_built <- as.integer(housing_data$yr_built)
housing_data$yr_renovated <- as.integer(housing_data$yr_renovated)
housing_data$street <- as.character(housing_data$street)
housing_data$city <- as.character(housing_data$city)
housing_data$statezip <- as.character(housing_data$statezip)
housing_data$country <- as.character(housing_data$country)

summary(housing_data)

housing_data <- housing_data %>%
  mutate(ZIP = str_extract(statezip, "\\d{5}"))



housing_data <- housing_data %>%
  mutate(
    house_age = as.integer(format(Sys.Date(), "%Y")) - yr_built,
    years_since_renovation = if_else(yr_renovated > 0, as.integer(format(Sys.Date(), "%Y")) - yr_renovated, house_age),
    total_rooms = bedrooms + bathrooms,
    living_lot_ratio = sqft_living / sqft_lot,
    has_basement = as.integer(sqft_basement > 0),
    renovated = as.integer(yr_renovated > 0),
    floor_area_ratio = sqft_above / sqft_lot,
    outdoor_space = sqft_lot - sqft_living,
    season = case_when(
      month(date) %in% 3:5 ~ "Spring",
      month(date) %in% 6:8 ~ "Summer",
      month(date) %in% 9:11 ~ "Autumn",
      TRUE ~ "Winter"
    )
  )
datatable(housing_data, options = list(pageLength = 5))
```

-   price: Ranges from \$0 (which may indicate missing or placeholder values) to \$26,590,000 with a mean of \$551,963.

-   bedrooms: Ranges from 0 to 9, with an average of about 3.4 bedrooms per house.

-   bathrooms: Ranges from 0 to 8, with a mean of approximately 2.16 bathrooms per house.

-   sqft_living: Living area square footage ranges from 370 to 13,540 sqft, with a mean of 2,139 sqft.

-   sqft_lot: Lot size square footage varies widely from 638 to 1,074,218 sqft, indicating a mix of urban to very large rural properties.

-   floors, waterfront, view, condition: These features are categorical or binary, with floors ranging from 1 to 3.5, indicating multi-level houses are included. Waterfront has a binary indicator, with very few waterfront properties present. View and condition are ordinal, with their own range of values.

```{r}
skim(housing_data)
# Price Distribution
ggplot(housing_data, aes(x = price)) + 
  geom_histogram(bins = 30, fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Price Distribution")

# Bedrooms Distribution
ggplot(housing_data, aes(x = factor(bedrooms))) + 
  geom_bar(fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Bedrooms Distribution", x = "Bedrooms")

# Bathrooms Distribution
ggplot(housing_data, aes(x = factor(bathrooms))) + 
  geom_bar(fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Bathrooms Distribution", x = "Bathrooms")

# Living Area sqft Distribution
ggplot(housing_data, aes(x = sqft_living)) + 
  geom_histogram(bins = 30, fill = "lightblue", color = "black") + 
  theme_minimal() + 
  labs(title = "Living Area sqft Distribution")

# Correlation Heatmap
numeric_data <- select(housing_data, where(is.numeric))
cor_matrix <- cor(numeric_data)
corrplot(cor_matrix, method = 'circle', type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)
```

## Incorperating Census Data
```{r}
# Set the Census API key
census_api_key('bcbac889885cad20c1a787bc242d4aa4011251a2')

# Retrieve median income data by ZIP code
income_data <- get_acs(
  geography = "zcta",
  variables = "B19013_001",
  year = 2019,
  survey = "acs5"
)

f_income <- income_data %>%
  filter(GEOID %in% housing_data$ZIP) %>%
  rename(ZIP = GEOID, median_income = estimate)

# Retrieve population data for each ZIP code (ZCTA) from the latest ACS data
population_data <- get_acs(
  geography = "zcta",
  variables = "B01003_001",
  year = 2019,
  survey = "acs5"
)

f_pop <- population_data %>%
  filter(GEOID %in% housing_data$ZIP) %>%
  rename(ZIP = GEOID, population = estimate)

# Retrieve education data by ZIP code
education_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP02_0059P", # Percent high school graduate or higher
    "DP02_0065PE"  # Percent bachelor's degree or higher
  ),
  year = 2019,
  survey = "acs5"
)

education_data <- education_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_high_school_grad = mean(DP02_0059P, na.rm = TRUE),
    pct_bachelors_degree = mean(DP02_0065P, na.rm = TRUE)
  )

# Retrieve employment and occupation data by ZIP code
employment_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP03_0005PE", # Percent unemployed
    "DP03_0027PE", # Percent in management, business, science, and arts occupations
    "DP03_0028PE", # Percent in service occupations
    "DP03_0029PE", # Percent in sales and office occupations
    "DP03_0030PE"  # Percent in natural resources, construction, and maintenance occupations
  ),
  year = 2019,
  survey = "acs5"
)

employment_data <- employment_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_unemployed = mean(DP03_0005P, na.rm = TRUE),
    pct_mgmt_occupation = mean(DP03_0027P, na.rm = TRUE),
    pct_service_occupation = mean(DP03_0028P, na.rm = TRUE),
    pct_sales_office_occupation = mean(DP03_0029P, na.rm = TRUE),
    pct_construction_occupation = mean(DP03_0030P, na.rm = TRUE)
  )

# Retrieve housing characteristics data by ZIP code
housing_chars_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP04_0046PE", # Percent owner-occupied housing units
    "DP04_0039E", # Median year built for housing structures
    "DP04_0093PE"  # Percent of housing units with central air conditioning
  ),
  year = 2019,
  survey = "acs5"
)

housing_chars_data <- housing_chars_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    pct_owner_occupied = mean(DP04_0046P, na.rm = TRUE),
    median_year_built = mean(DP04_0039, na.rm = TRUE),
    pct_central_air = mean(DP04_0093P, na.rm = TRUE)
  )

# Retrieve commuting patterns data by ZIP code
commuting_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP03_0025E", # Mean travel time to work (minutes)
    "DP03_0021PE", # Percent using public transportation (excluding taxicab)
    "DP03_0024PE"  # Percent working from home
  ),
  year = 2019,
  survey = "acs5"
)

commuting_data <- commuting_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    mean_travel_time = mean(DP03_0025, na.rm = TRUE),
    pct_public_transport = mean(DP03_0021P, na.rm = TRUE),
    pct_work_from_home = mean(DP03_0024P, na.rm = TRUE)
  )

# Retrieve household characteristics data by ZIP code
household_data <- get_acs(
  geography = "zcta",
  variables = c(
    "DP02_0016E", # Average household size
    "DP02_0007PE", # Percent of households with own children under 18 years
    "DP02_0009PE", # Percent of single-parent households
    "DP05_0018E"  # Median age
  ),
  year = 2019,
  survey = "acs5"
)

household_data <- household_data %>%
  pivot_wider(names_from = variable, values_from = estimate) %>%
  group_by(GEOID) %>%
  summarize(
    avg_household_size = mean(DP02_0016, na.rm = TRUE),
    pct_households_with_children = mean(DP02_0007P, na.rm = TRUE),
    pct_single_parent_households = mean(DP02_0009P, na.rm = TRUE),
    median_age = mean(DP05_0018, na.rm = TRUE)
  )

# Merge all the Census data with the housing data
housing_data_with_census <- housing_data %>%
  left_join(f_income, by = "ZIP") %>%
  left_join(f_pop, by = "ZIP") %>%
  left_join(education_data, by = c("ZIP" = "GEOID")) %>%
  left_join(employment_data, by = c("ZIP" = "GEOID")) %>%
  left_join(housing_chars_data, by = c("ZIP" = "GEOID")) %>%
  left_join(commuting_data, by = c("ZIP" = "GEOID")) %>%
  left_join(household_data, by = c("ZIP" = "GEOID"))%>%
    select(
    date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition,
    sqft_above, sqft_basement, yr_built, yr_renovated, street, city, country, ZIP,
    house_age, years_since_renovation, total_rooms, living_lot_ratio, has_basement, renovated,
    floor_area_ratio, outdoor_space, season,
    median_income, population,
    pct_high_school_grad, pct_bachelors_degree,
    pct_unemployed, pct_mgmt_occupation, pct_service_occupation, pct_sales_office_occupation, pct_construction_occupation,
    pct_owner_occupied, median_year_built, pct_central_air,
    mean_travel_time, pct_public_transport, pct_work_from_home,
    avg_household_size, pct_households_with_children, pct_single_parent_households, median_age
  )%>%
  #write_csv('Data_with_census.csv')



```

# Preprocessing
```{r}
housing_data_with_census <- read_csv('Data_with_census.csv')

# Check for multicollinearity using VIF
housing_data_final_num <- housing_data_for_model %>%
  select_if(is.numeric)

vif_model <- lm(price ~ ., data = housing_data_final_num)
vif_values <- vif(vif_model)
print(vif_values)

# Correlation matrix
cor_matrix <- cor(housing_data_final_num)
#write.csv(cor_matrix, file = "correlation_matrix.csv", row.names = TRUE)
corrplot::corrplot(cor_matrix, method = "circle")

# Find highly correlated variables
high_correlation <- findCorrelation(cor_matrix, cutoff = 0.75)
high_correlation_names <- names(housing_data_final_num)[high_correlation]
print(high_correlation_names)

# Remove highly correlated variables (one from each pair)
housing_data_reduced <- housing_data_final_num[, -high_correlation]
```

```{r}
# Prepare data for modeling
housing_data_for_model <- housing_data_with_census %>%
  select(
    -yr_built, -yr_renovated, -floor_area_ratio, -country,
    -State, -street, -date, -total_rooms, -outdoor_space,
    -sqft_basement, -sqft_above, -pct_mgmt_occupation, -pct_high_school_grad, -pct_public_transport, -median_income, -pct_owner_occupied, -pct_central_air, -pct_bachelors_degree, -city, -country)

# Transform price to log scale
housing_data_for_model <- housing_data_for_model %>%
  filter(price != 0, !is.na(mean_travel_time))%>%
  mutate(log_price = log(price))

ggplot(housing_data_for_model, aes(x=log_price)) +
  geom_histogram(bins=30, fill="lightblue", color='black', alpha=0.7) +
  ggtitle("Histogram of Log-transformed Prices") +
  xlab("Log of Price") +
  ylab("Frequency") # much better shape than earlier

```
We decided to perform a log transformation on the price for make the distribution more normal


```{r}
# Basic preprocessing applicable to all models
housing_data_for_model <- housing_data_with_census %>%
  select(-yr_built, -yr_renovated, -floor_area_ratio, -country, 
         -State, -street, -date, -total_rooms, -outdoor_space,
         -sqft_basement, -sqft_above, -pct_mgmt_occupation, 
         -pct_high_school_grad, -pct_public_transport, -median_income, 
         -pct_owner_occupied, -pct_central_air, -pct_bachelors_degree, -city, -country) %>%
  filter(price != 0, !is.na(mean_travel_time)) %>%
  mutate(log_price = log(price)) %>%
  select(-price)%>%
  mutate(
    season = as.factor(season),
    waterfront = as.factor(waterfront),
    view = as.factor(view),
    has_basement = as.factor(has_basement),
    renovated = as.factor(renovated)
  )
# Separate data for mixed-effects model to include ZIP
me_data <- housing_data_for_model %>%
  mutate(ZIP = as.factor(ZIP))

# Data for other models where ZIP is not required
other_models_data <- select(housing_data_for_model, -ZIP)

# Create recipes for other models and mixed-effects model
rec_other <- recipe(log_price ~ ., data = other_models_data) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes())

rec_me <- recipe(log_price ~ ., data = me_data) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes(), -ZIP)

# Preprocess the data for stepwise AIC
preprocessed_data <- prep(rec_other) %>% bake(new_data = NULL)
preprocessed_me_data <- prep(rec_me) %>% bake(new_data = NULL)

# Perform stepwise AIC feature selection
full_model <- lm(log_price ~ ., data = preprocessed_data)
reduced_model <- stepAIC(full_model, direction = "both", trace = T)

# Print the selected variables
selected_vars <- names(reduced_model$coefficients)[-1]
print(paste("Selected variables:", paste(selected_vars, collapse = ", ")))

# Update the data for models (except neural network) with the selected variables
lm_rf_data <- preprocessed_data %>%
  select(log_price, all_of(selected_vars))

me_data_selected <- preprocessed_me_data %>%
  select(log_price, ZIP, all_of(selected_vars))

# Check for multicollinearity using VIF
vif_model <- lm(log_price ~ ., data = lm_rf_data)
vif_values <- vif(vif_model)
print(vif_values)
```
VIF is bellow 5 for all predictors so we have addressed colinearity problems.

```{r}
library(multilevelmod)
set.seed(333)

## Create cross-validation folds for each model type
cv_folds_lm_rf <- vfold_cv(lm_rf_data, v = 10)
cv_folds_me <- vfold_cv(me_data_selected, v = 10)
cv_folds_nn <- vfold_cv(housing_data_for_model, v = 10)

# Create recipes for models
rec_lm_rf <- recipe(log_price ~ ., data = lm_rf_data)
rec_me <- recipe(log_price ~ ., data = me_data_selected)
rec_nn <- recipe(log_price ~ ., data = housing_data_for_model) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_dummy(all_nominal_predictors(), -all_outcomes())

# Define the models
lm_model <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")
rf_model <- rand_forest(trees = 1000) %>% set_mode("regression") %>% set_engine("ranger", importance = "impurity")
me_model <- linear_reg() %>% set_engine("lmer") %>% set_mode("regression")
nn_model <- mlp(hidden_units = 10) %>% set_engine("nnet", linout = TRUE, trace = T) %>% set_mode("regression")

# Create workflows
lm_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(lm_model)
rf_workflow <- workflow() %>% add_recipe(rec_lm_rf) %>% add_model(rf_model)
me_workflow <- workflow() %>% add_recipe(rec_me) %>% add_model(me_model, formula = log_price ~ . + (1 | ZIP))
nn_workflow <- workflow() %>% add_recipe(rec_nn) %>% add_model(nn_model)

# Perform cross-validation
lm_cv_results <- fit_resamples(lm_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
rf_cv_results <- fit_resamples(rf_workflow, resamples = cv_folds_lm_rf, metrics = metric_set(rmse, mae, rsq))
me_cv_results <- fit_resamples(me_workflow, resamples = cv_folds_me, metrics = metric_set(rmse, mae, rsq))
nn_cv_results <- fit_resamples(nn_workflow, resamples = cv_folds_nn, metrics = metric_set(rmse, mae, rsq))

# Summarize and compare the cross-validation results
cv_results <- bind_rows(
  collect_metrics(lm_cv_results) %>% mutate(model = "Linear Regression"),
  collect_metrics(rf_cv_results) %>% mutate(model = "Random Forest"),
  collect_metrics(me_cv_results) %>% mutate(model = "Mixed Effects"),
  collect_metrics(nn_cv_results) %>% mutate(model = "Neural Network")
)

cv_summary <- cv_results %>%
  group_by(model, .metric) %>%
  summarize(
    mean_value = mean(mean, na.rm = TRUE),
    std_error = mean(std_err, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  pivot_wider(names_from = .metric, values_from = c(mean_value, std_error))

print(cv_summary)
# Split the data into training and testing sets for each model type
data_split_lm_rf <- initial_split(lm_rf_data, prop = 0.8)
train_data_lm_rf <- training(data_split_lm_rf)
test_data_lm_rf <- testing(data_split_lm_rf)

data_split_me <- initial_split(me_data_selected, prop = 0.8)
train_data_me <- training(data_split_me)
test_data_me <- testing(data_split_me)

data_split_nn <- initial_split(housing_data_for_model, prop = 0.8)
train_data_nn <- training(data_split_nn)
test_data_nn <- testing(data_split_nn)

# Fit and evaluate each model using last_fit()
lm_last_fit <- last_fit(lm_workflow, data_split_lm_rf)
rf_last_fit <- last_fit(rf_workflow, data_split_lm_rf)
me_last_fit <- last_fit(me_workflow, data_split_me)
nn_last_fit <- last_fit(nn_workflow, data_split_nn)

# Extract predictions and metrics for each model
lm_results <- lm_last_fit %>% collect_predictions() %>% mutate(model = "Linear Regression")
rf_results <- rf_last_fit %>% collect_predictions() %>% mutate(model = "Random Forest")
me_results <- me_last_fit %>% collect_predictions() %>% mutate(model = "Mixed Effects")
nn_results <- nn_last_fit %>% collect_predictions() %>% mutate(model = "Neural Network")

# Combine the results
results <- bind_rows(lm_results, rf_results, me_results, nn_results)

# Plot predicted vs. actual for each model with R^2
results %>%
  ggplot(aes(x = log_price, y = .pred, color = model)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  facet_wrap(~ model, scales = "free") +
  labs(
    title = "Predicted vs. Actual House Prices",
    x = "Actual Price",
    y = "Predicted Price"
  ) +
  theme_minimal() +
  theme(legend.position = "none") +
  geom_text(
    data = results %>%
      group_by(model) %>%
      summarise(rsq = cor(log_price, .pred)^2),
    aes(x = Inf, y = -Inf, label = sprintf("R^2 = %.3f", rsq)),
    hjust = 1.1,
    vjust = -1.1
  )
```
The model with the best predictive accuracy was the random forest model, followed by the linear regression model, neural network model, and mixed effects model. Issues with model fitting are clear with the mixed effects model due to the strong left skew in the error.

```{r}
# Extract the trained Random Forest model from the last_fit object
rf_model <- rf_last_fit %>% 
  extract_fit_engine()

# Extract variable importance from the Random Forest model
rf_importance <- rf_model %>% 
  ranger::importance()

# Plot variable importance
ggplot(data.frame(variable = names(rf_importance), importance = rf_importance), 
       aes(x = reorder(variable, importance), y = importance, fill = importance)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Variable", y = "Importance", title = "Random Forest Variable Importance") +
  theme_minimal()

```
SQFT of living space followed by occupation prevalence appear to be most predictive of housing price. Things like view, and commuting time are suprisingly much lower by comparison!
```{r}
# Extract the trained models from the last_fit objects
lm_model <- lm_last_fit %>% extract_fit_engine()
rf_model <- rf_last_fit %>% extract_fit_engine()
me_model <- me_last_fit %>% extract_fit_engine()
nn_model <- nn_last_fit %>% extract_fit_engine()

# Extract coefficients from the Linear Regression model
lm_coef <- coef(lm_model)
lm_coef_df <- data.frame(variable = names(lm_coef), coefficient = lm_coef)

# Plot coefficients for the Linear Regression model
ggplot(lm_coef_df, aes(x = reorder(variable, coefficient), y = coefficient, fill = coefficient)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Variable", y = "Coefficient", title = "Linear Regression Coefficients") +
  theme_minimal()
```
Conversly in the linear model, having an outswtanding view seems to be the most important predictor. Very different apporaches!

