Introduction

The U.S. Real Estate Income data set encompasses data from 2014 to 2022, including average sales prices, mean and median incomes, and the number of households across various regions: Midwest, Northeast, South, United States, and West, spanning from January to December each year. Analyzing this data set is valuable for examining the relationships between real estate prices and household incomes across different regions in the United States.

Key Findings

Analysis

Import Data

df <- read.csv("~/Case Studies/US Real Estate/RealEstateUnitedStates.csv")

Import Libraries

library('magrittr')
library('dplyr')
library('ggplot2')
library('patchwork')

Data Structure

str(df)
## 'data.frame':    1575 obs. of  10 variables:
##  $ Year                            : int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ Month                           : chr  "December" "December" "December" "December" ...
##  $ Region                          : chr  "Midwest" "Midwest" "Midwest" "Northeast" ...
##  $ Home.Size                       : chr  "Double" "Single " "Total1" "Double" ...
##  $ Average.Sales.Price             : num  144300 82300 104700 158300 75300 ...
##  $ Number.of.Households..Thousands.: num  28280 28280 28280 22630 22630 ...
##  $ Median.Income...Current.Dollars : num  73070 73070 73070 80360 80360 ...
##  $ Median.Income...2022.Dollars    : num  73070 73070 73070 80360 80360 ...
##  $ Mean.Income...Current.Dollars   : num  102400 102400 102400 115300 115300 ...
##  $ Mean.Income...2022.Dollars      : num  102400 102400 102400 115300 115300 ...

Data Cleaning

Clean Column Names

## change all col names to lower case 
colnames(df) <- tolower(colnames(df))

Trim Off Trailing Spaces

df <- df %>% 
    mutate_at(vars(month, region, home.size), trimws)

Check For NAs

sum(is.na(df))
## [1] 0

Check For Duplicates

sum(duplicated(df))
## [1] 0

Overall Data Summary

summarydf <- df %>% 
    select(-year, -month, -region, -home.size)
summary(summarydf)
##  average.sales.price number.of.households..thousands.
##  Min.   : 33900      Min.   : 22030                  
##  1st Qu.: 59350      1st Qu.: 27460                  
##  Median : 81700      Median : 29100                  
##  Mean   : 83472      Mean   : 51233                  
##  3rd Qu.:100300      3rd Qu.: 49760                  
##  Max.   :178200      Max.   :131400                  
##  median.income...current.dollars median.income...2022.dollars
##  Min.   :49660                   Min.   :60060               
##  1st Qu.:58310                   1st Qu.:69310               
##  Median :64280                   Median :74380               
##  Mean   :65156                   Mean   :74205               
##  3rd Qu.:70780                   3rd Qu.:78350               
##  Max.   :82890                   Max.   :86820               
##  mean.income...current.dollars mean.income...2022.dollars
##  Min.   : 70990                Min.   : 85870            
##  1st Qu.: 81960                1st Qu.: 96300            
##  Median : 91610                Median :104100            
##  Mean   : 92139                Mean   :104891            
##  3rd Qu.:102200                3rd Qu.:111700            
##  Max.   :117500                Max.   :125200

Number of Households by Region

region_pop_gen <- df %>% 
    select(year, region, number.of.households..thousands.) %>% 
    group_by(year) %>% 
    distinct(year, region, .keep_all = TRUE) 

regioncolour <- c("darkseagreen4", "burlywood4", "coral4","peachpuff2", "tan")
ggplot(region_pop_gen, aes(x = region, y = number.of.households..thousands., fill = region)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = regioncolour) +
    labs(title = "Households Distribution by Regions", 
         x = "Regions", y = "No. of Households", fill = "Region") +
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace")) 

Most Populated Region by Year

top_region_hh_yr <- df %>%  
    select(year, region, number.of.households..thousands.) %>% #select relevant data
    filter(region!= "United States") %>% #exclude US
    group_by(year) %>%  #group data by year 
    filter(number.of.households..thousands. == max(number.of.households..thousands.)) %>% #keep only rows with max no. of households
    distinct(year, region, .keep_all = TRUE) %>%  #show only unique years and region while keeping all columns 
    ungroup()

southcolor <- c("coral4")
ggplot(top_region_hh_yr, aes(x = year, y = number.of.households..thousands., fill = region)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = southcolor, name = "Region") +
    labs(title = "Most Populated Region by Year",
        x = "Year", y = "No. of Households", fill = "Region") +
    scale_x_continuous(breaks = seq(min(df$year), max(df$year), by =1)) +
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace")) 

Distribution of Population for Each Region by Year

region_pop_yr <- df %>% 
    select(year, region, number.of.households..thousands.) %>% 
    group_by(year) %>%  
    distinct(region, .keep_all = TRUE) %>% 
    ungroup()

ggplot(region_pop_yr, aes(x = year, y = number.of.households..thousands., color = region)) + 
    geom_line(size = 1) +
    geom_point(size = 2) +
    scale_color_manual(values = regioncolour) +
    labs(title = "Household Distribution of Each Region by Year", x = "Year",
         y = "No. of Households", color = "Region") + 
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace")) +
    scale_x_continuous(breaks = seq(min(df$year), max(df$year), by = 1)) +
    scale_y_continuous(breaks = seq(0, max(df$number.of.households..thousands.), by = 25000))

Distribution of Average Sales Price by Region

avg_sp_region_gen <- df %>%  
    select(year, region, average.sales.price) %>% 
    group_by(region) %>% 
    summarise('average.sales.price' = mean(average.sales.price)) %>% 
    ungroup()
    
ggplot(avg_sp_region_gen, aes(x = region, y = average.sales.price, group = 1)) +
    geom_line(color = "burlywood4", size = 1) +
    labs(title = "Average Sales Prices by Region", 
         x = "Region", y = "Average Sales Price ($)") + 
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace"))

Distribution of Average Sales Price of Each Region by Year

avg_sp_region_yr <- df %>%  
    select(year, region, average.sales.price) %>% 
    group_by(year, region) %>%  
    summarise('average.sales.price' = sum(average.sales.price)) %>% 
    ungroup()

ggplot(avg_sp_region_yr, aes(x = year, y = average.sales.price, color = region)) +
    geom_line(size = 1) +
    geom_point(size = 2) +
    scale_color_manual(values = regioncolour) + 
    labs(title = "Average Sales Price of Each Region by Year",
         x = 'Year', y = 'Average Sales Price ($)', color = "Region") + 
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace")) +
    scale_x_continuous(breaks = seq(min(avg_sp_region_yr$year), max(avg_sp_region_yr$year), by = 1)) +
    scale_y_continuous(breaks = seq(0, max(avg_sp_region_yr$average.sales.price), by = 500000))

Mean & Median Income across Each Region

mean_med_income_reg <- df %>% 
    select(year, region, mean.income...current.dollars, median.income...current.dollars) %>% 
    distinct(year, region, .keep_all = TRUE)

p1 <- ggplot(df, aes(x = region, y = mean.income...current.dollars, fill = region)) +
        geom_boxplot() +
        geom_point() + 
        scale_fill_manual(values = regioncolour) + 
        labs(title = "Mean Income by Region",
             x = "Region", y = "Mean Income ($)", fill = "Region") + 
        theme(text = element_text(family = "Helvetica", size = 12),
              plot.background = element_rect(fill = "snow1"),
              panel.background = element_rect(fill = "oldlace"),
              panel.grid = element_line(color = "snow1", size = 1),
              legend.background = element_rect(fill = "oldlace")) +
        scale_y_continuous(limits = c(50000, NA),
                           breaks = seq(50000, max(mean_med_income_reg$mean.income...current.dollars), by = 10000))

p2 <- ggplot(df, aes(x = region, y = median.income...current.dollars, fill = region)) + 
        geom_boxplot() +
        geom_point() +
        scale_fill_manual(values = regioncolour) +
        labs(title = "Median Income by Region",
             x = "Region", y = "Median Income ($)", fill = "Region") + 
        theme(text = element_text(family = "Helvetica", size = 12),
              plot.background = element_rect(fill = "snow1"),
              panel.background = element_rect(fill = "oldlace"),
              panel.grid = element_line(color = "snow1", size = 1),
              legend.background = element_rect(fill = "oldlace")) +
        scale_y_continuous(limits = c(50000, NA),
                           breaks = seq(50000, max(mean_med_income_reg$median.income...current.dollars), by = 10000))

combined1 <- p1 + p2 + plot_layout(ncol = 2)
combined1

Average Sales Price by Home Size

avg_sp_hs_gen <- df %>% 
    select(home.size, average.sales.price) %>% 
    group_by(home.size) %>% 
    summarise('average.sales.price' = mean(average.sales.price)) %>% 
    ungroup()

homesizecolour<- c("honeydew4", "navajowhite3","rosybrown2")
ggplot(avg_sp_hs_gen, aes(x = home.size, y = average.sales.price, fill = home.size)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = homesizecolour) +
    labs(title = "Average Sales Price by Home Size",
         x = "Home Size", y = "Average Sales Price ($)", fill = "Home Size") +
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace"))

Average Sales Price each year by home size

avg_sp_hs_yr <- df %>%  
    select(year, home.size, average.sales.price) %>% 
    group_by(year, home.size) %>% 
    summarise('average.sales.price' = mean(average.sales.price)) %>% 
    ungroup()

ggplot(avg_sp_hs_yr, aes(x = year, y = average.sales.price, color = home.size)) +
    geom_line(size = 1) +
    geom_point(size = 2) +
    scale_color_manual(values = homesizecolour) + 
    labs(title = "Average Sales Price Each Year by Home Size", 
        x = "Year", y = "Average Sales Price ($)", color = "Home Size") + 
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace")) +
    scale_x_continuous(breaks = seq(min(df$year), max(df$year), by = 1))

Box Plots of Mean & Median Income by Home Size

p3 <- ggplot(df, aes(x = home.size, y = median.income...current.dollars, fill = home.size)) + 
        geom_point() +
        geom_boxplot() +
        scale_fill_manual(values = homesizecolour) +
        labs(title = "Box Plot of Median Income by Home Size",
             x = "Home Size", y = "Median Income ($)", fill = "Home Size") + 
        theme(text = element_text(family = "Helvetica", size = 12),
              plot.background = element_rect(fill = "snow1"),
              panel.background = element_rect(fill = "oldlace"),
              panel.grid = element_line(color = "snow1", size = 1),
              legend.background = element_rect(fill = "oldlace"))

p4 <- ggplot(df, aes(x = home.size, y = mean.income...current.dollars, fill = home.size)) +
        geom_point() +
        geom_boxplot() +
        scale_fill_manual(values = homesizecolour) +
        labs(title = "Box Plot of Mean Income by Home Size",
             x = "Home Size", y = "Mean Income ($)", fill = "Home Size") + 
        theme(text = element_text(family = "Helvetica", size = 12),
              plot.background = element_rect(fill = "snow1"),
              panel.background = element_rect(fill = "oldlace"),
              panel.grid = element_line(color = "snow1", size = 1),
              legend.background = element_rect(fill = "oldlace"))
    
combined2 <- p3 + p4 +plot_layout(ncol = 2)
combined2 

ggplot(df, aes(x = home.size, y = average.sales.price, fill = home.size)) + 
    geom_point() + 
    geom_boxplot() +
    scale_fill_manual(values = homesizecolour) +
    labs(title = "Box Plot of Average Sales Price by Home",
         x  = "Home Size", y = "Average Sales Price ($)", fill = "Home Size") + 
    theme(text = element_text(family = "Helvetica", size = 12),
          plot.background = element_rect(fill = "snow1"),
          panel.background = element_rect(fill = "oldlace"),
          panel.grid = element_line(color = "snow1", size = 1),
          legend.background = element_rect(fill = "oldlace"))

Correlation Matrix

corr_mat_df <- df %>% 
    select(-year, -month, -region, - home.size)
corr_mat <- cor(corr_mat_df)
ggcorrplot::ggcorrplot(corr_mat, type = "lower", lab = TRUE, hc.order = TRUE,
                       colors = c("darkseagreen4", "peachpuff2", "indianred4")) +
    labs(title = "Correlation Matrix of Real Estate Income") +
    theme(text = element_text(family = "Helvetica", size = 12),
          panel.background = element_rect(fill = "oldlace"),
          legend.background = element_rect(fill = "oldlace"))

Scatter Plots of Average Sales Price VS Mean & Median Income

p5 <- ggplot(df, aes(x = average.sales.price, y = mean.income...current.dollars)) +
        geom_point(color = "bisque4") +
        geom_smooth(method = "lm", se  = FALSE, color = "rosybrown4", formula = y ~ x) + 
        labs(title = "Scatter Plot of Average Sales Price VS Mean Income",
             x  = "Average Sales Price ($)", y  = "Mean Income ($)") + 
        theme(text = element_text(family = "Helvetica", size = 12),
              plot.background = element_rect(fill = "snow1"),
              panel.background = element_rect(fill = "oldlace"),
              panel.grid = element_line(color = "snow1", size = 1),
              legend.background = element_rect(fill = "oldlace")) +
        scale_y_continuous(limits = c(50000, NA),
                           breaks = seq(50000, max(df$mean.income...current.dollars), by = 10000))

p6 <- ggplot(df, aes(x = average.sales.price, y = median.income...current.dollars)) + 
        geom_point(color = "honeydew4") +
        geom_smooth(method = "lm", se  = FALSE, color = "rosybrown4", formula = y ~ x) +
        labs(title = "Scatter Plot of Average Sales Price VS Median Income",
             x  = "Average Sales Price ($)", y  = "Median Income ($)") + 
        theme(text = element_text(family = "Helvetica", size = 12),
              plot.background = element_rect(fill = "snow1"),
              panel.background = element_rect(fill = "oldlace"),
              panel.grid = element_line(color = "snow1", size = 1),
              legend.background = element_rect(fill = "oldlace")) +
        scale_y_continuous(limits = c(50000, NA),
                           breaks = seq(50000, max(df$median.income...current.dollars), by = 10000))

combined3 <- p5 + p6 +plot_layout(ncol = 2)
combined3

Conclusion

In conclusion, this analysis offers a comprehensive view of the U.S. real estate market and its economic landscape. The findings highlight notable trends, such as the Southern region having the highest number of households despite lower income levels compared to other areas. Additionally, double homes emerge as the most expensive type, and there is a moderate correlation between average sales prices and income. These insights provide valuable understanding of the dynamics within the U.S. real estate market.