The U.S. Real Estate Income data set encompasses data from 2014 to 2022, including average sales prices, mean and median incomes, and the number of households across various regions: Midwest, Northeast, South, United States, and West, spanning from January to December each year. Analyzing this data set is valuable for examining the relationships between real estate prices and household incomes across different regions in the United States.
The Southern region of the U.S. has the highest population, with 442,200 households.
The Western region of the U.S. has the highest average sales price, at $93,298.
The Southern region is closely following the Western region in terms of rising sales prices.
There is a wider income distribution in the Northeastern and Western regions of the U.S.
The Southern region of the U.S. has relatively lower income levels.
Double homes are the most expensive among the three home sizes, with an average sales price of $107,074.
There is a moderate correlation between average sales prices and income.
df <- read.csv("~/Case Studies/US Real Estate/RealEstateUnitedStates.csv")
library('magrittr')
library('dplyr')
library('ggplot2')
library('patchwork')
str(df)
## 'data.frame': 1575 obs. of 10 variables:
## $ Year : int 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
## $ Month : chr "December" "December" "December" "December" ...
## $ Region : chr "Midwest" "Midwest" "Midwest" "Northeast" ...
## $ Home.Size : chr "Double" "Single " "Total1" "Double" ...
## $ Average.Sales.Price : num 144300 82300 104700 158300 75300 ...
## $ Number.of.Households..Thousands.: num 28280 28280 28280 22630 22630 ...
## $ Median.Income...Current.Dollars : num 73070 73070 73070 80360 80360 ...
## $ Median.Income...2022.Dollars : num 73070 73070 73070 80360 80360 ...
## $ Mean.Income...Current.Dollars : num 102400 102400 102400 115300 115300 ...
## $ Mean.Income...2022.Dollars : num 102400 102400 102400 115300 115300 ...
## change all col names to lower case
colnames(df) <- tolower(colnames(df))
df <- df %>%
mutate_at(vars(month, region, home.size), trimws)
sum(is.na(df))
## [1] 0
sum(duplicated(df))
## [1] 0
summarydf <- df %>%
select(-year, -month, -region, -home.size)
summary(summarydf)
## average.sales.price number.of.households..thousands.
## Min. : 33900 Min. : 22030
## 1st Qu.: 59350 1st Qu.: 27460
## Median : 81700 Median : 29100
## Mean : 83472 Mean : 51233
## 3rd Qu.:100300 3rd Qu.: 49760
## Max. :178200 Max. :131400
## median.income...current.dollars median.income...2022.dollars
## Min. :49660 Min. :60060
## 1st Qu.:58310 1st Qu.:69310
## Median :64280 Median :74380
## Mean :65156 Mean :74205
## 3rd Qu.:70780 3rd Qu.:78350
## Max. :82890 Max. :86820
## mean.income...current.dollars mean.income...2022.dollars
## Min. : 70990 Min. : 85870
## 1st Qu.: 81960 1st Qu.: 96300
## Median : 91610 Median :104100
## Mean : 92139 Mean :104891
## 3rd Qu.:102200 3rd Qu.:111700
## Max. :117500 Max. :125200
region_pop_gen <- df %>%
select(year, region, number.of.households..thousands.) %>%
group_by(year) %>%
distinct(year, region, .keep_all = TRUE)
regioncolour <- c("darkseagreen4", "burlywood4", "coral4","peachpuff2", "tan")
ggplot(region_pop_gen, aes(x = region, y = number.of.households..thousands., fill = region)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = regioncolour) +
labs(title = "Households Distribution by Regions",
x = "Regions", y = "No. of Households", fill = "Region") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace"))
top_region_hh_yr <- df %>%
select(year, region, number.of.households..thousands.) %>% #select relevant data
filter(region!= "United States") %>% #exclude US
group_by(year) %>% #group data by year
filter(number.of.households..thousands. == max(number.of.households..thousands.)) %>% #keep only rows with max no. of households
distinct(year, region, .keep_all = TRUE) %>% #show only unique years and region while keeping all columns
ungroup()
southcolor <- c("coral4")
ggplot(top_region_hh_yr, aes(x = year, y = number.of.households..thousands., fill = region)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = southcolor, name = "Region") +
labs(title = "Most Populated Region by Year",
x = "Year", y = "No. of Households", fill = "Region") +
scale_x_continuous(breaks = seq(min(df$year), max(df$year), by =1)) +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace"))
region_pop_yr <- df %>%
select(year, region, number.of.households..thousands.) %>%
group_by(year) %>%
distinct(region, .keep_all = TRUE) %>%
ungroup()
ggplot(region_pop_yr, aes(x = year, y = number.of.households..thousands., color = region)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_color_manual(values = regioncolour) +
labs(title = "Household Distribution of Each Region by Year", x = "Year",
y = "No. of Households", color = "Region") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace")) +
scale_x_continuous(breaks = seq(min(df$year), max(df$year), by = 1)) +
scale_y_continuous(breaks = seq(0, max(df$number.of.households..thousands.), by = 25000))
avg_sp_region_gen <- df %>%
select(year, region, average.sales.price) %>%
group_by(region) %>%
summarise('average.sales.price' = mean(average.sales.price)) %>%
ungroup()
ggplot(avg_sp_region_gen, aes(x = region, y = average.sales.price, group = 1)) +
geom_line(color = "burlywood4", size = 1) +
labs(title = "Average Sales Prices by Region",
x = "Region", y = "Average Sales Price ($)") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace"))
avg_sp_region_yr <- df %>%
select(year, region, average.sales.price) %>%
group_by(year, region) %>%
summarise('average.sales.price' = sum(average.sales.price)) %>%
ungroup()
ggplot(avg_sp_region_yr, aes(x = year, y = average.sales.price, color = region)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_color_manual(values = regioncolour) +
labs(title = "Average Sales Price of Each Region by Year",
x = 'Year', y = 'Average Sales Price ($)', color = "Region") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace")) +
scale_x_continuous(breaks = seq(min(avg_sp_region_yr$year), max(avg_sp_region_yr$year), by = 1)) +
scale_y_continuous(breaks = seq(0, max(avg_sp_region_yr$average.sales.price), by = 500000))
mean_med_income_reg <- df %>%
select(year, region, mean.income...current.dollars, median.income...current.dollars) %>%
distinct(year, region, .keep_all = TRUE)
p1 <- ggplot(df, aes(x = region, y = mean.income...current.dollars, fill = region)) +
geom_boxplot() +
geom_point() +
scale_fill_manual(values = regioncolour) +
labs(title = "Mean Income by Region",
x = "Region", y = "Mean Income ($)", fill = "Region") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace")) +
scale_y_continuous(limits = c(50000, NA),
breaks = seq(50000, max(mean_med_income_reg$mean.income...current.dollars), by = 10000))
p2 <- ggplot(df, aes(x = region, y = median.income...current.dollars, fill = region)) +
geom_boxplot() +
geom_point() +
scale_fill_manual(values = regioncolour) +
labs(title = "Median Income by Region",
x = "Region", y = "Median Income ($)", fill = "Region") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace")) +
scale_y_continuous(limits = c(50000, NA),
breaks = seq(50000, max(mean_med_income_reg$median.income...current.dollars), by = 10000))
combined1 <- p1 + p2 + plot_layout(ncol = 2)
combined1
avg_sp_hs_gen <- df %>%
select(home.size, average.sales.price) %>%
group_by(home.size) %>%
summarise('average.sales.price' = mean(average.sales.price)) %>%
ungroup()
homesizecolour<- c("honeydew4", "navajowhite3","rosybrown2")
ggplot(avg_sp_hs_gen, aes(x = home.size, y = average.sales.price, fill = home.size)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = homesizecolour) +
labs(title = "Average Sales Price by Home Size",
x = "Home Size", y = "Average Sales Price ($)", fill = "Home Size") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace"))
avg_sp_hs_yr <- df %>%
select(year, home.size, average.sales.price) %>%
group_by(year, home.size) %>%
summarise('average.sales.price' = mean(average.sales.price)) %>%
ungroup()
ggplot(avg_sp_hs_yr, aes(x = year, y = average.sales.price, color = home.size)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_color_manual(values = homesizecolour) +
labs(title = "Average Sales Price Each Year by Home Size",
x = "Year", y = "Average Sales Price ($)", color = "Home Size") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace")) +
scale_x_continuous(breaks = seq(min(df$year), max(df$year), by = 1))
p3 <- ggplot(df, aes(x = home.size, y = median.income...current.dollars, fill = home.size)) +
geom_point() +
geom_boxplot() +
scale_fill_manual(values = homesizecolour) +
labs(title = "Box Plot of Median Income by Home Size",
x = "Home Size", y = "Median Income ($)", fill = "Home Size") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace"))
p4 <- ggplot(df, aes(x = home.size, y = mean.income...current.dollars, fill = home.size)) +
geom_point() +
geom_boxplot() +
scale_fill_manual(values = homesizecolour) +
labs(title = "Box Plot of Mean Income by Home Size",
x = "Home Size", y = "Mean Income ($)", fill = "Home Size") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace"))
combined2 <- p3 + p4 +plot_layout(ncol = 2)
combined2
ggplot(df, aes(x = home.size, y = average.sales.price, fill = home.size)) +
geom_point() +
geom_boxplot() +
scale_fill_manual(values = homesizecolour) +
labs(title = "Box Plot of Average Sales Price by Home",
x = "Home Size", y = "Average Sales Price ($)", fill = "Home Size") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace"))
corr_mat_df <- df %>%
select(-year, -month, -region, - home.size)
corr_mat <- cor(corr_mat_df)
ggcorrplot::ggcorrplot(corr_mat, type = "lower", lab = TRUE, hc.order = TRUE,
colors = c("darkseagreen4", "peachpuff2", "indianred4")) +
labs(title = "Correlation Matrix of Real Estate Income") +
theme(text = element_text(family = "Helvetica", size = 12),
panel.background = element_rect(fill = "oldlace"),
legend.background = element_rect(fill = "oldlace"))
p5 <- ggplot(df, aes(x = average.sales.price, y = mean.income...current.dollars)) +
geom_point(color = "bisque4") +
geom_smooth(method = "lm", se = FALSE, color = "rosybrown4", formula = y ~ x) +
labs(title = "Scatter Plot of Average Sales Price VS Mean Income",
x = "Average Sales Price ($)", y = "Mean Income ($)") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace")) +
scale_y_continuous(limits = c(50000, NA),
breaks = seq(50000, max(df$mean.income...current.dollars), by = 10000))
p6 <- ggplot(df, aes(x = average.sales.price, y = median.income...current.dollars)) +
geom_point(color = "honeydew4") +
geom_smooth(method = "lm", se = FALSE, color = "rosybrown4", formula = y ~ x) +
labs(title = "Scatter Plot of Average Sales Price VS Median Income",
x = "Average Sales Price ($)", y = "Median Income ($)") +
theme(text = element_text(family = "Helvetica", size = 12),
plot.background = element_rect(fill = "snow1"),
panel.background = element_rect(fill = "oldlace"),
panel.grid = element_line(color = "snow1", size = 1),
legend.background = element_rect(fill = "oldlace")) +
scale_y_continuous(limits = c(50000, NA),
breaks = seq(50000, max(df$median.income...current.dollars), by = 10000))
combined3 <- p5 + p6 +plot_layout(ncol = 2)
combined3
In conclusion, this analysis offers a comprehensive view of the U.S. real estate market and its economic landscape. The findings highlight notable trends, such as the Southern region having the highest number of households despite lower income levels compared to other areas. Additionally, double homes emerge as the most expensive type, and there is a moderate correlation between average sales prices and income. These insights provide valuable understanding of the dynamics within the U.S. real estate market.