Ansel Melly 22/00071
Karanja Everlyne Njeri 21/06866
Songa Constance Joyce 21/05347
Deylon Ng’ang’a 21/05753
library(tidyverse)
library(here)
library(readxl)
library(kableExtra)
library(scales)
library(moments)
library(patchwork)
library(ggthemes)
library(gridExtra)
library(corrplot)
library(forcats)
library(ggcorrplot)
library(knitr)
library(knitLatex)
library(broom)
hpd<-read_xlsx(here("data/House pricing data.xlsx"))
dataset seem to be centered round house pricing based on specific features and amenities. these properties have effect on the house price
data_structure_summary <- function(df) {
result <- data.frame(
Seq = 1:length(names(df)),
Variable = names(df),
Class = sapply(df, class),
First_Values = sapply(df, function(x) paste0(head(x, 3), collapse = ", ")),
Missing = sapply(df, function(x) sum(is.na(x))),
Unique_Values = sapply(df, function(x) length(unique(x))),
row.names = NULL
)
return(result)
}
oput<-data_structure_summary(hpd) %>% kable(
caption = "Data Structure Summary",
align = c("r","l", "l", "l", "r", "r"),
row.names = FALSE
) %>% kable_styling(
bootstrap_options = c("striped","hover","condensed"),
full_width = F
) %>%
column_spec(1, bold = TRUE, width = "3em") %>%
column_spec(2, bold = TRUE) %>%
column_spec(4, width = "30em")
| Seq | Variable | Class | First_Values | Missing | Unique_Values |
|---|---|---|---|---|---|
| 1 | HouseId | numeric | 1, 2, 3 | 0 | 1460 |
| 2 | MSZoning | character | Residential Low Density, Residential Low Density, Residential Low Density | 0 | 5 |
| 3 | LotAreaSquareFeet | numeric | 8450, 9600, 11250 | 0 | 1073 |
| 4 | LandSlope | character | Gentleslope, Gentleslope, Gentleslope | 0 | 3 |
| 5 | BuildingType | character | Single-family Detached, Single-family Detached, Single-family Detached | 0 | 5 |
| 6 | OverallCondition | numeric | 5, 8, 5 | 0 | 9 |
| 7 | YearBuilt | numeric | 2003, 1976, 2001 | 0 | 112 |
| 8 | ExteriorCondition | character | Average/Typical, Average/Typical, Average/Typical | 0 | 5 |
| 9 | Foundation | character | Poured Contrete, Cinder Block, Poured Contrete | 0 | 6 |
| 10 | TotalBasementSquareFeet | numeric | 856, 1262, 920 | 0 | 721 |
| 11 | HeatingQualityCondition | character | Excellent, Excellent, Excellent | 0 | 5 |
| 12 | CentralAirConditioning | character | Yes, Yes, Yes | 0 | 2 |
| 13 | 1stFloorSquareFeet | numeric | 856, 1262, 920 | 0 | 753 |
| 14 | 2ndFlrSquareFeet | numeric | 854, 0, 866 | 0 | 417 |
| 15 | LivAreaSquareFeet | numeric | 1710, 1262, 1786 | 0 | 861 |
| 16 | FullBathrooms | numeric | 2, 2, 2 | 0 | 4 |
| 17 | Bedrooms | numeric | 3, 3, 3 | 0 | 8 |
| 18 | KitchenQualityCondition | character | Good, Typical/Average, Good | 0 | 4 |
| 19 | TotalRooms | numeric | 8, 6, 6 | 0 | 12 |
| 20 | GarageArea | numeric | 548, 460, 608 | 0 | 441 |
| 21 | TotalPorchAreaSquareFeet | numeric | 61, 0, 42 | 0 | 256 |
| 22 | MonthSold | numeric | 2, 5, 9 | 0 | 12 |
| 23 | YearSold | numeric | 2008, 2007, 2008 | 0 | 5 |
| 24 | SaleType | character | Warranty Deed - Conventional, Warranty Deed - Conventional, Warranty Deed - Conventional | 0 | 9 |
| 25 | SaleCondition | character | Normal, Normal, Normal | 0 | 6 |
| 26 | SalePrice | numeric | 208500, 181500, 223500 | 0 | 663 |
fmt<-function(k) {
res<-k %>% t() %>% kable() %>% kable_styling(
bootstrap_options = c("striped","condensed","hover"),
full_width = F
) %>%
column_spec(1, bold = TRUE)
return (res)
}
randsamp<-hpd %>%
sample_n(3) %>%
mutate(Seq = seq_along(1:n())) %>%
select(Seq,everything()) %>%
fmt()
randsamp %>% landscape()
| Seq | 1 | 2 | 3 |
| HouseId | 1210 | 1371 | 1430 |
| MSZoning | Residential Low Density | Residential Low Density | Residential Low Density |
| LotAreaSquareFeet | 10182 | 5400 | 12546 |
| LandSlope | Gentleslope | Gentleslope | Gentleslope |
| BuildingType | Single-family Detached | Single-family Detached | Single-family Detached |
| OverallCondition | 5 | 6 | 7 |
| YearBuilt | 2006 | 1920 | 1981 |
| ExteriorCondition | Average/Typical | Average/Typical | Good |
| Foundation | Poured Contrete | Poured Contrete | Cinder Block |
| TotalBasementSquareFeet | 1660 | 840 | 1440 |
| HeatingQualityCondition | Excellent | Excellent | Excellent |
| CentralAirConditioning | Yes | Yes | Yes |
| 1stFloorSquareFeet | 1660 | 840 | 1440 |
| 2ndFlrSquareFeet | 0 | 534 | 0 |
| LivAreaSquareFeet | 1660 | 1374 | 1440 |
| FullBathrooms | 2 | 1 | 2 |
| Bedrooms | 3 | 2 | 3 |
| KitchenQualityCondition | Good | Typical/Average | Good |
| TotalRooms | 8 | 6 | 7 |
| GarageArea | 500 | 338 | 467 |
| TotalPorchAreaSquareFeet | 50 | 198 | 99 |
| MonthSold | 5 | 10 | 4 |
| YearSold | 2006 | 2009 | 2007 |
| SaleType | Home just constructed and sold | Warranty Deed - Conventional | Warranty Deed - Conventional |
| SaleCondition | Partial | Normal | Normal |
| SalePrice | 290000 | 105000 | 182900 |
f<-colSums(is.na(hpd))
na_cols <- data.frame(column = names(f), count = as.vector(f))
na_cols %>% kable() %>% kable_classic()
| column | count |
|---|---|
| HouseId | 0 |
| MSZoning | 0 |
| LotAreaSquareFeet | 0 |
| LandSlope | 0 |
| BuildingType | 0 |
| OverallCondition | 0 |
| YearBuilt | 0 |
| ExteriorCondition | 0 |
| Foundation | 0 |
| TotalBasementSquareFeet | 0 |
| HeatingQualityCondition | 0 |
| CentralAirConditioning | 0 |
| 1stFloorSquareFeet | 0 |
| 2ndFlrSquareFeet | 0 |
| LivAreaSquareFeet | 0 |
| FullBathrooms | 0 |
| Bedrooms | 0 |
| KitchenQualityCondition | 0 |
| TotalRooms | 0 |
| GarageArea | 0 |
| TotalPorchAreaSquareFeet | 0 |
| MonthSold | 0 |
| YearSold | 0 |
| SaleType | 0 |
| SaleCondition | 0 |
| SalePrice | 0 |
Note:there is no missing data
# COMPREHENSIVE NUMERIC VARIABLE ANALYSIS
#
# Purpose: This analysis generates a complete statistical summary of all numeric
# variables in the housing price dataset. It provides key distributional measures
# including central tendency, spread, extremes, and data completeness.
#
# Methodology:
# 1. Selects all numeric variables from the dataset
# 2. Calculates 6 key statistics for each variable: mean, median, min, max,
# standard deviation, and count of non-missing values
# 3. Transforms the data into a readable table format through strategic pivoting
# 4. Formats numbers for better readability (with comma separators)
#
# Use case: This table helps identify outliers, understand data distributions,
# assess variable ranges, and determine data completeness before proceeding
# with more advanced analyses or modeling.
# Basic Statistics of numeric variables
hpd %>%
# Step 1: Select only numeric columns
select(where(is.numeric)) %>%
# Step 2: Calculate summary statistics for all selected columns
summarise(
across(everything(), list(
mean = ~mean(., na.rm = TRUE), # Average value
median = ~median(., na.rm = TRUE), # Middle value (50th percentile)
min = ~min(., na.rm = TRUE), # Minimum value
max = ~max(., na.rm = TRUE), # Maximum value
sd = ~sd(., na.rm = TRUE), # Standard deviation (measure of spread)
count = ~sum(!is.na(.)) # Number of non-missing values
))
) %>%
# Step 3: Reshape data for better presentation - first to long format
pivot_longer(everything(),
names_to = c("variable", "statistic"),
names_sep = "_",
values_to = "value") %>%
# Step 4: Reshape to wide format with variables as rows and statistics as columns
pivot_wider(names_from = statistic, values_from = value) %>%
# Step 5: Format numeric values with commas and round to 4 decimal places
mutate(across(where(is.numeric), ~comma(round(., 4)))) %>%
# Step 6: Create a formatted table
kable(caption = "Comprehensive Statistics for All Numeric Variables") %>%
# Step 7: Apply styling to the table
kable_styling(
bootstrap_options = c("striped", "condensed", "hover"),
full_width = FALSE
)
| variable | mean | median | min | max | sd | count |
|---|---|---|---|---|---|---|
| HouseId | 730.50 | 730.5 | 1 | 1,460 | 421.61 | 1,460 |
| LotAreaSquareFeet | 10,516.83 | 9,478.5 | 1,300 | 215,245 | 9,981.26 | 1,460 |
| OverallCondition | 5.58 | 5.0 | 1 | 9 | 1.11 | 1,460 |
| YearBuilt | 1,971.27 | 1,973.0 | 1,872 | 2,010 | 30.20 | 1,460 |
| TotalBasementSquareFeet | 1,057.43 | 991.5 | 0 | 6,110 | 438.71 | 1,460 |
| 1stFloorSquareFeet | 1,162.63 | 1,087.0 | 334 | 4,692 | 386.59 | 1,460 |
| 2ndFlrSquareFeet | 346.99 | 0.0 | 0 | 2,065 | 436.53 | 1,460 |
| LivAreaSquareFeet | 1,515.46 | 1,464.0 | 334 | 5,642 | 525.48 | 1,460 |
| FullBathrooms | 1.57 | 2.0 | 0 | 3 | 0.55 | 1,460 |
| Bedrooms | 2.87 | 3.0 | 0 | 8 | 0.82 | 1,460 |
| TotalRooms | 6.52 | 6.0 | 2 | 14 | 1.63 | 1,460 |
| GarageArea | 472.98 | 480.0 | 0 | 1,418 | 213.80 | 1,460 |
| TotalPorchAreaSquareFeet | 68.61 | 40.0 | 0 | 638 | 85.86 | 1,460 |
| MonthSold | 6.32 | 6.0 | 1 | 12 | 2.70 | 1,460 |
| YearSold | 2,007.82 | 2,008.0 | 2,006 | 2,010 | 1.33 | 1,460 |
| SalePrice | 180,921.20 | 163,000.0 | 34,900 | 755,000 | 79,442.50 | 1,460 |
# BUILDING TYPE CLASSIFICATION AND AGE ANALYSIS
#
# This analysis performs two key data transformations:
# 1. Converts text-based building type descriptions to standardized numeric codes
# and adds back descriptive labels for better categorization
# 2. Calculates the age of each house at the time of sale for temporal analysis
#
# This approach enables more consistent filtering, grouping, and visualization
# of housing data based on building type and age characteristics.
# Step 1: Add numeric codes for building types and calculate house age
hpd <- hpd %>%
mutate(
# Create numeric building type codes based on text patterns
# Using startsWith() for pattern matching and nchar() to ensure we're not matching partial words
BuildingTypeCode = case_when(
startsWith(BuildingType, "Single") & nchar(BuildingType) >= 6 ~ 1, # Single-family Detached
startsWith(BuildingType, "Two") & nchar(BuildingType) >= 6 ~ 2, # Two-family Conversion
startsWith(BuildingType, "Duplex") & nchar(BuildingType) >= 6 ~ 3, # Duplex
startsWith(BuildingType, "Townhouse E") & nchar(BuildingType) >= 6 ~ 4, # Townhouse End Unit
startsWith(BuildingType, "Townhouse I") & nchar(BuildingType) >= 6 ~ 5, # Townhouse Inside Unit
TRUE ~ NA_real_ # Assign NA to any unmatched building types
),
# Calculate house age at time of sale (in years)
HouseAge = (YearSold - YearBuilt)
)
# Step 2: Create a reference table for building type codes
building_type_key <- data.frame(
Code = 1:5,
Pattern = c("Single*", "Two*", "Duplex*", "Townhouse E*", "Townhouse I*"),
Description = c(
"Single-family Detached",
"Two-family Conversion",
"Duplex",
"Townhouse End Unit",
"Townhouse Inside Unit"
)
)
# Step 3: Join the housing data with the building type key to add descriptive labels
hpd <- hpd %>%
left_join(building_type_key, by = c("BuildingTypeCode" = "Code"))
# Summary of new columns added to the dataset
new_columns <- data.frame(
Column = c("BuildingTypeCode", "HouseAge", "Pattern", "Description"),
Description = c(
"Numeric code (1-5) representing the building type",
"Age of house in years (YearSold - YearBuilt)",
"Text pattern used to match building types",
"Full descriptive label for each building type code"
),
DataType = c("numeric", "numeric", "character", "character")
)
# Display the new columns summary
new_columns %>%
kable(caption = "New Variables Added to Housing Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| Column | Description | DataType |
|---|---|---|
| BuildingTypeCode | Numeric code (1-5) representing the building type | numeric |
| HouseAge | Age of house in years (YearSold - YearBuilt) | numeric |
| Pattern | Text pattern used to match building types | character |
| Description | Full descriptive label for each building type code | character |
# ANALYSIS: Housing Condition Quality Assessment
# This visualization examines the distribution of property condition ratings across the housing stock.
# The condition rating (1-9 scale) represents the overall physical condition of the property:
# - Lower ratings (1-3): Poor to fair condition, may require significant repairs
# - Middle ratings (4-6): Average to good condition, typical for the housing market
# - Higher ratings (7-9): Very good to excellent condition, well-maintained properties
# Understanding this distribution helps identify the general quality level of the housing inventory.
hpd %>%
# Ensure proper ordering of condition ratings while maintaining factor type
mutate(OverallCondition = factor(OverallCondition, levels = 1:9)) %>%
ggplot(aes(x = OverallCondition, fill = OverallCondition)) +
# Use a discrete color palette instead of gradient
geom_bar() +
scale_fill_brewer(palette = "Blues") + # Use a discrete blue palette
# Add count labels above bars
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.5, size = 3.5, color = "black") +
# Add mean condition indicator (fixed with linewidth instead of size)
geom_vline(
xintercept = as.numeric(mean(as.numeric(as.character(hpd$OverallCondition)), na.rm = TRUE)),
linetype = "dashed",
color = "red",
linewidth = 1 # Using linewidth instead of size
) +
# Annotation for mean
annotate(
"text",
x = as.numeric(mean(as.numeric(as.character(hpd$OverallCondition)), na.rm = TRUE)) + 0.5,
y = max(table(hpd$OverallCondition)) * 0.8,
label = paste0("Mean: ", round(mean(as.numeric(as.character(hpd$OverallCondition)), na.rm = TRUE), 2)),
color = "red",
size = 3.5
) +
# Formatting labels and scales
scale_x_discrete(labels = 1:9) + # Explicit x-axis labels
labs(
title = "Distribution of Houses by Overall Condition Rating",
subtitle = "Analysis of property condition quality across the housing inventory",
x = "Condition Rating (1=Poor, 9=Excellent)",
y = "Number of Houses",
caption = "Source: Housing Price Dataset (hpd)\nNote: Most properties fall in the average to good condition range (5-6)"
) +
# Theme customization
theme_stata() +
theme(
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray30"),
plot.caption = element_text(hjust = 0, face = "italic", size = 9),
panel.grid.minor = element_blank(),
legend.position = "none" # Remove redundant legend
)
observation most properties fall in the average to good
condition rage i.e (5-6)
# ANALYSIS: Room Configuration Analysis in Residential Properties
# This comprehensive visualization examines the distribution of key room metrics across the housing dataset:
# 1. Total Rooms: Overall room count showing the typical size of properties in the market
# 2. Bedrooms: Specific bedroom count distribution, critical for family home categorization
# 3. Bathrooms: Full bathroom count, an important modernization and value indicator
#
# These metrics together provide insight into typical housing configurations, helping to identify
# which room layouts are most common in the market. The prevalence of certain configurations
# can inform development planning, renovation decisions, and market positioning strategies.
# Create bar chart for total rooms distribution
chartTRooms <- hpd %>%
mutate(TotalRooms = factor(TotalRooms)) %>%
ggplot(aes(x = TotalRooms)) +
geom_bar(fill = "#FF9A56") + # Using hex code for orange
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.2, size = 3.5) +
labs(
title = "Distribution of Houses by Total Number of Rooms",
x = "Total Number of Rooms",
y = "Number of Houses"
) +
theme_stata() +
theme(
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
# Create bar chart of bedrooms distribution
chartBed <- hpd %>%
mutate(Bedrooms = factor(Bedrooms)) %>% # Convert to factor
ggplot(aes(x = Bedrooms)) +
geom_bar(fill = "steelblue") +
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.2, size = 3.5) + # Add count labels above bars
labs(
title = "Distribution of Houses by Number of Bedrooms",
x = "Number of Bedrooms",
y = "Number of Houses"
) +
theme_stata() +
theme(
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
# Create bar chart for full bathrooms
chartBath <- hpd %>%
mutate(FullBathrooms = factor(FullBathrooms)) %>%
ggplot(aes(x = FullBathrooms)) +
geom_bar(fill = "#4ECDC4") + # Teal/turquoise color
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.2, size = 3.5) +
labs(
title = "Distribution of Houses by Number of Full Bathrooms",
x = "Number of Full Bathrooms",
y = "Number of Houses"
) +
theme_stata() +
theme(
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
# Combine plots with patchwork
combined_plot <- chartTRooms / (chartBed + chartBath)
combined_plot +
plot_annotation(
title = "House Room Configuration Analysis",
subtitle = "Examining the prevalence of different room layouts in residential properties",
caption = "Source: Housing Price Dataset (hpd)\nNote: The most common configuration is 6 total rooms with 3 bedrooms and 2 bathrooms",
theme = theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, color = "gray30", hjust = 0.5),
plot.caption = element_text(size = 9, face = "italic", hjust = 0)
)
) &
theme(plot.margin = margin(1, 0.6, 0.6, 0.6, "cm"))
Observation: The analysis of room configurations
reveals that the most prevalent housing type in the market features 6
total rooms, typically configured with 3 bedrooms and 2 bathrooms.
# ANALYSIS: Distribution of Building Types in the Housing Market
# This visualization examines the composition of different building types within the housing dataset.
# The pie chart format provides an intuitive representation of the relative proportions of each
# building category, helping to identify which types dominate the market.
#
# Key aspects:
# - Percentages are calculated and displayed with radially oriented labels for better readability
# - The distribution helps understand housing stock diversity in the market
# - This information can be valuable for developers, investors, and policy makers to understand
# the current housing landscape
# Pie chart with radially oriented percentage labels
hpd %>%
count(Description) %>%
mutate(
percentage = n / sum(n) * 100,
label = paste0(round(percentage, 1), "%"),
# Calculate position and angle for each slice
pos = cumsum(n) - n/2,
prop = pos / sum(n),
angle = 90 - 360 * prop
) %>%
ggplot(aes(x = "", y = n, fill = Description)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
# Use calculated angle for radial text orientation
geom_text(aes(label = label, angle = angle),
position = position_stack(vjust = 0.9),
size = 3.5) +
labs(
title = "Distribution of Building Types",
fill = "Building Type"
) +
theme_void() +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
legend.position = "right"
) +
scale_fill_brewer(palette = "Set2")
Observation: Single-family detached homes make up the largest percentage of property sales in our dataset. This shows that most buyers during this period purchased standalone houses rather than other housing types. The market was clearly dominated by traditional detached homes, which far outnumbered other residential property types.
# ANALYSIS: Statistical Examination of Sale Price Distribution
# This visualization analyzes the distribution pattern of house sale prices with formal statistical tests.
# It combines visual representation (histogram and density curve) with numerical measures (skewness
# and Shapiro-Wilk test) to quantify the deviation from normality in the price distribution.
#
# Key statistical measures:
# - Skewness: Measures the asymmetry of the distribution. Positive values indicate right-skew
# (longer tail on the right), common in price data due to luxury properties.
# - Shapiro-Wilk test: Tests the null hypothesis that data follows a normal distribution.
# A p-value < 0.05 indicates non-normal distribution.
# Calculate statistical measures
shapiro_test <- shapiro.test(hpd$SalePrice)
skew_value <- skewness(hpd$SalePrice)
# Format statistics for display
p_formatted <- label_scientific(decimal.mark = ",")(shapiro_test$p.value)
skew_formatted <- label_number(accuracy = 0.01, decimal.mark = ",")(skew_value)
# Calculate mean and median for reference
mean_price <- mean(hpd$SalePrice, na.rm = TRUE)
median_price <- median(hpd$SalePrice, na.rm = TRUE)
# Create the plot
hpd %>% ggplot(aes(x = SalePrice)) +
# Histogram with density scaling
geom_histogram( aes(y = after_stat(density)), binwidth = 30000, fill = "lightblue", color = "white", alpha = 0.8) +
geom_density(color = "darkblue", linewidth = 0.5,alpha = 0.3) +
# Add vertical lines for mean and median
geom_vline(xintercept = mean_price,
linetype = "dashed",
color = "red",
alpha = 0.7
) +
geom_vline(
xintercept = median_price,
linetype = "dotted",
color = "darkgreen",
alpha = 0.7
) +
# Annotate statistical measures
annotate(
"text",
x = Inf, y = Inf,
label = paste0(
"Skewness: ", skew_formatted, " (right-skewed)\n",
"Shapiro-Wilk p: ", p_formatted, " (non-normal)"
),
hjust = 1.1, vjust = 1.1,
size = 3.5,
color = "navy"
) +
# Add annotation explaining reference lines
annotate(
"text",
x = mean_price,
y = max(density(hpd$SalePrice, na.rm = TRUE)$y) * 0.8,
label = "Mean",
color = "red",
angle = 90,
hjust = 1.2
) +
annotate(
"text",
x = median_price,
y = max(density(hpd$SalePrice, na.rm = TRUE)$y) * 0.8,
label = "Median",
color = "darkgreen",
angle = 90,
hjust = 1.2
) +
# Labels and formatting
labs(
title = "Distribution of Home Sale Prices",
subtitle = "Right-skewed distribution with significant deviation from normality",
x = "Sale Price ($)",
y = "Density",
caption = "Data source: Housing price dataset\nNote: Positive skewness indicates right-skewed distribution (higher prices stretch to the right)\nShapiro-Wilk p < 0.05 indicates the distribution significantly deviates from normal"
) +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_stata() +
theme(
plot.subtitle = element_text(color = "gray30"),
plot.caption = element_text(hjust = 0, face = "italic", size = 8)
)
observation
Housing prices typically show positive skewness (likely >1) as most homes cluster at lower/middle prices with fewer luxury properties creating a long right tail.
# ANALYSIS: Housing Market Trends - Price and Sales Volume Over Time
# This visualization examines two critical market indicators side by side:
# 1. Average home sale prices over time (top panel)
# 2. Monthly sales volume (bottom panel)
# Together, these metrics provide insight into market dynamics, seasonality patterns,
# and potential relationships between pricing and demand.
# Convert dates and calculate time-based statistics
hpd <- hpd %>%
mutate(SaleDate = as.Date(
paste(YearSold, MonthSold, "15", sep="-"),
format="%Y-%m-%d")
)
# Calculate overall statistics from raw data
overall_mean <- mean(hpd$SalePrice, na.rm = TRUE)
overall_median <- median(hpd$SalePrice, na.rm = TRUE)
overall_max <- max(hpd$SalePrice, na.rm = TRUE)
overall_min <- min(hpd$SalePrice, na.rm = TRUE)
# Summarize by date for both plots
price_summary <- hpd %>%
group_by(SaleDate) %>%
summarise(
average_price = mean(SalePrice, na.rm = TRUE),
median_price = median(SalePrice, na.rm = TRUE),
count = n()
)
# Create combined annotation text
stats_text <- paste0(
"Mean: $", comma(round(overall_mean)), "\n",
"Median: $", comma(round(overall_median)), "\n",
"Max: $", comma(round(overall_max)), "\n",
"Min: $", comma(round(overall_min))
)
# Plot 1: Average price over time
price_plot <- price_summary %>%
ggplot(aes(x = SaleDate)) +
geom_line(aes(y = average_price), color = "purple", linewidth = 1) +
geom_point(aes(y = average_price), color = "purple", size = 2) +
geom_smooth(aes(y = average_price), method = "loess", color = "darkred",
se = FALSE, linetype = "dashed", formula = y~x) +
# Add annotation in top right
annotate("text", x = max(price_summary$SaleDate), y = max(price_summary$average_price),
label = stats_text,
hjust = 1.1, vjust = 1.1, size = 3.5) +
scale_y_continuous(labels = scales::comma) +
scale_x_date(date_breaks = "3 months", date_labels = "%b %Y") +
labs(
title = "Average Home Sale Price Over Time",
x = NULL, # Remove x-axis label since it's shared
y = "Average Sale Price ($)"
) +
theme_stata() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
# Plot 2: Number of sales over time
volume_plot <- price_summary %>%
ggplot(aes(x = SaleDate)) +
geom_line(aes(y = count), color = "forestgreen", linewidth = 1) +
geom_point(aes(y = count), color = "forestgreen", size = 2) +
geom_smooth(aes(y = count), method = "loess", color = "darkgreen",
se = FALSE, linetype = "dashed", formula = y~x) +
# Add volume statistics annotation
annotate("text", x = max(price_summary$SaleDate), y = max(price_summary$count),
label = paste0(
"Highest volume: ", max(price_summary$count), " sales\n",
"Average volume: ", round(mean(price_summary$count), 1), " sales/month"
),
hjust = 1.1, vjust = 1.1, size = 3.5) +
scale_y_continuous(breaks = seq(0, max(price_summary$count) + 5, by = 5)) +
scale_x_date(date_breaks = "3 months", date_labels = "%b %Y") +
labs(
title = "Monthly Sales Volume",
x = "Date",
y = "Number of Sales"
) +
theme_stata() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
# Combine plots with patchwork
(price_plot / volume_plot) +
plot_annotation(
title = "Housing Market Analysis: Price and Volume Trends",
subtitle = "Tracking average prices and sales volume to identify market patterns",
caption = "Data source: Housing price dataset\nNote: Dashed lines represent smoothed trends to highlight underlying patterns",
theme = theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, color = "gray30", hjust = 0.5),
plot.caption = element_text(size = 9, face = "italic", hjust = 0)
)
)
Observation: The housing market showed fluctuating patterns in both sales volume and prices over the years. Sales reached their peak in April 2006 with 67 transactions, well above the average monthly volume of 26.5 sales. Price-wise, homes sold at an average of $180,921 across the entire period. These figures highlight the market’s variability while providing benchmarks for typical sales activity and pricing in this area.
# ANALYSIS: Comparing the Impact of Different Property Areas on Sale Price
# This visualization examines two key relationships:
# 1. How porch area (left) correlates with property sale price
# 2. How living area (right) correlates with property sale price
# Comparing these two areas helps quantify their relative impact on home values
# Plot 1: Porch area vs. Sale Price
porch <- hpd %>%
filter(TotalPorchAreaSquareFeet != 0) %>%
ggplot(aes(x = TotalPorchAreaSquareFeet, y = SalePrice)) +
geom_point(
col = '#F39C12', # Using a rich orange hex color
alpha = 0.6, # Adding transparency for better visibility
size = 1.8 # Slightly larger points
) +
geom_smooth(
method = "lm",
se = TRUE,
color = "salmon",
formula = y ~ x,
fill = "salmon",
alpha = 0.2 # Lighter confidence interval
) +
labs(
x = "Porch Area (sq ft)",
y = "Sale Price ($)",
title = "Porch Size Impact on Price",
subtitle = "Outdoor space and property value relationship"
) +
scale_y_continuous(labels = scales::comma) + # Format with commas
theme_stata() +
theme(
plot.title = element_text(size = 12, face = "bold"),
plot.subtitle = element_text(size = 10, color = "gray30")
)
# Plot 2: Living area vs. Sale Price
liv <- hpd %>%
filter(LivAreaSquareFeet != 0) %>%
ggplot(aes(x = LivAreaSquareFeet, y = SalePrice)) +
geom_point(
col = '#F39C12', # Match color from first plot
alpha = 0.6,
size = 1.8
) +
geom_smooth(
method = "lm",
se = TRUE,
color = "salmon",
formula = y ~ x,
fill = "salmon",
alpha = 0.2
) +
labs(
x = "Living Area (sq ft)",
y = "Sale Price ($)",
title = "Living Area Impact on Price",
subtitle = "Indoor space and property value relationship"
) +
scale_y_continuous(labels = scales::comma) + # Format with commas
theme_stata() +
theme(
plot.title = element_text(size = 12, face = "bold"),
plot.subtitle = element_text(size = 10, color = "gray30")
)
# Combine the plots with enhanced annotation
(porch + liv) +
plot_annotation(
title = "Comparative Impact of Porch vs. Living Area on Home Prices",
subtitle = "Analysis reveals that living area has a stronger influence on price than porch space",
caption = "Source: Housing Price Dataset (hpd)\nNote: Linear regression lines show the predicted relationship between area and price",
theme = theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 12, color = "gray30", hjust = 0.5),
plot.caption = element_text(size = 10, face = "italic", hjust = 0)
)
) +
# Add text annotations to compare slopes
annotate(
"text", x = 0.7, y = 0.1,
label = "Living area typically adds more value per sq ft\nthan comparable porch space",
hjust = 0, vjust = 0,
color = "darkblue",
size = 3.5,
fontface = "italic",
alpha = 0.8
)
Observation: The scatter plot shows a clear linear
relationship between living area and sale price. As living area
increases, home prices consistently rise, creating a strong positive
correlation. This straightforward pattern confirms that larger homes
command higher prices in the market, with each additional square foot of
living space contributing to increased property value.
# ANALYSIS: Relationship Between Number of Bedrooms and Property Sale Price
# This visualization examines how the number of bedrooms in a property relates to its sale price.
# Each point represents a property, colored by its bedroom count to help identify potential
# patterns or price premiums associated with specific bedroom configurations.
hpd %>%
# Create the scatter plot
ggplot(aes(x = Bedrooms, y = SalePrice)) +
# Add points with colors by bedroom count
geom_point(
aes(color = as.factor(Bedrooms)),
alpha = 0.7, # Add transparency for overlapping points
size = 2.5, # Slightly larger points
position = position_jitter(width = 0.2) # Add jitter to reduce overplotting
) +
# Improve axis formatting
scale_y_continuous(
labels = scales::dollar_format(scale = 1e-3, suffix = "K"),
name = "Sale Price (thousands USD)"
) +
scale_x_continuous(
breaks = seq(0, max(hpd$Bedrooms, na.rm = TRUE), by = 1),
name = "Number of Bedrooms"
) +
# Improve color scale with more distinct colors
scale_color_brewer(
palette = "Set1",
name = "Bedroom Count"
) +
# Add informative labels
labs(
title = "Relationship Between Number of Bedrooms and Home Sale Price",
subtitle = "Analysis of price distribution by bedroom count",
caption = "Source: Housing Price Dataset (hpd)"
) +
# Apply Stata theme with customizations
theme_stata() +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray30"),
plot.caption = element_text(hjust = 0, face = "italic"),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray95")
) +
# Add a horizontal line at median price for reference
geom_hline(
yintercept = median(hpd$SalePrice, na.rm = TRUE),
linetype = "dashed",
color = "darkgray",
alpha = 0.7
) +
# Add annotation to highlight the median
annotate(
"text",
x = max(hpd$Bedrooms, na.rm = TRUE) - 0.5,
y = median(hpd$SalePrice, na.rm = TRUE) * 1.05,
label = "Median Sale Price",
fontface = "italic",
size = 3,
color = "darkgray"
)
Observation: The density of data points around the
median sale price reveals an interesting pattern: homes with various
bedroom counts (2, 3, 4, and more) tend to sell at similar price points.
Despite differences in bedroom number, most properties cluster around
the same median price level. This suggests that bedroom count alone
doesn’t dramatically shift a home’s value in this market, as prices
remain centered around the median regardless of how many bedrooms a
property contains.
# This visualization examines the frequency distribution of different municipal zoning types
# across all properties in the dataset. The chart reveals which zoning categories are most
# common in the housing market, providing insights into the predominant land use patterns.
zoning<-hpd %>%
# Convert MSZoning to factor
mutate(MSZoning = as.factor(MSZoning)) %>%
# Count occurrences for each zoning type
count(MSZoning) %>%
# Arrange in descending order by count
arrange(desc(n)) %>%
# Convert back to factor with levels in the desired order
mutate(MSZoning = factor(MSZoning, levels = MSZoning)) %>%
# Create the bar chart
ggplot(aes(x = MSZoning, y = n)) +
geom_bar(
stat = "identity", # Use pre-counted values
fill = "#FF7F00", # Vibrant orange hex code
color = "#E65100", # Darker orange border
width = 0.7 # Slightly narrower bars
) +
# Add descriptive labels
labs(
title = "Distribution of Properties by Zoning Classification",
subtitle = "Properties ordered by frequency from highest to lowest",
x = "Municipal Zoning Category",
y = "Number of Properties",
caption = "Source: Housing Price Dataset (hpd)"
) +
# Apply the Stata theme with customizations
theme_stata() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), # 45-degree angled labels
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray30"),
) +
# Add value labels on top of bars
geom_text(
aes(label = n),
vjust = -0.5,
color = "black",
size = 3.5
)
# ANALYSIS: Sale Price Distribution for Specific Housing Segment
# This visualization examines the price distribution of larger family homes in residential
# low-density zones. We specifically filter for properties that:
# - Are zoned as "Residential Low Density"
# - Have a second floor (2ndFlrSquareFeet > 0)
# - Contain at least 3 bedrooms (suitable for families)
# This segment represents suburban family homes that typically command higher prices.
residential<-hpd %>%
# Filter for residential low density properties with second floors and 3+ bedrooms
filter(MSZoning == "Residential Low Density" &
`2ndFlrSquareFeet` != 0 &
Bedrooms >= 3) %>%
# Create histogram of sale prices
ggplot(aes(x = SalePrice)) +
geom_histogram(
aes(y = after_stat(density)), # Show density instead of counts
binwidth = 50000, # $50k intervals for bins
fill = "lightblue",
color = "white", # White borders between bins
alpha = 0.8 # Slight transparency
) +
# Add a density curve to show distribution shape
geom_density(color = "darkblue", linewidth = 1, alpha = 0.2) +
# Format axis labels with commas for better readability
scale_x_continuous(labels = scales::comma,
name = "Sale Price (USD)") +
scale_y_continuous(labels = scales::comma) +
# Add informative titles and caption
labs(
title = "Sale Price Distribution of Houses in Low-Density Areas",
subtitle = "Analysis of houses with second floors and 3+ bedrooms",
y = "Density",
caption = "Source: Housing Price Dataset (hpd)\nNote: Prices shown in $50,000 intervals"
) +
# Apply Stata theme and additional formatting
theme_stata() +
theme(
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray30"),
plot.caption = element_text(hjust = 0, face = "italic")
)
(zoning+residential)+
plot_annotation(
title = "Zoning Distribution and Sale Price",
subtitle = "Examining zoning frequency and price patterns",
caption = "Note: Bottom plot focuses on residential low-density properties with second floors and 3+ bedrooms",
theme = theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(color = "gray30", size = 12),
plot.caption = element_text(hjust = 0, face = "italic", size = 9)
)
)
Observation: Our analysis of properties in the highest
zoning category focuses specifically on two-story houses with 3 bedrooms
(those having a second floor). The price distribution for these homes
shows a leftward skew, with a mean value of approximately $200,000. This
pattern indicates that while most of these specific properties cluster
around this price point, there are fewer homes at the higher end of the
price spectrum, creating the asymmetrical distribution observed in the
data.
# Group and analyze housing data by Municipal Zoning classification
hpd %>%
# Group the data by zoning classification
group_by(MSZoning) %>%
# Calculate summary statistics for each zoning group
summarise(
mean_price = mean(SalePrice, na.rm = TRUE), # Average sale price
median_price = median(SalePrice, na.rm = TRUE), # Median sale price
min_price = min(SalePrice, na.rm = TRUE), # Minimum sale price
max_price = max(SalePrice, na.rm = TRUE), # Maximum sale price
sd_price = sd(SalePrice, na.rm = TRUE), # Standard deviation of prices
mean_area = mean(LivAreaSquareFeet, na.rm = TRUE), # Average living area in sq ft
price_per_sqft = mean_price / mean_area, # Average price per square foot
count = n() # Number of houses in each zone
) %>%
# Sort results by descending average price
arrange(desc(mean_price)) %>%
# Format monetary values for better readability
mutate(
mean_price = scales::dollar(mean_price),
median_price = scales::dollar(median_price),
min_price = scales::dollar(min_price),
max_price = scales::dollar(max_price),
price_per_sqft = scales::dollar(price_per_sqft)
) %>%
# Create a formatted table
kable(
caption = "Housing Price Statistics by Municipal Zoning Classification"
) %>%
# Apply styling to the table
kable_styling(
bootstrap_options = c("condensed", "hover", "striped"),
full_width = F
) %>% landscape()
| MSZoning | mean_price | median_price | min_price | max_price | sd_price | mean_area | price_per_sqft | count |
|---|---|---|---|---|---|---|---|---|
| Floating Village Residential | $214,014 | $205,950 | $144,152 | $370,878 | 52369.66 | 1574.538 | $135.92 | 65 |
| Residential Low Density | $191,005 | $174,000 | $39,300 | $755,000 | 80766.34 | 1551.646 | $123.10 | 1151 |
| Residential High Density | $131,558 | $136,500 | $76,000 | $200,000 | 35714.12 | 1510.125 | $87.12 | 16 |
| Residential Medium Density | $126,317 | $120,500 | $37,900 | $475,000 | 48521.69 | 1322.073 | $95.54 | 218 |
| Commercial | $74,528 | $74,700 | $34,900 | $133,900 | 33791.09 | 1191.400 | $62.55 | 10 |
# ANALYSIS: Impact of Zoning Categories on Property Sale Prices
# This visualization examines how municipal zoning classifications affect property values.
# Boxplots reveal the distribution, central tendency, and variability of prices within
# each zoning category, helping identify which zones command premium prices and which
# show the greatest price variation.
hpd %>%
# Main plot creation
ggplot(aes(x = MSZoning, y = SalePrice)) +
geom_boxplot(
fill = "lightblue",
color = "steelblue",
alpha = 0.7,
outlier.color = "darkred",
outlier.alpha = 0.5,
outlier.size = 2
) +
# Add median labels
stat_summary(
fun = median,
geom = "text",
aes(label = scales::comma(..y..)),
vjust = -0.5,
size = 3,
color = "navy"
) +
# Formatting axes and labels
scale_y_continuous(
labels = scales::comma,
name = "Sale Price ($)"
) +
scale_x_discrete(
name = "Municipal Zoning Classification"
) +
# Titles and annotations
labs(
title = "Sale Price Distribution by Zoning Category",
subtitle = "Comparing price ranges and medians across different zoning classifications",
caption = "Source: Housing Price Dataset (hpd)\nOutliers shown in red"
) +
# Apply theme with customizations
theme_stata() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), # Angled x-axis labels
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray30"),
plot.caption = element_text(hjust = 0, face = "italic"),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray95")
) +
# Add a horizontal reference line at the overall median price
geom_hline(
yintercept = median(hpd$SalePrice, na.rm = TRUE),
linetype = "dashed",
color = "gray50",
alpha = 0.7
) +
# Add annotation for the overall median
annotate(
"text",
x = length(unique(hpd$MSZoning)),
y = median(hpd$SalePrice, na.rm = TRUE) * 1.05,
label = "Overall Median",
color = "gray40",
size = 3,
hjust = 1
)
Observation: The boxplot reveals a clear hierarchy in housing prices across zoning classifications, with Floating Village Residential commanding the highest median price ($205,950), followed by Residential Low Density ($174,000), which also displays the most price volatility with numerous high-value outliers exceeding $600,000. Both Residential High Density ($136,500) and Residential Medium Density ($120,500) fall below the overall market median, suggesting more affordable housing options, while Commercial zoning shows the lowest median price ($74,700). This distribution demonstrates how zoning regulations significantly influence property values, with lower-density residential zones generally commanding premium prices, likely due to larger lot sizes, increased privacy, and potentially more exclusive neighborhood characteristics.
# Select key numeric variables that might influence housing prices
selected_vars <- hpd %>%
select(SalePrice, LotAreaSquareFeet, TotalPorchAreaSquareFeet, LivAreaSquareFeet,Bedrooms, FullBathrooms, TotalRooms, OverallCondition,HouseAge)
# Calculate correlation matrix for selected variables only
cor_matrix <- cor(selected_vars, use = "complete.obs")
# Create ggplot correlation visualization
ggcorrplot(cor_matrix,
hc.order = TRUE, # Hierarchical clustering order
type = "upper", # Show only upper triangle
lab = TRUE, # Show correlation coefficients
lab_size = 3, # Size of labels
tl.cex = 8, # Text label size
colors = c("#6D9EC1", "white", "#E46726"), # Blue-white-orange
title = "Key Housing Features Correlation Analysis") +
theme_stata()+
theme(
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(angle = 45, hjust = 1),
legend.position = "right"
)
Observation: The correlation matrix reveals
relationships ranging from weak negative to strong positive associations
between variables. The weakest negative correlation exists between
overall condition and lot area square footage, suggesting these factors
slightly move in opposite directions. On the positive side, total room
count and living area square footage show the strongest correlation
(0.83), indicating these variables are highly related. This strong
relationship makes intuitive sense: homes with more rooms typically have
larger overall living areas. This pattern helps us understand which
housing features tend to occur together and which characteristics vary
independently.
# Create the ANOVA model
house_aov <- aov(SalePrice ~ LivAreaSquareFeet + Bedrooms + FullBathrooms + OverallCondition + HouseAge, data = hpd)
# Display ANOVA table
anova_tidy <- tidy(house_aov)
table_data <- anova_tidy %>%
select(term, sumsq, df, statistic, p.value) %>%
rename("Term" = term, "Sum Sq" = sumsq, "DF" = df,
"F value" = statistic, "p-value" = p.value)
table_data %>% kable() %>% kable_classic()
| Term | Sum Sq | DF | F value | p-value |
|---|---|---|---|---|
| LivAreaSquareFeet | 4.623740e+12 | 1 | 2398.994713 | 0.0000000 |
| Bedrooms | 5.116764e+11 | 1 | 265.479672 | 0.0000000 |
| FullBathrooms | 2.348333e+11 | 1 | 121.841603 | 0.0000000 |
| OverallCondition | 5.491197e+09 | 1 | 2.849068 | 0.0916414 |
| HouseAge | 1.029780e+12 | 1 | 534.294193 | 0.0000000 |
| Residuals | 2.802390e+12 | 1454 | NA | NA |
Observation: The ANOVA results indicate that most
variables in our model significantly influence sale price, as evidenced
by their p-values below 0.05. Living area, bedroom count, bathroom
count, and overall condition all show statistically significant
relationships with property values. House age, however, appears to have
a weaker relationship with a p-value of 0.09, exceeding the conventional
0.05 significance threshold. This suggests that while most physical
characteristics strongly predict home prices, the age of the property
may be a less influential factor in this market.