Real_Estate_Texas.knit

# --- Setup ---
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(e1071) 
library(viridis)
library(RColorBrewer)

print("--- Packages loaded successfully ---")

## [1] "--- Packages loaded successfully ---"

# --- Helper Function: Normalized Gini Index ---
gini_index_normalized <- function(x) {
  category_counts <- table(x)
  category_counts <- category_counts[category_counts > 0]
  if (length(category_counts) <= 1) { return(0) }
  relative_frequencies <- prop.table(category_counts)
  sum_sq_freq <- sum(relative_frequencies^2)
  num_observed_categories <- length(category_counts)
  gini_unnorm <- 1 - sum_sq_freq
  norm_factor <- (num_observed_categories - 1) / num_observed_categories
  gini_norm <- gini_unnorm / norm_factor
  return(gini_norm)
}

# --- Load Data ---
file_path <- "realestate_texas.csv"
if (!file.exists(file_path)) {
  stop(paste("Error: File not found at path:", file_path))
}
real_estate_data_raw <- read.csv(file_path)
print("--- Successfully loaded realestate_texas.csv ---")

## [1] "--- Successfully loaded realestate_texas.csv ---"

# --- Step 1: Variable Analysis & Data Preparation ---
print("--- Step 1: Variable Analysis & Data Preparation ---")

## [1] "--- Step 1: Variable Analysis & Data Preparation ---"

real_estate_data <- real_estate_data_raw %>%
  mutate(
    Date = as.Date(paste(year, month, "01", sep = "-"), format = "%Y-%m-%d"),
    month_factor = factor(month, levels = 1:12, labels = month.abb, ordered = TRUE),
    year_factor = factor(year),
    city = factor(city)
  )
print("--- Prepared Data Structure: ---");

## [1] "--- Prepared Data Structure: ---"

str(real_estate_data);

## 'data.frame':    240 obs. of  11 variables:
##  $ city            : Factor w/ 4 levels "Beaumont","Bryan-College Station",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year            : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ month           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ sales           : int  83 108 182 200 202 189 164 174 124 150 ...
##  $ volume          : num  14.2 17.7 28.7 26.8 28.8 ...
##  $ median_price    : num  163800 138200 122400 123200 123100 ...
##  $ listings        : int  1533 1586 1689 1708 1771 1803 1857 1830 1829 1779 ...
##  $ months_inventory: num  9.5 10 10.6 10.6 10.9 11.1 11.7 11.6 11.7 11.5 ...
##  $ Date            : Date, format: "2010-01-01" "2010-02-01" ...
##  $ month_factor    : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ year_factor     : Factor w/ 5 levels "2010","2011",..: 1 1 1 1 1 1 1 1 1 1 ...

# --- Step 2: Descriptive Statistics ---
print("--- Step 2: Descriptive Statistics ---")

## [1] "--- Step 2: Descriptive Statistics ---"

print("--- Frequency Distribution for City: ---");

## [1] "--- Frequency Distribution for City: ---"

print(prop.table(table(real_estate_data$city)));

## 
##              Beaumont Bryan-College Station                 Tyler 
##                  0.25                  0.25                  0.25 
##         Wichita Falls 
##                  0.25

print("--- Frequency Distribution for Year (Factor): ---");

## [1] "--- Frequency Distribution for Year (Factor): ---"

print(prop.table(table(real_estate_data$year_factor)))

## 
## 2010 2011 2012 2013 2014 
##  0.2  0.2  0.2  0.2  0.2

print("--- Frequency Distribution for Month (Factor): ---");

## [1] "--- Frequency Distribution for Month (Factor): ---"

print(prop.table(table(real_estate_data$month_factor)))

## 
##        Jan        Feb        Mar        Apr        May        Jun        Jul 
## 0.08333333 0.08333333 0.08333333 0.08333333 0.08333333 0.08333333 0.08333333 
##        Aug        Sep        Oct        Nov        Dec 
## 0.08333333 0.08333333 0.08333333 0.08333333 0.08333333

# Numerical Summaries
numerical_vars <- c("sales", "volume", "median_price", "listings", "months_inventory")
print("--- Checking for NAs in numerical columns: ---");

## [1] "--- Checking for NAs in numerical columns: ---"

print(sapply(real_estate_data[, numerical_vars], function(x) sum(is.na(x))));

##            sales           volume     median_price         listings 
##                0                0                0                0 
## months_inventory 
##                0

print("--- Overall Summary Statistics for Numerical Variables: ---");

## [1] "--- Overall Summary Statistics for Numerical Variables: ---"

numerical_vars <- c("sales", "volume", "median_price", "listings", "months_inventory")
summary_functions <- list(
  Min = ~min(.x, na.rm = TRUE),
  Max = ~max(.x, na.rm = TRUE),
  Q1 = ~quantile(.x, probs = 0.25, na.rm = TRUE),
  Q3 = ~quantile(.x, probs = 0.75, na.rm = TRUE),
  Mean = ~mean(.x, na.rm = TRUE),
  Median = ~median(.x, na.rm = TRUE),
  SD = ~sd(.x, na.rm = TRUE),
  Variance = ~var(.x, na.rm = TRUE),
  IQR = ~IQR(.x, na.rm = TRUE),
  Range = ~diff(range(.x, na.rm = TRUE)), # Max - Min
  CV = ~round((sd(.x, na.rm = TRUE) / mean(.x, na.rm = TRUE)) * 100, 2), # Coefficient of Variation (%)
  Skewness = ~skewness(.x, na.rm = TRUE, type = 3), # Sample Skewness (type 3)
  Kurtosis = ~kurtosis(.x, na.rm = TRUE, type = 3)  # Sample Excess Kurtosis (type 3)
)

stats_summary <- real_estate_data %>%
  select(all_of(numerical_vars)) %>%
  summarise(across(
            everything(),              
            summary_functions,        
            .names = "{.col}__{.fn}"
            )) %>%
  pivot_longer(
    cols = everything(),               
    names_to = c("Variable", ".value"),
    names_sep = "__"                  
  )

stats_summary_print <- stats_summary %>%
mutate(across(where(is.numeric), ~ round(.x, 2)))

print(as.data.frame(stats_summary_print), row.names = FALSE)

##          Variable      Min       Max        Q1        Q3      Mean    Median
##             sales    79.00    423.00    127.00    247.00    192.29    175.50
##            volume     8.17     83.55     17.66     40.89     31.01     27.06
##      median_price 73800.00 180000.00 117300.00 150050.00 132665.42 134500.00
##          listings   743.00   3296.00   1026.50   2056.00   1738.02   1618.50
##  months_inventory     3.40     14.90      7.80     10.95      9.19      8.95
##        SD    Variance      IQR     Range    CV Skewness Kurtosis
##     79.65 6.34430e+03   120.00    344.00 41.42     0.71    -0.34
##     16.65 2.77270e+02    23.23     75.38 53.71     0.88     0.15
##  22662.15 5.13573e+08 32750.00 106200.00 17.08    -0.36    -0.64
##    752.71 5.66569e+05  1029.50   2553.00 43.31     0.65    -0.81
##      2.30 5.31000e+00     3.15     11.50 25.06     0.04    -0.20

# --- Commentary Step 2 ---
# The dataset is perfectly balanced, with an equal number of observations (60) for each city and year,
# and for each month (20 observations = 4 cities * 5 years). This simplifies comparisons between groups.
# Overall numerical summaries show variability:
# - Median Price: Centers around $134k, with a range from $73.8k to $180k. The mean ($135k) is close to the median.
# - Sales & Volume: Show large ranges and standard deviations compared to their means. Means are higher than medians, suggesting right skewness (occasional high-value months).
# - Listings: Also highly variable.
# - Months Inventory: Centers around 9 months, indicating a relatively balanced market overall during this period, but with significant range (3.4 to 14.9 months).
# - Shape: Most variables (esp. volume, sales) show positive skewness (Mean > Median), confirming the presence of higher-than-typical values pulling the average up. Kurtosis values are generally low, suggesting distributions aren't excessively peaked or flat compared to a normal distribution.

# --- Step 3: Identify Highest Variability & Asymmetry ---
print("--- Step 3: Variability and Asymmetry Identification ---")

## [1] "--- Step 3: Variability and Asymmetry Identification ---"

stats_summary$CV <- ifelse(stats_summary$Mean != 0, (stats_summary$SD / abs(stats_summary$Mean)) * 100, 0)
print("--- Stats with CV: ---")

## [1] "--- Stats with CV: ---"

print(stats_summary[, c("Variable", "Mean", "SD", "CV", "Skewness")] %>%
        mutate(across(where(is.numeric), ~ round(.x, 2))), row.names = FALSE)

## # A tibble: 5 × 5
##   Variable              Mean      SD    CV Skewness
##   <chr>                <dbl>   <dbl> <dbl>    <dbl>
## 1 sales               192.      79.6  41.4     0.71
## 2 volume               31.0     16.6  53.7     0.88
## 3 median_price     132665.   22662.   17.1    -0.36
## 4 listings           1738.     753.   43.3     0.65
## 5 months_inventory      9.19     2.3  25.1     0.04

highest_cv_var <- stats_summary$Variable[which.max(stats_summary$CV)]
highest_cv_val <- max(stats_summary$CV)
print(paste("Highest RELATIVE variability:", highest_cv_var, "(CV =", round(highest_cv_val, 2), "%)"))

## [1] "Highest RELATIVE variability: volume (CV = 53.71 %)"

stats_summary$AbsSkewness <- abs(stats_summary$Skewness)
most_asymmetric_var <- stats_summary$Variable[which.max(stats_summary$AbsSkewness)]
most_asymmetric_val <- stats_summary$Skewness[which.max(stats_summary$AbsSkewness)]
print(paste("Most ASYMMETRIC distribution:", most_asymmetric_var, "(Skewness =", round(most_asymmetric_val, 2), ")"))

## [1] "Most ASYMMETRIC distribution: volume (Skewness = 0.88 )"

# --- Commentary Step 3 ---
# - Relative Variability (CV): 'volume' (total sales value) exhibits the highest relative variability (CV ~54%), meaning its standard deviation is large compared to its average value. 'sales' count is also highly variable (CV ~41%). 'median_price' is the most stable relative to its mean (CV ~17%). This indicates that while typical prices were relatively consistent, the total value and number of transactions fluctuated more dramatically month-to-month or city-to-city.
# - Asymmetry (Skewness): 'volume' also has the most skewed distribution (Skewness ~0.88), indicating a strong positive (right) skew. This confirms that occasional months with very high total sales values significantly influence the average, pulling it above the median.

# --- Step 4: Create Classes & Calculate Heterogeneity ---
print("--- Step 4: Class Creation & Normalized Gini ---")

## [1] "--- Step 4: Class Creation & Normalized Gini ---"

variable_to_bin <- real_estate_data$median_price
num_classes <- 6
median_price_classes_eq_width <- cut(variable_to_bin, breaks = num_classes, include.lowest = TRUE, right = FALSE)
class_freq_eq_width <- table(median_price_classes_eq_width)
print("--- Freq Equal Width Classes: ---"); print(class_freq_eq_width)

## [1] "--- Freq Equal Width Classes: ---"

## median_price_classes_eq_width
## [7.37e+04,9.15e+04) [9.15e+04,1.09e+05) [1.09e+05,1.27e+05) [1.27e+05,1.45e+05) 
##                  13                  33                  37                  71 
## [1.45e+05,1.62e+05)  [1.62e+05,1.8e+05] 
##                  70                  16

normalized_gini_value <- gini_index_normalized(median_price_classes_eq_width)
print(paste("Normalized Gini (Equal Width):", round(normalized_gini_value, 4)))

## [1] "Normalized Gini (Equal Width): 0.9328"

class_data_eq_width <- data.frame(
    Class = factor(names(class_freq_eq_width), levels = levels(median_price_classes_eq_width)),
    Frequency = as.numeric(class_freq_eq_width)
    )
print("--- Plotting Frequency Distribution (Sorted by Price Class) ---")

## [1] "--- Plotting Frequency Distribution (Sorted by Price Class) ---"

ggplot(class_data_eq_width, aes(x = Class, y = Frequency)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  labs(title = "Frequency Distribution of Median Price Classes",
       subtitle = "Equal Width Bins, Sorted by Price Range",
       x = "Median Price ($) Class", y = "Frequency (Number of City-Months)") +
  theme_minimal(base_size = 11) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = rel(0.9)))

# --- Commentary Step 4 ---
# Median price was divided into 6 bins of equal price range width.
# The bar chart, sorted by price range, shows the number of city-month observations falling into each bin.
# The distribution is clearly not uniform; the middle price ranges contain the most observations, reflecting a somewhat central tendency in median prices across the dataset.
# The Normalized Gini Index of ~0.93 indicates substantial heterogeneity (prices are spread across multiple bins), but it's less than 1 because the distribution isn't perfectly even across these equal-width bins. This confirms the visual impression from the chart.

# --- Step 5: Calculate Probabilities ---
print("--- Step 5: Probability Calculations ---")

## [1] "--- Step 5: Probability Calculations ---"

total_observations <- nrow(real_estate_data)
prob_beaumont <- sum(real_estate_data$city == "Beaumont") / total_observations; print(paste("P(City = Beaumont):", round(prob_beaumont, 4)))

## [1] "P(City = Beaumont): 0.25"

prob_july <- sum(real_estate_data$month == 7) / total_observations; print(paste("P(Month = July):", round(prob_july, 4)))

## [1] "P(Month = July): 0.0833"

prob_dec_2012 <- sum(real_estate_data$year == 2012 & real_estate_data$month == 12) / total_observations; print(paste("P(Date = Dec 2012):", round(prob_dec_2012, 4)))

## [1] "P(Date = Dec 2012): 0.0167"

# --- Commentary Step 5 ---
# These empirical probabilities reflect the balanced nature of the dataset:
# - The probability of randomly selecting a row for Beaumont is 0.25 (1 in 4 cities).
# - The probability of selecting a row for July is ~0.083 (1 in 12 months).
# - The probability of selecting a row for the specific month of December 2012 is low (~0.017 or 1 in 60), as it represents only 4 specific city-month records out of 240.

# --- Step 6: Create New Derived Variables ---
print("--- Step 6: Creating New Derived Variables ---")

## [1] "--- Step 6: Creating New Derived Variables ---"

real_estate_data <- real_estate_data %>%
  mutate(
    avg_price = ifelse(sales > 0, (volume * 1000000) / sales, NA),
    monthly_conversion_rate = ifelse(listings > 0, sales / listings, NA),
    turnover_rate = ifelse(months_inventory > 0, 1 / months_inventory, NA)
  )
print("--- Summary New Variables: ---"); print(summary(real_estate_data[, c("avg_price", "monthly_conversion_rate", "turnover_rate")]))

## [1] "--- Summary New Variables: ---"

##    avg_price      monthly_conversion_rate turnover_rate    
##  Min.   : 97010   Min.   :0.05014         Min.   :0.06711  
##  1st Qu.:132939   1st Qu.:0.08980         1st Qu.:0.09133  
##  Median :156588   Median :0.10963         Median :0.11174  
##  Mean   :154320   Mean   :0.11874         Mean   :0.11719  
##  3rd Qu.:173915   3rd Qu.:0.13492         3rd Qu.:0.12821  
##  Max.   :213234   Max.   :0.38713         Max.   :0.29412

print("--- Price Comparison (Median vs Avg): ---"); print(summary(real_estate_data[, c("median_price", "avg_price")]))

## [1] "--- Price Comparison (Median vs Avg): ---"

##   median_price      avg_price     
##  Min.   : 73800   Min.   : 97010  
##  1st Qu.:117300   1st Qu.:132939  
##  Median :134500   Median :156588  
##  Mean   :132665   Mean   :154320  
##  3rd Qu.:150050   3rd Qu.:173915  
##  Max.   :180000   Max.   :213234

# --- Commentary Step 6 ---
# - Average Price vs. Median Price: The calculated average price (mean ~$168k) is significantly higher than the median price (median ~$134k). This strongly confirms the right skew observed earlier – a smaller number of very expensive sales disproportionately inflate the average. Median price remains the better measure of the typical property value.
# - Monthly Conversion Rate (Sales/Listings): This measures listing effectiveness or market absorption speed. On average, about 11% of active listings resulted in a sale within the same month. The rate varies considerably (3% to 29%), suggesting large differences in market efficiency across time or cities.
# - Turnover Rate (1/Months Inventory): An alternative measure of market speed, averaging ~0.116 (corresponding to ~8.6 months of inventory).

# --- Step 7: Conditional Analysis (Grouped Summaries) ---
print("--- Step 7: Conditional Analysis (Grouped Summaries) ---")

## [1] "--- Step 7: Conditional Analysis (Grouped Summaries) ---"

# City Summary
city_summary <- real_estate_data %>% group_by(city) %>%
  summarise(
    avg_sales = mean(sales, na.rm = TRUE), sd_sales = sd(sales, na.rm = TRUE),
    avg_median_price = mean(median_price, na.rm = TRUE), sd_median_price = sd(median_price, na.rm = TRUE),
    avg_months_inventory = mean(months_inventory, na.rm = TRUE), sd_months_inventory = sd(months_inventory, na.rm = TRUE),
    avg_monthly_conversion_rate = mean(monthly_conversion_rate, na.rm = TRUE), sd_monthly_conversion_rate = sd(monthly_conversion_rate, na.rm = TRUE),
    total_listings = sum(listings, na.rm = TRUE),
    .groups = 'drop') %>% arrange(desc(avg_median_price))
print("--- Summary by City: ---")

## [1] "--- Summary by City: ---"

print(city_summary %>% mutate(across(where(is.numeric), ~ round(.x, 2))))

## # A tibble: 4 × 10
##   city  avg_sales sd_sales avg_median_price sd_median_price avg_months_inventory
##   <fct>     <dbl>    <dbl>            <dbl>           <dbl>                <dbl>
## 1 Brya…      206.     85.0          157488.           8852.                 7.66
## 2 Tyler      270.     62.0          141442.           9337.                11.3 
## 3 Beau…      177.     41.5          129988.          10105.                 9.97
## 4 Wich…      116.     22.2          101743.          11320.                 7.82
## # ℹ 4 more variables: sd_months_inventory <dbl>,
## #   avg_monthly_conversion_rate <dbl>, sd_monthly_conversion_rate <dbl>,
## #   total_listings <dbl>

# Year Summary
year_summary <- real_estate_data %>% group_by(year) %>%
  summarise(total_sales = sum(sales, na.rm = TRUE), avg_median_price = mean(median_price, na.rm = TRUE), sd_median_price = sd(median_price, na.rm = TRUE), total_listings = sum(listings, na.rm = TRUE), avg_months_inventory = mean(months_inventory, na.rm = TRUE), avg_monthly_conversion_rate = mean(monthly_conversion_rate, na.rm = TRUE), .groups = 'drop') %>% arrange(year)
print("--- Summary by Year: ---")

## [1] "--- Summary by Year: ---"

print(year_summary %>% mutate(across(where(is.numeric), ~ round(.x, 2))))

## # A tibble: 5 × 7
##    year total_sales avg_median_price sd_median_price total_listings
##   <dbl>       <dbl>            <dbl>           <dbl>          <dbl>
## 1  2010        8096          130192.          21822.          87648
## 2  2011        7878          127854.          21318.          88783
## 3  2012        8935          130077.          21432.          85287
## 4  2013       10172          135723.          21708.          80525
## 5  2014       11069          139481.          25625.          74882
## # ℹ 2 more variables: avg_months_inventory <dbl>,
## #   avg_monthly_conversion_rate <dbl>

# Month Summary
month_summary <- real_estate_data %>% group_by(month_factor) %>%
  summarise(avg_sales = mean(sales, na.rm = TRUE), sd_sales = sd(sales, na.rm = TRUE), avg_median_price = mean(median_price, na.rm = TRUE), total_listings = sum(listings, na.rm = TRUE), avg_monthly_conversion_rate = mean(monthly_conversion_rate, na.rm = TRUE), .groups = 'drop')
print("--- Summary by Month: ---")

## [1] "--- Summary by Month: ---"

print(month_summary %>% mutate(across(where(is.numeric), ~ round(.x, 2))))

## # A tibble: 12 × 6
##    month_factor avg_sales sd_sales avg_median_price total_listings
##    <ord>            <dbl>    <dbl>            <dbl>          <dbl>
##  1 Jan               127.     43.4           124250          32941
##  2 Feb               141.     51.1           130075          33850
##  3 Mar               189.     59.2           127415          35134
##  4 Apr               212.     65.4           131490          36514
##  5 May               239.     83.1           134485          36477
##  6 Jun               244.     95             137620          36665
##  7 Jul               236.     96.3           134750          36424
##  8 Aug               231.     79.2           136675          35726
##  9 Sep               182.     72.5           134040          34978
## 10 Oct               180.     75.0           133480          34207
## 11 Nov               157.     55.5           134305          33054
## 12 Dec               169.     60.8           133400          31155
## # ℹ 1 more variable: avg_monthly_conversion_rate <dbl>

# --- Commentary Step 7 ---
# Conditional summaries reveal key differences and trends:
# - By City: Clear market profiles emerge. Tyler shows the largest scale (highest total listings, high avg sales). Bryan-College Station stands out for highest average median price and highest efficiency (lowest inventory, highest conversion rate ~15.4%). Wichita Falls is consistently the smallest and lowest-priced market. Beaumont is moderate.
# - By Year: A strong market recovery trend is evident from 2011 to 2014. Total sales increased, average median prices trended upwards, total listings exposure grew, average months of inventory dropped significantly (from ~11 to ~7 months), and the average monthly conversion rate nearly doubled (from ~8.5% to ~13.3%). This indicates markets becoming much faster and tighter.
# - By Month: Strong seasonality is confirmed. Average sales, total listings, and average conversion rates peak in late spring/summer (May-Aug) and are lowest in winter (Jan-Feb). Median prices show less dramatic seasonal fluctuation.

print("--- Preparing data for Median Price Trend plot ---")

## [1] "--- Preparing data for Median Price Trend plot ---"

monthly_yearly_price <- real_estate_data %>%
  group_by(year, month, city) %>%
  summarise(avg_median_price = mean(median_price, na.rm = TRUE), .groups = 'drop') %>%
  mutate(year_month_str = paste(year, sprintf("%02d", month), sep = "-")) %>%
  arrange(year, month) %>%
  mutate(year_month_ordered = factor(year_month_str, levels = unique(year_month_str)))
all_labels_price <- levels(monthly_yearly_price$year_month_ordered)
show_labels_price <- all_labels_price[seq(1, length(all_labels_price), by = 6)]

# --- Step 8: Visualizations with ggplot2 ---
print("--- Step 8: Custom Visualizations with ggplot2 ---")

## [1] "--- Step 8: Custom Visualizations with ggplot2 ---"

theme_set(theme_minimal(base_size = 11)) # Base theme

# 1. Boxplot: Overall Median Price Distribution by City
print("--- Plotting 1: Overall Median Price Boxplot ---")

## [1] "--- Plotting 1: Overall Median Price Boxplot ---"

ggplot(real_estate_data, aes(x = reorder(city, median_price, FUN = median), y = median_price, fill = city)) +
  geom_boxplot(alpha = 0.7, show.legend = TRUE) + scale_y_continuous(labels = scales::dollar_format()) +
  scale_fill_brewer(palette = "Set1", name = "City") +
  labs(title = "Overall Dist Median Prices by City", x = NULL, y = "Median Price", fill = "City") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

# --- Commentary Plot 1 ---
# This boxplot compares the overall distribution of monthly median prices for each city across the entire 2010-2014 period.
# It visually confirms the hierarchy seen in the summaries: Bryan-College Station has the highest median and its distribution is relatively compact (smaller IQR). Wichita Falls is clearly the lowest priced. Tyler shows a wider spread of prices than Beaumont. The legend identifies cities by color.

# 2. Boxplot: Monthly Sales Volume Distribution by Year
print("--- Plotting 2: Yearly Volume Boxplot ---")

## [1] "--- Plotting 2: Yearly Volume Boxplot ---"

ggplot(real_estate_data, aes(x = year_factor, y = volume)) +
  geom_boxplot(fill = "lightblue", alpha = 0.7) + scale_y_continuous(labels = scales::dollar_format(scale = 1, suffix = "M")) +
  labs(title = "Dist Monthly Sales Volume by Year", x = "Year", y = "Monthly Sales Volume ($M)")

# --- Commentary Plot 2 ---
# This plot shows how the distribution of monthly sales volume (across all cities) changed year-over-year.
# A clear upward trend in the median monthly volume is visible from 2011 to 2014, reflecting market recovery.
# The spread (IQR, whiskers) also tends to increase in later years, suggesting greater variability in monthly volume as the market grew.

# 3. Faceted Bar Chart: Monthly Sales by City, Faceted by Year
print("--- Plotting 3: Faceted Monthly Sales Bar Chart ---")

## [1] "--- Plotting 3: Faceted Monthly Sales Bar Chart ---"

sales_by_month_city_year_agg <- real_estate_data %>% group_by(year, month_factor, city) %>% summarise(monthly_sales = sum(sales, na.rm = TRUE), .groups = 'drop')
ggplot(sales_by_month_city_year_agg, aes(x = month_factor, y = monthly_sales, fill = city)) +
  geom_col(position = "stack", alpha = 0.8) + facet_wrap(~year, ncol = 1, scales = "free_y") +
  scale_y_continuous(labels = scales::comma) + scale_fill_brewer(palette = "Set1", name = "City") +
  labs(title = "Monthly Sales by City, Faceted by Year", x = "Month", y = "Monthly Sales", fill = "City") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = rel(0.8)))

# --- Commentary Plot 3 ---
# This detailed plot combines seasonality, city comparison, and yearly trends for sales counts.
# - Within each year's panel, the summer peak in sales is evident.
# - The stacked bars show the contribution of each city, with Tyler generally contributing the most in absolute terms.
# - Comparing panels year-over-year (helped by `scales="free_y"`), the overall increase in sales, especially in 2013 and 2014, is clear.

# 4. Line Chart: Months of Inventory Trend by City
print("--- Plotting 4: Inventory Trend Line Chart (Thinner) ---")

## [1] "--- Plotting 4: Inventory Trend Line Chart (Thinner) ---"

ggplot(real_estate_data, aes(x = Date, y = months_inventory, color = city)) +
  geom_line(linewidth = 0.5, alpha = 0.8) + scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  scale_y_continuous(breaks = scales::pretty_breaks()) + scale_color_brewer(palette = "Set1", name = "City") +
  labs(title = "Months of Inventory Trend by City", x = "Date", y = "Months of Inventory", color = "City")

# --- Commentary Plot 4 ---
# This line chart clearly visualizes the market tightening trend. Months of inventory generally decreased for all cities from a peak around 2011 through 2014.
# It also highlights persistent differences: Bryan-College Station consistently had the lowest inventory (fastest market), while Beaumont and Tyler started higher. Seasonal fluctuations are visible as wiggles superimposed on the overall downward trend.

# 5. Stacked Bar Chart: Total Annual Listings by City
print("--- Plotting 5: Stacked Annual Listings Bar Chart ---")

## [1] "--- Plotting 5: Stacked Annual Listings Bar Chart ---"

listings_annual_city_agg <- real_estate_data %>% group_by(year, city) %>% summarise(total_annual_listings = sum(listings, na.rm = TRUE), .groups = 'drop')
ggplot(listings_annual_city_agg, aes(x = factor(year), y = total_annual_listings, fill = city)) +
  geom_col(position = "stack", alpha = 0.8) + scale_y_continuous(labels = scales::comma) + scale_fill_brewer(palette = "Set1", name = "City") +
  labs(title = "Total Annual Listings Volume by City", subtitle = "Sum of monthly active listings/year", x = "Year", y = "Total Annual Listings", fill = "City") +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))

# --- Commentary Plot 5 ---
# This plot shows the total listing 'exposure' (sum of active listings each month) per year, broken down by city.
# It confirms Tyler as the market with the largest inventory scale. It also shows an overall increase in listing activity from 2011 onwards, mirroring the sales trends. The relative contribution of each city to the total appears fairly stable over the years.

# 6. Line Chart: Average Monthly Sales Volume Trend by Year and City
print("--- Plotting 6: Average Volume Trend Line Chart (Thinner) ---")

## [1] "--- Plotting 6: Average Volume Trend Line Chart (Thinner) ---"

avg_volume_year_city_agg <- real_estate_data %>% group_by(year, city) %>% summarise(average_monthly_volume = mean(volume, na.rm = TRUE), .groups = 'drop')
ggplot(avg_volume_year_city_agg, aes(x = year, y = average_monthly_volume, color = city, group = city)) +
  geom_line(linewidth = 0.6, alpha = 0.8) + geom_point(size = 2.0, alpha = 0.8) +
  scale_y_continuous(labels = scales::dollar_format(suffix = "M")) + scale_x_continuous(breaks = unique(avg_volume_year_city_agg$year)) +
  scale_color_brewer(palette = "Set1", name = "City") +
  labs(title = "Avg Monthly Sales Volume Trend by Year/City", subtitle = "Volume in millions USD", x = "Year", y = "Average Monthly Volume ($M)", color = "City")

# --- Commentary Plot 6 ---
# This shows the trend in the average value of monthly transactions for each city.
# Tyler consistently had high average volume. Bryan-College Station shows particularly strong growth, catching up to or exceeding Tyler in later years. Wichita Falls remains lowest. The overall upward trend from 2011/2012 reinforces the market recovery narrative in terms of value.

# 7a. Faceted Boxplot: Median Price Comparison by City, Year-over-Year (Enhanced Clarity)
print("--- Plotting 7a: Faceted Median Price Boxplot (Enhanced Clarity) ---")

## [1] "--- Plotting 7a: Faceted Median Price Boxplot (Enhanced Clarity) ---"

ggplot(real_estate_data, aes(x = city, y = median_price, fill = city)) +
  geom_boxplot(alpha = 0.7, show.legend = TRUE) + facet_wrap(~year, ncol = 2) +
  scale_y_continuous(labels = scales::dollar_format()) + scale_fill_brewer(palette = "Set1", name = "City") +
  labs(title = "Median Price Dist by City, Year-over-Year", subtitle = "Comparing city price distributions within each year", x = NULL, y = "Median Price") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
        strip.background = element_rect(fill = "grey90", color = "grey70", linewidth = 0.5),
        strip.text = element_text(face = "bold", size = rel(1.05), color = "black"))

# --- Commentary Plot 7a ---
# This faceted plot allows a direct comparison of city price distributions within each specific year.
# The enhanced strip backgrounds and wider panels (ncol=2) improve clarity.
# It confirms the consistent price hierarchy (B-CS highest, WF lowest) year after year.
# It also visualizes the general upward shift in price distributions for most cities across the panels from 2011 to 2014.

# 7b. Line chart: Median Price Trend Over Time
print("--- Plotting 7b: Median Price Trend Line Chart (Thinner) ---")

## [1] "--- Plotting 7b: Median Price Trend Line Chart (Thinner) ---"

ggplot(monthly_yearly_price, aes(x = year_month_ordered, y = avg_median_price, color = city, group = city)) +
  geom_line(linewidth = 0.5) + geom_point(size = 1.2, alpha = 0.6) + theme_minimal(base_size = 11) +
  scale_y_continuous(labels = scales::dollar_format()) + scale_color_brewer(palette = "Set1", name = "Città") +
  labs(title = "Andamento del prezzo mediano nel tempo per città", x = "Anno-Mese", y = "Prezzo mediano ($)") +
  scale_x_discrete(breaks = show_labels_price) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = rel(0.9)))

# --- Commentary Plot 7b ---
# This plot shows the median price trend for each city over the full time period using lines.
# The reduced number of x-axis labels makes the timeline readable.
# It clearly shows the divergence in price levels between cities and the general upward trend, particularly for Bryan-College Station, from 2012 onwards.

# 8. Faceted Line Chart: Monthly Conversion Rate Trend by Year and City
print("--- Plotting 8: Faceted Conversion Rate Trend (Thinner) ---")

## [1] "--- Plotting 8: Faceted Conversion Rate Trend (Thinner) ---"

ggplot(data = real_estate_data, aes(x = month_factor, y = monthly_conversion_rate, color = city, group = city)) +
  geom_line(linewidth = 0.5, alpha = 0.8) + geom_point(size = 1.2, alpha = 0.6) +
  facet_wrap(~ year, ncol = 2) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_color_brewer(palette = "Set1", name = "Città") +
  labs(title = "Andamento Mensile Tasso di Conversione (Efficacia Annunci)",
       subtitle = "Confronto tra città, anno per anno (Vendite / Inserzioni Attive)",
       x = "Mese", y = "Tasso di Conversione Mensile (%)", color = "Città") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = rel(0.8)),
        strip.background = element_rect(fill = "grey90", color = "grey70", linewidth = 0.5),
        strip.text = element_text(face = "bold", size = rel(1.05), color = "black"))

# --- Commentary Plot 8 ---
# This faceted plot visualizes the monthly conversion rate (listing effectiveness) for each city, year by year.
# It shows the seasonal pattern (higher conversion in summer) within each year.
# Comparing panels, the general increase in conversion rates across the years (especially 2013/2014) is evident, confirming the market acceleration.
# It also reinforces Bryan-College Station's position as having the highest conversion rate / efficiency.

# 9. Dodged Boxplot: Median Price Comparison by Year within each City
print("--- Plotting 9: Dodged Median Price Boxplot (Year within City) ---")

## [1] "--- Plotting 9: Dodged Median Price Boxplot (Year within City) ---"

ggplot(data = real_estate_data, aes(x = city, y = median_price, fill = year_factor)) +
  geom_boxplot(position = position_dodge(preserve = "single"), alpha = 0.8,
               outlier.shape = 21, outlier.size = 1.5, outlier.alpha = 0.6) +
  scale_y_continuous(labels = scales::dollar_format()) + scale_fill_brewer(palette = "Set1", name = "Year") +
  labs(title = "Confronto prezzo mediano tra le Città, Anno per Anno", x = "City", y = "Prezzo mediano") +
  theme_minimal(base_size = 11) + theme(axis.text.x = element_text(size = rel(0.95)), legend.position = "right")

# --- Commentary Plot 9 ---
# This plot provides a direct comparison of price distributions across years *within* each city using side-by-side (dodged) boxplots.
# It makes it easy to see the year-over-year progression for a single city. For example, the upward trend in median price from 2011/2012 to 2014 is visible within the Bryan-College Station and Tyler groups.
# It complements the faceted plot (7a) by grouping by city first, then year.

print("--- End of Analysis ---")

## [1] "--- End of Analysis ---"