Advanced Analysis

1. Importing Master file & 2 cost files .rds from 01 - Build Quarterly Master Levels

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
rds_dir <- "/Users/lukavrtac/Faks - EF/IMB/Master thesis/Data/R ready/rds"

master <- readRDS(file.path(rds_dir, "master_q_levels.rds"))
costs_base <- readRDS(file.path(rds_dir, "costs_base.rds"))
eq_cgt <- readRDS(file.path(rds_dir, "eq_cgt_schedule.rds"))

glimpse(master)
## Rows: 60
## Columns: 15
## $ q_id                  <chr> "2010Q1", "2010Q2", "2010Q3", "2010Q4", "2011Q1"…
## $ date_q                <date> 2010-03-31, 2010-06-30, 2010-09-30, 2010-12-31,…
## $ year                  <int> 2010, 2010, 2010, 2010, 2011, 2011, 2011, 2011, …
## $ qtr                   <int> 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, …
## $ hpi_si_eurostat       <dbl> 117.41, 118.08, 116.53, 116.80, 122.24, 121.89, …
## $ hpi_si_surs_used      <dbl> 118.15, 117.64, 117.23, 116.10, 119.29, 119.94, …
## $ hpi_lj_surs_used      <dbl> 119.75, 119.41, 118.80, 122.17, 124.68, 125.93, …
## $ hpi_si_exlj_surs_used <dbl> 116.78, 116.17, 115.88, 112.38, 116.03, 116.29, …
## $ hicp_rent             <dbl> 102.48, 101.58, 102.28, 101.72, 101.92, 101.56, …
## $ hicp_all              <dbl> 93.18, 94.78, 93.93, 94.48, 95.38, 96.25, 96.05,…
## $ sbitop_close          <dbl> 965.95, 880.02, 830.94, 850.35, 832.37, 742.29, …
## $ dy_eq_ann             <dbl> 0.02579782, 0.02579782, 0.02579782, 0.02579782, …
## $ rf_log_q              <dbl> 8.604179e-04, 8.856720e-04, 1.158545e-03, 1.5262…
## $ rf_simple_q           <dbl> 8.607882e-04, 8.860644e-04, 1.159216e-03, 1.5274…
## $ rf_rate_ann_qavg_pct  <dbl> 0.34410000, 0.35050000, 0.45343333, 0.59733333, …

2. Descriptive statistics

# ============================================================
# 2. DESCRIPTIVE STATISTICS (THESIS-READY, SELF-CONTAINED)
# ============================================================

library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(scales)
library(zoo)
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(stringr)
library(patchwork)
library(ggrepel)
library(gt)
## Warning: package 'gt' was built under R version 4.4.3
# --- Optional for skew/kurtosis (clean + standard) ---
if (!requireNamespace("moments", quietly = TRUE)) install.packages("moments")
library(moments)

# -----------------------------
# 0) Local thesis theme (so this chunk runs standalone)
# -----------------------------
theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(14, "pt"),
      legend.spacing.x = unit(6, "pt"),
      legend.box.margin = margin(t = 8),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

# -----------------------------
# 1) Helpers
# -----------------------------

# Quarter label as YYYYQ#
to_qid <- function(d) paste0(year(d), "Q", quarter(d))

# Annualization from quarterly log returns
ann_ret_from_log <- function(rq) exp(4 * mean(rq, na.rm = TRUE)) - 1
ann_vol_from_log <- function(rq) sd(rq, na.rm = TRUE) * sqrt(4)
ann_rf_from_log  <- function(rf_q_log) exp(4 * mean(rf_q_log, na.rm = TRUE)) - 1

# Lag-1 autocorrelation
acf1 <- function(x) {
  x <- x[is.finite(x)]
  if (length(x) < 3) return(NA_real_)
  cor(x[-1], x[-length(x)], use = "complete.obs")
}

# Wealth index from log returns
wealth_index <- function(r_log) 100 * exp(cumsum(replace_na(r_log, 0)))

# Drawdown from wealth index
drawdown_from_wealth <- function(w) (w / cummax(w)) - 1

# Max drawdown + time-to-recovery (quarters from trough to first recovery to prior peak)
mdd_and_recovery <- function(date_q, r_log) {
  ok <- is.finite(r_log) & is.finite(date_q)
  date_q <- date_q[ok]
  r_log  <- r_log[ok]
  if (length(r_log) < 3) return(tibble(mdd = NA_real_, ttr_q = NA_real_))

  w <- wealth_index(r_log)
  peak <- cummax(w)
  dd <- drawdown_from_wealth(w)

  i_trough <- which.min(dd)
  mdd <- dd[i_trough]
  peak_before <- peak[i_trough]

  if (i_trough < length(w)) {
    i_rec <- which(w[(i_trough + 1):length(w)] >= peak_before)
    ttr_q <- if (length(i_rec) == 0) NA_real_ else i_rec[1]
  } else {
    ttr_q <- NA_real_
  }

  tibble(mdd = mdd, ttr_q = ttr_q)
}

# Formatting rules
fmt_pct1 <- function(x) ifelse(is.finite(x), percent(x, accuracy = 0.1), NA_character_)
fmt_num2 <- function(x) ifelse(is.finite(x), formatC(x, format = "f", digits = 2), NA_character_)
fmt_int  <- function(x) ifelse(is.finite(x), as.character(as.integer(x)), NA_character_)

# -----------------------------
# 2) Build returns ONCE (nominal + real), inside this chunk
#    This avoids the "df not found" problem when knitting.
# -----------------------------

y0_ann <- 0.04  # baseline gross annual rental yield (consistent with your later code)

df_desc <- master %>%
  arrange(date_q) %>%
  # keep only what we need (safer + clearer)
  select(
    date_q,
    sbitop_close, dy_eq_ann,
    hicp_all, hicp_rent,
    hpi_si_eurostat,
    hpi_si_surs_used,
    hpi_lj_surs_used,
    hpi_si_exlj_surs_used,
    rf_log_q
  )

# Robust anchor quarter for rent/price yield model
base_anchor <- df_desc %>%
  filter(
    !is.na(hicp_rent),
    !is.na(hpi_si_eurostat),
    !is.na(hpi_si_surs_used),
    !is.na(hpi_lj_surs_used),
    !is.na(hpi_si_exlj_surs_used)
  ) %>%
  slice(1)

R0  <- base_anchor$hicp_rent
He0 <- base_anchor$hpi_si_eurostat
Hs0 <- base_anchor$hpi_si_surs_used
Hl0 <- base_anchor$hpi_lj_surs_used
Hx0 <- base_anchor$hpi_si_exlj_surs_used

df_desc <- df_desc %>%
  mutate(
    # Inflation (quarterly log change)
    pi_log = log(hicp_all / lag(hicp_all)),

    # Equity: nominal total log return = log price return + DY/4
    dy_eq_q        = dy_eq_ann / 4,
    r_eq_price_log = log(sbitop_close / lag(sbitop_close)),
    r_eq_total_log = r_eq_price_log + dy_eq_q,

    # Real estate price log returns
    r_re_eu_price_log   = log(hpi_si_eurostat / lag(hpi_si_eurostat)),
    r_re_si_price_log   = log(hpi_si_surs_used / lag(hpi_si_surs_used)),
    r_re_lj_price_log   = log(hpi_lj_surs_used / lag(hpi_lj_surs_used)),
    r_re_ex_price_log   = log(hpi_si_exlj_surs_used / lag(hpi_si_exlj_surs_used)),

    # Time-varying rental yields via rent/price ratio (anchored at base quarter)
    y_eu_t = y0_ann * ((hicp_rent / R0) / (hpi_si_eurostat / He0)),
    y_si_t = y0_ann * ((hicp_rent / R0) / (hpi_si_surs_used / Hs0)),
    y_lj_t = y0_ann * ((hicp_rent / R0) / (hpi_lj_surs_used / Hl0)),
    y_ex_t = y0_ann * ((hicp_rent / R0) / (hpi_si_exlj_surs_used / Hx0)),

    # Quarterly rental income (lagged to avoid look-ahead)
    r_re_eu_income_q = lag(y_eu_t) / 4,
    r_re_si_income_q = lag(y_si_t) / 4,
    r_re_lj_income_q = lag(y_lj_t) / 4,
    r_re_ex_income_q = lag(y_ex_t) / 4,

    # Real estate: nominal total log returns
    r_re_eu_total_log = r_re_eu_price_log + r_re_eu_income_q,
    r_re_si_total_log = r_re_si_price_log + r_re_si_income_q,
    r_re_lj_total_log = r_re_lj_price_log + r_re_lj_income_q,
    r_re_ex_total_log = r_re_ex_price_log + r_re_ex_income_q,

    # Real total log returns (deflated)
    r_eq_total_real_log    = r_eq_total_log    - pi_log,
    r_re_eu_total_real_log = r_re_eu_total_log - pi_log,
    r_re_si_total_real_log = r_re_si_total_log - pi_log,
    r_re_lj_total_real_log = r_re_lj_total_log - pi_log,
    r_re_ex_total_real_log = r_re_ex_total_log - pi_log
  )

# -----------------------------
# 3) Long panels for stats
# -----------------------------

series_map <- tibble::tribble(
  ~key,                     ~label,
  "r_eq_total_log",          "Equities: SBITOP",
  "r_re_eu_total_log",       "Real estate: Slovenia (Eurostat)",
  "r_re_si_total_log",       "Real estate: Slovenia (SURS)",
  "r_re_lj_total_log",       "Real estate: Ljubljana (SURS)",
  "r_re_ex_total_log",       "Real estate: Slovenia ex-LJ (SURS)"
)

series_map_real <- tibble::tribble(
  ~key,                          ~label,
  "r_eq_total_real_log",          "Equities: SBITOP",
  "r_re_eu_total_real_log",       "Real estate: Slovenia (Eurostat)",
  "r_re_si_total_real_log",       "Real estate: Slovenia (SURS)",
  "r_re_lj_total_real_log",       "Real estate: Ljubljana (SURS)",
  "r_re_ex_total_real_log",       "Real estate: Slovenia ex-LJ (SURS)"
)

ret_nom_long <- df_desc %>%
  select(date_q, rf_log_q, all_of(series_map$key)) %>%
  pivot_longer(cols = all_of(series_map$key), names_to = "key", values_to = "r_log") %>%
  left_join(series_map, by = "key") %>%
  transmute(
    date_q, series = label,
    r_log,
    r_simple = exp(r_log) - 1,         # for percent-style quartiles/min/max
    rf_log_q
  ) %>%
  filter(is.finite(r_log), is.finite(rf_log_q), is.finite(r_simple))

ret_real_long <- df_desc %>%
  select(date_q, rf_log_q, all_of(series_map_real$key)) %>%
  pivot_longer(cols = all_of(series_map_real$key), names_to = "key", values_to = "r_log") %>%
  left_join(series_map_real, by = "key") %>%
  transmute(
    date_q, series = label,
    r_log,
    r_simple = exp(r_log) - 1,
    rf_log_q
  ) %>%
  filter(is.finite(r_log), is.finite(rf_log_q), is.finite(r_simple))

# -----------------------------
# 4) TABLE 1 — Descriptive statistics
#    Rules enforced:
#    - Start/End shown as YYYYQ#
#    - Sharpe (ann.) uses rf from rf_log_q (annualized)
#    - Percent columns shown with 1 decimal
#    - Skew/kurt with 2 decimals
#    - Full borders + readable alignment
#    - Subtitle clarifies excess kurtosis (k-3)
# -----------------------------

make_desc_block <- function(ret_long, block_name) {
  core <- ret_long %>%
    group_by(series) %>%
    summarise(
      n_q = n(),
      start = min(date_q),
      end   = max(date_q),

      # Tails / min / max on SIMPLE quarterly returns (matches percent interpretation)
      p05_q  = quantile(r_simple, 0.05, na.rm = TRUE, type = 7),
      med_q  = median(r_simple, na.rm = TRUE),
      p95_q  = quantile(r_simple, 0.95, na.rm = TRUE, type = 7),
      min_q  = min(r_simple, na.rm = TRUE),
      max_q  = max(r_simple, na.rm = TRUE),

      # Distribution shape (use log returns; stable + standard)
      skew    = moments::skewness(r_log, na.rm = TRUE),
      ex_kurt = moments::kurtosis(r_log, na.rm = TRUE) - 3,

      # Annualized return/vol from LOG returns
      ann_ret = ann_ret_from_log(r_log),
      ann_vol = ann_vol_from_log(r_log),

      # Annualized rf from LOG rf aligned to the same quarters
      ann_rf  = ann_rf_from_log(rf_log_q),

      # Sharpe (annualized)
      sharpe  = (ann_ret - ann_rf) / ann_vol,

      # Lag-1 autocorr on log returns
      acf1_q  = acf1(r_log),

      .groups = "drop"
    )

  dd <- ret_long %>%
    group_by(series) %>%
    summarise(tmp = list(mdd_and_recovery(date_q, r_log)), .groups = "drop") %>%
    tidyr::unnest(tmp)

  core %>%
    left_join(dd, by = "series") %>%
    mutate(
      block = block_name,
      Start = to_qid(start),
      End   = to_qid(end)
    ) %>%
    select(
      block, Series = series,
      n_q, Start, End,
      ann_ret, ann_vol, ann_rf, sharpe,
      p05_q, med_q, p95_q,
      acf1_q, skew, ex_kurt,
      mdd, ttr_q,
      min_q, max_q
    )
}

desc_nom  <- make_desc_block(ret_nom_long,  "Nominal")
desc_real <- make_desc_block(ret_real_long, "Real (inflation-adjusted)")

desc_all <- bind_rows(desc_nom, desc_real) %>%
  mutate(
    `N (quarters)` = n_q,
    `Annualized return`     = fmt_pct1(ann_ret),
    `Annualized volatility` = fmt_pct1(ann_vol),
    `Annualized rf`         = fmt_pct1(ann_rf),
    `Sharpe (ann.)`         = fmt_num2(sharpe),

    `P5 (qtr)`     = fmt_pct1(p05_q),
    `Median (qtr)` = fmt_pct1(med_q),
    `P95 (qtr)`    = fmt_pct1(p95_q),

    `ACF(1)` = fmt_num2(acf1_q),
    Skewness = fmt_num2(skew),
    `Excess kurtosis` = fmt_num2(ex_kurt),

    `Max drawdown` = fmt_pct1(mdd),
    `Time to recovery (qtrs)` = fmt_int(ttr_q),

    `Min (qtr)` = fmt_pct1(min_q),
    `Max (qtr)` = fmt_pct1(max_q)
  ) %>%
  select(
    block, Series,
    `N (quarters)`, Start, End,
    `Annualized return`, `Annualized volatility`, `Annualized rf`, `Sharpe (ann.)`,
    `P5 (qtr)`, `Median (qtr)`, `P95 (qtr)`,
    `ACF(1)`, Skewness, `Excess kurtosis`,
    `Max drawdown`, `Time to recovery (qtrs)`,
    `Min (qtr)`, `Max (qtr)`
  )

tab_2_1 <- desc_all %>%
  gt(groupname_col = "block") %>%
  cols_align(align = "left", columns = c(Series, Start, End)) %>%
  cols_align(align = "center", columns = -c(Series, Start, End)) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_row_groups()
  ) %>%
  # Full borders + clear grid
  tab_options(
    table.border.top.width = px(1),
    table.border.bottom.width = px(1),
    table.border.left.width = px(1),
    table.border.right.width = px(1),
    table_body.hlines.width = px(1),
    table_body.vlines.width = px(1),
    heading.border.bottom.width = px(1),
    row_group.border.top.width = px(1),
    row_group.border.bottom.width = px(1),
    data_row.padding = px(6),
    heading.title.font.size = px(14),
    heading.subtitle.font.size = px(11)
  )

tab_2_1
Series N (quarters) Start End Annualized return Annualized volatility Annualized rf Sharpe (ann.) P5 (qtr) Median (qtr) P95 (qtr) ACF(1) Skewness Excess kurtosis Max drawdown Time to recovery (qtrs) Min (qtr) Max (qtr)
Nominal
Equities: SBITOP 59 2010Q2 2024Q4 9.3% 16.9% 0.3% 0.53 -12.4% 3.2% 16.0% 0.06 -0.57 0.12 -37.0% 8 -19.9% 18.6%
Real estate: Ljubljana (SURS) 59 2010Q2 2024Q4 8.0% 5.2% 0.3% 1.48 -2.5% 2.0% 5.7% 0.42 -0.83 0.91 -14.9% 7 -6.6% 6.3%
Real estate: Slovenia (Eurostat) 59 2010Q2 2024Q4 8.3% 4.3% 0.3% 1.86 -2.5% 2.6% 5.2% 0.38 -1.03 1.09 -8.2% 7 -4.8% 5.7%
Real estate: Slovenia (SURS) 59 2010Q2 2024Q4 8.6% 4.4% 0.3% 1.89 -1.6% 2.4% 5.2% 0.50 -0.59 0.16 -8.2% 5 -4.2% 6.4%
Real estate: Slovenia ex-LJ (SURS) 59 2010Q2 2024Q4 8.9% 4.8% 0.3% 1.80 -2.1% 2.7% 5.9% 0.40 -0.56 0.23 -6.5% 3 -4.2% 7.0%
Real (inflation-adjusted)
Equities: SBITOP 59 2010Q2 2024Q4 7.0% 16.9% 0.3% 0.39 -12.9% 2.8% 14.6% 0.10 -0.57 -0.11 -39.4% 9 -19.3% 17.9%
Real estate: Ljubljana (SURS) 59 2010Q2 2024Q4 5.7% 5.5% 0.3% 0.99 -3.0% 1.4% 5.3% 0.30 -0.72 0.48 -19.5% 9 -6.9% 6.0%
Real estate: Slovenia (Eurostat) 59 2010Q2 2024Q4 5.9% 4.3% 0.3% 1.30 -2.7% 1.7% 3.9% 0.22 -1.07 0.88 -13.4% 9 -5.1% 4.8%
Real estate: Slovenia (SURS) 59 2010Q2 2024Q4 6.2% 4.6% 0.3% 1.31 -2.6% 1.9% 4.3% 0.33 -0.66 0.14 -12.9% 7 -5.2% 5.7%
Real estate: Slovenia ex-LJ (SURS) 59 2010Q2 2024Q4 6.6% 4.9% 0.3% 1.29 -2.8% 1.9% 5.1% 0.26 -0.58 0.28 -8.9% 6 -5.2% 7.2%
gtsave(tab_2_1, "Table_1_descriptive_stats.html")

# -----------------------------
# 5) TABLE 2 and 3 — Correlation matrices (nominal + real)
#    - bold diagonal
#    - light shading by magnitude
# -----------------------------

make_corr_table <- function(ret_long, table_title) {
  wide <- ret_long %>%
    select(date_q, series, r_log) %>%
    pivot_wider(names_from = series, values_from = r_log) %>%
    arrange(date_q)

  corr <- cor(wide %>% select(-date_q), use = "pairwise.complete.obs")
  corr_df <- as.data.frame(round(corr, 2)) %>% tibble::rownames_to_column("Series")

  gt_tbl <- corr_df %>%
    gt() %>%
    cols_align(align = "left", columns = "Series") %>%
    cols_align(align = "center", columns = -Series) %>%
    fmt_number(columns = -Series, decimals = 2) %>%
    tab_options(
      table.border.top.width = px(1),
      table.border.bottom.width = px(1),
      table.border.left.width = px(1),
      table.border.right.width = px(1),
      table_body.hlines.width = px(1),
      table_body.vlines.width = px(1),
      heading.border.bottom.width = px(1),
      data_row.padding = px(6)
    ) %>%
    data_color(
      columns = -Series,
      colors = scales::col_numeric(
        palette = c("#FFFFFF", "#F2F2F2"),
        domain = c(0, 1)
      ),
      alpha = 1
    )

  # Bold diagonal
  series_names <- colnames(corr)
  for (s in series_names) {
    gt_tbl <- gt_tbl %>%
      tab_style(
        style = cell_text(weight = "bold"),
        locations = cells_body(columns = s, rows = Series == s)
      )
  }

  gt_tbl
}

tab_2_2 <- make_corr_table(ret_nom_long,  "Table 2: Correlation matrix of nominal quarterly total returns")
## Warning: Since gt v0.9.0, the `colors` argument has been deprecated.
## • Please use the `fn` argument instead.
## This warning is displayed once every 8 hours.
tab_2_3 <- make_corr_table(ret_real_long, "Table 3: Correlation matrix of real quarterly total returns")

tab_2_2
Series Equities: SBITOP Real estate: Slovenia (Eurostat) Real estate: Slovenia (SURS) Real estate: Ljubljana (SURS) Real estate: Slovenia ex-LJ (SURS)
Equities: SBITOP 1.00 0.09 0.05 0.05 0.03
Real estate: Slovenia (Eurostat) 0.09 1.00 0.82 0.67 0.77
Real estate: Slovenia (SURS) 0.05 0.82 1.00 0.82 0.92
Real estate: Ljubljana (SURS) 0.05 0.67 0.82 1.00 0.54
Real estate: Slovenia ex-LJ (SURS) 0.03 0.77 0.92 0.54 1.00
tab_2_3
Series Equities: SBITOP Real estate: Slovenia (Eurostat) Real estate: Slovenia (SURS) Real estate: Ljubljana (SURS) Real estate: Slovenia ex-LJ (SURS)
Equities: SBITOP 1.00 0.09 0.05 0.06 0.03
Real estate: Slovenia (Eurostat) 0.09 1.00 0.83 0.70 0.77
Real estate: Slovenia (SURS) 0.05 0.83 1.00 0.84 0.92
Real estate: Ljubljana (SURS) 0.06 0.70 0.84 1.00 0.57
Real estate: Slovenia ex-LJ (SURS) 0.03 0.77 0.92 0.57 1.00
gtsave(tab_2_2, "Table_2_corr_matrix_nominal.html")
gtsave(tab_2_3, "Table_3_corr_matrix_real.html")

# -----------------------------
# 6) FIGURE 1 — Risk–return scatter (fixed scale) + Sharpe
#    Label includes Return, Vol, Sharpe
# -----------------------------

riskret <- ret_nom_long %>%
  group_by(series) %>%
  summarise(
    ann_ret = ann_ret_from_log(r_log),
    ann_vol = ann_vol_from_log(r_log),
    ann_rf  = ann_rf_from_log(rf_log_q),
    sharpe  = (ann_ret - ann_rf) / ann_vol,
    .groups = "drop"
  ) %>%
  mutate(
    series_short = case_when(
      series == "Equities: SBITOP" ~ "EQ: SBITOP",
      series == "Real estate: Slovenia (Eurostat)" ~ "RE: SI (Eurostat)",
      series == "Real estate: Slovenia (SURS)" ~ "RE: SI (SURS)",
      series == "Real estate: Ljubljana (SURS)" ~ "RE: LJ (SURS)",
      series == "Real estate: Slovenia ex-LJ (SURS)" ~ "RE: ex-LJ (SURS)",
      TRUE ~ series
    )
  )

# Axis limits with padding so equities fully visible + labels don’t clip
x_min <- min(riskret$ann_vol, na.rm = TRUE); x_max <- max(riskret$ann_vol, na.rm = TRUE)
y_min <- min(riskret$ann_ret, na.rm = TRUE); y_max <- max(riskret$ann_ret, na.rm = TRUE)
x_pad <- 0.12 * (x_max - x_min); y_pad <- 0.12 * (y_max - y_min)
x_lim <- c(x_min - x_pad, x_max + x_pad)
y_lim <- c(y_min - y_pad, y_max + y_pad)

riskret <- riskret %>%
  mutate(
    lab_A = paste0(
      series_short, "\n",
      fmt_pct1(ann_ret), ", ", fmt_pct1(ann_vol), ", S=", fmt_num2(sharpe)
    )
  )

p2_1_A <- ggplot(riskret, aes(x = ann_vol, y = ann_ret)) +
  geom_point(size = 3.2, alpha = 0.9) +
  ggrepel::geom_text_repel(
    aes(label = lab_A),
    size = 3.3,
    min.segment.length = 0,
    box.padding = 0.35,
    point.padding = 0.25,
    max.overlaps = Inf
  ) +
  scale_x_continuous(labels = percent_format(accuracy = 1), limits = x_lim) +
  scale_y_continuous(labels = percent_format(accuracy = 1), limits = y_lim) +
  labs(
    x = "Annualized volatility",
    y = "Annualized return"
  ) +
  theme_thesis() +
  theme(legend.position = "none")

p2_1_A

ggsave("Figure_1_risk_return_scatter_labels.png", p2_1_A,
       width = 12, height = 7.2, dpi = 300, limitsize = FALSE)


# -----------------------------
# 7) FIGURE 2 — Return distributions
#    - identical x-limits across facets
#    - mean + median markers
# -----------------------------

dist_df <- ret_nom_long %>%
  select(date_q, series, r_simple) %>%
  filter(is.finite(r_simple)) %>%
  mutate(series = factor(series, levels = series_map$label))

x_rng <- range(dist_df$r_simple, na.rm = TRUE)
x_pad2 <- 0.06 * (x_rng[2] - x_rng[1])
x_lim2 <- c(x_rng[1] - x_pad2, x_rng[2] + x_pad2)

markers <- dist_df %>%
  group_by(series) %>%
  summarise(
    mean_q = mean(r_simple, na.rm = TRUE),
    med_q  = median(r_simple, na.rm = TRUE),
    .groups = "drop"
  )

p2_2 <- ggplot(dist_df, aes(x = r_simple)) +
  geom_density(linewidth = 1.0, fill = "grey80", color = "grey20", adjust = 1.05) +
  geom_vline(xintercept = 0, linewidth = 0.4, color = "grey65") +
  geom_vline(data = markers, aes(xintercept = mean_q),
             linewidth = 0.55, linetype = "solid", color = "grey35") +
  geom_vline(data = markers, aes(xintercept = med_q),
             linewidth = 0.55, linetype = "dashed", color = "grey35") +
  facet_wrap(~ series, ncol = 2, scales = "fixed") +
  coord_cartesian(xlim = x_lim2) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    x = "Quarterly total return (simple, nominal)",
    y = "Density"
  ) +
  theme_thesis() +
  theme(legend.position = "none",
        strip.text = element_text(face = "bold"))

p2_2

ggsave("Figure_2_return_distributions_nominal_fixed_scale.png", p2_2,
       width = 12, height = 8.2, dpi = 300, limitsize = FALSE)

3. Graphical analysis

theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(18, "pt"),
      legend.spacing.x = unit(8, "pt"),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

# inside ggplot(...)
guides(
  color = guide_legend(nrow = 3, byrow = TRUE),
  linetype = "none"
)
## <Guides[2] ggproto object>
## 
##   colour : <GuideLegend>
## linetype : "none"

3.1. Figure 3: Index levels comparisson - capital appreciation only

# ------------------------------------------------------------
# FIGURE 3: Capital appreciation (price-only) – indexed growth since base period
#
# What this figure represents:
#   - Equities: SBITOP (quarter-end close level; price-only, no dividends)
#   - Real estate price indices (price-only, no rental income):
#       * Eurostat HPI Slovenia (broad overall market coverage / all dwellings)
#       * SURS HPI "selected series" used in this thesis:
#           - Slovenia total
#           - Ljubljana
#           - Slovenia ex-Ljubljana
#
# Why rebasing is needed:
#   - SBITOP and HPI are both index levels but with different base/scales.
#   - Rebasing to 100 at a common base quarter makes them comparable as "growth since start".
#
# Why the graph was being cut:
#   - Legend labels were long and exceeded the plotting canvas.
#   - Fix: wrap legend labels + give enough bottom space + save with larger height.
#
# Output:
#   - Prints a clean figure in RStudio/Rmd
#   - Saves a high-resolution PNG that will not clip the legend
# ------------------------------------------------------------

# 1) Load packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(stringr)

# 2) Start from the LEVELS dataset (master) so the base quarter is included
#    Ensure data are ordered by time.
levels_plot <- master %>% arrange(date_q)

# 3) Choose a robust base quarter:
#    - First quarter where all required level series exist (Eurostat + SURS + SBITOP).
base <- levels_plot %>%
  filter(
    !is.na(sbitop_close),
    !is.na(hpi_si_eurostat),
    !is.na(hpi_si_surs_used),
    !is.na(hpi_lj_surs_used),
    !is.na(hpi_si_exlj_surs_used)
  ) %>%
  slice(1)

# Base values for rebasing
P0  <- base$sbitop_close
He0 <- base$hpi_si_eurostat
Hs0 <- base$hpi_si_surs_used
Hl0 <- base$hpi_lj_surs_used
Hx0 <- base$hpi_si_exlj_surs_used

# 4) Rebase each index to 100 at the base quarter (capital appreciation only)
levels_plot <- levels_plot %>%
  mutate(
    idx_eq_price   = 100 * sbitop_close / P0,

    # Eurostat HPI: broad overall residential market coverage ("all dwellings")
    idx_re_si_eu   = 100 * hpi_si_eurostat / He0,

    # SURS HPI "selected series": the SURS indices you chose to use in the thesis
    idx_re_si_surs = 100 * hpi_si_surs_used / Hs0,
    idx_re_lj_surs = 100 * hpi_lj_surs_used / Hl0,
    idx_re_exlj    = 100 * hpi_si_exlj_surs_used / Hx0
  )

# 5) Convert to long format for ggplot and create reader-friendly labels
#    IMPORTANT: Wrap labels to prevent legend clipping.
plot_1a_data <- levels_plot %>%
  select(date_q, idx_eq_price, idx_re_si_eu, idx_re_si_surs, idx_re_lj_surs, idx_re_exlj) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "index") %>%
  mutate(
    series = recode(series,
      idx_eq_price   = "Equities: SBITOP (price-only)",
      idx_re_si_eu   = "Real estate: Slovenia (Eurostat HPI, all dwellings)",
      idx_re_si_surs = "Real estate: Slovenia (SURS HPI, selected series)",
      idx_re_lj_surs = "Real estate: Ljubljana (SURS HPI, selected series)",
      idx_re_exlj    = "Real estate: Slovenia ex-LJ (SURS HPI, selected series)"
    ),
    # Wrap long legend labels so they fit on the canvas (prevents clipping)
    series = str_wrap(series, width = 34)
  )

# Set legend order AFTER wrapping (so ordering stays correct)
legend_levels <- str_wrap(c(
  "Equities: SBITOP (price-only)",
  "Real estate: Slovenia (Eurostat HPI, all dwellings)",
  "Real estate: Slovenia (SURS HPI, selected series)",
  "Real estate: Ljubljana (SURS HPI, selected series)",
  "Real estate: Slovenia ex-LJ (SURS HPI, selected series)"
), width = 34)

plot_1a_data <- plot_1a_data %>%
  mutate(series = factor(series, levels = legend_levels))

# 6) Optional event markers for narrative context (edit/remove if you want)
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

# Put event labels just below the top of the plot
y_top <- max(plot_1a_data$index, na.rm = TRUE)

# 7) Define a thesis-style theme:
#    - smaller legend text
#    - extra margin/spacing so legend never clips
theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(14, "pt"),
      legend.spacing.x = unit(6, "pt"),
      legend.box.margin = margin(t = 8),
      # Keep margins modest; the real anti-clipping fix is label wrapping + save height
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

# 8) Build the plot
#    - Use both color + linetype for readability
#    - Show only ONE legend (color) to avoid duplicated legend entries
p1a <- ggplot(plot_1a_data, aes(x = date_q, y = index, color = series, linetype = series)) +
  geom_line(linewidth = 1.0) +
  scale_y_continuous(labels = label_number(accuracy = 1)) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  guides(
    color = guide_legend(ncol = 2, byrow = TRUE),  # 2 columns helps fit wrapped labels cleanly
    linetype = "none"
  ) +
  labs(
    x = NULL,
    y = "Index level (base = 100 at start)"
  ) +
  theme_thesis() +
  geom_vline(
    data = events,
    aes(xintercept = date),
    linewidth = 0.4,
    linetype = 3,
    color = "grey60",
    inherit.aes = FALSE
  ) +
  geom_text(
    data = events,
    aes(x = date, y = 0.96 * y_top, label = label),
    vjust = -0.3,
    hjust = 0,
    size = 3.2,
    color = "grey35",
    inherit.aes = FALSE
  )

# 9) Print in RStudio / in the knitted document
p1a

# 10) Save high-resolution image for thesis 
ggsave(
  filename = "Figure_3_price_only_indexed_growth.png",
  plot = p1a,
  width = 12,
  height = 8.2,
  dpi = 300,
  limitsize = FALSE
)

Consistency instuctions

# ============================================================
# FIGURE TOOLKIT (use for ALL figures to keep style consistent)
# ============================================================

library(ggplot2)
library(scales)
library(stringr)

# --- Global event markers (reuse across all plots) ---
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

# --- Thesis theme: matches Figure 1A ---
theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(14, "pt"),
      legend.spacing.x = unit(6, "pt"),
      legend.box.margin = margin(t = 8),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

# --- Add event lines + labels (prevents aesthetic inheritance bugs) ---
add_events <- function(p, y_top, events_df = events) {
  p +
    geom_vline(
      data = events_df,
      aes(xintercept = date),
      linewidth = 0.4,
      linetype = 3,
      color = "grey60",
      inherit.aes = FALSE
    ) +
    geom_text(
      data = events_df,
      aes(x = date, y = 0.96 * y_top, label = label),
      vjust = -0.3,
      hjust = 0,
      size = 3.2,
      color = "grey35",
      inherit.aes = FALSE
    )
}

# --- Save consistently (same size as Figure 1A) ---
save_figure <- function(plot_obj, filename, width = 12, height = 8.2, dpi = 300) {
  ggsave(
    filename = filename,
    plot = plot_obj,
    width = width,
    height = height,
    dpi = dpi,
    limitsize = FALSE
  )
}

3.2. Graph 4: Total retudn wealth indices

# ------------------------------------------------------------
# FIGURE 4: Total return wealth indices (investor experience) – indexed to 100
#
# What this figure represents:
#   - Equities (SBITOP): price appreciation + dividend income (quarterly = dy_eq_ann/4)
#   - Real estate (HPI): price appreciation + modeled rental income
#       * Eurostat HPI Slovenia (broad overall market coverage / all dwellings)
#       * SURS HPI "selected series" used in this thesis:
#           - Slovenia total
#           - Ljubljana
#           - Slovenia ex-Ljubljana
#
# Why this figure is needed:
#   - Figure 3 is price-only (capital appreciation) and excludes cash-flow returns.
#   - Figure 4 adds income components (dividends + rents) to approximate investor experience.
#
# Important assumptions (baseline, later tested in sensitivity analysis):
#   - Dividend income is allocated evenly across quarters: dy_eq_ann/4
#   - Base gross annual rental yield y0_ann is calibrated as a constant starting yield
#   - Rental yield evolves with rent/price ratio using HICP rents and HPI:
#       y_t = y0_ann * [(hicp_rent_t / hicp_rent_0) / (HPI_t / HPI_0)]
#   - Quarterly rental income is lagged (t-1) to avoid look-ahead: lag(y_t)/4
#
# Output:
#   - Prints a clean figure in RStudio/Rmd (same style as Figure 1A)
#   - Saves a high-resolution PNG that will not clip the legend
# ------------------------------------------------------------

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(stringr)

# 1) Start from the master LEVELS dataset and ensure time order
levels_plot <- master %>% arrange(date_q)

# 2) Parameters for baseline rental income model
y0_ann <- 0.04  # base gross annual rental yield (baseline assumption)

# 3) Robust base quarter: first quarter where all required series exist
#    (Needed for both rebasing and the rent/price yield model.)
base <- levels_plot %>%
  filter(
    !is.na(sbitop_close),
    !is.na(dy_eq_ann),
    !is.na(hicp_rent),
    !is.na(hpi_si_eurostat),
    !is.na(hpi_si_surs_used),
    !is.na(hpi_lj_surs_used),
    !is.na(hpi_si_exlj_surs_used)
  ) %>%
  slice(1)

P0  <- base$sbitop_close
R0  <- base$hicp_rent
He0 <- base$hpi_si_eurostat
Hs0 <- base$hpi_si_surs_used
Hl0 <- base$hpi_lj_surs_used
Hx0 <- base$hpi_si_exlj_surs_used

# 4) Construct quarterly total log returns (price + income) for each series
#    Equities: log price return + dividend yield/4
#    Real estate: log price return + lagged rental yield/4
levels_plot <- levels_plot %>%
  mutate(
    # Equity quarterly income (baseline: evenly across quarters)
    dy_eq_q = dy_eq_ann / 4,

    # Equity log returns
    r_eq_price_log = log(sbitop_close / lag(sbitop_close)),
    r_eq_total_log = r_eq_price_log + dy_eq_q,

    # Real estate price log returns
    r_re_eu_price_log   = log(hpi_si_eurostat / lag(hpi_si_eurostat)),
    r_re_si_price_log   = log(hpi_si_surs_used / lag(hpi_si_surs_used)),
    r_re_lj_price_log   = log(hpi_lj_surs_used / lag(hpi_lj_surs_used)),
    r_re_exlj_price_log = log(hpi_si_exlj_surs_used / lag(hpi_si_exlj_surs_used)),

    # Time-varying rental yields via rent/price ratio (anchored at base quarter)
    y_eu_t   = y0_ann * ((hicp_rent / R0) / (hpi_si_eurostat / He0)),
    y_si_t   = y0_ann * ((hicp_rent / R0) / (hpi_si_surs_used / Hs0)),
    y_lj_t   = y0_ann * ((hicp_rent / R0) / (hpi_lj_surs_used / Hl0)),
    y_exlj_t = y0_ann * ((hicp_rent / R0) / (hpi_si_exlj_surs_used / Hx0)),

    # Quarterly rental income (lagged yield to avoid look-ahead bias)
    r_re_eu_income_q   = lag(y_eu_t) / 4,
    r_re_si_income_q   = lag(y_si_t) / 4,
    r_re_lj_income_q   = lag(y_lj_t) / 4,
    r_re_exlj_income_q = lag(y_exlj_t) / 4,

    # Real estate total log returns
    r_re_eu_total_log   = r_re_eu_price_log   + r_re_eu_income_q,
    r_re_si_total_log   = r_re_si_price_log   + r_re_si_income_q,
    r_re_lj_total_log   = r_re_lj_price_log   + r_re_lj_income_q,
    r_re_exlj_total_log = r_re_exlj_price_log + r_re_exlj_income_q
  )

# 5) Build wealth indices (start = 100) from total log returns
#    replace_na(..., 0) ensures the first return is treated as 0 for cumulation.
levels_plot <- levels_plot %>%
  mutate(
    idx_eq_tr       = 100 * exp(cumsum(replace_na(r_eq_total_log, 0))),
    idx_re_eu_tr    = 100 * exp(cumsum(replace_na(r_re_eu_total_log, 0))),
    idx_re_si_tr    = 100 * exp(cumsum(replace_na(r_re_si_total_log, 0))),
    idx_re_lj_tr    = 100 * exp(cumsum(replace_na(r_re_lj_total_log, 0))),
    idx_re_exlj_tr  = 100 * exp(cumsum(replace_na(r_re_exlj_total_log, 0)))
  )

# 6) Convert to long format + readable labels (wrap to prevent legend clipping)
plot_1b_data <- levels_plot %>%
  select(date_q, idx_eq_tr, idx_re_eu_tr, idx_re_si_tr, idx_re_lj_tr, idx_re_exlj_tr) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "wealth") %>%
  mutate(
    series = recode(series,
      idx_eq_tr      = "Equities: SBITOP (total return: price + dividends)",
      idx_re_eu_tr   = "Real estate: Slovenia (Eurostat HPI + rent model)",
      idx_re_si_tr   = "Real estate: Slovenia (SURS selected HPI + rent model)",
      idx_re_lj_tr   = "Real estate: Ljubljana (SURS selected HPI + rent model)",
      idx_re_exlj_tr = "Real estate: Slovenia ex-LJ (SURS selected HPI + rent model)"
    ),
    series = str_wrap(series, width = 34)
  )

legend_levels_1b <- str_wrap(c(
  "Equities: SBITOP (total return: price + dividends)",
  "Real estate: Slovenia (Eurostat HPI + rent model)",
  "Real estate: Slovenia (SURS selected HPI + rent model)",
  "Real estate: Ljubljana (SURS selected HPI + rent model)",
  "Real estate: Slovenia ex-LJ (SURS selected HPI + rent model)"
), width = 34)

plot_1b_data <- plot_1b_data %>%
  mutate(series = factor(series, levels = legend_levels_1b))

# 7) Optional event markers for narrative context (reuse same dates as Figure 1A)
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

y_top <- max(plot_1b_data$wealth, na.rm = TRUE)

# 8) Reuse the same thesis theme as Figure 3 for visual consistency
theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(14, "pt"),
      legend.spacing.x = unit(6, "pt"),
      legend.box.margin = margin(t = 8),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

# 9) Plot Figure 4
p1b <- ggplot(plot_1b_data, aes(x = date_q, y = wealth, color = series, linetype = series)) +
  geom_line(linewidth = 1.0) +
  scale_y_continuous(labels = label_number(accuracy = 1)) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  guides(
    color = guide_legend(ncol = 2, byrow = TRUE),
    linetype = "none"
  ) +
  labs(
    x = NULL,
    y = "Wealth index (base = 100 at start)"
  ) +
  theme_thesis() +
  geom_vline(
    data = events,
    aes(xintercept = date),
    linewidth = 0.4,
    linetype = 3,
    color = "grey60",
    inherit.aes = FALSE
  ) +
  geom_text(
    data = events,
    aes(x = date, y = 0.96 * y_top, label = label),
    vjust = -0.3,
    hjust = 0,
    size = 3.2,
    color = "grey35",
    inherit.aes = FALSE
  )

# 10) Print and save (same canvas approach as Figure 3 to prevent clipping)
p1b

ggsave(
  filename = "Figure_4_total_return_wealth_indices.png",
  plot = p1b,
  width = 12,
  height = 8.2,
  dpi = 300,
  limitsize = FALSE
)

3.3. Graph 5: Real returns

# ------------------------------------------------------------
# FIGURE 5: REAL total return wealth indices (inflation-adjusted), indexed to 100
#
# What this figure represents:
#   - Same total-return investor experience as Figure 1B (dividends + rent model),
#     but expressed in REAL terms by deflating with HICP all-items inflation.
#
# Method (consistent with your methodology):
#   1) Compute nominal total log returns for each asset:
#        - Equities: r_eq_total_log = log(P_t/P_{t-1}) + DY/4
#        - Real estate: r_re_total_log = log(HPI_t/HPI_{t-1}) + lag(y_t)/4
#   2) Compute quarterly inflation log change:
#        - pi_log = log(HICP_all_t / HICP_all_{t-1})
#   3) Compute real total log returns:
#        - r_real_log = r_nominal_log - pi_log
#   4) Build real wealth indices:
#        - idx_real = 100 * exp(cumsum(r_real_log))
#
# Output:
#   - Prints a clean figure in RStudio/Rmd (same style as Figures 1A and 1B)
#   - Saves a high-resolution PNG that will not clip the legend
# ------------------------------------------------------------

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(stringr)

# 1) Start from the master LEVELS dataset and ensure time order
levels_plot <- master %>% arrange(date_q)

# 2) Parameters for baseline rental income model (same as Figure 1B)
y0_ann <- 0.04  # base gross annual rental yield (baseline)

# 3) Robust base quarter: first quarter where all required series exist
base <- levels_plot %>%
  filter(
    !is.na(sbitop_close),
    !is.na(dy_eq_ann),
    !is.na(hicp_rent),
    !is.na(hicp_all),
    !is.na(hpi_si_eurostat),
    !is.na(hpi_si_surs_used),
    !is.na(hpi_lj_surs_used),
    !is.na(hpi_si_exlj_surs_used)
  ) %>%
  slice(1)

R0  <- base$hicp_rent
He0 <- base$hpi_si_eurostat
Hs0 <- base$hpi_si_surs_used
Hl0 <- base$hpi_lj_surs_used
Hx0 <- base$hpi_si_exlj_surs_used

# 4) Inflation log change (quarterly)
#    This is used to deflate nominal returns into real returns.
levels_plot <- levels_plot %>%
  mutate(
    pi_log = log(hicp_all / lag(hicp_all))
  )

# 5) Construct nominal total log returns (same construction as Figure 4)
levels_plot <- levels_plot %>%
  mutate(
    # Equity income: annual dividend yield allocated evenly across quarters
    dy_eq_q = dy_eq_ann / 4,

    # Equity nominal total log return
    r_eq_price_log = log(sbitop_close / lag(sbitop_close)),
    r_eq_total_log = r_eq_price_log + dy_eq_q,

    # Real estate price log returns
    r_re_eu_price_log   = log(hpi_si_eurostat / lag(hpi_si_eurostat)),
    r_re_si_price_log   = log(hpi_si_surs_used / lag(hpi_si_surs_used)),
    r_re_lj_price_log   = log(hpi_lj_surs_used / lag(hpi_lj_surs_used)),
    r_re_exlj_price_log = log(hpi_si_exlj_surs_used / lag(hpi_si_exlj_surs_used)),

    # Time-varying rental yields via rent/price ratio (anchored at base quarter)
    y_eu_t   = y0_ann * ((hicp_rent / R0) / (hpi_si_eurostat / He0)),
    y_si_t   = y0_ann * ((hicp_rent / R0) / (hpi_si_surs_used / Hs0)),
    y_lj_t   = y0_ann * ((hicp_rent / R0) / (hpi_lj_surs_used / Hl0)),
    y_exlj_t = y0_ann * ((hicp_rent / R0) / (hpi_si_exlj_surs_used / Hx0)),

    # Quarterly rental income (lagged yield to avoid look-ahead bias)
    r_re_eu_income_q   = lag(y_eu_t) / 4,
    r_re_si_income_q   = lag(y_si_t) / 4,
    r_re_lj_income_q   = lag(y_lj_t) / 4,
    r_re_exlj_income_q = lag(y_exlj_t) / 4,

    # Real estate nominal total log returns
    r_re_eu_total_log   = r_re_eu_price_log   + r_re_eu_income_q,
    r_re_si_total_log   = r_re_si_price_log   + r_re_si_income_q,
    r_re_lj_total_log   = r_re_lj_price_log   + r_re_lj_income_q,
    r_re_exlj_total_log = r_re_exlj_price_log + r_re_exlj_income_q
  )

# 6) Convert nominal total log returns to REAL total log returns by subtracting inflation
levels_plot <- levels_plot %>%
  mutate(
    r_eq_total_real_log      = r_eq_total_log      - pi_log,
    r_re_eu_total_real_log   = r_re_eu_total_log   - pi_log,
    r_re_si_total_real_log   = r_re_si_total_log   - pi_log,
    r_re_lj_total_real_log   = r_re_lj_total_log   - pi_log,
    r_re_exlj_total_real_log = r_re_exlj_total_log - pi_log
  )

# 7) Build REAL wealth indices from real log returns (start = 100)
levels_plot <- levels_plot %>%
  mutate(
    idx_eq_tr_real       = 100 * exp(cumsum(replace_na(r_eq_total_real_log, 0))),
    idx_re_eu_tr_real    = 100 * exp(cumsum(replace_na(r_re_eu_total_real_log, 0))),
    idx_re_si_tr_real    = 100 * exp(cumsum(replace_na(r_re_si_total_real_log, 0))),
    idx_re_lj_tr_real    = 100 * exp(cumsum(replace_na(r_re_lj_total_real_log, 0))),
    idx_re_exlj_tr_real  = 100 * exp(cumsum(replace_na(r_re_exlj_total_real_log, 0)))
  )

# 8) Convert to long format + readable labels (wrap to prevent legend clipping)
plot_1c_data <- levels_plot %>%
  select(date_q, idx_eq_tr_real, idx_re_eu_tr_real, idx_re_si_tr_real, idx_re_lj_tr_real, idx_re_exlj_tr_real) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "wealth_real") %>%
  mutate(
    series = recode(series,
      idx_eq_tr_real      = "Equities: SBITOP (real total return)",
      idx_re_eu_tr_real   = "Real estate: Slovenia (Eurostat HPI + rent model, real)",
      idx_re_si_tr_real   = "Real estate: Slovenia (SURS selected HPI + rent model, real)",
      idx_re_lj_tr_real   = "Real estate: Ljubljana (SURS selected HPI + rent model, real)",
      idx_re_exlj_tr_real = "Real estate: Slovenia ex-LJ (SURS selected HPI + rent model, real)"
    ),
    series = str_wrap(series, width = 34)
  )

legend_levels_1c <- str_wrap(c(
  "Equities: SBITOP (real total return)",
  "Real estate: Slovenia (Eurostat HPI + rent model, real)",
  "Real estate: Slovenia (SURS selected HPI + rent model, real)",
  "Real estate: Ljubljana (SURS selected HPI + rent model, real)",
  "Real estate: Slovenia ex-LJ (SURS selected HPI + rent model, real)"
), width = 34)

plot_1c_data <- plot_1c_data %>%
  mutate(series = factor(series, levels = legend_levels_1c))

# 9) Optional event markers (same as Figures 3 and 4)
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

y_top <- max(plot_1c_data$wealth_real, na.rm = TRUE)

# 10) Same thesis theme as Figures 3 and 4 (keep visual consistency)
theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(14, "pt"),
      legend.spacing.x = unit(6, "pt"),
      legend.box.margin = margin(t = 8),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

# 11) Plot Figure 5
p1c <- ggplot(plot_1c_data, aes(x = date_q, y = wealth_real, color = series, linetype = series)) +
  geom_line(linewidth = 1.0) +
  scale_y_continuous(labels = label_number(accuracy = 1)) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  guides(
    color = guide_legend(ncol = 2, byrow = TRUE),
    linetype = "none"
  ) +
  labs(
    x = NULL,
    y = "Real wealth index (base = 100 at start)"
  ) +
  theme_thesis() +
  geom_vline(
    data = events,
    aes(xintercept = date),
    linewidth = 0.4,
    linetype = 3,
    color = "grey60",
    inherit.aes = FALSE
  ) +
  geom_text(
    data = events,
    aes(x = date, y = 0.96 * y_top, label = label),
    vjust = -0.3,
    hjust = 0,
    size = 3.2,
    color = "grey35",
    inherit.aes = FALSE
  )

# 12) Print and save (same canvas as Figures 3 and 4)
p1c

ggsave(
  filename = "Figure_5_real_total_return_wealth_indices.png",
  plot = p1c,
  width = 12,
  height = 8.2,
  dpi = 300,
  limitsize = FALSE
)

3.3. Graph 6 & 7: Nominal & Real returns

# ============================================================
# FIGURE 6 & 7 (TWO-PANEL, TWO LEGENDS)
# Panel 1 (bigger): Equities vs Slovenia real estate (Eurostat)
# Panel 2 (smaller): Real estate comparison (Eurostat vs SURS regions)
# Each panel has its OWN legend directly below it.
# Colors/roles are consistent with Figures 3–5.
# ============================================================

library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(scales)
library(patchwork)

# -----------------------------
# 0) Thesis theme (consistent)
# -----------------------------
theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(14, "pt"),
      legend.spacing.x = unit(6, "pt"),
      legend.box.margin = margin(t = 6),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

# -----------------------------
# 1) Event markers (same dates)
# -----------------------------
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

add_events <- function(p, y_top) {
  p +
    geom_vline(
      data = events,
      aes(xintercept = date),
      linewidth = 0.4,
      linetype = 3,
      color = "grey60",
      inherit.aes = FALSE
    ) +
    geom_text(
      data = events,
      aes(x = date, y = 0.96 * y_top, label = label),
      vjust = -0.3,
      hjust = 0,
      size = 3.2,
      color = "grey35",
      inherit.aes = FALSE
    )
}

# -----------------------------
# 2) Global labels (stable)
# -----------------------------
LAB_EQ <- "Equities: SBITOP"
LAB_EU <- "Real estate: Slovenia (Eurostat)"
LAB_SI <- "Real estate: Slovenia (SURS)"
LAB_LJ <- "Real estate: Ljubljana (SURS)"
LAB_EX <- "Real estate: Slovenia ex-LJ (SURS)"

# Legend order (global + per panel)
LEGEND_ALL <- c(LAB_EQ, LAB_EU, LAB_SI, LAB_LJ, LAB_EX)
LEGEND_P1  <- c(LAB_EQ, LAB_EU)
LEGEND_P2  <- c(LAB_EU, LAB_SI, LAB_LJ, LAB_EX)

# -----------------------------
# 3) Global colors/linetypes
# (match roles used in 3–5)
# -----------------------------
COLORS <- c(
  `Equities: SBITOP`                    = "#F8766D", # red
  `Real estate: Slovenia (Eurostat)`    = "#B79F00", # mustard
  `Real estate: Slovenia (SURS)`        = "#00BFC4", # teal
  `Real estate: Ljubljana (SURS)`       = "#619CFF", # blue
  `Real estate: Slovenia ex-LJ (SURS)`  = "#C77CFF"  # purple
)

LTYPES <- c(
  `Equities: SBITOP`                    = "solid",
  `Real estate: Slovenia (Eurostat)`    = "dashed",
  `Real estate: Slovenia (SURS)`        = "dashed",
  `Real estate: Ljubljana (SURS)`       = "dashed",
  `Real estate: Slovenia ex-LJ (SURS)`  = "dotted"
)

# -----------------------------
# 4) Build return series ONCE
# (assumes master exists)
# -----------------------------
y0_ann <- 0.04
df <- master %>% arrange(date_q)

# Anchors for rent-yield model (robust)
R0  <- df %>% filter(!is.na(hicp_rent)) %>% slice(1) %>% pull(hicp_rent)
He0 <- df %>% filter(!is.na(hpi_si_eurostat)) %>% slice(1) %>% pull(hpi_si_eurostat)
Hs0 <- df %>% filter(!is.na(hpi_si_surs_used)) %>% slice(1) %>% pull(hpi_si_surs_used)
Hl0 <- df %>% filter(!is.na(hpi_lj_surs_used)) %>% slice(1) %>% pull(hpi_lj_surs_used)
Hx0 <- df %>% filter(!is.na(hpi_si_exlj_surs_used)) %>% slice(1) %>% pull(hpi_si_exlj_surs_used)

df <- df %>%
  mutate(
    # Inflation (quarterly log change)
    pi_log = log(hicp_all / lag(hicp_all)),

    # Equities: nominal total log return
    dy_eq_q        = dy_eq_ann / 4,
    r_eq_price_log = log(sbitop_close / lag(sbitop_close)),
    r_eq_total_log = r_eq_price_log + dy_eq_q,

    # Real estate (Eurostat): nominal total log return
    r_re_eu_price_log = log(hpi_si_eurostat / lag(hpi_si_eurostat)),
    y_eu_t            = y0_ann * ((hicp_rent / R0) / (hpi_si_eurostat / He0)),
    r_re_eu_income_q  = lag(y_eu_t) / 4,
    r_re_eu_total_log = r_re_eu_price_log + r_re_eu_income_q,

    # Real estate (SURS regions): nominal total log returns
    r_re_si_price_log = log(hpi_si_surs_used / lag(hpi_si_surs_used)),
    r_re_lj_price_log = log(hpi_lj_surs_used / lag(hpi_lj_surs_used)),
    r_re_ex_price_log = log(hpi_si_exlj_surs_used / lag(hpi_si_exlj_surs_used)),

    y_si_t = y0_ann * ((hicp_rent / R0) / (hpi_si_surs_used / Hs0)),
    y_lj_t = y0_ann * ((hicp_rent / R0) / (hpi_lj_surs_used / Hl0)),
    y_ex_t = y0_ann * ((hicp_rent / R0) / (hpi_si_exlj_surs_used / Hx0)),

    r_re_si_income_q = lag(y_si_t) / 4,
    r_re_lj_income_q = lag(y_lj_t) / 4,
    r_re_ex_income_q = lag(y_ex_t) / 4,

    r_re_si_total_log = r_re_si_price_log + r_re_si_income_q,
    r_re_lj_total_log = r_re_lj_price_log + r_re_lj_income_q,
    r_re_ex_total_log = r_re_ex_price_log + r_re_ex_income_q,

    # Real (inflation-adjusted) total log returns
    r_eq_total_real_log    = r_eq_total_log    - pi_log,
    r_re_eu_total_real_log = r_re_eu_total_log - pi_log,
    r_re_si_total_real_log = r_re_si_total_log - pi_log,
    r_re_lj_total_real_log = r_re_lj_total_log - pi_log,
    r_re_ex_total_real_log = r_re_ex_total_log - pi_log
  )

# -----------------------------
# 5) Factory: 2-panel figure with 2 legends
# -----------------------------
make_two_panel_two_legends <- function(df,
                                       eq_col, eu_col, si_col, lj_col, ex_col,
                                      fig_subtitle, ylab,
                                       out_file) {

  # Panel 1 data: EQ vs Eurostat
  p1_data <- df %>%
    transmute(
      date_q,
      `Equities: SBITOP` = .data[[eq_col]],
      `Real estate: Slovenia (Eurostat)` = .data[[eu_col]]
    ) %>%
    pivot_longer(-date_q, names_to = "series", values_to = "ret") %>%
    filter(!is.na(ret)) %>%
    mutate(series = factor(series, levels = LEGEND_ALL))

  # Panel 2 data: RE comparison only
  p2_data <- df %>%
    transmute(
      date_q,
      `Real estate: Slovenia (Eurostat)`   = .data[[eu_col]],
      `Real estate: Slovenia (SURS)`       = .data[[si_col]],
      `Real estate: Ljubljana (SURS)`      = .data[[lj_col]],
      `Real estate: Slovenia ex-LJ (SURS)` = .data[[ex_col]]
    ) %>%
    pivot_longer(-date_q, names_to = "series", values_to = "ret") %>%
    filter(!is.na(ret)) %>%
    mutate(series = factor(series, levels = LEGEND_ALL))

  y_top_p1 <- max(p1_data$ret, na.rm = TRUE)
  y_top_p2 <- max(p2_data$ret, na.rm = TRUE)

  # Panel 1 plot
  panel1 <- ggplot(p1_data, aes(date_q, ret, color = series, linetype = series)) +
    geom_hline(yintercept = 0, linewidth = 0.4, color = "grey70") +
    geom_line(linewidth = 1.05) +
    scale_color_manual(values = COLORS, breaks = LEGEND_P1, drop = FALSE) +
    scale_linetype_manual(values = LTYPES, breaks = LEGEND_P1, drop = FALSE) +
    scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
    scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
    labs(
      subtitle = fig_subtitle,
      x = NULL,
      y = ylab
    ) +
    theme_thesis() +
    theme(
      legend.position = "bottom",
      plot.margin = margin(t = 10, r = 10, b = 6, l = 10)
    ) +
    guides(
      color = guide_legend(ncol = 2, byrow = TRUE),
      linetype = "none"
    )

  panel1 <- add_events(panel1, y_top_p1)

  # Panel 2 plot
  panel2 <- ggplot(p2_data, aes(date_q, ret, color = series, linetype = series)) +
    geom_hline(yintercept = 0, linewidth = 0.4, color = "grey70") +
    geom_line(linewidth = 0.95) +
    scale_color_manual(values = COLORS, breaks = LEGEND_P2, drop = FALSE) +
    scale_linetype_manual(values = LTYPES, breaks = LEGEND_P2, drop = FALSE) +
    scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
    scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
    labs(
      subtitle = "Panel 2: Real estate comparison (Eurostat vs SURS regions)",
      x = NULL, y = NULL
    ) +
    theme_thesis() +
    theme(
      plot.subtitle = element_text(size = 10, color = "grey30", face = "bold"),
      legend.position = "bottom",
      plot.margin = margin(t = 0, r = 10, b = 10, l = 10)
    ) +
    guides(
      color = guide_legend(ncol = 2, byrow = TRUE),
      linetype = "none"
    )

  panel2 <- add_events(panel2, y_top_p2)

  # Combine (Panel 1 larger)
  fig <- panel1 / panel2 + plot_layout(heights = c(2.2, 1))

  print(fig)

  ggsave(
    filename = out_file,
    plot = fig,
    width = 12,
    height = 10.4,
    dpi = 300,
    limitsize = FALSE
  )
}

# -----------------------------
# 6) FIGURE 6 (Nominal)
# -----------------------------
make_two_panel_two_legends(
  df = df,
  eq_col = "r_eq_total_log",
  eu_col = "r_re_eu_total_log",
  si_col = "r_re_si_total_log",
  lj_col = "r_re_lj_total_log",
  ex_col = "r_re_ex_total_log",
  fig_subtitle = "Panel 1: Equities vs Slovenia real estate (Eurostat).",
  ylab = "Quarterly total return (log, nominal)",
  out_file = "Figure_6_quarterly_nominal_total_returns.png"
)

# -----------------------------
# 7) FIGURE 7 (Real)
# -----------------------------
make_two_panel_two_legends(
  df = df,
  eq_col = "r_eq_total_real_log",
  eu_col = "r_re_eu_total_real_log",
  si_col = "r_re_si_total_real_log",
  lj_col = "r_re_lj_total_real_log",
  ex_col = "r_re_ex_total_real_log",
  fig_subtitle = "Panel 1: Equities vs Eurostat Slovenia real estate.",
  ylab = "Quarterly total return (log, real)",
  out_file = "Figure_7_quarterly_real_total_returns.png"
)

3.4. Graph 8 & 9: Return distribution & QQ plot

# ============================================================
# Figures:
#   - Figure 8: Return distribution (density) — equities vs Eurostat real estate (NOMINAL)
#   - Figure 9: QQ plots vs Normal — equities and Eurostat real estate (NOMINAL)
#
# Scope (as requested): ONLY 2 asset classes
#   1) Equities: SBITOP (nominal total log return)
#   2) Real estate: Slovenia (Eurostat, nominal total log return)
#
# Notes:
# - Uses quarterly NOMINAL log total returns already computed earlier in your script:
#     r_eq_total_log, r_re_eu_total_log
# - If you have not computed these columns yet, run the Figure 2A/2B data-prep chunk first.
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

# -----------------------------
# 1) Prepare the return dataset
# -----------------------------
ret2 <- df %>%   # df from your previous chunk where returns are computed
  select(date_q, r_eq_total_log, r_re_eu_total_log) %>%
  pivot_longer(-date_q, names_to = "asset", values_to = "ret_log") %>%
  filter(!is.na(ret_log)) %>%
  mutate(
    asset = recode(asset,
      r_eq_total_log    = "Equities: SBITOP",
      r_re_eu_total_log = "Real estate: Slovenia (Eurostat)"
    ),
    # keep the same color roles as earlier figures (SBITOP red, Eurostat mustard)
    asset = factor(asset, levels = c("Equities: SBITOP", "Real estate: Slovenia (Eurostat)"))
  )

# manual colors consistent with your earlier palette
COL_2ASSETS <- c(
  "Equities: SBITOP" = "#F8766D",
  "Real estate: Slovenia (Eurostat)" = "#B79F00"
)

# -----------------------------
# 2) Figure 8: Distribution plot
# -----------------------------
# Density is usually cleaner than histogram for thesis comparisons.
# Keep y-axis as density and x-axis in percent units.
p2c <- ggplot(ret2, aes(x = ret_log, color = asset)) +
  geom_density(linewidth = 1.05, adjust = 1.05) +
  geom_vline(xintercept = 0, linewidth = 0.35, color = "grey65") +
  scale_color_manual(values = COL_2ASSETS) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    x = "Quarterly total return (log, nominal)",
    y = "Density"
  ) +
  theme_thesis() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()
  )

p2c

ggsave(
  filename = "Figure_8_return_distribution_density_nominal.png",
  plot = p2c,
  width = 12,
  height = 6.8,
  dpi = 300,
  limitsize = FALSE
)

# -----------------------------
# 3) Figure 9: QQ plots vs Normal
# -----------------------------
# Approach:
# - Build two QQ plots (one per asset) and stack them.
# - Use sample mean/sd implied by ggplot's stat_qq_line().
# - This visually shows tail fatness / deviations from normality.

# 1) Build a clean long dataset of nominal quarterly log total returns (same inputs as Fig 2C)
qq_df <- df %>%
  select(date_q, r_eq_total_log, r_re_eu_total_log) %>%
  pivot_longer(-date_q, names_to = "asset", values_to = "ret_log") %>%
  filter(!is.na(ret_log)) %>%
  mutate(
    asset = recode(asset,
      r_eq_total_log    = "Equities: SBITOP",
      r_re_eu_total_log = "Real estate: Slovenia (Eurostat)"
    ),
    asset = factor(asset, levels = c("Equities: SBITOP", "Real estate: Slovenia (Eurostat)"))
  )

# 2) Use the SAME y-scale in both panels:
#    - set limits using the combined min/max across BOTH assets
y_lim <- range(qq_df$ret_log, na.rm = TRUE)

# 3) Use the SAME x-scale in both panels:
#    - theoretical quantiles depend only on n; use the MAX sample size (conservative)
n_max <- qq_df %>% count(asset) %>% pull(n) %>% max()
x_lim <- range(qnorm(ppoints(n_max)))  # same theoretical range for both panels

# 4) Colors consistent with your previous figures
COL_2ASSETS <- c(
  "Equities: SBITOP" = "#F8766D",
  "Real estate: Slovenia (Eurostat)" = "#B79F00"
)

# 5) Plot: separate QQ line per panel (normal line fitted to that asset),
#    but identical axis limits so the comparison is apples-to-apples.
p2d <- ggplot(qq_df, aes(sample = ret_log, color = asset)) +
  stat_qq(size = 1.6, alpha = 0.85) +
  stat_qq_line(linewidth = 0.8, color = "grey35") +
  facet_wrap(~ asset, ncol = 1) +
  scale_color_manual(values = COL_2ASSETS, guide = "none") +
  coord_cartesian(xlim = x_lim, ylim = y_lim) +
  scale_x_continuous(labels = label_number(accuracy = 0.1)) +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  labs(
    x = "Theoretical quantiles (Normal)",
    y = "Sample quantiles (quarterly log total return, nominal)"
  ) +
  theme_thesis() +
  theme(strip.text = element_text(face = "bold"))

p2d

ggsave(
  filename = "Figure_9_QQplots_same_scale_nominal.png",
  plot = p2d,
  width = 12,
  height = 8.2,
  dpi = 300,
  limitsize = FALSE
)

3.5. Graph 10: Rolling volatility

# ============================================================
# FIGURE 10: Rolling volatility (8-quarter rolling standard deviation)
# + Event markers (Euro area crisis, COVID-19 shock, Inflation & rate shock)
# Panel 1: SBITOP vs RE Slovenia (Eurostat)
# Panel 2: RE comparison (Eurostat vs SURS SI/LJ/ex-LJ)
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(zoo)
library(patchwork)

# --- Rolling window length (8 quarters = 2 years) ---
win_q <- 8

# --- Event markers (same as previous figures) ---
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

# --- Colors consistent with Figures 1A–2D ---
COL_5 <- c(
  "Equities: SBITOP" = "#F8766D",
  "Real estate: Slovenia (Eurostat)" = "#B79F00",
  "Real estate: Slovenia (SURS)" = "#00BFC4",
  "Real estate: Ljubljana (SURS)" = "#619CFF",
  "Real estate: Slovenia ex-LJ (SURS)" = "#C77CFF"
)

# --- Linetypes: equities solid; real estate dashed (as in prior figures) ---
LT_5 <- c(
  "Equities: SBITOP" = "solid",
  "Real estate: Slovenia (Eurostat)" = "dashed",
  "Real estate: Slovenia (SURS)" = "dashed",
  "Real estate: Ljubljana (SURS)" = "dashed",
  "Real estate: Slovenia ex-LJ (SURS)" = "dashed"
)

# --- Rolling SD helper ---
roll_sd <- function(x, k) {
  zoo::rollapply(x, width = k, FUN = sd, fill = NA, align = "right", na.rm = TRUE)
}

# 1) Build df_vol (assumes df exists from your Figure 2A/2B chunk)
df_vol <- df %>%
  arrange(date_q) %>%
  select(
    date_q,
    r_eq_total_log,
    r_re_eu_total_log,
    r_re_si_total_log,
    r_re_lj_total_log,
    r_re_ex_total_log
  ) %>%
  mutate(
    vol_eq      = roll_sd(r_eq_total_log, win_q),
    vol_re_eu   = roll_sd(r_re_eu_total_log, win_q),
    vol_re_si   = roll_sd(r_re_si_total_log, win_q),
    vol_re_lj   = roll_sd(r_re_lj_total_log, win_q),
    vol_re_exlj = roll_sd(r_re_ex_total_log, win_q)
  )

# 2) Panel 1 data (ONLY SBITOP + Eurostat)
p3a_p1_data <- df_vol %>%
  select(date_q, vol_eq, vol_re_eu) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "vol") %>%
  filter(!is.na(vol)) %>%
  mutate(
    series = recode(series,
      vol_eq    = "Equities: SBITOP",
      vol_re_eu = "Real estate: Slovenia (Eurostat)"
    ),
    series = factor(series, levels = c("Equities: SBITOP", "Real estate: Slovenia (Eurostat)"))
  )

# 3) Panel 2 data (ONLY real estate series)
p3a_p2_data <- df_vol %>%
  select(date_q, vol_re_eu, vol_re_si, vol_re_lj, vol_re_exlj) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "vol") %>%
  filter(!is.na(vol)) %>%
  mutate(
    series = recode(series,
      vol_re_eu   = "Real estate: Slovenia (Eurostat)",
      vol_re_si   = "Real estate: Slovenia (SURS)",
      vol_re_lj   = "Real estate: Ljubljana (SURS)",
      vol_re_exlj = "Real estate: Slovenia ex-LJ (SURS)"
    ),
    series = factor(series, levels = c(
      "Real estate: Slovenia (Eurostat)",
      "Real estate: Slovenia (SURS)",
      "Real estate: Ljubljana (SURS)",
      "Real estate: Slovenia ex-LJ (SURS)"
    ))
  )

# 4) Y-positions for event labels (panel-specific so labels sit near the top)
y_top_p1 <- max(p3a_p1_data$vol, na.rm = TRUE)
y_top_p2 <- max(p3a_p2_data$vol, na.rm = TRUE)

# 5) Panel 1 plot + events
p3a_panel1 <- ggplot(p3a_p1_data, aes(date_q, vol, color = series, linetype = series)) +
  geom_line(linewidth = 1.05) +
  scale_color_manual(values = COL_5, drop = FALSE) +
  scale_linetype_manual(values = LT_5, drop = FALSE) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  labs(
    subtitle = "Panel 1: SBITOP vs Slovenia real estate (Eurostat).",
    x = NULL,
    y = "Rolling SD of quarterly total returns (log, nominal)"
  ) +
  theme_thesis() +
  guides(color = guide_legend(ncol = 2, byrow = TRUE), linetype = "none") +
  geom_vline(
    data = events, aes(xintercept = date),
    linewidth = 0.4, linetype = 3, color = "grey60", inherit.aes = FALSE
  ) +
  geom_text(
    data = events, aes(x = date, y = 0.96 * y_top_p1, label = label),
    vjust = -0.3, hjust = 0, size = 3.2, color = "grey35", inherit.aes = FALSE
  )

# 6) Panel 2 plot + events
p3a_panel2 <- ggplot(p3a_p2_data, aes(date_q, vol, color = series, linetype = series)) +
  geom_line(linewidth = 0.95) +
  scale_color_manual(values = COL_5, drop = FALSE) +
  scale_linetype_manual(values = LT_5, drop = FALSE) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  labs(
    subtitle = "Panel 2: Real estate volatility comparison (Eurostat vs SURS regions)",
    x = NULL,
    y = NULL
  ) +
  theme_thesis() +
  guides(color = guide_legend(ncol = 2, byrow = TRUE), linetype = "none") +
  geom_vline(
    data = events, aes(xintercept = date),
    linewidth = 0.4, linetype = 3, color = "grey60", inherit.aes = FALSE
  ) +
  geom_text(
    data = events, aes(x = date, y = 0.96 * y_top_p2, label = label),
    vjust = -0.3, hjust = 0, size = 3.2, color = "grey35", inherit.aes = FALSE
  )

# 7) Combine (Panel 1 larger)
p3a <- p3a_panel1 / p3a_panel2 + patchwork::plot_layout(heights = c(2.2, 1))

p3a

ggsave(
  filename = "Figure_10_rolling_volatility_8q.png",
  plot = p3a,
  width = 12,
  height = 8.2,
  dpi = 300,
  limitsize = FALSE
)

3.6. Graph 11: Drawdowns

# ============================================================
# FIGURE 11: Drawdowns (underwater plot)
# - Panel 1 (larger): SBITOP vs Slovenia real estate (Eurostat)
# - Panel 2 (smaller): Real estate drawdowns (Eurostat vs SURS regions)
# - Separate legends per panel (only series shown)
# - Event labels visible in BOTH panels
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(patchwork)

# ---- Event markers (same as before) ----
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

# ---- Colours consistent with your previous figures ----
COLS <- c(
  "Equities: SBITOP"                  = "#F8766D",
  "Real estate: Slovenia (Eurostat)"  = "#B79F00",
  "Real estate: Slovenia (SURS)"      = "#00BFC4",
  "Real estate: Ljubljana (SURS)"     = "#619CFF",
  "Real estate: Slovenia ex-LJ (SURS)"= "#C77CFF"
)

# ---- Linetypes consistent with your convention ----
LTYPES <- c(
  "Equities: SBITOP"                  = "solid",
  "Real estate: Slovenia (Eurostat)"  = "dashed",
  "Real estate: Slovenia (SURS)"      = "dashed",
  "Real estate: Ljubljana (SURS)"     = "dashed",
  "Real estate: Slovenia ex-LJ (SURS)"= "dashed"
)

# ---- Helper: wealth index from log returns ----
to_wealth <- function(r_log) 100 * exp(cumsum(replace_na(r_log, 0)))

# ---- Helper: drawdown from wealth index (in decimals, <= 0) ----
to_drawdown <- function(w) (w / cummax(w)) - 1

# ---- IMPORTANT: this assumes your 'df' already exists from Figure 6/7
# and contains nominal total log returns:
#   r_eq_total_log, r_re_eu_total_log, r_re_si_total_log, r_re_lj_total_log, r_re_exlj_total_log
# If not, create it first via your Figure 2A/2B return-construction chunk.
df_dd <- df %>%
  arrange(date_q) %>%
  select(date_q,
         r_eq_total_log,
         r_re_eu_total_log,
         r_re_si_total_log,
         r_re_lj_total_log,
         r_re_ex_total_log) %>%
  mutate(
    w_eq      = to_wealth(r_eq_total_log),
    w_re_eu   = to_wealth(r_re_eu_total_log),
    w_re_si   = to_wealth(r_re_si_total_log),
    w_re_lj   = to_wealth(r_re_lj_total_log),
    w_re_ex = to_wealth(r_re_ex_total_log),

    dd_eq      = to_drawdown(w_eq),
    dd_re_eu   = to_drawdown(w_re_eu),
    dd_re_si   = to_drawdown(w_re_si),
    dd_re_lj   = to_drawdown(w_re_lj),
    dd_re_ex = to_drawdown(w_re_ex)
  )

# ------------------------------------------------------------
# Panel 1: Equity vs Eurostat RE
# ------------------------------------------------------------
p3b_p1 <- df_dd %>%
  select(date_q, dd_eq, dd_re_eu) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "dd") %>%
  mutate(
    series = recode(series,
      dd_eq    = "Equities: SBITOP",
      dd_re_eu = "Real estate: Slovenia (Eurostat)"
    )
  ) %>%
  filter(!is.na(dd))

# Place event labels slightly below 0 so they show on underwater plots
y_lab_p1 <- -0.02

p_panel1 <- ggplot(p3b_p1, aes(date_q, dd, color = series, linetype = series)) +
  geom_hline(yintercept = 0, linewidth = 0.4, color = "grey60") +
  geom_line(linewidth = 1.05) +
  scale_color_manual(values = COLS, breaks = c("Equities: SBITOP", "Real estate: Slovenia (Eurostat)")) +
  scale_linetype_manual(values = LTYPES, breaks = c("Equities: SBITOP", "Real estate: Slovenia (Eurostat)")) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    subtitle = "Panel 1: SBITOP vs Slovenia real estate (Eurostat).",
    x = NULL,
    y = "Drawdown from peak (%, nominal)"
  ) +
  theme_thesis() +
  theme(
    legend.position = "bottom",
    plot.margin = margin(t = 10, r = 10, b = 8, l = 10)
  ) +
  geom_vline(
    data = events, aes(xintercept = date),
    linewidth = 0.4, linetype = 3, color = "grey60",
    inherit.aes = FALSE
  ) +
  geom_text(
    data = events,
    aes(x = date, y = y_lab_p1, label = label),
    hjust = 0, vjust = -0.3, size = 3.2, color = "grey35",
    inherit.aes = FALSE
  ) +
  coord_cartesian(clip = "off")

# ------------------------------------------------------------
# Panel 2: Real estate drawdowns (Eurostat + SURS regions)
# ------------------------------------------------------------
p3b_p2 <- df_dd %>%
  select(date_q, dd_re_eu, dd_re_si, dd_re_lj, dd_re_ex) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "dd") %>%
  mutate(
    series = recode(series,
      dd_re_eu   = "Real estate: Slovenia (Eurostat)",
      dd_re_si   = "Real estate: Slovenia (SURS)",
      dd_re_lj   = "Real estate: Ljubljana (SURS)",
      dd_re_ex = "Real estate: Slovenia ex-LJ (SURS)"
    )
  ) %>%
  filter(!is.na(dd))

y_lab_p2 <- -0.01

p_panel2 <- ggplot(p3b_p2, aes(date_q, dd, color = series, linetype = series)) +
  geom_hline(yintercept = 0, linewidth = 0.4, color = "grey60") +
  geom_line(linewidth = 0.95) +
  scale_color_manual(
    values = COLS,
    breaks = c(
      "Real estate: Slovenia (Eurostat)",
      "Real estate: Slovenia (SURS)",
      "Real estate: Ljubljana (SURS)",
      "Real estate: Slovenia ex-LJ (SURS)"
    )
  ) +
  scale_linetype_manual(
    values = LTYPES,
    breaks = c(
      "Real estate: Slovenia (Eurostat)",
      "Real estate: Slovenia (SURS)",
      "Real estate: Ljubljana (SURS)",
      "Real estate: Slovenia ex-LJ (SURS)"
    )
  ) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    subtitle = "Panel 2: Real estate drawdowns (Eurostat vs SURS regions)",
    x = NULL,
    y = NULL
  ) +
  theme_thesis() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 12),
    plot.margin = margin(t = 2, r = 10, b = 10, l = 10)
  ) +
  geom_vline(
    data = events, aes(xintercept = date),
    linewidth = 0.4, linetype = 3, color = "grey60",
    inherit.aes = FALSE
  ) +
  geom_text(
    data = events,
    aes(x = date, y = y_lab_p2, label = label),
    hjust = 0, vjust = -0.3, size = 3.0, color = "grey35",
    inherit.aes = FALSE
  ) +
  coord_cartesian(clip = "off")

# ------------------------------------------------------------
# Combine: Panel 1 emphasized (larger) + Panel 2 smaller
# ------------------------------------------------------------
p3b <- p_panel1 / p_panel2 +
  plot_layout(heights = c(2.6, 1.0))

p3b

ggsave(
  filename = "Figure_11_drawdowns_underwater_emphasized.png",
  plot = p3b,
  width = 13, height = 9.8, dpi = 300,
  limitsize = FALSE
)

3.7. Graph 12: Extreme quarters

# ============================================================
# FIGURE 12: Tail events = worst 5 + best 5 quarters
# - Horizontal bar chart, labeled
# - Two assets: SBITOP vs Slovenia RE (Eurostat)
# - Uses nominal quarterly total returns (log)
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(lubridate)
library(patchwork)

# 1) Prepare the two asset return series (nominal total log returns)
ret_long <- df %>%
  transmute(
    date_q,
    `Equities: SBITOP` = r_eq_total_log,
    `Real estate: Slovenia (Eurostat)` = r_re_eu_total_log
  ) %>%
  pivot_longer(-date_q, names_to = "asset", values_to = "ret_log") %>%
  filter(!is.na(ret_log)) %>%
  mutate(
    q_label = paste0(year(date_q), " Q", quarter(date_q))
  )

# 2) Select worst 5 and best 5 per asset
worst5 <- ret_long %>%
  group_by(asset) %>%
  slice_min(order_by = ret_log, n = 5, with_ties = FALSE) %>%
  mutate(tail = "Worst 5")

best5 <- ret_long %>%
  group_by(asset) %>%
  slice_max(order_by = ret_log, n = 5, with_ties = FALSE) %>%
  mutate(tail = "Best 5")

extremes <- bind_rows(worst5, best5) %>%
  ungroup() %>%
  mutate(
    tail  = factor(tail, levels = c("Worst 5", "Best 5")),
    label = percent(ret_log, accuracy = 0.1)
  )

# 3) Force chronological order for ONLY the selected quarters
#    Oldest at TOP -> newest at BOTTOM
q_levels_used <- extremes %>%
  distinct(date_q, q_label) %>%
  arrange(date_q) %>%
  pull(q_label)

extremes <- extremes %>%
  mutate(q_label = factor(q_label, levels = rev(q_levels_used)))  # rev => oldest top

# 4) Colours consistent with earlier figures
COL_2 <- c(
  "Equities: SBITOP" = "#F8766D",
  "Real estate: Slovenia (Eurostat)" = "#B79F00"
)

# 5) Make sure labels never clip: explicit x-limits with padding
xmin <- min(extremes$ret_log, na.rm = TRUE)
xmax <- max(extremes$ret_log, na.rm = TRUE)

pad_left  <- 0.05  # extra room for negative labels (fixes -22.2%)
pad_right <- 0.03

# 6) Plot
p3c <- ggplot(extremes, aes(x = ret_log, y = q_label, fill = asset)) +
  geom_vline(xintercept = 0, linewidth = 0.35, color = "grey65") +
  geom_col(width = 0.72) +
  geom_text(
    aes(label = label),
    hjust = ifelse(extremes$ret_log >= 0, -0.08, 1.18),  # push negative labels further left
    size = 3.6,
    color = "grey20"
  ) +
  facet_grid(asset ~ tail, scales = "free_y") +
  scale_fill_manual(values = COL_2) +
  scale_x_continuous(
    limits = c(xmin - pad_left, xmax + pad_right),
    labels = percent_format(accuracy = 1)
  ) +
  labs(
    x = "Quarterly total return (log, nominal)",
    y = NULL
  ) +
  theme_thesis() +
  theme(
    legend.position = "none",
    strip.text = element_text(face = "bold"),
    # extra margins prevent text clipping at left/right edges
    plot.margin = margin(t = 10, r = 28, b = 12, l = 50)
  )

p3c

# 7) Save: slightly wider so nothing is cropped
ggsave(
  "Figure_12_extreme_quarters_timeline.png",
  p3c,
  width = 15.5, height = 8.6, dpi = 300,
  limitsize = FALSE
)

3.8. Graph 13: Rolling correlation

# ============================================================
# FIGURE 13: Rolling correlation (diversification stability)
# - 12-quarter rolling correlation between equity and real estate returns
# - Uses nominal total log returns:
#     * Equities: SBITOP (r_eq_total_log)
#     * Real estate: Slovenia (Eurostat) (r_re_eu_total_log)
# - Same visual language as earlier figures (theme_thesis + event markers)
# ============================================================

library(dplyr)
library(ggplot2)
library(scales)
library(zoo)

# ---- Event markers (reuse the same dates as before) ----
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

# ---- Rolling window length (12 quarters = 3 years) ----
win_q <- 12

# ---- Rolling correlation helper (align right: uses previous 12 quarters) ----
roll_corr <- function(x, y, k) {
  zoo::rollapply(
    data = cbind(x, y),
    width = k,
    FUN = function(z) cor(z[, 1], z[, 2], use = "complete.obs"),
    by.column = FALSE,
    align = "right",
    fill = NA
  )
}

# ---- Build working df (assumes df exists from your Figure 2A/2B chunk) ----
# df must contain: date_q, r_eq_total_log, r_re_eu_total_log
df_corr <- df %>%
  arrange(date_q) %>%
  transmute(
    date_q,
    r_eq = r_eq_total_log,
    r_re = r_re_eu_total_log
  ) %>%
  mutate(
    corr_12q = roll_corr(r_eq, r_re, win_q)
  ) %>%
  filter(!is.na(corr_12q))

# ---- Y position for event labels (keep consistent placement) ----
y_top <- max(df_corr$corr_12q, na.rm = TRUE)

# ---- Plot ----
p4a <- ggplot(df_corr, aes(x = date_q, y = corr_12q)) +
  geom_hline(yintercept = 0, linewidth = 0.4, color = "grey70") +
  geom_line(linewidth = 1.05, color = "grey20") +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.25)) +
  labs(
    x = NULL,
    y = "Rolling correlation"
  ) +
  theme_thesis() +
  geom_vline(
    data = events,
    aes(xintercept = date),
    linewidth = 0.4,
    linetype = 3,
    color = "grey60",
    inherit.aes = FALSE
  ) +
  geom_text(
    data = events,
    aes(x = date, y = 0.96 * y_top, label = label),
    vjust = -0.3,
    hjust = 0,
    size = 3.2,
    color = "grey35",
    inherit.aes = FALSE
  )

p4a

ggsave(
  filename = "Figure_13_rolling_correlation_12q.png",
  plot = p4a,
  width = 12,
  height = 8.2,
  dpi = 300,
  limitsize = FALSE
)

3.9. Graph 14: Scatter and conditional behavior

# ============================================================
# FIGURE 14: Scatter + conditional behaviour (RE vs EQ returns)
# - Uses quarterly nominal total log returns:
#     * Equity: r_eq_total_log
#     * Real estate: r_re_eu_total_log (Eurostat Slovenia)
# - Highlights crisis regimes (same event dates as earlier figures)
# ============================================================

library(dplyr)
library(ggplot2)
library(scales)

# ---- Toggle: one line overall vs separate lines by regime ----
by_regime <- TRUE   # set FALSE if you want one fitted line only

# ---- Event dates (same as your vertical markers) ----
d_euro  <- as.Date("2011-09-30")
d_covid <- as.Date("2020-03-31")
d_infl  <- as.Date("2022-03-31")

# ---- Regime windows (keep simple + readable) ----
# You can adjust these, but don't over-engineer: the goal is visual intuition.
w_euro  <- 2  # quarters on each side of event
w_covid <- 2
w_infl  <- 2

# Helper to create +/- k quarter window in Date terms
add_q <- function(d, k) seq(d, by = "quarter", length.out = abs(k) + 1)[abs(k) + 1]  # safe-ish
# More robust: use zoo/tsibble; but this works if date_q is quarter-end dates.

# ---- Colors consistent with your thesis palette ----
COL_EQ <- "#F8766D"  # SBITOP
COL_RE <- "#B79F00"  # Eurostat RE
COLS_REGIME <- c(
  "Non-event quarters"               = "grey",
  "Euro area crisis quarters"     = "green",
  "COVID-19 shock quarters"       = "red",
  "Inflation & rate shock quarters" = "orange"
)

# ============================================================
# 1) Build scatter dataset (assumes df already exists from Fig 2A/2B)
#    and contains: date_q, r_eq_total_log, r_re_eu_total_log
# ============================================================

scat <- df %>%
  select(date_q, r_eq_total_log, r_re_eu_total_log) %>%
  filter(!is.na(r_eq_total_log), !is.na(r_re_eu_total_log)) %>%
  mutate(
    # Label event windows as regimes; otherwise "Normal"
    regime = case_when(
      date_q >= (d_euro  - 90*w_euro)  & date_q <= (d_euro  + 90*w_euro)  ~ "Euro area crisis quarters",
      date_q >= (d_covid - 90*w_covid) & date_q <= (d_covid + 90*w_covid) ~ "COVID-19 shock quarters",
      date_q >= (d_infl  - 90*w_infl)  & date_q <= (d_infl  + 90*w_infl)  ~ "Inflation & rate shock quarters",
      TRUE ~ "Non-event quarters"
    ),
    regime = factor(regime, levels = c("Non-event quarters","Euro area crisis quarters","COVID-19 shock quarters","Inflation & rate shock quarters"))
  )

# ============================================================
# 2) Plot
# ============================================================

p4b <- ggplot(scat, aes(x = r_eq_total_log, y = r_re_eu_total_log)) +
  geom_hline(yintercept = 0, linewidth = 0.35, color = "grey70") +
  geom_vline(xintercept = 0, linewidth = 0.35, color = "grey70") +
  geom_point(aes(color = regime), size = 2.1, alpha = 0.85) +
  scale_color_manual(values = COLS_REGIME) +
  scale_x_continuous(labels = percent_format(accuracy = 1)) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    x = "Equities: SBITOP quarterly total return (log, nominal)",
    y = "Real estate: Slovenia (Eurostat) quarterly total return (log, nominal)"
  ) +
  theme_thesis() +
  guides(color = guide_legend(ncol = 2, byrow = TRUE))

# ---- Add fitted line(s) ----
if (by_regime) {
  # Separate OLS lines by regime (including Normal)
  p4b <- p4b +
    geom_smooth(aes(color = regime), method = "lm", se = FALSE, linewidth = 0.9)
} else {
  # One overall OLS line
  p4b <- p4b +
    geom_smooth(color = "black", method = "lm", se = FALSE, linewidth = 0.9)
}

p4b
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = "Figure_14_scatter_RE_vs_EQ_regimes.png",
  plot = p4b,
  width = 12,
  height = 7.0,
  dpi = 300,
  limitsize = FALSE
)
## `geom_smooth()` using formula = 'y ~ x'

3.10. Graph 15: Lead lag

# ============================================================
# FIGURE 15: Lead–lag / delayed adjustment (CCF)
# - Cross-correlation between equity and real estate returns
# - Uses quarterly NOMINAL total log returns:
#     x_t = r_eq_total_log  (SBITOP)
#     y_t = r_re_eu_total_log (Slovenia RE, Eurostat)
# - Positive lags (k > 0): corr(x_{t-k}, y_t)  => equities lead RE by k quarters
# - Negative lags (k < 0): corr(x_{t+|k|}, y_t) => RE leads equities by |k| quarters
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

# ---- Assumption: df already exists (from Figures 2A/2B chunks) with:
# df$date_q, df$r_eq_total_log, df$r_re_eu_total_log
# If not, create it: df <- master %>% arrange(date_q) %>% mutate(...)  # as you did earlier

# ---- Parameters ----
max_lag_q <- 12  # show +/- 12 quarters (3 years)
alpha_sig <- 0.05

# ---- Prepare aligned returns (complete cases only) ----
ccf_df <- df %>%
  select(date_q, r_eq_total_log, r_re_eu_total_log) %>%
  filter(complete.cases(.)) %>%
  arrange(date_q)

x <- ccf_df$r_eq_total_log
y <- ccf_df$r_re_eu_total_log
n <- length(x)

# ---- Compute CCF without plotting ----
cc <- stats::ccf(x, y, lag.max = max_lag_q, plot = FALSE, na.action = na.pass)

cc_plot <- tibble(
  lag = as.integer(cc$lag),
  corr = as.numeric(cc$acf)
)

# ---- Approx. significance bands (white-noise benchmark) ----
# For large n, approx SE(corr) ~ 1/sqrt(n). CI ~ +/- z * 1/sqrt(n)
z <- qnorm(1 - alpha_sig/2)
sig <- z / sqrt(n)

# ---- Event markers (same as earlier chapters; optional here) ----
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

# ---- Plot ----
p4c <- ggplot(cc_plot, aes(x = lag, y = corr)) +
  geom_hline(yintercept = 0, linewidth = 0.4, color = "grey55") +
  geom_hline(yintercept = c(-sig, sig), linetype = 3, linewidth = 0.4, color = "grey60") +
  geom_col(width = 0.85, fill = "grey25") +
  scale_x_continuous(
    breaks = seq(-max_lag_q, max_lag_q, by = 2),
    minor_breaks = NULL
  ) +
  scale_y_continuous(limits = c(-1, 1), breaks = seq(-1, 1, by = 0.25)) +
  labs(
    x = "Lag (quarters)  |  k > 0: equities lead RE by k quarters",
    y = "Cross-correlation"
  ) +
  theme_thesis() +
  theme(
    legend.position = "none",
    plot.margin = margin(t = 10, r = 10, b = 10, l = 10)
  )

p4c

ggsave(
  filename = "Figure_15_CCF_equity_vs_realestate_nominal.png",
  plot = p4c,
  width = 12,
  height = 6.8,
  dpi = 300,
  limitsize = FALSE
)

3.11. Graph 16 & 17: Wealth and return divergence by region

# ============================================================
# FIGURE 16 & 17: Regional sensitivity within real estate
# 16: Small multiples (same scale) – wealth indices for SI, LJ, ex-LJ
# 17: Returns comparison by region – violin + boxplot
# Data: SURS regional HPI series + rent-income model (as in Ch. 3)

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(lubridate)
library(stringr)

# -----------------------------
# Global events + theme (reuse)
# -----------------------------
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

theme_thesis <- function() {
  theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10.5, color = "grey30"),
      legend.position = "bottom",
      legend.title = element_blank(),
      legend.text = element_text(size = 9.5),
      legend.key.width = unit(14, "pt"),
      legend.spacing.x = unit(6, "pt"),
      legend.box.margin = margin(t = 8),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10),
      panel.grid.minor = element_blank()
    )
}

add_events <- function(p, y_top, events_df = events) {
  p +
    geom_vline(
      data = events_df,
      aes(xintercept = date),
      linewidth = 0.4,
      linetype = 3,
      color = "grey60",
      inherit.aes = FALSE
    ) +
    geom_text(
      data = events_df,
      aes(x = date, y = 0.96 * y_top, label = label),
      vjust = -0.3,
      hjust = 0,
      size = 3.2,
      color = "grey35",
      inherit.aes = FALSE
    )
}

# -----------------------------
# Colors (match earlier figures)
# -----------------------------
COL_RE <- c(
  "Real estate: Slovenia (SURS)"         = "#00BFC4",
  "Real estate: Ljubljana (SURS)"        = "#619CFF",
  "Real estate: Slovenia ex-LJ (SURS)"   = "#C77CFF"
)

# Optional: keep RE style similar to earlier plots (dashed/dotted)
LT_RE <- c(
  "Real estate: Slovenia (SURS)"         = "solid",
  "Real estate: Ljubljana (SURS)"        = "solid",
  "Real estate: Slovenia ex-LJ (SURS)"   = "solid"
)

# ============================================================
# 1) Build SURS nominal total log returns (HPI + rent-income model)
# ============================================================

y0_ann <- 0.04  # baseline gross annual rental yield (same as earlier)

df_re <- master %>%
  arrange(date_q) %>%
  select(date_q, hicp_rent, hpi_si_surs_used, hpi_lj_surs_used, hpi_si_exlj_surs_used)

# Base quarter (first quarter with all inputs for anchoring rent/price yield model)
base <- df_re %>%
  filter(
    !is.na(hicp_rent),
    !is.na(hpi_si_surs_used),
    !is.na(hpi_lj_surs_used),
    !is.na(hpi_si_exlj_surs_used)
  ) %>%
  slice(1)

R0  <- base$hicp_rent
Hs0 <- base$hpi_si_surs_used
Hl0 <- base$hpi_lj_surs_used
Hx0 <- base$hpi_si_exlj_surs_used

df_re <- df_re %>%
  mutate(
    # price log returns
    r_si_price_log   = log(hpi_si_surs_used / lag(hpi_si_surs_used)),
    r_lj_price_log   = log(hpi_lj_surs_used / lag(hpi_lj_surs_used)),
    r_ex_price_log   = log(hpi_si_exlj_surs_used / lag(hpi_si_exlj_surs_used)),

    # time-varying rental yields via rent/price ratio (anchored at base quarter)
    y_si_t   = y0_ann * ((hicp_rent / R0) / (hpi_si_surs_used / Hs0)),
    y_lj_t   = y0_ann * ((hicp_rent / R0) / (hpi_lj_surs_used / Hl0)),
    y_ex_t   = y0_ann * ((hicp_rent / R0) / (hpi_si_exlj_surs_used / Hx0)),

    # quarterly rental income (lagged to avoid look-ahead)
    r_si_income_q = lag(y_si_t) / 4,
    r_lj_income_q = lag(y_lj_t) / 4,
    r_ex_income_q = lag(y_ex_t) / 4,

    # nominal total log returns
    r_si_total_log = r_si_price_log + r_si_income_q,
    r_lj_total_log = r_lj_price_log + r_lj_income_q,
    r_ex_total_log = r_ex_price_log + r_ex_income_q
  )

# ============================================================
# FIGURE 16: Wealth indices (3 lines, one panel, same scale)
# ============================================================

# Wealth indices from total log returns (start = 100 at beginning of series)
df_wealth <- df_re %>%
  mutate(
    idx_si = 100 * exp(cumsum(replace_na(r_si_total_log, 0))),
    idx_lj = 100 * exp(cumsum(replace_na(r_lj_total_log, 0))),
    idx_ex = 100 * exp(cumsum(replace_na(r_ex_total_log, 0)))
  )

plot_5a <- df_wealth %>%
  select(date_q, idx_si, idx_lj, idx_ex) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "wealth") %>%
  mutate(
    series = recode(series,
      idx_si = "Real estate: Slovenia (SURS)",
      idx_lj = "Real estate: Ljubljana (SURS)",
      idx_ex = "Real estate: Slovenia ex-LJ (SURS)"
    ),
    series = factor(series, levels = names(COL_RE))
  )

y_top_5a <- max(plot_5a$wealth, na.rm = TRUE)

p5a <- ggplot(plot_5a, aes(date_q, wealth, color = series, linetype = series)) +
  geom_line(linewidth = 1.05) +
  scale_color_manual(values = COL_RE) +
  scale_linetype_manual(values = LT_RE) +
  scale_y_continuous(labels = label_number(accuracy = 1)) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  guides(
    color = guide_legend(ncol = 2, byrow = TRUE),
    linetype = "none"
  ) +
  labs(
    x = NULL,
    y = "Wealth index (base = 100)"
  ) +
  theme_thesis()

p5a <- add_events(p5a, y_top_5a)

p5a

ggsave(
  filename = "Figure_16_regional_wealth_indices_one_panel.png",
  plot = p5a,
  width = 12,
  height = 8.2,
  dpi = 300,
  limitsize = FALSE
)

# ============================================================
# FIGURE 17: Return distributions by region (violin + box)
# ============================================================

plot_5b <- df_re %>%
  select(date_q, r_si_total_log, r_lj_total_log, r_ex_total_log) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "ret_log") %>%
  filter(!is.na(ret_log)) %>%
  mutate(
    series = recode(series,
      r_si_total_log = "Real estate: Slovenia (SURS)",
      r_lj_total_log = "Real estate: Ljubljana (SURS)",
      r_ex_total_log = "Real estate: Slovenia ex-LJ (SURS)"
    ),
    series = factor(series, levels = names(COL_RE))
  )

p5b <- ggplot(plot_5b, aes(x = series, y = ret_log, fill = series)) +
  geom_hline(yintercept = 0, linewidth = 0.4, color = "grey70") +
  geom_violin(trim = FALSE, linewidth = 0.7, alpha = 0.55) +
  geom_boxplot(width = 0.18, outlier.size = 1.4, linewidth = 0.7) +
  scale_fill_manual(values = COL_RE) +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  labs(
    x = NULL,
    y = "Quarterly total return (log, nominal)"
  ) +
  theme_thesis() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 10)
  )

p5b

ggsave(
  filename = "Figure_17_regional_return_distributions_violin.png",
  plot = p5b,
  width = 12,
  height = 7.2,
  dpi = 300,
  limitsize = FALSE
)

3.12. Graph 18: Drawdowns by region

# ============================================================
# FIGURE 18: Drawdowns by region (underwater plot, SURS)
# Goal: make "regional risk is different" visually obvious
# - Computes drawdowns from TOTAL RETURN wealth indices (nominal)
# - Regions: Slovenia (SURS), Ljubljana (SURS), Slovenia ex-LJ (SURS)
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(lubridate)

# ---- Event markers (reuse across Chapter 4/5) ----
events <- tibble::tribble(
  ~date, ~label,
  as.Date("2011-09-30"), "Euro area crisis",
  as.Date("2020-03-31"), "COVID-19 shock",
  as.Date("2022-03-31"), "Inflation & rate shock"
)

# ---- Colors consistent with earlier figures ----
COL_REG <- c(
  "Real estate: Slovenia (SURS)"        = "#00BFC4",
  "Real estate: Ljubljana (SURS)"       = "#619CFF",
  "Real estate: Slovenia ex-LJ (SURS)"  = "#C77CFF"
)

# ---- Linetypes: keep RE dashed as in earlier figures ----
LT_REG <- c(
  "Real estate: Slovenia (SURS)"        = "solid",
  "Real estate: Ljubljana (SURS)"       = "solid",
  "Real estate: Slovenia ex-LJ (SURS)"  = "solid"
)


# 1) Build nominal total return wealth indices (base = 100)
#    If you already built these indices earlier, you can skip this and just select them.
df_5c <- df %>%
  arrange(date_q) %>%
  transmute(
    date_q,
    idx_si   = 100 * exp(cumsum(replace_na(r_re_si_total_log,   0))),
    idx_lj   = 100 * exp(cumsum(replace_na(r_re_lj_total_log,   0))),
    idx_ex = 100 * exp(cumsum(replace_na(r_re_ex_total_log, 0)))
  )

# 2) Compute drawdowns (underwater): DD_t = (Index_t / max_to_date(Index)) - 1
dd_from_index <- function(x) {
  x / cummax(x) - 1
}

dd_long <- df_5c %>%
  mutate(
    dd_si   = dd_from_index(idx_si),
    dd_lj   = dd_from_index(idx_lj),
    dd_ex = dd_from_index(idx_ex)
  ) %>%
  select(date_q, dd_si, dd_lj, dd_ex) %>%
  pivot_longer(-date_q, names_to = "series", values_to = "dd") %>%
  mutate(
    series = recode(series,
      dd_si   = "Real estate: Slovenia (SURS)",
      dd_lj   = "Real estate: Ljubljana (SURS)",
      dd_ex = "Real estate: Slovenia ex-LJ (SURS)"
    ),
    series = factor(series, levels = names(COL_REG))
  ) %>%
  filter(!is.na(dd))

# 3) Optional: identify and label max drawdown (MDD) point per region
mdd_pts <- dd_long %>%
  group_by(series) %>%
  slice_min(order_by = dd, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(mdd_lab = paste0("MDD: ", percent(dd, accuracy = 0.1)))

# 4) Build plot (overlay)
p5c <- ggplot(dd_long, aes(x = date_q, y = dd, color = series, linetype = series)) +
  geom_hline(yintercept = 0, linewidth = 0.35, color = "grey65") +
  geom_line(linewidth = 1.05) +
  geom_point(
    data = mdd_pts,
    aes(x = date_q, y = dd),
    inherit.aes = FALSE,
    size = 2.0,
    color = "grey25"
  ) +
  geom_text(
    data = mdd_pts,
    aes(x = date_q, y = dd, label = mdd_lab),
    inherit.aes = FALSE,
    hjust = 0,
    vjust = -0.5,
    size = 3.2,
    color = "grey25"
  ) +
  geom_vline(
    data = events,
    aes(xintercept = date),
    linewidth = 0.4,
    linetype = 3,
    color = "grey60",
    inherit.aes = FALSE
  ) +
  geom_text(
    data = events,
    aes(x = date, y = 0.02, label = label),
    vjust = -0.3,
    hjust = 0,
    size = 3.2,
    color = "grey35",
    inherit.aes = FALSE
  ) +
  scale_color_manual(values = COL_REG, drop = TRUE) +
  scale_linetype_manual(values = LT_REG, drop = TRUE) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  guides(
    color = guide_legend(ncol = 3, byrow = TRUE),
    linetype = "none"
  ) +
  labs(
    x = NULL,
    y = "Drawdown from peak (%, nominal)"
  ) +
  theme_thesis()

p5c

ggsave(
  filename = "Figure_18_drawdowns_by_region_SURS.png",
  plot = p5c,
  width = 12,
  height = 7.8,
  dpi = 300,
  limitsize = FALSE
)

3.13. Graph 19: Holding period outcomes

# ============================================================
# FIGURE 19: Holding-period outcomes (net of costs)
# - Distribution of realized NET outcomes by holding period
# - Equity (SBITOP total return) vs Real estate (Eurostat SI, total return)
# - Shows timing risk + transaction-cost drag
#
# REQUIREMENTS / ASSUMPTIONS
# 1) You already have `df` from earlier (Figure 2A/2B construction), containing:
#    date_q, r_eq_total_log, r_re_eu_total_log
# 2) You have loaded:
#    - costs_base (from your RDS)
#    - eq_cgt (capital gains tax schedule for equities, from your RDS)
# 3) You have `theme_thesis()` defined (as in your thesis toolkit).
#
# Notes:
# - Net outcome here is implemented as: log gross growth over horizon + log(1 - roundtrip_cost)
#   and optionally minus capital-gains tax on the *price* gain component.
# - If your `costs_base` / `eq_cgt` objects use different column names, adjust the "PARAMS" block.
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(lubridate)
library(zoo)

# ---- Prereqs (must exist from earlier chunks) ----
# df with: date_q, r_eq_total_log, r_re_eu_total_log
# theme_thesis() defined

# --------------------------
# Holding periods (years)
# --------------------------
hp_years_vec <- c(1, 2, 3, 5, 7, 10, 15)
hp_q_vec     <- hp_years_vec * 4

hp_levels <- paste0(hp_years_vec, "y")  # UNIQUE levels -> fixes your duplicated factor issue

# --------------------------
# Costs and CGT schedules
# --------------------------
# Use your objects if they exist; otherwise safe defaults

eq_roundtrip_cost <- if (exists("costs_base")) {
  if (is.data.frame(costs_base) && "eq_roundtrip_cost" %in% names(costs_base)) costs_base$eq_roundtrip_cost[1] else
    if (is.list(costs_base) && "eq_roundtrip_cost" %in% names(costs_base)) costs_base$eq_roundtrip_cost else 0.002
} else 0.002

re_roundtrip_cost <- if (exists("costs_base")) {
  if (is.data.frame(costs_base) && "re_roundtrip_cost" %in% names(costs_base)) costs_base$re_roundtrip_cost[1] else
    if (is.list(costs_base) && "re_roundtrip_cost" %in% names(costs_base)) costs_base$re_roundtrip_cost else 0.08
} else 0.08

get_eq_cgt_rate <- function(years) {
  if (!exists("eq_cgt")) return(0)
  if (!is.data.frame(eq_cgt)) return(0)

  y_col <- intersect(names(eq_cgt), c("years", "holding_years", "hp_years"))
  r_col <- intersect(names(eq_cgt), c("rate", "tax_rate", "cgt_rate"))
  if (length(y_col) != 1 || length(r_col) != 1) return(0)

  tmp <- eq_cgt %>% arrange(.data[[y_col]]) %>% filter(.data[[y_col]] <= years)
  if (nrow(tmp) == 0) {
    return(eq_cgt %>% arrange(.data[[y_col]]) %>% slice(1) %>% pull(.data[[r_col]]))
  }
  tmp %>% slice_tail(n = 1) %>% pull(.data[[r_col]])
}

# Real estate CGT: set to 0 unless you explicitly model it elsewhere
get_re_cgt_rate <- function(years) 0

# --------------------------
# Long returns (two assets)
# --------------------------
ret_long <- df %>%
  arrange(date_q) %>%
  transmute(
    date_q,
    asset_eq = r_eq_total_log,
    asset_re = r_re_eu_total_log
  ) %>%
  pivot_longer(
    cols = c(asset_eq, asset_re),
    names_to = "asset_key",
    values_to = "r_log"
  ) %>%
  mutate(
    asset = dplyr::recode(asset_key,
      asset_eq = "Equities: SBITOP",
      asset_re = "Real estate: Slovenia (Eurostat)"
    )
  ) %>%
  select(date_q, asset, r_log) %>%
  filter(!is.na(r_log)) %>%
  arrange(asset, date_q)

# --------------------------
# Build outcomes (robust loop; no group_modify pitfalls)
# --------------------------
roll_sum_future <- function(x, k) {
  # sum_{i=1..k} x_{t+i} implemented as rollapplyr on lead(x,1)
  zoo::rollapplyr(dplyr::lead(x, 1), width = k, FUN = sum, fill = NA, partial = FALSE)
}

assets <- unique(ret_long$asset)

outcomes <- bind_rows(lapply(assets, function(a) {
  d <- ret_long %>% filter(asset == a) %>% arrange(date_q)

  rt_cost <- if (a == "Equities: SBITOP") eq_roundtrip_cost else re_roundtrip_cost

  bind_rows(lapply(seq_along(hp_q_vec), function(i) {
    Hy <- hp_years_vec[i]
    Hq <- hp_q_vec[i]

    gross_log <- roll_sum_future(d$r_log, Hq)
    gross_simple <- exp(gross_log) - 1

    cgt_rate <- if (a == "Equities: SBITOP") get_eq_cgt_rate(Hy) else get_re_cgt_rate(Hy)
    cgt_drag <- cgt_rate * pmax(gross_simple, 0)

    net_simple <- (1 + gross_simple) * (1 - rt_cost) - 1 - cgt_drag

    tibble(
      date_q = d$date_q,
      asset = a,
      hp_years = Hy,
      hp_label = factor(paste0(Hy, "y"), levels = hp_levels),
      gross_log = gross_log,
      gross_simple = gross_simple,
      net_simple = net_simple
    )
  }))
})) %>%
  filter(!is.na(net_simple))

# --------------------------
# Plot: violin + box by holding period, split by asset
# --------------------------
COL_2 <- c(
  "Equities: SBITOP" = "#F8766D",
  "Real estate: Slovenia (Eurostat)" = "#B79F00"
)

p6a <- ggplot(outcomes, aes(x = hp_label, y = net_simple, fill = asset)) +
  geom_hline(yintercept = 0, linewidth = 0.35, color = "grey65") +
  geom_violin(
    trim = TRUE,
    alpha = 0.45,
    width = 0.95,
    position = position_dodge(width = 0.82),
    color = "grey25"
  ) +
  geom_boxplot(
    width = 0.22,
    outlier.size = 1.1,
    outlier.alpha = 0.65,
    position = position_dodge(width = 0.82),
    color = "grey20"
  ) +
  scale_fill_manual(values = COL_2) +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    x = "Holding period",
    y = "Net realized outcome (simple return, %)"
  ) +
  theme_thesis() +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE))

p6a

ggsave(
  "Figure_19_holding_period_net_outcomes.png",
  p6a, width = 12, height = 8.2, dpi = 300, limitsize = FALSE
)

3.14. Graph 20 & 21: Break-even yield and scenario robustness

names(master)
##  [1] "q_id"                  "date_q"                "year"                 
##  [4] "qtr"                   "hpi_si_eurostat"       "hpi_si_surs_used"     
##  [7] "hpi_lj_surs_used"      "hpi_si_exlj_surs_used" "hicp_rent"            
## [10] "hicp_all"              "sbitop_close"          "dy_eq_ann"            
## [13] "rf_log_q"              "rf_simple_q"           "rf_rate_ann_qavg_pct"
names(df)
##  [1] "q_id"                   "date_q"                 "year"                  
##  [4] "qtr"                    "hpi_si_eurostat"        "hpi_si_surs_used"      
##  [7] "hpi_lj_surs_used"       "hpi_si_exlj_surs_used"  "hicp_rent"             
## [10] "hicp_all"               "sbitop_close"           "dy_eq_ann"             
## [13] "rf_log_q"               "rf_simple_q"            "rf_rate_ann_qavg_pct"  
## [16] "pi_log"                 "dy_eq_q"                "r_eq_price_log"        
## [19] "r_eq_total_log"         "r_re_eu_price_log"      "y_eu_t"                
## [22] "r_re_eu_income_q"       "r_re_eu_total_log"      "r_re_si_price_log"     
## [25] "r_re_lj_price_log"      "r_re_ex_price_log"      "y_si_t"                
## [28] "y_lj_t"                 "y_ex_t"                 "r_re_si_income_q"      
## [31] "r_re_lj_income_q"       "r_re_ex_income_q"       "r_re_si_total_log"     
## [34] "r_re_lj_total_log"      "r_re_ex_total_log"      "r_eq_total_real_log"   
## [37] "r_re_eu_total_real_log" "r_re_si_total_real_log" "r_re_lj_total_real_log"
## [40] "r_re_ex_total_real_log"
str(costs_base)
## tibble [14 × 8] (S3: tbl_df/tbl/data.frame)
##  $ asset     : chr [1:14] "RE" "RE" "RE" "RE" ...
##  $ cost_group: chr [1:14] "ONGOING" "ONGOING" "ONGOING" "ONGOING" ...
##  $ cost_name : chr [1:14] "maintenance" "vacancy" "property_tax" "insurance" ...
##  $ applies_to: chr [1:14] "property_value" "gross_rent" "property_value" "property_value" ...
##  $ timing    : chr [1:14] "ANNUAL" "ANNUAL" "ANNUAL" "ANNUAL" ...
##  $ rate      : num [1:14] 0.01 0.05 0.005 0.002 0.08 0.002 0.005 0.02 0.02 0.02 ...
##  $ rate_unit : chr [1:14] "decimal" "decimal" "decimal" "decimal" ...
##  $ note      : chr [1:14] "1% of property value per year" "5% of time vacant (reduces rent)" "0.5% of property value per year" "0.2% of property value per year" ...
head(eq_cgt)
## # A tibble: 4 × 5
##   holding_years_min holding_years_max tax_rate rate_unit note                   
##               <dbl>             <dbl>    <dbl> <chr>     <chr>                  
## 1                 0                 5     0.25 decimal   25% if held <5 years   
## 2                 5                10     0.2  decimal   20% if held 5–10 years 
## 3                10                15     0.15 decimal   15% if held 10–15 years
## 4                15               999     0    decimal   No tax if held >=15 ye…
# ============================================================
# FIGURE 20: Break-even yield map using TERMINAL WEALTH RATIO GAP
#           gap_wr = median( (1+RE_net)/(1+EQ_net) - 1 )
#           gap_wr > 0  => RE ends with more wealth than EQ
#           gap_wr = 0  => break-even
#           gap_wr < 0  => EQ ends with more wealth than RE
#
# FIGURE 21: Tornado sensitivity for the same metric
#           includes legend explaining bars, dots, baseline
# ============================================================

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(lubridate)
library(zoo)
library(stringr)
library(forcats)

# --------------------------
# 0) Helpers: robust params
# --------------------------
pull_cost <- function(costs_base, field, default) {
  if (!exists("costs_base")) return(default)
  if (is.data.frame(costs_base) && field %in% names(costs_base)) return(as.numeric(costs_base[[field]][1]))
  if (is.list(costs_base) && field %in% names(costs_base)) return(as.numeric(costs_base[[field]]))
  default
}

eq_roundtrip_cost_base <- pull_cost(costs_base, "eq_roundtrip_cost", 0.002)
re_roundtrip_cost_base <- pull_cost(costs_base, "re_roundtrip_cost", 0.08)

get_eq_cgt_rate <- function(years) {
  if (!exists("eq_cgt")) return(0)
  if (!is.data.frame(eq_cgt)) return(0)

  y_col <- intersect(names(eq_cgt), c("years", "holding_years", "hp_years"))
  r_col <- intersect(names(eq_cgt), c("rate", "tax_rate", "cgt_rate"))
  if (length(y_col) != 1 || length(r_col) != 1) return(0)

  tmp <- eq_cgt %>% arrange(.data[[y_col]]) %>% filter(.data[[y_col]] <= years)
  if (nrow(tmp) == 0) {
    return(eq_cgt %>% arrange(.data[[y_col]]) %>% slice(1) %>% pull(.data[[r_col]]))
  }
  tmp %>% slice_tail(n = 1) %>% pull(.data[[r_col]])
}

roll_sum_future <- function(x, k) {
  zoo::rollapplyr(dplyr::lead(x, 1), width = k, FUN = sum, fill = NA, partial = FALSE)
}

# --------------------------
# 1) Build EQ and RE quarterly log total returns
#    (Same construction as earlier, but self-contained here)
# --------------------------
df_base <- df %>%
  arrange(date_q) %>%
  select(date_q, sbitop_close, dy_eq_ann, hicp_rent, hpi_si_eurostat) %>%
  filter(!is.na(date_q))

base_anchor <- df_base %>%
  filter(!is.na(hicp_rent), !is.na(hpi_si_eurostat)) %>%
  slice(1)

R0 <- base_anchor$hicp_rent
H0 <- base_anchor$hpi_si_eurostat

df_eq <- df_base %>%
  mutate(
    dy_eq_q        = dy_eq_ann / 4,
    r_eq_price_log = log(sbitop_close / lag(sbitop_close)),
    r_eq_total_log = r_eq_price_log + dy_eq_q
  ) %>%
  select(date_q, r_eq_total_log)

df_re_price <- df_base %>%
  mutate(
    r_re_price_log = log(hpi_si_eurostat / lag(hpi_si_eurostat)),
    rent_price_ratio_rel = (hicp_rent / R0) / (hpi_si_eurostat / H0)
  ) %>%
  select(date_q, r_re_price_log, rent_price_ratio_rel)

# --------------------------
# 2) Metric choices (pick one)
# --------------------------
# (A) Wealth ratio gap (recommended): (1+RE_net)/(1+EQ_net) - 1
gap_wealth_ratio <- function(net_eq_simple, net_re_simple) {
  # drop cases where either side <= -100% (wealth <= 0) to avoid nonsense ratios
  ok <- is.finite(net_eq_simple) & is.finite(net_re_simple) &
    (1 + net_eq_simple) > 0 & (1 + net_re_simple) > 0

  if (!any(ok)) return(NA_real_)
  median((1 + net_re_simple[ok]) / (1 + net_eq_simple[ok]) - 1, na.rm = TRUE)
}

# (B) Log gap option: log(1+RE_net) - log(1+EQ_net)
gap_log <- function(net_eq_simple, net_re_simple) {
  ok <- is.finite(net_eq_simple) & is.finite(net_re_simple) &
    (1 + net_eq_simple) > 0 & (1 + net_re_simple) > 0

  if (!any(ok)) return(NA_real_)
  median(log(1 + net_re_simple[ok]) - log(1 + net_eq_simple[ok]), na.rm = TRUE)
}

# Choose which metric you want for 6B/6C:
GAP_METRIC <- "wealth_ratio"   # "wealth_ratio" or "log_gap"

# --------------------------
# 3) Net-outcome simulator for a single (hp_years, y0, eq_cost, re_cost)
# --------------------------
calc_gap_metric <- function(hp_years, y0, eq_cost, re_cost) {
  Hq <- hp_years * 4

  re_series <- df_re_price %>%
    mutate(
      y_t = y0 * rent_price_ratio_rel,
      r_re_total_log = r_re_price_log + (lag(y_t) / 4)
    ) %>%
    select(date_q, r_re_total_log)

  d <- df_eq %>%
    left_join(re_series, by = "date_q") %>%
    filter(!is.na(r_eq_total_log), !is.na(r_re_total_log)) %>%
    arrange(date_q)

  if (nrow(d) < (Hq + 2)) return(NA_real_)

  gross_eq_log <- roll_sum_future(d$r_eq_total_log, Hq)
  gross_re_log <- roll_sum_future(d$r_re_total_log, Hq)

  gross_eq_simple <- exp(gross_eq_log) - 1
  gross_re_simple <- exp(gross_re_log) - 1

  # Equity CGT drag (apply to positive gains)
  cgt_rate <- get_eq_cgt_rate(hp_years)
  cgt_drag <- cgt_rate * pmax(gross_eq_simple, 0)

  # Net simple outcomes
  net_eq_simple <- (1 + gross_eq_simple) * (1 - eq_cost) - 1 - cgt_drag
  net_re_simple <- (1 + gross_re_simple) * (1 - re_cost) - 1

  # Metric
  if (GAP_METRIC == "wealth_ratio") {
    return(gap_wealth_ratio(net_eq_simple, net_re_simple))
  } else if (GAP_METRIC == "log_gap") {
    return(gap_log(net_eq_simple, net_re_simple))
  } else {
    stop("Unknown GAP_METRIC.")
  }
}

# ============================================================
# FIGURE 20: Heatmap + break-even contour
# ============================================================

hp_years_vec <- c(1, 2, 3, 5, 7, 10)
hp_labels <- paste0(hp_years_vec, "y")

y0_grid <- seq(0.02, 0.0625, by = 0.0025)  # 2.0% to 6.25%

grid_6b <- tidyr::expand_grid(
  hp_years = hp_years_vec,
  y0 = y0_grid
) %>%
  mutate(
    gap = mapply(calc_gap_metric, hp_years, y0,
                 MoreArgs = list(eq_cost = eq_roundtrip_cost_base, re_cost = re_roundtrip_cost_base)),
    hp_label = factor(paste0(hp_years, "y"), levels = hp_labels)
  ) %>%
  filter(is.finite(gap))

fill_title <- if (GAP_METRIC == "wealth_ratio") {
  "Median wealth ratio gap\n(RE vs EQ)"
} else {
  "Median log gap\nlog(RE) − log(EQ)"
}

xlab_gap <- if (GAP_METRIC == "wealth_ratio") {
  "Median terminal wealth ratio: (1+RE)/(1+EQ) − 1"
} else {
  "Median log wealth gap: log(1+RE) − log(1+EQ)"
}

p6b <- ggplot(grid_6b, aes(x = hp_label, y = y0, fill = gap)) +
  geom_tile(color = "white", linewidth = 0.35) +
  geom_contour(aes(z = gap), breaks = 0, color = "grey20", linewidth = 0.7) +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  {
    if (GAP_METRIC == "wealth_ratio") {
      scale_fill_gradient2(
        midpoint = 0,
        labels = percent_format(accuracy = 1),
        guide = guide_colorbar(
          title = fill_title,
          title.position = "top",
          barwidth = unit(85, "pt"),
          barheight = unit(12, "pt")
        )
      )
    } else {
      scale_fill_gradient2(
        midpoint = 0,
        labels = label_number(accuracy = 0.01),
        guide = guide_colorbar(
          title = fill_title,
          title.position = "top",
          barwidth = unit(85, "pt"),
          barheight = unit(12, "pt")
        )
      )
    }
  } +
  labs(
    x = "Holding period",
    y = "Base gross annual yield y0"
  ) +
  theme_thesis() +
  theme(
    legend.position = "bottom",
    legend.box.margin = margin(t = 10),
    legend.text = element_text(size = 10),
    plot.margin = margin(t = 10, r = 16, b = 24, l = 10)
  )

p6b
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf

ggsave(
  filename = "Figure_20_break_even_yield_map.png",
  plot = p6b,
  width = 12.5,
  height = 9.2,
  dpi = 300,
  limitsize = FALSE
)
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning: `stat_contour()`: Zero contours were generated
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
# ============================================================
# FIGURE 21: Tornado sensitivity (same metric)
# ============================================================

hp_star <- 10
y0_star <- 0.04

lever_ranges <- tibble::tribble(
  ~lever,                 ~low,   ~high,
  "Base gross yield y0",   0.02,   0.06,
  "RE roundtrip cost",     0.05,   0.12,
  "Equity roundtrip cost", 0.0005, 0.01
)

gap_baseline <- calc_gap_metric(hp_star, y0_star, eq_roundtrip_cost_base, re_roundtrip_cost_base)

tornado_df <- lever_ranges %>%
  rowwise() %>%
  mutate(
    gap_low = case_when(
      lever == "Base gross yield y0"   ~ calc_gap_metric(hp_star, low,  eq_roundtrip_cost_base, re_roundtrip_cost_base),
      lever == "RE roundtrip cost"     ~ calc_gap_metric(hp_star, y0_star, eq_roundtrip_cost_base, low),
      lever == "Equity roundtrip cost" ~ calc_gap_metric(hp_star, y0_star, low, re_roundtrip_cost_base),
      TRUE ~ NA_real_
    ),
    gap_high = case_when(
      lever == "Base gross yield y0"   ~ calc_gap_metric(hp_star, high, eq_roundtrip_cost_base, re_roundtrip_cost_base),
      lever == "RE roundtrip cost"     ~ calc_gap_metric(hp_star, y0_star, eq_roundtrip_cost_base, high),
      lever == "Equity roundtrip cost" ~ calc_gap_metric(hp_star, y0_star, high, re_roundtrip_cost_base),
      TRUE ~ NA_real_
    )
  ) %>%
  ungroup() %>%
  mutate(
    span = abs(gap_high - gap_low),
    lo = pmin(gap_low, gap_high),
    hi = pmax(gap_low, gap_high),
    lever = fct_reorder(lever, span)
  )

endpoints <- tornado_df %>%
  select(lever, gap_low, gap_high) %>%
  pivot_longer(cols = c(gap_low, gap_high), names_to = "endpoint", values_to = "gap") %>%
  mutate(
    endpoint = recode(endpoint, gap_low = "Low case", gap_high = "High case"),
    endpoint = factor(endpoint, levels = c("Low case", "High case"))
  )

x_scale <- if (GAP_METRIC == "wealth_ratio") {
  scale_x_continuous(labels = percent_format(accuracy = 1))
} else {
  scale_x_continuous(labels = label_number(accuracy = 0.01))
}

x_title <- if (GAP_METRIC == "wealth_ratio") {
  "Median terminal wealth ratio gap: (1+RE)/(1+EQ) − 1"
} else {
  "Median log wealth gap: log(1+RE) − log(1+EQ)"
}

p6c <- ggplot(tornado_df, aes(y = lever)) +
  geom_vline(aes(xintercept = gap_baseline, linetype = "Baseline"), linewidth = 0.9, color = "grey15") +
  geom_segment(
    aes(x = lo, xend = hi, yend = lever, color = "Sensitivity range"),
    linewidth = 10,
    lineend = "butt",
    alpha = 0.35
  ) +
  geom_point(
    data = endpoints,
    aes(x = gap, y = lever, shape = endpoint),
    size = 2.8,
    color = "grey15",
    inherit.aes = FALSE
  ) +
  x_scale +
  scale_color_manual(
    values = c("Sensitivity range" = "grey40"),
    name = NULL
  ) +
  scale_linetype_manual(
    values = c("Baseline" = "solid"),
    name = NULL
  ) +
  scale_shape_manual(
    values = c("Low case" = 16, "High case" = 16),
    name = NULL
  ) +
  guides(
    color = guide_legend(order = 1, override.aes = list(linewidth = 6, alpha = 0.35)),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 3, override.aes = list(color = "grey15", linewidth = 0.9))
  ) +
  labs(
    x = x_title,
    y = NULL
  ) +
  theme_thesis() +
  theme(
    legend.position = "bottom",
    legend.box = "vertical",
    legend.spacing.y = unit(4, "pt"),
    plot.margin = margin(t = 10, r = 16, b = 18, l = 10)
  )

p6c

ggsave(
  filename = "Figure_21_tornado_sensitivity.png",
  plot = p6c,
  width = 12.5,
  height = 7.6,
  dpi = 300,
  limitsize = FALSE
)