Problem 12 — Six Portfolios (Ken French Data Library)

Step 1: Load Packages

All packages below come with a standard R installation or are part of the tidyverse. No extra install.packages() calls are needed.

library(tidyverse)   # ggplot2, dplyr, tidyr, purrr, readr
library(knitr)       # kable tables
library(kableExtra)  # styled tables

Step 2: Define Helper Functions (replaces moments package)

# ── Skewness (sample, same formula as moments::skewness) ─────────────────────
skewness <- function(x) {
  x   <- x[!is.na(x)]
  n   <- length(x)
  m   <- mean(x)
  s   <- sd(x)
  (sum((x - m)^3) / n) / s^3
}

# ── Excess Kurtosis (sample; normal distribution = 3) ────────────────────────
kurtosis <- function(x) {
  x   <- x[!is.na(x)]
  n   <- length(x)
  m   <- mean(x)
  s   <- sd(x)
  (sum((x - m)^4) / n) / s^4
}

cat("Helper functions defined: skewness(), kurtosis()\n")
## Helper functions defined: skewness(), kurtosis()

Step 3: Download Data from Ken French’s Data Library

url      <- "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/6_Portfolios_2x3_CSV.zip"
zip_path <- tempfile(fileext = ".zip")
out_dir  <- file.path(tempdir(), "french_data")
dir.create(out_dir, showWarnings = FALSE)

download.file(url, destfile = zip_path, mode = "wb")

# List files inside zip before extracting
zip_contents <- unzip(zip_path, list = TRUE)
cat("Files inside zip:\n")
## Files inside zip:
print(zip_contents)
##                   Name Length                Date
## 1 6_Portfolios_2x3.csv 612861 2026-02-25 17:40:00
# Extract all files
unzip(zip_path, exdir = out_dir, overwrite = TRUE)

# Find any CSV file in the extracted folder (case-insensitive)
all_files <- list.files(out_dir, full.names = TRUE, recursive = TRUE)
cat("\nExtracted files:\n")
## 
## Extracted files:
print(all_files)
## [1] "/tmp/RtmpiUdXgQ/french_data/6_Portfolios_2x3.csv"
csv_path <- all_files[grepl("\\.csv$", all_files, ignore.case = TRUE)][1]
cat("\nUsing file:", csv_path, "\n")
## 
## Using file: /tmp/RtmpiUdXgQ/french_data/6_Portfolios_2x3.csv
if (is.na(csv_path) || !file.exists(csv_path)) {
  stop("CSV file not found after extraction. Check zip contents above.")
}

all_lines <- readLines(csv_path, warn = FALSE)

cat("Total lines in file:", length(all_lines), "\n\n")
## Total lines in file: 8939
cat("First 25 lines:\n")
## First 25 lines:
writeLines(all_lines[1:25])
## This file was created using the 202601 CRSP database.
## It contains value- and equal-weighted returns for portfolios formed on ME and BEME.
## 
## The portfolios are constructed at the end of June.  BEME is book value at the last fiscal year end of the prior calendar year
## divided by ME at the end of December of the prior year.
## Annual returns are from January to December.
## 
## Missing data are indicated by -99.99 or -999.
## 
## The break points include utilities and include financials
## 
## The  portfolios  include utilities and include financials.
## 
## 
##   Average Value Weighted Returns -- Monthly
## ,SMALL LoBM,ME1 BM2,SMALL HiBM,BIG LoBM,ME2 BM2,BIG HiBM
## 192607,   1.0866,   0.8807,  -0.1275,   5.5746,   1.9060,   2.0068
## 192608,   0.7831,   1.4677,   5.4422,   2.7268,   2.7028,   5.6834
## 192609,  -2.8045,  -0.0599,  -0.4399,   1.4777,   0.0954,  -0.7872
## 192610,  -4.0289,  -4.3615,  -2.0128,  -3.6327,  -2.3451,  -4.0040
## 192611,   3.2971,   3.6237,   2.0877,   3.2120,   2.9346,   3.1964
## 192612,   2.5645,   1.7773,   3.2700,   2.9011,   2.6210,   2.3073
## 192701,  -0.8099,  -0.2885,   3.9875,  -0.2624,   0.0340,   4.0920
## 192702,   3.9809,   6.3754,   6.4416,   5.0212,   3.5906,   7.9905
## 192703,  -2.2271,  -1.0076,  -3.1144,   1.2856,   0.2556,  -2.5905

Step 4: Parse Monthly Returns

# ── STEP 1: Show first 80 lines so we can see exact file structure ────────────
cat("=== FILE STRUCTURE (first 80 lines) ===\n")
## === FILE STRUCTURE (first 80 lines) ===
writeLines(all_lines[1:min(80, length(all_lines))])
## This file was created using the 202601 CRSP database.
## It contains value- and equal-weighted returns for portfolios formed on ME and BEME.
## 
## The portfolios are constructed at the end of June.  BEME is book value at the last fiscal year end of the prior calendar year
## divided by ME at the end of December of the prior year.
## Annual returns are from January to December.
## 
## Missing data are indicated by -99.99 or -999.
## 
## The break points include utilities and include financials
## 
## The  portfolios  include utilities and include financials.
## 
## 
##   Average Value Weighted Returns -- Monthly
## ,SMALL LoBM,ME1 BM2,SMALL HiBM,BIG LoBM,ME2 BM2,BIG HiBM
## 192607,   1.0866,   0.8807,  -0.1275,   5.5746,   1.9060,   2.0068
## 192608,   0.7831,   1.4677,   5.4422,   2.7268,   2.7028,   5.6834
## 192609,  -2.8045,  -0.0599,  -0.4399,   1.4777,   0.0954,  -0.7872
## 192610,  -4.0289,  -4.3615,  -2.0128,  -3.6327,  -2.3451,  -4.0040
## 192611,   3.2971,   3.6237,   2.0877,   3.2120,   2.9346,   3.1964
## 192612,   2.5645,   1.7773,   3.2700,   2.9011,   2.6210,   2.3073
## 192701,  -0.8099,  -0.2885,   3.9875,  -0.2624,   0.0340,   4.0920
## 192702,   3.9809,   6.3754,   6.4416,   5.0212,   3.5906,   7.9905
## 192703,  -2.2271,  -1.0076,  -3.1144,   1.2856,   0.2556,  -2.5905
## 192704,  -1.3839,   0.8706,   3.4837,   3.0757,  -0.7711,  -0.4894
## 192705,   5.4624,   3.3109,  13.4556,   5.4126,   5.1423,   7.0141
## 192706,  -1.7163,   1.3233,  -4.0890,  -0.9667,  -3.0033,  -2.0282
## 192707,   4.2604,   5.2595,   4.0181,   9.2715,   6.8071,   7.2634
## 192708,   2.4876,  -2.3003,   0.7332,   4.0055,   1.1524,  -1.9351
## 192709,   2.2005,   2.2367,   0.3714,   5.2451,   4.4202,   5.9339
## 192710,  -0.2054,  -2.1686,  -6.0935,  -3.5692,  -4.3578,  -6.6283
## 192711,  10.2824,   7.9528,  10.4641,   6.9718,   6.7451,   6.6385
## 192712,   5.6729,   2.6650,   2.4534,   2.5010,   1.8708,   3.4734
## 192801,   3.1432,   2.6832,   3.6043,  -0.4407,  -0.4021,  -2.3184
## 192802,  -4.8779,  -1.7699,  -4.1263,  -0.6750,  -1.2169,  -2.7041
## 192803,   6.9742,   8.6296,   9.4431,  12.3630,   6.1449,   7.6792
## 192804,   5.0264,  10.7789,   9.9758,   2.9356,   5.9428,   4.4954
## 192805,   6.6955,   1.9725,   2.6118,   2.5704,  -0.0445,   0.6499
## 192806,  -9.1736,  -8.5169,  -6.7990,  -3.6356,  -4.1275,  -5.4462
## 192807,  -0.4122,  -0.3806,  -0.9736,   1.3622,   0.1242,   0.8343
## 192808,   6.1265,   5.0112,   3.7666,   8.7594,   5.4024,   6.8627
## 192809,   5.7391,   4.4691,   6.1849,   3.0057,   1.9523,   4.2356
## 192810,   6.8433,   3.1403,   1.6663,   1.3103,   1.4000,   2.2058
## 192811,   8.6693,  11.1459,  12.1856,  11.1696,  13.3449,  12.9533
## 192812,   0.6750,  -0.7061,  -0.5797,   0.9715,   0.0250,   1.1951
## 192901,   0.2541,   1.9011,   1.7793,   6.5315,   4.8675,   2.5744
## 192902,   0.7412,   1.1291,   0.3953,  -0.5723,   0.7548,   3.0873
## 192903,  -6.2168,  -3.3714,  -4.3437,  -1.3904,   2.3223,  -0.1211
## 192904,  -0.6478,   2.7887,   0.7306,   2.3256,   1.0882,   2.8092
## 192905, -10.2271,  -9.4383, -12.8944,  -5.8502,  -4.1635,  -6.2167
## 192906,   6.8989,   9.5386,   6.4560,  12.4699,   9.3942,   7.4366
## 192907,  -0.0599,  -0.5054,   3.8676,   3.6533,   6.1852,   5.0609
## 192908,  -1.1159,  -0.6976,  -0.4807,   7.2212,  12.5055,   7.2167
## 192909,  -3.9326,  -4.2674,  -3.6070,  -4.4757,  -4.9772,  -6.3774
## 192910, -22.9979, -19.6471, -21.2218, -23.8688, -14.5837, -10.5728
## 192911, -14.6084,  -9.5797, -10.9661, -14.2521, -10.4563,  -7.7931
## 192912,  -2.7372,  -3.1726,  -1.1822,   3.2379,   0.5554,   1.4058
## 193001,   6.0309,   9.5193,   8.4726,   7.3577,   3.3456,   2.8546
## 193002,   1.7589,   1.0717,   4.5687,   3.4688,   1.8817,   1.2148
## 193003,   8.6803,  11.3312,  10.6873,   6.7576,   8.4208,   5.3549
## 193004,  -7.0960,  -1.2542,  -3.4819,  -2.3380,  -1.7620,  -6.6843
## 193005,  -3.6140,  -2.6937,  -2.9869,   0.7015,  -2.2797,  -1.4025
## 193006, -17.9836, -16.4522, -19.0393, -17.6952, -13.1636, -11.8401
## 193007,   6.5234,   3.6401,   2.5703,   4.7126,   3.5511,   5.2714
## 193008,  -3.7811,  -1.6347,  -2.3210,   1.0212,  -0.6697,  -1.6172
## 193009, -13.7870, -14.7431, -19.5014, -12.6858, -10.9676, -17.1584
## 193010, -10.4837,  -7.7113, -10.4086,  -9.2327,  -7.2966, -11.4885
## 193011,   0.6914,  -2.8310,  -2.5025,  -2.1625,  -3.2636,  -6.5961
## 193012, -10.8555, -10.9677, -17.8435,  -6.0820,  -9.1077, -10.4791
## 193101,   9.8496,  11.4123,  16.1640,   4.5259,   7.7855,  14.6935
## 193102,  13.2071,  11.5674,  17.0097,  12.5498,   6.8881,  12.0039
## 193103,  -4.9114,  -2.9493,  -7.2163,  -4.8312,  -9.8107,  -8.9363
## 193104, -12.4275, -13.9744, -18.1804,  -8.8757, -11.2722, -11.7804
## 193105,  -9.6738,  -8.7632, -13.7160, -11.4137, -16.4047, -19.9965
## 193106,  10.2668,   7.9061,  21.3688,  13.0888,  16.1056,  25.3831
## 193107,  -8.5590,  -8.9730,  -5.7544,  -5.1825, -10.1981, -12.3846
## 193108,  -1.5999,  -1.1533,  -3.7563,   1.3018,  -1.6208,   0.0541
## 193109, -27.7413, -32.0012, -35.3242, -28.2664, -28.5717, -35.1664
## 193110,   4.2645,  11.6926,   8.1969,   9.5099,   3.6458,  11.8243
# ── STEP 2: Find every line that is pure data (starts with 6-digit YYYYMM) ───
# We scan the ENTIRE file and tag each line as:
#   "monthly"  -> starts with exactly 6 digits (193001)
#   "annual"   -> starts with exactly 4 digits (1930)
#   "other"    -> header / blank

line_type <- character(length(all_lines))
for (i in seq_along(all_lines)) {
  ln <- trimws(all_lines[i])
  if      (grepl("^\\d{6}(,|\\s)", ln)) line_type[i] <- "monthly"
  else if (grepl("^\\d{4}(,|\\s)", ln)) line_type[i] <- "annual"
  else                                   line_type[i] <- "other"
}

monthly_lines <- which(line_type == "monthly")
cat("Total monthly-format lines in file:", length(monthly_lines), "\n")
## Total monthly-format lines in file: 8672
cat("First monthly line:", monthly_lines[1],
    "->", all_lines[monthly_lines[1]], "\n")
## First monthly line: 17 -> 192607,   1.0866,   0.8807,  -0.1275,   5.5746,   1.9060,   2.0068
cat("Last  monthly line:", monthly_lines[length(monthly_lines)],
    "->", all_lines[monthly_lines[length(monthly_lines)]], "\n")
## Last  monthly line: 8937 -> 202601,   0.1742,   0.1219,   0.0381,   0.1636,   0.0517,   0.0206
# The VW Monthly section is the FIRST contiguous block of monthly lines
# Find where contiguous blocks start/end
block_starts <- monthly_lines[c(TRUE, diff(monthly_lines) > 5)]
cat("\nMonthly data block start lines:\n")
## 
## Monthly data block start lines:
print(block_starts)
## [1]   17 2622 5024 6227 7429 8187
# ── STEP 3: Read the FIRST monthly block (= VW Monthly Returns) ──────────────
block1_start <- block_starts[1]

# Find end of this block
block1_end <- block1_start
for (i in seq(block1_start, length(all_lines))) {
  if (line_type[i] == "monthly") {
    block1_end <- i
  } else if (i > block1_start + 5 && line_type[i] != "monthly") {
    break
  }
}

cat("First monthly block: lines", block1_start, "to", block1_end, "\n")
## First monthly block: lines 17 to 1211
cat("Rows in block:", block1_end - block1_start + 1, "\n\n")
## Rows in block: 1195
# Read those lines
block_lines <- all_lines[block1_start:block1_end]
block_lines <- block_lines[trimws(block_lines) != ""]

# Replace commas with spaces if comma-separated
if (grepl(",", block_lines[1])) {
  block_lines <- gsub(",", " ", block_lines)
  cat("Note: comma-separated file detected, converted to spaces\n")
}
## Note: comma-separated file detected, converted to spaces
df_raw <- read.table(
  text             = paste(block_lines, collapse = "\n"),
  header           = FALSE,
  sep              = "",
  fill             = TRUE,
  stringsAsFactors = FALSE
)

cat("Raw parse:", nrow(df_raw), "rows x", ncol(df_raw), "cols\n")
## Raw parse: 1195 rows x 7 cols
cat("First 6 rows:\n"); print(head(df_raw))
## First 6 rows:
##       V1      V2      V3      V4      V5      V6      V7
## 1 192607  1.0866  0.8807 -0.1275  5.5746  1.9060  2.0068
## 2 192608  0.7831  1.4677  5.4422  2.7268  2.7028  5.6834
## 3 192609 -2.8045 -0.0599 -0.4399  1.4777  0.0954 -0.7872
## 4 192610 -4.0289 -4.3615 -2.0128 -3.6327 -2.3451 -4.0040
## 5 192611  3.2971  3.6237  2.0877  3.2120  2.9346  3.1964
## 6 192612  2.5645  1.7773  3.2700  2.9011  2.6210  2.3073
cat("Last  6 rows:\n"); print(tail(df_raw))
## Last  6 rows:
##          V1      V2     V3      V4      V5      V6     V7
## 1190 202508  5.8316 7.5465  9.2480  1.4601  2.6789 6.8837
## 1191 202509  0.9128 1.8777  0.7438  4.7744  1.4555 2.8479
## 1192 202510  3.0727 1.8025 -1.5039  3.3476 -0.0683 1.7309
## 1193 202511 -0.0397 2.2898  3.4560 -0.7301  2.0080 3.2934
## 1194 202512 -0.1047 0.8103  0.4152 -0.5261  1.0458 3.7871
## 1195 202601  1.5104 6.0917  6.4699 -0.0851  5.1770 2.3511
# ── STEP 4: Clean and name columns ───────────────────────────────────────────
# Take first 7 columns only
df_raw <- df_raw[, 1:min(7, ncol(df_raw))]
colnames(df_raw) <- c("Date", "SL", "SM", "SH", "BL", "BM_col", "BH")

# Convert to numeric
for (col in colnames(df_raw)) {
  df_raw[[col]] <- suppressWarnings(as.numeric(as.character(df_raw[[col]])))
}

# Drop NA dates
df_raw <- df_raw[!is.na(df_raw$Date), ]

# Replace French missing codes
df_raw[df_raw == -99.99] <- NA
df_raw[df_raw == -999]   <- NA

cat("Date range:", min(df_raw$Date, na.rm=TRUE),
    "to", max(df_raw$Date, na.rm=TRUE), "\n")
## Date range: 192607 to 202601
cat("Rows:", nrow(df_raw), "\n")
## Rows: 1195
cat("Non-NA per column:\n"); print(colSums(!is.na(df_raw)))
## Non-NA per column:
##   Date     SL     SM     SH     BL BM_col     BH 
##   1195   1195   1195   1195   1195   1195   1195
# ── STEP 5: Filter to Jan 1930 – Dec 2018 ────────────────────────────────────
df <- df_raw[df_raw$Date >= 193001 & df_raw$Date <= 201812, ]
df <- df[!is.na(df$Date), ]
rownames(df) <- NULL

cat("Final sample:", min(df$Date), "to", max(df$Date), "\n")
## Final sample: 193001 to 201812
cat("Total months:", nrow(df), "\n\n")
## Total months: 1068
cat("First 6 rows:\n"); print(head(df))
## First 6 rows:
##     Date       SL       SM       SH       BL   BM_col       BH
## 1 193001   6.0309   9.5193   8.4726   7.3577   3.3456   2.8546
## 2 193002   1.7589   1.0717   4.5687   3.4688   1.8817   1.2148
## 3 193003   8.6803  11.3312  10.6873   6.7576   8.4208   5.3549
## 4 193004  -7.0960  -1.2542  -3.4819  -2.3380  -1.7620  -6.6843
## 5 193005  -3.6140  -2.6937  -2.9869   0.7015  -2.2797  -1.4025
## 6 193006 -17.9836 -16.4522 -19.0393 -17.6952 -13.1636 -11.8401
cat("Last  6 rows:\n"); print(tail(df))
## Last  6 rows:
##        Date       SL       SM       SH      BL  BM_col       BH
## 1063 201807   1.3884   2.1798   1.0897  3.3500  3.3616   4.6301
## 1064 201808   5.6017   3.3560   2.3368  5.3633  1.7057   0.8053
## 1065 201809  -1.8739  -2.6860  -2.8052  0.5865  0.6445  -1.7273
## 1066 201810 -13.2357 -11.3998  -9.1464 -8.1734 -6.0047  -5.5113
## 1067 201811   0.9163   2.1747   0.8823  1.4905  2.4248   2.2525
## 1068 201812 -11.9577 -11.9172 -12.6035 -8.3917 -9.5629 -11.5893

Step 5: Split Sample Exactly in Half

n    <- nrow(df)
half <- n %/% 2

first_half  <- df[1:half, ]
second_half <- df[(half + 1):n, ]

h1_label <- paste0("First Half (",
                   min(first_half$Date),  "\u2013",
                   max(first_half$Date),  ")")
h2_label <- paste0("Second Half (",
                   min(second_half$Date), "\u2013",
                   max(second_half$Date), ")")

cat(h1_label,  ":", nrow(first_half),  "months\n")
## First Half (193001–197406) : 534 months
cat(h2_label,  ":", nrow(second_half), "months\n")
## Second Half (197407–201812) : 534 months

Step 6: Compute Summary Statistics

portfolios <- c("SL", "SM", "SH", "BL", "BM_col", "BH")
port_names <- c("Small/Low BM", "Small/Mid BM", "Small/High BM",
                "Big/Low BM",   "Big/Mid BM",   "Big/High BM")

# Pure base R: no map_dfr, no tibble inside loop
compute_stats <- function(data, label) {
  result <- data.frame(
    Half      = character(length(portfolios)),
    Portfolio = character(length(portfolios)),
    Mean      = numeric(length(portfolios)),
    SD        = numeric(length(portfolios)),
    Skewness  = numeric(length(portfolios)),
    Kurtosis  = numeric(length(portfolios)),
    stringsAsFactors = FALSE
  )
  for (i in seq_along(portfolios)) {
    x <- as.numeric(data[[portfolios[i]]])
    x_clean <- x[!is.na(x)]
    n <- length(x_clean)
    m <- mean(x_clean)
    s <- sd(x_clean)
    result$Half[i]      <- label
    result$Portfolio[i] <- port_names[i]
    result$Mean[i]      <- round(m, 4)
    result$SD[i]        <- round(s, 4)
    result$Skewness[i]  <- round((sum((x_clean - m)^3) / n) / s^3, 4)
    result$Kurtosis[i]  <- round((sum((x_clean - m)^4) / n) / s^4, 4)
  }
  result
}

stats_h1  <- compute_stats(first_half,  h1_label)
stats_h2  <- compute_stats(second_half, h2_label)
all_stats <- rbind(stats_h1, stats_h2)

cat("=== STATS CHECK ===\n")
## === STATS CHECK ===
print(all_stats)
##                           Half     Portfolio   Mean      SD Skewness Kurtosis
## 1   First Half (193001–197406)  Small/Low BM 0.9713  8.2253   1.1767  12.0265
## 2   First Half (193001–197406)  Small/Mid BM 1.1695  8.4229   1.5753  15.6815
## 3   First Half (193001–197406) Small/High BM 1.4844 10.2059   2.2811  20.0009
## 4   First Half (193001–197406)    Big/Low BM 0.7648  5.7095   0.1778   9.8570
## 5   First Half (193001–197406)    Big/Mid BM 0.8118  6.7341   1.7068  20.4583
## 6   First Half (193001–197406)   Big/High BM 1.1874  8.9106   1.7645  17.4029
## 7  Second Half (197407–201812)  Small/Low BM 0.9959  6.6884  -0.4074   5.1394
## 8  Second Half (197407–201812)  Small/Mid BM 1.3548  5.2817  -0.5315   6.4006
## 9  Second Half (197407–201812) Small/High BM 1.4251  5.4987  -0.4631   7.2780
## 10 Second Half (197407–201812)    Big/Low BM 0.9781  4.6955  -0.3328   4.9738
## 11 Second Half (197407–201812)    Big/Mid BM 1.0578  4.3391  -0.4716   5.6322
## 12 Second Half (197407–201812)   Big/High BM 1.1446  4.8871  -0.5158   5.7837

Step 7: Statistics Tables

First Half

kable(stats_h1[, -1],
      caption = h1_label, align = "lcccc", digits = 4) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, position = "left") |>
  row_spec(0, bold = TRUE, background = "#2C3E50", color = "white")
First Half (193001–197406)
Portfolio Mean SD Skewness Kurtosis
Small/Low BM 0.9713 8.2253 1.1767 12.0265
Small/Mid BM 1.1695 8.4229 1.5753 15.6815
Small/High BM 1.4844 10.2059 2.2811 20.0009
Big/Low BM 0.7648 5.7095 0.1778 9.8570
Big/Mid BM 0.8118 6.7341 1.7068 20.4583
Big/High BM 1.1874 8.9106 1.7645 17.4029

Second Half

kable(stats_h2[, -1],
      caption = h2_label, align = "lcccc", digits = 4) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, position = "left") |>
  row_spec(0, bold = TRUE, background = "#2C3E50", color = "white")
Second Half (197407–201812)
Portfolio Mean SD Skewness Kurtosis
Small/Low BM 0.9959 6.6884 -0.4074 5.1394
Small/Mid BM 1.3548 5.2817 -0.5315 6.4006
Small/High BM 1.4251 5.4987 -0.4631 7.2780
Big/Low BM 0.9781 4.6955 -0.3328 4.9738
Big/Mid BM 1.0578 4.3391 -0.4716 5.6322
Big/High BM 1.1446 4.8871 -0.5158 5.7837

Side-by-Side Comparison

# Reshape with base R reshape()
wide <- reshape(all_stats,
                idvar     = c("Portfolio", "Half"),
                timevar   = "Half",
                direction = "wide")

# Simpler: just print both side-by-side manually
combined <- merge(
  stats_h1[, c("Portfolio","Mean","SD","Skewness","Kurtosis")],
  stats_h2[, c("Portfolio","Mean","SD","Skewness","Kurtosis")],
  by = "Portfolio", suffixes = c("_H1", "_H2")
)

kable(combined,
      caption = "First vs. Second Half Comparison",
      digits = 4, align = "lcccccccc") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) |>
  row_spec(0, bold = TRUE, background = "#2C3E50", color = "white")
First vs. Second Half Comparison
Portfolio Mean_H1 SD_H1 Skewness_H1 Kurtosis_H1 Mean_H2 SD_H2 Skewness_H2 Kurtosis_H2
Big/High BM 1.1874 8.9106 1.7645 17.4029 1.1446 4.8871 -0.5158 5.7837
Big/Low BM 0.7648 5.7095 0.1778 9.8570 0.9781 4.6955 -0.3328 4.9738
Big/Mid BM 0.8118 6.7341 1.7068 20.4583 1.0578 4.3391 -0.4716 5.6322
Small/High BM 1.4844 10.2059 2.2811 20.0009 1.4251 5.4987 -0.4631 7.2780
Small/Low BM 0.9713 8.2253 1.1767 12.0265 0.9959 6.6884 -0.4074 5.1394
Small/Mid BM 1.1695 8.4229 1.5753 15.6815 1.3548 5.2817 -0.5315 6.4006

Step 8: Charts

ggplot(all_stats, aes(x = Portfolio, y = Mean, fill = Half)) +
  geom_col(position = "dodge", width = 0.7, color = "white") +
  geom_text(aes(label = round(Mean, 2)),
            position = position_dodge(0.7), vjust = -0.4, size = 3.2) +
  scale_fill_manual(values = c("#2980B9", "#E74C3C")) +
  labs(title = "Mean Monthly Return (%) - First vs. Second Half",
       subtitle = "6 Portfolios: Size x Book-to-Market",
       x = NULL, y = "Mean Return (%)", fill = NULL) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom", plot.title = element_text(face = "bold"))

ggplot(all_stats, aes(x = Portfolio, y = SD, fill = Half)) +
  geom_col(position = "dodge", width = 0.7, color = "white") +
  geom_text(aes(label = round(SD, 2)),
            position = position_dodge(0.7), vjust = -0.4, size = 3.2) +
  scale_fill_manual(values = c("#2980B9", "#E74C3C")) +
  labs(title = "Standard Deviation (%) - First vs. Second Half",
       subtitle = "6 Portfolios: Size x Book-to-Market",
       x = NULL, y = "Std Dev (%)", fill = NULL) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom", plot.title = element_text(face = "bold"))

ggplot(all_stats, aes(x = Portfolio, y = Skewness, fill = Half)) +
  geom_col(position = "dodge", width = 0.7, color = "white") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
  geom_text(aes(label = round(Skewness, 2)),
            position = position_dodge(0.7),
            vjust = ifelse(all_stats$Skewness >= 0, -0.4, 1.3), size = 3.2) +
  scale_fill_manual(values = c("#2980B9", "#E74C3C")) +
  labs(title = "Skewness - First vs. Second Half",
       subtitle = "6 Portfolios: Size x Book-to-Market",
       x = NULL, y = "Skewness", fill = NULL) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom", plot.title = element_text(face = "bold"))

ggplot(all_stats, aes(x = Portfolio, y = Kurtosis, fill = Half)) +
  geom_col(position = "dodge", width = 0.7, color = "white") +
  geom_hline(yintercept = 3, linetype = "dashed", color = "gray40", linewidth = 0.8) +
  annotate("text", x = 0.6, y = 3.3, label = "Normal = 3", size = 3.2, color = "gray40") +
  geom_text(aes(label = round(Kurtosis, 2)),
            position = position_dodge(0.7), vjust = -0.4, size = 3.2) +
  scale_fill_manual(values = c("#2980B9", "#E74C3C")) +
  labs(title = "Kurtosis - First vs. Second Half",
       subtitle = "6 Portfolios: Size x Book-to-Market",
       x = NULL, y = "Kurtosis", fill = NULL) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom", plot.title = element_text(face = "bold"))

# Helper: 6-digit YYYYMM integer -> Date
yyyymm_to_date <- function(x) {
  yr <- as.integer(x) %/% 100
  mo <- as.integer(x) %%  100
  as.Date(sprintf("%04d-%02d-01", yr, mo))
}

# Safe rolling mean: works on numeric vector, ignores NA positions
roll_mean_safe <- function(x, k = 12) {
  x   <- as.numeric(x)          # ensure numeric
  n   <- length(x)
  out <- rep(NA_real_, n)
  if (n < k) return(out)        # not enough data
  for (i in seq(k, n)) {
    start <- i - k + 1
    if (start >= 1) {
      vals <- x[start:i]
      out[i] <- mean(vals[!is.na(vals)])
    }
  }
  out
}

# Build plot data with base R (no mutate, no pipes)
plot_df             <- as.data.frame(df)
plot_df$date_parsed <- yyyymm_to_date(plot_df$Date)
plot_df$SL_clean    <- as.numeric(plot_df$SL)
plot_df$SL_roll12   <- roll_mean_safe(plot_df$SL_clean, k = 12)

# Midpoint date
midpoint_date <- yyyymm_to_date(df$Date[half])

ggplot(plot_df, aes(x = date_parsed)) +
  geom_line(aes(y = SL_clean), color = "steelblue",
            alpha = 0.35, linewidth = 0.4, na.rm = TRUE) +
  geom_line(aes(y = SL_roll12), color = "#E74C3C",
            linewidth = 1.0, na.rm = TRUE) +
  geom_vline(xintercept = as.numeric(midpoint_date),
             linetype = "dashed", color = "black", linewidth = 0.9) +
  annotate("text",
           x = midpoint_date,
           y = max(plot_df$SL_clean, na.rm = TRUE) * 0.80,
           label = " <- First Half | Second Half ->",
           hjust = 0, size = 3.5) +
  labs(title    = "Small/Low BM - Monthly Returns (Full Period)",
       subtitle = "Red = 12-month rolling average | Dashed = sample midpoint",
       x = NULL, y = "Return (%)",
       caption  = "Source: Ken French Data Library") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))



Interpretation

tibble(
  Statistic         = c("Mean", "Std Dev", "Skewness", "Kurtosis"),
  `First Half`      = c("Lower / more varied across portfolios",
                        "Substantially higher",
                        "More negative (left-skewed)",
                        "Much higher (fat tails)"),
  `Second Half`     = c("Higher and more stable",
                        "Lower",
                        "Less negative or near zero",
                        "Closer to 3 (normal)"),
  `Implication`     = c("Different expected returns across eras",
                        "Great Depression & WWII extreme volatility",
                        "Tail risk not symmetric across periods",
                        "More extreme outliers in early period")
) |>
  kable(align = "lccc") |>
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = TRUE) |>
  row_spec(0, bold = TRUE, background = "#2C3E50", color = "white")
Statistic First Half Second Half Implication
Mean Lower / more varied across portfolios Higher and more stable Different expected returns across eras
Std Dev Substantially higher Lower Great Depression & WWII extreme volatility
Skewness More negative (left-skewed) Less negative or near zero Tail risk not symmetric across periods
Kurtosis Much higher (fat tails) Closer to 3 (normal) More extreme outliers in early period

Conclusion: The statistics strongly suggest that the six portfolios do not come from the same distribution over the full 1930–2018 period. The first half (which spans the Great Depression, WWII, and early post-war recovery) shows materially higher volatility, more negative skewness, and heavier tails than the second half. An investor who assumed a single stationary distribution across this entire period would significantly mismeasure expected return and risk.



CFA Problem 1 — Expected Risk Premium

Problem Statement

Given $100,000 to invest, what is the expected risk premium in dollars of investing in equities versus risk-free T-bills?

Action Probability Expected Return
Invest in equities 0.6 +$50,000
Invest in equities 0.4 −$30,000
Invest in risk-free T-bill 1.0 +$5,000

Solution

# ── Given values ─────────────────────────────────────────────────────────────
p_gain  <- 0.6        # Probability equity gains
r_gain  <-  50000     # Payoff if gain  ($)
p_loss  <- 0.4        # Probability equity loses
r_loss  <- -30000     # Payoff if loss  ($)
r_tbill <-   5000     # Risk-free T-bill payoff ($)

# ── Step 1: Expected Return on Equities ──────────────────────────────────────
E_equity <- p_gain * r_gain + p_loss * r_loss

cat("=== Step 1: Expected Return on Equities ===\n")
## === Step 1: Expected Return on Equities ===
cat(sprintf("  E[R_equity] = %.1f x $%s  +  %.1f x ($%s)\n",
            p_gain, format( r_gain, big.mark = ","),
            p_loss, format(-r_loss, big.mark = ",")))
##   E[R_equity] = 0.6 x $50,000  +  0.4 x ($30,000)
cat(sprintf("             = $%s - $%s\n",
            format(p_gain * r_gain,  big.mark = ","),
            format(-p_loss * r_loss, big.mark = ",")))
##              = $30,000 - $12,000
cat(sprintf("             = $%s\n\n", format(E_equity, big.mark = ",")))
##              = $18,000
# ── Step 2: Expected Return on T-Bills ───────────────────────────────────────
E_tbill <- 1.0 * r_tbill

cat("=== Step 2: Expected Return on T-Bills ===\n")
## === Step 2: Expected Return on T-Bills ===
cat(sprintf("  E[R_tbill]  = 1.0 x $%s\n",
            format(r_tbill, big.mark = ",")))
##   E[R_tbill]  = 1.0 x $5,000
cat(sprintf("              = $%s\n\n", format(E_tbill, big.mark = ",")))
##               = $5,000
# ── Step 3: Risk Premium ─────────────────────────────────────────────────────
risk_premium <- E_equity - E_tbill

cat("=== Step 3: Risk Premium ===\n")
## === Step 3: Risk Premium ===
cat(sprintf("  Risk Premium = E[R_equity] - E[R_tbill]\n"))
##   Risk Premium = E[R_equity] - E[R_tbill]
cat(sprintf("               = $%s - $%s\n",
            format(E_equity, big.mark = ","),
            format(E_tbill,  big.mark = ",")))
##                = $18,000 - $5,000
cat("  ============================================\n")
##   ============================================
cat(sprintf("  Risk Premium = $%s\n", format(risk_premium, big.mark = ",")))
##   Risk Premium = $13,000

Visual Summary

cfa_df <- tibble(
  Strategy = factor(
    c("Equity Gain\n(p = 0.6)", "Equity Loss\n(p = 0.4)",
      "Expected\nEquity Return", "Risk-Free\nT-Bill", "Risk\nPremium"),
    levels = c("Equity Gain\n(p = 0.6)", "Equity Loss\n(p = 0.4)",
               "Expected\nEquity Return", "Risk-Free\nT-Bill",
               "Risk\nPremium")),
  Value = c(50000, -30000, 18000, 5000, 13000),
  Type  = c("Scenario", "Scenario", "Expected", "Expected", "Premium")
)

cfa_df |>
  ggplot(aes(x = Strategy, y = Value, fill = Type)) +
  geom_col(width = 0.6, color = "white", linewidth = 0.8) +
  geom_text(aes(label = paste0("$", format(Value, big.mark = ","))),
            vjust      = ifelse(cfa_df$Value >= 0, -0.4, 1.3),
            fontface   = "bold", size = 4.0) +
  geom_hline(yintercept = 0, linewidth = 0.5) +
  scale_fill_manual(values = c("Scenario" = "#95A5A6",
                               "Expected" = "#2980B9",
                               "Premium"  = "#27AE60")) +
  scale_y_continuous(labels = scales::dollar_format(),
                     limits = c(-36000, 60000)) +
  labs(title    = "CFA Problem 1 — Expected Risk Premium",
       subtitle = "Equities vs. Risk-Free T-Bills on a $100,000 Investment",
       x = NULL, y = "Dollar Return ($)", fill = NULL) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "bottom",
        plot.title      = element_text(face = "bold"))


Formula

\[E[R_{\text{equity}}] = 0.6 \times \$50{,}000 + 0.4 \times (-\$30{,}000) = \$30{,}000 - \$12{,}000 = \mathbf{\$18{,}000}\]

\[E[R_{T\text{-bill}}] = 1.0 \times \$5{,}000 = \mathbf{\$5{,}000}\]

\[\boxed{ \text{Risk Premium} = \$18{,}000 - \$5{,}000 = \mathbf{\$13{,}000} }\]

Answer: The expected risk premium is $13,000 (13 % on the $100,000 investment). This compensates the investor for bearing the risk of a $30,000 loss (40 % probability) instead of locking in the guaranteed $5,000 T-bill return.