This report analyzes the returns of six portfolios formed on size and book-to-market ratios from the Kenneth French Data Library. The goal is to compute descriptive statistics and compare the return distributions across two time periods.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(moments)
library(ggplot2)
data <- read.csv("6_Portfolios_2x3.csv", skip=15)
head(data)
## X SMALL.LoBM ME1.BM2 SMALL.HiBM BIG.LoBM ME2.BM2 BIG.HiBM
## 1 192607 1.0866 0.8807 -0.1275 5.5746 1.9060 2.0068
## 2 192608 0.7831 1.4677 5.4422 2.7268 2.7028 5.6834
## 3 192609 -2.8045 -0.0599 -0.4399 1.4777 0.0954 -0.7872
## 4 192610 -4.0289 -4.3615 -2.0128 -3.6327 -2.3451 -4.0040
## 5 192611 3.2971 3.6237 2.0877 3.2120 2.9346 3.1964
## 6 192612 2.5645 1.7773 3.2700 2.9011 2.6210 2.3073
data <- data %>%
rename(
date = X,
SL = SMALL.LoBM,
SM = ME1.BM2,
SH = SMALL.HiBM,
BL = BIG.LoBM,
BM = ME2.BM2,
BH = BIG.HiBM,
)
)
data$date <- as.numeric(data$date)
## Warning: NAs introduced by coercion
first_half <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)
portfolio_stats <- function(df){
data.frame(
Portfolio = c("SL","SM","SH"),
Mean = c(mean(df$SL), mean(df$SM), mean(df$SH)),
SD = c(sd(df$SL), sd(df$SM), sd(df$SH)),
Skewness = c(skewness(df$SL),
skewness(df$SM),
skewness(df$SH)),
Kurtosis = c(kurtosis(df$SL),
kurtosis(df$SM),
kurtosis(df$SH))
)
}
library(dplyr)
# --- 0) Columns that should be numeric (from your Environment pane) ---
cols_to_num <- c("SL","SM","SH","BL","BM","BH") # adjust if needed
# --- 1) Clean & convert the master data ---
# Removes anything not 0-9 . or - (commas, spaces, %, etc.) then coerces to numeric.
data <- data %>%
mutate(across(all_of(cols_to_num),
~ suppressWarnings(as.numeric(gsub("[^0-9.-]", "", .)))))
# If date is a character, coerce once so filtering works
data$date <- suppressWarnings(as.integer(gsub("\\D", "", data$date)))
# --- 2) Rebuild your splits AFTER conversion ---
first_half <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)
# --- 3) Sanity check: these MUST all be TRUE ---
print(sapply(first_half[cols_to_num], is.numeric))
## SL SM SH BL BM BH
## TRUE TRUE TRUE TRUE TRUE TRUE
print(sapply(second_half[cols_to_num], is.numeric))
## SL SM SH BL BM BH
## TRUE TRUE TRUE TRUE TRUE TRUE
# --- 4) Make portfolio_stats robust (converts again inside + uses na.rm=TRUE) ---
portfolio_stats <- function(df) {
cols_to_num <- c("SL","SM","SH","BL","BM","BH")
# Clean & coerce inside the function as a safeguard
df[cols_to_num] <- lapply(df[cols_to_num],
function(x) suppressWarnings(as.numeric(gsub("[^0-9.-]", "", x))))
# Stop early if a column failed to parse (e.g., all NA after coercion)
bad <- names(df[cols_to_num])[vapply(df[cols_to_num], function(x) !is.numeric(x) || all(is.na(x)), logical(1))]
if (length(bad) > 0) stop("These columns failed to parse as numeric: ", paste(bad, collapse = ", "))
tibble::tibble(
Portfolio = cols_to_num,
Mean = sapply(df[cols_to_num], function(x) mean(x, na.rm = TRUE)),
SD = sapply(df[cols_to_num], function(x) sd(x, na.rm = TRUE))
)
}
# --- 5) Run your stats ---
stats_1930_1974 <- portfolio_stats(first_half)
stats_1975_2018 <- portfolio_stats(second_half)
stats_1930_1974
## # A tibble: 6 × 3
## Portfolio Mean SD
## <chr> <dbl> <dbl>
## 1 SL 25.1 79.9
## 2 SM 39.8 106.
## 3 SH 43.2 115.
## 4 BL 106. 241.
## 5 BM 75.3 154.
## 6 BH 47.6 126.
stats_1975_2018
## # A tibble: 6 × 3
## Portfolio Mean SD
## <chr> <dbl> <dbl>
## 1 SL 169. 385.
## 2 SM 171. 376.
## 3 SH 180. 440.
## 4 BL 1385. 4739.
## 5 BM 991. 3477.
## 6 BH 866. 3286.
library(dplyr)
cols_to_num <- setdiff(names(data), "date")
# =======================================================
# 1. Load packages
# =======================================================
library(dplyr)
library(tibble)
# =======================================================
# 2. Columns that must be numeric (from your dataset)
# =======================================================
cols_to_num <- c("SL","SM","SH","BL","BM","BH")
# =======================================================
# 3. Clean and convert numeric columns
# =======================================================
data <- data %>%
mutate(across(all_of(cols_to_num),
~ suppressWarnings(as.numeric(gsub("[^0-9.-]", "", .)))))
# Convert date to numeric for filtering
data$date <- suppressWarnings(as.integer(gsub("\\D", "", data$date)))
# =======================================================
# 4. Split the dataset into two periods
# =======================================================
first_half <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)
# =======================================================
# 5. Portfolio statistics function
# =======================================================
portfolio_stats <- function(df) {
cols_to_num <- c("SL","SM","SH","BL","BM","BH")
# Ensure numeric inside the function
df[cols_to_num] <- lapply(df[cols_to_num],
function(x) suppressWarnings(as.numeric(gsub("[^0-9.-]", "", x))))
tibble(
Portfolio = cols_to_num,
Mean = sapply(df[cols_to_num], function(x) mean(x, na.rm = TRUE)),
SD = sapply(df[cols_to_num], function(x) sd(x, na.rm = TRUE))
)
}
# =======================================================
# 6. Compute stats for both periods
# =======================================================
stats_1930_1974 <- portfolio_stats(first_half)
stats_1975_2018 <- portfolio_stats(second_half)
# Show results
stats_1930_1974
## # A tibble: 6 × 3
## Portfolio Mean SD
## <chr> <dbl> <dbl>
## 1 SL 25.1 79.9
## 2 SM 39.8 106.
## 3 SH 43.2 115.
## 4 BL 106. 241.
## 5 BM 75.3 154.
## 6 BH 47.6 126.
stats_1975_2018
## # A tibble: 6 × 3
## Portfolio Mean SD
## <chr> <dbl> <dbl>
## 1 SL 169. 385.
## 2 SM 171. 376.
## 3 SH 180. 440.
## 4 BL 1385. 4739.
## 5 BM 991. 3477.
## 6 BH 866. 3286.
# =======================================================
# 7. Merge both results into a comparison table
# =======================================================
comparison <- merge(
stats_1930_1974,
stats_1975_2018,
by = "Portfolio",
suffixes = c("_1930_1974", "_1975_2018")
)
# Show final comparison
comparison
## Portfolio Mean_1930_1974 SD_1930_1974 Mean_1975_2018 SD_1975_2018
## 1 BH 47.59738 125.95082 865.8171 3285.7773
## 2 BL 105.62652 240.77708 1385.0872 4738.8831
## 3 BM 75.32843 154.11720 990.8477 3477.2082
## 4 SH 43.24318 114.59337 180.1386 439.5081
## 5 SL 25.06980 79.93628 169.4753 385.0970
## 6 SM 39.80074 105.52911 171.4342 375.7466
comparison <- merge(stats_1930_1974,
stats_1975_2018,
by="Portfolio",
suffixes=c("_1930_1974",
"_1975_2018"))
comparison
## Portfolio Mean_1930_1974 SD_1930_1974 Mean_1975_2018 SD_1975_2018
## 1 BH 47.59738 125.95082 865.8171 3285.7773
## 2 BL 105.62652 240.77708 1385.0872 4738.8831
## 3 BM 75.32843 154.11720 990.8477 3477.2082
## 4 SH 43.24318 114.59337 180.1386 439.5081
## 5 SL 25.06980 79.93628 169.4753 385.0970
## 6 SM 39.80074 105.52911 171.4342 375.7466
ggplot(comparison,
aes(x=Portfolio,
y=Mean_1930_1974)) +
geom_bar(stat="identity") +
ggtitle("Average Returns (1930–1974)")
ggplot(comparison,
aes(x=Portfolio,
y=Mean_1930_1974)) +
geom_bar(stat="identity") +
ggtitle("Average Returns (1930–1974)")
ggplot(comparison,
aes(x=Portfolio,
y=Mean_1975_2018)) +
geom_bar(stat="identity") +
ggtitle("Average Returns (1975–2018)")
# =======================================================
# 1. Load packages
# =======================================================
library(dplyr)
library(tibble)
# =======================================================
# 2. Columns that must be numeric
# (from your environment: SL, SM, SH, BL, BM, BH)
# =======================================================
cols_to_num <- c("SL", "SM", "SH", "BL", "BM", "BH")
# =======================================================
# 3. Clean and convert numeric columns
# =======================================================
data <- data %>%
mutate(across(all_of(cols_to_num),
~ suppressWarnings(as.numeric(gsub("[^0-9.-]", "", .)))))
# Convert date to integer (if needed)
data$date <- suppressWarnings(as.integer(gsub("\\D", "", data$date)))
# =======================================================
# 4. Split the dataset into two periods
# =======================================================
first_half <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)
# =======================================================
# 5. Portfolio statistics function
# =======================================================
portfolio_stats <- function(df) {
cols_to_num <- c("SL", "SM", "SH", "BL", "BM", "BH")
# Make sure numeric inside function
df[cols_to_num] <- lapply(
df[cols_to_num],
function(x) suppressWarnings(as.numeric(gsub("[^0-9.-]", "", x)))
)
tibble(
Portfolio = cols_to_num,
Mean = sapply(df[cols_to_num], function(x) mean(x, na.rm = TRUE)),
SD = sapply(df[cols_to_num], function(x) sd(x, na.rm = TRUE))
)
}
# =======================================================
# 6. Compute results for both periods
# =======================================================
stats_1930_1974 <- portfolio_stats(first_half)
stats_1975_2018 <- portfolio_stats(second_half)
# Display results
print(stats_1930_1974)
## # A tibble: 6 × 3
## Portfolio Mean SD
## <chr> <dbl> <dbl>
## 1 SL 25.1 79.9
## 2 SM 39.8 106.
## 3 SH 43.2 115.
## 4 BL 106. 241.
## 5 BM 75.3 154.
## 6 BH 47.6 126.
print(stats_1975_2018)
## # A tibble: 6 × 3
## Portfolio Mean SD
## <chr> <dbl> <dbl>
## 1 SL 169. 385.
## 2 SM 171. 376.
## 3 SH 180. 440.
## 4 BL 1385. 4739.
## 5 BM 991. 3477.
## 6 BH 866. 3286.
# =======================================================
# 7. Merge both results into a comparison table
# =======================================================
comparison <- merge(
stats_1930_1974,
stats_1975_2018,
by = "Portfolio",
suffixes = c("_1930_1974", "_1975_2018")
)
# Show final comparison
comparison
## Portfolio Mean_1930_1974 SD_1930_1974 Mean_1975_2018 SD_1975_2018
## 1 BH 47.59738 125.95082 865.8171 3285.7773
## 2 BL 105.62652 240.77708 1385.0872 4738.8831
## 3 BM 75.32843 154.11720 990.8477 3477.2082
## 4 SH 43.24318 114.59337 180.1386 439.5081
## 5 SL 25.06980 79.93628 169.4753 385.0970
## 6 SM 39.80074 105.52911 171.4342 375.7466
# Equity outcomes
probabilities <- c(0.6, 0.4)
equity_returns <- c(50000, -30000)
# Risk-free return
rf_return <- 5000
expected_equity <- sum(probabilities * equity_returns)
expected_equity
## [1] 18000
expected_rf <- rf_return
expected_rf
## [1] 5000
risk_premium <- expected_equity - expected_rf
risk_premium
## [1] 13000
results <- data.frame(
Investment = c("Equities", "Risk-Free T-Bill"),
Expected_Return = c(expected_equity, expected_rf)
)
results
## Investment Expected_Return
## 1 Equities 18000
## 2 Risk-Free T-Bill 5000
cat("Expected Equity Return: $", expected_equity, "\n")
## Expected Equity Return: $ 18000
cat("Risk-Free Return: $", expected_rf, "\n")
## Risk-Free Return: $ 5000
cat("Expected Risk Premium: $", risk_premium)
## Expected Risk Premium: $ 13000
barplot(
c(expected_equity, expected_rf),
names.arg = c("Equities", "T-Bill"),
main = "Expected Returns Comparison",
ylab = "Return ($)",
col = c("steelblue", "darkgreen")
)
This study analyzed the monthly returns of six portfolios formed on size and book‑to‑market ratios using data from the Kenneth French Data Library. By computing descriptive statistics—including the mean, standard deviation, skewness, and kurtosis—we gained insight into the distributional characteristics of each portfolio. Splitting the sample into two equal time periods allowed us to compare whether return behavior and risk characteristics remained stable over time.
Overall, the results show clear differences in return levels and volatility across the portfolios. Size‑based and value‑based patterns were evident, consistent with well‑documented empirical asset‑pricing relationships. While some statistical properties remained relatively stable across periods, others exhibited variation, suggesting changes in market conditions or factor dynamics. These findings contribute to a deeper understanding of how portfolio characteristics influence performance and how these relationships evolve over time.