1. Introduction

This report analyzes the returns of six portfolios formed on size and book-to-market ratios from the Kenneth French Data Library. The goal is to compute descriptive statistics and compare the return distributions across two time periods.

2. Load Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(moments)
library(ggplot2)
data <- read.csv("6_Portfolios_2x3.csv", skip=15)
head(data)
##        X SMALL.LoBM   ME1.BM2 SMALL.HiBM  BIG.LoBM   ME2.BM2  BIG.HiBM
## 1 192607     1.0866    0.8807    -0.1275    5.5746    1.9060    2.0068
## 2 192608     0.7831    1.4677     5.4422    2.7268    2.7028    5.6834
## 3 192609    -2.8045   -0.0599    -0.4399    1.4777    0.0954   -0.7872
## 4 192610    -4.0289   -4.3615    -2.0128   -3.6327   -2.3451   -4.0040
## 5 192611     3.2971    3.6237     2.0877    3.2120    2.9346    3.1964
## 6 192612     2.5645    1.7773     3.2700    2.9011    2.6210    2.3073
data <- data %>%
 rename(
    date = X,
    SL = SMALL.LoBM,
    SM = ME1.BM2,
    SH = SMALL.HiBM,
    BL = BIG.LoBM,
    BM = ME2.BM2,
    BH = BIG.HiBM,
  )

)

data$date <- as.numeric(data$date)
## Warning: NAs introduced by coercion
first_half <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)
portfolio_stats <- function(df){

  data.frame(
    Portfolio = c("SL","SM","SH"),

    Mean = c(mean(df$SL), mean(df$SM), mean(df$SH)),

    SD = c(sd(df$SL), sd(df$SM), sd(df$SH)),

    Skewness = c(skewness(df$SL),
                 skewness(df$SM),
                 skewness(df$SH)),

    Kurtosis = c(kurtosis(df$SL),
                 kurtosis(df$SM),
                 kurtosis(df$SH))
  )
}
library(dplyr)

# --- 0) Columns that should be numeric (from your Environment pane) ---
cols_to_num <- c("SL","SM","SH","BL","BM","BH")  # adjust if needed

# --- 1) Clean & convert the master data ---
# Removes anything not 0-9 . or - (commas, spaces, %, etc.) then coerces to numeric.
data <- data %>%
  mutate(across(all_of(cols_to_num),
                ~ suppressWarnings(as.numeric(gsub("[^0-9.-]", "", .)))))

# If date is a character, coerce once so filtering works
data$date <- suppressWarnings(as.integer(gsub("\\D", "", data$date)))

# --- 2) Rebuild your splits AFTER conversion ---
first_half  <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)

# --- 3) Sanity check: these MUST all be TRUE ---
print(sapply(first_half[cols_to_num], is.numeric))
##   SL   SM   SH   BL   BM   BH 
## TRUE TRUE TRUE TRUE TRUE TRUE
print(sapply(second_half[cols_to_num], is.numeric))
##   SL   SM   SH   BL   BM   BH 
## TRUE TRUE TRUE TRUE TRUE TRUE
# --- 4) Make portfolio_stats robust (converts again inside + uses na.rm=TRUE) ---
portfolio_stats <- function(df) {
  cols_to_num <- c("SL","SM","SH","BL","BM","BH")

  # Clean & coerce inside the function as a safeguard
  df[cols_to_num] <- lapply(df[cols_to_num],
                            function(x) suppressWarnings(as.numeric(gsub("[^0-9.-]", "", x))))

  # Stop early if a column failed to parse (e.g., all NA after coercion)
  bad <- names(df[cols_to_num])[vapply(df[cols_to_num], function(x) !is.numeric(x) || all(is.na(x)), logical(1))]
  if (length(bad) > 0) stop("These columns failed to parse as numeric: ", paste(bad, collapse = ", "))

  tibble::tibble(
    Portfolio = cols_to_num,
    Mean = sapply(df[cols_to_num], function(x) mean(x, na.rm = TRUE)),
    SD   = sapply(df[cols_to_num], function(x) sd(x,   na.rm = TRUE))
  )
}

# --- 5) Run your stats ---
stats_1930_1974 <- portfolio_stats(first_half)
stats_1975_2018 <- portfolio_stats(second_half)

stats_1930_1974
## # A tibble: 6 × 3
##   Portfolio  Mean    SD
##   <chr>     <dbl> <dbl>
## 1 SL         25.1  79.9
## 2 SM         39.8 106. 
## 3 SH         43.2 115. 
## 4 BL        106.  241. 
## 5 BM         75.3 154. 
## 6 BH         47.6 126.
stats_1975_2018
## # A tibble: 6 × 3
##   Portfolio  Mean    SD
##   <chr>     <dbl> <dbl>
## 1 SL         169.  385.
## 2 SM         171.  376.
## 3 SH         180.  440.
## 4 BL        1385. 4739.
## 5 BM         991. 3477.
## 6 BH         866. 3286.
library(dplyr)
cols_to_num <- setdiff(names(data), "date")

# =======================================================
# 1. Load packages
# =======================================================
library(dplyr)
library(tibble)

# =======================================================
# 2. Columns that must be numeric (from your dataset)
# =======================================================
cols_to_num <- c("SL","SM","SH","BL","BM","BH")

# =======================================================
# 3. Clean and convert numeric columns
# =======================================================
data <- data %>%
  mutate(across(all_of(cols_to_num),
                ~ suppressWarnings(as.numeric(gsub("[^0-9.-]", "", .)))))

# Convert date to numeric for filtering
data$date <- suppressWarnings(as.integer(gsub("\\D", "", data$date)))

# =======================================================
# 4. Split the dataset into two periods
# =======================================================
first_half  <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)

# =======================================================
# 5. Portfolio statistics function
# =======================================================
portfolio_stats <- function(df) {
  
  cols_to_num <- c("SL","SM","SH","BL","BM","BH")
  
  # Ensure numeric inside the function
  df[cols_to_num] <- lapply(df[cols_to_num],
                            function(x) suppressWarnings(as.numeric(gsub("[^0-9.-]", "", x))))
  
  tibble(
    Portfolio = cols_to_num,
    Mean = sapply(df[cols_to_num], function(x) mean(x, na.rm = TRUE)),
    SD   = sapply(df[cols_to_num], function(x) sd(x,   na.rm = TRUE))
  )
}

# =======================================================
# 6. Compute stats for both periods
# =======================================================
stats_1930_1974 <- portfolio_stats(first_half)
stats_1975_2018 <- portfolio_stats(second_half)

# Show results
stats_1930_1974
## # A tibble: 6 × 3
##   Portfolio  Mean    SD
##   <chr>     <dbl> <dbl>
## 1 SL         25.1  79.9
## 2 SM         39.8 106. 
## 3 SH         43.2 115. 
## 4 BL        106.  241. 
## 5 BM         75.3 154. 
## 6 BH         47.6 126.
stats_1975_2018
## # A tibble: 6 × 3
##   Portfolio  Mean    SD
##   <chr>     <dbl> <dbl>
## 1 SL         169.  385.
## 2 SM         171.  376.
## 3 SH         180.  440.
## 4 BL        1385. 4739.
## 5 BM         991. 3477.
## 6 BH         866. 3286.
# =======================================================
# 7. Merge both results into a comparison table
# =======================================================
comparison <- merge(
  stats_1930_1974,
  stats_1975_2018,
  by = "Portfolio",
  suffixes = c("_1930_1974", "_1975_2018")
)

# Show final comparison
comparison
##   Portfolio Mean_1930_1974 SD_1930_1974 Mean_1975_2018 SD_1975_2018
## 1        BH       47.59738    125.95082       865.8171    3285.7773
## 2        BL      105.62652    240.77708      1385.0872    4738.8831
## 3        BM       75.32843    154.11720       990.8477    3477.2082
## 4        SH       43.24318    114.59337       180.1386     439.5081
## 5        SL       25.06980     79.93628       169.4753     385.0970
## 6        SM       39.80074    105.52911       171.4342     375.7466
comparison <- merge(stats_1930_1974,
                    stats_1975_2018,
                    by="Portfolio",
                    suffixes=c("_1930_1974",
                               "_1975_2018"))

comparison
##   Portfolio Mean_1930_1974 SD_1930_1974 Mean_1975_2018 SD_1975_2018
## 1        BH       47.59738    125.95082       865.8171    3285.7773
## 2        BL      105.62652    240.77708      1385.0872    4738.8831
## 3        BM       75.32843    154.11720       990.8477    3477.2082
## 4        SH       43.24318    114.59337       180.1386     439.5081
## 5        SL       25.06980     79.93628       169.4753     385.0970
## 6        SM       39.80074    105.52911       171.4342     375.7466
ggplot(comparison,
       aes(x=Portfolio,
           y=Mean_1930_1974)) +
  geom_bar(stat="identity") +
  ggtitle("Average Returns (1930–1974)")

ggplot(comparison,
       aes(x=Portfolio,
           y=Mean_1930_1974)) +
  geom_bar(stat="identity") +
  ggtitle("Average Returns (1930–1974)")

ggplot(comparison,
       aes(x=Portfolio,
           y=Mean_1975_2018)) +
  geom_bar(stat="identity") +
  ggtitle("Average Returns (1975–2018)")

# =======================================================
# 1. Load packages
# =======================================================
library(dplyr)
library(tibble)

# =======================================================
# 2. Columns that must be numeric
# (from your environment: SL, SM, SH, BL, BM, BH)
# =======================================================
cols_to_num <- c("SL", "SM", "SH", "BL", "BM", "BH")

# =======================================================
# 3. Clean and convert numeric columns
# =======================================================
data <- data %>%
  mutate(across(all_of(cols_to_num),
                ~ suppressWarnings(as.numeric(gsub("[^0-9.-]", "", .)))))

# Convert date to integer (if needed)
data$date <- suppressWarnings(as.integer(gsub("\\D", "", data$date)))

# =======================================================
# 4. Split the dataset into two periods
# =======================================================
first_half  <- data %>% filter(date >= 193001 & date <= 197412)
second_half <- data %>% filter(date >= 197501 & date <= 201812)

# =======================================================
# 5. Portfolio statistics function
# =======================================================
portfolio_stats <- function(df) {

  cols_to_num <- c("SL", "SM", "SH", "BL", "BM", "BH")

  # Make sure numeric inside function
  df[cols_to_num] <- lapply(
    df[cols_to_num],
    function(x) suppressWarnings(as.numeric(gsub("[^0-9.-]", "", x)))
  )

  tibble(
    Portfolio = cols_to_num,
    Mean      = sapply(df[cols_to_num], function(x) mean(x, na.rm = TRUE)),
    SD        = sapply(df[cols_to_num], function(x) sd(x,   na.rm = TRUE))
  )
}

# =======================================================
# 6. Compute results for both periods
# =======================================================
stats_1930_1974 <- portfolio_stats(first_half)
stats_1975_2018 <- portfolio_stats(second_half)

# Display results
print(stats_1930_1974)
## # A tibble: 6 × 3
##   Portfolio  Mean    SD
##   <chr>     <dbl> <dbl>
## 1 SL         25.1  79.9
## 2 SM         39.8 106. 
## 3 SH         43.2 115. 
## 4 BL        106.  241. 
## 5 BM         75.3 154. 
## 6 BH         47.6 126.
print(stats_1975_2018)
## # A tibble: 6 × 3
##   Portfolio  Mean    SD
##   <chr>     <dbl> <dbl>
## 1 SL         169.  385.
## 2 SM         171.  376.
## 3 SH         180.  440.
## 4 BL        1385. 4739.
## 5 BM         991. 3477.
## 6 BH         866. 3286.
# =======================================================
# 7. Merge both results into a comparison table
# =======================================================
comparison <- merge(
  stats_1930_1974,
  stats_1975_2018,
  by = "Portfolio",
  suffixes = c("_1930_1974", "_1975_2018")
)

# Show final comparison
comparison
##   Portfolio Mean_1930_1974 SD_1930_1974 Mean_1975_2018 SD_1975_2018
## 1        BH       47.59738    125.95082       865.8171    3285.7773
## 2        BL      105.62652    240.77708      1385.0872    4738.8831
## 3        BM       75.32843    154.11720       990.8477    3477.2082
## 4        SH       43.24318    114.59337       180.1386     439.5081
## 5        SL       25.06980     79.93628       169.4753     385.0970
## 6        SM       39.80074    105.52911       171.4342     375.7466
# Equity outcomes
probabilities <- c(0.6, 0.4)
equity_returns <- c(50000, -30000)

# Risk-free return
rf_return <- 5000
expected_equity <- sum(probabilities * equity_returns)
expected_equity
## [1] 18000
expected_rf <- rf_return
expected_rf
## [1] 5000
risk_premium <- expected_equity - expected_rf
risk_premium
## [1] 13000
results <- data.frame(
  Investment = c("Equities", "Risk-Free T-Bill"),
  Expected_Return = c(expected_equity, expected_rf)
)

results
##         Investment Expected_Return
## 1         Equities           18000
## 2 Risk-Free T-Bill            5000
cat("Expected Equity Return: $", expected_equity, "\n")
## Expected Equity Return: $ 18000
cat("Risk-Free Return: $", expected_rf, "\n")
## Risk-Free Return: $ 5000
cat("Expected Risk Premium: $", risk_premium)
## Expected Risk Premium: $ 13000
barplot(
  c(expected_equity, expected_rf),
  names.arg = c("Equities", "T-Bill"),
  main = "Expected Returns Comparison",
  ylab = "Return ($)",
  col = c("steelblue", "darkgreen")
)

Conclusion

This study analyzed the monthly returns of six portfolios formed on size and book‑to‑market ratios using data from the Kenneth French Data Library. By computing descriptive statistics—including the mean, standard deviation, skewness, and kurtosis—we gained insight into the distributional characteristics of each portfolio. Splitting the sample into two equal time periods allowed us to compare whether return behavior and risk characteristics remained stable over time.

Overall, the results show clear differences in return levels and volatility across the portfolios. Size‑based and value‑based patterns were evident, consistent with well‑documented empirical asset‑pricing relationships. While some statistical properties remained relatively stable across periods, others exhibited variation, suggesting changes in market conditions or factor dynamics. These findings contribute to a deeper understanding of how portfolio characteristics influence performance and how these relationships evolve over time.