library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(moments)

# Load the dataset
#
# The CSV does not start with a header row: its first line is free text
# ("This file was created by ..."), so handing the file straight to read_csv()
# produces a single all-NA column. The returns table is therefore located by
# content: data rows are the lines that begin with a six-digit YYYYMM date
# followed by a comma, and the header is the line directly above the first
# such row.
# NOTE(review): assumes the header sits immediately above the first data row
# and that anything after the first contiguous run of YYYYMM rows (further
# tables, notes) should be ignored -- confirm against the actual file.
csv_path <- "6_Portfolios_2x3.CSV"
raw_lines <- read_lines(csv_path)

is_monthly_row <- grepl("^\\s*[0-9]{6}\\s*,", raw_lines)
if (!any(is_monthly_row)) {
  stop(
    "No rows starting with a YYYYMM date were found in ", csv_path,
    call. = FALSE
  )
}

first_row <- which(is_monthly_row)[1]
header_row <- first_row - 1
if (header_row < 1 || !grepl(",", raw_lines[header_row], fixed = TRUE)) {
  stop(
    "Could not find a header line above the first data row in ", csv_path,
    call. = FALSE
  )
}

# Keep only the first contiguous run of monthly rows.
rows_from_first <- is_monthly_row[first_row:length(raw_lines)]
run_breaks <- which(!rows_from_first)
run_length <- if (length(run_breaks) > 0) {
  run_breaks[1] - 1
} else {
  length(rows_from_first)
}
last_row <- first_row + run_length - 1

# NOTE(review): -99.99 and -999 are treated as missing-value codes, as is
# conventional for this data source -- confirm against the file's own notes.
data <- read_csv(
  I(paste(raw_lines[header_row:last_row], collapse = "\n")),
  col_types = cols(.default = "d"),
  na = c("", "NA", "-99.99", "-999")
)

# Check column names and structure
print(colnames(data))
str(data)

# If Date column is missing, rename the first column as Date
if (!"Date" %in% colnames(data)) {
  print("Date column not found. Please check the dataset structure.")
  colnames(data)[1] <- "Date"
}

# as.Date() cannot parse a year-month without a day (it returns NA), so every
# month is pinned to its first day. sprintf("%.0f") avoids scientific notation
# when turning the numeric YYYYMM into text.
data$Date <- as.Date(sprintf("%.0f01", data$Date), format = "%Y%m%d")

# Check for missing values
print(sum(is.na(data$Date)))

# Split the data into two halves
# seq_len() keeps both halves well-defined for 0- or 1-row data, where
# 1:midpoint and (midpoint + 1):nrow(data) would count backwards.
n_rows <- nrow(data)
midpoint <- floor(n_rows / 2)
data_first_half <- data[seq_len(midpoint), ]
data_second_half <- data[midpoint + seq_len(n_rows - midpoint), ]

# Function to compute summary statistics
#
# Computes the mean, standard deviation, skewness and kurtosis of every
# numeric column of a data frame. Non-numeric columns (for example a Date
# column) are skipped, since these statistics are not meaningful for them.
#
# Args:
#   df:    A data frame or tibble.
#   na.rm: Should missing values be dropped before each statistic is
#          computed? Defaults to TRUE so that a single NA does not turn every
#          statistic of a column into NA.
#
# Returns:
#   A one-row tibble with one column per (input column, statistic) pair,
#   named `<column>_<Statistic>` with Statistic in Mean, SD, Skewness,
#   Kurtosis. Skewness and kurtosis are those defined by the moments package.
compute_stats <- function(df, na.rm = TRUE) {
  df %>%
    summarise(
      across(
        where(is.numeric),
        list(
          Mean = function(x) mean(x, na.rm = na.rm),
          SD = function(x) sd(x, na.rm = na.rm),
          Skewness = function(x) skewness(x, na.rm = na.rm),
          Kurtosis = function(x) kurtosis(x, na.rm = na.rm)
        ),
        .names = "{.col}_{.fn}"
      )
    )
}

# Summarise each half of the sample with compute_stats()
stats_first_half <- data_first_half %>% compute_stats()
stats_second_half <- data_second_half %>% compute_stats()

# Show each result under its title, first half then second half
invisible(Map(
  function(title, stats) {
    print(title)
    print(stats)
  },
  c(
    "Summary Statistics for First Half",
    "Summary Statistics for Second Half"
  ),
  list(stats_first_half, stats_second_half)
))
# Recorded console output:
## [1] "Summary Statistics for First Half"
## # A tibble: 1 × 4
##    Mean    SD Skewness Kurtosis
##   <dbl> <dbl>    <dbl>    <dbl>
## 1    NA    NA       NA       NA
## [1] "Summary Statistics for Second Half"
## # A tibble: 1 × 4
##    Mean    SD Skewness Kurtosis
##   <dbl> <dbl>    <dbl>    <dbl>
## 1    NA    NA       NA       NA