# packages
library(readr)
library(stringr)
library(tidyr) # reshaping data
library(dplyr) # data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) # data visualization

## Warning: package 'ggplot2' was built under R version 4.5.3

library(e1071) # statistics & machine learning

## Warning: package 'e1071' was built under R version 4.5.3

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:ggplot2':
## 
##     element

library(ggrepel) # ggplot2 + better labels

## Warning: package 'ggrepel' was built under R version 4.5.3

library(ggridges)

## Warning: package 'ggridges' was built under R version 4.5.3

Phase 1: Data Preprocessing

1.1 How can raw mutual fund data to be cleaned to ensure consistency in numeric format.

mf_data <- read_csv(
  "D:/semester_projects/R/mutual_funds_data.csv",
  col_names = TRUE
)

## New names:
## Rows: 1407 Columns: 37
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (32): Name, Sub Category, Plan, CAGR 3Y, CAGR 5Y, Absolute Returns - 3M,... dbl
## (2): Category St Dev, Time since inception num (2): AUM, NAV lgl (1): ...37
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...37`

mf_data

## # A tibble: 1,407 × 37
##    Name   `Sub Category` Plan     AUM `CAGR 3Y` `CAGR 5Y` Absolute Returns - 3…¹
##    <chr>  <chr>          <chr>  <dbl> <chr>     <chr>     <chr>                 
##  1 Parag… Flexi Cap Fund Grow… 1.34e5 18.3      16.75     -8.75                 
##  2 Parag… Flexi Cap Fund IDCW  1.34e5 -         -         -8.75                 
##  3 Parag… Flexi Cap Fund IDCW  1.34e5 -         -         -8.75                 
##  4 HDFC … Balanced Adva… IDCW  1.08e5 8.52      7.74      -9.11                 
##  5 HDFC … Balanced Adva… IDCW  1.08e5 8.52      7.74      -9.11                 
##  6 HDFC … Balanced Adva… Grow… 1.08e5 16.36     16.7      -7.59                 
##  7 HDFC … Flexi Cap Fund IDCW  1.00e5 10.83     9.79      -16.26                
##  8 HDFC … Flexi Cap Fund IDCW  1.00e5 10.83     9.79      -16.26                
##  9 HDFC … Flexi Cap Fund Grow… 1.00e5 20.04     19.42     -9.64                 
## 10 HDFC … Mid Cap Fund   IDCW  9.43e4 16.14     13.25     -14.61                
## # ℹ 1,397 more rows
## # ℹ abbreviated name: ¹`Absolute Returns - 3M`
## # ℹ 30 more variables: `Absolute Returns - 1Y` <chr>,
## #   `Absolute Returns - 6M` <chr>, `SEBI Risk Category` <chr>,
## #   Volatility <chr>, `Category St Dev` <dbl>, `% Other Holdings` <chr>,
## #   `% Largecap Holding` <chr>, `% Midcap Holding` <chr>,
## #   `% Equity Holding` <chr>, `% Smallcap Holding` <chr>, …

Cleaning the data as column names contains blank spaces and special characters.

# Column names which starts with %
colnames(mf_data)[grep("%", colnames(mf_data))]

## [1] "% Other Holdings"   "% Largecap Holding" "% Midcap Holding"  
## [4] "% Equity Holding"   "% Smallcap Holding" "% Debt Holding"

# replacing %
clean_mf_data <- mf_data |>
  rename_with(~ str_replace_all(., "%", "")) |>
  rename_with(~ str_replace_all(., " ", "_")) |>
  rename_with(~ str_replace_all(., "-", "")) |>
  # removes more than one underscore
  rename_with(~ str_replace_all(., "__+", "_")) |>
  # removes leading / trailing underscore
  rename_with(~ str_remove(., "^_|_$")) |>
  select(-starts_with("..."))

glimpse(clean_mf_data)

## Rows: 1,407
## Columns: 36
## $ Name                 <chr> "Parag Parikh Flexi Cap Fund", "Parag Parikh Flex…
## $ Sub_Category         <chr> "Flexi Cap Fund", "Flexi Cap Fund", "Flexi Cap Fu…
## $ Plan                 <chr> "Growth", "IDCW", "IDCW", "IDCW", "IDCW", "Growth…
## $ AUM                  <dbl> 134253.17, 134253.17, 134253.17, 107589.67, 10758…
## $ CAGR_3Y              <chr> "18.3", "-", "-", "8.52", "8.52", "16.36", "10.83…
## $ CAGR_5Y              <chr> "16.75", "-", "-", "7.74", "7.74", "16.7", "9.79"…
## $ Absolute_Returns_3M  <chr> "-8.75", "-8.75", "-8.75", "-9.11", "-9.11", "-7.…
## $ Absolute_Returns_1Y  <chr> "0.08", "-", "-", "-5.8", "-5.8", "0.65", "-6.33"…
## $ Absolute_Returns_6M  <chr> "-7.25", "-", "-", "-8.1", "-8.1", "-5.02", "-14.…
## $ SEBI_Risk_Category   <chr> "Very High", "Very High", "Very High", "Very High…
## $ Volatility           <chr> "8.76", "8.17", "8.17", "8.63", "8.63", "8.63", "…
## $ Category_St_Dev      <dbl> 13.52, 13.52, 13.52, 8.25, 8.25, 8.25, 13.52, 13.…
## $ Other_Holdings       <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-",…
## $ Largecap_Holding     <chr> "63.01", "63.01", "63.01", "54.22", "54.22", "54.…
## $ Midcap_Holding       <chr> "2.17", "2.17", "2.17", "7.47", "7.47", "7.47", "…
## $ Equity_Holding       <chr> "78.36", "78.36", "78.36", "69.36", "69.36", "69.…
## $ Smallcap_Holding     <chr> "2.67", "2.67", "2.67", "7.67", "7.67", "7.67", "…
## $ Debt_Holding         <chr> "13.94", "13.94", "13.94", "25.78", "25.78", "25.…
## $ Category_YTM         <chr> "-", "-", "-", "6.56", "6.56", "6.56", "-", "-", …
## $ PE_Ratio             <chr> "17.51", "17.51", "17.51", "19", "19", "19", "24.…
## $ Average_Maturity     <chr> "-", "-", "-", "7.6", "7.6", "7.6", "-", "-", "-"…
## $ Category_PE_Ratio    <chr> "27.28", "27.28", "27.28", "25.11", "25.11", "25.…
## $ Sortino_Ratio        <chr> "-0.02", "-0.25", "-0.25", "-0.01", "-0.01", "-0.…
## $ Sharpe_Ratio         <chr> "-0.17", "-2.82", "-2.82", "-0.13", "-0.13", "-0.…
## $ Average_YTM          <chr> "-", "-", "-", "7.01", "7.01", "7.01", "-", "-", …
## $ Benchmark            <chr> "NIFTY 500 - TRI", "NIFTY 500 - TRI", "NIFTY 500 …
## $ Time_since_inception <dbl> 155, 6, 6, 159, 159, 159, 159, 159, 159, 159, 159…
## $ Expense_Ratio        <chr> "0.63", "0.63", "0.63", "0.77", "0.77", "0.77", "…
## $ NAV                  <dbl> 87.00, 87.00, 87.00, 41.82, 41.82, 535.18, 82.81,…
## $ AMC                  <chr> "PPFAS Asset Management Pvt. Ltd.", "PPFAS Asset …
## $ Minimum_Lumpsum      <chr> "1,000.00", "1,000.00", "1,000.00", "100", "Not a…
## $ Lockin               <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-",…
## $ Fund_Manager         <chr> "Rajeev Thakkar+3 others", "Rajeev Thakkar+3 othe…
## $ Exit_Load            <chr> "2", "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ SIP_Investment       <chr> "Allowed", "Allowed", "Allowed", "Allowed", "Not …
## $ Minimum_SIP          <chr> "3,000.00", "3,000.00", "3,000.00", "100", "100",…

Identify and convert columns to numeric

clean_mf_data <- clean_mf_data |>
  mutate(across(
    .cols =
      contains(c(
        "CAGR", "Returns", "Expense", "Sharpe", "Sortino", "Nav", "AUM",
        "Large", "Mid", "Equity", "Smallcap", "Debt", "Average", "Absolute"
      )),
    ~ suppressWarnings(as.numeric(str_replace_all(., "[%,]", "")))
  ))

clean_mf_data

## # A tibble: 1,407 × 36
##    Name            Sub_Category Plan     AUM CAGR_3Y CAGR_5Y Absolute_Returns_3M
##    <chr>           <chr>        <chr>  <dbl>   <dbl>   <dbl>               <dbl>
##  1 Parag Parikh F… Flexi Cap F… Grow… 1.34e5   18.3    16.8                -8.75
##  2 Parag Parikh F… Flexi Cap F… IDCW  1.34e5   NA      NA                  -8.75
##  3 Parag Parikh F… Flexi Cap F… IDCW  1.34e5   NA      NA                  -8.75
##  4 HDFC Balanced … Balanced Ad… IDCW  1.08e5    8.52    7.74               -9.11
##  5 HDFC Balanced … Balanced Ad… IDCW  1.08e5    8.52    7.74               -9.11
##  6 HDFC Balanced … Balanced Ad… Grow… 1.08e5   16.4    16.7                -7.59
##  7 HDFC Flexi Cap… Flexi Cap F… IDCW  1.00e5   10.8     9.79              -16.3 
##  8 HDFC Flexi Cap… Flexi Cap F… IDCW  1.00e5   10.8     9.79              -16.3 
##  9 HDFC Flexi Cap… Flexi Cap F… Grow… 1.00e5   20.0    19.4                -9.64
## 10 HDFC Mid Cap F… Mid Cap Fund IDCW  9.43e4   16.1    13.2               -14.6 
## # ℹ 1,397 more rows
## # ℹ 29 more variables: Absolute_Returns_1Y <dbl>, Absolute_Returns_6M <dbl>,
## #   SEBI_Risk_Category <chr>, Volatility <chr>, Category_St_Dev <dbl>,
## #   Other_Holdings <chr>, Largecap_Holding <dbl>, Midcap_Holding <dbl>,
## #   Equity_Holding <dbl>, Smallcap_Holding <dbl>, Debt_Holding <dbl>,
## #   Category_YTM <chr>, PE_Ratio <chr>, Average_Maturity <dbl>,
## #   Category_PE_Ratio <chr>, Sortino_Ratio <dbl>, Sharpe_Ratio <dbl>, …

# saving clean data into a file
write.csv(clean_mf_data,
  "D:/semester_projects/R/data_preprocessing_phase/clean_mf_data.csv",
  row.names = FALSE
)

Interpretation: The dataset was cleaned by removing special characters, standardizing column names, and converting percentage and string values into numeric format. This ensures consistency across variables and prevents errors during analysis, making the dataset structured, readable, and suitable for statistical and machine learning operations.

1.2 What are the pattern and proportion of missing values or NA values across features and treatment.

mf_data <- read_csv(
  "D:/semester_projects/R/data_preprocessing_phase/clean_mf_data.csv",
  col_names = TRUE
)

## Rows: 1407 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): Name, Sub_Category, Plan, SEBI_Risk_Category, Volatility, Other_Ho...
## dbl (19): AUM, CAGR_3Y, CAGR_5Y, Absolute_Returns_3M, Absolute_Returns_1Y, A...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

mf_data

## # A tibble: 1,407 × 36
##    Name            Sub_Category Plan     AUM CAGR_3Y CAGR_5Y Absolute_Returns_3M
##    <chr>           <chr>        <chr>  <dbl>   <dbl>   <dbl>               <dbl>
##  1 Parag Parikh F… Flexi Cap F… Grow… 1.34e5   18.3    16.8                -8.75
##  2 Parag Parikh F… Flexi Cap F… IDCW  1.34e5   NA      NA                  -8.75
##  3 Parag Parikh F… Flexi Cap F… IDCW  1.34e5   NA      NA                  -8.75
##  4 HDFC Balanced … Balanced Ad… IDCW  1.08e5    8.52    7.74               -9.11
##  5 HDFC Balanced … Balanced Ad… IDCW  1.08e5    8.52    7.74               -9.11
##  6 HDFC Balanced … Balanced Ad… Grow… 1.08e5   16.4    16.7                -7.59
##  7 HDFC Flexi Cap… Flexi Cap F… IDCW  1.00e5   10.8     9.79              -16.3 
##  8 HDFC Flexi Cap… Flexi Cap F… IDCW  1.00e5   10.8     9.79              -16.3 
##  9 HDFC Flexi Cap… Flexi Cap F… Grow… 1.00e5   20.0    19.4                -9.64
## 10 HDFC Mid Cap F… Mid Cap Fund IDCW  9.43e4   16.1    13.2               -14.6 
## # ℹ 1,397 more rows
## # ℹ 29 more variables: Absolute_Returns_1Y <dbl>, Absolute_Returns_6M <dbl>,
## #   SEBI_Risk_Category <chr>, Volatility <chr>, Category_St_Dev <dbl>,
## #   Other_Holdings <chr>, Largecap_Holding <dbl>, Midcap_Holding <dbl>,
## #   Equity_Holding <dbl>, Smallcap_Holding <dbl>, Debt_Holding <dbl>,
## #   Category_YTM <chr>, PE_Ratio <chr>, Average_Maturity <dbl>,
## #   Category_PE_Ratio <chr>, Sortino_Ratio <dbl>, Sharpe_Ratio <dbl>, …

# lets identify fields that contain missing values or " - " symbol
cols_to_be_treated <- colnames(mf_data)[
  sapply(mf_data, function(col) {
    is.character(col) &&
      any(trimws(col) == "-", na.rm = TRUE)
  })
]

# Replacing these missing records with some appropriate value
mf_data <- mf_data |> mutate(
  across(
    all_of(cols_to_be_treated),
    ~ suppressWarnings(as.numeric(ifelse(trimws(.) == "-", "0", .)))
  )
)

# lets identify NA values
cols_contains_na <- colnames(mf_data)[colSums(is.na(mf_data)) > 0]

# Replacing these NA values with median
mf_data <- mf_data |> mutate(
  across(
    all_of(cols_contains_na),
    ~ ifelse(is.na(.), median(., na.rm = TRUE), .)
  )
)

# Saving the file as csv
write.csv(mf_data,
  "D:/semester_projects/R/data_preprocessing_phase/mf_data.csv",
  row.names = FALSE
)

Interpretation: Missing values represented as “-” or NA were handled using replacement and median imputation. Median was preferred due to skewed financial data. This approach preserves distribution while minimizing bias, ensuring that missing data does not distort analysis or affect model performance.

1.3 How can duplicate fund entries should be detected and removed efficiently.

# identifying the duplicate records
dup_funds <- mf_data |> filter(
  duplicated(mf_data[, c("Name", "Plan", "Sub_Category")]) |
    duplicated(mf_data[, c("Name", "Plan", "Sub_Category")], fromLast = TRUE)
)

# removing the duplicate records
mf_data <- mf_data[
  !duplicated(mf_data[, c("Name", "Plan", "Sub_Category")]),
]

Interpretation: Duplicate entries based on fund name, plan, and category were identified and removed. This prevents redundancy and avoids over-representation of certain funds, ensuring that each observation contributes uniquely to the analysis and improves the reliability of statistical results.

1.4 Are data types correctly assigned and how check incorrect types that can affect analysis.

# here we will check the invalid datatypes
# although we have converted the datatype before.
# first we will assign levels in the dataset for easier data exploration.

mf_data <- read.csv(
  "D:/semester_projects/R/data_preprocessing_phase/mf_data.csv",
  header = TRUE
)
mf_data$Plan <- as.factor(mf_data$Plan)
mf_data$SEBI_Risk_Category <- as.factor(mf_data$SEBI_Risk_Category)

# check levels
levels(mf_data$Plan)

## [1] "Bonus"  "Growth" "IDCW"

levels(mf_data$SEBI_Risk_Category)

## [1] "High"            "Low"             "Moderate"        "Moderately High"
## [5] "Moderately Low"  "Very High"

# lets detect columns with mixed type of values
str(mf_data)

## 'data.frame':    1407 obs. of  36 variables:
##  $ Name                : chr  "Parag Parikh Flexi Cap Fund" "Parag Parikh Flexi Cap Fund(IDCW)" "Parag Parikh Flexi Cap Fund(IDCW Payout)" "HDFC Balanced Advantage Fund(IDCW)" ...
##  $ Sub_Category        : chr  "Flexi Cap Fund" "Flexi Cap Fund" "Flexi Cap Fund" "Balanced Advantage Fund" ...
##  $ Plan                : Factor w/ 3 levels "Bonus","Growth",..: 2 3 3 3 3 2 3 3 2 3 ...
##  $ AUM                 : num  134253 134253 134253 107590 107590 ...
##  $ CAGR_3Y             : num  18.3 7.58 7.58 8.52 8.52 ...
##  $ CAGR_5Y             : num  16.75 6.32 6.32 7.74 7.74 ...
##  $ Absolute_Returns_3M : num  -8.75 -8.75 -8.75 -9.11 -9.11 ...
##  $ Absolute_Returns_1Y : num  0.08 0 0 -5.8 -5.8 0.65 -6.33 -6.33 1.07 -1 ...
##  $ Absolute_Returns_6M : num  -7.25 -0.9 -0.9 -8.1 -8.1 ...
##  $ SEBI_Risk_Category  : Factor w/ 6 levels "High","Low","Moderate",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ Volatility          : num  8.76 8.17 8.17 8.63 8.63 ...
##  $ Category_St_Dev     : num  13.52 13.52 13.52 8.25 8.25 ...
##  $ Other_Holdings      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Largecap_Holding    : num  63 63 63 54.2 54.2 ...
##  $ Midcap_Holding      : num  2.17 2.17 2.17 7.47 7.47 ...
##  $ Equity_Holding      : num  78.4 78.4 78.4 69.4 69.4 ...
##  $ Smallcap_Holding    : num  2.67 2.67 2.67 7.67 7.67 ...
##  $ Debt_Holding        : num  13.9 13.9 13.9 25.8 25.8 ...
##  $ Category_YTM        : num  0 0 0 6.56 6.56 6.56 0 0 0 0 ...
##  $ PE_Ratio            : num  17.5 17.5 17.5 19 19 ...
##  $ Average_Maturity    : num  1 1 1 7.6 7.6 7.6 1 1 1 1 ...
##  $ Category_PE_Ratio   : num  27.3 27.3 27.3 25.1 25.1 ...
##  $ Sortino_Ratio       : num  -0.02 -0.25 -0.25 -0.01 -0.01 -0.01 0 0 0 0.03 ...
##  $ Sharpe_Ratio        : num  -0.17 -2.82 -2.82 -0.13 -0.13 -0.13 -0.02 -0.02 -0.03 0.31 ...
##  $ Average_YTM         : num  6.77 6.77 6.77 7.01 7.01 7.01 6.77 6.77 6.77 6.77 ...
##  $ Benchmark           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Time_since_inception: int  155 6 6 159 159 159 159 159 159 159 ...
##  $ Expense_Ratio       : num  0.63 0.63 0.63 0.77 0.77 0.77 0.7 0.7 0.7 0.76 ...
##  $ NAV                 : num  87 87 87 41.8 41.8 ...
##  $ AMC                 : chr  "PPFAS Asset Management Pvt. Ltd." "PPFAS Asset Management Pvt. Ltd." "PPFAS Asset Management Pvt. Ltd." "HDFC Asset Management Company Limited" ...
##  $ Minimum_Lumpsum     : chr  "1,000.00" "1,000.00" "1,000.00" "100" ...
##  $ Lockin              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fund_Manager        : chr  "Rajeev Thakkar+3 others" "Rajeev Thakkar+3 others" "Rajeev Thakkar+3 others" "Gopal Agrawal+3 others" ...
##  $ Exit_Load           : num  2 1 1 1 1 1 1 1 1 1 ...
##  $ SIP_Investment      : chr  "Allowed" "Allowed" "Allowed" "Allowed" ...
##  $ Minimum_SIP         : int  100 100 100 100 100 100 100 100 100 100 ...

char_cols <- sapply(mf_data, is.character)
names(mf_data[char_cols])

## [1] "Name"            "Sub_Category"    "AMC"             "Minimum_Lumpsum"
## [5] "Fund_Manager"    "SIP_Investment"

# here minimum_lumpsum contains mixed type of data
unique(mf_data$Minimum_Lumpsum)

## [1] "1,000.00"    "100"         "Not allowed" "5,000.00"    "500"        
## [6] "10,000.00"   "99"          "20,000.00"   "1,00,000.00"

# converting the values into similar types of data
mf_data <- mf_data |>
  mutate(
    Minimum_Lumpsum = suppressWarnings(
      as.numeric(gsub(",", "", Minimum_Lumpsum))
    )
  )

Interpretation: Columns with incorrect data types were identified and converted appropriately, especially numeric fields stored as text. Ensuring correct data types is essential for accurate computation, enabling proper use of statistical functions and avoiding unexpected errors during modeling.

1.5 Detect Outliers using IQR method.

# these are important columns and data is
# naturally skewed it is important to ignore them.
exclude_cols <- c("AUM", "NAV", "Minimum_Lumpsum")

# identifying the numeric columns
numeric_cols <- names(mf_data)[
  sapply(mf_data, is.numeric) &
    !(names(mf_data) %in% exclude_cols)
]

# Detecting outliers
for (col in numeric_cols) {
  q1 <- quantile(mf_data[[col]], 0.25, na.rm = TRUE)
  q3 <- quantile(mf_data[[col]], 0.75, na.rm = TRUE)
  iqr_val <- IQR(mf_data[[col]], na.rm = TRUE)

  lower <- q1 - 1.5 * iqr_val
  upper <- q3 + 1.5 * iqr_val

  outliers <- mf_data[[col]][
    mf_data[[col]] < lower | mf_data[[col]] > upper
  ]
  if (length(outliers) > 0) {
    cat("column", col, "\n")
    print(outliers)
    cat("\n-----------\n")
  }
}

## column CAGR_3Y 
##  [1] 31.65 31.65 31.65 31.38 31.21 31.21 31.21 44.84 44.84 44.84 31.18 31.18
## [13] 31.18 32.82 32.82 32.83 31.48 31.48 31.48 44.99 44.69 44.69 44.69
## 
## -----------
## column CAGR_5Y 
##  [1] 24.42 24.42 24.42 25.20 27.36 27.36 27.36 26.95 24.87 24.87 24.87
## 
## -----------
## column Absolute_Returns_1Y 
##  [1] -21.68 -15.40 -15.40 -13.91  57.86  57.86  57.86 -13.51 -13.51 -16.44
## [11] -16.44 -13.40 -13.40 -13.22 -13.22  21.34  57.08 -13.28 -13.28 -15.76
## [21] -15.76 -15.76 -14.73 -14.73 -14.73  19.58  19.58  19.58  17.21  17.21
## [31]  17.21  57.92  57.92  57.92 122.54 122.54 122.54  57.19  57.19  57.19
## [41]  58.26  58.26  58.26 -14.15 -14.15  18.44  18.44  18.44  19.13  19.13
## [51]  19.13 123.57 123.02 123.02 123.02 126.78 126.78 126.78
## 
## -----------
## column Absolute_Returns_6M 
##  [1] 22.22 22.22 22.22 21.75 21.90 21.90 21.90 62.20 62.20 62.20 21.81 21.81
## [13] 21.81 22.18 22.18 22.18 62.29 61.90 61.90 61.90 64.46 64.46 64.46
## 
## -----------
## column Volatility 
##  [1] 54.82 54.82 54.82 58.91 51.71 51.71 51.71 56.72 56.72 56.72
## 
## -----------
## column Category_St_Dev 
##  [1] 34.60 34.60 34.60 34.60 34.60 34.60 34.60 60.61 60.61 60.61 34.60 34.60
## [13] 34.60 34.60 34.60 34.60 60.61 60.61 60.61 60.61 60.61 60.61 60.61
## 
## -----------
## column Largecap_Holding 
##   [1]  63.01  63.01  63.01  54.22  54.22  54.22  75.25  75.25  75.25  12.47
##  [11]  12.47  12.47  56.10  56.10  56.10  84.95  84.95  84.95  57.77  57.77
##  [21]  57.77  57.77  57.77  14.89  14.89  14.89  14.89  16.77  16.77  16.77
##  [31]  75.61  75.61  75.61  76.13  76.13  76.13  80.80  80.80  80.80  83.46
##  [41]  83.46  83.46  83.46  45.14  45.14  45.14  45.14  58.08  58.08  58.08
##  [51]  58.08  58.08  58.08  58.08  25.09  25.09  25.09  25.09  84.94  84.94
##  [61]  84.94  87.04  87.04  87.04  41.57  41.57  41.57   9.16   9.16   9.16
##  [71]  66.27  66.27  66.27  25.88  25.88  25.88  74.05  74.05  39.84  39.84
##  [81]  39.84  89.12  89.12  89.12  63.97  63.97  63.97  19.20  19.20  19.20
##  [91]  57.55  57.55  57.55  82.87  82.87  82.87  -0.38  -0.38  -0.38  -0.38
## [101]  47.14  47.14  47.14  23.52  23.52  23.52  38.26  38.26  38.26  71.61
## [111]  71.61  71.61  46.77  46.77  46.77  99.24  99.24  99.24  58.06  58.06
## [121]   5.64   5.64   5.64  59.52  59.52  59.52  41.31  41.31  41.31  54.29
## [131]  54.29  54.29  54.29  54.29  54.29  54.29  54.29  46.41  46.41  46.41
## [141]   8.04   8.04   8.04  67.42  67.42  67.42  38.47  38.47  38.47  58.67
## [151]  58.67  58.67  99.26  46.67  46.67  46.67   4.81   4.81  63.82  63.82
## [161]  63.82  58.52  58.52  58.52  18.57  18.57  18.57  76.49  76.49  76.49
## [171]  46.01  46.01  46.01  14.17  14.17  14.17  65.31  65.31  65.31  54.67
## [181]  54.67  54.67  68.75  68.75  68.75  87.29  87.29  87.29  81.64  81.64
## [191]  81.64  47.30  47.30  47.30  47.30  47.30   6.35   6.35   6.35  15.80
## [201]  15.80  15.80  15.80  15.80  15.80  15.80  73.48  73.48  73.48  40.44
## [211]  40.44  40.44  41.31  41.31  41.31  41.31  41.31  41.31  41.31  99.06
## [221]  99.06  99.06   2.62   2.62   2.62  39.28  39.28  39.28  61.57  61.57
## [231]  61.57  73.40  73.40  73.40  69.18  69.18  69.18  42.25  42.25  42.25
## [241]  20.41  20.41  20.41  75.50  75.50  44.39  44.39  44.39  43.56  43.56
## [251]  43.56  71.77  71.77  71.77  18.71  18.71  18.71  64.31  64.31  64.31
## [261]  64.31  64.31  64.31  64.31   6.58   6.58   6.58  46.59  46.59  46.59
## [271]  56.86  56.86  56.86  88.54  88.54  88.54  45.32  45.32  45.32   8.86
## [281]   8.86   8.86  27.25  27.25  27.25  15.17  15.17  15.17  60.73  60.73
## [291]  60.73  77.11  77.11  77.11  81.55  81.55  99.24  99.24  99.24  13.01
## [301]  13.01  13.01  65.93  65.93  65.93  54.03  54.03  54.03  42.99  42.99
## [311]  42.99  62.57  62.57  62.57  85.66  85.66  85.66  20.82  20.82  20.82
## [321]  20.82  15.84  15.84  15.84  76.00  76.00  76.00  56.83  56.83  56.83
## [331]  59.34  59.34  59.34  63.36  63.36  63.36   7.51   7.51   7.51   7.51
## [341]   7.51   7.51   7.51  45.96  45.96  45.96  63.20  63.20  63.20  10.28
## [351]  10.28  10.28  55.92  55.92  55.92  45.35  45.35  45.35  47.14  47.14
## [361]  47.14  57.30  57.30  57.30  35.78  35.78  35.78  46.07  46.07  46.07
## [371]  65.65  65.65  65.65  -0.48  -0.48  -0.48  40.08  40.08  40.08  61.66
## [381]  61.66  61.66  71.57  71.57  71.57  35.42  35.42  35.42 100.04  72.01
## [391]  72.01  69.96  69.96  69.96  69.96  69.96  73.39  41.65  41.65  41.65
## [401]  41.93  41.93  41.93  41.93  41.93  41.93  20.97  20.97  20.97  64.91
## [411]  64.91  64.91  78.21  78.21  78.21  11.54  11.54  11.54  44.29  44.29
## [421]  44.29  44.29  43.05  43.05  43.05  53.99  53.99  53.99  35.83  35.83
## [431]  35.83  45.38  45.38  45.38  77.90  77.90  77.90  63.03  63.03  63.03
## [441]  63.03  81.73  81.73  81.73  43.87  43.87  43.87  68.18  68.18  68.18
## [451]  65.51  65.51  65.51  11.26  11.26  11.26  57.39  57.39  57.39  23.17
## [461]  23.17  23.17  30.17  30.17  30.17  78.53  78.53  78.53  53.09  53.09
## [471]  53.09  67.96  67.96  62.14  62.14  62.14  62.14  44.89  44.89  44.89
## [481]  44.89  44.89  74.43  74.43  74.43  37.07  37.07  37.07  43.83  43.83
## [491]  43.83  46.03  46.03  20.05  65.93  65.93  65.93  73.45  73.45  73.45
## [501]  56.84  56.84  56.84  75.83  75.83  72.07  72.07  61.86  61.86  61.86
## [511]  44.38  44.38  44.38  78.18  63.75  63.75  63.75  64.01  64.01  64.01
## [521]  43.23  43.23  43.23  10.28  10.28  10.28  64.81  64.81  62.05  62.05
## [531]  62.05  62.05  60.86  60.86  60.86  54.05  54.05  54.05  65.69  65.69
## [541]  65.69  38.08  38.08  38.08  45.46  45.46  45.46  57.28  57.28  57.28
## [551]  57.28  57.28  46.59  46.59  46.59  70.50   1.29   1.29   1.29  76.90
## [561]  76.90  76.90  16.21  16.21  16.21   2.63   2.63   2.63   2.63  43.82
## [571]  43.82  43.82  39.20  39.20  39.20  39.20  39.20  47.14  47.14  47.14
## [581]  59.15  59.15  59.15  61.73  61.73  61.73  59.55  59.55  59.55  64.26
## [591]  64.26  64.26  43.26  43.26  43.26  23.48  23.48  42.42  42.42  42.42
## [601]  44.93  44.93  44.93  24.43  24.43  24.43  38.90  38.90  38.90   4.41
## [611]   4.41   4.41  42.52  42.52  42.52  45.77  45.77  45.77  39.55  39.55
## [621]  39.55  10.70  10.70  10.70  57.36  57.36  56.39  56.39  56.39  72.20
## [631]  72.20  72.20  68.22  68.22  68.22  14.23  14.23  14.23  44.62  44.62
## [641]  44.62  20.97  20.97  20.97  20.45  20.45
## 
## -----------
## column Midcap_Holding 
##   [1]  2.17  2.17  2.17  7.47  7.47  7.47  7.89  7.89  7.89 63.77 63.77 63.77
##  [13] 14.01 14.01 14.01  7.01  7.01  7.01 14.89 14.89 14.89  5.92  5.92  5.92
##  [25]  5.92  5.92 68.90 68.90 68.90  4.56  4.56  4.56 12.54 12.54 12.54 13.05
##  [37] 13.05 13.05 13.05 27.31 27.31 27.31 27.31  6.28  6.28  6.28  6.28  6.28
##  [49]  6.28  6.28 20.83 20.83 20.83 61.39 61.39 61.39 61.39 14.60 14.60 14.60
##  [61] 40.60 40.60 40.60  9.66  9.66  9.66 12.87 12.87 12.87 11.49 11.49 11.49
##  [73] 35.57 35.57 35.57 12.06 12.06 12.06  2.82  2.82  2.82 66.88 66.88 66.88
##  [85] 20.81 20.81 19.45 19.45 19.45  1.22  1.22  1.22 20.38 20.38 20.38 64.77
##  [97] 64.77 64.77 33.02 33.02 33.02  6.94  6.94  6.94 -0.06 -0.06 -0.06 -0.06
## [109] 37.29 37.29 37.29  4.72  4.72  4.72 41.59 41.59 41.59  7.66  7.66  7.66
## [121]  0.71  0.71  0.71 25.24 25.24 22.69 22.69 22.69 42.60 42.60 42.60  9.55
## [133]  9.55  9.55  9.55  9.55  9.55  9.55  9.55 21.90 21.90 21.90  5.58  5.58
## [145]  5.58 67.60 67.60 67.60 12.57 12.57 12.57 22.00 22.00 22.00 25.19 25.19
## [157] 25.19  0.71 19.35 19.35 19.35 13.71 13.71  8.43  8.43  8.43 23.05 23.05
## [169] 23.05 63.67 63.67 63.67 11.16 11.16 11.16 27.24 27.24 27.24 69.30 69.30
## [181] 69.30  1.59  1.59  1.59 32.05 32.05 32.05  9.91  9.91  9.91  9.87  9.87
## [193]  9.87  6.56  6.56  6.56  7.02  7.02  7.02 19.80 19.80 19.80 12.98 12.98
## [205] 12.98 12.98 12.98 12.98 12.98 10.36 10.36 10.36 30.44 30.44 30.44 32.83
## [217] 32.83 32.83 22.23 22.23 22.23 22.23 22.23 22.23 22.23  0.71  0.71  0.71
## [229] 23.79 23.79 23.79 35.20 35.20 35.20 19.88 19.88 19.88 24.15 24.15 24.15
## [241] 12.51 12.51 12.51 31.46 31.46 31.46 65.33 65.33 65.33  7.07  7.07 10.16
## [253] 10.16 10.16 21.46 21.46 21.46 61.60 61.60 61.60 12.25 12.25 12.25 12.25
## [265] 12.25 12.25 12.25  8.37  8.37  8.37 28.41 28.41 28.41 20.67 20.67 20.67
## [277]  7.17  7.17  7.17 12.16 12.16 12.16 22.51 22.51 22.51 49.58 49.58 49.58
## [289] 64.78 64.78 64.78 23.01 23.01 23.01 12.06 12.06 12.06  7.21  7.21  9.57
## [301]  9.57  9.57  0.71  0.71  0.71 64.71 64.71 64.71 12.62 12.62 12.62 13.75
## [313] 13.75 13.75 12.51 12.51 12.51  8.82  8.82  8.82 64.26 64.26 64.26 64.26
## [325] 57.57 57.57 57.57 13.06 13.06 13.06  4.10  4.10  4.10 13.34 13.34 13.34
## [337]  3.63  3.63  3.63  3.63  3.63  3.63  3.63 15.36 15.36 15.36 20.19 20.19
## [349] 20.19 19.89 19.89 19.89 11.51 11.51 11.51 19.37 19.37 19.37 13.31 13.31
## [361] 13.31 11.72 11.72 11.72 37.13 37.13 37.13 24.78 24.78 24.78 20.41 20.41
## [373] 20.41 -0.06 -0.06 -0.06 19.17 19.17 19.17 14.72 14.72 14.72 27.28 27.28
## [385] 27.28 10.53 10.53 10.53 10.53 10.53 26.55 27.53 27.53 27.53 19.99 19.99
## [397] 19.99 19.99 19.99 19.99  5.07  5.07  5.07 19.31 19.31 19.31 21.69 21.69
## [409] 21.69  3.21  3.21  3.21 29.35 29.35 29.35 29.35 33.61 33.61 33.61  7.54
## [421]  7.54  7.54 19.51 19.51 19.51 24.78 24.78 24.78 20.94 20.94 20.94 20.94
## [433] 12.59 12.59 12.59 24.95 24.95 24.95 11.36 11.36 11.36 19.29 19.29 19.29
## [445] 69.80 69.80 69.80 11.24 11.24 11.24 64.50 64.50 64.50 20.58 20.58 20.58
## [457]  6.18  6.18  6.18  9.29  9.29  9.29 23.13 23.13 23.13 23.13 15.18 15.18
## [469] 33.72 33.72 33.72 33.72 13.45 13.45 13.45 34.11 34.11 34.11 10.84 10.84
## [481] 10.84 27.11 27.11 27.11 13.15 13.15 13.15 20.10 20.10  2.35 24.17 24.17
## [493] 24.17 10.23 10.23 10.23 14.36 14.36 14.36 12.15 12.15 14.85 14.85 11.37
## [505] 11.37 11.37 15.45 15.45 15.45 25.13 25.13 25.13 14.00 14.00 14.00 21.69
## [517] 20.39 20.39 20.39 22.73 22.73 22.73 22.22 22.22 22.22 64.74 64.74 64.74
## [529] 26.34 26.34 20.87 20.87 20.87 20.87  2.23  2.23  2.23  3.41  3.41  3.41
## [541] 11.95 11.95 11.95 23.60 23.60 23.60 45.62 45.62 45.62  7.26  7.26  7.26
## [553]  7.26  7.26 33.88 33.88 33.88  2.26  9.25  9.25  9.25 62.91 62.91 62.91
## [565] 33.35 33.35 33.35 33.35 13.09 13.09 13.09 20.31 20.31 20.31 20.31 20.31
## [577] 22.82 22.82 22.82 24.79 24.79 24.79 21.51 21.51 21.51 11.87 11.87 11.87
## [589] 27.76 27.76 27.76 10.35 10.35  8.23  8.23  8.23 25.90 25.90 25.90  3.30
## [601]  3.30  3.30 28.86 28.86 28.86 21.53 21.53 21.53 32.00 32.00 32.00 61.02
## [613] 61.02 61.02 26.09 26.09 26.09 14.02 14.02 29.56 29.56 29.56 19.44 19.44
## [625]  9.90  9.90  9.90 35.19 35.19 35.19  8.09  8.09  8.09 69.12 69.12 69.12
## [637] 20.32 20.32 20.32 65.94 65.94 65.94 34.62 34.62
## 
## -----------
## column Equity_Holding 
##   [1]  78.36  78.36  78.36  69.36  69.36  69.36  90.40  90.40  90.40  69.27
##  [11]  69.27  69.27  74.02  74.02  74.02  92.16  92.16  92.16  67.14  67.14
##  [21]  67.14  66.40  66.40  66.40  66.40  66.40  95.91  95.91  95.91  95.91
##  [31]  99.07  99.07  99.07  85.68  85.68  85.68  98.21  98.21  98.21  99.31
##  [41]  99.31  99.31  99.31  99.40  99.40  99.40  99.40  70.56  70.56  70.56
##  [51]  70.56  70.56  70.56  70.56  83.37  83.37  83.37  98.68  98.68  98.68
##  [61]  98.68  69.91  69.91  69.91  83.18  83.18  83.18  98.90  98.90  98.90
##  [71]  99.20  99.20  99.20  65.94  65.94  65.94  98.53  98.53  98.53  93.15
##  [81]  93.15  93.15  91.93  91.93  91.93  92.48  92.48  92.48  86.01  86.01
##  [91]  86.01  92.76  92.76  92.76  96.82  96.82  61.92  61.92  61.92  90.34
## [101]  90.34  90.34  97.79  97.79  97.79  88.43  88.43  88.43  97.68  97.68
## [111]  97.68  96.85  96.85  96.85  -0.27  -0.27  -0.27  -0.27  98.67  98.67
## [121]  98.67  89.84  89.84  89.84  66.56  66.56  66.56  99.95  99.95  99.95
## [131]  99.18  99.18  89.05  89.05  89.05  97.69  97.69  97.69  97.73  97.73
## [141]  97.73  65.13  65.13  65.13  65.13  65.13  65.13  65.13  65.13  97.09
## [151]  97.09  97.09  70.56  70.56  70.56  93.80  93.80  93.80  93.71  93.71
## [161]  93.71  92.19  92.19  92.19  96.88  96.88  96.88  99.97  69.67  69.67
## [171]  69.67  89.74  89.74  97.34  97.34  97.34  96.75  96.75  96.75  98.31
## [181]  98.31  98.31  98.06  98.06  98.06  67.42  67.42  67.42  96.93  96.93
## [191]  96.93  67.33  67.33  67.33  97.48  97.48  97.48  97.17  97.17  97.17
## [201]  97.92  97.92  97.92  66.93  66.93  66.93  66.93  66.93  98.31  98.31
## [211]  98.31  40.85  40.85  40.85  40.85  40.85  40.85  40.85  91.27  91.27
## [221]  91.27  97.84  97.84  97.84  93.66  93.66  93.66  67.30  67.30  67.30
## [231]  67.30  67.30  67.30  67.30  99.77  99.77  99.77  96.54  96.54  96.54
## [241]  98.15  98.15  98.15  97.03  97.03  97.03  97.55  97.55  97.55  99.29
## [251]  99.29  99.29  93.45  93.45  93.45  97.89  97.89  89.98  89.98  89.98
## [261]  58.21  58.21  58.21  96.39  96.39  96.39  96.71  96.71  96.71  81.77
## [271]  81.77  81.77  81.77  81.77  81.77  81.77  93.35  93.35  93.35  78.84
## [281]  78.84  78.84  88.78  88.78  88.78  97.99  97.99  97.99  68.00  68.00
## [291]  68.00  97.27  97.27  97.27  97.94  97.94  97.94  95.83  95.83  95.83
## [301]  96.67  96.67  96.67  93.40  93.40  66.67  66.67  66.67  99.94  99.94
## [311]  99.94  98.33  98.33  98.33  78.55  78.55  78.55  74.25  74.25  74.25
## [321]  65.25  65.25  65.25  97.80  97.80  97.80  97.22  97.22  97.22  97.22
## [331]  99.35  99.35  99.35  97.72  97.72  97.72  92.48  92.48  92.48  99.01
## [341]  99.01  99.01  91.57  91.57  91.57  93.37  93.37  93.37  22.69  22.69
## [351]  22.69  22.69  22.69  22.69  22.69  66.18  66.18  66.18  99.02  99.02
## [361]  99.02  98.91  98.91  98.91  70.10  70.10  70.10  65.61  65.61  65.61
## [371]  76.04  76.04  76.04  96.83  96.83  96.83  98.92  98.92  98.92  -0.55
## [381]  -0.55  -0.55  66.41  66.41  66.41  97.22  97.22  97.22  92.25  92.25
## [391]  92.25 100.04  97.40  97.40  90.53  90.53  90.53  90.53  90.53  99.94
## [401]  75.88  75.88  75.88  68.78  68.78  68.78  68.78  68.78  68.78  38.15
## [411]  38.15  38.15  96.94  96.94  96.94  97.96  97.96  97.96  99.90  99.90
## [421]  99.90  92.20  92.20  92.20  99.70  99.70  99.70  99.70  96.86  96.86
## [431]  96.86  68.77  68.77  68.77  91.69  91.69  91.69  96.29  96.29  96.29
## [441]  96.74  96.74  96.74  97.80  97.80  97.80  97.80  97.67  97.67  97.67
## [451]  96.01  96.01  96.01  96.17  96.17  96.17  99.60  99.60  99.60  89.16
## [461]  89.16  89.16  75.92  75.92  75.92  98.31  98.31  98.31  82.71  82.71
## [471]  82.71  97.70  97.70  97.70  89.41  89.41  89.41  86.10  86.10  86.10
## [481]  99.22  99.22  99.22  99.22  97.71  97.71  97.71  97.71  65.07  65.07
## [491]  65.07  67.96  67.96  67.96  67.96  67.96  97.15  97.15  97.15  97.98
## [501]  97.98  97.98  67.72  67.72  67.72  97.21  97.21  26.22  95.84  95.84
## [511]  95.84  91.04  91.04  91.04  97.28  97.28  97.28  99.60  99.60  81.29
## [521]  81.29  81.29  64.85  64.85  64.85  95.99  95.99  95.99  99.87  98.24
## [531]  98.24  98.24  99.23  99.23  99.23  98.20  98.20  98.20  97.77  97.77
## [541]  96.31  96.31  96.31  96.31  66.63  66.63  66.63  63.48  63.48  63.48
## [551]  98.84  98.84  98.84  98.17  98.17  98.17  65.71  65.71  65.71  65.71
## [561]  65.71  87.87  97.06  97.06  97.06  98.82  98.82  98.82  93.47  93.47
## [571]  93.47  97.88  97.88  97.88  97.88  73.30  73.30  73.30  73.30  73.30
## [581]  97.84  97.84  97.84  98.01  98.01  98.01  98.35  98.35  98.35  96.45
## [591]  96.45  96.45  97.39  97.39  97.39  97.58  97.58  97.58  37.56  37.56
## [601]  97.92  97.92  97.92  64.88  64.88  64.88  93.58  93.58  93.58  45.35
## [611]  45.35  45.35  85.98  85.98  85.98  96.04  96.04  96.04  73.30  73.30
## [621]  73.30  99.41  99.41  99.41  87.19  87.19  87.19  96.97  96.97  96.97
## [631]  90.57  90.57  98.40  98.40  98.40  98.23  98.23  98.23  97.12  97.12
## [641]  97.12  99.19  99.19  99.19  96.36  96.36  96.36  97.26  97.26  97.26
## [651]  99.03  99.03
## 
## -----------
## column Smallcap_Holding 
##   [1]  2.67  2.67  2.67  7.67  7.67  7.67  7.26  7.26  7.26 17.99 17.99 17.99
##  [13]  7.51  7.51  7.51  1.00  1.00  1.00  0.20  0.20  0.20  2.11  2.11  2.11
##  [25]  2.71  2.71  2.71  2.71  2.71 65.09 65.09 65.09 65.09 13.40 13.40 13.40
##  [37]  5.51  5.51  5.51  3.50  3.50  3.50  1.95  1.95  1.95  2.80  2.80  2.80
##  [49]  2.80 26.96 26.96 26.96 26.96  6.21  6.21  6.21  6.21  6.21  6.21  6.21
##  [61]  9.27  9.27  9.27  3.06  3.06  3.06  4.41  4.41  4.41  9.69  9.69  9.69
##  [73]  4.60  4.60  4.60  3.67  3.67  3.67 15.42 15.42 15.42 70.71 70.71 70.71
##  [85] 10.28 10.28 10.28 83.20 83.20 83.20  1.95  1.95  2.63  2.63  2.63 13.45
##  [97] 13.45 13.45  4.46  4.46  4.46  7.11  7.11  7.11  7.04  7.04  7.04  0.18
## [109]  0.18  0.18  0.18 14.24 14.24 14.24 66.68 66.68 66.68 15.18 15.18 15.18
## [121] 10.56 10.56 10.56  2.97  2.97  2.97 15.88 15.88 66.34 66.34 66.34 15.48
## [133] 15.48 15.48 13.82 13.82 13.82  1.29  1.29  1.29  1.29  1.29  1.29  1.29
## [145]  1.29 28.77 28.77 28.77 16.89 16.89 16.89 18.16 18.16 18.16 11.82 11.82
## [157] 11.82 31.73 31.73 31.73 13.01 13.01 13.01  3.65  3.65  3.65 71.22 71.22
## [169] 25.09 25.09 25.09 15.18 15.18 15.18 13.55 13.55 13.55  7.97  7.97  7.97
## [181] 25.06 25.06 25.06 14.58 14.58 14.58  0.52  0.52  0.52 10.21 10.21 10.21
## [193]  6.92  6.92  6.92 13.12 13.12 13.12  0.01  0.01  0.01 88.10 88.10 88.10
## [205]  9.26  9.26  9.26  3.01  3.01  3.01  3.01  3.01 72.15 72.15 72.15 12.07
## [217] 12.07 12.07 12.07 12.07 12.07 12.07  7.43  7.43  7.43 26.96 26.96 26.96
## [229]  4.10  4.10  4.10  3.76  3.76  3.76  3.76  3.76  3.76  3.76 70.12 70.12
## [241] 70.12 23.68 23.68 23.68 15.58 15.58 15.58 17.60 17.60 17.60 19.74 19.74
## [253] 19.74 29.76 29.76 29.76  9.04  9.04  9.04 15.32 15.32 22.99 22.99 22.99
## [265]  4.48  4.48  4.48  3.16  3.16  3.16 16.39 16.39 16.39  5.21  5.21  5.21
## [277]  5.21  5.21  5.21  5.21 78.40 78.40 78.40  3.84  3.84  3.84 11.25 11.25
## [289] 11.25  2.28  2.28  2.28 10.53 10.53 10.53 65.90 65.90 65.90 21.11 21.11
## [301] 21.11 15.87 15.87 15.87 11.93 11.93 11.93  7.50  7.50  7.50  4.64  4.64
## [313]  8.90  8.90  8.90 20.62 20.62 20.62  6.46  6.46  6.46  4.69  4.69  4.69
## [325] 19.75 19.75 19.75  3.32  3.32  3.32 12.13 12.13 12.13 12.13 25.95 25.95
## [337] 25.95  8.67  8.67  8.67 88.38 88.38 88.38 25.11 25.11 25.11 16.41 16.41
## [349] 16.41 16.67 16.67 16.67 11.55 11.55 11.55 11.55 11.55 11.55 11.55  4.87
## [361]  4.87  4.87 15.64 15.64 15.64 68.74 68.74 68.74  2.67  2.67  2.67 29.28
## [373] 29.28 29.28  5.16  5.16  5.16  7.02  7.02  7.02 23.92 23.92 23.92 23.36
## [385] 23.36 23.36 12.86 12.86 12.86 -0.01 -0.01 -0.01  9.12  9.12  9.12 16.39
## [397] 16.39 16.39  5.96  5.96  5.96 31.70 31.70 31.70  8.85  8.85 10.04 10.04
## [409] 10.04 10.04 10.04  6.71  6.71  6.71  6.87  6.87  6.87  6.87  6.87  6.87
## [421]  3.32  3.32  3.32 12.73 12.73 12.73 28.08 28.08 28.08 77.45 77.45 77.45
## [433] 26.05 26.05 26.05 26.05 20.19 20.19 20.19  7.23  7.23  7.23 36.36 36.36
## [445] 36.36 26.13 26.13 26.13  1.36  1.36  1.36 13.83 13.83 13.83 13.83  1.57
## [457]  1.57  1.57 25.01 25.01 25.01 16.63 16.63 16.63 14.79 14.79 14.79  8.10
## [469]  8.10  8.10  7.29  7.29  7.29 10.64 10.64 10.64 33.63 33.63 33.63 28.98
## [481] 28.98 28.98  4.70  4.70  4.70 23.71 23.71 23.71 23.66 23.66 23.66 23.66
## [493] 11.48 11.48  1.85  1.85  1.85  1.85  1.49  1.49  1.49  4.87  4.87  4.87
## [505]  4.87  4.87 11.38 11.38 11.38 10.41 10.41 10.41 33.81 33.81 33.81 10.74
## [517] 10.74 10.74 31.08 31.08  3.81  5.74  5.74  5.74  7.36  7.36  7.36 26.08
## [529] 26.08 26.08  7.30  7.30 12.68 12.68  8.06  8.06  8.06  5.02  5.02  5.02
## [541] 21.66 21.66 21.66 28.58 28.58 28.58 14.11 14.11 14.11 12.49 12.49 12.49
## [553] 28.50 28.50 28.50 23.18 23.18 23.18  6.62  6.62 13.39 13.39 13.39 13.39
## [565]  3.55  3.55  3.55  6.02  6.02  6.02 21.20 21.20 21.20 32.77 32.77 32.77
## [577]  7.08  7.08  7.08  1.16  1.16  1.16  1.16  1.16 13.70 13.70 13.70 15.12
## [589] 86.52 86.52 86.52  5.96  5.96  5.96 14.34 14.34 14.34 61.91 61.91 61.91
## [601] 61.91 38.19 38.19 38.19 13.79 13.79 13.79 13.79 13.79 27.88 27.88 27.88
## [613] 23.23 23.23 23.23 11.83 11.83 11.83 15.39 15.39 15.39 21.26 21.26 21.26
## [625] 26.56 26.56 26.56  3.72  3.72 37.72 37.72 37.72 11.72 11.72 11.72 43.25
## [637] 43.25 43.25  3.15  3.15  3.15 18.11 18.11 18.11 73.75 73.75 73.75 24.67
## [649] 24.67 24.67  5.99  5.99  5.99 27.85 27.85 27.85 15.48 15.48 15.48 21.39
## [661] 21.39 21.39 19.19 19.19 21.58 21.58 21.58 25.85 25.85 25.85 17.89 17.89
## [673] 17.89 75.53 75.53 16.13 16.13 16.13 10.39 10.39 10.39 22.88 22.88 22.88
## [685] 13.01 13.01 13.01 19.77 19.77 19.77  8.07  8.07  8.07 43.96 43.96
## 
## -----------
## column Debt_Holding 
##   [1] 13.94 13.94 13.94 25.78 25.78 25.78  0.51  0.51  0.51 13.31 13.31 13.31
##  [13] 18.82 18.82 18.82  2.80  2.80  2.80  5.62  5.62  5.62 19.97 19.97 19.97
##  [25] 19.97 19.97  1.46  1.46  1.46  1.46  1.78  1.78  1.78  0.75  0.75  0.75
##  [37] 21.38 21.38 21.38 21.38 21.38 21.38 21.38  8.12  8.12  8.12  1.03  1.03
##  [49]  1.03 26.01 26.01 26.01  0.48  0.48  0.48  0.73  0.73  0.73  0.97  0.97
##  [61]  0.97  2.13  2.13  2.13 11.97 11.97 11.97  0.12  0.12  0.12  0.61  0.61
##  [73]  0.61 10.38 10.38 10.38 10.38  1.21  1.21  1.21  0.83  0.83  0.83  0.19
##  [85]  0.19  0.19 10.15 10.15 10.15  1.73  1.73  1.73 14.84 14.84 14.84 14.84
##  [97] 14.84 14.84 14.84 14.84 23.50 23.50 23.50  0.77  0.77  0.77  1.61  1.61
## [109]  1.61  0.53  0.53  0.53  0.22  0.22  0.22 17.08 17.08 17.08  0.26  0.26
## [121]  0.26  0.25  0.25  0.25 12.12 12.12 12.12 21.47 21.47 21.47  0.31  0.31
## [133]  0.31  8.13  8.13  8.13  8.13  8.13 35.87 35.87 35.87 35.87 35.87 35.87
## [145] 35.87  1.71  1.71  1.71  0.45  0.45  0.45 13.90 13.90 13.90 13.90 13.90
## [157] 13.90 13.90  1.17  1.17  1.17  0.37  0.37  0.37 16.19 16.19 16.19  7.16
## [169]  7.16  7.16  7.16  7.16  7.16  7.16  0.39  0.39  0.39  0.40  0.40  0.40
## [181] 10.52 10.52 10.52  0.20  0.20  0.20  0.20  0.20  0.20 27.86 27.86 27.86
## [193] 20.73 20.73 20.73 12.36 12.36 12.36  0.21  0.21  0.21  0.15  0.15  0.15
## [205]  0.15  0.04  0.04  0.04 69.48 69.48 69.48 69.48 69.48 69.48 69.48 11.14
## [217] 11.14 11.14  0.26  0.26  0.26 16.89 16.89 16.89  0.16  0.16  0.16 26.99
## [229] 26.99 26.99 21.59 21.59 21.59  0.37  0.37  0.37 17.89 17.89 17.89 19.48
## [241] 19.48 19.48  0.05  0.05  0.05  0.61  0.61  0.61 18.94 18.94 18.94  1.82
## [253]  1.82  1.82  1.82  1.82  1.82 11.16 11.16 11.16  0.29  0.29  0.29  1.66
## [265]  1.66  1.66 21.55 21.55 21.55  1.12  1.12  1.12  0.13  0.13  0.13  0.99
## [277]  0.99  0.99  4.39  4.39  4.39 19.68 19.68 19.68  5.35  5.35  5.35  2.86
## [289]  2.86  2.86 10.79 10.79 10.79 14.83 14.83 14.83 14.83 14.83  0.03  0.03
## [301]  0.03 26.37 26.37 26.37 29.41  0.07  0.07  0.07  0.64  0.64  0.64  0.23
## [313]  0.23  4.75  4.75  4.75 10.48 10.48 10.48  0.30  0.30  0.30  0.11  0.11
## [325]  0.11  0.11 21.57 21.57 21.57 11.23 11.23 11.23  0.33  0.33  0.33 23.34
## [337] 23.34 23.34 23.34 23.34  8.66  0.09  0.09  0.09 22.31 22.31 22.31 22.31
## [349] 22.31 59.37 59.37  0.09  0.09  0.09  9.63  9.63  9.63  0.10  0.10  0.10
## [361] 10.56 10.56 10.56 19.23 19.23 19.23  3.62  3.62  3.62  1.17  1.17  1.17
## [373]  0.07  0.07  0.07
## 
## -----------
## column Average_Maturity 
##   [1]  7.60  7.60  7.60  5.56  5.56  5.56  3.39  3.39  3.39  6.73  6.73  6.73
##  [13]  6.73  6.73  8.19  8.19  8.19  8.19  8.19  8.19  8.19  3.88  3.88  3.88
##  [25]  6.00  6.00  6.00  6.00  6.00  7.50  7.50  7.50  7.50  7.50  7.01  7.01
##  [37]  7.01  7.01  9.67  9.67  9.67  4.53  4.53  4.53  2.92  2.92  2.92  2.92
##  [49]  2.92  3.14  3.14  3.14 12.90 12.90 12.90  3.38  3.38  3.38  4.16  4.16
##  [61]  4.16  3.44  3.44  3.44  2.97  2.97  2.97  2.97  2.97  2.97  2.97  4.12
##  [73]  4.12  4.12  4.12  4.12  3.49  3.49  3.49  3.49  3.49  3.49  3.49 14.03
##  [85] 14.03 14.03 14.03 14.03 14.03 14.03 14.03 14.03  2.97  2.97  2.97  2.97
##  [97]  2.97  2.97 11.13 11.13 11.13 13.66 13.66 13.66  3.93  3.93  3.93 13.82
## [109] 13.82 13.82 20.72 20.72 20.72  3.81  3.81  3.81  3.81  3.81  3.81  3.81
## [121]  2.99  2.99  2.99  2.99  2.99  3.59  3.59  3.59  7.30  7.30  7.30  5.13
## [133]  5.13  5.13  5.94  5.94  5.94  3.29  3.29  3.29  4.52  4.52  4.52  4.52
## [145]  4.52  3.42  3.42  3.42  3.42  3.42  3.42  3.42  3.42  3.42  3.42  3.42
## [157] 13.69 13.69 13.69  3.69  3.69  3.69  3.69  3.69  3.69  3.69  3.69  5.89
## [169]  5.89  5.89  3.00  3.00  3.00  3.00  3.00  3.00  3.00 22.05 22.05 22.05
## [181] 22.05 22.05 22.05 22.05 22.05 22.05 22.05 22.05  4.46  4.46  4.46  3.07
## [193]  3.07  3.07  3.07  3.07  3.07  6.45  6.45  6.45  9.26  9.26  9.26  3.77
## [205]  3.77  3.77  3.77  3.77  4.05  4.05  4.05  9.40  9.40  9.40  4.03  4.03
## [217]  4.03  4.03  4.03  4.03  4.03  4.03  4.03  3.65  3.65  3.65  3.43  3.43
## [229]  3.43  6.48  6.48  6.48  4.34  4.34  4.34  4.13  4.13  4.13  4.13  4.13
## [241]  4.13  4.13  4.13  4.13  4.04  4.04  4.04  4.11  4.11  4.11  4.11  4.11
## [253]  4.11  4.11  4.11  4.11  4.11  9.53  9.53  3.15  3.15  3.15  5.67  5.67
## [265]  5.67  2.91  2.91  2.91  2.91  2.91
## 
## -----------
## column Sortino_Ratio 
##   [1] 1.05 1.06 1.37 1.37 1.03 1.03 1.03 1.04 1.04 1.04 1.04 0.89 1.04 1.01 1.01
##  [16] 1.04 0.90 0.90 0.90 0.89 0.89 0.91 0.94 0.94 0.93 0.94 0.93 0.94 0.94 0.93
##  [31] 0.99 0.99 0.99 0.99 0.99 0.99 0.99 1.05 1.06 1.01 0.90 0.90 0.89 0.90 0.90
##  [46] 0.90 0.90 0.90 0.90 0.90 0.90 0.90 0.90 0.89 0.90 0.90 1.03 1.03 1.05 1.02
##  [61] 1.02 1.02 1.02 1.02 1.02 1.02 1.02 1.02 1.02 1.02 1.06 1.07 1.02 1.02 1.00
##  [76] 1.00 1.00 1.00 1.00 1.00 0.90 0.90 0.89 0.95 0.95 0.95 0.95 1.18 1.18 1.18
##  [91] 1.20 1.18 1.18 0.95 0.95 1.04 1.05 0.99 0.99 0.99 1.01 1.01 1.01 1.01 1.01
## [106] 1.01 1.20 1.21 1.21 1.00 0.99 1.00 1.00 1.02 1.02 1.02 0.94 0.94 1.06 1.08
## [121] 1.08 1.06 1.06 0.95 0.95 0.96 0.95 1.00 1.00 1.00 0.89
## 
## -----------
## column Sharpe_Ratio 
##  [1] -28.62 -28.62 -28.63 -28.62   7.12   7.19   7.14   7.14   7.08   7.15
## [11]   7.14   7.14   7.39   7.39   7.38   7.39   7.38   7.39   7.38   7.38
## [21]   7.28   7.33   7.19   7.19   7.19   7.19 -28.07 -28.42   7.97   7.97
## [31]   7.97   8.00   7.97   7.97  -5.69 -23.18 -23.91 -23.97 -23.97  -5.69
## [41] -27.26 -27.26 -20.87 -26.98 -27.25 -12.94 -27.12  -6.06 -24.00 -24.03
## [51] -14.85 -24.02 -24.00 -24.03   7.57   7.55   7.55   7.29   7.30   7.29
## [61]   7.29 -26.59 -26.59 -26.59 -26.59   7.13   7.21   7.21   7.13   7.13
## [71]   7.07   7.06   7.07 -26.64 -25.28 -10.86 -26.64 -25.28 -26.64
## 
## -----------
## column Average_YTM 
##   [1] 6.18 6.18 6.18 6.18 6.18 6.18 6.21 6.21 6.21 7.26 7.26 7.26 7.26 7.26 7.24
##  [16] 7.24 7.24 7.24 7.24 5.60 5.60 5.60 6.06 6.06 6.06 6.06 6.06 6.06 6.06 6.06
##  [31] 6.06 6.06 6.06 6.06 4.97 4.97 4.97 4.97 6.10 6.10 6.10 6.10 6.03 6.03 6.03
##  [46] 6.03 6.03 6.03 6.03 6.03 6.03 6.03 6.03 6.03 6.03 6.03 6.03 7.29 7.29 7.29
##  [61] 7.41 7.41 7.41 5.73 5.73 5.73 6.18 6.18 6.18 6.18 6.18 6.18 6.01 6.01 6.01
##  [76] 6.01 6.01 6.01 6.01 6.01 6.01 7.29 7.29 7.29 7.25 7.25 7.25 6.12 6.12 6.12
##  [91] 6.12 6.12 6.12 7.74 7.74 7.74 7.74 7.74 7.74 7.74 7.74 7.74 7.26 7.26 7.26
## [106] 6.25 6.25 6.25 6.25 6.25 6.25 6.25 5.88 5.88 5.88 5.88 5.88 5.88 5.88 6.25
## [121] 6.25 6.25 6.25 5.04 5.04 5.04 5.04 5.04 5.04 5.04 5.04 5.05 5.05 7.27 7.27
## [136] 7.27 7.27 7.27 7.27 7.27 6.04 6.04 6.04 6.04 6.04 6.04 6.04 6.04 6.04 6.04
## [151] 6.04 6.04 6.04 5.04 5.04 5.04 5.04 5.04 5.04 7.39 7.39 7.39 7.56 7.56 7.56
## [166] 7.56 7.56 7.56 7.56 6.15 6.15 6.15 7.29 7.29 7.29 7.29 7.29 4.99 4.99 4.99
## [181] 4.99 4.99 6.14 5.97 5.97 5.97 7.28 7.28 7.28 7.28 7.28 7.28 7.28 7.40 7.40
## [196] 7.40 7.40 7.40 7.40 7.40 7.40 7.40 7.40 7.40 6.20 6.20 6.15 6.15 6.15 6.15
## [211] 6.15 6.15 6.15 6.05 6.05 6.05 6.05 6.05 6.05 6.05 6.05 7.46 7.46 7.46 5.06
## [226] 5.06 5.98 5.98 5.98 8.31 8.31 8.31 8.31 8.31 8.03 8.03 8.03 5.05 5.05 5.05
## [241] 5.05 5.05 5.05 5.05 6.15 7.58 7.58 7.58 6.07 6.07 6.07 5.88 5.88 5.88 5.61
## [256] 5.61 5.61 8.41 8.41 8.41 5.85 5.85 5.85 7.33 7.33 7.33 7.33 7.33 6.09 6.09
## [271] 6.09 6.09 6.09 6.09 6.09 6.09 7.99 7.99 7.99 4.95 4.95 4.95 4.95 5.92 5.92
## [286] 5.92 5.92 5.92 7.32 7.32 7.32 5.10 5.10 5.10 5.01 5.01 5.01 5.01 5.01 5.01
## 
## -----------
## column Time_since_inception 
##  [1]  6  6 10 23 23 23 21 21 21  9 22 22 22 20 20 20 21 21 21 21 21 21
## 
## -----------
## column Lockin 
##  [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 
## -----------
## column Minimum_SIP 
##   [1] 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500
##  [19] 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500
##  [37] 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500
##  [55] 500 500 500 500 500 500 500 500 500 500 500 500 500 500 300 300 300 500
##  [73] 500 500 500 500 500 500 500 500 500 500 500 500 500 600 600 600 500 500
##  [91] 500 500 300 300 300 300 300 300 300 300 300 500 500 500 500 500 500 500
## [109] 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500
## [127] 500 500 750 750 750 500 500 500 500 500 500 500 500 500 500 500 500 500
## [145] 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500 500
## [163] 500 500 500 500
## 
## -----------

# small visaulization to understand outliers
exclude_cols <- c("AUM", "NAV", "Minimum_Lumpsum")

mf_data |>
  select(-all_of(exclude_cols)) |>
  pivot_longer(cols = where(is.numeric)) |>
  ggplot(aes(x = value, y = name)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90))

## Error : The fig.showtext code chunk option must be TRUE

# removing outliers
mf_data_cleaned <- mf_data
for (col in numeric_cols) {
  q1 <- quantile(mf_data[[col]], 0.25, na.rm = TRUE)
  q3 <- quantile(mf_data[[col]], 0.75, na.rm = TRUE)
  iqr_val <- IQR(mf_data[[col]], na.rm = TRUE)

  lower <- q1 - 1.5 * iqr_val
  upper <- q3 + 1.5 * iqr_val

  mf_data_cleaned <- mf_data_cleaned[
    (mf_data_cleaned[[col]] >= lower & mf_data_cleaned[[col]] <= upper) |
      is.na(mf_data_cleaned[[col]]),
  ]
}

# finally data is cleaned but still contains na we
# need detect and treat them

sum(is.na(mf_data_cleaned))

## [1] 23

mf_data[is.na(mf_data)] <- 0

# finally store the data into file
write.csv(
  mf_data_cleaned,
  "D:/semester_projects/R/data_preprocessing_phase/mutual_fund.csv",
  row.names = FALSE
)

Interpretation: Outliers were detected using the IQR method and removed to reduce extreme influence on analysis. Some naturally skewed variables were excluded. This improves model stability and ensures that results reflect typical fund behavior rather than rare extreme observations.

Phase 2: Exploratory Data Analysis.

2.1 What are the central tendencies and spread of returns and risk metrics.

# loading the data
mf_data <- read_csv(
  "D:/semester_projects/R/data_preprocessing_phase/mutual_fund.csv",
  show_col_types = FALSE
)

# Central Tendencies of returns
# CAGR 3Y
mean_cagr_3y <- mf_data |> summarise(mean_cagr_3y = mean(CAGR_3Y, na.rm = TRUE))

median_cagr_3y <- mf_data |>
  summarise(
    median_cagr_3y = median(CAGR_3Y, na.rm = TRUE)
  )

sd_cagr_3y <- mf_data |> summarise(SD = sd(CAGR_3Y, na.rm = TRUE))

iqr_cagr_3y <- mf_data |> summarise(IQR = IQR(CAGR_3Y, na.rm = TRUE))

# CAGR 5Y
mean_cagr_5y <- mf_data |> summarise(mean_cagr_5y = mean(CAGR_5Y, na.rm = TRUE))

median_cagr_5y <- mf_data |> summarise(
  median_cagr_5y = median(CAGR_5Y, na.rm = TRUE)
)

sd_cagr_5y <- mf_data |> summarise(SD = sd(CAGR_5Y, na.rm = TRUE))

iqr_cagr_5y <- mf_data |> summarise(IQR = IQR(CAGR_5Y, na.rm = TRUE))

# Absolute_Returns_3M
mean_absolute_returns_3m <- mf_data |>
  summarize(Absolute_Returns_3M = mean(Absolute_Returns_3M, na.rm = TRUE))

median_absolute_returns_3m <- mf_data |>
  summarize(Absolute_Returns_3M = median(Absolute_Returns_3M, na.rm = TRUE))

sd_absolute_returns_3m <- mf_data |> summarise(
  SD = sd(Absolute_Returns_3M, na.rm = TRUE)
)

iqr_absolute_returns_3m <- mf_data |> summarise(
  IQR = IQR(Absolute_Returns_3M, na.rm = TRUE)
)

# Absolute_Returns_6M
mean_absolute_returns_6m <- mf_data |>
  summarize(Absolute_Returns_6M = mean(Absolute_Returns_6M, na.rm = TRUE))

median_absolute_returns_6m <- mf_data |>
  summarize(Absolute_Returns_6M = median(Absolute_Returns_6M, na.rm = TRUE))

sd_absolute_returns_6m <- mf_data |> summarise(
  SD = sd(Absolute_Returns_6M, na.rm = TRUE)
)

iqr_absolute_returns_6m <- mf_data |> summarise(
  IQR = IQR(Absolute_Returns_6M, na.rm = TRUE)
)

# Absolute_Returns_1Y
mean_absolute_returns_1y <- mf_data |>
  summarize(Absolute_Returns_1y = mean(Absolute_Returns_1Y, na.rm = TRUE))

median_absolute_returns_1y <- mf_data |>
  summarize(Absolute_Returns_1y = median(Absolute_Returns_1Y, na.rm = TRUE))

sd_absolute_returns_1y <- mf_data |> summarise(
  SD = sd(Absolute_Returns_1Y, na.rm = TRUE)
)
iqr_absolute_returns_1y <- mf_data |> summarise(
  IQR = IQR(Absolute_Returns_1Y, na.rm = TRUE)
)
central_tendency_and_spread <- data.frame(
  Metric = c("CAGR_3Y", "CAGR_5Y", "3M", "6M", "1Y"),
  Mean = c(
    mean_cagr_3y$mean_cagr_3y,
    mean_cagr_5y$mean_cagr_5y,
    mean_absolute_returns_3m$Absolute_Returns_3M,
    mean_absolute_returns_6m$Absolute_Returns_6M,
    mean_absolute_returns_1y$Absolute_Returns_1y
  ),
  Median = c(
    median_cagr_3y$median_cagr_3y,
    median_cagr_5y$median_cagr_5y,
    median_absolute_returns_3m$Absolute_Returns_3M,
    median_absolute_returns_6m$Absolute_Returns_6M,
    median_absolute_returns_1y$Absolute_Returns_1y
  ),
  SD = c(
    sd_cagr_3y$SD,
    sd_cagr_5y$SD,
    sd_absolute_returns_3m$SD,
    sd_absolute_returns_6m$SD,
    sd_absolute_returns_1y$SD
  ),
  IQR = c(
    iqr_cagr_3y$IQR,
    iqr_cagr_5y$IQR,
    iqr_absolute_returns_3m$IQR,
    iqr_absolute_returns_6m$IQR,
    iqr_absolute_returns_1y$IQR
  ),
  stringsAsFactors = FALSE
)

central_tendency_and_spread

##    Metric      Mean Median       SD    IQR
## 1 CAGR_3Y 2.9014706   0.40 3.558050 7.3950
## 2 CAGR_5Y 2.3390294   0.40 2.906254 6.1200
## 3      3M 0.2169412  -0.05 0.731940 1.1275
## 4      6M 0.6809412  -0.04 1.441582 2.6600
## 5      1Y 1.9766471   0.00 3.299085 6.6325

Interpretation: Measures like mean, median, standard deviation, and IQR reveal the central tendency and variability of returns. Differences between mean and median indicate skewness, while higher standard deviation reflects greater uncertainty and risk in mutual fund performance.

2.2 How are returns distributed.

hist(mf_data$CAGR_3Y, breaks = 30, main = "Distribution of Returns")

## Error : The fig.showtext code chunk option must be TRUE

plot(density(mf_data$CAGR_3Y, na.rm = TRUE))

## Error : The fig.showtext code chunk option must be TRUE

skewness(mf_data$CAGR_3Y, na.rm = TRUE)

## [1] 0.4708283

Interpretation: Return distributions show slight skewness, indicating that most funds deliver moderate returns while a few achieve significantly higher performance. This highlights inequality in fund performance and suggests the presence of outliers or high-performing funds influencing the distribution.

2.3 What is the relationship between short-term and long-term returns.

ggplot(mf_data, aes(x = Absolute_Returns_1Y, y = CAGR_3Y)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Short term returns VS Long term returns",
    x = "1-Year Returns (short-term)",
    y = "3-Year CAGR (Long-term)"
  ) +
  theme_minimal()

## Error : The fig.showtext code chunk option must be TRUE

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: The relationship between short-term and long-term returns shows moderate correlation. While some consistency exists, short-term performance does not always translate into long-term success, indicating that investment decisions should not rely solely on recent returns.

2.4 Are high-return funds consistently associated with higher risk.

ggplot(mf_data, aes(x = Volatility, y = Absolute_Returns_1Y)) +
  geom_point(
    color = "steelblue", alpha = 0.6, size = 1.5
  ) + # Added alpha to see overlapping dots

  # This is the magic line that fixes the labels:
  geom_text_repel(
    aes(label = Plan),
    size = 2.5, # Small but readable font
    max.overlaps = 15, # Only labels the 15 least-crowded points (prevents mess)
    box.padding = 0.35, # Adds space around labels
    point.padding = 0.5, # Adds space between label and dot
    segment.color = "grey50" # Adds a tiny line pointing from text to dot
  ) +
  geom_smooth(
    method = "lm",
    se = FALSE, linetype = "dashed",
    color = "red", size = 0.8
  ) +
  labs(
    title = "Risk vs. Return Analysis (1-Year)",
    subtitle = "Higher volatility generally
    trends with higher absolute returns",
    x = "Volatility (Risk %)",
    y = "Absolute 1-Year Return (%)",
    caption = "Red line represents the expected risk-return tradeoff"
  ) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Error : The fig.showtext code chunk option must be TRUE

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: A general positive relationship exists between volatility and returns, supporting the risk-return tradeoff theory. However, the presence of scattered points indicates variability, meaning higher risk does not always guarantee higher returns for all funds.

2.5 What patterns emerge from scatter plots of risk vs return.

mf_long <- mf_data |>
  select(
    Volatility, CAGR_3Y,
    CAGR_5Y, Absolute_Returns_1Y,
    Absolute_Returns_3M, Absolute_Returns_6M
  ) |>
  pivot_longer(
    cols = c(
      CAGR_3Y, CAGR_5Y,
      Absolute_Returns_1Y,
      Absolute_Returns_3M, Absolute_Returns_6M
    ),
    names_to = "Return_Type",
    values_to = "Return_Value"
  )

# Plot
ggplot(mf_long, aes(x = Volatility, y = Return_Value)) +
  geom_point(alpha = 0.6, color = "orange") +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
  facet_wrap(~Return_Type, scales = "free_y") + # 🔥 Subplots here

  labs(
    title = "Risk vs Return Across Different Time Horizons",
    x = "Volatility",
    y = "Returns"
  ) +
  theme_minimal()

## Error : The fig.showtext code chunk option must be TRUE

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: Across different time horizons, risk-return patterns vary, with long-term returns showing more stable relationships compared to short-term returns. This suggests that long-term investments provide more predictable performance compared to short-term fluctuations.

Phase 3: Feature Engineering & Transformation.

3.1 How can new features like Net Return should be created.

mf_data <- mf_data |>
  mutate(
    Net_CAGR_3Y = CAGR_3Y - Expense_Ratio,
    Net_CAGR_5Y = CAGR_5Y - Expense_Ratio,
    Net_Return_1Y = Absolute_Returns_1Y - Expense_Ratio
  )

Interpretation: Net returns were calculated by subtracting expense ratios from gross returns, reflecting actual investor gains. This adjustment provides a more realistic measure of performance, highlighting the impact of costs on overall investment returns.

3.2 Can a custom Risk-Adjusted Return metric outperform traditional Sharpe Ratio.

mf_data <- mf_data |>
  mutate(
    Custom_Risk_Adjusted = round(
      Net_CAGR_3Y / (Volatility + 0.01) * Sortino_Ratio, 2
    )
  )
head(mf_data$Custom_Risk_Adjusted, 20)

##  [1] -0.62 19.81 -0.62  2.41 -0.62 -0.62 18.40 18.40 -0.63 19.68  1.56 -0.63
## [13] -0.12  3.67 -0.11 -0.12 -0.01 -0.07  2.87  3.18

Interpretation: A custom metric combining return, volatility, and Sortino ratio was developed to better evaluate performance. It emphasizes downside risk, offering a more investor-focused measure compared to traditional metrics like Sharpe ratio.

3.3 What composite risk score can be built using volatility, Sortino, and drawdown proxies.

mf_data <- mf_data |>
  mutate(
    Risk_Score = round(scale(Volatility) +
      scale(1 / (Sortino_Ratio + 0.01)) +
      scale(1 / (Sharpe_Ratio + 0.01)), 0)
  )
head(mf_data$Risk_Score)

##      [,1]
## [1,]   -2
## [2,]   -2
## [3,]   -2
## [4,]    1
## [5,]   -2
## [6,]   -2

Interpretation: A composite risk score was created using volatility, Sharpe, and Sortino ratios. This provides a holistic assessment of risk, capturing multiple dimensions rather than relying on a single metric, leading to more balanced evaluation.

3.4 How can equity exposure to be derived from allocation percentages.

mf_data <- mf_data |>
  mutate(
    Total_Equity_Exposure = Largecap_Holding +
      Midcap_Holding +
      Smallcap_Holding,
    Equity_to_Debt_Ratio = Equity_Holding / (Debt_Holding + 0.01)
  )
head(mf_data$Total_Equity_Exposure)

## [1] 79.55 79.55 79.55 79.55 79.55 79.55

head(mf_data$Equity_to_Debt_Ratio)

## [1] 17.80019 17.80019 17.80019 17.80019 17.80019 17.80019

Interpretation: Equity exposure and allocation ratios were derived from portfolio holdings. These metrics help understand fund composition and risk profile, as higher equity allocation typically indicates higher risk and potential for greater returns.

3.5 What normalization techniques are required for combining the features.

normalize <- function(x) {
  (x - min(x, na.rm = TRUE)) /
    (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}

mf_data <- mf_data |>
  mutate(
    Norm_Return = round(normalize(Net_CAGR_3Y), 2),
    Norm_Risk = round(normalize(Volatility), 2),
    Norm_Cost = round(normalize(Expense_Ratio), 2),
    Norm_Score = round(normalize(Custom_Risk_Adjusted), 2)
  )
head(mf_data$Norm_Return, 20)

##  [1] 0.05 0.99 0.05 0.91 0.05 0.05 0.91 0.91 0.05 0.99 0.91 0.05 0.06 0.92 0.06
## [16] 0.06 0.06 0.06 1.00 0.93

Interpretation: Normalization scaled all variables to a common range, ensuring comparability across features. This step is essential for combining different metrics into composite scores and prevents variables with larger scales from dominating analysis.

Phase 4: Risk – Return Analysis.

4.1 Defining statistic significant relationship between risk and returns.

# finding the linear relationship between risk and return using CAGR
mf_lm_model <- lm(Norm_Return ~ Norm_Risk, data = mf_data)
summary(mf_lm_model)

## 
## Call:
## lm(formula = Norm_Return ~ Norm_Risk, data = mf_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4221 -0.3786 -0.2622  0.5042  0.6436 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.47206    0.04536   10.41   <2e-16 ***
## Norm_Risk   -0.57840    0.25365   -2.28   0.0238 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4314 on 168 degrees of freedom
## Multiple R-squared:  0.03002,    Adjusted R-squared:  0.02425 
## F-statistic:   5.2 on 1 and 168 DF,  p-value: 0.02385

ggplot(mf_data, aes(x = Norm_Risk, y = Norm_Return)) +
  geom_point(alpha = 0.5, color = "orange") +
  geom_smooth(method = "lm", color = "black", fill = "lightgreen") +
  labs(
    title = "Relationship between risk and return",
    subtitle = paste("R-squared: ", round(summary(mf_lm_model)$r.squared, 3)),
    x = "Normalized Risk",
    y = "Normalized Returns"
  )

## Error : The fig.showtext code chunk option must be TRUE

## `geom_smooth()` using formula = 'y ~ x'

Interpretation: Regression analysis shows a positive but not strong relationship between risk and return. The R^2 value indicates that risk explains only part of return variability, suggesting other factors also influence fund performance.

4.2 Identify inefficient funds base on high risk and low return.

return_cutoff <- median(mf_data$Norm_Return)
risk_cutoff <- median(mf_data$Norm_Risk)

inefficient_funds <- mf_data[mf_data$Norm_Risk > risk_cutoff & mf_data$Norm_Return < return_cutoff, "Name"]

unique(inefficient_funds)

## # A tibble: 26 × 1
##    Name                                           
##    <chr>                                          
##  1 Bandhan Corp Bond Fund(M-IDCW)                 
##  2 Bandhan Corp Bond Fund(H-IDCW)                 
##  3 Aditya Birla SL Low Duration Fund(W-IDCW Reinv)
##  4 Aditya Birla SL Low Duration Fund(DD-IDCW)     
##  5 Aditya Birla SL Low Duration Fund(W-IDCW)      
##  6 Aditya Birla SL Floating Rate Fund(DD-IDCW)    
##  7 Aditya Birla SL Floating Rate Fund(W-IDCW)     
##  8 Bandhan Banking and PSU Fund(DD-IDCW)          
##  9 Bandhan Banking and PSU Fund(M-IDCW)           
## 10 Bandhan Banking and PSU Fund(FN-IDCW)          
## # ℹ 16 more rows

Interpretation: Funds with high risk and low returns were identified as inefficient. These funds fail to compensate investors adequately for the level of risk taken and may not be suitable for investment compared to better-performing alternatives.

4.3 How do Sharpe and Sortino ratios differs in ranking funds.

sharpe_ranks <- mf_data[order(-mf_data$Sharpe_Ratio), c("Name", "Sharpe_Ratio")]
colnames(sharpe_ranks) <- c("name","value")

sortino_ranks <- mf_data[order(-mf_data$Sortino_Ratio), c("Name", "Sortino_Ratio")]
colnames(sortino_ranks) <- c("name","value")

comparison <- data.frame(
  Rank = 1:10,
  Top_Sharpe = sharpe_ranks$name[1:10],
  Top_Sortino = sortino_ranks$name[1:10]
)

comparison

##    Rank                               Top_Sharpe
## 1     1   Axis Ultra Short Duration Fund(W-IDCW)
## 2     2   Axis Ultra Short Duration Fund(W-IDCW)
## 3     3   Axis Ultra Short Duration Fund(M-IDCW)
## 4     4   Axis Ultra Short Duration Fund(M-IDCW)
## 5     5            SBI Savings Fund(IDCW-Payout)
## 6     6                   SBI Savings Fund(IDCW)
## 7     7                         SBI Savings Fund
## 8     8                SBI Savings Fund(DD-IDCW)
## 9     9 ICICI Pru Money Market Fund(IDCW-Payout)
## 10   10        ICICI Pru Money Market Fund(IDCW)
##                                Top_Sortino
## 1            SBI Savings Fund(IDCW-Payout)
## 2                   SBI Savings Fund(IDCW)
## 3     Nippon India Low Duration Fund(IDCW)
## 4           Nippon India Low Duration Fund
## 5  Nippon India Low Duration Fund(DD-IDCW)
## 6   Nippon India Low Duration Fund(Q-IDCW)
## 7   Nippon India Low Duration Fund(W-IDCW)
## 8           Nippon India Low Duration Fund
## 9   Nippon India Low Duration Fund(M-IDCW)
## 10  Nippon India Low Duration Fund(M-IDCW)

Interpretation: Differences in rankings between Sharpe and Sortino ratios highlight the importance of downside risk. Sortino provides a more realistic evaluation by penalizing only negative volatility, making it more relevant for risk-averse investors.

4.4 Do SEBI risk categories align with actual observed volatility.

ggplot(mf_data,aes(x = Volatility, y = SEBI_Risk_Category, fill = SEBI_Risk_Category))+
  geom_violin(trim = FALSE, alpha =  0.5)+
  geom_boxplot(width = 0.1, color="black", outlier.shape = NA)+
  geom_jitter(width = 0.1, alpha = 0.2)+
  theme_minimal()+
  labs(
    title = "Do SEBI Labels match Actual Volatility? ",
    x = "Observed Volatility",
    y = "SEBI Riskometer category"
  )

## Error : The fig.showtext code chunk option must be TRUE

Interpretation: Comparison of SEBI risk categories with actual volatility shows partial alignment. Some overlap indicates inconsistencies, suggesting that official classifications may not always accurately reflect real risk levels.

4.5 How stable is the risk–return relationship using CAGR and Absolute returns.

ggplot(central_tendency_and_spread, aes(x = SD, y = Mean, label = Metric)) +
  geom_point(size = 2, color = "steelblue")+
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "grey")+
  geom_text(aes(label = Metric),vjust = -1.2)+
  theme_minimal()+
  labs(
    title = "Risk vs Return Stability",
    x = "Observed Risk",
    y = "Average Returns"
  )

## Error : The fig.showtext code chunk option must be TRUE

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Interpretation: The relationship between risk and return across different metrics shows moderate stability. However, variability indicates that returns are not fully predictable, emphasizing the uncertain nature of financial markets.

Phase 5: Optimization, Cost & Efficiency Analysis.

5.1 How does expense ratio impact net returns and risk-adjusted performance.

mf_data |> mutate(Fee_Quartile = as.factor(ntile(Expense_Ratio, 4))) |>
  group_by(Fee_Quartile) |>
  summarise(Avg_Expense = mean(Expense_Ratio),
            Avg_Net_Return = mean(CAGR_3Y)) |>
  # knitr::kable(caption = "Performance decay by expense Quartile")
  
ggplot(aes(x = Fee_Quartile, y = Avg_Net_Return, fill = Avg_Net_Return)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = round(Avg_Net_Return, 2)), vjust = -0.5, size = 4) +
  scale_fill_gradient(low = "#FF6B6B", high = "#00DAC6") + # Red for low, Teal for high
  theme_minimal() +
  labs(title = "Average Net Return (CAGR) by Expense Quartile",
       subtitle = "Quartile 1: Cheapest Funds | Quartile 4: Most Expensive Funds",
       x = "Fee Quartile (Low to High Expense)",
       y = "Average 3Y CAGR (%)")

## Error : The fig.showtext code chunk option must be TRUE

Interpretation: Higher expense ratios reduce net returns, negatively affecting fund performance. This highlights the importance of cost efficiency, as even small differences in fees can significantly impact long-term investment outcomes.

5.2 How to select optimal funds using multi-criteria optimization (return, risk, cost).

scale_min_max <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}

# adding SD column to mutual funds data
mf_data <- mf_data |>
  rowwise() |>
  mutate(
    SD = sd(c(Absolute_Returns_3M, Absolute_Returns_6M, Absolute_Returns_1Y, CAGR_5Y, CAGR_3Y), na.rm = TRUE)
  ) |>
  ungroup()

cagr_3y_sd <- central_tendency_and_spread |> 
  filter(Metric == "CAGR_3Y") |> 
  pull(SD)

# creating an optimized mutual fund data frame
mf_optimized <- mf_data |>
  mutate(
    score_return = scale_min_max(CAGR_3Y),
    score_risk = 1 - (SD / max(cagr_3y_sd)),
    score_cost = 1 - scale_min_max(Expense_Ratio)
  )

mf_optimized <- mf_optimized |>
  mutate(Final_Score = (score_return *0.40) +
                       (score_risk * 0.40)+
                       (score_cost * 0.20))

Interpretation: Combining return, risk, and cost into a weighted score identifies optimal funds. This approach balances multiple factors, providing a more comprehensive evaluation compared to relying on a single metric.

5.3 Which funds provide the best return per unit cost.

mf_optimized <- mf_data |> 
  mutate(
    Return_cost_Ratio = CAGR_3Y / Expense_Ratio
  ) |> 
  arrange(desc(Return_cost_Ratio))

head(mf_optimized,6)

## # A tibble: 6 × 49
##   Name             Sub_Category Plan     AUM CAGR_3Y CAGR_5Y Absolute_Returns_3M
##   <chr>            <chr>        <chr>  <dbl>   <dbl>   <dbl>               <dbl>
## 1 Mirae Asset Liq… Liquid Fund  IDCW  12395.    7.58    6.32               -1.39
## 2 Mirae Asset Liq… Liquid Fund  IDCW  12395.    7.58    6.32               -1.39
## 3 Mirae Asset Liq… Liquid Fund  IDCW  12395.    7.58    6.32               -1.39
## 4 Mirae Asset Liq… Liquid Fund  IDCW  12395.    7.58    6.32                0.07
## 5 Mirae Asset Liq… Liquid Fund  Grow… 12395.    6.98    6.01                1.42
## 6 Axis Liquid Fun… Liquid Fund  IDCW  43636.    7.58    6.32               -1.39
## # ℹ 42 more variables: Absolute_Returns_1Y <dbl>, Absolute_Returns_6M <dbl>,
## #   SEBI_Risk_Category <chr>, Volatility <dbl>, Category_St_Dev <dbl>,
## #   Other_Holdings <dbl>, Largecap_Holding <dbl>, Midcap_Holding <dbl>,
## #   Equity_Holding <dbl>, Smallcap_Holding <dbl>, Debt_Holding <dbl>,
## #   Category_YTM <dbl>, PE_Ratio <dbl>, Average_Maturity <dbl>,
## #   Category_PE_Ratio <dbl>, Sortino_Ratio <dbl>, Sharpe_Ratio <dbl>,
## #   Average_YTM <dbl>, Benchmark <dbl>, Time_since_inception <dbl>, …

Interpretation: The return-to-cost ratio identifies funds delivering higher returns relative to their expense ratio. This metric highlights cost-efficient investments, helping investors maximize returns while minimizing expenses.

5.4 What if linear or nonlinear optimization identify an optimal subset of funds.

# 1. Create a Risk-Return Efficiency Metric
mf_optimized <- mf_data |>
  rowwise() |>
  mutate(
    Calculated_SD = sd(c(Absolute_Returns_3M, Absolute_Returns_6M, Absolute_Returns_1Y, CAGR_3Y), na.rm = TRUE),
    Efficiency_Ratio = ifelse(Calculated_SD > 0, CAGR_3Y / Calculated_SD, 0)
  ) |>
  ungroup()

# 2. Identify the Optimal Subset
optimal_subset <- mf_optimized |>
  arrange(desc(Efficiency_Ratio)) |>
  head(10) |>
  select(Name, CAGR_3Y, Calculated_SD, Expense_Ratio, Efficiency_Ratio)

# 3. Output the Subset Table
knitr::kable(optimal_subset, 
             caption = "Optimal Subset Identified via Risk-Adjusted Optimization")

Optimal Subset Identified via Risk-Adjusted Optimization
Name	CAGR_3Y	Calculated_SD	Expense_Ratio	Efficiency_Ratio
DSP Liquidity Fund	6.99	2.655221	0.13	2.632549
Axis Liquid Fund	7.01	2.668762	0.11	2.626686
Aditya Birla SL Liquid Fund	7.03	2.677573	0.21	2.625511
ICICI Pru Liquid Fund(IDCW-Payout)	6.96	2.654242	0.20	2.622218
ICICI Pru Liquid Fund(IDCW)	6.96	2.654242	0.20	2.622218
ICICI Pru Liquid Fund	6.96	2.654242	0.20	2.622218
HDFC Liquid Fund	6.94	2.647426	0.20	2.621414
Mirae Asset Liquid Fund	6.98	2.663549	0.09	2.620564
Nippon India Liquid Fund	6.99	2.668262	0.20	2.619682
ICICI Pru Money Market Fund(IDCW-Payout)	7.44	3.064032	0.21	2.428173

# 4. Visualize the 'Optimal Frontier'
ggplot(mf_optimized, aes(x = Calculated_SD, y = CAGR_3Y)) +
  geom_point(aes(color = Expense_Ratio), alpha = 0.3) +
  # Highlight the Optimal Subset in a distinct color from your theme
  geom_point(data = optimal_subset, color = "#EA80FC", size = 4) +
  geom_text_repel(data = optimal_subset, aes(label = Name), size = 3, color = "#B8BCC2") +
  scale_color_gradient(low = "#00DAC6", high = "#FF6B6B") +
  theme_minimal() +
  labs(title = "Risk-Return Optimization Frontier",
       subtitle = "Highlighted funds represent the mathematically optimal subset",
       x = "Calculated Risk",
       y = "Annualized Return")

## Error : The fig.showtext code chunk option must be TRUE

Interpretation: An optimal subset of funds was identified based on efficiency ratios, representing the best trade-offs between risk and return. These funds lie closer to the efficient frontier and are considered superior investment choices.

5.5 How sensitive is fund ranking to changes in weighting schemes.

# Ensure we are using the dataframe, not just one column
mf_sensitivity <- mf_optimized |> 
  mutate(
    Rank_Balanced     = rank(-(Norm_Return * 0.4 + Norm_Risk * 0.4 + Norm_Score * 0.2)),
    Rank_Aggressive   = rank(-(Norm_Return * 0.7 + Norm_Risk * 0.15 + Norm_Score * 0.15)),
    Rank_Conservative = rank(-(Norm_Return * 0.2 + Norm_Risk * 0.6 + Norm_Score * 0.2)) 
  )

# Check the results
head(mf_sensitivity |> select(Name, starts_with("Rank_")))

## # A tibble: 6 × 4
##   Name                           Rank_Balanced Rank_Aggressive Rank_Conservative
##   <chr>                                  <dbl>           <dbl>             <dbl>
## 1 HDFC Liquid Fund(M-IDCW)                166.            152.              164.
## 2 HDFC Liquid Fund(DD-IDCW)                 1               1                 3 
## 3 HDFC Liquid Fund(W-IDCW)                166.            152.              164.
## 4 HDFC Liquid Fund                         66              64                67 
## 5 HDFC Liquid Fund(M-IDCW Reinv)          166.            152.              164.
## 6 HDFC Liquid Fund(W-IDCW)                166.            152.              164.

Interpretation: Fund rankings change when weights assigned to return, risk, and cost vary. This indicates that rankings are subjective and depend on investor preferences, emphasizing the need for personalized investment strategies.

Phase 6: Predictive Modeling & Fund Ranking.

6.1 Predicting Future returns using historical data.

# creating a model
returns_model <- lm(CAGR_3Y ~ CAGR_5Y + Volatility + Expense_Ratio, data = mf_data)

# generating predictions adding column for predicted data
mf_data <- mf_data |> mutate(Predicted_Returns = predict(returns_model, newdata = mf_data))

mf_data |> select(Name, CAGR_3Y, Predicted_Returns) |> head(5)

## # A tibble: 5 × 3
##   Name                           CAGR_3Y Predicted_Returns
##   <chr>                            <dbl>             <dbl>
## 1 HDFC Liquid Fund(M-IDCW)         -0.03             0.139
## 2 HDFC Liquid Fund(DD-IDCW)         7.58             7.69 
## 3 HDFC Liquid Fund(W-IDCW)         -0.03             0.103
## 4 HDFC Liquid Fund                  6.94             7.27 
## 5 HDFC Liquid Fund(M-IDCW Reinv)   -0.03             0.139

Interpretation: A regression model was used to predict returns based on historical variables. While it provides useful insights, predictions are limited due to market uncertainty and the dynamic nature of financial data.

6.2 How well does a multiple linear regression model explain return variability.

model_summary <- summary(returns_model)

# Extracting the fit metrics
cat("Percentage of variability explained:", round(model_summary$r.squared * 100, 2), "%\n")

## Percentage of variability explained: 96.46 %

cat("Adjusted R-squared:", round(model_summary$adj.r.squared * 100, 2), "%\n")

## Adjusted R-squared: 96.4 %

cat("Average Prediction Error:", round(model_summary$sigma, 2), "%\n")

## Average Prediction Error: 0.68 %

Interpretation: R^2 and adjusted R^2 indicate the proportion of variance explained by the model. Moderate values suggest that while the model captures some relationships, a significant portion of variability remains unexplained.

6.3 Can polynomial regression capture nonlinear relationships between risk and return.

poly_model <- lm(CAGR_3Y ~ poly(Volatility, 2) + Expense_Ratio, data = mf_data)
summary(poly_model)

## 
## Call:
## lm(formula = CAGR_3Y ~ poly(Volatility, 2) + Expense_Ratio, data = mf_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.690 -3.071 -1.959  4.047  5.439 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            2.6732     1.1295   2.367   0.0191 *
## poly(Volatility, 2)1  -7.8766     3.7077  -2.124   0.0351 *
## poly(Volatility, 2)2   2.7587     4.3459   0.635   0.5264  
## Expense_Ratio          0.7851     3.7701   0.208   0.8353  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.536 on 166 degrees of freedom
## Multiple R-squared:  0.0299, Adjusted R-squared:  0.01236 
## F-statistic: 1.705 on 3 and 166 DF,  p-value: 0.1679

Interpretation: Polynomial regression captures nonlinear relationships between risk and return more effectively than linear models. This reflects real-world scenarios where returns may increase at a decreasing rate with higher risk.

6.4 How accurate are predictive models (R², RMSE, residual analysis).

# R-squared
summary(returns_model)$adj.r.squared

## [1] 0.9639697

# RMSE
sqrt(mean(residuals(returns_model)^2))

## [1] 0.667384

# Residual Plot
plot(returns_model, which = 1)

## Error : The fig.showtext code chunk option must be TRUE

Interpretation: RMSE and residual analysis show prediction errors and model fit. Larger errors indicate limited predictive accuracy, highlighting the challenges of forecasting financial returns accurately.

6.5 How can a composite scoring model rank funds effectively.

mf_ranked <- mf_data |>
  mutate(
    # Normalize Return (Higher is Better)
    ret_score = (CAGR_3Y - min(CAGR_3Y)) / (max(CAGR_3Y) - min(CAGR_3Y)),
    
    # Normalize Risk (Lower Volatility is Better)
    risk_score = 1 - (Volatility - min(Volatility)) / (max(Volatility) - min(Volatility)),
    
    # Normalize Cost (Lower Expense is Better)
    cost_score = 1 - (Expense_Ratio - min(Expense_Ratio)) / (max(Expense_Ratio) - min(Expense_Ratio))
  ) |>
  mutate(
    # Final Weighted Score
    Final_Composite_Score = (ret_score * 0.5) + (risk_score * 0.3) + (cost_score * 0.2)
  ) |>
  arrange(desc(Final_Composite_Score))

Interpretation: A composite scoring model combining return, risk, and cost provides an overall ranking of funds. This method simplifies decision-making by integrating multiple performance indicators into a single score.

6.6 Which model linear or nonlinear provides better predictive performance.

# Model 1: Simple Linear Regression
linear_model <- lm(CAGR_3Y ~ Volatility, data = mf_data)

# Model 2: Nonlinear (Polynomial Degree 2) Regression
nonlinear_model <- lm(CAGR_3Y ~ poly(Volatility, 2), data = mf_data)

# 2. Compare them using ANOVA
anova_results <- anova(linear_model, nonlinear_model)
print(anova_results)

## Analysis of Variance Table
## 
## Model 1: CAGR_3Y ~ Volatility
## Model 2: CAGR_3Y ~ poly(Volatility, 2)
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    168 2081.1                           
## 2    167 2076.1  1    4.9844 0.4009 0.5275

# 3. Compare Accuracy Metrics
metrics <- data.frame(
  Model = c("Linear", "Nonlinear"),
  R_Squared = c(summary(linear_model)$adj.r.squared, summary(nonlinear_model)$adj.r.squared),
  RMSE = c(sqrt(mean(linear_model$residuals^2)), sqrt(mean(nonlinear_model$residuals^2)))
)
print(metrics)

##       Model  R_Squared     RMSE
## 1    Linear 0.02152310 3.498787
## 2 Nonlinear 0.01802156 3.494594

ggplot(mf_data, aes(x = Volatility, y = CAGR_3Y)) +
  geom_point(alpha = 0.4, color = "gray") +
  geom_smooth(method = "lm", formula = y ~ x, color = "#EA80FC", se = FALSE, linetype = "dashed") + # Linear
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "#00DAC6", se = FALSE) + # Nonlinear
  labs(title = "Linear vs. Nonlinear Fit",
       subtitle = "Nonlinear captures the diminishing returns of high risk",
       x = "Volatility", y = "CAGR 3Y") +
  theme_minimal()

## Error : The fig.showtext code chunk option must be TRUE

Interpretation: Nonlinear models slightly outperform linear models in capturing complex relationships. However, the improvement is limited, suggesting that increasing model complexity does not always significantly enhance predictive performance.

Quantitative Risk – Return Optimization and ranking of Mutual funds

Aditya Prajapati

2026-05-04