library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra) 
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(vcd)
## Loading required package: grid
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(corrplot)
library(coefplot)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.4.4
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess

Functions

#' Count the outliers in a numeric vector using the 1.5 * IQR rule.
#'
#' A value is an outlier when it lies below Q1 - 1.5 * IQR or above
#' Q3 + 1.5 * IQR. Values that are already missing in `inp` are never
#' counted as outliers.
#'
#' @param inp A numeric vector.
#' @param na.rm Passed to `quantile()` and `IQR()` when computing the fences.
#' @return An integer: the number of elements of `inp` outside the fences.
detect_outliers <- function(inp, na.rm = TRUE) {
  i.qnt <- quantile(inp, probs = c(.25, .75), na.rm = na.rm)
  i.max <- 1.5 * IQR(inp, na.rm = na.rm)
  lower <- i.qnt[1] - i.max
  upper <- i.qnt[2] + i.max
  # Count fence violations directly. Comparisons against a missing value are
  # NA, and na.rm = TRUE drops them, so pre-existing NAs are not reported as
  # outliers.
  sum(inp < lower | inp > upper, na.rm = TRUE)
}

#' Blank out the outliers of a numeric vector.
#'
#' Values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are replaced by NA;
#' every other element is returned unchanged and in its original position.
#'
#' @param x A numeric vector.
#' @param na.rm Passed to `quantile()` and `IQR()`.
#' @param ... Further arguments forwarded to `quantile()`.
#' @return A vector the same length as `x` with outliers set to NA.
Non_outliers <- function(x, na.rm = TRUE, ...) {
  quartiles <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
  fence <- 1.5 * IQR(x, na.rm = na.rm)
  lower <- quartiles[1] - fence
  upper <- quartiles[2] + fence
  # which() discards NA comparisons, so elements that were already missing
  # are simply left as they are.
  x[which(x < lower | x > upper)] <- NA
  x
}

#' Replace the outliers of a numeric vector with NA.
#'
#' Outlier detection is delegated to `Non_outliers()`; each original value is
#' then looked up among the retained values, so outliers come back as NA.
#'
#' @param z A numeric vector.
#' @param na.rm Forwarded to `Non_outliers()`.
#' @return A vector the same length as `z` with outliers set to NA.
Remove_Outliers <- function(z, na.rm = TRUE) {
  # Forward na.rm so the caller's choice is honoured instead of being ignored.
  Out <- Non_outliers(z, na.rm = na.rm)
  Out <- as.data.frame(Out)
  # match() locates each original value among the retained ones; a value that
  # was blanked has no match and therefore maps to NA.
  Out$Out[match(z, Out$Out)]
}

#' Draw a single boxplot of a numeric vector, titled "Outliers".
#'
#' The plot is built from `input` alone, so the function does not depend on
#' any data frame existing in the calling environment.
#'
#' @param input A numeric vector to plot on the y axis.
#' @param na.rm Forwarded to `geom_boxplot()`; when `TRUE`, missing values are
#'   dropped without a warning.
#' @return A ggplot object.
Graph_Boxplot <- function(input, na.rm = TRUE) {
  plot_data <- data.frame(input = input)
  ggplot(plot_data, aes(x = "", y = input)) +
    geom_boxplot(aes(fill = input), color = "green", na.rm = na.rm) +
    labs(title = "Outliers")
}

Dataset

# Load the regression dataset, keeping text columns as character vectors.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
dfrModel <- read.csv(
  "D:/Welingkar/Trim 6/Data/Regression_data.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
intRowCount <- nrow(dfrModel)
# Preview the first rows.
head(dfrModel)
##   X1.Year.Return Investment.Style Market_Cap Market.Cap Turnover Fund.Size
## 1          12.90                1          3   66337.65       62         3
## 2          14.35                1          3   66337.65       62         3
## 3          16.39                2          3   50546.68       24         3
## 4          14.86                2          3   50546.68       24         3
## 5          11.32                1          3   63907.70       49         3
## 6          12.67                1          3   63907.70       49         3
##   Net.Assets..Cr. Standard.Deviation Sharpe.Ratio Sortino.Ratio Beta Alpha
## 1         5819.08              15.51         0.62          1.00 0.97  5.00
## 2         5819.08              15.52         0.69          1.12 0.97  6.19
## 3         1453.04              19.36         0.74          1.11 0.89  8.70
## 4         1453.04              19.35         0.69          1.04 0.89  7.75
## 5         8602.25              14.30         0.71          1.09 0.94  5.69
## 6         8602.25              14.32         0.78          1.20 0.94  6.70
##   R.Squared Expense.Ratio Tenure.1 Tenure.2 Tenure3
## 1      0.73          2.30      6.4      0.0       0
## 2      0.73          1.00      5.2      0.0       0
## 3      0.79          1.15      4.3      2.6       0
## 4      0.78          2.45      4.3      2.6       0
## 5      0.81          2.23      5.5      0.0       0
## 6      0.81          0.99      5.2      0.0       0

Data Mining

# Treat the market-cap code as a categorical variable with readable labels.
# NOTE(review): the labels are assigned positionally to the sorted codes;
# confirm the codes run small -> mid -> large.
dfrModel$Market_Cap <- as.factor(dfrModel$Market_Cap)
levels(dfrModel$Market_Cap) <- c("Small Cap", "Mid Cap", "Large Cap")

# Per-category sample size, mean and standard deviation of the 1-year return.
dfrModel %>%
  group_by(Market_Cap) %>%
  summarise(
    count = n(),
    mean = mean(X1.Year.Return, na.rm = TRUE),
    sd = sd(X1.Year.Return, na.rm = TRUE)
  )
## # A tibble: 3 x 4
##   Market_Cap count     mean       sd
##       <fctr> <int>    <dbl>    <dbl>
## 1  Small Cap    12 25.13000 9.767865
## 2    Mid Cap    31 21.81194 6.785418
## 3  Large Cap   121 15.99413 4.027746

Data Visualization

# Box plot of the 1-year return per market-cap category (ggpubr).
ggboxplot(
  dfrModel,
  x = "Market_Cap", y = "X1.Year.Return",
  xlab = "Market Cap", ylab = "Return",
  order = c("Small Cap", "Mid Cap", "Large Cap"),
  color = "Market_Cap",
  palette = c("#00AFBB", "#E7B800", "#FC4E07")
)

# Line plot of the category means with standard-error bars and jittered points.
ggline(
  dfrModel,
  x = "Market_Cap", y = "X1.Year.Return",
  xlab = "Market Cap", ylab = "Return",
  order = c("Small Cap", "Mid Cap", "Large Cap"),
  add = c("mean_se", "jitter")
)

# Base-graphics box plot of the same comparison.
boxplot(
  X1.Year.Return ~ Market_Cap,
  data = dfrModel,
  col = c("#00AFBB", "#E7B800", "#FC4E07"),
  frame = FALSE,
  xlab = "Market Cap", ylab = "Return"
)

# plotmeans
plotmeans(
  X1.Year.Return ~ Market_Cap,
  data = dfrModel,
  main = "Mean Plot with 95% CI",
  xlab = "Market Cap", ylab = "Return",
  frame = FALSE
)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

One Way Anova
Null Hypothesis:
There is no significant difference among the average returns of Large, Mid & Small Cap Mutual Funds.

Alternative Hypothesis:
There is a significant difference among the average returns of Large, Mid & Small Cap Mutual Funds.

# One-way ANOVA: does the mean 1-year return differ across market-cap groups?
res.aov <- aov(formula = X1.Year.Return ~ Market_Cap, data = dfrModel)
# Print the ANOVA table (degrees of freedom, sums of squares, F and p-value).
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## Market_Cap    2   1538   769.0   28.28 2.97e-11 ***
## Residuals   161   4378    27.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Observations
1. As the p-value (2.97e-11) is less than the significance level of 0.05, we can conclude that there are significant differences between the groups, as flagged by "***" in the model summary.
So the null hypothesis is rejected.

  1. In one-way ANOVA test, a significant p-value indicates that some of the group means are different, but we don’t know which pairs of groups are different.

  2. It’s possible to perform multiple pairwise comparisons to determine whether the mean difference between specific pairs of groups is statistically significant.

Tukey multiple pairwise-comparisons

# Tukey honest-significant-difference test on every pair of market-cap groups,
# at the default 95% family-wise confidence level.
TukeyHSD(res.aov, conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = X1.Year.Return ~ Market_Cap, data = dfrModel)
## 
## $Market_Cap
##                          diff        lwr        upr     p adj
## Mid Cap-Small Cap   -3.318065  -7.511818  0.8756894 0.1502703
## Large Cap-Small Cap -9.135868 -12.869080 -5.4026554 0.0000001
## Large Cap-Mid Cap   -5.817803  -8.300870 -3.3347362 0.0000004

Observations
diff: difference between means of the two groups
lwr, upr: the lower and the upper end point of the confidence interval at 95% (default)
p adj: p-value after adjustment for the multiple comparisons.

It can be seen from the output that only the differences between Large Cap & Small Cap (adjusted p-value 0.0000001) and between Large Cap & Mid Cap (adjusted p-value 0.0000004) are significant; the difference between Mid Cap & Small Cap (adjusted p-value 0.15) is not.

Note: Net assets show total assets of a fund net of liabilities and expenses. It is calculated as market value of all investments in the fund less liabilities and expenses.

Market capitalisation of a fund gives the weighted market cap of the fund. That is, in what type of stocks – large, mid or small – it has invested.