library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(vcd)
## Loading required package: grid
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(corrplot)
library(coefplot)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.4.4
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
Functions
# Count the outliers in a numeric vector using the 1.5 * IQR rule.
#
# A value is an outlier when it lies below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR. Missing values already present in `inp` are never
# counted as outliers.
#
# inp:   numeric vector to inspect.
# na.rm: forwarded to quantile() and IQR(); when FALSE and `inp` contains
#        NA, quantile() raises an error.
# Returns the number of outlying values.
detect_outliers <- function(inp, na.rm = TRUE) {
  i.qnt <- quantile(inp, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  i.max <- 1.5 * IQR(inp, na.rm = na.rm)
  is_outlier <- inp < (i.qnt[1] - i.max) | inp > (i.qnt[2] + i.max)
  # Comparisons against NA yield NA; drop them so that pre-existing missing
  # values are not reported as outliers.
  sum(is_outlier, na.rm = TRUE)
}
# Replace outliers in a vector with NA using the 1.5 * IQR rule.
#
# x:     numeric vector.
# na.rm: forwarded to quantile() and IQR().
# ...:   further arguments forwarded to quantile() only.
# Returns `x` with every value below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR set to NA; all other elements are left unchanged.
Non_outliers <- function(x, na.rm = TRUE, ...) {
  quartiles <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
  fence_width <- 1.5 * IQR(x, na.rm = na.rm)
  lower_fence <- quartiles[1] - fence_width
  upper_fence <- quartiles[2] + fence_width
  # which() skips NA comparisons, so already-missing elements stay NA.
  replace(x, which(x < lower_fence | x > upper_fence), NA)
}
# Return `z` with its outliers replaced by NA.
#
# Delegates to Non_outliers() (1.5 * IQR rule). The result has the same
# length as `z`; element names, if any, are dropped.
#
# z:     numeric vector.
# na.rm: forwarded to Non_outliers(); previously this argument was accepted
#        but silently ignored.
Remove_Outliers <- function(z, na.rm = TRUE) {
  # Looking each value up in the cleaned vector (the former data.frame +
  # match() round-trip) yields exactly the cleaned vector without names,
  # so return that directly.
  unname(Non_outliers(z, na.rm = na.rm))
}
# Draw a single box plot of `input`, titled "Outliers".
#
# input: vector of values to plot.
# na.rm: forwarded to geom_boxplot(); when TRUE, missing values are dropped
#        without a warning.
# Returns the ggplot object.
Graph_Boxplot <- function(input, na.rm = TRUE) {
  # Build the plotting data from `input` itself rather than relying on a
  # global data frame having been created (with a matching row count) first.
  plot_data <- data.frame(input = input)
  ggplot(plot_data, aes(x = "", y = input)) +
    geom_boxplot(aes(fill = input), color = "green", na.rm = na.rm) +
    labs(title = "Outliers")
}
Dataset
# Load the regression data set, keeping text columns as character vectors.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
dfrModel <- read.csv("D:/Welingkar/Trim 6/Data/Regression_data.csv",
                     header = TRUE, stringsAsFactors = FALSE)
# Number of observations read.
intRowCount <- nrow(dfrModel)
# Preview the first rows.
head(dfrModel)
## X1.Year.Return Investment.Style Market_Cap Market.Cap Turnover Fund.Size
## 1 12.90 1 3 66337.65 62 3
## 2 14.35 1 3 66337.65 62 3
## 3 16.39 2 3 50546.68 24 3
## 4 14.86 2 3 50546.68 24 3
## 5 11.32 1 3 63907.70 49 3
## 6 12.67 1 3 63907.70 49 3
## Net.Assets..Cr. Standard.Deviation Sharpe.Ratio Sortino.Ratio Beta Alpha
## 1 5819.08 15.51 0.62 1.00 0.97 5.00
## 2 5819.08 15.52 0.69 1.12 0.97 6.19
## 3 1453.04 19.36 0.74 1.11 0.89 8.70
## 4 1453.04 19.35 0.69 1.04 0.89 7.75
## 5 8602.25 14.30 0.71 1.09 0.94 5.69
## 6 8602.25 14.32 0.78 1.20 0.94 6.70
## R.Squared Expense.Ratio Tenure.1 Tenure.2 Tenure3
## 1 0.73 2.30 6.4 0.0 0
## 2 0.73 1.00 5.2 0.0 0
## 3 0.79 1.15 4.3 2.6 0
## 4 0.78 2.45 4.3 2.6 0
## 5 0.81 2.23 5.5 0.0 0
## 6 0.81 0.99 5.2 0.0 0
Data Mining
# Treat the market-cap code as a categorical variable and give its levels
# readable labels (assigned in the sorted order of the original codes).
dfrModel[["Market_Cap"]] <- as.factor(dfrModel[["Market_Cap"]])
levels(dfrModel[["Market_Cap"]]) <- c("Small Cap", "Mid Cap", "Large Cap")
# Per-category count, mean and standard deviation of the one-year return.
dfrModel %>%
  group_by(Market_Cap) %>%
  summarise(
    count = n(),
    mean = mean(X1.Year.Return, na.rm = TRUE),
    sd = sd(X1.Year.Return, na.rm = TRUE)
  )
## # A tibble: 3 x 4
## Market_Cap count mean sd
## <fctr> <int> <dbl> <dbl>
## 1 Small Cap 12 25.13000 9.767865
## 2 Mid Cap 31 21.81194 6.785418
## 3 Large Cap 121 15.99413 4.027746
Data Visualization
# Box plot of one-year return per market-cap category (ggpubr).
ggboxplot(dfrModel, x = "Market_Cap", y = "X1.Year.Return",
          color = "Market_Cap", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("Small Cap", "Mid Cap", "Large Cap"),
          ylab = "Return", xlab = "Market Cap")
# Group means with standard-error bars and jittered observations (ggpubr).
ggline(dfrModel, x = "Market_Cap", y = "X1.Year.Return",
       add = c("mean_se", "jitter"),
       order = c("Small Cap", "Mid Cap", "Large Cap"),
       ylab = "Return", xlab = "Market Cap")
# Base-graphics box plot of the same comparison.
boxplot(X1.Year.Return ~ Market_Cap, data = dfrModel,
        xlab = "Market Cap", ylab = "Return",
        frame = FALSE, col = c("#00AFBB", "#E7B800", "#FC4E07"))
# plotmeans
# plotmeans() forwards its extra arguments to low-level drawing calls that
# reject `frame` ("frame" is not a graphical parameter), so the box is
# suppressed with the graphical parameter `bty = "n"` instead.
plotmeans(X1.Year.Return ~ Market_Cap, data = dfrModel, bty = "n",
          xlab = "Market Cap", ylab = "Return",
          main = "Mean Plot with 95% CI")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is
## not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
One Way Anova
Null Hypothesis
There is no significant difference between the average return of Large, Mid & Small Cap Mutual Funds.
Alternative Hypothesis:
There is a significant difference between the average returns of Large, Mid & Small Cap Mutual Funds.
# Fit a one-way ANOVA of one-year return on market-cap category.
res.aov <- aov(
  formula = X1.Year.Return ~ Market_Cap,
  data = dfrModel
)
# Print the ANOVA table (degrees of freedom, sums of squares, F and p-value).
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Market_Cap 2 1538 769.0 28.28 2.97e-11 ***
## Residuals 161 4378 27.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Observations
1. As the p-value is less than the significance level of 0.05, we can conclude that there are significant differences between the groups, as highlighted with "***" in the model summary.
So the null hypothesis is rejected.
In one-way ANOVA test, a significant p-value indicates that some of the group means are different, but we don’t know which pairs of groups are different.
It’s possible to perform multiple pairwise comparisons to determine whether the mean differences between specific pairs of groups are statistically significant.
Tukey multiple pairwise-comparisons
# Tukey honest-significant-difference test on the fitted ANOVA model, at
# the default 95% family-wise confidence level.
TukeyHSD(res.aov, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = X1.Year.Return ~ Market_Cap, data = dfrModel)
##
## $Market_Cap
## diff lwr upr p adj
## Mid Cap-Small Cap -3.318065 -7.511818 0.8756894 0.1502703
## Large Cap-Small Cap -9.135868 -12.869080 -5.4026554 0.0000001
## Large Cap-Mid Cap -5.817803 -8.300870 -3.3347362 0.0000004
Observations
diff: difference between means of the two groups
lwr, upr: the lower and the upper end point of the confidence interval at 95% (default)
p adj: p-value after adjustment for the multiple comparisons.
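It can be seen from the output that only the differences between Large Cap & Small Cap (adjusted p-value of 0.0000001) and between Large Cap & Mid Cap (adjusted p-value of 0.0000004) are significant; the difference between Mid Cap & Small Cap is not significant (adjusted p-value of 0.15).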
Note: Net assets show total assets of a fund net of liabilities and expenses. It is calculated as market value of all investments in the fund less liabilities and expenses.
Market capitalisation of a fund gives the weighted market cap of the fund. That is, in what type of stocks – large, mid or small – it has invested.