\(\Large \sf To \ clean \ the \ environment \ variables\)
# Remove every object (including dot-prefixed names) from the current
# environment. `all.names` is spelled out instead of relying on partial
# argument matching, and TRUE is used because `T` can be reassigned.
rm(list = ls(all.names = TRUE))
\(\Large \sf Defining \ a \ Vector\)
# Sample data used by the descriptive-statistics examples.
vec <- c(
  20, 34, 56, 23, 45, 22,
  60, 23, 56, 78, 23, 45
)
\(\Large \sf Mean \ of \ the \ vector\)
\(\frac{\Large\sum_{\Large i=1}^{\Large n}{\Large x_i}}{\Large n}\)
mean(vec)
## [1] 40.41667
\(\Large \sf Median\ of \ the \ vector\)
\(\Large \frac{(n+1)}{2} \normalsize \sf \ ranked \ value\)
median(vec)
## [1] 39.5
\(\Large \sf Harmonic\ Mean\ of\ a\ vector\)
\(\Large\frac {\Large n}{\Large \sum_{\Large i=1}^{\Large n}\Large \frac{\Large 1}{\Large x_i}}\)
# Harmonic mean of a numeric vector: n / sum(1 / x_i).
#
# vcetor: numeric vector of values.
# Returns a single numeric value.
hm_ <- function(vcetor) {
  # Operate on the argument rather than the global `vec`, so the result
  # depends on what the caller actually passes in.
  length(vcetor) / sum(1 / vcetor)
}
hm_(vec)
## [1] 32.88149
\(\Large \sf Geometric\ Mean\ of\ a\ vector\)
\(\large \bar X_g = (X_1 \times X_2 \times X_3 \times \cdots \times X_n)^{\frac{1}{n}}\)
# Geometric mean of a numeric vector: the n-th root of the product of
# its n elements.
#
# vcetor: numeric vector of values.
# Returns a single numeric value.
gm_ <- function(vcetor) {
  n_values <- length(vcetor)
  product <- prod(vcetor)
  product^(1 / n_values)
}
gm_(vec)
## [1] 36.39962
\(\sf \huge Variation\ and\ shape\)
\(\sf \Large Range\ of\ a\ vector\)
\(\sf \normalsize Range = (\large x_{max}-\large x_{min})\)
# Range of a numeric vector: largest element minus smallest element.
#
# vcetor: numeric vector of values.
# Returns a single numeric value.
range_ <- function(vcetor) {
  largest <- max(vcetor)
  smallest <- min(vcetor)
  largest - smallest
}
range_(vec)
## [1] 58
\(\sf \Large Variance\ of\ a\ vector\)
\(\Large \sigma^2 \small =\Large \frac{\sum_{i=1}^{n} (x_i-\bar X)^2}{n}\)
# Population variance of a numeric vector: the sum of squared deviations
# from the mean, divided by the number of elements n.
#
# vcetor: numeric vector of values.
# Returns a single numeric value.
variance_ <- function(vcetor) {
  deviations <- vcetor - mean(vcetor)
  # Divide the whole sum by n. `/` binds tighter than `-`, so writing
  # `total / n - 1` would subtract 1 from the quotient instead.
  sum(deviations^2) / length(vcetor)
}
variance_(vec)
## [1] 334.9097
\(\sf \Large Standard\ deviation\ of\ a\ vector\)
\(\Large \sigma \small =\sqrt{\Large \frac{\sum_{i=1}^{n} (x_i-\bar X)^2}{n}}\)
# Standard deviation of a numeric vector: the square root of whatever
# variance_() returns for the same input.
#
# vcetor: numeric vector of values.
# Returns a single numeric value.
stddev_ <- function(vcetor) {
  spread_squared <- variance_(vcetor)
  spread_squared^(1 / 2)
}
stddev_(vec)
## [1] 18.30054
\(\sf \Large Coefficient\ of\ Variation\)
It measures the scatter in the data with respect to the mean
\(\Large (\frac {\sigma}{\mu})*100\)
# Coefficient of variation, expressed as a percentage:
# (standard deviation / mean) * 100.
#
# vcetor: numeric vector of values.
# Returns a single numeric value.
CoeffVar_ <- function(vcetor) {
  # The scaling must be part of the returned expression: `return(x) * 100`
  # leaves the function with x before the multiplication is evaluated.
  stddev_(vcetor) / mean(vcetor) * 100
}
CoeffVar_(vec)
## [1] 0.4527968
\(\sf \Large Z-Score\)
\(\Large Z = \Large(\frac{x-\mu}{\sigma})\)
# Z-score of every element: (x - mean) / standard deviation.
#
# vcetor: numeric vector of values.
# Returns a numeric vector the same length as `vcetor`.
zscore <- function(vcetor) {
  # Compute the centre and spread once; the vectorised arithmetic below
  # applies them to every element and returns an empty vector for empty
  # input (an index loop over 1:length(x) would visit index 1 and 0).
  centre <- mean(vcetor)
  spread <- stddev_(vcetor)
  (vcetor - centre) / spread
}
zscore(vec)
## [1] -1.1156320 -0.3506272 0.8515232 -0.9517024 0.2504480 -1.0063456
## [7] 1.0700960 -0.9517024 0.8515232 2.0536736 -0.9517024 0.2504480
\(\Large Skewness\)
\(\Large Kurtosis\)
\(\Large Quartiles\)
\(\normalsize \sf Quartiles\ split\ the\ data\ into\ 4\ equal\ parts\)
\(\normalsize Q1 = \frac {n+1}{4} \ \ \ \ \ \ \ \ Q2 = 2(\frac {n+1}{4})\)
\(\normalsize Q3 = 3(\frac {n+1}{4}) \ \ \ \ Q4 = 4(\frac {n+1}{4})\)
\(\Large Percentile\)
\(\normalsize \sf \text{Percentile divides the data into 100 equal parts}\)
\(\Large \sf \text{The Interquartile Range (IQR)}\)
\(\sf \normalsize \text{The interquartile range is the difference between third quartile }\textbf{(Q3) }\text{and first quartile }\textbf{(Q1)}\)
\(\sf \ \ \ \ \ \ \ \ \ \ \ \textbf{IQR} = \textbf{Q3 - Q1}\)
\(\Large \sf \text{The Empirical rule}\)
\(\sf \normalsize \text{The Empirical rule states that :}\)
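\(\ \ \ \ \ \sf \normalsize \text{In a Normal distribution,}\)
\(\ \ \ \ \ \sf \normalsize \text{approximately 68\% of the values lie within 1 standard deviation of the mean,}\)
\(\ \ \ \ \ \sf \normalsize \text{approximately 95\% of the values lie within 2 standard deviations of the mean,}\)
\(\ \ \ \ \ \sf \normalsize \text{approximately 99.7\% of the values lie within 3 standard deviations of the mean.}\)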
\(\sf \Large \text{Chebyshev's theorem}\)
\(\sf \normalsize \text{For heavily skewed datasets that do not appear to be normally distributed, you should use Chebyshev's theorem:}\)
\(\sf \normalsize \text{Regardless of the shape, the percentage of values that are found within distances of k standard deviations from the mean must be at least}\)
\(\Large (1-\frac{1}{k^2}) * 100\)
\(\sf \LARGE \text{The Covariance and the Coefficient of Correlation}\)
\(\large \sf \text{The Covariance}\)
\(\sf \normalsize \text{It measures the strength of a linear relationship between two numerical variables}\)
\(Sample,\large \sf \ cov(X,Y) = \frac{\sum_{i=1}^{n}(X_i-\bar X)(Y_i-\bar Y)}{n-1}\)
\(Population,\large \sf \ cov(X,Y) = \frac{\sum_{i=1}^{n}(X_i-\bar X)(Y_i-\bar Y)}{n}\)
# Paired sample data for the covariance example.
vec1 <- c(
  23, 45, 34, 23, 34,
  56, 78, 65, 45, 34
)
vec2 <- c(
  65, 54, 34, 45, 23,
  45, 67, 88, 96, 33
)
# Covariance of two equally long numeric vectors.
#
# vcetor_x, vcetor_y: numeric vectors of paired observations.
# sample: TRUE divides the sum of cross-products by (n - 1) (sample
#   covariance); FALSE divides by n (population covariance).
# Returns a single numeric value, or the string
# 'Check your parameters !!!' when `sample` is neither TRUE nor FALSE.
# Stops with an error when the two vectors differ in length.
cov_ <- function(vcetor_x, vcetor_y, sample) {
  if (length(vcetor_x) != length(vcetor_y)) {
    stop("vcetor_x and vcetor_y must have the same length", call. = FALSE)
  }

  # Means are computed once; the cross-products are formed element-wise.
  cross_products <- (vcetor_x - mean(vcetor_x)) * (vcetor_y - mean(vcetor_y))
  numerator <- sum(cross_products)
  n_obs <- length(vcetor_x)

  # isTRUE() keeps NA / NULL / multi-element flags from crashing the `if`
  # and routes them to the parameter-check message instead.
  if (isTRUE(sample == TRUE)) {
    numerator / (n_obs - 1)
  } else if (isTRUE(sample == FALSE)) {
    numerator / n_obs
  } else {
    'Check your parameters !!!'
  }
}
cov_(vec1, vec2, TRUE) # sample covariance
## [1] 196.7778
cov_(vec1, vec2, FALSE) # population covariance
## [1] 177.1
\(\sf \large \text{The Coefficient of Correlation}\)
\(\sf \text{It measures the relative strength of a linear relationship between two numerical variables}\)
\(\large sample\ correlation,\ \ \bf r = \frac {Cov(X,Y)}{S_XS_Y}\)
# Paired sample data for the correlation example.
vec1 <- c(
  23, 45, 34, 23, 34,
  56, 78, 65, 45, 34
)
vec2 <- c(
  65, 54, 34, 45, 23,
  45, 67, 88, 96, 33
)
# Coefficient of correlation between two numeric vectors: the covariance
# divided by the product of the two standard deviations.
#
# vcetor_x, vcetor_y: numeric vectors of paired observations.
# Returns a single numeric value.
corr_ <- function(vcetor_x, vcetor_y) {
  # FALSE (not the reassignable `F`) selects the divide-by-n covariance.
  # NOTE(review): the ratio is a proper correlation only if stddev_() uses
  # the same divisor as this covariance -- confirm against stddev_().
  covariance <- cov_(vcetor_x, vcetor_y, FALSE)
  covariance / (stddev_(vcetor_x) * stddev_(vcetor_y))
}
corr_(vec1,vec2)
## [1] 0.4569768