Generate Data
getmode <- function(v) {
x<-v
lim.inf=min(x)-1; lim.sup=max(x)+1
# hist(x,freq=FALSE,breaks=seq(lim.inf,lim.sup,0.2))
s<-density(x,from=lim.inf,to=lim.sup,bw=0.2)
n<-length(s$y)
v1<-s$y[1:(n-2)];
v2<-s$y[2:(n-1)];
v3<-s$y[3:n]
ix<-1+which((v1<v2)&(v2>v3))
#lines(s$x,s$y,col="red")
#points(s$x[ix],s$y[ix],col="blue")
md <- s$x[which(s$y==max(s$y))]
md
}
createDataWithContamination_EXP <- function(sample_size , rateData , rateContamination ,contamination_prop ){
data <- rexp(sample_size , rate = rateData)
contaminations <- rexp(sample_size , rate =rateContamination)
contaminations <- sort(contaminations)[(sample_size-(sample_size*contamination_prop)):sample_size]
data_with_contamination <- c(data, contaminations)
list(
data = data,
contaminations = contaminations,
data_with_contamination = data_with_contamination
)
}
Data Generations Factors
- sample size (20,50,100,200)
- Distribution Parameters \(\lambda\) (0.5)
- Contamination Distribution Parameters (0.05)
- contamination (10% , 20% , 30%)
generateData <- function(sampleSize , rate,rate_contamination , contamination ){
results = NULL
#create 1000 random sample with contamination
for(i in 1:2000){
#generate data
data_with_contamination <- createDataWithContamination_EXP(sampleSize,rate,rate_contamination ,contamination)$data_with_contamination
#Code To Handling outlier
#Quantile based flooring and capping
#In this technique, the outlier is capped at a certain value above the upper percentile value or floored at a factor below the lower percentile value.
estimated_rate_Before_Handling <- eexp(data_with_contamination, ci=TRUE, conf = 0.95)$parameters[["rate"]]
q1 <- quantile(data_with_contamination , c(.25))[["25%"]]
q3 <- quantile(data_with_contamination , c(.75))[["75%"]]
IQR <- q3-q1
lower = q1 - 1.5*IQR
upper = q3 + 1.5*IQR
outliers <- boxplot(data_with_contamination, plot=FALSE)$out
outliersPos <- which(data_with_contamination %in% outliers)
dataAfterHandling_Q_b_F_C <- data_with_contamination
dataAfterHandling_mean <- data_with_contamination
dataAfterHandling_median <- data_with_contamination
dataAfterHandling_mode <- data_with_contamination
dataAfterHandling_Q_b_F_C[dataAfterHandling_Q_b_F_C<lower] <- round(lower)
dataAfterHandling_Q_b_F_C[dataAfterHandling_Q_b_F_C>upper] <- round(upper)
estimated_rate_After_Q_b_F_C <- eexp(dataAfterHandling_Q_b_F_C, ci=TRUE, conf = 0.95)$parameters[["rate"]]
mean = mean(dataAfterHandling_mean)
dataAfterHandling_mean[outliersPos] <- round(mean)
estimated_rate_After_mean <- eexp(dataAfterHandling_mean, ci=TRUE, conf = 0.95)$parameters[["rate"]]
median = median(dataAfterHandling_median)
dataAfterHandling_median[outliersPos] <- round(median)
estimated_rate_After_median <- eexp(dataAfterHandling_median, ci=TRUE, conf = 0.95)$parameters[["rate"]]
mode = getmode(dataAfterHandling_mode)
dataAfterHandling_mode[outliersPos] <- round(mode)
estimated_rate_After_mode <- eexp(dataAfterHandling_mode, ci=TRUE, conf = 0.95)$parameters[["rate"]]
results = rbind(
results,
data.frame(
i,
estimated_rate_Before_Handling,
estimated_rate_After_Q_b_F_C,
estimated_rate_After_mean,
estimated_rate_After_median,
estimated_rate_After_mode
))
}
results
}
# different sample size with rate = .5 and contamination=10%
data_20_5_10 <- generateData(20 , .3 ,0.05, .1)
data_50_5_10 <- generateData(50 , .3 , 0.05,.1)
data_100_5_10 <- generateData(100 , .3 , 0.05,0.1)
data_200_5_10 <- generateData(200 , .3 , 0.05,0.1)
# different sample size with rate = .5 and contamination=20%
data_20_5_20 <- generateData(20 ,.5 , 0.05,0.2)
data_50_5_20 <- generateData(50 ,.5 , 0.05,0.2)
Results
doCalculations <- function(data , sampleSize ,rate , contamination) {
data %>% summarize(
sampleSize = sampleSize ,
contamination = contamination,
rate = rate,
bias_rate_Before = abs((bias(estimated_rate_Before_Handling , rate))),
bias_rate_Q_b_F_C = abs((bias(estimated_rate_After_Q_b_F_C , rate))),
bias_rate_Mean = abs((bias(estimated_rate_After_mean ,rate))),
bias_rate_Median = abs((bias(estimated_rate_After_median , rate))),
bias_rate_Mode = abs((bias(estimated_rate_After_mode ,rate))) ,
MSE_rate_Before = MSE(estimated_rate_Before_Handling , rate),
MSE_rate_Q_b_F_C = MSE(estimated_rate_After_Q_b_F_C, rate),
MSE_rate_Mean = MSE(estimated_rate_After_mean, rate),
MSE_rate_Median = MSE(estimated_rate_After_median, rate),
MSE_rate_Mode = MSE(estimated_rate_After_mode, rate)
)
}
finalResult <- NULL
finalResult <- rbind(
finalResult ,
doCalculations(data_20_3_10 , 20,0.5,10),
doCalculations(data_50_3_10 , 50,0.5,10),
doCalculations(data_100_3_10 , 100,0.5,10),
doCalculations(data_200_3_10 , 200,0.5,10),
doCalculations(data_20_3_20 , 20,0.5,20),
doCalculations(data_50_3_20 , 50,0.5,20),
doCalculations(data_100_3_20 , 100,0.5,20),
doCalculations(data_200_3_20 , 200,0.5,20),
doCalculations(data_20_3_30 , 20,0.5,30),
doCalculations(data_50_3_30 , 50,0.5,30),
doCalculations(data_100_3_30 , 100,0.5,30),
doCalculations(data_200_3_30 , 200,0.5,30)
)
finalResult %>% select(sampleSize , contamination ,bias_rate_Before , bias_rate_Q_b_F_C , bias_rate_Mean , bias_rate_Median ,bias_rate_Mode)
finalResult %>% select(sampleSize , contamination ,MSE_rate_Before , MSE_rate_Q_b_F_C , MSE_rate_Mean, MSE_rate_Median , MSE_rate_Mode)
NA
NA
#Relation Between sample size and Biased_prop for each method
finalResult %>% filter(rate==0.5) %>% select( sampleSize , contamination, bias_rate_Q_b_F_C , bias_rate_Mean , bias_rate_Median ,bias_rate_Mode , bias_rate_Before) %>%
gather("Method" , "Biased_rate" , bias_rate_Q_b_F_C , bias_rate_Mean , bias_rate_Median ,bias_rate_Mode , bias_rate_Before ) %>%
ggplot(aes(x = (sampleSize) , y = Biased_rate)) +
geom_point( aes(colour = as.factor(contamination))) +
geom_line( aes(colour = as.factor(contamination))) +
facet_wrap(.~Method)

finalResult %>% filter(rate==0.5) %>% select( sampleSize , contamination,MSE_rate_Q_b_F_C , MSE_rate_Mean, MSE_rate_Median , MSE_rate_Mode , MSE_rate_Before) %>%
gather("Method" , "MSE_rate" , MSE_rate_Q_b_F_C , MSE_rate_Mean, MSE_rate_Median , MSE_rate_Mode ,MSE_rate_Before ) %>%
ggplot(aes(x = (sampleSize) , y = MSE_rate)) +
geom_point( aes(colour = as.factor(contamination))) +
geom_line( aes(colour = as.factor(contamination))) +
facet_wrap(.~Method)
Warning in gzfile(file, "wb") :
cannot open compressed file 'C:/Users/eyada/Desktop/master/University courses/Statistical inference for data science/OutlierHandlingMethods/.Rproj.user/shared/notebooks/B66824C9-Outlier Handling (Exponential Distribution)/1/CD6BAB30B80C719B/ch93dsphw4i6h_t/0aa43442d7bf49b7b24833350a841dae.snapshot', probable reason 'No such file or directory'
Error in gzfile(file, "wb") : cannot open the connection

Warning in gzfile(file, "wb") :
cannot open compressed file 'C:/Users/eyada/Desktop/master/University courses/Statistical inference for data science/OutlierHandlingMethods/.Rproj.user/shared/notebooks/B66824C9-Outlier Handling (Exponential Distribution)/1/CD6BAB30B80C719B/ch93dsphw4i6h_t/f31ee6ce127541ad9e03df0b7d488736.snapshot', probable reason 'No such file or directory'
Error in gzfile(file, "wb") : cannot open the connection
---
title: "Outlier Handling Methods (THE EXPONENTIAL distribution )"
output: html_notebook
---
 

```{r importLibrary , echo=FALSE , warning=FALSE , error=FALSE}
library(robust)
library(dplyr)
library(ggplot2)
library(MLmetrics)
library(SimDesign)
library(tidyr)
library(EnvStats)

```

# Introductiion 

THE EXPONENTIAL distribution is the most commonly used model in reliability & life-testing analysis

#Estimation

Let ${x} = (x_1, x_2, …, x_n)$ be a vector of n observations from an exponential distribution with parameter rate=$\lambda $.

The maximum likelihood estimator (mle) of $\lambda $ is given by:
$\hat{λ}_{mle} = \frac{1}{\bar{x}}$ where $\bar{x} = \frac{1}{n}∑^n_{i=1} x_i$


# Generate Data 

```{r FunctionUsedToGenerateDataWith_contamination ,echo=TRUE , warning=FALSE}

getmode <- function(v) {
  
    x<-v
    lim.inf=min(x)-1; lim.sup=max(x)+1

   # hist(x,freq=FALSE,breaks=seq(lim.inf,lim.sup,0.2))
    s<-density(x,from=lim.inf,to=lim.sup,bw=0.2)
    n<-length(s$y)
    v1<-s$y[1:(n-2)];
    v2<-s$y[2:(n-1)];
    v3<-s$y[3:n]
    ix<-1+which((v1<v2)&(v2>v3))
    
    #lines(s$x,s$y,col="red")
    #points(s$x[ix],s$y[ix],col="blue")
    
    md <- s$x[which(s$y==max(s$y))] 

    md
}

createDataWithContamination_EXP <- function(sample_size , rateData , rateContamination ,contamination_prop ){

data <- rexp(sample_size , rate = rateData) 
contaminations <- rexp(sample_size , rate =rateContamination)
contaminations <- sort(contaminations)[(sample_size-(sample_size*contamination_prop)):sample_size]
   
  data_with_contamination <- c(data, contaminations) 

   list(
     data = data,
     contaminations = contaminations,
     data_with_contamination = data_with_contamination
    )
   
}

```

### Data Generations Factors

1. sample size  (20,50,100,200) 
2. Distribution Parameters  $\lambda$ (0.5)
3. Contamination Distribution Parameters (0.05)
3. contamination (10% , 20% , 30%)



```{r generateDateFunction ,warning=FALSE}

generateData <- function(sampleSize , rate,rate_contamination , contamination ){
  
  
results = NULL
#create 1000 random sample with contamination
for(i in 1:2000){
  #generate data  
  data_with_contamination <-  createDataWithContamination_EXP(sampleSize,rate,rate_contamination ,contamination)$data_with_contamination
   
  #Code To Handling outlier 
  

  #Quantile based flooring and capping
  #In this technique, the outlier is capped at a certain value above the upper percentile value or floored   at a factor below the lower percentile value.
  
estimated_rate_Before_Handling <- eexp(data_with_contamination, ci=TRUE, conf = 0.95)$parameters[["rate"]]

  
q1 <- quantile(data_with_contamination  , c(.25))[["25%"]]
q3 <- quantile(data_with_contamination  , c(.75))[["75%"]]
IQR <- q3-q1
lower  = q1 - 1.5*IQR
upper = q3 + 1.5*IQR


outliers <- boxplot(data_with_contamination, plot=FALSE)$out
outliersPos <- which(data_with_contamination %in% outliers)
 
  
 
dataAfterHandling_Q_b_F_C <- data_with_contamination
dataAfterHandling_mean <- data_with_contamination
dataAfterHandling_median <- data_with_contamination
dataAfterHandling_mode <- data_with_contamination


dataAfterHandling_Q_b_F_C[dataAfterHandling_Q_b_F_C<lower] <- round(lower)
dataAfterHandling_Q_b_F_C[dataAfterHandling_Q_b_F_C>upper] <- round(upper)
estimated_rate_After_Q_b_F_C <- eexp(dataAfterHandling_Q_b_F_C, ci=TRUE, conf = 0.95)$parameters[["rate"]]


mean = mean(dataAfterHandling_mean)
dataAfterHandling_mean[outliersPos] <- round(mean)
estimated_rate_After_mean <-  eexp(dataAfterHandling_mean, ci=TRUE, conf = 0.95)$parameters[["rate"]]

 
median = median(dataAfterHandling_median)
dataAfterHandling_median[outliersPos] <- round(median)
estimated_rate_After_median <-  eexp(dataAfterHandling_median, ci=TRUE, conf = 0.95)$parameters[["rate"]]
 

mode = getmode(dataAfterHandling_mode)
dataAfterHandling_mode[outliersPos] <- round(mode) 
estimated_rate_After_mode <- eexp(dataAfterHandling_mode, ci=TRUE, conf = 0.95)$parameters[["rate"]]

 


 results = rbind(
   results,
   data.frame(
     i,
     estimated_rate_Before_Handling,
     estimated_rate_After_Q_b_F_C,
     estimated_rate_After_mean,
     estimated_rate_After_median,
     estimated_rate_After_mode 
     ))
}

results
}
```


```{r}
 

# different sample size with rate = .5  and contamination=10%
data_20_5_10 <- generateData(20 , .3 ,0.05, .1)
data_50_5_10 <- generateData(50 , .3 , 0.05,.1)
data_100_5_10 <- generateData(100 , .3 , 0.05,0.1)
data_200_5_10 <- generateData(200 , .3 , 0.05,0.1)

 
# different sample size  with rate = .5 and contamination=20%
data_20_5_20 <- generateData(20 ,.5 , 0.05,0.2)
data_50_5_20 <- generateData(50 ,.5 , 0.05,0.2)
data_100_5_20 <- generateData(100,.5 ,0.05,0.2)
data_200_5_20 <- generateData(200,.5 ,0.05,0.2)
  

 
# different sample size with rate = .5  and contamination=30%
data_20_5_30 <- generateData(20 , .5 ,0.05,0.3)
data_50_5_30 <- generateData(50 , .5 , 0.05,0.3)
data_100_5_30 <- generateData(100, .5 ,0.05,0.3)
data_200_5_30 <- generateData(200,.5 ,0.05,0.3)

 
 
```


# Results 

```{r}

doCalculations <- function(data , sampleSize ,rate , contamination) {
  
   data %>% summarize(
  sampleSize = sampleSize ,
  contamination = contamination,
  rate = rate,
 
 
  bias_rate_Before = abs((bias(estimated_rate_Before_Handling , rate))),
  bias_rate_Q_b_F_C = abs((bias(estimated_rate_After_Q_b_F_C , rate))),
  bias_rate_Mean = abs((bias(estimated_rate_After_mean ,rate))),
  bias_rate_Median = abs((bias(estimated_rate_After_median , rate))),
  bias_rate_Mode = abs((bias(estimated_rate_After_mode ,rate))) ,
  
  MSE_rate_Before = MSE(estimated_rate_Before_Handling , rate),
  MSE_rate_Q_b_F_C  = MSE(estimated_rate_After_Q_b_F_C, rate),
  MSE_rate_Mean  = MSE(estimated_rate_After_mean, rate),
  MSE_rate_Median   = MSE(estimated_rate_After_median, rate),
  MSE_rate_Mode  = MSE(estimated_rate_After_mode, rate)
) 
}



```

```{r call_CalculationsFunction  ,warning=FALSE }

finalResult <- NULL 

finalResult <- rbind(
  finalResult ,
                doCalculations(data_20_3_10 , 20,0.5,10),
                doCalculations(data_50_3_10 , 50,0.5,10),
                doCalculations(data_100_3_10 , 100,0.5,10),
                doCalculations(data_200_3_10 , 200,0.5,10),
  
                doCalculations(data_20_3_20 , 20,0.5,20),
                doCalculations(data_50_3_20 , 50,0.5,20),
                doCalculations(data_100_3_20 , 100,0.5,20),
                doCalculations(data_200_3_20 , 200,0.5,20),
     
                doCalculations(data_20_3_30 , 20,0.5,30),
                doCalculations(data_50_3_30 , 50,0.5,30),
                doCalculations(data_100_3_30 , 100,0.5,30),
                doCalculations(data_200_3_30 , 200,0.5,30)
  
   
   
                     )





```



```{r  warning=FALSE}

finalResult %>% select(sampleSize , contamination ,bias_rate_Before , bias_rate_Q_b_F_C , bias_rate_Mean , bias_rate_Median ,bias_rate_Mode)



finalResult %>% select(sampleSize , contamination ,MSE_rate_Before  , MSE_rate_Q_b_F_C  , MSE_rate_Mean, MSE_rate_Median , MSE_rate_Mode)
 

```

```{r ResultVisualization03}

#Relation Between sample size and Biased_prop for each method

finalResult %>% filter(rate==0.5) %>% select( sampleSize , contamination, bias_rate_Q_b_F_C , bias_rate_Mean , bias_rate_Median ,bias_rate_Mode , bias_rate_Before) %>% 
  gather("Method" , "Biased_rate" , bias_rate_Q_b_F_C , bias_rate_Mean , bias_rate_Median ,bias_rate_Mode , bias_rate_Before ) %>% 
  ggplot(aes(x = (sampleSize) , y = Biased_rate)) + 
  geom_point( aes(colour = as.factor(contamination))) + 
    geom_line( aes(colour = as.factor(contamination))) + 

  facet_wrap(.~Method)



finalResult %>% filter(rate==0.5)  %>% select( sampleSize , contamination,MSE_rate_Q_b_F_C  , MSE_rate_Mean, MSE_rate_Median , MSE_rate_Mode , MSE_rate_Before) %>% 
  gather("Method" , "MSE_rate" ,  MSE_rate_Q_b_F_C  , MSE_rate_Mean, MSE_rate_Median , MSE_rate_Mode ,MSE_rate_Before ) %>% 
  ggplot(aes(x = (sampleSize) , y = MSE_rate)) + 
  geom_point( aes(colour = as.factor(contamination))) + 
  geom_line( aes(colour = as.factor(contamination))) + 
  facet_wrap(.~Method)

 




```
 


# References 

1. https://rpubs.com/mpfoley73/459040

2. https://search.r-project.org/CRAN/refmans/EnvStats/html/eexp.html

