A random variable X follows a negative binomial distribution with parameters r and p if X is the count of independent Bernoulli trials required to achieve the r-th success, where the probability of success p is constant across trials.
Generate Data
#' Most frequent value of a vector.
#'
#' @param v A vector.
#' @return The element of `v` that occurs most often. When several values are
#'   tied, the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Map every element to the index of its distinct value, then count the
  # occurrences of each index.
  value_index <- match(v, distinct_values)
  occurrences <- tabulate(value_index)
  distinct_values[which.max(occurrences)]
}
#' Draw a negative-binomial sample and append Poisson contaminations.
#'
#' The clean sample is drawn first and the contaminations second, so results
#' are reproducible under `set.seed()`.
#'
#' @param sample_size Number of clean negative-binomial observations.
#' @param contamination_prop Contaminations are added on top of the clean
#'   sample: `round(sample_size * contamination_prop)` extra observations.
#' @param size,prob Parameters passed to `rnbinom()` for the clean sample.
#' @param lambda Mean passed to `rpois()` for the contaminations.
#' @return A list with elements `data` (clean sample), `contaminations`
#'   (Poisson draws) and `data_with_contamination` (clean sample followed by
#'   the contaminations).
createDataWithContamination_NB_P <- function(sample_size,
                                             contamination_prop,
                                             size = 2,
                                             prob = 0.2,
                                             lambda = 32) {
  clean <- rnbinom(sample_size, size = size, prob = prob)
  n_contaminations <- round(sample_size * contamination_prop)
  contaminations <- rpois(n_contaminations, lambda = lambda)
  list(
    data = clean,
    contaminations = contaminations,
    data_with_contamination = c(clean, contaminations)
  )
}
Data Generation Factors
- Sample size: 20, 50, 100, 200
- Distribution parameters: size = 2, prob = 0.2
- Contamination proportion: 10%, 20%, 30%
#' Simulate four outlier-handling methods and re-estimate `prob`.
#'
#' Each replication draws a contaminated sample with
#' `createDataWithContamination_NB_P()`, treats it with four methods and
#' estimates the negative-binomial `prob` with `enbinom(size = 2)`:
#' * quantile-based flooring and capping at the 2.5% / 97.5% quantiles,
#' * replacing boxplot outliers by the (rounded) sample mean,
#' * replacing boxplot outliers by the (rounded) sample median,
#' * replacing boxplot outliers by the sample mode.
#'
#' @param sampleSize Passed as `sample_size` to the data generator.
#' @param contamination Passed as `contamination_prop` to the data generator.
#' @param replications Number of simulated samples (default 2000).
#' @return A data frame with one row per replication and columns `i`,
#'   `estimated_prop_After_Q_b_F_C`, `estimated_prop_After_mean`,
#'   `estimated_prop_After_median` and `estimated_prop_After_mode`;
#'   `NULL` when `replications` is 0.
generateData <- function(sampleSize, contamination, replications = 2000) {
  # Estimate `prob` with the size parameter fixed at 2.
  estimate_prob <- function(x) {
    enbinom(x, size = 2)$parameters[["prob"]]
  }

  simulate_once <- function(i) {
    contaminated <- createDataWithContamination_NB_P(
      sampleSize,
      contamination
    )$data_with_contamination

    # Quantile-based flooring and capping: values below the 2.5% quantile are
    # floored and values above the 97.5% quantile are capped (in that order).
    limits <- quantile(contaminated, probs = c(0.025, 0.975), names = FALSE)
    capped <- contaminated
    capped[capped < limits[1]] <- round(limits[1])
    capped[capped > limits[2]] <- round(limits[2])

    # Positions of the values flagged by the boxplot (1.5 * IQR) rule.
    # `boxplot.stats()` is what `boxplot(plot = FALSE)` uses internally.
    outlier_pos <- which(contaminated %in% boxplot.stats(contaminated)$out)

    # Replace the flagged positions by a single rounded summary value computed
    # from the contaminated sample.
    impute_outliers <- function(value) {
      treated <- contaminated
      treated[outlier_pos] <- round(value)
      treated
    }

    data.frame(
      i = i,
      estimated_prop_After_Q_b_F_C = estimate_prob(capped),
      estimated_prop_After_mean =
        estimate_prob(impute_outliers(mean(contaminated))),
      estimated_prop_After_median =
        estimate_prob(impute_outliers(median(contaminated))),
      estimated_prop_After_mode =
        estimate_prob(impute_outliers(getmode(contaminated)))
    )
  }

  # Build all rows first and bind once; growing a data frame with rbind()
  # inside a loop copies the accumulated result on every iteration.
  do.call(rbind, lapply(seq_len(replications), simulate_once))
}
# Run the simulation for every sample size / contamination combination.
# Objects are named data_<sample size>_<size parameter>_<contamination %>.

# different sample sizes, contamination = 10%
data_20_2_10 <- generateData(20, 0.1)
data_50_2_10 <- generateData(50, 0.1)
data_100_2_10 <- generateData(100, 0.1)
data_200_2_10 <- generateData(200, 0.1)
# different sample sizes, contamination = 20%
data_20_2_20 <- generateData(20, 0.2)
data_50_2_20 <- generateData(50, 0.2)
data_100_2_20 <- generateData(100, 0.2)
data_200_2_20 <- generateData(200, 0.2)
# different sample sizes, contamination = 30%
data_20_2_30 <- generateData(20, 0.3)
data_50_2_30 <- generateData(50, 0.3)
# The two runs below were missing although the results section summarises
# data_100_2_30 and data_200_2_30.
data_100_2_30 <- generateData(100, 0.3)
data_200_2_30 <- generateData(200, 0.3)
Results
#' Summarise bias and MSE of the estimated `prob` for one scenario.
#'
#' NOTE(review): `bias()` and `MSE()` are not defined in this file; they are
#' presumably provided by attached packages -- confirm which ones, since the
#' argument order (estimate first, true value second) depends on them.
#'
#' @param data Data frame with the columns `estimated_prop_After_Q_b_F_C`,
#'   `estimated_prop_After_mean`, `estimated_prop_After_median` and
#'   `estimated_prop_After_mode`.
#' @param sampleSize Label stored in the `sampleSize` column of the result.
#' @param contamination Label stored in the `contamination` column.
#' @param trueProb True success probability the estimates are compared with
#'   (default 0.2).
#' @return A one-row data frame with the two labels, four `bias_prop_*`
#'   columns and four `MSE_prop_*` columns.
doCalculations <- function(data, sampleSize, contamination, trueProb = 0.2) {
  data %>%
    summarize(
      sampleSize = sampleSize,
      contamination = contamination,
      bias_prop_Q_b_F_C = bias(estimated_prop_After_Q_b_F_C, trueProb),
      bias_prop_Mean = bias(estimated_prop_After_mean, trueProb),
      bias_prop_Median = bias(estimated_prop_After_median, trueProb),
      bias_prop_Mode = bias(estimated_prop_After_mode, trueProb),
      MSE_prop_Q_b_F_C = MSE(estimated_prop_After_Q_b_F_C, trueProb),
      MSE_prop_Mean = MSE(estimated_prop_After_mean, trueProb),
      MSE_prop_Median = MSE(estimated_prop_After_median, trueProb),
      MSE_prop_Mode = MSE(estimated_prop_After_mode, trueProb)
    )
}
# Stack the one-row summaries of all twelve scenarios (four sample sizes for
# each of the three contamination levels) into a single table.
finalResult <- do.call(
  rbind,
  list(
    doCalculations(data_20_2_10, 20, 10),
    doCalculations(data_50_2_10, 50, 10),
    doCalculations(data_100_2_10, 100, 10),
    doCalculations(data_200_2_10, 200, 10),
    doCalculations(data_20_2_20, 20, 20),
    doCalculations(data_50_2_20, 50, 20),
    doCalculations(data_100_2_20, 100, 20),
    doCalculations(data_200_2_20, 200, 20),
    doCalculations(data_20_2_30, 20, 30),
    doCalculations(data_50_2_30, 50, 30),
    doCalculations(data_100_2_30, 100, 30),
    doCalculations(data_200_2_30, 200, 30)
  )
)
# Bias of the estimated prob for each outlier-handling method.
select(
  finalResult,
  sampleSize,
  contamination,
  bias_prop_Q_b_F_C,
  bias_prop_Mean,
  bias_prop_Median,
  bias_prop_Mode
)
# MSE of the estimated prob for each outlier-handling method.
select(
  finalResult,
  sampleSize,
  contamination,
  MSE_prop_Q_b_F_C,
  MSE_prop_Mean,
  MSE_prop_Median,
  MSE_prop_Mode
)
NA
NA
# Relation between sample size and bias of the estimated prob, one panel per
# method and one line per contamination level.
# pivot_longer() replaces the superseded gather(); the plotted data are the
# same.
finalResult %>%
  select(
    sampleSize, contamination,
    bias_prop_Q_b_F_C, bias_prop_Mean, bias_prop_Median, bias_prop_Mode
  ) %>%
  pivot_longer(
    cols = c(bias_prop_Q_b_F_C, bias_prop_Mean, bias_prop_Median, bias_prop_Mode),
    names_to = "Method",
    values_to = "Biased_prop"
  ) %>%
  ggplot(aes(x = sampleSize, y = Biased_prop)) +
  geom_point(aes(colour = as.factor(contamination))) +
  geom_line(aes(colour = as.factor(contamination))) +
  facet_wrap(. ~ Method)

# Relation between sample size and MSE of the estimated prob, one panel per
# method and one line per contamination level.
finalResult %>%
  select(
    sampleSize, contamination,
    MSE_prop_Q_b_F_C, MSE_prop_Mean, MSE_prop_Median, MSE_prop_Mode
  ) %>%
  pivot_longer(
    cols = c(MSE_prop_Q_b_F_C, MSE_prop_Mean, MSE_prop_Median, MSE_prop_Mode),
    names_to = "Method",
    values_to = "MSE_prop"
  ) %>%
  ggplot(aes(x = sampleSize, y = MSE_prop)) +
  geom_point(aes(colour = as.factor(contamination))) +
  geom_line(aes(colour = as.factor(contamination))) +
  facet_wrap(. ~ Method)
Warning in gzfile(file, "wb") :
cannot open compressed file 'C:/Users/eyada/Desktop/master/University courses/Statistical inference for data science/OutlierHandlingMethods/.Rproj.user/shared/notebooks/D230BA69-OutlierHandling_Negative Binomial Distribution/1/CD6BAB307633CDB7/cojq04yy8pzmj_t/7a1201eaacdc4086a57059cbc89a92ef.snapshot', probable reason 'No such file or directory'
Error in gzfile(file, "wb") : cannot open the connection

Warning in gzfile(file, "wb") :
cannot open compressed file 'C:/Users/eyada/Desktop/master/University courses/Statistical inference for data science/OutlierHandlingMethods/.Rproj.user/shared/notebooks/D230BA69-OutlierHandling_Negative Binomial Distribution/1/CD6BAB307633CDB7/cojq04yy8pzmj_t/30f20766750349038d97ca4057157fb3.snapshot', probable reason 'No such file or directory'
Error in gzfile(file, "wb") : cannot open the connection
