cont_norm <- function(n, # sample size
mu1 = 0, # only one mu since the mean is the same for both distributions.
mu2 = 0, # only one mu since the mean is the same for both distributions.
sd1 = 1, # sd of the first Distribution
sd2 = 100, # sd of the second Distribution
prob = 0.1 # contamination proportion
){
data <- rnorm(n, mean = mu1, sd = sd1)
list_contamination <- rnorm(round((n*prob)), mean = mu2, sd = sd2)
# samplePos <- sample(1:n , round((n*prob)))
data_contamination <- c(data, list_contamination)
#data_contamination[samplePos] <- list_contamination[samplePos]
list(
data = data,
data_contamination = data_contamination
)
}
\(var_{contamination} = var_{original} * 2\)
generateData <- function(sampleSize , var , contamination ){
results = NULL
#create 1000 random sample with contamination
for(i in 1:2000){
#generate data
data_with_contamination <- cont_norm(sampleSize,
mu1 = 0,
mu2 = 4,
sd1 = var,
sd2 = var*2,
prob =contamination)$data_contamination
#Code To Handling outlier
#Quantile based flooring and capping
#In this technique, the outlier is capped at a certain value above the upper percentile value or floored at a factor below the lower percentile value.
lower = quantile(data_with_contamination , c(.025))[["2.5%"]]
upper = quantile(data_with_contamination , c(.975))[["97.5%"]]
outliers <- boxplot(data_with_contamination, plot=FALSE)$out
outliersPos <- which(data_with_contamination %in% outliers)
dataAfterHandling_Q_b_F_C <- data_with_contamination
dataAfterHandling_mean <- data_with_contamination
dataAfterHandling_median <- data_with_contamination
dataAfterHandling_mode <- data_with_contamination
dataAfterHandling_Q_b_F_C[dataAfterHandling_Q_b_F_C<lower] <- lower
dataAfterHandling_Q_b_F_C[dataAfterHandling_Q_b_F_C>upper] <- upper
x_bar_After_Handling_Q_b_F_C <- mean(dataAfterHandling_Q_b_F_C)
x_sd_After_Handling_Q_b_F_C <- sd(dataAfterHandling_Q_b_F_C)
#
#
mean = mean(dataAfterHandling_mean)
dataAfterHandling_mean[outliersPos] <- mean
# dataAfterHandling_mean[dataAfterHandling_mean>upper] <- mean
x_bar_After_mean <- mean(dataAfterHandling_mean)
x_sd_After_mean <- sd(dataAfterHandling_mean)
#
#
median = median(dataAfterHandling_median)
dataAfterHandling_median[outliersPos] <- median
# dataAfterHandling_median[dataAfterHandling_median>upper] <- median
x_bar_After_median <- mean(dataAfterHandling_median)
x_sd_After_median <- sd(dataAfterHandling_median)
#
mode = getmode(dataAfterHandling_mode)
dataAfterHandling_mode[outliersPos] <- mode
# dataAfterHandling_mode[dataAfterHandling_mode>upper] <- mode
x_bar_After_mode <- mean(dataAfterHandling_mode)
x_sd_After_mode <- sd(dataAfterHandling_mode)
# Normality Test After Quantile based flooring and capping
NormalityTest_p_value_After_Q_b_F_C <- shapiro.test(dataAfterHandling_Q_b_F_C)$p.value
NormalityTest_p_value_After_mean <- shapiro.test(dataAfterHandling_mean)$p.value
NormalityTest_p_value_After_median <- shapiro.test(dataAfterHandling_median)$p.value
NormalityTest_p_value_After_mode <- shapiro.test(dataAfterHandling_mode)$p.value
results = rbind(
results,
data.frame(
i,
NormalityTest_p_value_After_Q_b_F_C ,
NormalityTest_p_value_After_mean ,
NormalityTest_p_value_After_median ,
NormalityTest_p_value_After_mode ,
x_bar_After_Handling_Q_b_F_C,
x_bar_After_mean,
x_bar_After_median,
x_bar_After_mode ,
x_sd_After_Handling_Q_b_F_C,
x_sd_After_mean,
x_sd_After_median,
x_sd_After_mode
))
}
results
}
# different sample size with sd = 1 and contamination=10%
data_30_1_10 <- generateData(30 , 1 , .1)
data_50_1_10 <- generateData(50 , 1 , .1)
data_100_1_10 <- generateData(100 , 1 , 0.1)
data_500_1_10 <- generateData(500 , 1 , 0.1)
# different sample size with sd = 1 and contamination=20%
data_30_1_20 <- generateData(30 , 1,0.2)
data_50_1_20 <- generateData(50 , 1,0.2)
data_100_1_20 <- generateData(100,1,0.2)
data_500_1_20 <- generateData(500,1,0.2)
# different sample size with sd = 1 and contamination=20%
data_30_1_30 <- generateData(30 , 1,0.3)
data_50_1_30 <- generateData(50 , 1,0.3)
data_100_1_30 <- generateData(100,1,0.3)
data_500_1_30 <- generateData(500,1,0.3)
above_05 <- function(pValueList){
percent <- mean(pValueList>.05)
return (percent)
}
doCalculations <- function(data , sampleSize , contamination) {
data %>% summarize(
sampleSize = sampleSize ,
contamination = contamination,
bias_XBar_Q_b_F_C = bias(x_bar_After_Handling_Q_b_F_C , 0),
bias_XBar_Mean = bias(x_bar_After_mean ,0),
bias_XBar_Median = bias(x_bar_After_median , 0),
bias_XBar_Mode = bias(x_bar_After_mode ,0) ,
bias_SD_Q_b_F_C = bias(x_sd_After_Handling_Q_b_F_C , 1),
bias_SD_Mean = bias(x_sd_After_mean ,1),
bias_SD_Median = bias(x_sd_After_median , 1),
bias_SD_Mode = bias(x_sd_After_mode ,1) ,
MSE_XBar_Q_b_F_C = MSE(x_bar_After_Handling_Q_b_F_C, 0),
MSE_XBar_Mean = MSE(x_bar_After_mean, 0),
MSE_XBar_Median = MSE(x_bar_After_median, 0),
MSE_XBar_Mode = MSE(x_bar_After_mode, 0),
MSE_SD_Q_b_F_C = MSE(x_sd_After_Handling_Q_b_F_C, 1),
MSE_SD_Mean = MSE(x_sd_After_mean, 1),
MSE_SD_Median = MSE(x_sd_After_median, 1),
MSE_SD_Mode = MSE(x_sd_After_mode, 1),
above05_Q_b_F_C = above_05(NormalityTest_p_value_After_Q_b_F_C),
above05_Mean = above_05(NormalityTest_p_value_After_mean),
above05_Median = above_05(NormalityTest_p_value_After_median),
above05_Mode = above_05(NormalityTest_p_value_After_mode)
)
}
finalResult <- NULL
finalResult <- rbind(
finalResult ,
doCalculations(data_30_1_10 , 30,10),
doCalculations(data_50_1_10 , 50,10),
doCalculations(data_100_1_10 , 100,10),
doCalculations(data_500_1_10 , 500,10),
doCalculations(data_30_1_20 , 30,20),
doCalculations(data_50_1_20 , 50,20),
doCalculations(data_100_1_20 , 100,20),
doCalculations(data_500_1_20 , 500,20),
doCalculations(data_30_1_30 , 30,30),
doCalculations(data_50_1_30 , 50,30),
doCalculations(data_100_1_30 , 100,30),
doCalculations(data_500_1_30 , 500,30)
)
finalResult %>% select(sampleSize , contamination , bias_XBar_Q_b_F_C , bias_XBar_Mean , bias_XBar_Median , bias_XBar_Mode)
finalResult %>% select(sampleSize , contamination , bias_SD_Q_b_F_C , bias_SD_Mean , bias_SD_Median , bias_SD_Mode)
finalResult %>% select(sampleSize , contamination , MSE_XBar_Q_b_F_C , MSE_XBar_Mean , MSE_XBar_Median , MSE_XBar_Mode)
finalResult %>% select(sampleSize , contamination , MSE_SD_Q_b_F_C , MSE_SD_Mean , MSE_SD_Median , MSE_SD_Mode)
finalResult %>% select(sampleSize , contamination ,above05_Q_b_F_C , above05_Mean , above05_Median , above05_Mode)
NA
NA
#Relation Between sample size and Biased in X bar for each method
finalResult %>% select( sampleSize , contamination, bias_XBar_Q_b_F_C , bias_XBar_Mean , bias_XBar_Median , bias_XBar_Mode) %>% gather("Method" , "Biased_X_Bar" ,
bias_XBar_Q_b_F_C , bias_XBar_Mean , bias_XBar_Median , bias_XBar_Mode ) %>%
ggplot(aes(x = as.factor(sampleSize) , y = Biased_X_Bar)) +
geom_point( aes(colour = as.factor(contamination))) +
facet_wrap(.~Method)
finalResult %>% select( sampleSize , contamination, bias_SD_Q_b_F_C , bias_SD_Mean , bias_SD_Median , bias_SD_Mode) %>% gather("Method" , "Biased_X_SD" ,
bias_SD_Q_b_F_C , bias_SD_Mean , bias_SD_Median , bias_SD_Mode ) %>%
ggplot(aes(x = as.factor(sampleSize) , y = Biased_X_SD)) +
geom_point( aes(colour = as.factor(contamination))) +
facet_wrap(.~Method)
finalResult %>% select( sampleSize , contamination, MSE_XBar_Q_b_F_C , MSE_XBar_Mean , MSE_XBar_Median , MSE_XBar_Mode) %>% gather("Method" , "MSE_XBar" ,
MSE_XBar_Q_b_F_C , MSE_XBar_Mean , MSE_XBar_Median , MSE_XBar_Mode ) %>%
ggplot(aes(x = as.factor(sampleSize) , y = MSE_XBar)) +
geom_point( aes(colour = as.factor(contamination))) +
facet_wrap(.~Method)
finalResult %>% select( sampleSize , contamination, MSE_SD_Q_b_F_C , MSE_SD_Mean , MSE_SD_Median , MSE_SD_Mode) %>% gather("Method" , "MSE_SD" ,
MSE_SD_Q_b_F_C , MSE_SD_Mean , MSE_SD_Median , MSE_SD_Mode ) %>%
ggplot(aes(x = as.factor(sampleSize) , y = MSE_SD)) +
geom_point( aes(colour = as.factor(contamination))) +
facet_wrap(.~Method)
finalResult %>% select( sampleSize , contamination, above05_Q_b_F_C , above05_Mean , above05_Median , above05_Mode) %>% gather("Method" , "PValue_above05" ,
above05_Q_b_F_C , above05_Mean , above05_Median , above05_Mode ) %>%
ggplot(aes(x = as.factor(sampleSize) , y = PValue_above05)) +
geom_point( aes(colour = as.factor(contamination))) +
facet_wrap(.~Method)
NA
NA
NA
NA
NA
NA