Assignment 1 - Inference
Course - Inference Course Instructor - Dr. Amita Pal Submitted by - Rahul Verma Roll No - 24BM6JP44
knitr::opts_chunk$set(echo = TRUE)
  1. Write a short paragraph describing the dataset

The Pima Indians Diabetes Database contains 768 records of women of Pima Indian heritage, aiming to predict diabetes presence based on 8 medical predictors: pregnancies, glucose levels, blood pressure, skin thickness, insulin, BMI, diabetes pedigree (family history), and age. The binary outcome variable indicates diabetes status (1 = diabetic, 0 = non-diabetic). This dataset is widely used to study risk factors and develop predictive models for diabetes. Source – Kaggle Link - https://www.kaggle.com/datasets/mragpavank/diabetes/data

# Importing the dataset
dataset = read.csv("C:/Users/ADMIN/Downloads/diabetes.csv")

# Selecting a discrete variable ('Pregnancies') and a continuous variable ('BMI')
X = dataset$Pregnancies
Y = dataset$BMI

# 2a. Frequency distribution for the discrete variable (Pregnancies)
freq_table = table(X)
print("Frequency Distribution for Pregnancies:")
## [1] "Frequency Distribution for Pregnancies:"
print(freq_table)
## X
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  17 
## 111 135 103  75  68  57  50  45  38  28  24  11   9  10   2   1   1
write.table(freq_table, "clipboard", sep = ",", row.names = FALSE, col.names = TRUE)

# Frequency distribution for the continuous variable (BMI)
breaks = seq(floor(min(Y)), ceiling(max(Y)), by = 5)  # Define bin intervals (e.g., width of 5)
freq_table_Y = table(cut(Y, breaks = breaks))        # Create a frequency table with bins

print( "Frequency Distribution for BMI:" )
## [1] "Frequency Distribution for BMI:"
print(freq_table_Y)
## 
##   (0,5]  (5,10] (10,15] (15,20] (20,25] (25,30] (30,35] (35,40] (40,45] (45,50] 
##       0       0       0      14      98     180     221     148      61      27 
## (50,55] (55,60] (60,65] 
##       5       2       0
# 2b. Plotting histograms
par(mfrow=c(1,2), mar=c(5,2,5,-0))
# Discrete Variable (Pregnancies)
hist(X, 
     main = "Histogram of Pregnancies(X)", 
     xlab = "Number of Pregnancies", 
     col = "lightblue", 
     breaks = "FD") 

# Continuous Variable (BMI)
hist(Y, 
     main = "Histogram of BMI(Y)", 
     xlab = "BMI", 
     col = "lightgreen", 
     breaks = "Sturges") 

X (Positively Skewed Distribution)

Method Used: Freedman-Diaconis (FD) Reason for Choosing FD Method: The FD method is suitable for positively skewed distributions because it is robust to outliers and effectively captures the variability in the data. It ensures that the bin width adapts to the spread of the data, providing a detailed view of the skewness.

Y (Approximately Normal Distribution)

Method Used: Sturges Method Reason for Choosing Sturges Method: The Sturges method works well for approximately normal distributions, especially with smaller datasets. It provides a simple and interpretable histogram with a reasonable number of bins, making it ideal for normal-like data.

# 2c. Box-and-whisker plots
par(mfrow=c(1,2), mar=c(5,5,5,0))
# Boxplot for Pregnancies
boxplot(X, 
        main = "Boxplot of Pregnancies (X)", 
        ylab = "Number of Pregnancies", 
        col = "lightblue")

# Boxplot for BMI
boxplot(Y, 
        main = "Boxplot of BMI (Y)", 
        ylab = "BMI", 
        col = "lightgreen")

X (Pregnancies)

There are three outliers on the higher side of pregnancies. The median is shifted toward the lower side of the pregnancy data, indicating a positively skewed distribution.

Y (BMI)

There are multiple outliers on the higher side of BMI and only one outlier on the lower side. The median is positioned toward the lower side of the BMI data, suggesting a positively skewed distribution.

dataset <- read.csv("C:/Users/ADMIN/Downloads/diabetes.csv")  
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Datasets from problem 2
X <- dataset$Pregnancies  # Discrete variable
Y <- dataset$BMI          # Continuous variable

# Simulated datasets
set.seed(123)
data_poisson <- rpois(500, lambda = 5)
data_uniform <- runif(500, min = 3, max = 8)
data_exponential <- rexp(500, rate = 7)  # Exponential with rate = 7
data_beta <- rbeta(500, shape1 = 2, shape2 = 3)
data_lognormal <- rlnorm(500, meanlog = 0, sdlog = 1)

# Combine datasets into a list for iteration
datasets <- list(X = X, Y = Y, 
                 Poisson = data_poisson, 
                 Uniform = data_uniform, 
                 Exponential = data_exponential, 
                 Beta = data_beta, 
                 Lognormal = data_lognormal)

# Define sample sizes
sample_sizes <- c(10, 50, 100)

# Initialize list to store sample means
sample_means <- list()

# Compute means for each dataset and sample size
for (dataset_name in names(datasets)) {
  dataset <- datasets[[dataset_name]]  # Current dataset
  dataset_means <- list()              # To store means for different sample sizes
  
  for (n in sample_sizes) {
    # Draw 100 samples and compute their means
    sample_means_n <- replicate(100, mean(sample(dataset, size = n, replace = TRUE)))
    dataset_means[[paste0("n_", n)]] <- sample_means_n
  }
  
  sample_means[[dataset_name]] <- dataset_means  # Store all means for the current dataset
}

# Display sample means for each dataset and sample size
for (dataset_name in names(sample_means)) {
  cat("\nDataset:", dataset_name, "\n")
  cat("Mean of original dataset:", mean(datasets[[dataset_name]]), "\n")  # Print mean of original dataset
  
  for (n in sample_sizes) {
    sample_mean_values <- sample_means[[dataset_name]][[paste0("n_", n)]]
    cat("Sample size:", n, "\n")
    # Uncomment the lines below to print the individual means of the 100 samples
    #cat("Means of 100 samples:\n")
    #print(sample_mean_values)
    cat("Mean of sample means:", mean(sample_mean_values), "\n")
  }
}
## 
## Dataset: X 
## Mean of original dataset: 3.845052 
## Sample size: 10 
## Mean of sample means: 3.693 
## Sample size: 50 
## Mean of sample means: 3.754 
## Sample size: 100 
## Mean of sample means: 3.8597 
## 
## Dataset: Y 
## Mean of original dataset: 31.99258 
## Sample size: 10 
## Mean of sample means: 32.1688 
## Sample size: 50 
## Mean of sample means: 31.97598 
## Sample size: 100 
## Mean of sample means: 31.97046 
## 
## Dataset: Poisson 
## Mean of original dataset: 5.006 
## Sample size: 10 
## Mean of sample means: 4.988 
## Sample size: 50 
## Mean of sample means: 5.0002 
## Sample size: 100 
## Mean of sample means: 5.0175 
## 
## Dataset: Uniform 
## Mean of original dataset: 5.49636 
## Sample size: 10 
## Mean of sample means: 5.509886 
## Sample size: 50 
## Mean of sample means: 5.503686 
## Sample size: 100 
## Mean of sample means: 5.477841 
## 
## Dataset: Exponential 
## Mean of original dataset: 0.1497399 
## Sample size: 10 
## Mean of sample means: 0.149579 
## Sample size: 50 
## Mean of sample means: 0.1465004 
## Sample size: 100 
## Mean of sample means: 0.1475768 
## 
## Dataset: Beta 
## Mean of original dataset: 0.4014296 
## Sample size: 10 
## Mean of sample means: 0.3973461 
## Sample size: 50 
## Mean of sample means: 0.4055457 
## Sample size: 100 
## Mean of sample means: 0.3991292 
## 
## Dataset: Lognormal 
## Mean of original dataset: 1.769453 
## Sample size: 10 
## Mean of sample means: 1.858472 
## Sample size: 50 
## Mean of sample means: 1.779636 
## Sample size: 100 
## Mean of sample means: 1.753175

By comparing the sample means for different values of n with the population mean (original dataset), we observe that as n increases, the sample mean converges to the population mean for all types of distribution.

#3
# Plot histograms (7x4 grid)
par(mfrow = c(7, 4), mar = c(2, 2, 2, 2))  # Set up a grid for plotting

for (name in names(datasets)) {
  dataset <- datasets[[name]]  # Current dataset
  
  # Column 1: Histogram of the original dataset
  hist(dataset, 
       main = paste("Original:", name), 
       col = "skyblue", 
       xlab = "", 
       ylab = "", 
       breaks = 20)
  
  # Columns 2-4: Histograms of sample means for each n
  for (i in 1:length(sample_sizes)) {
    n <- sample_sizes[i]
    
    # Ensure sample means exist and are numeric
    sample_mean_data <- unlist(sample_means[[name]][[paste0("n_", n)]])
    
    # Plot the histogram of sample means
    hist(sample_mean_data, 
         main = paste("Means (n =", n, ")"), 
         col = "lightgreen", 
         xlab = "", 
         ylab = "", 
         breaks = 15)
  }
}

By observing the histograms for different sample sizes (n), we can conclude that as the sample size (n) increases, the distribution of the sample mean approaches a normal distribution, regardless of the underlying distribution of the data. This behavior aligns with the Central Limit Theorem, which states that the sampling distribution of the mean becomes approximately normal as the sample size increases, provided the samples are independent and identically distributed.

#4
library(missMethods)
## Warning: package 'missMethods' was built under R version 4.4.2
library(mice)
## Warning: package 'mice' was built under R version 4.4.2
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
par(mfrow=c(1,3),mar=c(2,2,2,2))

# Number of observations
n <- 500

# Generate 500 observations for X ~ N(0,1) and Y ~ N(6,4)
X <- rnorm(n, mean = 0, sd = 1)  # X ~ N(0, 1)
Y <- rnorm(n, mean = 6, sd = 2)  # Y ~ N(6, 4)

# Combine into a data frame
data<- data.frame(X = X, Y = Y)


# MCAR - Missing Completely at Random 
data_mcar <- delete_MCAR(data,.2,"X")

plot(data$X, data$Y, 
     col = "red",         # Plot missing values in red
     main = "MCAR Missingness", 
     xlab = "X", ylab = "Y", pch = 16)
points(data_mcar$X, data_mcar$Y, 
       col = "blue", pch = 16)  


# MAR - Missing at Random 
data_mar <- delete_MAR_1_to_x(data, 0.2 ,'X','Y',.2*n)

plot(data$X, data$Y, 
     col = "red",            # Plot missing values in red
     main = "MAR Missingness", 
     xlab = "X", ylab = "Y", pch = 16)
points(data_mar$X, data_mar$Y, 
       col = "blue", pch = 16) 

# NMAR - Not Missing at Random 
data_nmar <- delete_MNAR_1_to_x(data, 0.2 ,'X',.2*n)

plot(data$X, data$Y, 
     col = "red",             # Plot missing values in red
     main = "NMAR Missingness", 
     xlab = "X", ylab = "Y", pch = 16)
points(data_nmar$X, data_nmar$Y, 
       col = "blue", pch = 16) 

MCAR missingness(missing values does not depend on X and Y) MAR missingness(missing values in X depend on Y) NMAR missingness(missing values in X depend on X itself)

library(missMethods)
library(mice)

# Function to calculate RMSE
calculate_rmse <- function(true, imputed) {
  sqrt(mean((true - imputed)^2, na.rm = TRUE))
}

# Function to generate data with missing values
generate_missing_data <- function(data, type, missing_percentage) {
  if (type == "MCAR") {
    # Apply MCAR deletion to the entire dataset (not just a column)
    data <- delete_MCAR(data, p = missing_percentage)  
  } else if (type == "MAR") {
    # Apply MAR deletion to the entire dataset
    # Assuming we want to create missingness in column 'X' based on 'Y'
    data <- delete_MAR_1_to_x(data, p = missing_percentage, 
                               cols_mis = "X", cols_ctrl = "Y", x = 2)  # Adjust `x` as necessary
  } else if (type == "NMAR") {
    # Apply NMAR deletion to the entire dataset
    data <- delete_MNAR_1_to_x(data, p = missing_percentage, 
                               cols_mis = "X", x = 2)  # Adjust `x` as necessary
  }
  return(data)
}

# Function to compute RMSE for a given missing type and percentage
compute_rmse <- function(data_complete, missing_percentage, type) {
  data_missing <- generate_missing_data(data_complete, type, missing_percentage)
  
  # Imputation using PMM and Mean methods
  imputed_pmm <- complete(mice(data_missing, m = 5, method = "pmm", maxit = 5, seed = 123))
  imputed_mean <- complete(mice(data_missing, m = 1, method = "mean", maxit = 1, seed = 123))
  
  # RMSE Calculation
  missing_indices <- which(is.na(data_missing$X))
  true_values <- data_complete$X[missing_indices]
  rmse_pmm <- calculate_rmse(true_values, imputed_pmm$X[missing_indices])
  rmse_mean <- calculate_rmse(true_values, imputed_mean$X[missing_indices])
  
  return(c(rmse_pmm, rmse_mean))
}

# Generate complete data
set.seed(123)
data_complete <- data.frame(X = rnorm(500, mean = 0, sd = 1), 
                            Y = rnorm(500, mean = 6, sd = 4))

# Types of missing data
missing_types <- c("MCAR", "MAR", "NMAR")

# Missing percentages
missing_percentages <- c(0.2, 0.3)

# Results storage
results <- data.frame(Type = character(), Missing_Percentage = numeric(), 
                      RMSE_PMM = numeric(), RMSE_Mean = numeric())

# Compute RMSE for each missing type and percentage
for (type in missing_types) {
  for (p in missing_percentages) {
    rmse_values <- compute_rmse(data_complete, p, type)
    results <- rbind(results, 
                     data.frame(Type = type, 
                                Missing_Percentage = p * 100, 
                                RMSE_PMM = rmse_values[1], 
                                RMSE_Mean = rmse_values[2]))
  }
}
## 
##  iter imp variable
##   1   1  X  Y
##   1   2  X  Y
##   1   3  X  Y
##   1   4  X  Y
##   1   5  X  Y
##   2   1  X  Y
##   2   2  X  Y
##   2   3  X  Y
##   2   4  X  Y
##   2   5  X  Y
##   3   1  X  Y
##   3   2  X  Y
##   3   3  X  Y
##   3   4  X  Y
##   3   5  X  Y
##   4   1  X  Y
##   4   2  X  Y
##   4   3  X  Y
##   4   4  X  Y
##   4   5  X  Y
##   5   1  X  Y
##   5   2  X  Y
##   5   3  X  Y
##   5   4  X  Y
##   5   5  X  Y
## 
##  iter imp variable
##   1   1  X  Y
## 
##  iter imp variable
##   1   1  X  Y
##   1   2  X  Y
##   1   3  X  Y
##   1   4  X  Y
##   1   5  X  Y
##   2   1  X  Y
##   2   2  X  Y
##   2   3  X  Y
##   2   4  X  Y
##   2   5  X  Y
##   3   1  X  Y
##   3   2  X  Y
##   3   3  X  Y
##   3   4  X  Y
##   3   5  X  Y
##   4   1  X  Y
##   4   2  X  Y
##   4   3  X  Y
##   4   4  X  Y
##   4   5  X  Y
##   5   1  X  Y
##   5   2  X  Y
##   5   3  X  Y
##   5   4  X  Y
##   5   5  X  Y
## 
##  iter imp variable
##   1   1  X  Y
## 
##  iter imp variable
##   1   1  X
##   1   2  X
##   1   3  X
##   1   4  X
##   1   5  X
##   2   1  X
##   2   2  X
##   2   3  X
##   2   4  X
##   2   5  X
##   3   1  X
##   3   2  X
##   3   3  X
##   3   4  X
##   3   5  X
##   4   1  X
##   4   2  X
##   4   3  X
##   4   4  X
##   4   5  X
##   5   1  X
##   5   2  X
##   5   3  X
##   5   4  X
##   5   5  X
## 
##  iter imp variable
##   1   1  X
## 
##  iter imp variable
##   1   1  X
##   1   2  X
##   1   3  X
##   1   4  X
##   1   5  X
##   2   1  X
##   2   2  X
##   2   3  X
##   2   4  X
##   2   5  X
##   3   1  X
##   3   2  X
##   3   3  X
##   3   4  X
##   3   5  X
##   4   1  X
##   4   2  X
##   4   3  X
##   4   4  X
##   4   5  X
##   5   1  X
##   5   2  X
##   5   3  X
##   5   4  X
##   5   5  X
## 
##  iter imp variable
##   1   1  X
## 
##  iter imp variable
##   1   1  X
##   1   2  X
##   1   3  X
##   1   4  X
##   1   5  X
##   2   1  X
##   2   2  X
##   2   3  X
##   2   4  X
##   2   5  X
##   3   1  X
##   3   2  X
##   3   3  X
##   3   4  X
##   3   5  X
##   4   1  X
##   4   2  X
##   4   3  X
##   4   4  X
##   4   5  X
##   5   1  X
##   5   2  X
##   5   3  X
##   5   4  X
##   5   5  X
## 
##  iter imp variable
##   1   1  X
## 
##  iter imp variable
##   1   1  X
##   1   2  X
##   1   3  X
##   1   4  X
##   1   5  X
##   2   1  X
##   2   2  X
##   2   3  X
##   2   4  X
##   2   5  X
##   3   1  X
##   3   2  X
##   3   3  X
##   3   4  X
##   3   5  X
##   4   1  X
##   4   2  X
##   4   3  X
##   4   4  X
##   4   5  X
##   5   1  X
##   5   2  X
##   5   3  X
##   5   4  X
##   5   5  X
## 
##  iter imp variable
##   1   1  X
# Print RMSE results
print(results)
##   Type Missing_Percentage RMSE_PMM RMSE_Mean
## 1 MCAR                 20 1.633078 0.8656281
## 2 MCAR                 30 1.366929 0.8434411
## 3  MAR                 20 1.301241 0.9479431
## 4  MAR                 30 1.359620 1.0327414
## 5 NMAR                 20 1.208763 0.9809273
## 6 NMAR                 30 1.305560 1.0186408

•Mean imputation consistently outperforms PMM in terms of RMSE, with NMAR performing the best among the missing data mechanisms.

•As the missing data percentage increases, the RMSE increases for both imputation methods, but PMM is more sensitive to higher levels of missing data.

•The results suggest that simpler imputation methods like Mean imputation may be more robust under increasing levels of missingness, while more sophisticated methods like PMM might not always improve performance, especially with higher missingness percentages or when the missingness pattern is random.

# Import the dataset
dataset <- read.csv("C:/Users/ADMIN/Downloads/diabetes.csv")

# Dataset A (Pregnancies)
A <- dataset$Pregnancies

# 5
# Compute the median
a <- median(A, na.rm = TRUE)
print(paste("Median of A:", a))
## [1] "Median of A: 3"
# Select a random sample of size 100
set.seed(123)  # For reproducibility
sample_A <- sample(A, size = 100, replace = FALSE)

# Count the number of observations in the sample less than median (a)
Z <- sum(sample_A < a)
print(paste("Number of observations less than median (Z):", Z))
## [1] "Number of observations less than median (Z): 47"
# Hypothesis testing (p > 0.5) using prop.test
p_value <- prop.test(Z, length(sample_A), p = 0.5, alternative = "greater")$p.value
print(paste("P-value for the test:", p_value))
## [1] "P-value for the test: 0.691462461274013"
# Decision
if (p_value < 0.05) {
  print("Reject the null hypothesis: p is significantly larger than 0.5")
} else {
  print("Fail to reject the null hypothesis.")
}
## [1] "Fail to reject the null hypothesis."
# Confidence interval for p using prop.test
ci <- prop.test(Z, length(sample_A), p = 0.5)$conf.int
print(paste("95% Confidence Interval for p:", ci[1], "-", ci[2]))
## [1] "95% Confidence Interval for p: 0.370353467713434 - 0.5719774737526"
#6
# Dataset B (Glucose)
dataset <- read.csv("C:/Users/ADMIN/Downloads/diabetes.csv")
B <- dataset$Glucose
# Q-Q Plot
qqnorm(B, main = "Q-Q Plot of Glucose Data vs. Normal Distribution")
qqline(B, col = "red", lwd = 2)

# Kolmogorov-Smirnov test for goodness-of-fit
ks_test <- ks.test(B, "pnorm", mean = mean(B, na.rm = TRUE), sd = sd(B, na.rm = TRUE))
## Warning in ks.test.default(B, "pnorm", mean = mean(B, na.rm = TRUE), sd = sd(B,
## : ties should not be present for the one-sample Kolmogorov-Smirnov test
# Results
print(paste("Kolmogorov-Smirnov test statistic:", ks_test$statistic))
## [1] "Kolmogorov-Smirnov test statistic: 0.0639984760759441"
print(paste("P-value:", ks_test$p.value))
## [1] "P-value: 0.00370523270549822"
# Decision
if (ks_test$p.value < 0.05) {
  print("Reject the null hypothesis: Data does not follow a normal distribution")
} else {
  print("Fail to reject the null hypothesis: Data is normally distributed")
}
## [1] "Reject the null hypothesis: Data does not follow a normal distribution"

From Q-Q plot and Kolmogorov-Smirnov test we can say our data does not follow normal distribution

# 7
# Divide Dataset B randomly into two subsets (3:2 ratio)
set.seed(123)  # For reproducibility
n <- length(B)
idx <- sample(1:n, size = floor(0.6 * n))
B1 <- B[idx]
B2 <- B[-idx]

# (a) Test for equality of variances (F-test)
var_test <- var.test(B1, B2)
print(paste("F-test statistic:", var_test$statistic))
## [1] "F-test statistic: 1.03847305499081"
print(paste("P-value for variance test:", var_test$p.value))
## [1] "P-value for variance test: 0.722995952686646"
if (var_test$p.value < 0.01) {
  print("Reject the null hypothesis: Variances are significantly different")
} else {
  print("Fail to reject the null hypothesis: Variances are equal")
}
## [1] "Fail to reject the null hypothesis: Variances are equal"
# (b) Test for equality of means (t-test assuming equal/unequal variances)
t_test <- t.test(B1, B2, var.equal = (var_test$p.value >= 0.01))
print(paste("T-test statistic:", t_test$statistic))
## [1] "T-test statistic: 1.12282087004744"
print(paste("P-value for mean test:", t_test$p.value))
## [1] "P-value for mean test: 0.261865379818349"
if (t_test$p.value < 0.01) {
  print("Reject the null hypothesis: Means are significantly different")
} else {
  print("Fail to reject the null hypothesis: Means are equal")
}
## [1] "Fail to reject the null hypothesis: Means are equal"
# (c) 99% Confidence interval for the difference of means
ci_diff <- t_test$conf.int
print(paste("99% Confidence Interval for the difference of means:", ci_diff[1], "-", ci_diff[2]))
## [1] "99% Confidence Interval for the difference of means: -1.97759162940985 - 7.26291065820713"