This note lays out our entire discussion, line by line:
Variance (σ²), Standard Deviation (σ), Standard Error (SE), Population vs Sample (n vs n−1), the empirical rule (68–95–99.7%), estimating σ from the sample SD, Levene's test (mean vs median / Brown–Forsythe), manual calculation, obtaining a p-value from F, pooled t-test vs Welch's t-test, the difference in df, one-tailed / two-tailed rules with practical examples, and your R script with a full explanation.
Variance (σ² / s²): the mean of the squared distances from the mean, a measure of how spread out the data are.
\[
\text{Population } \sigma^2 = \frac{\sum (x_i - \mu)^2}{N}, \qquad
\text{Sample } s^2 = \frac{\sum (x_i - \bar x)^2}{n-1} \quad
\text{(Bessel’s correction)}
\]
Standard Deviation (σ, s): the square root of the variance (spread in interpretable units)
\[
SD = \sqrt{Variance}
\]
Standard Error (SE): the uncertainty of the sample mean
\[
SE = \frac{s}{\sqrt{n}}
\]
When variance is computed from a sample, the sample mean (\(\bar x\)) lies closer to the data than the population mean does, so the variance is underestimated. Dividing by (n−1) corrects this bias and makes \(s^2\) an unbiased estimator of \(\sigma^2\) (Bessel's correction).
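A minimal simulation sketch of this bias (assuming a standard normal population, so the true σ² is 1; all names are illustrative): dividing by n underestimates σ² on average, while the (n−1) version does not.
set.seed(1)
n <- 5; reps <- 20000
var_n   <- replicate(reps, { x <- rnorm(n); sum((x - mean(x))^2) / n })  # divide by n
var_nm1 <- replicate(reps, var(rnorm(n)))                                # var() divides by n - 1
mean(var_n)    # noticeably below 1 (biased low, roughly (n-1)/n)
mean(var_nm1)  # close to 1 (unbiased)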
Data: 50, 55, 60, 65, 70
- \(\mu = 60\)
- \(\sigma^2 = 50\)
- \(\sigma = \sqrt{50} \approx 7.07\)
Sample: 50, 55, 60
- \(\bar x = 55\)
- \(s^2 = \dfrac{(50-55)^2 + (55-55)^2 + (60-55)^2}{3-1} = 25\)
- \(s = 5\)
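A quick check of the hand calculation in R (a sketch; pop_x and samp_x are just the numbers above):
pop_x <- c(50, 55, 60, 65, 70)
mean(pop_x)                                   # 60
sum((pop_x - mean(pop_x))^2) / length(pop_x)  # population variance sigma^2 = 50
samp_x <- c(50, 55, 60)
var(samp_x)                                   # sample variance (n - 1 denominator) = 25
sd(samp_x)                                    # 5
sd(samp_x) / sqrt(length(samp_x))             # SE of the sample mean ≈ 2.89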
Note on the empirical rule (68–95–99.7%): if the population σ is known, these ranges are reliable. With a small sample, s may not reproduce the exact percentages; as the sample grows, s ≈ σ.
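The 68–95–99.7% figures follow directly from the normal CDF; a one-line check for each:
pnorm(1) - pnorm(-1)   # ≈ 0.6827, within ±1 SD
pnorm(2) - pnorm(-2)   # ≈ 0.9545, within ±2 SD
pnorm(3) - pnorm(-3)   # ≈ 0.9973, within ±3 SD
The simulation below then shows how the average sample SD approaches σ as n grows.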
set.seed(42)
mu <- 50; sigma <- 10
pop <- rnorm(1e5, mu, sigma)
est_s <- function(n, reps = 500){
  sapply(seq_len(reps), function(i){
    x <- sample(pop, n)
    sd(x)  # sample SD (n - 1 denominator)
  }) |> mean()
}
ns <- c(5,10,30,50,100,500,1000)
avg_s <- sapply(ns, est_s)
data.frame(n=ns, avg_sample_SD=avg_s, true_sigma=sigma)
## n avg_sample_SD true_sigma
## 1 5 9.345782 10
## 2 10 9.609166 10
## 3 30 9.895706 10
## 4 50 9.966052 10
## 5 100 9.941818 10
## 6 500 10.028773 10
## 7 1000 10.009633 10
Levene's test checks whether two or more groups have equal variances.
car::leveneTest(..., center = median) runs the median-based (Brown–Forsythe) version by default.
datA <- c(85, 90, 88, 75, 95)
datB <- c(80, 70, 78, 65, 74)
med_A <- median(datA)
med_A
med_B <- median(datB)
med_B
Z_A <- abs(med_A - datA)
Z_A
Z_B <- abs(med_B - datB)
Z_B
ZA_BAR <- mean(Z_A)
ZA_BAR
ZB_BAR <- mean(Z_B)
ZB_BAR
Z_BAR <- (sum(Z_A) + sum(Z_B)) / (length(Z_A) + length(Z_B))
Z_BAR
SSB <- length(Z_A) * sum((ZA_BAR - Z_BAR)^2) + length(Z_B) * sum((ZB_BAR - Z_BAR)^2)
SSB
SSW <- sum((Z_A - ZA_BAR)^2) + sum((Z_B - ZB_BAR)^2)
SSW
df_between <- 1 # k-1
df_within <- 8 # N-K
MSB <- SSB/df_between
MSB
MSW <- SSW / df_within
MSW
F <- MSB / MSW
F
P_VALUE <- 1 - pf(F, 1, 8)
P_VALUE
# Data for two groups
datA <- c(85, 90, 88, 75, 95)
datB <- c(80, 70, 78, 65, 74)
# Group medians (Brown–Forsythe uses median, robust to outliers)
med_A <- median(datA) # 88
med_B <- median(datB) # 74
# Absolute deviations from group medians: Z_ij = |x_ij - median_i|
Z_A <- abs(med_A - datA) # 3, 2, 0, 13, 7
Z_B <- abs(med_B - datB) # 6, 4, 4, 9, 0
# Group-wise mean of these deviations
ZA_BAR <- mean(Z_A) # 5.0
ZB_BAR <- mean(Z_B) # 4.6
# Overall mean of deviations (works for unequal n as well)
Z_BAR <- (sum(Z_A) + sum(Z_B)) / (length(Z_A) + length(Z_B)) # 4.8
# Between-group sum of squares (ANOVA on Z):
# SSB = Σ n_i * (mean_i - overall_mean)^2
# Note: sum((scalar)^2) == (scalar)^2; sum() is unnecessary but harmless.
SSB <- length(Z_A) * (ZA_BAR - Z_BAR)^2 +
       length(Z_B) * (ZB_BAR - Z_BAR)^2
# Within-group sum of squares:
# SSW = Σ Σ (Z_ij - mean_i)^2
SSW <- sum((Z_A - ZA_BAR)^2) + sum((Z_B - ZB_BAR)^2)
# Degrees of freedom
df_between <- 1 # k - 1 = 2 - 1
df_within <- 8 # N - k = 10 - 2
# Mean squares
MSB <- SSB / df_between
MSW <- SSW / df_within
# F statistic
F <- MSB / MSW
# p-value from F (right-tail)
P_VALUE <- 1 - pf(F, df_between, df_within)
# Display
F
P_VALUE
# install.packages("car")
library(car)
group <- factor(c(rep("A", length(datA)), rep("B", length(datB))))
values <- c(datA, datB)
# Brown–Forsythe (median-based) by default:
leveneTest(values ~ group, center = median)
# Example: F=0.02145, df1=1, df2=8
1 - pf(0.02145, 1, 8)
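The classic mean-centered Levene's test (the "mean vs median" distinction mentioned at the start) is the same call with center = mean:
leveneTest(values ~ group, center = mean)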
Pooled t-test (equal var):
\[
t = \frac{\bar X_1 - \bar
X_2}{s_p\sqrt{\frac{1}{n_1}+\frac{1}{n_2}}},\quad
s_p^2 = \frac{(n_1-1)s_1^2 + (n_2-1)s_2^2}{n_1+n_2-2},\quad
df = n_1 + n_2 - 2
\]
Welch’s t-test (unequal var):
\[
t = \frac{\bar X_1 - \bar X_2}{\sqrt{\frac{s_1^2}{n_1} +
\frac{s_2^2}{n_2}}},\quad
df = \frac{(s_1^2/n_1 + s_2^2/n_2)^2}{\frac{(s_1^2/n_1)^2}{n_1-1} +
\frac{(s_2^2/n_2)^2}{n_2-1}}
\]
(the df is usually fractional)
A <- c(85, 90, 88, 75, 95)
B <- c(80, 70, 78, 65, 74)
# Pooled (equal variance)
t.test(A, B, var.equal = TRUE)
# Welch (default, unequal variance safe)
t.test(A, B, var.equal = FALSE)
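For comparison, a hand computation of both statistics from the formulas above (a sketch reusing A and B; the numbers should match the t.test() output):
n1 <- length(A); n2 <- length(B)
s1sq <- var(A); s2sq <- var(B)
# Pooled t and its df
sp2 <- ((n1 - 1) * s1sq + (n2 - 1) * s2sq) / (n1 + n2 - 2)
t_pooled <- (mean(A) - mean(B)) / sqrt(sp2 * (1/n1 + 1/n2))
df_pooled <- n1 + n2 - 2
# Welch t and its (usually fractional) df
se2 <- s1sq/n1 + s2sq/n2
t_welch <- (mean(A) - mean(B)) / sqrt(se2)
df_welch <- se2^2 / ((s1sq/n1)^2 / (n1 - 1) + (s2sq/n2)^2 / (n2 - 1))
c(t_pooled = t_pooled, df_pooled = df_pooled, t_welch = t_welch, df_welch = df_welch)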
curve_t <- function(df = 9, type = c("left", "right", "two"),
                    alpha = 0.05, t_value = NULL){
  type <- match.arg(type)
  xs <- seq(-5, 5, length.out = 1000)
  ys <- dt(xs, df)
  plot(xs, ys, type = "l", lwd = 2, col = "blue",
       ylab = "Density", xlab = "t",
       main = paste0("t (df=", df, ") — ", type, "-tailed"))
  if(type == "left"){
    tcrit <- qt(alpha, df)
    polygon(c(min(xs), xs[xs <= tcrit], tcrit),
            c(0, ys[xs <= tcrit], 0), col = rgb(1, 0, 0, 0.5), border = NA)
    abline(v = tcrit, lty = 2)
  } else if(type == "right"){
    tcrit <- qt(1 - alpha, df)
    polygon(c(tcrit, xs[xs >= tcrit], max(xs)),
            c(0, ys[xs >= tcrit], 0), col = rgb(1, 0, 0, 0.5), border = NA)
    abline(v = tcrit, lty = 2)
  } else {
    tcrit <- qt(1 - alpha/2, df)
    xl <- -tcrit; xr <- tcrit
    polygon(c(min(xs), xs[xs <= xl], xl),
            c(0, ys[xs <= xl], 0), col = rgb(1, 0, 0, 0.5), border = NA)
    polygon(c(xr, xs[xs >= xr], max(xs)),
            c(0, ys[xs >= xr], 0), col = rgb(1, 0, 0, 0.5), border = NA)
    abline(v = xl, lty = 2); abline(v = xr, lty = 2)
  }
  if(!is.null(t_value)){
    abline(v = t_value, col = "purple", lwd = 2, lty = 3)
    # tcrit already holds the critical value for the chosen tail(s)
    legend("topright",
           legend = c(sprintf("t_crit = %.3f", tcrit),
                      sprintf("t_value = %.3f", t_value)),
           bty = "n")
  }
}
par(mfrow=c(1,3))
curve_t(df=9, type="left", alpha=0.05, t_value=-2.5)
curve_t(df=9, type="right", alpha=0.05, t_value= 2.3)
curve_t(df=9, type="two", alpha=0.05, t_value= 2.5)
par(mfrow=c(1,1))
# From F and df:
p_from_F <- function(Fval, df1, df2) 1 - pf(Fval, df1, df2)
# From t and df (two-sided):
p_from_t_two <- function(tval, df) 2 * (1 - pt(abs(tval), df))
# From t and df (right- or left-sided):
p_from_t_right <- function(tval, df) 1 - pt(tval, df)
p_from_t_left <- function(tval, df) pt(tval, df)
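Example usage with the Levene F from above and an arbitrary t value (both just illustrations):
p_from_F(0.02145, df1 = 1, df2 = 8)   # ≈ 0.887, same as the leveneTest() p-value
p_from_t_two(2.5, df = 8)             # two-sided p for t = 2.5, df = 8
p_from_t_right(2.5, df = 8)           # right-tailed p for the same t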