library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
# Absolute path of the CSV file that holds the smoking/drinking dataset.
data_path <- "C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
# Read the CSV into a data frame with read.csv()'s default settings.
data <- read.csv(file = data_path)
Null Hypothesis (H0): There is no significant difference in mean cholesterol levels between drinkers and non-drinkers among females.
Alternative Hypothesis (H1): There is a significant difference in mean cholesterol levels between drinkers and non-drinkers among females.
For this test:
Alpha Level (Significance Level): We chose 0.05. It represents the acceptable risk of making a Type I error, which means wrongly concluding that there is an effect when there isn’t one. In simpler terms, we want to be reasonably sure that our results are not due to chance alone, so we set this threshold at 5% or 0.05.
Power Level: The power level of 0.80 is often chosen as a standard because it represents an 80% chance of correctly detecting a real effect if it exists.
Minimum Effect Size: In this case, we selected 0.5, which is considered a moderate effect size. It’s like saying we want to be able to detect differences that are big enough to matter in practical terms.
library(pwr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
library(pwr)
# A-priori power analysis for a two-sided, two-sample t-test.
alpha <- 0.05        # significance level passed to pwr.t.test()
power <- 0.80        # target power passed to pwr.t.test()
effect_size <- 0.5   # effect size `d` passed to pwr.t.test()
sample_size <- pwr.t.test(
  d = effect_size,
  sig.level = alpha,
  power = power,
  type = "two.sample",
  alternative = "two.sided"
)
# Check if the sample size is met.
# For type = "two.sample", pwr.t.test() reports the required number of
# observations *per group* in `$n`. It therefore has to be compared with the
# number of rows available in each DRK_YN group, not with the number of
# distinct DRK_YN labels.
required_n <- ceiling(sample_size$n)
group_sizes <- table(data$DRK_YN)
if (length(group_sizes) >= 2 && all(group_sizes >= required_n)) {
  cat("Sample size is met. We can perform the Neyman-Pearson test.\n")
} else {
  cat("Sample size is not met. The dataset doesn't have enough observations in each drinking status group to perform the test.\n")
}
## Sample size is met. We can perform the Neyman-Pearson test.
# Boxplot of tot_chole (x axis) for each DRK_YN value (y axis).
ggplot(data = data, mapping = aes(x = tot_chole, y = DRK_YN)) +
  geom_boxplot() +
  labs(
    title = "Cholestral effect on drinkers and non-drinkers",
    x = "cholestral levels [mg/dL]",
    y = "Drinkers VS Non-Drinkers"
  ) +
  theme_minimal()
It looks like the average cholesterol for drinkers is slightly higher than for non-drinkers.
# Mean of tot_chole for each DRK_YN group, with rows sorted by DRK_YN.
avg_cholestral <- data |>
  group_by(DRK_YN) |>
  summarise(avg_cholestral = mean(tot_chole)) |>
  arrange(DRK_YN)
# Show the summary table.
print(avg_cholestral)
## # A tibble: 2 × 2
## DRK_YN avg_cholestral
## <chr> <dbl>
## 1 N 195.
## 2 Y 196.
# Second group mean minus first group mean of avg_cholestral
# (row 2 minus row 1 of the summary table).
observed_diff <- diff(avg_cholestral$avg_cholestral[1:2])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 1.52476774577156"
# Sample-size calculation for a two-sided independent-samples t-test (pwrss).
# sd1 is the sample standard deviation of tot_chole over the whole dataset.
# NOTE(review): alpha = 0.1 and power = 0.85 here differ from the `alpha`
# (0.05) and `power` (0.80) variables defined for pwr.t.test() earlier in
# this file, and mu1 = 100 is a hard-coded mean -- confirm these are intended.
test <- pwrss.t.2means(
  mu1 = 100,
  sd1 = sd(data[["tot_chole"]]),
  kappa = 1,
  alpha = 0.1,
  power = 0.85,
  alternative = "not equal"
)
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.85
## n1 = 4
## n2 = 4
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 6
## Non-centrality parameter = 3.658
## Type I error rate = 0.1
## Type II error rate = 0.15
# Plot the result object returned by pwrss.t.2means() above.
plot(test)
We can see that the probability of correctly rejecting the null hypothesis when the alternative hypothesis is true (the power) is 94%.
There is a 10% probability of rejecting the null hypothesis when it is true (Type I error).
There is a 6% probability of failing to reject the null hypothesis when it is false (Type II error).
# Drop every row that contains a missing value in any column.
data <- na.omit(data)
# Subset data for Female individuals
female_data <- data[data$sex == "Female", ]
condition1 <- female_data$DRK_YN == "Y"
condition2 <- female_data$DRK_YN == "N"
# Extract tot_chole as plain numeric vectors *before* testing, so that
# t.test(), boxplot() and the later var.test() all receive the same cleaned
# input. which() drops NA conditions, matching what subset() would do.
cholesterol_drinkers <- as.numeric(female_data$tot_chole[which(condition1)])
cholesterol_drinkers <- cholesterol_drinkers[!is.na(cholesterol_drinkers)]
cholesterol_non_drinkers <- as.numeric(female_data$tot_chole[which(condition2)])
cholesterol_non_drinkers <- cholesterol_non_drinkers[!is.na(cholesterol_non_drinkers)]
# Check the sizes of the two groups
print(length(cholesterol_drinkers))
## [1] 141171
print(length(cholesterol_non_drinkers))
## [1] 323760
# Check if there are enough observations for the t-test
if (length(cholesterol_drinkers) >= 2 && length(cholesterol_non_drinkers) >= 2) {
  # Perform t-test for Total Cholesterol between Female Drinkers and Non-Drinkers
  t_test_result <- t.test(cholesterol_drinkers, cholesterol_non_drinkers)
  boxplot_data <- list(
    Alcoholic_Female = cholesterol_drinkers,
    Non_alcoholic_Female = cholesterol_non_drinkers
  )
  # Create a boxplot
  boxplot(boxplot_data,
          main = "Total Cholesterol Levels by Drinking Status among Females",
          xlab = "Drinking Status",
          ylab = "Total Cholesterol (mg/dL)")
  # Print t-test results
  cat("T-Test Result for Total Cholesterol Levels between Female Drinkers and Non-Drinkers:\n")
  print(t_test_result)
} else {
  cat("Not enough observations for the t-test.\n")
}
## T-Test Result for Total Cholesterol Levels between Female Drinkers and Non-Drinkers:
##
## Welch Two Sample t-test
##
## data: cholesterol_drinkers and cholesterol_non_drinkers
## t = -23.439, df = 286406, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.023485 -2.556857
## sample estimates:
## mean of x mean of y
## 194.5373 197.3275
Since the p-value is significantly less than any reasonable significance level, it provides strong evidence against the null hypothesis. In statistical hypothesis testing, a small p-value suggests that the observed differences between the groups are highly unlikely to have occurred by chance.
Therefore, in this case, we should reject the null hypothesis and conclude that there is a significant difference in mean cholesterol levels between drinkers and non-drinkers among females. The evidence from the t-test supports the alternative hypothesis, indicating that the difference in mean cholesterol levels is real and not due to random variability.
# F test comparing the variances of cholesterol_drinkers and
# cholesterol_non_drinkers.
var_test_result <- var.test(
  x = cholesterol_drinkers,
  y = cholesterol_non_drinkers
)
# Show the outcome of the variance test.
print(var_test_result)
##
## F test to compare two variances
##
## data: cholesterol_drinkers and cholesterol_non_drinkers
## F = 0.87211, num df = 141170, denom df = 323759, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.8644363 0.8798562
## sample estimates:
## ratio of variances
## 0.8721054
The p-value from the F-test is extremely small (p-value < 2.2e-16), indicating strong evidence against the null hypothesis. Therefore, we reject the null hypothesis. The F-test suggests that there is a significant difference in the variances of cholesterol levels between drinkers and non-drinkers among females.
In conclusion, based on the results of both the t-test and the F-test, we can confidently state that there is a significant difference in both the mean and the variance of cholesterol levels between drinkers and non-drinkers among females.
Hypothesis 2: Impact of Smoking Status on Blood Pressure
Null Hypothesis (H0): There is no significant difference in mean systolic blood pressure (SBP) between smokers and non-smokers.
Alternative Hypothesis (H1): There is a significant difference in mean SBP between smokers and non-smokers.
For this hypothesis:
Alpha Level (Significance Level): Let’s choose an alpha level of 0.05, indicating a 5% chance of making a Type I error.
Power Level: We aim for a power of 0.80 or higher, indicating an 80% chance of detecting a true effect if it exists.
Minimum Effect Size: We need to determine the minimum effect size that is practically meaningful in this context. This could be, for example, a difference of 5 mm Hg in SBP between smokers and non-smokers.
# Subset data for Male individuals
male_data <- data[data$sex == "Male", ]
# Build the smoking-status masks from male_data itself so that they line up
# row-for-row with the data frame they filter. (Masks built from another
# subset have a different length and get recycled, selecting unrelated rows.)
# Code 1 is treated as "non-smoker" and code 3 as "smoker", as the variable
# names below indicate -- TODO confirm against the dataset codebook.
condition1 <- male_data$SMK_stat_type_cd == 1
condition2 <- male_data$SMK_stat_type_cd == 3
# Extract SBP as plain numeric vectors; which() drops NA conditions.
systolic_non_smokers <- as.numeric(male_data$SBP[which(condition1)])
systolic_non_smokers <- systolic_non_smokers[!is.na(systolic_non_smokers)]
systolic_smokers <- as.numeric(male_data$SBP[which(condition2)])
systolic_smokers <- systolic_smokers[!is.na(systolic_smokers)]
# Check the sizes of the two groups
# NOTE: the counts and test output previously recorded for this chunk were
# produced with masks taken from female_data; re-run to refresh them.
print(length(systolic_smokers))
print(length(systolic_non_smokers))
# Check if there are enough observations for the t-test
if (length(systolic_smokers) >= 2 && length(systolic_non_smokers) >= 2) {
  # Perform t-test for systolic blood pressure between male smokers and non-smokers
  t_test_result <- t.test(systolic_smokers, systolic_non_smokers)
  boxplot_data <- list(
    smokers_male = systolic_smokers,
    Non_smokers_male = systolic_non_smokers
  )
  # Create a boxplot
  boxplot(boxplot_data,
          main = "Total systolic Blood Pressure Levels by smoking Status among males",
          xlab = "smoking Status",
          ylab = "Total systolic blood Pressure")
  # Print t-test results
  cat("T-Test Result for Total systolic blood pressure Levels between male smokers and Non-smokers:\n")
  print(t_test_result)
} else {
  cat("Not enough observations for the t-test.\n")
}
## T-Test Result for Total systolic blood pressure Levels between male smokers and Non-smokers:
##
## Welch Two Sample t-test
##
## data: systolic_smokers and systolic_non_smokers
## t = 0.52403, df = 19769, p-value = 0.6003
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1468081 0.2539506
## sample estimates:
## mean of x mean of y
## 124.8941 124.8405
The p-value is 0.6003, which is greater than the chosen alpha level of 0.05. This means that we fail to reject the null hypothesis.
# F test comparing the variances of systolic_smokers and systolic_non_smokers.
var_test_result <- var.test(
  x = systolic_smokers,
  y = systolic_non_smokers
)
# Show the outcome of the variance test.
print(var_test_result)
##
## F test to compare two variances
##
## data: systolic_smokers and systolic_non_smokers
## F = 1.0037, num df = 18386, denom df = 495624, p-value = 0.727
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9834724 1.0245147
## sample estimates:
## ratio of variances
## 1.003677
The p-value from the F-test is 0.727, which is greater than the chosen alpha level of 0.05. Therefore, we fail to reject the null hypothesis. The F-test suggests that there is no significant difference in the variances of SBP between smokers and non-smokers. In summary, based on the results of the F-test, we do not find strong evidence to suggest a significant difference in the variances of SBP between smokers and non-smokers.