data_path="C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
data= read.csv(data_path)
library(gplots)
## Warning: package 'gplots' was built under R version 4.2.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3

Finding correlation between alcohol drinkers and the other health indicators

selected_columns <- c("DRK_YN", "SBP", "DBP", "tot_chole", "hemoglobin", "gamma_GTP","age","height","weight","HDL_chole","LDL_chole","triglyceride")
selected_data <- data[, selected_columns]
selected_data$DRK_YN <- as.numeric(selected_data$DRK_YN == "Y")
correlation_matrix <- cor(selected_data, use = "complete.obs")
print(correlation_matrix)
##                   DRK_YN         SBP         DBP   tot_chole hemoglobin
## DRK_YN        1.00000000  0.03314876  0.10087576  0.01972015  0.2993408
## SBP           0.03314876  1.00000000  0.74113088  0.06855719  0.1665302
## DBP           0.10087576  0.74113088  1.00000000  0.11191493  0.2419801
## tot_chole     0.01972015  0.06855719  0.11191493  1.00000000  0.1212718
## hemoglobin    0.29934081  0.16653022  0.24198015  0.12127179  1.0000000
## gamma_GTP     0.20509398  0.16143364  0.17560997  0.09453711  0.2262180
## age          -0.28458709  0.26552985  0.10884678  0.01144622 -0.1730808
## height        0.37456561  0.03503012  0.10877957 -0.02323965  0.5318984
## weight        0.26428197  0.25077043  0.27789070  0.06323790  0.4994906
## HDL_chole     0.04251386 -0.11177157 -0.09383848  0.16386850 -0.1818647
## LDL_chole    -0.04369311  0.03361923  0.06698422  0.87736727  0.1016460
## triglyceride  0.10439795  0.18600294  0.19865096  0.27068333  0.2416452
##                 gamma_GTP         age      height      weight   HDL_chole
## DRK_YN        0.205093984 -0.28458709  0.37456561  0.26428197  0.04251386
## SBP           0.161433643  0.26552985  0.03503012  0.25077043 -0.11177157
## DBP           0.175609972  0.10884678  0.10877957  0.27789070 -0.09383848
## tot_chole     0.094537112  0.01144622 -0.02323965  0.06323790  0.16386850
## hemoglobin    0.226217983 -0.17308078  0.53189841  0.49949058 -0.18186469
## gamma_GTP     1.000000000  0.01739127  0.16233979  0.22188103 -0.05570872
## age           0.017391274  1.00000000 -0.39850118 -0.19533671 -0.10462419
## height        0.162339793 -0.39850118  1.00000000  0.66882349 -0.14859911
## weight        0.221881026 -0.19533671  0.66882349  1.00000000 -0.28768815
## HDL_chole    -0.055708725 -0.10462419 -0.14859911 -0.28768815  1.00000000
## LDL_chole    -0.008543624  0.02949652 -0.01545036  0.06785867  0.02208907
## triglyceride  0.299026552  0.04354936  0.13761061  0.28377397 -0.26836597
##                 LDL_chole triglyceride
## DRK_YN       -0.043693107   0.10439795
## SBP           0.033619232   0.18600294
## DBP           0.066984222   0.19865096
## tot_chole     0.877367272   0.27068333
## hemoglobin    0.101645958   0.24164518
## gamma_GTP    -0.008543624   0.29902655
## age           0.029496516   0.04354936
## height       -0.015450360   0.13761061
## weight        0.067858670   0.28377397
## HDL_chole     0.022089065  -0.26836597
## LDL_chole     1.000000000   0.02957086
## triglyceride  0.029570857   1.00000000
heatmap(correlation_matrix, 
        col = colorRampPalette(c("blue", "white", "red"))(20), 
        main = "Correlation Heatmap")

        #xlab = "Variables", ylab = "Variables")

Modeling the probability of being a drinker based on systolic blood pressure (SBP), diastolic blood pressure (DBP), total cholesterol (tot_chole), and gamma-Glutamyl Transferase (gamma_GTP).

selected_columns <- c("SBP", "DBP", "tot_chole", "gamma_GTP", "DRK_YN")

# Create a new data frame with selected columns
selected_data <- data[, selected_columns]

# Convert drinking column to numeric (assuming it is coded as Y/N)
selected_data$DRK_YN <- as.numeric(selected_data$DRK_YN == "Y")

# Fit logistic regression model
logistic_model <- glm(DRK_YN ~ SBP + DBP + tot_chole + gamma_GTP, data = selected_data, family = binomial)

# Print model summary
summary(logistic_model)
## 
## Call:
## glm(formula = DRK_YN ~ SBP + DBP + tot_chole + gamma_GTP, family = binomial, 
##     data = selected_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.9971  -1.0805  -0.7061   1.1850   2.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.273e-01  2.066e-02  -20.68   <2e-16 ***
## SBP         -1.701e-02  2.166e-04  -78.54   <2e-16 ***
## DBP          2.804e-02  3.217e-04   87.13   <2e-16 ***
## tot_chole   -1.137e-03  5.549e-05  -20.49   <2e-16 ***
## gamma_GTP    1.803e-02  9.013e-05  200.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1374297  on 991345  degrees of freedom
## Residual deviance: 1298044  on 991341  degrees of freedom
## AIC: 1298054
## 
## Number of Fisher Scoring iterations: 5

Findings:

Individuals with higher systolic blood pressure are less likely to be drinkers

Individuals with higher total cholesterol are less likely to be drinkers compared to those with lower total cholesterol.

Possible reasons:

  1. Health consciousness : They might be actively managing their health, which could include lifestyle choices such as abstaining from alcohol consumption.

  2. Medical Advice: these induviduals may receive medical advice to limit or avoid alcohol consumption.

  3. Co-occurring Health Conditions: High blood pressure and elevated cholesterol levels are often associated with other health conditions, such as cardiovascular diseases. Individuals with these conditions may be more inclined to adopt healthier habits

The model suggests a relationship between health indicators and drinking habits, but it doesn’t provide evidence for causation.

# Predict probabilities
selected_data$predicted_prob <- predict(logistic_model, type = "response")
sum_prob <- sum(selected_data$predicted_prob)

# Calculate the percentage
percentage_drinker <- (sum_prob / length(selected_data$predicted_prob)) * 100

# Print the result
print(paste("Percentage of probability that someone is a drinker:", round(percentage_drinker, 2), "%"))
## [1] "Percentage of probability that someone is a drinker: 49.98 %"
# Assuming you have a data frame 'selected_data' with actual and predicted values
actual_values <- selected_data$DRK_YN

# Use the model to predict on the same dataset
predicted_probabilities <- predict(logistic_model, newdata = selected_data, type = "response")

# Convert predicted probabilities to binary predictions (0 or 1)
predicted_values <- ifelse(predicted_probabilities > 0.5, 1, 0)

# Create a confusion matrix
conf_matrix <- table(actual_values, predicted_values)

# Extract TP, TN, FP, FN from the confusion matrix
TP <- conf_matrix[2, 2]
TN <- conf_matrix[1, 1]
FP <- conf_matrix[1, 2]
FN <- conf_matrix[2, 1]

# Calculate accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)

# Print accuracy
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.6253639

Most ocuring and least occuring

length(data$sex)
## [1] 991346
length(data$DRK_YN)
## [1] 991346
length(data$Hypertension)
## [1] 0
length(data$Hypotension)
## [1] 0
length(data$SMK_stat_type_cd)
## [1] 991346
cross_table <- table(data$sex, data$DRK_YN,data$SMK_stat_type_cd)

cross_table_df <- as.data.frame(cross_table)
cross_table_sum <- rowSums(cross_table)
cross_table_normalized <- scale(cross_table_sum)
print(cross_table_df)
##      Var1 Var2 Var3   Freq
## 1  Female    N    1 313180
## 2    Male    N    1  75830
## 3  Female    Y    1 124580
## 4    Male    Y    1  88851
## 5  Female    N    2   4430
## 6    Male    N    2  50041
## 7  Female    Y    2   6493
## 8    Male    Y    2 113987
## 9  Female    N    3   6150
## 10   Male    N    3  46227
## 11 Female    Y    3  10098
## 12   Male    Y    3 151479
df_sorted <- cross_table_df[order(-cross_table_df$Freq), ]
top_5_most_occuring <- head(df_sorted, 5)
top_5_least_occuring <- tail(df_sorted, 5)
print(top_5_least_occuring)
##      Var1 Var2 Var3  Freq
## 10   Male    N    3 46227
## 11 Female    Y    3 10098
## 7  Female    Y    2  6493
## 9  Female    N    3  6150
## 5  Female    N    2  4430
print(top_5_most_occuring)
##      Var1 Var2 Var3   Freq
## 1  Female    N    1 313180
## 12   Male    Y    3 151479
## 3  Female    Y    1 124580
## 8    Male    Y    2 113987
## 4    Male    Y    1  88851

Least occurring :

Male, Not a Drinker, still smoke

Female, Drinker, still smoke

Female, Drinker, used to smoke but quit

Female, Not a Drinker, still smoke

Female, Not a Drinker, used to smoke but quit

Most occuring

Female, Not a Drinker,Never Smoked

Male, Drinker,Still Smoke

Female, Drinker,Never Smoked

Male, Drinker,Used to Smoke but Quit

Male, Drinker, Never Smoked

Inference;

  1. Common Trends:

Non-Drinkers and Smoking:

Non-drinkers, particularly females, are less likely to engage in smoking. This is evident in the combinations where females who do not drink are less likely to smoke, whether currently or in the past. Drinking and Smoking:

Males who are drinkers are often observed to be smokers, whether they currently smoke or have quit in the past. This suggests a connection between drinking and smoking behaviors among males.

  1. Less Common Trends:

Females Quitting Smoking:

Females who quit smoking, especially those who are drinkers, are less common. This could indicate that females who quit smoking may also moderate their alcohol consumption. Non-Drinking Males Smoking:

Males who do not drink but still smoke are relatively uncommon. This supports the idea that individuals who abstain from drinking may also avoid other unhealthy behaviors like smoking.

  1. Prevalent Patterns:

Health-Conscious Group:

The most common combination involves females who do not drink and have never smoked. This may represent a health-conscious or risk-averse subgroup. Drinking and Smoking Among Males:

Males who are both drinkers and smokers, whether current or former, form a prevalent group. This suggests a subgroup where drinking and smoking behaviors tend to coexist among males.

library(ggplot2)

# Create a data frame for plotting
plot_data <- as.data.frame(cross_table_df)
plot_data$Combination <- interaction(plot_data$Var2, plot_data$Var3)

# Plot the bar chart
ggplot(plot_data, aes(x = Combination, y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Frequency of Combinations by Gender, Drinking, and Smoking Status",
       x = "Combination",
       y = "Frequency",
       fill = "Gender") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Hypothesis testing:

H0 (null hypothesis): There is no difference in mean systolic blood pressure between drinkers and non-drinkers.

H1 (alternative hypothesis): The mean systolic blood pressure is greater for drinkers than non-drinkers.

drinkers <- data[data$DRK_YN == "Y", ]
non_drinkers <- data[data$DRK_YN == "N", ]

t_test_result <- t.test(drinkers$SBP, non_drinkers$SBP, alternative = "greater")

print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  drinkers$SBP and non_drinkers$SBP
## t = 33.024, df = 988971, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.9161503       Inf
## sample estimates:
## mean of x mean of y 
##  122.9148  121.9506

Based on the results of the Welch Two Sample t-test, there is strong evidence to suggest that drinkers have a significantly higher mean systolic blood pressure than non-drinkers.

library(ggplot2)

# Create a boxplot
ggplot(data, aes(x = DRK_YN, y = SBP, fill = DRK_YN)) +
  geom_boxplot() +
  labs(title = "Boxplot of SBP by Drinking Status",
       x = "Drinking Status",
       y = "Systolic Blood Pressure") +
  theme_minimal()

Even though the boxplot might not visually show a significant difference, the statistical test takes into account the entire distribution and sample size, providing a more sensitive measure.

drinkers = data[data$DRK_YN=="Y",]
non_drinkers= data[data$DRK_YN=="N",]
mean_drinkers <- mean(drinkers$tot_chole)
se_drinkers <- sd(drinkers$tot_chole) / sqrt(length(drinkers$tot_chole))

mean_non_drinkers <- mean(non_drinkers$tot_chole)
se_non_drinkers <- sd(non_drinkers$tot_chole) / sqrt(length(non_drinkers$tot_chole))


ci_drinkers <- qnorm(c(0.025, 0.975), mean_drinkers, se_drinkers)
ci_non_drinkers <- qnorm(c(0.025, 0.975), mean_non_drinkers, se_non_drinkers)


print(paste("Drinkers Mean cholesterol: ", round(mean_drinkers, 2), " (", round(ci_drinkers[1], 2), "-", round(ci_drinkers[2], 2), ")"))
## [1] "Drinkers Mean cholesterol:  196.32  ( 196.21 - 196.43 )"
print(paste("Non-Drinkers Mean cholesterol: ", round(mean_non_drinkers, 2), " (", round(ci_non_drinkers[1], 2), "-", round(ci_non_drinkers[2], 2), ")"))
## [1] "Non-Drinkers Mean cholesterol:  194.79  ( 194.69 - 194.9 )"

Findings from confidence intervals:

The mean cholesterol level is higher in drinkers compared to non-drinkers

The confidence intervals for drinkers and non-drinkers do not overlap, suggesting a potential significant difference in mean cholesterol levels between the two groups.

data_plot <- data.frame(
  Group = c("Drinkers", "Non-Drinkers"),
  Mean_Cholesterol = c(196.32, 194.79),
  Lower_CI = c(196.21, 194.69),
  Upper_CI = c(196.43, 194.9)
)

# Create the bar plot
ggplot(data_plot, aes(x = Group, y = Mean_Cholesterol, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7, color = "black", alpha = 0.7) +
  geom_point(position = position_dodge(width = 0.7), size = 3, shape = 21, fill = "white") +
  geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), position = position_dodge(width = 0.7), width = 0.25) +
  labs(title = "Mean Cholesterol Levels of Drinkers and Non-Drinkers",
       y = "Mean Cholesterol",
       x = "Group") +
  theme_minimal()

Using ANOVA to find the impact of smoking on triglycerdie values

health_variable <- data$triglyceride
smoking_status <- factor(data$SMK_stat_type_cd)

# Perform ANOVA
anova_result <- aov(health_variable ~ smoking_status, data = data)

# Print ANOVA table
print(summary(anova_result))
##                    Df    Sum Sq   Mean Sq F value Pr(>F)    
## smoking_status      2 4.881e+08 244066328   24525 <2e-16 ***
## Residuals      991343 9.866e+09      9952                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Findings:

The p-value (Pr(>F)) is extremely small (less than 0.05), indicating that we can reject the null hypothesis. Therefore, there is strong evidence to suggest that there is a significant difference in mean triglyceride values among different smoking statuses.

# Perform Tukey's HSD test
tukey_result <- TukeyHSD(anova_result)

# Print Tukey's HSD results
print(tukey_result)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = health_variable ~ smoking_status, data = data)
## 
## $smoking_status
##         diff      lwr      upr p adj
## 2-1 32.36302 31.72804 32.99800     0
## 3-1 52.70392 52.11550 53.29234     0
## 3-2 20.34090 19.58727 21.09453     0

Interpretations:

The Tukey HSD results show that all pairwise differences between smoking statuses are significant (p-value = 0).

Specifically, individuals who currently smoke (group 3) have significantly higher triglyceride values compared to both those who never smoked (group 1) and those who used to smoke but quit (group 2).

The difference in triglyceride values between those who never smoked and those who used to smoke but quit is also significant.

Results:

Smoking appears to have a significant impact on triglyceride values, with current smokers having higher triglyceride levels compared to both non-smokers and former smokers.

# Assuming 'health_outcome' is a continuous variable, 'smoking' is binary, and 'drinking' is a factor with 3 levels
combined_model <- lm(gamma_GTP ~ SMK_stat_type_cd * DRK_YN, data = data)
summary(combined_model)
## 
## Call:
## lm(formula = gamma_GTP ~ SMK_stat_type_cd * DRK_YN, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -62.54 -16.99  -9.66   2.46 974.19 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               18.6339     0.1546  120.55   <2e-16 ***
## SMK_stat_type_cd           6.1802     0.1048   58.96   <2e-16 ***
## DRK_YNY                   -0.4192     0.2266   -1.85   0.0643 .  
## SMK_stat_type_cd:DRK_YNY   9.2614     0.1316   70.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.36 on 991342 degrees of freedom
## Multiple R-squared:  0.08025,    Adjusted R-squared:  0.08025 
## F-statistic: 2.883e+04 on 3 and 991342 DF,  p-value: < 2.2e-16

Findings:

When SMK_stat_type_cd is 3 and DRK_YN is Y, the estimated value for gamma_GTP is approximately 64.54.

when SMK_stat_type_cd is 2 and DRK_YN is Y, the estimated value for gamma_GTP is approximately 49.10.

When SMK_stat_type_cd is 1 and DRK_YN is Y, the estimated value for gamma_GTP is approximately 33.65.

when SMK_stat_type_cd is 3 and DRK_YN is N, the estimated value for gamma_GTP is approximately 49.52.

When SMK_stat_type_cd is 2 and DRK_YN is N, the estimated value for gamma_GTP is approximately 43.25.

when SMK_stat_type_cd is 1 and DRK_YN is N, the estimated value for gamma_GTP is approximately 27.81.

Results;

Individuals who both smoke (especially at higher levels) and drink (DRK_YN = Y) tend to have the highest estimated gamma_GTP.

The combined effect of smoking and drinking seems to have a more pronounced impact on increasing gamma_GTP compared to smoking or drinking alone.

The combined effect of smoking and drinking may have a synergistic impact on elevating gamma_GTP levels, indicating a potentially higher risk of cardiovascular disease.

estimated_values <- data.frame(
  Combination = c("3,Y", "2,Y", "1,Y", "3,N", "2,N", "1,N"),
  Estimated_Value = c(64.54, 49.10, 33.65, 49.52, 43.25, 27.81)
)

# Plotting the bar plot
barplot(estimated_values$Estimated_Value, names.arg = estimated_values$Combination,
        main = "Estimated Values of gamma_GTP",
        ylab = "Estimated Value",
        col = "skyblue",
        border = "black",
        ylim = c(0, max(estimated_values$Estimated_Value) + 10))

Corelation and covariance

subset_data <- data[, c("SMK_stat_type_cd", "DRK_YN", "SBP", "DBP", "tot_chole", "hemoglobin", "gamma_GTP")]
subset_data[] <- lapply(subset_data, as.numeric)
## Warning in lapply(subset_data, as.numeric): NAs introduced by coercion
# Calculate covariance matrix
cov_matrix <- cov(subset_data)

# Calculate correlation matrix
cor_matrix <- cor(subset_data)

# Display the covariance matrix
print("Covariance Matrix:")
## [1] "Covariance Matrix:"
print(cov_matrix)
##                  SMK_stat_type_cd DRK_YN        SBP        DBP    tot_chole
## SMK_stat_type_cd        0.6699538     NA   1.010831   1.026779    0.3744339
## DRK_YN                         NA     NA         NA         NA           NA
## SBP                     1.0108305     NA 211.503153 106.591298   38.5456210
## DBP                     1.0267788     NA 106.591298  97.799547   42.7878096
## tot_chole               0.3744339     NA  38.545621  42.787810 1494.6075577
## hemoglobin              0.5881647     NA   3.838497   3.792782    7.4307580
## gamma_GTP              10.0529775     NA 118.383474  87.570172  184.2911693
##                  hemoglobin  gamma_GTP
## SMK_stat_type_cd  0.5881647   10.05298
## DRK_YN                   NA         NA
## SBP               3.8384970  118.38347
## DBP               3.7927822   87.57017
## tot_chole         7.4307580  184.29117
## hemoglobin        2.5119991   18.07904
## gamma_GTP        18.0790446 2542.59516
# Display the correlation matrix
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
##                  SMK_stat_type_cd DRK_YN        SBP       DBP  tot_chole
## SMK_stat_type_cd       1.00000000     NA 0.08491756 0.1268487 0.01183285
## DRK_YN                         NA      1         NA        NA         NA
## SBP                    0.08491756     NA 1.00000000 0.7411309 0.06855719
## DBP                    0.12684871     NA 0.74113088 1.0000000 0.11191493
## tot_chole              0.01183285     NA 0.06855719 0.1119149 1.00000000
## hemoglobin             0.45338460     NA 0.16653022 0.2419801 0.12127179
## gamma_GTP              0.24357554     NA 0.16143364 0.1756100 0.09453711
##                  hemoglobin  gamma_GTP
## SMK_stat_type_cd  0.4533846 0.24357554
## DRK_YN                   NA         NA
## SBP               0.1665302 0.16143364
## DBP               0.2419801 0.17560997
## tot_chole         0.1212718 0.09453711
## hemoglobin        1.0000000 0.22621798
## gamma_GTP         0.2262180 1.00000000

It seems like there are no extremely high correlations (near 1 or -1) between smoking, drinking, and other health factors.