data_path="C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
data= read.csv(data_path)
library(gplots)
## Warning: package 'gplots' was built under R version 4.2.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
selected_columns <- c("DRK_YN", "SBP", "DBP", "tot_chole", "hemoglobin", "gamma_GTP","age","height","weight","HDL_chole","LDL_chole","triglyceride")
selected_data <- data[, selected_columns]
selected_data$DRK_YN <- as.numeric(selected_data$DRK_YN == "Y")
correlation_matrix <- cor(selected_data, use = "complete.obs")
print(correlation_matrix)
## DRK_YN SBP DBP tot_chole hemoglobin
## DRK_YN 1.00000000 0.03314876 0.10087576 0.01972015 0.2993408
## SBP 0.03314876 1.00000000 0.74113088 0.06855719 0.1665302
## DBP 0.10087576 0.74113088 1.00000000 0.11191493 0.2419801
## tot_chole 0.01972015 0.06855719 0.11191493 1.00000000 0.1212718
## hemoglobin 0.29934081 0.16653022 0.24198015 0.12127179 1.0000000
## gamma_GTP 0.20509398 0.16143364 0.17560997 0.09453711 0.2262180
## age -0.28458709 0.26552985 0.10884678 0.01144622 -0.1730808
## height 0.37456561 0.03503012 0.10877957 -0.02323965 0.5318984
## weight 0.26428197 0.25077043 0.27789070 0.06323790 0.4994906
## HDL_chole 0.04251386 -0.11177157 -0.09383848 0.16386850 -0.1818647
## LDL_chole -0.04369311 0.03361923 0.06698422 0.87736727 0.1016460
## triglyceride 0.10439795 0.18600294 0.19865096 0.27068333 0.2416452
## gamma_GTP age height weight HDL_chole
## DRK_YN 0.205093984 -0.28458709 0.37456561 0.26428197 0.04251386
## SBP 0.161433643 0.26552985 0.03503012 0.25077043 -0.11177157
## DBP 0.175609972 0.10884678 0.10877957 0.27789070 -0.09383848
## tot_chole 0.094537112 0.01144622 -0.02323965 0.06323790 0.16386850
## hemoglobin 0.226217983 -0.17308078 0.53189841 0.49949058 -0.18186469
## gamma_GTP 1.000000000 0.01739127 0.16233979 0.22188103 -0.05570872
## age 0.017391274 1.00000000 -0.39850118 -0.19533671 -0.10462419
## height 0.162339793 -0.39850118 1.00000000 0.66882349 -0.14859911
## weight 0.221881026 -0.19533671 0.66882349 1.00000000 -0.28768815
## HDL_chole -0.055708725 -0.10462419 -0.14859911 -0.28768815 1.00000000
## LDL_chole -0.008543624 0.02949652 -0.01545036 0.06785867 0.02208907
## triglyceride 0.299026552 0.04354936 0.13761061 0.28377397 -0.26836597
## LDL_chole triglyceride
## DRK_YN -0.043693107 0.10439795
## SBP 0.033619232 0.18600294
## DBP 0.066984222 0.19865096
## tot_chole 0.877367272 0.27068333
## hemoglobin 0.101645958 0.24164518
## gamma_GTP -0.008543624 0.29902655
## age 0.029496516 0.04354936
## height -0.015450360 0.13761061
## weight 0.067858670 0.28377397
## HDL_chole 0.022089065 -0.26836597
## LDL_chole 1.000000000 0.02957086
## triglyceride 0.029570857 1.00000000
heatmap(correlation_matrix,
col = colorRampPalette(c("blue", "white", "red"))(20),
main = "Correlation Heatmap")
#xlab = "Variables", ylab = "Variables")
selected_columns <- c("SBP", "DBP", "tot_chole", "gamma_GTP", "DRK_YN")
# Create a new data frame with selected columns
selected_data <- data[, selected_columns]
# Convert drinking column to numeric (assuming it is coded as Y/N)
selected_data$DRK_YN <- as.numeric(selected_data$DRK_YN == "Y")
# Fit logistic regression model
logistic_model <- glm(DRK_YN ~ SBP + DBP + tot_chole + gamma_GTP, data = selected_data, family = binomial)
# Print model summary
summary(logistic_model)
##
## Call:
## glm(formula = DRK_YN ~ SBP + DBP + tot_chole + gamma_GTP, family = binomial,
## data = selected_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.9971 -1.0805 -0.7061 1.1850 2.1849
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.273e-01 2.066e-02 -20.68 <2e-16 ***
## SBP -1.701e-02 2.166e-04 -78.54 <2e-16 ***
## DBP 2.804e-02 3.217e-04 87.13 <2e-16 ***
## tot_chole -1.137e-03 5.549e-05 -20.49 <2e-16 ***
## gamma_GTP 1.803e-02 9.013e-05 200.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1374297 on 991345 degrees of freedom
## Residual deviance: 1298044 on 991341 degrees of freedom
## AIC: 1298054
##
## Number of Fisher Scoring iterations: 5
Individuals with higher systolic blood pressure are less likely to be drinkers
Individuals with higher total cholesterol are less likely to be drinkers compared to those with lower total cholesterol.
Health consciousness : They might be actively managing their health, which could include lifestyle choices such as abstaining from alcohol consumption.
Medical Advice: these induviduals may receive medical advice to limit or avoid alcohol consumption.
Co-occurring Health Conditions: High blood pressure and elevated cholesterol levels are often associated with other health conditions, such as cardiovascular diseases. Individuals with these conditions may be more inclined to adopt healthier habits
The model suggests a relationship between health indicators and drinking habits, but it doesn’t provide evidence for causation.
# Predict probabilities
selected_data$predicted_prob <- predict(logistic_model, type = "response")
sum_prob <- sum(selected_data$predicted_prob)
# Calculate the percentage
percentage_drinker <- (sum_prob / length(selected_data$predicted_prob)) * 100
# Print the result
print(paste("Percentage of probability that someone is a drinker:", round(percentage_drinker, 2), "%"))
## [1] "Percentage of probability that someone is a drinker: 49.98 %"
# Assuming you have a data frame 'selected_data' with actual and predicted values
actual_values <- selected_data$DRK_YN
# Use the model to predict on the same dataset
predicted_probabilities <- predict(logistic_model, newdata = selected_data, type = "response")
# Convert predicted probabilities to binary predictions (0 or 1)
predicted_values <- ifelse(predicted_probabilities > 0.5, 1, 0)
# Create a confusion matrix
conf_matrix <- table(actual_values, predicted_values)
# Extract TP, TN, FP, FN from the confusion matrix
TP <- conf_matrix[2, 2]
TN <- conf_matrix[1, 1]
FP <- conf_matrix[1, 2]
FN <- conf_matrix[2, 1]
# Calculate accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)
# Print accuracy
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.6253639
length(data$sex)
## [1] 991346
length(data$DRK_YN)
## [1] 991346
length(data$Hypertension)
## [1] 0
length(data$Hypotension)
## [1] 0
length(data$SMK_stat_type_cd)
## [1] 991346
cross_table <- table(data$sex, data$DRK_YN,data$SMK_stat_type_cd)
cross_table_df <- as.data.frame(cross_table)
cross_table_sum <- rowSums(cross_table)
cross_table_normalized <- scale(cross_table_sum)
print(cross_table_df)
## Var1 Var2 Var3 Freq
## 1 Female N 1 313180
## 2 Male N 1 75830
## 3 Female Y 1 124580
## 4 Male Y 1 88851
## 5 Female N 2 4430
## 6 Male N 2 50041
## 7 Female Y 2 6493
## 8 Male Y 2 113987
## 9 Female N 3 6150
## 10 Male N 3 46227
## 11 Female Y 3 10098
## 12 Male Y 3 151479
df_sorted <- cross_table_df[order(-cross_table_df$Freq), ]
top_5_most_occuring <- head(df_sorted, 5)
top_5_least_occuring <- tail(df_sorted, 5)
print(top_5_least_occuring)
## Var1 Var2 Var3 Freq
## 10 Male N 3 46227
## 11 Female Y 3 10098
## 7 Female Y 2 6493
## 9 Female N 3 6150
## 5 Female N 2 4430
print(top_5_most_occuring)
## Var1 Var2 Var3 Freq
## 1 Female N 1 313180
## 12 Male Y 3 151479
## 3 Female Y 1 124580
## 8 Male Y 2 113987
## 4 Male Y 1 88851
Male, Not a Drinker, still smoke
Female, Drinker, still smoke
Female, Drinker, used to smoke but quit
Female, Not a Drinker, still smoke
Female, Not a Drinker, used to smoke but quit
Female, Not a Drinker,Never Smoked
Male, Drinker,Still Smoke
Female, Drinker,Never Smoked
Male, Drinker,Used to Smoke but Quit
Male, Drinker, Never Smoked
Non-Drinkers and Smoking:
Non-drinkers, particularly females, are less likely to engage in smoking. This is evident in the combinations where females who do not drink are less likely to smoke, whether currently or in the past. Drinking and Smoking:
Males who are drinkers are often observed to be smokers, whether they currently smoke or have quit in the past. This suggests a connection between drinking and smoking behaviors among males.
Females Quitting Smoking:
Females who quit smoking, especially those who are drinkers, are less common. This could indicate that females who quit smoking may also moderate their alcohol consumption. Non-Drinking Males Smoking:
Males who do not drink but still smoke are relatively uncommon. This supports the idea that individuals who abstain from drinking may also avoid other unhealthy behaviors like smoking.
Health-Conscious Group:
The most common combination involves females who do not drink and have never smoked. This may represent a health-conscious or risk-averse subgroup. Drinking and Smoking Among Males:
Males who are both drinkers and smokers, whether current or former, form a prevalent group. This suggests a subgroup where drinking and smoking behaviors tend to coexist among males.
library(ggplot2)
# Create a data frame for plotting
plot_data <- as.data.frame(cross_table_df)
plot_data$Combination <- interaction(plot_data$Var2, plot_data$Var3)
# Plot the bar chart
ggplot(plot_data, aes(x = Combination, y = Freq, fill = Var1)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Frequency of Combinations by Gender, Drinking, and Smoking Status",
x = "Combination",
y = "Frequency",
fill = "Gender") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
H0 (null hypothesis): There is no difference in mean systolic blood pressure between drinkers and non-drinkers.
H1 (alternative hypothesis): The mean systolic blood pressure is greater for drinkers than non-drinkers.
drinkers <- data[data$DRK_YN == "Y", ]
non_drinkers <- data[data$DRK_YN == "N", ]
t_test_result <- t.test(drinkers$SBP, non_drinkers$SBP, alternative = "greater")
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: drinkers$SBP and non_drinkers$SBP
## t = 33.024, df = 988971, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.9161503 Inf
## sample estimates:
## mean of x mean of y
## 122.9148 121.9506
Based on the results of the Welch Two Sample t-test, there is strong evidence to suggest that drinkers have a significantly higher mean systolic blood pressure than non-drinkers.
library(ggplot2)
# Create a boxplot
ggplot(data, aes(x = DRK_YN, y = SBP, fill = DRK_YN)) +
geom_boxplot() +
labs(title = "Boxplot of SBP by Drinking Status",
x = "Drinking Status",
y = "Systolic Blood Pressure") +
theme_minimal()
Even though the boxplot might not visually show a significant difference, the statistical test takes into account the entire distribution and sample size, providing a more sensitive measure.
drinkers = data[data$DRK_YN=="Y",]
non_drinkers= data[data$DRK_YN=="N",]
mean_drinkers <- mean(drinkers$tot_chole)
se_drinkers <- sd(drinkers$tot_chole) / sqrt(length(drinkers$tot_chole))
mean_non_drinkers <- mean(non_drinkers$tot_chole)
se_non_drinkers <- sd(non_drinkers$tot_chole) / sqrt(length(non_drinkers$tot_chole))
ci_drinkers <- qnorm(c(0.025, 0.975), mean_drinkers, se_drinkers)
ci_non_drinkers <- qnorm(c(0.025, 0.975), mean_non_drinkers, se_non_drinkers)
print(paste("Drinkers Mean cholesterol: ", round(mean_drinkers, 2), " (", round(ci_drinkers[1], 2), "-", round(ci_drinkers[2], 2), ")"))
## [1] "Drinkers Mean cholesterol: 196.32 ( 196.21 - 196.43 )"
print(paste("Non-Drinkers Mean cholesterol: ", round(mean_non_drinkers, 2), " (", round(ci_non_drinkers[1], 2), "-", round(ci_non_drinkers[2], 2), ")"))
## [1] "Non-Drinkers Mean cholesterol: 194.79 ( 194.69 - 194.9 )"
The mean cholesterol level is higher in drinkers compared to non-drinkers
The confidence intervals for drinkers and non-drinkers do not overlap, suggesting a potential significant difference in mean cholesterol levels between the two groups.
data_plot <- data.frame(
Group = c("Drinkers", "Non-Drinkers"),
Mean_Cholesterol = c(196.32, 194.79),
Lower_CI = c(196.21, 194.69),
Upper_CI = c(196.43, 194.9)
)
# Create the bar plot
ggplot(data_plot, aes(x = Group, y = Mean_Cholesterol, fill = Group)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7, color = "black", alpha = 0.7) +
geom_point(position = position_dodge(width = 0.7), size = 3, shape = 21, fill = "white") +
geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), position = position_dodge(width = 0.7), width = 0.25) +
labs(title = "Mean Cholesterol Levels of Drinkers and Non-Drinkers",
y = "Mean Cholesterol",
x = "Group") +
theme_minimal()
health_variable <- data$triglyceride
smoking_status <- factor(data$SMK_stat_type_cd)
# Perform ANOVA
anova_result <- aov(health_variable ~ smoking_status, data = data)
# Print ANOVA table
print(summary(anova_result))
## Df Sum Sq Mean Sq F value Pr(>F)
## smoking_status 2 4.881e+08 244066328 24525 <2e-16 ***
## Residuals 991343 9.866e+09 9952
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value (Pr(>F)) is extremely small (less than 0.05), indicating that we can reject the null hypothesis. Therefore, there is strong evidence to suggest that there is a significant difference in mean triglyceride values among different smoking statuses.
# Perform Tukey's HSD test
tukey_result <- TukeyHSD(anova_result)
# Print Tukey's HSD results
print(tukey_result)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = health_variable ~ smoking_status, data = data)
##
## $smoking_status
## diff lwr upr p adj
## 2-1 32.36302 31.72804 32.99800 0
## 3-1 52.70392 52.11550 53.29234 0
## 3-2 20.34090 19.58727 21.09453 0
The Tukey HSD results show that all pairwise differences between smoking statuses are significant (p-value = 0).
Specifically, individuals who currently smoke (group 3) have significantly higher triglyceride values compared to both those who never smoked (group 1) and those who used to smoke but quit (group 2).
The difference in triglyceride values between those who never smoked and those who used to smoke but quit is also significant.
Smoking appears to have a significant impact on triglyceride values, with current smokers having higher triglyceride levels compared to both non-smokers and former smokers.
# Assuming 'health_outcome' is a continuous variable, 'smoking' is binary, and 'drinking' is a factor with 3 levels
combined_model <- lm(gamma_GTP ~ SMK_stat_type_cd * DRK_YN, data = data)
summary(combined_model)
##
## Call:
## lm(formula = gamma_GTP ~ SMK_stat_type_cd * DRK_YN, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.54 -16.99 -9.66 2.46 974.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.6339 0.1546 120.55 <2e-16 ***
## SMK_stat_type_cd 6.1802 0.1048 58.96 <2e-16 ***
## DRK_YNY -0.4192 0.2266 -1.85 0.0643 .
## SMK_stat_type_cd:DRK_YNY 9.2614 0.1316 70.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.36 on 991342 degrees of freedom
## Multiple R-squared: 0.08025, Adjusted R-squared: 0.08025
## F-statistic: 2.883e+04 on 3 and 991342 DF, p-value: < 2.2e-16
When SMK_stat_type_cd is 3 and DRK_YN is Y, the estimated value for gamma_GTP is approximately 64.54.
when SMK_stat_type_cd is 2 and DRK_YN is Y, the estimated value for gamma_GTP is approximately 49.10.
When SMK_stat_type_cd is 1 and DRK_YN is Y, the estimated value for gamma_GTP is approximately 33.65.
when SMK_stat_type_cd is 3 and DRK_YN is N, the estimated value for gamma_GTP is approximately 49.52.
When SMK_stat_type_cd is 2 and DRK_YN is N, the estimated value for gamma_GTP is approximately 43.25.
when SMK_stat_type_cd is 1 and DRK_YN is N, the estimated value for gamma_GTP is approximately 27.81.
Individuals who both smoke (especially at higher levels) and drink (DRK_YN = Y) tend to have the highest estimated gamma_GTP.
The combined effect of smoking and drinking seems to have a more pronounced impact on increasing gamma_GTP compared to smoking or drinking alone.
The combined effect of smoking and drinking may have a synergistic impact on elevating gamma_GTP levels, indicating a potentially higher risk of cardiovascular disease.
estimated_values <- data.frame(
Combination = c("3,Y", "2,Y", "1,Y", "3,N", "2,N", "1,N"),
Estimated_Value = c(64.54, 49.10, 33.65, 49.52, 43.25, 27.81)
)
# Plotting the bar plot
barplot(estimated_values$Estimated_Value, names.arg = estimated_values$Combination,
main = "Estimated Values of gamma_GTP",
ylab = "Estimated Value",
col = "skyblue",
border = "black",
ylim = c(0, max(estimated_values$Estimated_Value) + 10))
subset_data <- data[, c("SMK_stat_type_cd", "DRK_YN", "SBP", "DBP", "tot_chole", "hemoglobin", "gamma_GTP")]
subset_data[] <- lapply(subset_data, as.numeric)
## Warning in lapply(subset_data, as.numeric): NAs introduced by coercion
# Calculate covariance matrix
cov_matrix <- cov(subset_data)
# Calculate correlation matrix
cor_matrix <- cor(subset_data)
# Display the covariance matrix
print("Covariance Matrix:")
## [1] "Covariance Matrix:"
print(cov_matrix)
## SMK_stat_type_cd DRK_YN SBP DBP tot_chole
## SMK_stat_type_cd 0.6699538 NA 1.010831 1.026779 0.3744339
## DRK_YN NA NA NA NA NA
## SBP 1.0108305 NA 211.503153 106.591298 38.5456210
## DBP 1.0267788 NA 106.591298 97.799547 42.7878096
## tot_chole 0.3744339 NA 38.545621 42.787810 1494.6075577
## hemoglobin 0.5881647 NA 3.838497 3.792782 7.4307580
## gamma_GTP 10.0529775 NA 118.383474 87.570172 184.2911693
## hemoglobin gamma_GTP
## SMK_stat_type_cd 0.5881647 10.05298
## DRK_YN NA NA
## SBP 3.8384970 118.38347
## DBP 3.7927822 87.57017
## tot_chole 7.4307580 184.29117
## hemoglobin 2.5119991 18.07904
## gamma_GTP 18.0790446 2542.59516
# Display the correlation matrix
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
## SMK_stat_type_cd DRK_YN SBP DBP tot_chole
## SMK_stat_type_cd 1.00000000 NA 0.08491756 0.1268487 0.01183285
## DRK_YN NA 1 NA NA NA
## SBP 0.08491756 NA 1.00000000 0.7411309 0.06855719
## DBP 0.12684871 NA 0.74113088 1.0000000 0.11191493
## tot_chole 0.01183285 NA 0.06855719 0.1119149 1.00000000
## hemoglobin 0.45338460 NA 0.16653022 0.2419801 0.12127179
## gamma_GTP 0.24357554 NA 0.16143364 0.1756100 0.09453711
## hemoglobin gamma_GTP
## SMK_stat_type_cd 0.4533846 0.24357554
## DRK_YN NA NA
## SBP 0.1665302 0.16143364
## DBP 0.2419801 0.17560997
## tot_chole 0.1212718 0.09453711
## hemoglobin 1.0000000 0.22621798
## gamma_GTP 0.2262180 1.00000000
It seems like there are no extremely high correlations (near 1 or -1) between smoking, drinking, and other health factors.