library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)

Reading data from the data path

data_path <- "C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
data <- read.csv(data_path)

Sampling of data

R1 - R5 : 5 sampling of data from the original data

R1 <- data %>%
  sample_frac(size= 0.5, replace= TRUE) 
R2 <- data %>%
  sample_frac(size= 0.5, replace= TRUE)
R3 <- data %>%
  sample_frac(size= 0.5, replace= TRUE)
R4 <- data %>%
  sample_frac(size= 0.5, replace= TRUE)
R5 <- data %>%
  sample_frac(size= 0.5, replace= TRUE)

Summary statistics

Comparing weight:

summary(data$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   55.00   60.00   63.28   70.00  140.00
sd(data$weight)
## [1] 12.51424
summary(R1$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   55.00   60.00   63.27   70.00  140.00
sd(R1$weight)
## [1] 12.51884
summary(R2$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   55.00   60.00   63.27   70.00  140.00
sd(R2$weight)
## [1] 12.53243
summary(R3$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    25.0    55.0    60.0    63.3    70.0   140.0
sd(R3$weight)
## [1] 12.4984
summary(R4$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   55.00   60.00   63.28   70.00  140.00
sd(R4$weight)
## [1] 12.50971
summary(R5$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   55.00   60.00   63.28   70.00  140.00
sd(R5$weight)
## [1] 12.53461

Findings:

  1. The summary statistics for each sub-sample, including minimum, 1st quartile, median, mean, 3rd quartile, and maximum values, are very similar to those of the entire dataset. This suggests that each sub-sample has a similar distribution to the full dataset.

  2. The standard deviation for each sub-sample is also similar, ranging from approximately 12.51 to 12.55. This consistency indicates that the variability within each sub-sample is comparable to the entire dataset.

Comparing hemoglobin:

summary(data$hemoglobin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.20   14.30   14.23   15.40   25.00
sd(data$hemoglobin)
## [1] 1.584929
summary(R1$hemoglobin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.20   14.30   14.23   15.40   23.60
sd(R1$hemoglobin)
## [1] 1.582443
summary(R2$hemoglobin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.20   14.30   14.22   15.40   23.60
sd(R2$hemoglobin)
## [1] 1.588875
summary(R3$hemoglobin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.70   13.20   14.30   14.23   15.40   25.00
sd(R3$hemoglobin)
## [1] 1.58353
summary(R4$hemoglobin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.20   14.30   14.23   15.40   25.00
sd(R4$hemoglobin)
## [1] 1.585154
summary(R5$hemoglobin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   13.20   14.30   14.23   15.40   24.20
sd(R5$hemoglobin)
## [1] 1.583404

Findings:

There are no significant differences in the summary statistics between the sub-samples and the full dataset.

Comparing Serum Ceratinine:

summary(data$serum_creatinine)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1000  0.7000  0.8000  0.8605  1.0000 98.0000
sd(data$serum_creatinine)
## [1] 0.4805304
summary(R1$serum_creatinine)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1000  0.7000  0.8000  0.8599  1.0000 98.0000
sd(R1$serum_creatinine)
## [1] 0.422911
summary(R2$serum_creatinine)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1000  0.7000  0.8000  0.8602  1.0000 96.0000
sd(R2$serum_creatinine)
## [1] 0.493827
summary(R3$serum_creatinine)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   0.700   0.800   0.861   1.000  98.000
sd(R3$serum_creatinine)
## [1] 0.5241427
summary(R4$serum_creatinine)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1000  0.7000  0.8000  0.8621  1.0000 98.0000
sd(R4$serum_creatinine)
## [1] 0.5795195
summary(R5$serum_creatinine)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1000  0.7000  0.8000  0.8604  1.0000 96.0000
sd(R5$serum_creatinine)
## [1] 0.4328674

Findings:

There are no significant differences in the summary statistics between the sub-samples and the full dataset.

Comparing Gamma-glutamyl Transferase:

summary(data$gamma_GTP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   16.00   23.00   37.14   39.00  999.00
sd(data$gamma_GTP)
## [1] 50.42415
summary(R1$gamma_GTP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   16.00   23.00   37.21   40.00  999.00
sd(R1$gamma_GTP)
## [1] 50.79919
summary(R2$gamma_GTP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    16.0    23.0    37.1    39.0   999.0
sd(R2$gamma_GTP)
## [1] 50.38365
summary(R3$gamma_GTP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   16.00   23.00   37.31   40.00  999.00
sd(R3$gamma_GTP)
## [1] 50.88582
summary(R4$gamma_GTP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   16.00   23.00   37.07   39.00  999.00
sd(R4$gamma_GTP)
## [1] 49.95388
summary(R5$gamma_GTP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   16.00   23.00   37.14   39.00  999.00
sd(R5$gamma_GTP)
## [1] 50.88369

Findings:

There are no significant differences in the summary statistics between the sub-samples and the full dataset.

Comparing drinking status:

f<- table(data$DRK_YN)
p<- (f/sum(f))*100
print(p)
## 
##        N        Y 
## 50.01866 49.98134
f<- table(R1$DRK_YN)
p<- (f/sum(f))*100
print(p)
## 
##        N        Y 
## 49.92848 50.07152
f<- table(R2$DRK_YN)
p<- (f/sum(f))*100
print(p)
## 
##        N        Y 
## 49.95168 50.04832
f<- table(R3$DRK_YN)
p<- (f/sum(f))*100
print(p)
## 
##       N       Y 
## 49.8849 50.1151
f<- table(R4$DRK_YN)
p<- (f/sum(f))*100
print(p)
## 
##        N        Y 
## 50.07112 49.92888
f<- table(R5$DRK_YN)
p<- (f/sum(f))*100
print(p)
## 
##        N        Y 
## 50.03056 49.96944

Findings:

  1. The number of people who drink and who don’t drink alcohol are almost the same in all samples.

Comparing smoking status:

f<- table(data$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
## 
##        1        2        3 
## 60.77000 17.64782 21.58217
f<- table(R1$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
## 
##        1        2        3 
## 60.84374 17.60959 21.54666
f<- table(R2$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
## 
##        1        2        3 
## 60.76002 17.62029 21.61970
f<- table(R3$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
## 
##        1        2        3 
## 60.68638 17.64490 21.66872
f<- table(R4$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
## 
##        1        2        3 
## 60.74932 17.57752 21.67316
f<- table(R5$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
## 
##        1        2        3 
## 60.84495 17.57832 21.57672

Findings:

  1. The smoking status of people are almost the same in all samples.

Visualize data distributions:

Drinking Vs Smoking:

 j1 <- mosaicplot(table(data$DRK_YN, data$SMK_stat_type_cd), color = TRUE, main="Drinking Vs Smoking")

 j2 <- mosaicplot(table(R1$DRK_YN, R1$SMK_stat_type_cd), color = TRUE, main="Sample 1")

 j3 <- mosaicplot(table(R2$DRK_YN, R2$SMK_stat_type_cd), color = TRUE, main="Sample 2")

 j4 <- mosaicplot(table(R3$DRK_YN, R3$SMK_stat_type_cd), color = TRUE, main="Sample 2")

 j5 <- mosaicplot(table(R4$DRK_YN, R4$SMK_stat_type_cd), color = TRUE, main="Sample 2")

 j6 <- mosaicplot(table(R5$DRK_YN, R5$SMK_stat_type_cd), color = TRUE, main="Sample 2")

Findings:

One interesting fact is that the number of alcohol consuming people among smokers and non-smokers also have remained consistent among the original data and the sub samples.

Systolic Vs Diastolic Pressure:

plot1 <- ggplot(data, aes(x = SBP, y = DBP)) +
  geom_point() +
   labs(title = "Scatter Plot", x = "Systolic BP", y = "Diastolic BP")
 
 plot2 <- ggplot(R1, aes(x = SBP, y = DBP)) +
   geom_point() +
   labs(title = "Sample-space 1", x = "Systolic BP", y = "Diastolic BP")
 
 plot3 <- ggplot(R2, aes(x = SBP, y = DBP)) +
   geom_point() +
   labs(title = "Sample-space 2", x = "Systolic BP", y = "Diastolic BP")
 
 plot4 <- ggplot(R3, aes(x = SBP, y = DBP)) +
   geom_point() +
   labs(title = "Sample-space 3", x = "Systolic BP", y = "Diastolic BP")
 
 plot5 <- ggplot(R4, aes(x = SBP, y = DBP)) +
   geom_point() +
   labs(title = "Sample-space 4", x = "Systolic BP", y = "Diastolic BP")
 
 plot6 <- ggplot(R5, aes(x = SBP, y = DBP)) +
   geom_point() +
   labs(title = "Sample-space 5", x = "Systolic BP", y = "Diastolic BP")
 
 # Arrange the plots side by side using grid Extra
 library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
 grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, nrow=2, ncol = 3)

findings:

  1. Sample space 4 is the only data set with diastolic pressure as high as 160

  2. Sample space 3 and 5 are more concentrated compared to other samples.

Outlier Variation across Samples:

 p1 <- ggplot(data, aes(y = triglyceride)) +
   geom_boxplot() +
   labs(title = "Boxplot for Data")
 
 p2 <- ggplot(R1, aes(y = triglyceride)) +
   geom_boxplot() +
   labs(title = "Boxplot for R1")
 
 p3 <- ggplot(R2, aes(y = triglyceride)) +
   geom_boxplot() +
   labs(title = "Boxplot for R2")
 
 p4 <- ggplot(R3, aes(y = triglyceride)) +
   geom_boxplot() +
   labs(title = "Boxplot for R3")
 
 p5 <- ggplot(R4, aes(y = triglyceride)) +
   geom_boxplot() +
   labs(title = "Boxplot for R4")
 
 p6 <- ggplot(R5, aes(y = triglyceride)) +
   geom_boxplot() +
   labs(title = "Boxplot for R5")
 
 
 grid.arrange(p1, p2, p3,p4,p5,p6 ,ncol = 3, nrow=2)

findings:

  1. We can notice that R4 and R2 have values as high as 7500. While for other samples the maximum value is 6000.

  2. R5 have more outliers compared to others.

Weight Vs Hemoglobin across samples:

 plot1 <- ggplot(data, aes(x = weight, y = hemoglobin)) +
   geom_point() +
   labs(title = "Scatter Plot", x = "Weight", y = "hemoglobin")
 
 plot2 <- ggplot(R1, aes(x = weight, y = hemoglobin)) +
   geom_point() +
   labs(title = "Sample-space 1", x = "Weight", y = "hemoglobin")
 
 plot3 <- ggplot(R2, aes(x = weight, y = hemoglobin)) +
   geom_point() +
   labs(title = "Sample-space 2", x = "Weight", y = "hemoglobin")
 
 plot4 <- ggplot(R3, aes(x = weight, y = hemoglobin)) +
   geom_point() +
   labs(title = "Sample-space 3", x = "Weight", y = "hemoglobin")
 
 plot5 <- ggplot(R4, aes(x = weight, y = hemoglobin)) +
   geom_point() +
   labs(title = "Sample-space 4", x = "Weight", y = "hemoglobin")
 
 plot6 <- ggplot(R5, aes(x = weight, y = hemoglobin)) +
   geom_point() +
   labs(title = "Sample-space 5", x = "Weight", y = "hemoglobin")
 
 # Arrange the plots side by side using grid Extra
 library(gridExtra)
 grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, nrow=2, ncol = 3)

findings:

  1. We can notice that Sample space 1 and sample space 2 have hemoglobin as high as 25, while for the rest of the samples the maximum limit is 20

Anomalies:

  1. One possible anomaly is to check if we have unusual hemoglobin levels. I have created a criteria and have applied it across the various data sets.
 threshold_low <- 12
 threshold_high <- 18
 
 Unusual <- ifelse(
   data$hemoglobin < threshold_low | data$hemoglobin > threshold_high,
   "Unusual",
   "Normal")
  Unusual1 <- ifelse(
   R1$hemoglobin < threshold_low | R1$hemoglobin > threshold_high,
   "Unusual",
   "Normal")
   Unusual2 <- ifelse(
   R2$hemoglobin < threshold_low | R2$hemoglobin > threshold_high,
   "Unusual",
   "Normal")
    Unusual3 <- ifelse(
   R3$hemoglobin < threshold_low | R3$hemoglobin > threshold_high,
   "Unusual",
   "Normal")
    Unusual4 <- ifelse(
   R4$hemoglobin < threshold_low | R4$hemoglobin > threshold_high,
   "Unusual",
   "Normal")
    Unusual5 <- ifelse(
   R5$hemoglobin < threshold_low | R5$hemoglobin > threshold_high,
   "Unusual",
   "Normal")

Once I have got the unusual data I will plot it in a bar plot the check it’s distribution across the different sub samples.

 f1 <- ggplot(data, aes(x =sex, fill = Unusual)) +
   geom_bar() +
   labs(
     title = "Data",
     x = "gender",
     y = "count",
     fill = "hemoglobin"
   ) +
   scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
 
 f2 <- ggplot(R1, aes(x =sex, fill = Unusual1)) +
   geom_bar() +
   labs(
     title = "Sample 1",
     x = "gender",
     y = "count",
     fill = "hemoglobin"
   ) +
   scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
  f3 <- ggplot(R2, aes(x =sex, fill = Unusual2)) +
   geom_bar() +
   labs(
     title = "Sample 2",
     x = "gender",
     y = "count",
     fill = "hemoglobin"
   ) +
   scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
  
    f4<- ggplot(R3, aes(x =sex, fill = Unusual3)) +
   geom_bar() +
   labs(
     title = "sample 3",
     x = "gender",
     y = "count",
     fill = "hemoglobin"
   ) +
   scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
    
      f5 <- ggplot(R4, aes(x =sex, fill = Unusual4)) +
   geom_bar() +
   labs(
     title = "Sample 4",
     x = "gender",
     y = "count",
     fill = "hemoglobin"
   ) +
   scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
      
        f6 <- ggplot(R5, aes(x =sex, fill = Unusual5)) +
   geom_bar() +
   labs(
     title = "Sample 5",
     x = "gender",
     y = "count",
     fill = "hemoglobin"
   ) +
   scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
 
 grid.arrange(f1,f2,f3,f4,f5,f6,ncol=2,nrow=3)

Findings:

Across all the samples, there is one thing common: High rates of unusual hemoglobin levels among female. this anomaly is found across all sub samples not only the original dataset.

####Conclusion:

The data sampling results show variations in serum creatinine levels, with occasional anomalies. Careful data cleaning, anomaly investigation, and analysis adjustments are needed to draw meaningful conclusions.