library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)
Reading data from the data path
data_path <- "C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
data <- read.csv(data_path)
R1 - R5 : 5 sampling of data from the original data
R1 <- data %>%
sample_frac(size= 0.5, replace= TRUE)
R2 <- data %>%
sample_frac(size= 0.5, replace= TRUE)
R3 <- data %>%
sample_frac(size= 0.5, replace= TRUE)
R4 <- data %>%
sample_frac(size= 0.5, replace= TRUE)
R5 <- data %>%
sample_frac(size= 0.5, replace= TRUE)
summary(data$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 55.00 60.00 63.28 70.00 140.00
sd(data$weight)
## [1] 12.51424
summary(R1$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 55.00 60.00 63.27 70.00 140.00
sd(R1$weight)
## [1] 12.51884
summary(R2$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 55.00 60.00 63.27 70.00 140.00
sd(R2$weight)
## [1] 12.53243
summary(R3$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.0 55.0 60.0 63.3 70.0 140.0
sd(R3$weight)
## [1] 12.4984
summary(R4$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 55.00 60.00 63.28 70.00 140.00
sd(R4$weight)
## [1] 12.50971
summary(R5$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 55.00 60.00 63.28 70.00 140.00
sd(R5$weight)
## [1] 12.53461
The summary statistics for each sub-sample, including minimum, 1st quartile, median, mean, 3rd quartile, and maximum values, are very similar to those of the entire dataset. This suggests that each sub-sample has a similar distribution to the full dataset.
The standard deviation for each sub-sample is also similar, ranging from approximately 12.51 to 12.55. This consistency indicates that the variability within each sub-sample is comparable to the entire dataset.
summary(data$hemoglobin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 13.20 14.30 14.23 15.40 25.00
sd(data$hemoglobin)
## [1] 1.584929
summary(R1$hemoglobin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 13.20 14.30 14.23 15.40 23.60
sd(R1$hemoglobin)
## [1] 1.582443
summary(R2$hemoglobin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 13.20 14.30 14.22 15.40 23.60
sd(R2$hemoglobin)
## [1] 1.588875
summary(R3$hemoglobin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.70 13.20 14.30 14.23 15.40 25.00
sd(R3$hemoglobin)
## [1] 1.58353
summary(R4$hemoglobin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 13.20 14.30 14.23 15.40 25.00
sd(R4$hemoglobin)
## [1] 1.585154
summary(R5$hemoglobin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 13.20 14.30 14.23 15.40 24.20
sd(R5$hemoglobin)
## [1] 1.583404
There are no significant differences in the summary statistics between the sub-samples and the full dataset.
summary(data$serum_creatinine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1000 0.7000 0.8000 0.8605 1.0000 98.0000
sd(data$serum_creatinine)
## [1] 0.4805304
summary(R1$serum_creatinine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1000 0.7000 0.8000 0.8599 1.0000 98.0000
sd(R1$serum_creatinine)
## [1] 0.422911
summary(R2$serum_creatinine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1000 0.7000 0.8000 0.8602 1.0000 96.0000
sd(R2$serum_creatinine)
## [1] 0.493827
summary(R3$serum_creatinine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 0.700 0.800 0.861 1.000 98.000
sd(R3$serum_creatinine)
## [1] 0.5241427
summary(R4$serum_creatinine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1000 0.7000 0.8000 0.8621 1.0000 98.0000
sd(R4$serum_creatinine)
## [1] 0.5795195
summary(R5$serum_creatinine)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1000 0.7000 0.8000 0.8604 1.0000 96.0000
sd(R5$serum_creatinine)
## [1] 0.4328674
There are no significant differences in the summary statistics between the sub-samples and the full dataset.
summary(data$gamma_GTP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 16.00 23.00 37.14 39.00 999.00
sd(data$gamma_GTP)
## [1] 50.42415
summary(R1$gamma_GTP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 16.00 23.00 37.21 40.00 999.00
sd(R1$gamma_GTP)
## [1] 50.79919
summary(R2$gamma_GTP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 16.0 23.0 37.1 39.0 999.0
sd(R2$gamma_GTP)
## [1] 50.38365
summary(R3$gamma_GTP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 16.00 23.00 37.31 40.00 999.00
sd(R3$gamma_GTP)
## [1] 50.88582
summary(R4$gamma_GTP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 16.00 23.00 37.07 39.00 999.00
sd(R4$gamma_GTP)
## [1] 49.95388
summary(R5$gamma_GTP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 16.00 23.00 37.14 39.00 999.00
sd(R5$gamma_GTP)
## [1] 50.88369
There are no significant differences in the summary statistics between the sub-samples and the full dataset.
f<- table(data$DRK_YN)
p<- (f/sum(f))*100
print(p)
##
## N Y
## 50.01866 49.98134
f<- table(R1$DRK_YN)
p<- (f/sum(f))*100
print(p)
##
## N Y
## 49.92848 50.07152
f<- table(R2$DRK_YN)
p<- (f/sum(f))*100
print(p)
##
## N Y
## 49.95168 50.04832
f<- table(R3$DRK_YN)
p<- (f/sum(f))*100
print(p)
##
## N Y
## 49.8849 50.1151
f<- table(R4$DRK_YN)
p<- (f/sum(f))*100
print(p)
##
## N Y
## 50.07112 49.92888
f<- table(R5$DRK_YN)
p<- (f/sum(f))*100
print(p)
##
## N Y
## 50.03056 49.96944
f<- table(data$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
##
## 1 2 3
## 60.77000 17.64782 21.58217
f<- table(R1$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
##
## 1 2 3
## 60.84374 17.60959 21.54666
f<- table(R2$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
##
## 1 2 3
## 60.76002 17.62029 21.61970
f<- table(R3$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
##
## 1 2 3
## 60.68638 17.64490 21.66872
f<- table(R4$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
##
## 1 2 3
## 60.74932 17.57752 21.67316
f<- table(R5$SMK_stat_type_cd)
p<- (f/sum(f))*100
print(p)
##
## 1 2 3
## 60.84495 17.57832 21.57672
j1 <- mosaicplot(table(data$DRK_YN, data$SMK_stat_type_cd), color = TRUE, main="Drinking Vs Smoking")
j2 <- mosaicplot(table(R1$DRK_YN, R1$SMK_stat_type_cd), color = TRUE, main="Sample 1")
j3 <- mosaicplot(table(R2$DRK_YN, R2$SMK_stat_type_cd), color = TRUE, main="Sample 2")
j4 <- mosaicplot(table(R3$DRK_YN, R3$SMK_stat_type_cd), color = TRUE, main="Sample 2")
j5 <- mosaicplot(table(R4$DRK_YN, R4$SMK_stat_type_cd), color = TRUE, main="Sample 2")
j6 <- mosaicplot(table(R5$DRK_YN, R5$SMK_stat_type_cd), color = TRUE, main="Sample 2")
One interesting fact is that the number of alcohol consuming people among smokers and non-smokers also have remained consistent among the original data and the sub samples.
plot1 <- ggplot(data, aes(x = SBP, y = DBP)) +
geom_point() +
labs(title = "Scatter Plot", x = "Systolic BP", y = "Diastolic BP")
plot2 <- ggplot(R1, aes(x = SBP, y = DBP)) +
geom_point() +
labs(title = "Sample-space 1", x = "Systolic BP", y = "Diastolic BP")
plot3 <- ggplot(R2, aes(x = SBP, y = DBP)) +
geom_point() +
labs(title = "Sample-space 2", x = "Systolic BP", y = "Diastolic BP")
plot4 <- ggplot(R3, aes(x = SBP, y = DBP)) +
geom_point() +
labs(title = "Sample-space 3", x = "Systolic BP", y = "Diastolic BP")
plot5 <- ggplot(R4, aes(x = SBP, y = DBP)) +
geom_point() +
labs(title = "Sample-space 4", x = "Systolic BP", y = "Diastolic BP")
plot6 <- ggplot(R5, aes(x = SBP, y = DBP)) +
geom_point() +
labs(title = "Sample-space 5", x = "Systolic BP", y = "Diastolic BP")
# Arrange the plots side by side using grid Extra
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, nrow=2, ncol = 3)
Sample space 4 is the only data set with diastolic pressure as high as 160
Sample space 3 and 5 are more concentrated compared to other samples.
p1 <- ggplot(data, aes(y = triglyceride)) +
geom_boxplot() +
labs(title = "Boxplot for Data")
p2 <- ggplot(R1, aes(y = triglyceride)) +
geom_boxplot() +
labs(title = "Boxplot for R1")
p3 <- ggplot(R2, aes(y = triglyceride)) +
geom_boxplot() +
labs(title = "Boxplot for R2")
p4 <- ggplot(R3, aes(y = triglyceride)) +
geom_boxplot() +
labs(title = "Boxplot for R3")
p5 <- ggplot(R4, aes(y = triglyceride)) +
geom_boxplot() +
labs(title = "Boxplot for R4")
p6 <- ggplot(R5, aes(y = triglyceride)) +
geom_boxplot() +
labs(title = "Boxplot for R5")
grid.arrange(p1, p2, p3,p4,p5,p6 ,ncol = 3, nrow=2)
We can notice that R4 and R2 have values as high as 7500. While for other samples the maximum value is 6000.
R5 have more outliers compared to others.
plot1 <- ggplot(data, aes(x = weight, y = hemoglobin)) +
geom_point() +
labs(title = "Scatter Plot", x = "Weight", y = "hemoglobin")
plot2 <- ggplot(R1, aes(x = weight, y = hemoglobin)) +
geom_point() +
labs(title = "Sample-space 1", x = "Weight", y = "hemoglobin")
plot3 <- ggplot(R2, aes(x = weight, y = hemoglobin)) +
geom_point() +
labs(title = "Sample-space 2", x = "Weight", y = "hemoglobin")
plot4 <- ggplot(R3, aes(x = weight, y = hemoglobin)) +
geom_point() +
labs(title = "Sample-space 3", x = "Weight", y = "hemoglobin")
plot5 <- ggplot(R4, aes(x = weight, y = hemoglobin)) +
geom_point() +
labs(title = "Sample-space 4", x = "Weight", y = "hemoglobin")
plot6 <- ggplot(R5, aes(x = weight, y = hemoglobin)) +
geom_point() +
labs(title = "Sample-space 5", x = "Weight", y = "hemoglobin")
# Arrange the plots side by side using grid Extra
library(gridExtra)
grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, nrow=2, ncol = 3)
threshold_low <- 12
threshold_high <- 18
Unusual <- ifelse(
data$hemoglobin < threshold_low | data$hemoglobin > threshold_high,
"Unusual",
"Normal")
Unusual1 <- ifelse(
R1$hemoglobin < threshold_low | R1$hemoglobin > threshold_high,
"Unusual",
"Normal")
Unusual2 <- ifelse(
R2$hemoglobin < threshold_low | R2$hemoglobin > threshold_high,
"Unusual",
"Normal")
Unusual3 <- ifelse(
R3$hemoglobin < threshold_low | R3$hemoglobin > threshold_high,
"Unusual",
"Normal")
Unusual4 <- ifelse(
R4$hemoglobin < threshold_low | R4$hemoglobin > threshold_high,
"Unusual",
"Normal")
Unusual5 <- ifelse(
R5$hemoglobin < threshold_low | R5$hemoglobin > threshold_high,
"Unusual",
"Normal")
Once I have got the unusual data I will plot it in a bar plot the check it’s distribution across the different sub samples.
f1 <- ggplot(data, aes(x =sex, fill = Unusual)) +
geom_bar() +
labs(
title = "Data",
x = "gender",
y = "count",
fill = "hemoglobin"
) +
scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
f2 <- ggplot(R1, aes(x =sex, fill = Unusual1)) +
geom_bar() +
labs(
title = "Sample 1",
x = "gender",
y = "count",
fill = "hemoglobin"
) +
scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
f3 <- ggplot(R2, aes(x =sex, fill = Unusual2)) +
geom_bar() +
labs(
title = "Sample 2",
x = "gender",
y = "count",
fill = "hemoglobin"
) +
scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
f4<- ggplot(R3, aes(x =sex, fill = Unusual3)) +
geom_bar() +
labs(
title = "sample 3",
x = "gender",
y = "count",
fill = "hemoglobin"
) +
scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
f5 <- ggplot(R4, aes(x =sex, fill = Unusual4)) +
geom_bar() +
labs(
title = "Sample 4",
x = "gender",
y = "count",
fill = "hemoglobin"
) +
scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
f6 <- ggplot(R5, aes(x =sex, fill = Unusual5)) +
geom_bar() +
labs(
title = "Sample 5",
x = "gender",
y = "count",
fill = "hemoglobin"
) +
scale_fill_manual(values = c("Normal" = "blue", "Unusual" = "red"))
grid.arrange(f1,f2,f3,f4,f5,f6,ncol=2,nrow=3)
Across all the samples, there is one thing common: High rates of unusual hemoglobin levels among female. this anomaly is found across all sub samples not only the original dataset.
####Conclusion:
The data sampling results show variations in serum creatinine levels, with occasional anomalies. Careful data cleaning, anomaly investigation, and analysis adjustments are needed to draw meaningful conclusions.