data <- read_excel("~/Desktop/aicha_labs_for_data_mining/liver_data_with_metadata(4).xlsx")
# View the first few rows of the dataset
head(data)
## # A tibble: 6 × 11
## `Age of the patient` `Gender of the patient` `Total Bilirubin`
## <dbl> <chr> <dbl>
## 1 65 Female 0.7
## 2 62 Male 10.9
## 3 62 Male 7.3
## 4 58 Male 1
## 5 72 Male 3.9
## 6 46 Male 1.8
## # ℹ 8 more variables: `Direct Bilirubin` <dbl>,
## # ` Alkphos Alkaline Phosphotase` <dbl>,
## # ` Sgpt Alamine Aminotransferase` <dbl>,
## # `Sgot Aspartate Aminotransferase` <dbl>, `Total Protiens` <dbl>,
## # ` ALB Albumin` <dbl>, `A/G Ratio Albumin and Globulin Ratio` <dbl>,
## # Result <dbl>
dim(data)
## [1] 30691 11
nrow(data)
## [1] 30691
ncol(data)
## [1] 11
colnames(data)
## [1] "Age of the patient"
## [2] "Gender of the patient"
## [3] "Total Bilirubin"
## [4] "Direct Bilirubin"
## [5] " Alkphos Alkaline Phosphotase"
## [6] " Sgpt Alamine Aminotransferase"
## [7] "Sgot Aspartate Aminotransferase"
## [8] "Total Protiens"
## [9] " ALB Albumin"
## [10] "A/G Ratio Albumin and Globulin Ratio"
## [11] "Result"
str(data)
## tibble [30,691 × 11] (S3: tbl_df/tbl/data.frame)
## $ Age of the patient : num [1:30691] 65 62 62 58 72 46 26 29 17 55 ...
## $ Gender of the patient : chr [1:30691] "Female" "Male" "Male" "Male" ...
## $ Total Bilirubin : num [1:30691] 0.7 10.9 7.3 1 3.9 1.8 0.9 0.9 0.9 0.7 ...
## $ Direct Bilirubin : num [1:30691] 0.1 5.5 4.1 0.4 2 0.7 0.2 0.3 0.3 0.2 ...
## $ Alkphos Alkaline Phosphotase : num [1:30691] 187 699 490 182 195 208 154 202 202 290 ...
## $ Sgpt Alamine Aminotransferase : num [1:30691] 16 64 60 14 27 19 NA 14 22 53 ...
## $ Sgot Aspartate Aminotransferase : num [1:30691] 18 100 68 20 59 14 12 11 19 58 ...
## $ Total Protiens : num [1:30691] 6.8 7.5 7 6.8 7.3 7.6 7 6.7 7.4 6.8 ...
## $ ALB Albumin : num [1:30691] 3.3 3.2 3.3 3.4 2.4 4.4 3.5 3.6 4.1 3.4 ...
## $ A/G Ratio Albumin and Globulin Ratio: num [1:30691] 0.9 0.74 0.89 1 0.4 1.3 1 1.1 1.2 1 ...
## $ Result : num [1:30691] 1 1 1 1 1 1 1 1 2 1 ...
# gives basic statistics for numeric columns and a summary of factor levels
summary(data)
## Age of the patient Gender of the patient Total Bilirubin Direct Bilirubin
## Min. : 4.00 Length:30691 Min. : 0.40 Min. : 0.100
## 1st Qu.:32.00 Class :character 1st Qu.: 0.80 1st Qu.: 0.200
## Median :45.00 Mode :character Median : 1.00 Median : 0.300
## Mean :44.11 Mean : 3.37 Mean : 1.528
## 3rd Qu.:55.00 3rd Qu.: 2.70 3rd Qu.: 1.300
## Max. :90.00 Max. :75.00 Max. :19.700
## NA's :2 NA's :648 NA's :561
## Alkphos Alkaline Phosphotase Sgpt Alamine Aminotransferase
## Min. : 63.0 Min. : 10.00
## 1st Qu.: 175.0 1st Qu.: 23.00
## Median : 209.0 Median : 35.00
## Mean : 289.1 Mean : 81.49
## 3rd Qu.: 298.0 3rd Qu.: 62.00
## Max. :2110.0 Max. :2000.00
## NA's :796 NA's :538
## Sgot Aspartate Aminotransferase Total Protiens ALB Albumin
## Min. : 10.0 Min. :2.70 Min. :0.90
## 1st Qu.: 26.0 1st Qu.:5.80 1st Qu.:2.60
## Median : 42.0 Median :6.60 Median :3.10
## Mean : 111.5 Mean :6.48 Mean :3.13
## 3rd Qu.: 88.0 3rd Qu.:7.20 3rd Qu.:3.80
## Max. :4929.0 Max. :9.60 Max. :5.50
## NA's :462 NA's :463 NA's :494
## A/G Ratio Albumin and Globulin Ratio Result
## Min. :0.3000 Min. :1.000
## 1st Qu.:0.7000 1st Qu.:1.000
## Median :0.9000 Median :1.000
## Mean :0.9435 Mean :1.286
## 3rd Qu.:1.1000 3rd Qu.:2.000
## Max. :2.8000 Max. :2.000
## NA's :559
missmap(data)
vis_miss(data)
# Ensure summarytools is loaded
library(summarytools)
# Disable interactive view mode
st_options(plain.ascii = TRUE)
# Generate the summary table
summary <- dfSummary(data)
# Print the table in R Markdown with HTML styling
print(summary, method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Age of the patient [numeric] |
|
77 distinct values | 30689 (100.0%) | 2 (0.0%) | ||||||||||||||||
| 2 | Gender of the patient [character] |
|
|
29789 (97.1%) | 902 (2.9%) | ||||||||||||||||
| 3 | Total Bilirubin [numeric] |
|
113 distinct values | 30043 (97.9%) | 648 (2.1%) | ||||||||||||||||
| 4 | Direct Bilirubin [numeric] |
|
80 distinct values | 30130 (98.2%) | 561 (1.8%) | ||||||||||||||||
| 5 | Alkphos Alkaline Phosphotase [numeric] |
|
263 distinct values | 29895 (97.4%) | 796 (2.6%) | ||||||||||||||||
| 6 | Sgpt Alamine Aminotransferase [numeric] |
|
152 distinct values | 30153 (98.2%) | 538 (1.8%) | ||||||||||||||||
| 7 | Sgot Aspartate Aminotransferase [numeric] |
|
177 distinct values | 30229 (98.5%) | 462 (1.5%) | ||||||||||||||||
| 8 | Total Protiens [numeric] |
|
58 distinct values | 30228 (98.5%) | 463 (1.5%) | ||||||||||||||||
| 9 | ALB Albumin [numeric] |
|
40 distinct values | 30197 (98.4%) | 494 (1.6%) | ||||||||||||||||
| 10 | A/G Ratio Albumin and Globulin Ratio [numeric] |
|
69 distinct values | 30132 (98.2%) | 559 (1.8%) | ||||||||||||||||
| 11 | Result [numeric] |
|
|
30691 (100.0%) | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.4.1)
2025-01-07
k <- which(unlist(lapply(data, is.numeric)) == TRUE) #The lapply() function checks each column to determine if it's numeric.
# unlist() converts the logical list result to a vector.
# which() identifies the indices where the value is TRUE
k # Print the indices of numeric columns for verification
## Age of the patient Total Bilirubin
## 1 3
## Direct Bilirubin Alkphos Alkaline Phosphotase
## 4 5
## Sgpt Alamine Aminotransferase Sgot Aspartate Aminotransferase
## 6 7
## Total Protiens ALB Albumin
## 8 9
## A/G Ratio Albumin and Globulin Ratio Result
## 10 11
Xdata <- log(data[, k])
# The log() function applies a logarithm to the selected numeric columns.
# This transformation is often used to reduce skewness in data.
pc <- imputePCA(Xdata)
# imputePCA() imputes missing values based on the structure of the data
# This method uses relationships between variables to predict and fill in missing data.
Xdata <- pc$completeObs
# pc$completeObs contains the imputed data where missing values have been replaced.
data[, k] <- exp(Xdata)
# The exp() function reverses the natural logarithm, restoring the original scale of the data.
summary <- dfSummary(data)
print(summary, method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Age of the patient [numeric] |
|
78 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 2 | Gender of the patient [character] |
|
|
29789 (97.1%) | 902 (2.9%) | ||||||||||||||||
| 3 | Total Bilirubin [numeric] |
|
643 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 4 | Direct Bilirubin [numeric] |
|
560 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 5 | Alkphos Alkaline Phosphotase [numeric] |
|
933 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 6 | Sgpt Alamine Aminotransferase [numeric] |
|
604 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 7 | Sgot Aspartate Aminotransferase [numeric] |
|
561 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 8 | Total Protiens [numeric] |
|
415 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 9 | ALB Albumin [numeric] |
|
438 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 10 | A/G Ratio Albumin and Globulin Ratio [numeric] |
|
484 distinct values | 30691 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 11 | Result [numeric] |
|
|
30691 (100.0%) | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.4.1)
2025-01-07
j <- which(is.na(data$`Gender of the patient`) == TRUE)
# is.na(data$`Gender of the patient`): Checks each value in the "Gender of the patient" column.
# Returns TRUE for missing values (NA) and FALSE otherwise.
# which(... == TRUE): Finds the indices (row numbers) where the value is missing (TRUE).
# j: This variable stores the row indices of missing values for easier reference later.
data <- data[-j,]
# data[-j,]: Removes rows from the dataset where the row indices match those in `j`.
# The minus sign (-j) tells R to exclude these rows.
# After this step, the dataset will no longer have any rows with missing "Gender of the patient" values.
# This step ensures that analyses requiring this column won't fail due to missing data.
# The dataset has been cleaned by removing rows with missing values in the "Gender of the patient" column.
# Now, we generate an updated summary of the dataset to review its current structure and statistics.
summary <- dfSummary(data)
print(summary, method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Age of the patient [numeric] |
|
78 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 2 | Gender of the patient [character] |
|
|
29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 3 | Total Bilirubin [numeric] |
|
459 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 4 | Direct Bilirubin [numeric] |
|
471 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 5 | Alkphos Alkaline Phosphotase [numeric] |
|
871 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 6 | Sgpt Alamine Aminotransferase [numeric] |
|
554 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 7 | Sgot Aspartate Aminotransferase [numeric] |
|
517 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 8 | Total Protiens [numeric] |
|
382 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 9 | ALB Albumin [numeric] |
|
403 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 10 | A/G Ratio Albumin and Globulin Ratio [numeric] |
|
449 distinct values | 29789 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 11 | Result [numeric] |
|
|
29789 (100.0%) | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.4.1)
2025-01-07
cat("The summary table has been updated")
## The summary table has been updated
# Use robust Mahalanobis distance to detect multivariate outliers
robust_dist <- covMcd(data[,-c(2,11)]) # Compute robust covariance and center for columns excluding 2 and 11 (gender and results)
threshold <- qchisq(0.99, df = ncol(data[,-c(2,11)])) # Chi-squared threshold for 99% confidence level
# Identify multivariate outliers based on robust distances
robust_outliers <- mahalanobis(data[,-c(2,11)], robust_dist$center, robust_dist$cov) > threshold
table(robust_outliers)
## robust_outliers
## FALSE TRUE
## 15034 14755
robust_clean_data <- data[!robust_outliers, ]
dim(robust_clean_data)
## [1] 15034 11
summary <- dfSummary(robust_clean_data)
print(summary, method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing | |||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Age of the patient [numeric] |
|
75 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 2 | Gender of the patient [character] |
|
|
15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 3 | Total Bilirubin [numeric] |
|
159 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 4 | Direct Bilirubin [numeric] |
|
189 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 5 | Alkphos Alkaline Phosphotase [numeric] |
|
450 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 6 | Sgpt Alamine Aminotransferase [numeric] |
|
262 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 7 | Sgot Aspartate Aminotransferase [numeric] |
|
242 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 8 | Total Protiens [numeric] |
|
164 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 9 | ALB Albumin [numeric] |
|
170 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 10 | A/G Ratio Albumin and Globulin Ratio [numeric] |
|
209 distinct values | 15034 (100.0%) | 0 (0.0%) | ||||||||||||||||
| 11 | Result [numeric] |
|
|
15034 (100.0%) | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.4.1)
2025-01-07
write.csv(robust_clean_data, file = "cleaned_data.csv", row.names = FALSE)
# robust_clean_data: This is the cleaned data you want to save.
# file: The name of the output CSV file.
# row.names = FALSE: Ensures that row numbers are not added as a separate column in the CSV file.
data<-robust_clean_data
data <- data %>% clean_names()
data$age_cat <- cut(data$age_of_the_patient,
breaks = c(-Inf, 18, 40, 65, Inf), # Define the age group boundaries
labels = c("Youth (<18)", "Adult (18-40)", "Middle-aged (40-65)", "Senior (>65)")) # Assign group labels
data$total_bilirubin_cat <- cut(data$total_bilirubin,
breaks = c(-Inf, 1.2, Inf), # Threshold for 'Normal' and 'Elevated'
labels = c("Normal", "Elevated"))
data$direct_bilirubin_cat <- cut(data$direct_bilirubin,
breaks = c(-Inf, 0.3, Inf),
labels = c("Normal", "Elevated"))
data$alkphos_cat <- cut(data$alkphos_alkaline_phosphotase,
breaks = c(-Inf, 44, 147, Inf),
labels = c("Low", "Normal", "High"))
data$sgpt_cat <- cut(data$sgpt_alamine_aminotransferase,
breaks = c(-Inf, 45, Inf),
labels = c("Normal", "Elevated"))
data$sgot_cat <- cut(data$sgot_aspartate_aminotransferase,
breaks = c(-Inf, 40, Inf),
labels = c("Normal", "Elevated"))
data$total_protiens_cat <- cut(data$total_protiens,
breaks = c(-Inf, 6.4, 8.3, Inf),
labels = c("Low", "Normal", "High"))
data$alb_cat <- cut(data$alb_albumin,
breaks = c(-Inf, 3.5, 5.5, Inf),
labels = c("Low", "Normal", "High"))
data$a_g_ratio_cat <- cut(data$a_g_ratio_albumin_and_globulin_ratio,
breaks = c(-Inf, 1.2, 2.2, Inf),
labels = c("Low", "Normal", "High"))
data_categorized <- data.frame(
Age = data$age_cat,
Gender = data$gender_of_the_patient,
Total_Bilirubin = data$total_bilirubin_cat,
Direct_Bilirubin = data$direct_bilirubin_cat,
Alkaline_Phosphatase = data$alkphos_cat,
Alanine_Aminotransferase = data$sgpt_cat,
Aspartate_Aminotransferase = data$sgot_cat,
Total_Proteins = data$total_protiens_cat,
Albumin = data$alb_cat,
AG_Ratio = data$a_g_ratio_cat,
Result = as.factor(data$result) # Convert 'Result' to a factor for categorical analysis
)
data_categorized$Gender <- as.factor(data_categorized$Gender)
data_categorized <- droplevels(data_categorized)
summary(data_categorized)
## Age Gender Total_Bilirubin Direct_Bilirubin
## Youth (<18) : 831 Female: 4012 Normal :13232 Normal :12193
## Adult (18-40) :5506 Male :11022 Elevated: 1802 Elevated: 2841
## Middle-aged (40-65):7242
## Senior (>65) :1455
## Alkaline_Phosphatase Alanine_Aminotransferase Aspartate_Aminotransferase
## Normal: 1876 Normal :13092 Normal :11133
## High :13158 Elevated: 1942 Elevated: 3901
##
##
## Total_Proteins Albumin AG_Ratio Result
## Low :6653 Low :8690 Low :12379 1:8423
## Normal:7821 Normal:6344 Normal: 2655 2:6611
## High : 560
##
library(FactoMineR)
library(factoextra)
# PCA requires numeric (quantitative) variables.
library(readr)
data <- read_csv("cleaned_data.csv")
summary(data)
## Age of the patient Gender of the patient Total Bilirubin Direct Bilirubin
## Min. : 4.00 Length:15034 Min. :0.4598 Min. :0.1000
## 1st Qu.:33.00 Class :character 1st Qu.:0.7000 1st Qu.:0.2000
## Median :45.00 Mode :character Median :0.8000 Median :0.2000
## Mean :44.07 Mean :0.8991 Mean :0.2704
## 3rd Qu.:55.00 3rd Qu.:1.0000 3rd Qu.:0.3000
## Max. :90.00 Max. :2.2000 Max. :1.0000
## Alkphos Alkaline Phosphotase Sgpt Alamine Aminotransferase
## Min. : 63.0 Min. :10.00
## 1st Qu.:163.0 1st Qu.:20.00
## Median :188.0 Median :26.00
## Mean :197.1 Mean :29.21
## 3rd Qu.:215.0 3rd Qu.:36.00
## Max. :418.0 Max. :72.00
## Sgot Aspartate Aminotransferase Total Protiens ALB Albumin
## Min. :10.00 Min. :3.6 Min. :0.900
## 1st Qu.:21.00 1st Qu.:6.0 1st Qu.:2.900
## Median :28.03 Median :6.7 Median :3.400
## Mean :32.41 Mean :6.6 Mean :3.358
## 3rd Qu.:41.00 3rd Qu.:7.3 3rd Qu.:4.000
## Max. :95.00 Max. :9.2 Max. :5.500
## A/G Ratio Albumin and Globulin Ratio Result
## Min. :0.300 Min. :1.00
## 1st Qu.:0.880 1st Qu.:1.00
## Median :1.000 Median :1.00
## Mean :1.021 Mean :1.44
## 3rd Qu.:1.200 3rd Qu.:2.00
## Max. :1.800 Max. :2.00
pca_res <- PCA(data, quali.sup = c(2, 11), scale.unit = TRUE, graph = FALSE)
# scale.unit = TRUE in PCA to standardize your variables (subtract mean and divide by standard deviation
# (quali.sup) Some variables (like categorical ones) may not contribute to PCA but can provide additional insights. These are marked as "supplementary."
pca_res # Show how much variance each principal component explains.
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 15034 individuals, described by 11 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$quali.sup" "results for the supplementary categorical variables"
## 12 "$quali.sup$coord" "coord. for the supplementary categories"
## 13 "$quali.sup$v.test" "v-test of the supplementary categories"
## 14 "$call" "summary statistics"
## 15 "$call$centre" "mean of the variables"
## 16 "$call$ecart.type" "standard error of the variables"
## 17 "$call$row.w" "weights for the individuals"
## 18 "$call$col.w" "weights for the variables"
fviz_screeplot(pca_res, addlabels = TRUE) #Visualizes the variance explained by each principal component.
# (fviz_pca_var): Highlights which variables contribute most to the components.
fviz_pca_var(pca_res, repel = TRUE) # Variable contributions
## 4.5 Extract and display eigenvalues to understand the variance
captured by each component
pca_res$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.490702555 27.67447284 27.67447
## comp 2 2.113850233 23.48722481 51.16170
## comp 3 1.318682351 14.65202612 65.81372
## comp 4 1.023384805 11.37094228 77.18467
## comp 5 0.990938100 11.01042334 88.19509
## comp 6 0.588098669 6.53442965 94.72952
## comp 7 0.410417994 4.56019993 99.28972
## comp 8 0.057601672 0.64001858 99.92974
## comp 9 0.006323621 0.07026246 100.00000
library(readr)
data <- read_csv("cleaned_data.csv")
data$Result <- as.factor(data$Result) # Convert to factor
# The `glm()` function fits a generalized linear model.
# `result ~ .` means using all predictors to predict the `result` variable.
# `family = binomial` specifies that this is a logistic regression model.
full_model <- glm(Result ~ ., data = data, family = binomial)
# The `step()` function eliminates predictors based on their AIC (Akaike Information Criterion) value.
# `direction = "backward"` starts with all predictors and removes the least significant ones iteratively.
# AIC measures the goodness of fit of the model; lower values indicate better models.
stepwise_model <- step(full_model, direction = "backward")
## Start: AIC=20021.44
## Result ~ `Age of the patient` + `Gender of the patient` + `Total Bilirubin` +
## `Direct Bilirubin` + ` Alkphos Alkaline Phosphotase` + ` Sgpt Alamine Aminotransferase` +
## `Sgot Aspartate Aminotransferase` + `Total Protiens` + ` ALB Albumin` +
## `A/G Ratio Albumin and Globulin Ratio`
##
## Df Deviance AIC
## - `Age of the patient` 1 19999 20019
## - `Gender of the patient` 1 20000 20020
## - `Sgot Aspartate Aminotransferase` 1 20001 20021
## <none> 19999 20021
## - `Direct Bilirubin` 1 20027 20047
## - ` Alkphos Alkaline Phosphotase` 1 20034 20054
## - ` Sgpt Alamine Aminotransferase` 1 20042 20062
## - `Total Bilirubin` 1 20059 20079
## - `A/G Ratio Albumin and Globulin Ratio` 1 20276 20296
## - `Total Protiens` 1 20324 20344
## - ` ALB Albumin` 1 20328 20348
##
## Step: AIC=20019.44
## Result ~ `Gender of the patient` + `Total Bilirubin` + `Direct Bilirubin` +
## ` Alkphos Alkaline Phosphotase` + ` Sgpt Alamine Aminotransferase` +
## `Sgot Aspartate Aminotransferase` + `Total Protiens` + ` ALB Albumin` +
## `A/G Ratio Albumin and Globulin Ratio`
##
## Df Deviance AIC
## - `Gender of the patient` 1 20000 20018
## - `Sgot Aspartate Aminotransferase` 1 20001 20019
## <none> 19999 20019
## - `Direct Bilirubin` 1 20027 20045
## - ` Alkphos Alkaline Phosphotase` 1 20034 20052
## - ` Sgpt Alamine Aminotransferase` 1 20042 20060
## - `Total Bilirubin` 1 20059 20077
## - `A/G Ratio Albumin and Globulin Ratio` 1 20276 20294
## - `Total Protiens` 1 20324 20342
## - ` ALB Albumin` 1 20328 20346
##
## Step: AIC=20018.45
## Result ~ `Total Bilirubin` + `Direct Bilirubin` + ` Alkphos Alkaline Phosphotase` +
## ` Sgpt Alamine Aminotransferase` + `Sgot Aspartate Aminotransferase` +
## `Total Protiens` + ` ALB Albumin` + `A/G Ratio Albumin and Globulin Ratio`
##
## Df Deviance AIC
## - `Sgot Aspartate Aminotransferase` 1 20002 20018
## <none> 20000 20018
## - `Direct Bilirubin` 1 20028 20044
## - ` Alkphos Alkaline Phosphotase` 1 20036 20052
## - ` Sgpt Alamine Aminotransferase` 1 20043 20059
## - `Total Bilirubin` 1 20060 20076
## - `A/G Ratio Albumin and Globulin Ratio` 1 20277 20293
## - `Total Protiens` 1 20324 20340
## - ` ALB Albumin` 1 20329 20345
##
## Step: AIC=20018.25
## Result ~ `Total Bilirubin` + `Direct Bilirubin` + ` Alkphos Alkaline Phosphotase` +
## ` Sgpt Alamine Aminotransferase` + `Total Protiens` + ` ALB Albumin` +
## `A/G Ratio Albumin and Globulin Ratio`
##
## Df Deviance AIC
## <none> 20002 20018
## - `Direct Bilirubin` 1 20028 20042
## - ` Alkphos Alkaline Phosphotase` 1 20039 20053
## - `Total Bilirubin` 1 20061 20075
## - ` Sgpt Alamine Aminotransferase` 1 20079 20093
## - `A/G Ratio Albumin and Globulin Ratio` 1 20278 20292
## - `Total Protiens` 1 20326 20340
## - ` ALB Albumin` 1 20331 20345
# `summary()` provides the coefficients of the selected model and their statistical significance.
# Look at the `Pr(>|z|)` column to see which predictors are significant:
# Values < 0.05 indicate statistically significant predictors.
# The model's residual deviance and AIC are also useful for assessing overall fit.
summary(stepwise_model)
##
## Call:
## glm(formula = Result ~ `Total Bilirubin` + `Direct Bilirubin` +
## ` Alkphos Alkaline Phosphotase` + ` Sgpt Alamine Aminotransferase` +
## `Total Protiens` + ` ALB Albumin` + `A/G Ratio Albumin and Globulin Ratio`,
## family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.7816537 0.3630895 18.678 < 2e-16
## `Total Bilirubin` -1.2440321 0.1635907 -7.605 2.86e-14
## `Direct Bilirubin` 1.4184726 0.2771691 5.118 3.09e-07
## ` Alkphos Alkaline Phosphotase` -0.0019375 0.0003222 -6.013 1.82e-09
## ` Sgpt Alamine Aminotransferase` -0.0126591 0.0014520 -8.719 < 2e-16
## `Total Protiens` -2.0064790 0.1139280 -17.612 < 2e-16
## ` ALB Albumin` 3.9617879 0.2232686 17.744 < 2e-16
## `A/G Ratio Albumin and Globulin Ratio` -5.4919388 0.3371597 -16.289 < 2e-16
##
## (Intercept) ***
## `Total Bilirubin` ***
## `Direct Bilirubin` ***
## ` Alkphos Alkaline Phosphotase` ***
## ` Sgpt Alamine Aminotransferase` ***
## `Total Protiens` ***
## ` ALB Albumin` ***
## `A/G Ratio Albumin and Globulin Ratio` ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20623 on 15033 degrees of freedom
## Residual deviance: 20002 on 15026 degrees of freedom
## AIC: 20018
##
## Number of Fisher Scoring iterations: 4
library(readr)
data <- read_csv("cleaned_data.csv")
data$Result <- as.factor(data$Result)
set.seed(123)
# Uses all variables in dataset to predict 'result'
model <- glm(Result ~ ., data = data, family = binomial)
initial_probs <- fitted(model) # fitted(model) gets probability predictions
initial_preds <- ifelse(initial_probs > 0.5, 2, 1) # ifelse() converts probabilities to class labels (1 or 2)
print("Confusion Matrix with 0.5 threshold:")
## [1] "Confusion Matrix with 0.5 threshold:"
initial_conf_matrix <- table(initial_preds, data$Result) # table() compares predicted vs actual classes
# Shows how many predictions were correct/incorrect
print(initial_conf_matrix)
##
## initial_preds 1 2
## 1 6447 3986
## 2 1976 2625
library(pROC) # Load the `pROC` package to compute and visualize the ROC curve.
# Creates ROC curve comparing true vs false positive rates
# Higher AUC (Area Under Curve) means better model
roc_obj <- roc(data$Result, initial_probs)
plot(roc_obj, col = "blue", lwd = 2,
main = paste("ROC Curve (AUC =", round(auc(roc_obj), 3), ")"))
# coords() finds threshold that best balances sensitivity/specificity
# "best" method uses Youden's index
best_threshold <- coords(roc_obj, "best", ret = "threshold",
best.method = "youden")
print(paste("Best threshold:", round(best_threshold, 3)))
## [1] "Best threshold: 0.462"
initial_probs <- fitted(model) # fitted(model) gets probability predictions from the logistic regression model
best_threshold <- best_threshold[1,1] #best_threshold is a matrix containing the optimal threshold determined earlier, likely from an ROC curve analysis.
optimized_preds <- ifelse(initial_probs > best_threshold, 2, 1)
print("Confusion Matrix with optimal threshold:")
## [1] "Confusion Matrix with optimal threshold:"
optimized_conf_matrix <- table(optimized_preds, data$Result)
print(optimized_conf_matrix)
##
## optimized_preds 1 2
## 1 5499 2967
## 2 2924 3644
# `rpart` is a library used for recursive partitioning like building decision trees.
library(rpart)
library(readr)
data <- read_csv("cleaned_data.csv")
data$Result <- as.factor(data$Result)
model <- rpart(Result ~ ., data = data, method = "class")
# `result ~ .` means using all predictors in the dataset to predict the `result` variable.
# `data = data` specifies the dataset.
# `method = "class"` indicates that this is a classification tree.
# `rpart.plot` is a library used for plotting decision trees in an interpretable manner.
library(rpart.plot)
# rpart.plot(model) creates a visual representation of the decision tree.
# This plot shows how the data is split based on different predictors at each node.
# Leaf nodes represent the final predictions
rpart.plot(model)
# `predict()` generates predictions for the `result` variable based on the model.
# `newdata = data` specifies that predictions are being made on the same dataset used to train the model.
# `type = "class"` returns the predicted class labels.
predictions <- predict(model, newdata = data, type = "class")
# Load the `caret` library, which provides tools for model evaluation.
library(caret)
# Compare actual (`data$result`) and predicted (`predictions`) classes.
# `confusionMatrix()` computes metrics like accuracy, precision, recall, and F1 score.
confusionMatrix <- confusionMatrix(as.factor(predictions), as.factor(data$Result))
print(confusionMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 6870 1850
## 2 1553 4761
##
## Accuracy : 0.7736
## 95% CI : (0.7669, 0.7803)
## No Information Rate : 0.5603
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5384
##
## Mcnemar's Test P-Value : 3.893e-07
##
## Sensitivity : 0.8156
## Specificity : 0.7202
## Pos Pred Value : 0.7878
## Neg Pred Value : 0.7540
## Prevalence : 0.5603
## Detection Rate : 0.4570
## Detection Prevalence : 0.5800
## Balanced Accuracy : 0.7679
##
## 'Positive' Class : 1
##
# `glm()` fits a logistic regression model with `result` as the outcome and all other predictors as inputs.
# `family = binomial` specifies that this is a logistic regression for binary outcomes.
library(readr)
data <- read_csv("cleaned_data.csv")
data$Result <- as.factor(data$Result)
set.seed(123)
model1 <- glm(Result ~ ., data = data, family = binomial)
# `predict()` returns predicted probabilities for each observation.
# `type = "response"` specifies that probabilities (not log-odds) are returned.
predictions1 <- predict(model1, newdata = data, type = "response")
# A threshold of 0.5 is used for classification:
# If the predicted probability is greater than 0.5, classify as "2" (positive class).
# Otherwise, classify as "1" (negative class).
# Ensure the predicted classes align with the actual levels of `data$result`.
predicted_classes1 <- ifelse(predictions1 > 0.5, "2", "1")
# Compare the predicted class labels with the actual class labels.
# `positive = "1"` specifies that the positive class is labeled as "1".
# Ensure data$result is a factor
data$result <- as.factor(data$Result)
# Check and align levels for both variables
levels(predicted_classes1) <- levels(data$result)
# Compute the confusion matrix
confusionMatrix1 <- confusionMatrix(as.factor(predicted_classes1), as.factor(data$result), positive = "1")
# Print the confusion matrix
print(confusionMatrix1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 6447 3986
## 2 1976 2625
##
## Accuracy : 0.6034
## 95% CI : (0.5956, 0.6113)
## No Information Rate : 0.5603
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.168
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7654
## Specificity : 0.3971
## Pos Pred Value : 0.6179
## Neg Pred Value : 0.5705
## Prevalence : 0.5603
## Detection Rate : 0.4288
## Detection Prevalence : 0.6940
## Balanced Accuracy : 0.5812
##
## 'Positive' Class : 1
##