Import data
# Set the working directory to the folder where the "lung.csv" file is located
# setwd("path/to/folder")
# Import the "lung.csv" file into R as a data frame
lung_data <- read.csv("lung.csv")
# View the first few rows of the data frame to verify that the data has been imported correctly
head(lung_data)
## Patient.ID Age Sex Stage Smoking.Status Molecular.Stratification
## 1 1 59 M III Current smoker EGFR
## 2 2 62 F I 21 ALK
## 3 3 46 M IV Never smoker KRAS
## 4 4 NA F III Current smoker EGFR
## 5 5 55 M II Former smoker KRAS
## 6 6 64 F III Current smoker ALK
## ICDO.Histologic.Type Surgery.Done Volume.of.Tumor Vital.Status
## 1 Adenocarcinoma Yes 45 Dead
## 2 Squamous cell Yes 20 Alive
## 3 Small cell No 75 Dead
## 4 Large cell Yes 60 Dead
## 5 Adenocarcinoma Yes 30 alive
## 6 Squamous cell 40 Dead
## Duration.of.Follow.Up Relapse Duration.to.Relapse.Last.Follow.Up
## 1 30 Yes 8
## 2 20 No 20
## 3 24 Yes 9
## 4 17 Yes 14
## 5 22 No 22
## 6 23 Yes 12
Show structure of data
# Check the structure of the "lung_data" data frame
str(lung_data)
## 'data.frame': 49 obs. of 13 variables:
## $ Patient.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 59 62 46 NA 55 64 51 69 57 63 ...
## $ Sex : chr "M" "F" "M" "F" ...
## $ Stage : chr "III" "I" "IV" "III" ...
## $ Smoking.Status : chr "Current smoker" "21" "Never smoker" "Current smoker" ...
## $ Molecular.Stratification : chr "EGFR " "ALK " "KRAS " "EGFR " ...
## $ ICDO.Histologic.Type : chr "Adenocarcinoma" "Squamous cell" "Small cell" "Large cell" ...
## $ Surgery.Done : chr "Yes" "Yes" "No" "Yes" ...
## $ Volume.of.Tumor : int 45 20 75 60 30 40 50 80 35 55 ...
## $ Vital.Status : chr "Dead" "Alive" "Dead" "Dead" ...
## $ Duration.of.Follow.Up : int 30 20 24 17 22 23 19 8 26 20 ...
## $ Relapse : chr "Yes" "No" "Yes" "Yes" ...
## $ Duration.to.Relapse.Last.Follow.Up: int 8 20 9 14 22 12 8 6 22 15 ...
Data Cleaning
library(tidyverse)
# First, we remove the Patient.ID column as it does not provide useful information.
lung_data_clean <- select(lung_data, -Patient.ID)
# Replace outliers in Age with NA
lung_data_clean$Age[which(lung_data_clean$Age < 18 | lung_data_clean$Age > 120)] <- NA
# Replace outliers in Volume with NA
lung_data_clean$Volume.of.Tumor[which(lung_data_clean$Volume.of.Tumor > 50)] <- NA
# Replace NA values in Sex with "M"
lung_data_clean$Sex[is.na(lung_data_clean$Sex)] <- "M"
# Replace "21" in Smoking.Status with "Current smoker"
lung_data_clean$Smoking.Status[which(lung_data_clean$Smoking.Status == "21")] <- "Current smoker"
# Next, we remove any rows with missing Age values.
lung_data_clean <- filter(lung_data_clean, !is.na(Age))
# We can see that there is an unusual value "21" in the Smoking.Status column, which should be "Ex-smoker".
# We will replace this value and convert the column to a factor.
lung_data_clean <- mutate(lung_data_clean,
Smoking.Status = ifelse(Smoking.Status == "21", "Ex-smoker", Smoking.Status)) %>%
mutate(Smoking.Status = factor(Smoking.Status,
levels = c("Never smoker", "Ex-smoker", "Current smoker")))
# We can see that there is a trailing space in the Molecular.Stratification column that needs to be removed.
lung_data_clean$Molecular.Stratification <- str_trim(lung_data_clean$Molecular.Stratification)
# We can also see that the ICDO.Histologic.Type column should be a factor with specific levels.
# We will convert this column to a factor and relabel the levels accordingly.
lung_data_clean <- mutate(lung_data_clean,
ICDO.Histologic.Type = factor(ICDO.Histologic.Type,
levels = c("Adenocarcinoma", "Squamous cell", "Large cell", "Small cell")))
# Finally, we can remove the Duration.to.Relapse.Last.Follow.Up column as it is redundant with the Relapse column.
lung_data_clean <- select(lung_data_clean, -Duration.to.Relapse.Last.Follow.Up)
Explarotory data analysis
library(ggplot2)
# Create histogram of Age
ggplot(lung_data_clean, aes(x = Age)) +
geom_histogram() +
labs(x = "Age", y = "Count", title = "Distribution of Age")

# Create boxplot of Volume.of.Tumor by Stage
ggplot(lung_data_clean, aes(x = Stage, y = Volume.of.Tumor)) +
geom_boxplot() +
labs(x = "Stage", y = "Volume of Tumor", title = "Volume of Tumor by Stage")

# Create scatterplot of Age and Duration.of.Follow.Up
ggplot(lung_data_clean, aes(x = Age, y = Duration.of.Follow.Up)) +
geom_point() +
labs(x = "Age", y = "Duration of Follow-Up", title = "Age and Duration of Follow-Up")

# Scatter plot of age vs volume of tumor, colored by sex
ggplot(lung_data_clean, aes(x = Age, y = Volume.of.Tumor, color = Sex)) +
geom_point()

# Density plot of age, with separate curves for each smoking status
ggplot(lung_data_clean, aes(x = Age, fill = Smoking.Status)) +
geom_density(alpha = 0.5)

# Bar plot of stage frequency, sorted by frequency
ggplot(lung_data_clean, aes(x = Stage)) +
geom_bar() +
scale_x_discrete(limits = c("I", "II", "III", "IV"))

Scatterplot with regression line
library(ggplot2)
# Create scatterplot of Age and Volume.of.Tumor, with points colored by ICDO.Histologic.Type
plot <- ggplot(lung_data_clean, aes(x = Age, y = Volume.of.Tumor, color = ICDO.Histologic.Type)) +
geom_point() +
labs(x = "Age", y = "Volume of Tumor", color = "Histology") +
theme(legend.position = "bottom") # move legend to bottom of plot
# Add a regression line to the plot
reg_line <- geom_smooth(method = "lm", se = FALSE, color = "black")
plot <- plot + reg_line
# Get regression model summary
model_summary <- summary(lm(Volume.of.Tumor ~ Age, data = lung_data_clean))
# Extract p-value from model summary
p_value <- format(model_summary$coefficients[2, 4], digits = 3)
# Add p-value to the top-right corner of the plot
plot <- plot + annotate("text", x = Inf, y = Inf, hjust = 1, vjust = 1,
label = paste0("p = ", p_value), size = 5)
# Show the plot
plot

KM
library(survival)
library(survminer)
# subset data for only males and females
lung_data_gender <- subset(lung_data_clean, !is.na(Sex))
# create survival object
surv_obj <- Surv(lung_data_gender$Duration.of.Follow.Up, lung_data_gender$Relapse=="Yes")
# fit KM model by gender
km_model <- survfit(surv_obj ~ Sex, data = lung_data_gender)
# plot KM curves
ggsurvplot(km_model, data = lung_data_gender, palette = c("#FF69B4", "#00BFFF"), conf.int = FALSE,
risk.table = TRUE, pval = TRUE, pval.coord = c(0, 0.1))

table 1
library(gtsummary)
library(dplyr)
# Create table 1
tbl_summary(lung_data_clean,
type = list(Sex ~ "categorical",
Stage ~ "categorical",
Smoking.Status ~ "categorical",
Molecular.Stratification ~ "categorical"),
statistic = list(all_continuous() ~ "{mean} ({sd})"),
missing = "no"
)
| Characteristic |
N = 47 |
| Age |
59 (10) |
| Sex |
|
| F |
23 (49%) |
| M |
24 (51%) |
| Stage |
|
| I |
13 (28%) |
| II |
8 (17%) |
| III |
15 (32%) |
| IV |
11 (23%) |
| Smoking.Status |
|
| Never smoker |
13 (36%) |
| Ex-smoker |
0 (0%) |
| Current smoker |
23 (64%) |
| Molecular.Stratification |
|
| ALK |
14 (30%) |
| EGFR |
16 (34%) |
| KRAS |
17 (36%) |
| ICDO.Histologic.Type |
|
| Adenocarcinoma |
13 (28%) |
| Squamous cell |
12 (26%) |
| Large cell |
10 (21%) |
| Small cell |
12 (26%) |
| Surgery.Done |
|
| |
1 (2.1%) |
| No |
23 (49%) |
| Yes |
23 (49%) |
| Volume.of.Tumor |
|
| 20 |
4 (13%) |
| 25 |
4 (13%) |
| 30 |
7 (22%) |
| 35 |
4 (13%) |
| 40 |
4 (13%) |
| 45 |
4 (13%) |
| 50 |
5 (16%) |
| Vital.Status |
|
| alive |
1 (2.1%) |
| Alive |
12 (26%) |
| Dead |
33 (70%) |
| Unknown |
1 (2.1%) |
| Duration.of.Follow.Up |
18 (8) |
| Relapse |
40 (85%) |
table 2
# Filter the data to include only male and female sex
lung_data_clean_filtered <- lung_data_clean %>%
filter(Sex %in% c("M", "F"))
# Create Table 1 comparing sex of patients using the tbl_summary() function
table2 <- lung_data_clean_filtered %>%
tbl_summary(
by = Sex,
missing = "no",
label = list(
Age ~ "Age",
Stage ~ "Stage",
Smoking.Status ~ "Smoking Status",
Molecular.Stratification ~ "Molecular Stratification",
ICDO.Histologic.Type ~ "ICDO Histologic Type",
Surgery.Done ~ "Surgery Done",
Volume.of.Tumor ~ "Volume of Tumor",
Vital.Status ~ "Vital Status",
Duration.of.Follow.Up ~ "Duration of Follow Up",
Relapse ~ "Relapse"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
)
) %>%
add_overall(col_label = "Overall") %>%
add_p()
# Print Table 2
table2
| Characteristic |
Overall, N = 47 |
F, N = 23 |
M, N = 24 |
p-value |
| Age |
59 (10) |
54 (7) |
64 (9) |
<0.001 |
| Stage |
|
|
|
<0.001 |
| I |
13 (28%) |
11 (48%) |
2 (8.3%) |
|
| II |
8 (17%) |
6 (26%) |
2 (8.3%) |
|
| III |
15 (32%) |
4 (17%) |
11 (46%) |
|
| IV |
11 (23%) |
2 (8.7%) |
9 (38%) |
|
| Smoking Status |
|
|
|
0.002 |
| Never smoker |
13 (36%) |
10 (67%) |
3 (14%) |
|
| Ex-smoker |
0 (0%) |
0 (0%) |
0 (0%) |
|
| Current smoker |
23 (64%) |
5 (33%) |
18 (86%) |
|
| Molecular Stratification |
|
|
|
0.4 |
| ALK |
14 (30%) |
8 (35%) |
6 (25%) |
|
| EGFR |
16 (34%) |
9 (39%) |
7 (29%) |
|
| KRAS |
17 (36%) |
6 (26%) |
11 (46%) |
|
| ICDO Histologic Type |
|
|
|
0.078 |
| Adenocarcinoma |
13 (28%) |
7 (30%) |
6 (25%) |
|
| Squamous cell |
12 (26%) |
5 (22%) |
7 (29%) |
|
| Large cell |
10 (21%) |
2 (8.7%) |
8 (33%) |
|
| Small cell |
12 (26%) |
9 (39%) |
3 (13%) |
|
| Surgery Done |
|
|
|
0.7 |
| |
1 (2.1%) |
1 (4.3%) |
0 (0%) |
|
| No |
23 (49%) |
12 (52%) |
11 (46%) |
|
| Yes |
23 (49%) |
10 (43%) |
13 (54%) |
|
| Volume of Tumor |
|
|
|
0.009 |
| 20 |
4 (13%) |
4 (20%) |
0 (0%) |
|
| 25 |
4 (13%) |
3 (15%) |
1 (8.3%) |
|
| 30 |
7 (22%) |
6 (30%) |
1 (8.3%) |
|
| 35 |
4 (13%) |
3 (15%) |
1 (8.3%) |
|
| 40 |
4 (13%) |
3 (15%) |
1 (8.3%) |
|
| 45 |
4 (13%) |
1 (5.0%) |
3 (25%) |
|
| 50 |
5 (16%) |
0 (0%) |
5 (42%) |
|
| Vital Status |
|
|
|
<0.001 |
| alive |
1 (2.1%) |
0 (0%) |
1 (4.2%) |
|
| Alive |
12 (26%) |
11 (48%) |
1 (4.2%) |
|
| Dead |
33 (70%) |
11 (48%) |
22 (92%) |
|
| Unknown |
1 (2.1%) |
1 (4.3%) |
0 (0%) |
|
| Duration of Follow Up |
18 (8) |
17 (8) |
18 (8) |
>0.9 |
| Relapse |
40 (85%) |
17 (74%) |
23 (96%) |
0.048 |
Use AI for prediction (tidymodels to create a logistic regression
and a decision tree model and compare their accuracy)
# Load libraries
library(tidymodels) # For modeling
library(tidyverse) # For data manipulation
# Prepare the data
lung_data_clean <- lung_data_clean %>%
mutate(
# Convert categorical variables to factors
Sex = factor(Sex),
Stage = factor(Stage),
Smoking.Status = factor(Smoking.Status),
Molecular.Stratification = factor(Molecular.Stratification),
ICDO.Histologic.Type = factor(ICDO.Histologic.Type),
Surgery.Done = factor(Surgery.Done),
# Handle missing values in 'Age' by imputing with the mean age
Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age),
# Create binary outcome variable for 1-year mortality
mortality_1yr = factor(if_else(Duration.of.Follow.Up <= 12 & Vital.Status == "Dead", 1, 0))
)
# Split the data into training and test sets
set.seed(42) # Set seed for reproducibility
data_split <- initial_split(lung_data_clean, prop = 0.7)
train_data <- training(data_split)
test_data <- testing(data_split)
# Logistic Regression Model
# Create a recipe for data preprocessing
logistic_recipe <- recipe(mortality_1yr ~ Age + Sex + Stage + Smoking.Status + Molecular.Stratification + ICDO.Histologic.Type + Surgery.Done + Volume.of.Tumor, data = train_data) %>%
# Create dummy variables for categorical predictors
step_dummy(all_nominal(), -all_outcomes())
# Specify the logistic regression model
logistic_spec <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
# Combine preprocessing and model into a workflow
logistic_workflow <- workflow() %>%
add_recipe(logistic_recipe) %>%
add_model(logistic_spec)
# Fit the logistic regression model
logistic_fit <- fit(logistic_workflow, train_data)
# Decision Tree Model
# Create a recipe for data preprocessing (same as for logistic regression)
tree_recipe <- recipe(mortality_1yr ~ Age + Sex + Stage + Smoking.Status + Molecular.Stratification + ICDO.Histologic.Type + Surgery.Done + Volume.of.Tumor, data = train_data) %>%
# Create dummy variables for categorical predictors
step_dummy(all_nominal(), -all_outcomes())
# Specify the decision tree model
tree_spec <- decision_tree() %>%
set_engine("rpart") %>%
set_mode("classification")
# Combine preprocessing and model into a workflow (similar to logistic regression)
tree_workflow <- workflow() %>%
add_recipe(tree_recipe) %>%
add_model(tree_spec)
# Fit the decision tree model
tree_fit <- fit(tree_workflow, train_data)
# Compare the accuracy of the models
# Get predictions and calculate accuracy for the logistic regression model
logistic_results <- logistic_fit %>%
predict(test_data) %>%
bind_cols(test_data %>% select(mortality_1yr)) %>%
metrics(truth = mortality_1yr, estimate = .pred_class)
# Get predictions and calculate accuracy for the decision tree model
tree_results <- tree_fit %>%
predict(test_data) %>%
bind_cols(test_data %>% select(mortality_1yr)) %>%
metrics(truth = mortality_1yr, estimate = .pred_class)
# Print accuracy results
cat("Logistic Regression Accuracy:", logistic_results %>% filter(.metric == "accuracy") %>% pull(.estimate), "\n")
## Logistic Regression Accuracy: 0.7142857
cat("Decision Tree Accuracy:", tree_results %>% filter(.metric== "accuracy") %>% pull(.estimate), "\n")
## Decision Tree Accuracy: 0.7333333
# Compare the performance of the logistic regression and decision tree models
model_comparison <- logistic_results %>%
filter(.metric == "accuracy") %>%
rename(logistic_accuracy = .estimate) %>%
bind_cols(tree_results %>% filter(.metric == "accuracy") %>% rename(tree_accuracy = .estimate))
# Print the model comparison
cat("Model comparison:\n")
## Model comparison:
print(model_comparison)
## # A tibble: 1 × 6
## .metric...1 .estimator...2 logistic_accuracy .metric...4 .estimator...5
## <chr> <chr> <dbl> <chr> <chr>
## 1 accuracy binary 0.714 accuracy binary
## # ℹ 1 more variable: tree_accuracy <dbl>
Decision Tree Accuracy: 0.7142857 Logistic Regression Accuracy:
0.835665