Import data

# Set the working directory to the folder where the "lung.csv" file is located
# setwd("path/to/folder")

# Import the "lung.csv" file into R as a data frame
lung_data <- read.csv("lung.csv")

# View the first few rows of the data frame to verify that the data has been imported correctly
head(lung_data)
##   Patient.ID Age Sex Stage Smoking.Status Molecular.Stratification
## 1          1  59   M   III Current smoker                    EGFR 
## 2          2  62   F     I             21                     ALK 
## 3          3  46   M    IV   Never smoker                    KRAS 
## 4          4  NA   F   III Current smoker                    EGFR 
## 5          5  55   M    II  Former smoker                    KRAS 
## 6          6  64   F   III Current smoker                     ALK 
##   ICDO.Histologic.Type Surgery.Done Volume.of.Tumor Vital.Status
## 1       Adenocarcinoma          Yes              45         Dead
## 2        Squamous cell          Yes              20        Alive
## 3           Small cell           No              75         Dead
## 4           Large cell          Yes              60         Dead
## 5       Adenocarcinoma          Yes              30        alive
## 6        Squamous cell                           40         Dead
##   Duration.of.Follow.Up Relapse Duration.to.Relapse.Last.Follow.Up
## 1                    30     Yes                                  8
## 2                    20      No                                 20
## 3                    24     Yes                                  9
## 4                    17     Yes                                 14
## 5                    22      No                                 22
## 6                    23     Yes                                 12

Show structure of data

# Check the structure of the "lung_data" data frame
str(lung_data)
## 'data.frame':    49 obs. of  13 variables:
##  $ Patient.ID                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                               : int  59 62 46 NA 55 64 51 69 57 63 ...
##  $ Sex                               : chr  "M" "F" "M" "F" ...
##  $ Stage                             : chr  "III" "I" "IV" "III" ...
##  $ Smoking.Status                    : chr  "Current smoker" "21" "Never smoker" "Current smoker" ...
##  $ Molecular.Stratification          : chr  "EGFR " "ALK " "KRAS " "EGFR " ...
##  $ ICDO.Histologic.Type              : chr  "Adenocarcinoma" "Squamous cell" "Small cell" "Large cell" ...
##  $ Surgery.Done                      : chr  "Yes" "Yes" "No" "Yes" ...
##  $ Volume.of.Tumor                   : int  45 20 75 60 30 40 50 80 35 55 ...
##  $ Vital.Status                      : chr  "Dead" "Alive" "Dead" "Dead" ...
##  $ Duration.of.Follow.Up             : int  30 20 24 17 22 23 19 8 26 20 ...
##  $ Relapse                           : chr  "Yes" "No" "Yes" "Yes" ...
##  $ Duration.to.Relapse.Last.Follow.Up: int  8 20 9 14 22 12 8 6 22 15 ...

Data Cleaning

library(tidyverse)

# First, we remove the Patient.ID column as it does not provide useful information.
lung_data_clean <- select(lung_data, -Patient.ID)

# Replace outliers in Age with NA
lung_data_clean$Age[which(lung_data_clean$Age < 18 | lung_data_clean$Age > 120)] <- NA

# Replace outliers in Volume with NA
lung_data_clean$Volume.of.Tumor[which(lung_data_clean$Volume.of.Tumor > 50)] <- NA

# Replace NA values in Sex with "M"
lung_data_clean$Sex[is.na(lung_data_clean$Sex)] <- "M"

# Replace "21" in Smoking.Status with "Current smoker"
lung_data_clean$Smoking.Status[which(lung_data_clean$Smoking.Status == "21")] <- "Current smoker"



# Next, we remove any rows with missing Age values.
lung_data_clean <- filter(lung_data_clean, !is.na(Age))

# We can see that there is an unusual value "21" in the Smoking.Status column, which should be "Ex-smoker".
# We will replace this value and convert the column to a factor.
lung_data_clean <- mutate(lung_data_clean,
                          Smoking.Status = ifelse(Smoking.Status == "21", "Ex-smoker", Smoking.Status)) %>%
  mutate(Smoking.Status = factor(Smoking.Status,
                                 levels = c("Never smoker", "Ex-smoker", "Current smoker")))

# We can see that there is a trailing space in the Molecular.Stratification column that needs to be removed.
lung_data_clean$Molecular.Stratification <- str_trim(lung_data_clean$Molecular.Stratification)

# We can also see that the ICDO.Histologic.Type column should be a factor with specific levels.
# We will convert this column to a factor and relabel the levels accordingly.
lung_data_clean <- mutate(lung_data_clean,
                          ICDO.Histologic.Type = factor(ICDO.Histologic.Type,
                                                        levels = c("Adenocarcinoma", "Squamous cell", "Large cell", "Small cell")))


# Finally, we can remove the Duration.to.Relapse.Last.Follow.Up column as it is redundant with the Relapse column.
lung_data_clean <- select(lung_data_clean, -Duration.to.Relapse.Last.Follow.Up)

Explarotory data analysis

library(ggplot2)


# Create histogram of Age
ggplot(lung_data_clean, aes(x = Age)) + 
  geom_histogram() + 
  labs(x = "Age", y = "Count", title = "Distribution of Age")

# Create boxplot of Volume.of.Tumor by Stage
ggplot(lung_data_clean, aes(x = Stage, y = Volume.of.Tumor)) + 
  geom_boxplot() + 
  labs(x = "Stage", y = "Volume of Tumor", title = "Volume of Tumor by Stage")

# Create scatterplot of Age and Duration.of.Follow.Up
ggplot(lung_data_clean, aes(x = Age, y = Duration.of.Follow.Up)) + 
  geom_point() + 
  labs(x = "Age", y = "Duration of Follow-Up", title = "Age and Duration of Follow-Up")

# Scatter plot of age vs volume of tumor, colored by sex
ggplot(lung_data_clean, aes(x = Age, y = Volume.of.Tumor, color = Sex)) +
  geom_point()

# Density plot of age, with separate curves for each smoking status
ggplot(lung_data_clean, aes(x = Age, fill = Smoking.Status)) +
  geom_density(alpha = 0.5)

# Bar plot of stage frequency, sorted by frequency
ggplot(lung_data_clean, aes(x = Stage)) +
  geom_bar() +
  scale_x_discrete(limits = c("I", "II", "III", "IV"))

Scatterplot with regression line

library(ggplot2)



# Create scatterplot of Age and Volume.of.Tumor, with points colored by ICDO.Histologic.Type
plot <- ggplot(lung_data_clean, aes(x = Age, y = Volume.of.Tumor, color = ICDO.Histologic.Type)) + 
  geom_point() + 
  labs(x = "Age", y = "Volume of Tumor", color = "Histology") +
  theme(legend.position = "bottom") # move legend to bottom of plot

# Add a regression line to the plot
reg_line <- geom_smooth(method = "lm", se = FALSE, color = "black") 
plot <- plot + reg_line

# Get regression model summary
model_summary <- summary(lm(Volume.of.Tumor ~ Age, data = lung_data_clean))

# Extract p-value from model summary
p_value <- format(model_summary$coefficients[2, 4], digits = 3)

# Add p-value to the top-right corner of the plot
plot <- plot + annotate("text", x = Inf, y = Inf, hjust = 1, vjust = 1, 
                        label = paste0("p = ", p_value), size = 5)

# Show the plot
plot

KM

library(survival)
library(survminer)

# subset data for only males and females
lung_data_gender <- subset(lung_data_clean, !is.na(Sex))

# create survival object
surv_obj <- Surv(lung_data_gender$Duration.of.Follow.Up, lung_data_gender$Relapse=="Yes")

# fit KM model by gender
km_model <- survfit(surv_obj ~ Sex, data = lung_data_gender)

# plot KM curves
ggsurvplot(km_model, data = lung_data_gender, palette = c("#FF69B4", "#00BFFF"), conf.int = FALSE,
           risk.table = TRUE, pval = TRUE, pval.coord = c(0, 0.1))

table 1

library(gtsummary)
library(dplyr)



# Create table 1
tbl_summary(lung_data_clean,
  type = list(Sex ~ "categorical", 
              Stage ~ "categorical",
              Smoking.Status ~ "categorical",
              Molecular.Stratification ~ "categorical"),
  statistic = list(all_continuous() ~ "{mean} ({sd})"),
  missing = "no"
)
Characteristic N = 471
Age 59 (10)
Sex
    F 23 (49%)
    M 24 (51%)
Stage
    I 13 (28%)
    II 8 (17%)
    III 15 (32%)
    IV 11 (23%)
Smoking.Status
    Never smoker 13 (36%)
    Ex-smoker 0 (0%)
    Current smoker 23 (64%)
Molecular.Stratification
    ALK 14 (30%)
    EGFR 16 (34%)
    KRAS 17 (36%)
ICDO.Histologic.Type
    Adenocarcinoma 13 (28%)
    Squamous cell 12 (26%)
    Large cell 10 (21%)
    Small cell 12 (26%)
Surgery.Done
     1 (2.1%)
    No 23 (49%)
    Yes 23 (49%)
Volume.of.Tumor
    20 4 (13%)
    25 4 (13%)
    30 7 (22%)
    35 4 (13%)
    40 4 (13%)
    45 4 (13%)
    50 5 (16%)
Vital.Status
    alive 1 (2.1%)
    Alive 12 (26%)
    Dead 33 (70%)
    Unknown 1 (2.1%)
Duration.of.Follow.Up 18 (8)
Relapse 40 (85%)
1 Mean (SD); n (%)

table 2

# Filter the data to include only male and female sex
lung_data_clean_filtered <- lung_data_clean %>%
  filter(Sex %in% c("M", "F"))

# Create Table 1 comparing sex of patients using the tbl_summary() function
table2 <- lung_data_clean_filtered %>%
  tbl_summary(
    by = Sex,
    missing = "no",
    label = list(
      Age ~ "Age",
      Stage ~ "Stage",
      Smoking.Status ~ "Smoking Status",
      Molecular.Stratification ~ "Molecular Stratification",
      ICDO.Histologic.Type ~ "ICDO Histologic Type",
      Surgery.Done ~ "Surgery Done",
      Volume.of.Tumor ~ "Volume of Tumor",
      Vital.Status ~ "Vital Status",
      Duration.of.Follow.Up ~ "Duration of Follow Up",
      Relapse ~ "Relapse"
    ),
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  add_overall(col_label = "Overall") %>%
  add_p()

# Print Table 2
table2
Characteristic Overall, N = 471 F, N = 231 M, N = 241 p-value2
Age 59 (10) 54 (7) 64 (9) <0.001
Stage <0.001
    I 13 (28%) 11 (48%) 2 (8.3%)
    II 8 (17%) 6 (26%) 2 (8.3%)
    III 15 (32%) 4 (17%) 11 (46%)
    IV 11 (23%) 2 (8.7%) 9 (38%)
Smoking Status 0.002
    Never smoker 13 (36%) 10 (67%) 3 (14%)
    Ex-smoker 0 (0%) 0 (0%) 0 (0%)
    Current smoker 23 (64%) 5 (33%) 18 (86%)
Molecular Stratification 0.4
    ALK 14 (30%) 8 (35%) 6 (25%)
    EGFR 16 (34%) 9 (39%) 7 (29%)
    KRAS 17 (36%) 6 (26%) 11 (46%)
ICDO Histologic Type 0.078
    Adenocarcinoma 13 (28%) 7 (30%) 6 (25%)
    Squamous cell 12 (26%) 5 (22%) 7 (29%)
    Large cell 10 (21%) 2 (8.7%) 8 (33%)
    Small cell 12 (26%) 9 (39%) 3 (13%)
Surgery Done 0.7
     1 (2.1%) 1 (4.3%) 0 (0%)
    No 23 (49%) 12 (52%) 11 (46%)
    Yes 23 (49%) 10 (43%) 13 (54%)
Volume of Tumor 0.009
    20 4 (13%) 4 (20%) 0 (0%)
    25 4 (13%) 3 (15%) 1 (8.3%)
    30 7 (22%) 6 (30%) 1 (8.3%)
    35 4 (13%) 3 (15%) 1 (8.3%)
    40 4 (13%) 3 (15%) 1 (8.3%)
    45 4 (13%) 1 (5.0%) 3 (25%)
    50 5 (16%) 0 (0%) 5 (42%)
Vital Status <0.001
    alive 1 (2.1%) 0 (0%) 1 (4.2%)
    Alive 12 (26%) 11 (48%) 1 (4.2%)
    Dead 33 (70%) 11 (48%) 22 (92%)
    Unknown 1 (2.1%) 1 (4.3%) 0 (0%)
Duration of Follow Up 18 (8) 17 (8) 18 (8) >0.9
Relapse 40 (85%) 17 (74%) 23 (96%) 0.048
1 Mean (SD); n (%)
2 Wilcoxon rank sum test; Fisher’s exact test; Pearson’s Chi-squared test

Use AI for prediction (tidymodels to create a logistic regression and a decision tree model and compare their accuracy)

# Load libraries
library(tidymodels) # For modeling
library(tidyverse)  # For data manipulation

# Prepare the data
lung_data_clean <- lung_data_clean %>%
  mutate(
    # Convert categorical variables to factors
    Sex = factor(Sex),
    Stage = factor(Stage),
    Smoking.Status = factor(Smoking.Status),
    Molecular.Stratification = factor(Molecular.Stratification),
    ICDO.Histologic.Type = factor(ICDO.Histologic.Type),
    Surgery.Done = factor(Surgery.Done),
    # Handle missing values in 'Age' by imputing with the mean age
    Age = if_else(is.na(Age), mean(Age, na.rm = TRUE), Age),
    # Create binary outcome variable for 1-year mortality
    mortality_1yr = factor(if_else(Duration.of.Follow.Up <= 12 & Vital.Status == "Dead", 1, 0))
  )

# Split the data into training and test sets
set.seed(42) # Set seed for reproducibility
data_split <- initial_split(lung_data_clean, prop = 0.7)
train_data <- training(data_split)
test_data <- testing(data_split)

# Logistic Regression Model
# Create a recipe for data preprocessing
logistic_recipe <- recipe(mortality_1yr ~ Age + Sex + Stage + Smoking.Status + Molecular.Stratification + ICDO.Histologic.Type + Surgery.Done + Volume.of.Tumor, data = train_data) %>%
  # Create dummy variables for categorical predictors
  step_dummy(all_nominal(), -all_outcomes())

# Specify the logistic regression model
logistic_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# Combine preprocessing and model into a workflow
logistic_workflow <- workflow() %>%
  add_recipe(logistic_recipe) %>%
  add_model(logistic_spec)

# Fit the logistic regression model
logistic_fit <- fit(logistic_workflow, train_data)

# Decision Tree Model
# Create a recipe for data preprocessing (same as for logistic regression)
tree_recipe <- recipe(mortality_1yr ~ Age + Sex + Stage + Smoking.Status + Molecular.Stratification + ICDO.Histologic.Type + Surgery.Done + Volume.of.Tumor, data = train_data) %>%
  # Create dummy variables for categorical predictors
  step_dummy(all_nominal(), -all_outcomes())

# Specify the decision tree model
tree_spec <- decision_tree() %>%
  set_engine("rpart") %>%
  set_mode("classification")

# Combine preprocessing and model into a workflow (similar to logistic regression)
tree_workflow <- workflow() %>%
  add_recipe(tree_recipe) %>%
  add_model(tree_spec)

# Fit the decision tree model
tree_fit <- fit(tree_workflow, train_data)

# Compare the accuracy of the models
# Get predictions and calculate accuracy for the logistic regression model
logistic_results <- logistic_fit %>%
  predict(test_data) %>%
  bind_cols(test_data %>% select(mortality_1yr)) %>%
  metrics(truth = mortality_1yr, estimate = .pred_class)

# Get predictions and calculate accuracy for the decision tree model
tree_results <- tree_fit %>%
  predict(test_data) %>%
  bind_cols(test_data %>% select(mortality_1yr)) %>%
  metrics(truth = mortality_1yr, estimate = .pred_class)

# Print accuracy results
cat("Logistic Regression Accuracy:", logistic_results %>% filter(.metric == "accuracy") %>% pull(.estimate), "\n")
## Logistic Regression Accuracy: 0.7142857
cat("Decision Tree Accuracy:", tree_results %>% filter(.metric== "accuracy") %>% pull(.estimate), "\n")
## Decision Tree Accuracy: 0.7333333
# Compare the performance of the logistic regression and decision tree models
model_comparison <- logistic_results %>%
  filter(.metric == "accuracy") %>%
  rename(logistic_accuracy = .estimate) %>%
  bind_cols(tree_results %>% filter(.metric == "accuracy") %>% rename(tree_accuracy = .estimate))

# Print the model comparison
cat("Model comparison:\n")
## Model comparison:
print(model_comparison)
## # A tibble: 1 × 6
##   .metric...1 .estimator...2 logistic_accuracy .metric...4 .estimator...5
##   <chr>       <chr>                      <dbl> <chr>       <chr>         
## 1 accuracy    binary                     0.714 accuracy    binary        
## # ℹ 1 more variable: tree_accuracy <dbl>

Decision Tree Accuracy: 0.7142857 Logistic Regression Accuracy: 0.835665