This project is about the recurrence of thyroid cancer after initial treatment: even when the thyroid tumor has been removed, the cancer can reappear in the neck or lymph nodes.

# Loading packages
library(readr)
## Warning: package 'readr' was built under R version 4.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# For checking for missing values
library(visdat)
## Warning: package 'visdat' was built under R version 4.5.1
# For splitting data & evaluation
library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Loading required package: lattice
# For random forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# For Logistic Regression evaluation
library(pROC)   # ROC & AUC
## Warning: package 'pROC' was built under R version 4.5.1
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Importing data

# Import the raw thyroid data set; text columns are kept as character for now
Thyriod <- read.csv("~/GLADYS FOLDER .R/Thyroid_Diff.csv", stringsAsFactors = FALSE)
# View() opens a spreadsheet viewer, which only makes sense in an interactive
# session, so skip it when the script is knitted or run non-interactively
if (interactive()) {
  View(Thyriod)
}
# Check the first six rows of the data
head(Thyriod)
##   Age Gender Smoking Hx.Smoking Hx.Radiothreapy Thyroid.Function
## 1  27      F      No         No              No        Euthyroid
## 2  34      F      No        Yes              No        Euthyroid
## 3  30      F      No         No              No        Euthyroid
## 4  62      F      No         No              No        Euthyroid
## 5  62      F      No         No              No        Euthyroid
## 6  52      M     Yes         No              No        Euthyroid
##          Physical.Examination Adenopathy      Pathology    Focality Risk   T  N
## 1  Single nodular goiter-left         No Micropapillary   Uni-Focal  Low T1a N0
## 2         Multinodular goiter         No Micropapillary   Uni-Focal  Low T1a N0
## 3 Single nodular goiter-right         No Micropapillary   Uni-Focal  Low T1a N0
## 4 Single nodular goiter-right         No Micropapillary   Uni-Focal  Low T1a N0
## 5         Multinodular goiter         No Micropapillary Multi-Focal  Low T1a N0
## 6         Multinodular goiter         No Micropapillary Multi-Focal  Low T1a N0
##    M Stage      Response Recurred
## 1 M0     I Indeterminate       No
## 2 M0     I     Excellent       No
## 3 M0     I     Excellent       No
## 4 M0     I     Excellent       No
## 5 M0     I     Excellent       No
## 6 M0     I Indeterminate       No
# Confirm what kind of object the import produced
class(x = Thyriod)
## [1] "data.frame"
# Compact overview of every column and its type
str(object = Thyriod)
## 'data.frame':    383 obs. of  17 variables:
##  $ Age                 : int  27 34 30 62 62 52 41 46 51 40 ...
##  $ Gender              : chr  "F" "F" "F" "F" ...
##  $ Smoking             : chr  "No" "No" "No" "No" ...
##  $ Hx.Smoking          : chr  "No" "Yes" "No" "No" ...
##  $ Hx.Radiothreapy     : chr  "No" "No" "No" "No" ...
##  $ Thyroid.Function    : chr  "Euthyroid" "Euthyroid" "Euthyroid" "Euthyroid" ...
##  $ Physical.Examination: chr  "Single nodular goiter-left" "Multinodular goiter" "Single nodular goiter-right" "Single nodular goiter-right" ...
##  $ Adenopathy          : chr  "No" "No" "No" "No" ...
##  $ Pathology           : chr  "Micropapillary" "Micropapillary" "Micropapillary" "Micropapillary" ...
##  $ Focality            : chr  "Uni-Focal" "Uni-Focal" "Uni-Focal" "Uni-Focal" ...
##  $ Risk                : chr  "Low" "Low" "Low" "Low" ...
##  $ T                   : chr  "T1a" "T1a" "T1a" "T1a" ...
##  $ N                   : chr  "N0" "N0" "N0" "N0" ...
##  $ M                   : chr  "M0" "M0" "M0" "M0" ...
##  $ Stage               : chr  "I" "I" "I" "I" ...
##  $ Response            : chr  "Indeterminate" "Excellent" "Excellent" "Excellent" ...
##  $ Recurred            : chr  "No" "No" "No" "No" ...
# Per-column summary: quantiles for Age, length/class/mode for text columns
summary(object = Thyriod)
##       Age           Gender            Smoking           Hx.Smoking       
##  Min.   :15.00   Length:383         Length:383         Length:383        
##  1st Qu.:29.00   Class :character   Class :character   Class :character  
##  Median :37.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.87                                                           
##  3rd Qu.:51.00                                                           
##  Max.   :82.00                                                           
##  Hx.Radiothreapy    Thyroid.Function   Physical.Examination  Adenopathy       
##  Length:383         Length:383         Length:383           Length:383        
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##   Pathology           Focality             Risk                T            
##  Length:383         Length:383         Length:383         Length:383        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       N                  M                Stage             Response        
##  Length:383         Length:383         Length:383         Length:383        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Recurred        
##  Length:383        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Plot a map of the data frame showing where any missing values are located
vis_miss(x = Thyriod)

# To see the levels or categories that exist in each variable
unique(Thyriod$T)
## [1] "T1a" "T1b" "T2"  "T3a" "T3b" "T4a" "T4b"
unique(Thyriod$Focality)
## [1] "Uni-Focal"   "Multi-Focal"
unique(Thyriod$Risk)
## [1] "Low"          "Intermediate" "High"
unique(Thyriod$Response)
## [1] "Indeterminate"          "Excellent"              "Structural Incomplete" 
## [4] "Biochemical Incomplete"
unique(Thyriod$Recurred)
## [1] "No"  "Yes"
unique(Thyriod$Stage)
## [1] "I"   "II"  "IVB" "III" "IVA"
unique(Thyriod$Pathology)
## [1] "Micropapillary" "Papillary"      "Follicular"     "Hurthel cell"
# The lymph-node and metastasis columns are still called N and M at this point;
# asking for Thyriod$LymphNodes or Thyriod$Metastasis here only returns NULL
unique(Thyriod$N)
unique(Thyriod$M)
# To check how many patients fall into each category
table(Thyriod$T)
## 
## T1a T1b  T2 T3a T3b T4a T4b 
##  49  43 151  96  16  20   8
table(Thyriod$Focality)
## 
## Multi-Focal   Uni-Focal 
##         136         247
table(Thyriod$Risk)
## 
##         High Intermediate          Low 
##           32          102          249
table(Thyriod$Response)
## 
## Biochemical Incomplete              Excellent          Indeterminate 
##                     23                    208                     61 
##  Structural Incomplete 
##                     91
table(Thyriod$Stage)
## 
##   I  II III IVA IVB 
## 333  32   4   3  11
table(Thyriod$Recurred)
## 
##  No Yes 
## 275 108
# The metastasis column is still called M at this point; Thyriod$Metastasis
# does not exist yet and would only give an empty table
table(Thyriod$M)

Some column names were not self-explanatory, so I renamed them for better understanding using the dplyr `rename()` function, and I also saved the restructured data.

# Give the terse staging and outcome columns self-describing names
Thyriod <- dplyr::rename(
  Thyriod,
  Tumor = T,
  LymphNodes = N,
  Metastasis = M,
  TreatmentResponse = Response,
  Recurrence = Recurred
)
# Look at the first rows to confirm the new column names
head(x = Thyriod)
##   Age Gender Smoking Hx.Smoking Hx.Radiothreapy Thyroid.Function
## 1  27      F      No         No              No        Euthyroid
## 2  34      F      No        Yes              No        Euthyroid
## 3  30      F      No         No              No        Euthyroid
## 4  62      F      No         No              No        Euthyroid
## 5  62      F      No         No              No        Euthyroid
## 6  52      M     Yes         No              No        Euthyroid
##          Physical.Examination Adenopathy      Pathology    Focality Risk Tumor
## 1  Single nodular goiter-left         No Micropapillary   Uni-Focal  Low   T1a
## 2         Multinodular goiter         No Micropapillary   Uni-Focal  Low   T1a
## 3 Single nodular goiter-right         No Micropapillary   Uni-Focal  Low   T1a
## 4 Single nodular goiter-right         No Micropapillary   Uni-Focal  Low   T1a
## 5         Multinodular goiter         No Micropapillary Multi-Focal  Low   T1a
## 6         Multinodular goiter         No Micropapillary Multi-Focal  Low   T1a
##   LymphNodes Metastasis Stage TreatmentResponse Recurrence
## 1         N0         M0     I     Indeterminate         No
## 2         N0         M0     I         Excellent         No
## 3         N0         M0     I         Excellent         No
## 4         N0         M0     I         Excellent         No
## 5         N0         M0     I         Excellent         No
## 6         N0         M0     I     Indeterminate         No
# Save the renamed data set to disk, then read it back as the working copy
write.csv(x = Thyriod, file = "Thyroid_cleaned.csv", row.names = FALSE)

Thyriod_cleaned <- read.csv(file = "Thyroid_cleaned.csv")
# Confirm the columns and types survived the round trip
str(object = Thyriod_cleaned)
## 'data.frame':    383 obs. of  17 variables:
##  $ Age                 : int  27 34 30 62 62 52 41 46 51 40 ...
##  $ Gender              : chr  "F" "F" "F" "F" ...
##  $ Smoking             : chr  "No" "No" "No" "No" ...
##  $ Hx.Smoking          : chr  "No" "Yes" "No" "No" ...
##  $ Hx.Radiothreapy     : chr  "No" "No" "No" "No" ...
##  $ Thyroid.Function    : chr  "Euthyroid" "Euthyroid" "Euthyroid" "Euthyroid" ...
##  $ Physical.Examination: chr  "Single nodular goiter-left" "Multinodular goiter" "Single nodular goiter-right" "Single nodular goiter-right" ...
##  $ Adenopathy          : chr  "No" "No" "No" "No" ...
##  $ Pathology           : chr  "Micropapillary" "Micropapillary" "Micropapillary" "Micropapillary" ...
##  $ Focality            : chr  "Uni-Focal" "Uni-Focal" "Uni-Focal" "Uni-Focal" ...
##  $ Risk                : chr  "Low" "Low" "Low" "Low" ...
##  $ Tumor               : chr  "T1a" "T1a" "T1a" "T1a" ...
##  $ LymphNodes          : chr  "N0" "N0" "N0" "N0" ...
##  $ Metastasis          : chr  "M0" "M0" "M0" "M0" ...
##  $ Stage               : chr  "I" "I" "I" "I" ...
##  $ TreatmentResponse   : chr  "Indeterminate" "Excellent" "Excellent" "Excellent" ...
##  $ Recurrence          : chr  "No" "No" "No" "No" ...

To prepare the data for analysis, I first grouped the detailed categories of some columns (Tumor, Stage and LymphNodes) into broader classes. After grouping, I converted the categorical columns to factors and created a numeric 0/1 version of the recurrence outcome.

# Make sure the tumor and stage codes are plain text; stage is also upper-cased
# so that its values compare cleanly against the stage labels used below
Thyriod_cleaned[["Tumor"]] <- as.character(Thyriod_cleaned[["Tumor"]])
Thyriod_cleaned[["Stage"]] <- toupper(as.character(Thyriod_cleaned[["Stage"]]))

# Collapse the detailed staging codes into broader groups in one step.
# A value that matches none of the listed rules becomes NA.
Thyriod_cleaned <- dplyr::mutate(
  Thyriod_cleaned,
  # Tumor size: T1a/T1b -> T1, T2 stays, T3a/T3b -> T3, T4a/T4b -> T4
  Tumor_group = dplyr::case_when(
    Tumor %in% c("T1a", "T1b") ~ "T1",
    Tumor == "T2" ~ "T2",
    Tumor %in% c("T3a", "T3b") ~ "T3",
    Tumor %in% c("T4a", "T4b") ~ "T4",
    TRUE ~ NA_character_
  ),
  # Overall stage: I, II and III stay, IVA/IVB -> IV
  Stage_group = dplyr::case_when(
    Stage == "I" ~ "I",
    Stage == "II" ~ "II",
    Stage == "III" ~ "III",
    Stage %in% c("IVA", "IVB") ~ "IV",
    TRUE ~ NA_character_
  ),
  # Lymph nodes: N0 stays, N1a/N1b -> N1
  LymphNodes_group = dplyr::case_when(
    LymphNodes == "N0" ~ "N0",
    LymphNodes %in% c("N1a", "N1b") ~ "N1",
    TRUE ~ NA_character_
  )
)


# Convert the remaining categorical text columns to factors
Thyriod_cleaned <- Thyriod_cleaned %>%
  mutate(across(c(Stage, Pathology, LymphNodes, TreatmentResponse), as.factor))

# Factor version of the outcome, used as the response by the classifiers
Thyriod_cleaned$Recurrence_fac <- as.factor(Thyriod_cleaned$Recurrence)

# Numeric version of the outcome: "No" becomes 0 and "Yes" becomes 1.
# It is derived from Thyriod_cleaned's own column rather than from the separate
# Thyriod object, so it cannot drift out of step with the rows it is stored in.
Thyriod_cleaned$Recurrence_num <- ifelse(Thyriod_cleaned$Recurrence == "Yes", 1, 0)
class(Thyriod_cleaned$Recurrence_num)
## [1] "numeric"
# converting pathology to numeric factor
# where micropapillary will be 1 the rest will be 2,3,4
# (left disabled: ifelse() accepts exactly one "yes" and one "no" value, so the
#  call below is not valid as written)

#Thyriod_cleaned$Pathology_num<-ifelse(Thyriod$Pathology =="Micropapillary",1,2,3,4)
#class(Thyriod_cleaned)




# Turn the grouped staging columns (and Metastasis) into ordered factors whose
# levels run from least to most advanced disease.
# NOTE(review): ordered factors are given polynomial contrasts by glm(), which
# changes how model coefficients read — confirm ordering is wanted for modelling
Thyriod_cleaned <- dplyr::mutate(
  Thyriod_cleaned,
  Stage_group = factor(Stage_group,
                       levels = c("I", "II", "III", "IV"), ordered = TRUE),
  Tumor_group = factor(Tumor_group,
                       levels = c("T1", "T2", "T3", "T4"), ordered = TRUE),
  LymphNodes_group = factor(LymphNodes_group,
                            levels = c("N0", "N1"), ordered = TRUE),
  Metastasis = factor(Metastasis,
                      levels = c("M0", "M1"), ordered = TRUE)
)

# Check the resulting column types
str(object = Thyriod_cleaned)
## 'data.frame':    383 obs. of  22 variables:
##  $ Age                 : int  27 34 30 62 62 52 41 46 51 40 ...
##  $ Gender              : chr  "F" "F" "F" "F" ...
##  $ Smoking             : chr  "No" "No" "No" "No" ...
##  $ Hx.Smoking          : chr  "No" "Yes" "No" "No" ...
##  $ Hx.Radiothreapy     : chr  "No" "No" "No" "No" ...
##  $ Thyroid.Function    : chr  "Euthyroid" "Euthyroid" "Euthyroid" "Euthyroid" ...
##  $ Physical.Examination: chr  "Single nodular goiter-left" "Multinodular goiter" "Single nodular goiter-right" "Single nodular goiter-right" ...
##  $ Adenopathy          : chr  "No" "No" "No" "No" ...
##  $ Pathology           : Factor w/ 4 levels "Follicular","Hurthel cell",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Focality            : chr  "Uni-Focal" "Uni-Focal" "Uni-Focal" "Uni-Focal" ...
##  $ Risk                : chr  "Low" "Low" "Low" "Low" ...
##  $ Tumor               : chr  "T1a" "T1a" "T1a" "T1a" ...
##  $ LymphNodes          : Factor w/ 3 levels "N0","N1a","N1b": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Metastasis          : Ord.factor w/ 2 levels "M0"<"M1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Stage               : Factor w/ 5 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ TreatmentResponse   : Factor w/ 4 levels "Biochemical Incomplete",..: 3 2 2 2 2 3 2 2 2 2 ...
##  $ Recurrence          : chr  "No" "No" "No" "No" ...
##  $ Tumor_group         : Ord.factor w/ 4 levels "T1"<"T2"<"T3"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Stage_group         : Ord.factor w/ 4 levels "I"<"II"<"III"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LymphNodes_group    : Ord.factor w/ 2 levels "N0"<"N1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Recurrence_fac      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Recurrence_num      : num  0 0 0 0 0 0 0 0 0 0 ...

Visualization

STAGE

This is the overall stage of cancer, determined using tumor,lymph nodes, and metastasis together.

Stage I-Early cancer, tumor still small, no spread.

Stage II-Larger tumor or some spread into nearby tissue.

Stage III-Cancer spread to local lymph nodes or tissues.

Stage IV-Advanced cancer, often with distant spread (metastasis).

# Bar chart of how many patients fall in each grouped cancer stage
ggplot(data = Thyriod_cleaned, mapping = aes(x = Stage_group)) +
  geom_bar(fill = "blue") +
  ggtitle("Distribution of Stages") +
  xlab("Thyroid cancer Stages") +
  ylab("Count")

TUMOR

This refers to the size and extent of the tumor in the thyroid.

T1-Very small tumor < 2 cm, limited to thyroid.

T2-Tumor between 2 – 4 cm, still within thyroid.

T3-Tumor > 4 cm or slightly extending outside thyroid.

T4-Tumor growing beyond thyroid into nearby tissues.

# Bar chart of how many patients fall in each grouped tumor level (T1-T4)
ggplot(data = Thyriod_cleaned, mapping = aes(x = Tumor_group)) +
  geom_bar(fill = "red4") +
  ggtitle("Distribution Tumor Level") +
  xlab("Tumor Level") +
  ylab("Count")

LYMPHNODES

This refers to whether the cancer has spread to nearby lymph nodes.

N0-No lymph node involvement.

N1-Cancer present in lymph nodes. It can be N1a (the nearby nodes) or N1b (the more distant neck nodes); both have been grouped together as N1.

# Bar chart of patient counts by grouped lymph-node status (N0 / N1).
# The title names lymph nodes; it previously repeated the tumor chart's title.
ggplot(Thyriod_cleaned, aes(x = LymphNodes_group)) +
  geom_bar(fill = "red3") +
  labs(title = "Distribution of Lymph Node Involvement",
       x = "Lymphnodes",
       y = "Count")

METASTASIS

This refers to whether the cancer has spread to distant organs (like lungs, bones, liver).

M0-No distant metastasis.

M1-Distant metastasis present.

# Bar chart of patient counts by distant-metastasis status (M0 / M1)
ggplot(data = Thyriod_cleaned, mapping = aes(x = Metastasis)) +
  geom_bar(fill = "red") +
  ggtitle("Distribution Metastasis") +
  xlab("Metastasis Level") +
  ylab("Count")

# Show the current working directory
getwd()
## [1] "C:/Users/hp/Documents"

Visualizing the relationships between Recurrence and the clinical features: Stage, Tumor, Lymph nodes, Metastasis and Age.

# Do younger patients have better outcomes, and older patients a higher chance
# of recurrence? Compare the two age distributions as overlaid density curves
ggplot(data = Thyriod_cleaned, mapping = aes(x = Age, fill = Recurrence)) +
  geom_density(alpha = 0.5) +
  ggtitle("Age Distribution by Recurred status")

# Group age into bands. right = FALSE makes every band closed on the left:
# [0, 30), [30, 40), ..., [70, Inf). The labels name the ages each band really
# contains (a 30-year-old belongs to "30–39"), and the open-ended top break
# keeps every age of 70 or more in the last band.
Thyriod_cleaned <- Thyriod_cleaned %>%
  mutate(Age_Group = cut(Age,
                         breaks = c(0, 30, 40, 50, 60, 70, Inf),
                         labels = c("0–29", "30–39", "40–49", "50–59", "60–69", "70+"),
                         right = FALSE))
# cut() already returns a factor, so no separate as.factor() step is needed

# Recurrence counts within each age band, bars side by side
ggplot(Thyriod_cleaned, aes(x = Age_Group, fill = Recurrence)) +
  geom_bar(position = "dodge") +
  labs(title = "Age Group vs Recurrence",
       x = "Age Group",
       y = "Count of Patients") +
  theme_minimal() +
  scale_fill_brewer(palette = "Paired")

# Side-by-side bar charts of patient counts for each clinical grouping.
# The layers shared by all four charts are defined once and reused.
dodged_count_layers <- list(geom_bar(position = "dodge"), theme_minimal())

# Grouped stage, split by recurrence
ggplot(data = Thyriod_cleaned,
       mapping = aes(x = Stage_group, fill = Recurrence)) +
  dodged_count_layers +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  ggtitle("Stage_group vs Recurrence") +
  xlab("stage_group") +
  ylab("Count of Patients")

# Grouped tumor level, split by recurrence
ggplot(data = Thyriod_cleaned,
       mapping = aes(x = Tumor_group, fill = Recurrence)) +
  dodged_count_layers +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  ggtitle("Tumor_group vs Recurrence") +
  xlab("Tumor_group") +
  ylab("Count of Patients")

# Grouped lymph-node status, split by recurrence
ggplot(data = Thyriod_cleaned,
       mapping = aes(x = LymphNodes_group, fill = Recurrence)) +
  dodged_count_layers +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  ggtitle("LymphNodes_group vs Recurrence") +
  xlab("LymphNodes_group") +
  ylab("Count of Patients")

# Metastasis status, split by grouped stage
ggplot(data = Thyriod_cleaned,
       mapping = aes(x = Metastasis, fill = Stage_group)) +
  dodged_count_layers +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  ggtitle("Metastasis vs stage_group") +
  xlab("Metastasis") +
  ylab("Count of Patients")

### Predicting thyroid cancer using machine learning

I will now dive into the machine learning part, where I train and test models on my data. The data set has many variables that can be used for predicting recurrence and identifying key risk factors. I used a logistic regression model and a random forest model.

# Split the patients into a 70% training set and a 30% test set, partitioning
# on the recurrence outcome; the seed makes the split reproducible
set.seed(123)
trainIndex <- createDataPartition(
  y = Thyriod_cleaned$Recurrence_fac,
  p = 0.7,
  list = FALSE
)

# Rows not selected for training form the test set
test_data <- Thyriod_cleaned[-trainIndex, ]
train_data <- Thyriod_cleaned[trainIndex, ]

Logistic Regression Model

# Build logistic regression model: recurrence (No/Yes) modelled on age band,
# grouped stage, grouped tumor level, grouped lymph-node status and metastasis,
# fitted on the training split only
# NOTE(review): the ordered-factor predictors show up in the summary as
# polynomial contrast terms (.L, .Q, .C), so those coefficients are trend
# components rather than level-vs-baseline effects — confirm this is intended
logit_model <- glm(Recurrence_fac ~ Age_Group+Stage_group+Tumor_group+LymphNodes_group+Metastasis,data = train_data,family="binomial")

# Model summary
# NOTE(review): the very large standard errors reported for the intercept,
# Stage_group and Metastasis terms suggest (quasi-)complete separation — verify
summary(logit_model)
## 
## Call:
## glm(formula = Recurrence_fac ~ Age_Group + Stage_group + Tumor_group + 
##     LymphNodes_group + Metastasis, family = "binomial", data = train_data)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          14.9792  1935.4530   0.008  0.99382    
## Age_Group31–40        0.2924     0.5693   0.514  0.60756    
## Age_Group41–50       -0.6682     0.7453  -0.896  0.37000    
## Age_Group51–60        0.7752     0.7203   1.076  0.28183    
## Age_Group61–70        1.9946     0.8351   2.389  0.01691 *  
## Age_Group71+          1.5113     1.1122   1.359  0.17421    
## Stage_group.L        13.2979  1998.5582   0.007  0.99469    
## Stage_group.Q        -0.3355  3275.7097   0.000  0.99992    
## Stage_group.C        -7.0660  4179.2719  -0.002  0.99865    
## Tumor_group.L         2.5652     0.9263   2.769  0.00562 ** 
## Tumor_group.Q         0.7305     0.7045   1.037  0.29984    
## Tumor_group.C        -0.3854     0.4266  -0.903  0.36637    
## LymphNodes_group.L    1.9297     0.3198   6.033 1.61e-09 ***
## Metastasis.L         11.7566  1458.3661   0.008  0.99357    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 320.28  on 268  degrees of freedom
## Residual deviance: 161.98  on 255  degrees of freedom
## AIC: 189.98
## 
## Number of Fisher Scoring iterations: 18
# Odds-ratio table: put each coefficient next to its profiled confidence
# limits on the log-odds scale, then exponentiate the whole table at once
logit_or_table <- cbind(OR = coef(logit_model), confint(logit_model))
exp(logit_or_table)
## Waiting for profiling to be done...
##                              OR        2.5 %        97.5 %
## (Intercept)        3.201611e+06          Inf 1.128787e+301
## Age_Group31–40     1.339601e+00 4.407451e-01  4.187234e+00
## Age_Group41–50     5.126503e-01 1.116303e-01  2.137189e+00
## Age_Group51–60     2.171066e+00 5.193063e-01  9.054304e+00
## Age_Group61–70     7.349534e+00 1.381187e+00  3.828720e+01
## Age_Group71+       4.532652e+00 4.435238e-01  3.769683e+01
## Stage_group.L      5.959249e+05 3.078701e-33 1.498085e+285
## Stage_group.Q      7.149821e-01 1.564766e-25  1.322162e+26
## Stage_group.C      8.536122e-04 1.081510e-39  1.245719e+33
## Tumor_group.L      1.300288e+01 2.466951e+00  1.178282e+02
## Tumor_group.Q      2.076046e+00 5.795281e-01  1.092664e+01
## Tumor_group.C      6.802060e-01 2.981342e-01  1.672326e+00
## LymphNodes_group.L 6.887108e+00 3.773671e+00  1.334819e+01
## Metastasis.L       1.275899e+05 1.470867e-23 9.700235e+208
# Evaluate model performance on the held-out test set
# Predicted probability of recurrence for each test patient
logit_pred <- predict(logit_model, newdata = test_data, type = "response")

# Classify as recurrence when the predicted probability exceeds 0.5
logit_pred_class <- ifelse(logit_pred > 0.5, "Yes", "No")
# Build the prediction factor with the outcome's own levels so both classes are
# always present, even if only one class is ever predicted. Recurrence ("Yes")
# is the event of interest, so it is reported as the positive class.
confusionMatrix(factor(logit_pred_class, levels = levels(test_data$Recurrence_fac)),
                test_data$Recurrence_fac,
                positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  78   7
##        Yes  4  25
##                                           
##                Accuracy : 0.9035          
##                  95% CI : (0.8339, 0.9508)
##     No Information Rate : 0.7193          
##     P-Value [Acc > NIR] : 1.358e-06       
##                                           
##                   Kappa : 0.754           
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.7812          
##             Specificity : 0.9512          
##          Pos Pred Value : 0.8621          
##          Neg Pred Value : 0.9176          
##              Prevalence : 0.2807          
##          Detection Rate : 0.2193          
##    Detection Prevalence : 0.2544          
##       Balanced Accuracy : 0.8662          
##                                           
##        'Positive' Class : Yes             
## 

Random Forest

# Fit a random forest on the training split using the same predictors as the
# logistic model; the seed is fixed so the fit is reproducible
set.seed(123)
rf_model <- randomForest(
  Recurrence_fac ~ Age_Group + Stage_group + Tumor_group +
    LymphNodes_group + Metastasis,
  data = train_data,
  importance = TRUE
)

# Plot the importance measures of each predictor
varImpPlot(rf_model, main = "Variable Importance in Recurrence Prediction")

# Evaluate performance on the held-out test set
rf_pred <- predict(rf_model, newdata = test_data)
# Recurrence ("Yes") is the event of interest, so report sensitivity,
# specificity and predictive values with it as the positive class
confusionMatrix(rf_pred, test_data$Recurrence_fac, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  78   7
##        Yes  4  25
##                                           
##                Accuracy : 0.9035          
##                  95% CI : (0.8339, 0.9508)
##     No Information Rate : 0.7193          
##     P-Value [Acc > NIR] : 1.358e-06       
##                                           
##                   Kappa : 0.754           
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.7812          
##             Specificity : 0.9512          
##          Pos Pred Value : 0.8621          
##          Neg Pred Value : 0.9176          
##              Prevalence : 0.2807          
##          Detection Rate : 0.2193          
##    Detection Prevalence : 0.2544          
##       Balanced Accuracy : 0.8662          
##                                           
##        'Positive' Class : Yes             
##