This project is about the recurrence of thyroid cancer after initial treatment: even when the thyroid tumor is removed, the cancer can reappear in the neck or lymph nodes.
# Loading packages
library(readr)
## Warning: package 'readr' was built under R version 4.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#for checking for missing value
library(visdat)
## Warning: package 'visdat' was built under R version 4.5.1
# For splitting data & evaluation
library(caret)
## Warning: package 'caret' was built under R version 4.5.1
## Loading required package: lattice
# For random forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# For Logistic Regression evaluation
library(pROC) # ROC & AUC
## Warning: package 'pROC' was built under R version 4.5.1
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Read the raw thyroid data set; text columns are kept as character so they
# can be recoded explicitly later on.
Thyriod <- read.csv("~/GLADYS FOLDER .R/Thyroid_Diff.csv",
                    stringsAsFactors = FALSE)
# View() opens a spreadsheet-style viewer, which is only meaningful in an
# interactive session; skip it when the script is knitted or run in batch.
if (interactive()) {
  View(Thyriod)
}
# Preview the first six rows of the data
head(Thyriod, n = 6)
## Age Gender Smoking Hx.Smoking Hx.Radiothreapy Thyroid.Function
## 1 27 F No No No Euthyroid
## 2 34 F No Yes No Euthyroid
## 3 30 F No No No Euthyroid
## 4 62 F No No No Euthyroid
## 5 62 F No No No Euthyroid
## 6 52 M Yes No No Euthyroid
## Physical.Examination Adenopathy Pathology Focality Risk T N
## 1 Single nodular goiter-left No Micropapillary Uni-Focal Low T1a N0
## 2 Multinodular goiter No Micropapillary Uni-Focal Low T1a N0
## 3 Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0
## 4 Single nodular goiter-right No Micropapillary Uni-Focal Low T1a N0
## 5 Multinodular goiter No Micropapillary Multi-Focal Low T1a N0
## 6 Multinodular goiter No Micropapillary Multi-Focal Low T1a N0
## M Stage Response Recurred
## 1 M0 I Indeterminate No
## 2 M0 I Excellent No
## 3 M0 I Excellent No
## 4 M0 I Excellent No
## 5 M0 I Excellent No
## 6 M0 I Indeterminate No
# Check the class (object type) of the data set
class(Thyriod)
## [1] "data.frame"
# Show the structure of the data set: column names, types and sample values
str(Thyriod)
## 'data.frame': 383 obs. of 17 variables:
## $ Age : int 27 34 30 62 62 52 41 46 51 40 ...
## $ Gender : chr "F" "F" "F" "F" ...
## $ Smoking : chr "No" "No" "No" "No" ...
## $ Hx.Smoking : chr "No" "Yes" "No" "No" ...
## $ Hx.Radiothreapy : chr "No" "No" "No" "No" ...
## $ Thyroid.Function : chr "Euthyroid" "Euthyroid" "Euthyroid" "Euthyroid" ...
## $ Physical.Examination: chr "Single nodular goiter-left" "Multinodular goiter" "Single nodular goiter-right" "Single nodular goiter-right" ...
## $ Adenopathy : chr "No" "No" "No" "No" ...
## $ Pathology : chr "Micropapillary" "Micropapillary" "Micropapillary" "Micropapillary" ...
## $ Focality : chr "Uni-Focal" "Uni-Focal" "Uni-Focal" "Uni-Focal" ...
## $ Risk : chr "Low" "Low" "Low" "Low" ...
## $ T : chr "T1a" "T1a" "T1a" "T1a" ...
## $ N : chr "N0" "N0" "N0" "N0" ...
## $ M : chr "M0" "M0" "M0" "M0" ...
## $ Stage : chr "I" "I" "I" "I" ...
## $ Response : chr "Indeterminate" "Excellent" "Excellent" "Excellent" ...
## $ Recurred : chr "No" "No" "No" "No" ...
# Summary of every column of the data set
summary(Thyriod)
## Age Gender Smoking Hx.Smoking
## Min. :15.00 Length:383 Length:383 Length:383
## 1st Qu.:29.00 Class :character Class :character Class :character
## Median :37.00 Mode :character Mode :character Mode :character
## Mean :40.87
## 3rd Qu.:51.00
## Max. :82.00
## Hx.Radiothreapy Thyroid.Function Physical.Examination Adenopathy
## Length:383 Length:383 Length:383 Length:383
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Pathology Focality Risk T
## Length:383 Length:383 Length:383 Length:383
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## N M Stage Response
## Length:383 Length:383 Length:383 Length:383
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Recurred
## Length:383
## Class :character
## Mode :character
##
##
##
# Visualize where missing values (if any) are located in the data set
vis_miss(Thyriod)
# List the distinct levels (categories) present in each categorical variable
unique(Thyriod$T)
## [1] "T1a" "T1b" "T2" "T3a" "T3b" "T4a" "T4b"
unique(Thyriod$Focality)
## [1] "Uni-Focal" "Multi-Focal"
unique(Thyriod$Risk)
## [1] "Low" "Intermediate" "High"
unique(Thyriod$Response)
## [1] "Indeterminate" "Excellent" "Structural Incomplete"
## [4] "Biochemical Incomplete"
unique(Thyriod$Recurred)
## [1] "No" "Yes"
unique(Thyriod$Stage)
## [1] "I" "II" "IVB" "III" "IVA"
unique(Thyriod$Pathology)
## [1] "Micropapillary" "Papillary" "Follicular" "Hurthel cell"
# The lymph-node and metastasis columns are still called N and M at this
# point (they only become LymphNodes / Metastasis after the rename further
# down), so they must be referenced by their raw names; using the renamed
# names here returns NULL.
unique(Thyriod$N)
unique(Thyriod$M)
# Count how many patients fall into each category
table(Thyriod$T)
##
## T1a T1b T2 T3a T3b T4a T4b
## 49 43 151 96 16 20 8
table(Thyriod$Focality)
##
## Multi-Focal Uni-Focal
## 136 247
table(Thyriod$Risk)
##
## High Intermediate Low
## 32 102 249
table(Thyriod$Response)
##
## Biochemical Incomplete Excellent Indeterminate
## 23 208 61
## Structural Incomplete
## 91
table(Thyriod$Stage)
##
## I II III IVA IVB
## 333 32 4 3 11
table(Thyriod$Recurred)
##
## No Yes
## 275 108
# The metastasis column is still named M at this point (it is renamed to
# Metastasis only later); tabulating the not-yet-existing name yields an
# empty table.
table(Thyriod$M)
Some column names were not self-explanatory, so I renamed them for clarity using dplyr's rename() function, and I also saved the restructured data.
# Give the terse staging/outcome columns descriptive names
# (syntax is new_name = old_name)
Thyriod <- dplyr::rename(
  Thyriod,
  Tumor = T,
  LymphNodes = N,
  Metastasis = M,
  TreatmentResponse = Response,
  Recurrence = Recurred
)
# Confirm the new column names on the first six rows
head(Thyriod, n = 6)
## Age Gender Smoking Hx.Smoking Hx.Radiothreapy Thyroid.Function
## 1 27 F No No No Euthyroid
## 2 34 F No Yes No Euthyroid
## 3 30 F No No No Euthyroid
## 4 62 F No No No Euthyroid
## 5 62 F No No No Euthyroid
## 6 52 M Yes No No Euthyroid
## Physical.Examination Adenopathy Pathology Focality Risk Tumor
## 1 Single nodular goiter-left No Micropapillary Uni-Focal Low T1a
## 2 Multinodular goiter No Micropapillary Uni-Focal Low T1a
## 3 Single nodular goiter-right No Micropapillary Uni-Focal Low T1a
## 4 Single nodular goiter-right No Micropapillary Uni-Focal Low T1a
## 5 Multinodular goiter No Micropapillary Multi-Focal Low T1a
## 6 Multinodular goiter No Micropapillary Multi-Focal Low T1a
## LymphNodes Metastasis Stage TreatmentResponse Recurrence
## 1 N0 M0 I Indeterminate No
## 2 N0 M0 I Excellent No
## 3 N0 M0 I Excellent No
## 4 N0 M0 I Excellent No
## 5 N0 M0 I Excellent No
## 6 N0 M0 I Indeterminate No
# Persist the renamed data set to disk, then load it back as the working copy
write.csv(x = Thyriod, file = "Thyroid_cleaned.csv", row.names = FALSE)
Thyriod_cleaned <- read.csv(file = "Thyroid_cleaned.csv")
# Inspect the structure of the reloaded data
str(object = Thyriod_cleaned)
## 'data.frame': 383 obs. of 17 variables:
## $ Age : int 27 34 30 62 62 52 41 46 51 40 ...
## $ Gender : chr "F" "F" "F" "F" ...
## $ Smoking : chr "No" "No" "No" "No" ...
## $ Hx.Smoking : chr "No" "Yes" "No" "No" ...
## $ Hx.Radiothreapy : chr "No" "No" "No" "No" ...
## $ Thyroid.Function : chr "Euthyroid" "Euthyroid" "Euthyroid" "Euthyroid" ...
## $ Physical.Examination: chr "Single nodular goiter-left" "Multinodular goiter" "Single nodular goiter-right" "Single nodular goiter-right" ...
## $ Adenopathy : chr "No" "No" "No" "No" ...
## $ Pathology : chr "Micropapillary" "Micropapillary" "Micropapillary" "Micropapillary" ...
## $ Focality : chr "Uni-Focal" "Uni-Focal" "Uni-Focal" "Uni-Focal" ...
## $ Risk : chr "Low" "Low" "Low" "Low" ...
## $ Tumor : chr "T1a" "T1a" "T1a" "T1a" ...
## $ LymphNodes : chr "N0" "N0" "N0" "N0" ...
## $ Metastasis : chr "M0" "M0" "M0" "M0" ...
## $ Stage : chr "I" "I" "I" "I" ...
## $ TreatmentResponse : chr "Indeterminate" "Excellent" "Excellent" "Excellent" ...
## $ Recurrence : chr "No" "No" "No" "No" ...
To prepare the data for analysis, I first grouped the levels of some columns, because columns such as Tumor, Stage, LymphNodes and Metastasis needed to be grouped or recoded. After grouping, I converted them to factors and then created a numeric version of the recurrence outcome.
# Make sure Tumor and Stage are plain character vectors, and normalise the
# stage labels to upper case so the comparisons below match reliably.
Thyriod_cleaned$Tumor <- as.character(Thyriod_cleaned$Tumor)
Thyriod_cleaned$Stage <- toupper(as.character(Thyriod_cleaned$Stage))

# Collapse the T sub-stages into four tumor groups; anything unexpected
# becomes NA (made explicit here to match the other groupings).
Thyriod_cleaned$Tumor_group <- dplyr::case_when(
  Thyriod_cleaned$Tumor %in% c("T1a", "T1b") ~ "T1",
  Thyriod_cleaned$Tumor == "T2" ~ "T2",
  Thyriod_cleaned$Tumor %in% c("T3a", "T3b") ~ "T3",
  Thyriod_cleaned$Tumor %in% c("T4a", "T4b") ~ "T4",
  TRUE ~ NA_character_
)

# Collapse stages IVA and IVB into a single stage IV
Thyriod_cleaned$Stage_group <- dplyr::case_when(
  Thyriod_cleaned$Stage == "I" ~ "I",
  Thyriod_cleaned$Stage == "II" ~ "II",
  Thyriod_cleaned$Stage == "III" ~ "III",
  Thyriod_cleaned$Stage %in% c("IVA", "IVB") ~ "IV",
  TRUE ~ NA_character_
)

# Collapse N1a and N1b into a single N1 group
Thyriod_cleaned$LymphNodes_group <- dplyr::case_when(
  Thyriod_cleaned$LymphNodes == "N0" ~ "N0",
  Thyriod_cleaned$LymphNodes %in% c("N1a", "N1b") ~ "N1",
  TRUE ~ NA_character_
)

# Convert the remaining categorical columns from character to factor
# (the stray trailing comma inside c() has been removed)
Thyriod_cleaned <- Thyriod_cleaned %>%
  mutate(
    across(c(Stage, Pathology, LymphNodes, TreatmentResponse), as.factor)
  )

# Outcome as a factor
Thyriod_cleaned$Recurrence_fac <- as.factor(Thyriod_cleaned$Recurrence)

# Outcome as a numeric indicator: "Yes" -> 1, anything else -> 0.
# Derived from Thyriod_cleaned itself rather than from the separate Thyriod
# data frame, so the indicator always lines up row-for-row with the data it
# is stored in.
Thyriod_cleaned$Recurrence_num <- ifelse(
  Thyriod_cleaned$Recurrence == "Yes", 1, 0
)
class(Thyriod_cleaned$Recurrence_num)
## [1] "numeric"
# A numeric recoding of Pathology (Micropapillary = 1, the rest 2, 3, 4) was
# drafted here but left disabled: ifelse() takes one "yes" and one "no"
# value, so it cannot assign four different codes.
# Thyriod_cleaned$Pathology_num<-ifelse(Thyriod$Pathology =="Micropapillary",1,2,3,4)
# class(Thyriod_cleaned)

# Store the grouped staging variables as ordered factors with an explicit
# level order (lowest to highest).
Thyriod_cleaned$Stage_group <- ordered(
  Thyriod_cleaned$Stage_group,
  levels = c("I", "II", "III", "IV")
)
Thyriod_cleaned$Tumor_group <- ordered(
  Thyriod_cleaned$Tumor_group,
  levels = c("T1", "T2", "T3", "T4")
)
Thyriod_cleaned$LymphNodes_group <- ordered(
  Thyriod_cleaned$LymphNodes_group,
  levels = c("N0", "N1")
)
Thyriod_cleaned$Metastasis <- ordered(
  Thyriod_cleaned$Metastasis,
  levels = c("M0", "M1")
)
# Check the resulting column types
str(Thyriod_cleaned)
## 'data.frame': 383 obs. of 22 variables:
## $ Age : int 27 34 30 62 62 52 41 46 51 40 ...
## $ Gender : chr "F" "F" "F" "F" ...
## $ Smoking : chr "No" "No" "No" "No" ...
## $ Hx.Smoking : chr "No" "Yes" "No" "No" ...
## $ Hx.Radiothreapy : chr "No" "No" "No" "No" ...
## $ Thyroid.Function : chr "Euthyroid" "Euthyroid" "Euthyroid" "Euthyroid" ...
## $ Physical.Examination: chr "Single nodular goiter-left" "Multinodular goiter" "Single nodular goiter-right" "Single nodular goiter-right" ...
## $ Adenopathy : chr "No" "No" "No" "No" ...
## $ Pathology : Factor w/ 4 levels "Follicular","Hurthel cell",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Focality : chr "Uni-Focal" "Uni-Focal" "Uni-Focal" "Uni-Focal" ...
## $ Risk : chr "Low" "Low" "Low" "Low" ...
## $ Tumor : chr "T1a" "T1a" "T1a" "T1a" ...
## $ LymphNodes : Factor w/ 3 levels "N0","N1a","N1b": 1 1 1 1 1 1 1 1 1 1 ...
## $ Metastasis : Ord.factor w/ 2 levels "M0"<"M1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Stage : Factor w/ 5 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ TreatmentResponse : Factor w/ 4 levels "Biochemical Incomplete",..: 3 2 2 2 2 3 2 2 2 2 ...
## $ Recurrence : chr "No" "No" "No" "No" ...
## $ Tumor_group : Ord.factor w/ 4 levels "T1"<"T2"<"T3"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Stage_group : Ord.factor w/ 4 levels "I"<"II"<"III"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LymphNodes_group : Ord.factor w/ 2 levels "N0"<"N1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Recurrence_fac : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Recurrence_num : num 0 0 0 0 0 0 0 0 0 0 ...
STAGE
This is the overall stage of cancer, determined using tumor,lymph nodes, and metastasis together.
Stage I-Early cancer, tumor still small, no spread.
Stage II-Larger tumor or some spread into nearby tissue.
Stage III-Cancer spread to local lymph nodes or tissues.
Stage IV-Advanced cancer, often with distant spread (metastasis).
# Bar chart: number of patients in each grouped cancer stage
ggplot(data = Thyriod_cleaned, mapping = aes(x = Stage_group)) +
  labs(
    title = "Distribution of Stages",
    x = "Thyroid cancer Stages",
    y = "Count"
  ) +
  geom_bar(fill = "blue")
TUMOR
This refers to the size and extent of the tumor in the thyroid.
T1-Very small tumor < 2 cm, limited to thyroid.
T2-Tumor between 2 – 4 cm, still within thyroid.
T3-Tumor > 4 cm or slightly extending outside thyroid.
T4-Tumor growing beyond thyroid into nearby tissues.
# Bar chart: number of patients in each grouped tumor level (T1-T4)
ggplot(data = Thyriod_cleaned, mapping = aes(x = Tumor_group)) +
  labs(
    title = "Distribution Tumor Level",
    x = "Tumor Level",
    y = "Count"
  ) +
  geom_bar(fill = "red4")
LYMPHNODES
This refers to whether the cancer has spread to nearby lymph nodes.
N0-No lymph node involvement.
N1-Cancer present in the lymph nodes; it can be N1a (the nearby nodes) or N1b (the more distant neck nodes). These two have been grouped together as N1.
# Bar chart: number of patients by grouped lymph-node status (N0 vs N1).
# The title previously read "Distribution Tumor Level", copied from the
# tumor chart; it now names the variable actually plotted.
ggplot(Thyriod_cleaned, aes(x = LymphNodes_group)) +
  geom_bar(fill = "red3") +
  labs(
    title = "Distribution of Lymph Node Involvement",
    x = "Lymphnodes",
    y = "Count"
  )
METASTASIS
This refers to whether the cancer has spread to distant organs (like lungs, bones, liver).
M0-No distant metastasis.
M1-Distant metastasis present.
# Bar chart: number of patients with (M1) and without (M0) distant metastasis
ggplot(data = Thyriod_cleaned, mapping = aes(x = Metastasis)) +
  labs(
    title = "Distribution Metastasis",
    x = "Metastasis Level",
    y = "Count"
  ) +
  geom_bar(fill = "red")
# Print the current working directory
getwd()
## [1] "C:/Users/hp/Documents"
Visualizing the relationships between recurrence and the clinical features: stage, tumor, lymph nodes, metastasis and age.
# Compare the age distributions of patients with and without recurrence, to
# see whether older patients appear more likely to recur than younger ones
ggplot(data = Thyriod_cleaned, mapping = aes(x = Age, fill = Recurrence)) +
  labs(title = "Age Distribution by Recurred status") +
  geom_density(alpha = 0.5)
# Bin Age into the bands named by the labels. Intervals are closed on the
# right -- (0,30], (30,40], ... -- so a 30-year-old falls in "0–30" and a
# 40-year-old in "31–40". With right = FALSE the bins were [0,30), [30,40),
# ..., which placed ages 30, 40, 50, ... one band higher than their label.
# The last break is Inf so that "71+" has no upper limit.
# cut() already returns a factor, so no further as.factor() call is needed.
# NOTE: models and plots that use Age_Group should be re-run after this fix.
Thyriod_cleaned <- Thyriod_cleaned %>%
  mutate(
    Age_Group = cut(
      Age,
      breaks = c(0, 30, 40, 50, 60, 70, Inf),
      labels = c("0–30", "31–40", "41–50", "51–60", "61–70", "71+"),
      right = TRUE
    )
  )
# Side-by-side bars: recurrence counts within each age group
ggplot(
  data = Thyriod_cleaned,
  mapping = aes(x = Age_Group, fill = Recurrence)
) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal() +
  labs(
    title = "Age Group vs Recurrence",
    x = "Age Group",
    y = "Count of Patients"
  )
# Side-by-side bars: recurrence counts within each grouped stage
ggplot(
  data = Thyriod_cleaned,
  mapping = aes(x = Stage_group, fill = Recurrence)
) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  theme_minimal() +
  labs(
    title = "Stage_group vs Recurrence",
    x = "stage_group",
    y = "Count of Patients"
  )

# Side-by-side bars: recurrence counts within each grouped tumor level
ggplot(
  data = Thyriod_cleaned,
  mapping = aes(x = Tumor_group, fill = Recurrence)
) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set2", direction = -1) +
  theme_minimal() +
  labs(
    title = "Tumor_group vs Recurrence",
    x = "Tumor_group",
    y = "Count of Patients"
  )

# Side-by-side bars: recurrence counts by grouped lymph-node status
ggplot(
  data = Thyriod_cleaned,
  mapping = aes(x = LymphNodes_group, fill = Recurrence)
) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  theme_minimal() +
  labs(
    title = "LymphNodes_group vs Recurrence",
    x = "LymphNodes_group",
    y = "Count of Patients"
  )

# Side-by-side bars: grouped stage counts within each metastasis status
ggplot(
  data = Thyriod_cleaned,
  mapping = aes(x = Metastasis, fill = Stage_group)
) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1", direction = -1) +
  theme_minimal() +
  labs(
    title = "Metastasis vs stage_group",
    x = "Metastasis",
    y = "Count of Patients"
  )
### Predicting thyroid cancer using machine learning
In this section I move on to machine learning, where I split the data into training and test sets, because the data set has many variables that can be used to predict recurrence and to identify key risk factors. I used a logistic regression model and a random forest model.
# Split the data 70/30 into training and test sets on the outcome variable;
# the seed makes the split reproducible
set.seed(123)
trainIndex <- createDataPartition(
  y = Thyriod_cleaned[["Recurrence_fac"]],
  p = 0.7,
  list = FALSE
)
train_data <- Thyriod_cleaned[trainIndex, ]
test_data <- Thyriod_cleaned[-trainIndex, ]
Logistic Regression Model
# Fit a binomial logistic regression of recurrence on age group and the
# staging variables, using the training set only.
# Stage_group, Tumor_group, LymphNodes_group and Metastasis are ordered
# factors, so their coefficients are reported as .L / .Q / .C contrast terms
# rather than one term per level.
logit_model <- glm(
  formula = Recurrence_fac ~ Age_Group + Stage_group + Tumor_group +
    LymphNodes_group + Metastasis,
  family = "binomial",
  data = train_data
)
# Coefficient table, deviances and AIC
summary(logit_model)
##
## Call:
## glm(formula = Recurrence_fac ~ Age_Group + Stage_group + Tumor_group +
## LymphNodes_group + Metastasis, family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.9792 1935.4530 0.008 0.99382
## Age_Group31–40 0.2924 0.5693 0.514 0.60756
## Age_Group41–50 -0.6682 0.7453 -0.896 0.37000
## Age_Group51–60 0.7752 0.7203 1.076 0.28183
## Age_Group61–70 1.9946 0.8351 2.389 0.01691 *
## Age_Group71+ 1.5113 1.1122 1.359 0.17421
## Stage_group.L 13.2979 1998.5582 0.007 0.99469
## Stage_group.Q -0.3355 3275.7097 0.000 0.99992
## Stage_group.C -7.0660 4179.2719 -0.002 0.99865
## Tumor_group.L 2.5652 0.9263 2.769 0.00562 **
## Tumor_group.Q 0.7305 0.7045 1.037 0.29984
## Tumor_group.C -0.3854 0.4266 -0.903 0.36637
## LymphNodes_group.L 1.9297 0.3198 6.033 1.61e-09 ***
## Metastasis.L 11.7566 1458.3661 0.008 0.99357
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 320.28 on 268 degrees of freedom
## Residual deviance: 161.98 on 255 degrees of freedom
## AIC: 189.98
##
## Number of Fisher Scoring iterations: 18
# Exponentiate the coefficients and their confidence limits to express them
# on the odds-ratio scale
exp(
  cbind(
    OR = coef(logit_model),
    confint(logit_model)
  )
)
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 3.201611e+06 Inf 1.128787e+301
## Age_Group31–40 1.339601e+00 4.407451e-01 4.187234e+00
## Age_Group41–50 5.126503e-01 1.116303e-01 2.137189e+00
## Age_Group51–60 2.171066e+00 5.193063e-01 9.054304e+00
## Age_Group61–70 7.349534e+00 1.381187e+00 3.828720e+01
## Age_Group71+ 4.532652e+00 4.435238e-01 3.769683e+01
## Stage_group.L 5.959249e+05 3.078701e-33 1.498085e+285
## Stage_group.Q 7.149821e-01 1.564766e-25 1.322162e+26
## Stage_group.C 8.536122e-04 1.081510e-39 1.245719e+33
## Tumor_group.L 1.300288e+01 2.466951e+00 1.178282e+02
## Tumor_group.Q 2.076046e+00 5.795281e-01 1.092664e+01
## Tumor_group.C 6.802060e-01 2.981342e-01 1.672326e+00
## LymphNodes_group.L 6.887108e+00 3.773671e+00 1.334819e+01
## Metastasis.L 1.275899e+05 1.470867e-23 9.700235e+208
# Evaluate the logistic model on the held-out test set
# Predicted probabilities from the fitted model
logit_pred <- predict(logit_model, newdata = test_data, type = "response")
# Classify as "Yes" when the predicted probability exceeds 0.5
logit_pred_class <- ifelse(logit_pred > 0.5, "Yes", "No")
# Recurrence ("Yes") is the event of interest, so it is set as the positive
# class; by default the first level ("No") is used, which makes sensitivity
# and specificity describe non-recurrence instead. The predictions are given
# the same levels as the reference so the table stays well-formed even if
# only one class happens to be predicted.
confusionMatrix(
  factor(logit_pred_class, levels = levels(test_data$Recurrence_fac)),
  test_data$Recurrence_fac,
  positive = "Yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 78 7
## Yes 4 25
##
## Accuracy : 0.9035
## 95% CI : (0.8339, 0.9508)
## No Information Rate : 0.7193
## P-Value [Acc > NIR] : 1.358e-06
##
## Kappa : 0.754
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.7812
## Specificity : 0.9512
## Pos Pred Value : 0.8621
## Neg Pred Value : 0.9176
## Prevalence : 0.2807
## Detection Rate : 0.2193
## Detection Prevalence : 0.2544
## Balanced Accuracy : 0.8662
##
## 'Positive' Class : Yes
##
Random Forest
# Fit a random forest on the same predictors as the logistic model; the seed
# makes the fit reproducible, and importance = TRUE stores the variable
# importance measures used by the plot below
set.seed(123)
rf_model <- randomForest(
  Recurrence_fac ~ Age_Group + Stage_group + Tumor_group +
    LymphNodes_group + Metastasis,
  data = train_data,
  importance = TRUE
)
# Plot the variable importance measures
varImpPlot(rf_model, main = "Variable Importance in Recurrence Prediction")
# Evaluate the random forest on the held-out test set
rf_pred <- predict(rf_model, newdata = test_data)
# Recurrence ("Yes") is the event of interest, so it is set as the positive
# class; by default the first level ("No") is used, which makes sensitivity
# and specificity describe non-recurrence instead.
confusionMatrix(rf_pred, test_data$Recurrence_fac, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 78 7
## Yes 4 25
##
## Accuracy : 0.9035
## 95% CI : (0.8339, 0.9508)
## No Information Rate : 0.7193
## P-Value [Acc > NIR] : 1.358e-06
##
## Kappa : 0.754
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.7812
## Specificity : 0.9512
## Pos Pred Value : 0.8621
## Neg Pred Value : 0.9176
## Prevalence : 0.2807
## Detection Rate : 0.2193
## Detection Prevalence : 0.2544
## Balanced Accuracy : 0.8662
##
## 'Positive' Class : Yes
##