library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(GGally)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Load the thyroid dataset from a local CSV export.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file exists.
Thyroid <- read.csv("C:\\Users\\NCC-1701D\\Downloads\\archive (4)\\thyroidDF.csv")

# Replace missing values in every numeric column with that column's mean.
# mutate(across(where(is.numeric), ...)) is the current dplyr form of the
# superseded mutate_if().
# NOTE(review): the means are computed on the full data set, before the
# train/test split further down, so held-out rows influence the imputed values.
Thyroid <- Thyroid %>%
  mutate(across(
    where(is.numeric),
    ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)
  ))

Exploratory Data Analysis

# Per-column summary of the data frame after mean imputation
summary(Thyroid)
##       age               sex            on_thyroxine       query_on_thyroxine
##  Min.   :    1.00   Length:9172        Length:9172        Length:9172       
##  1st Qu.:   37.00   Class :character   Class :character   Class :character  
##  Median :   55.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :   73.56                                                           
##  3rd Qu.:   68.00                                                           
##  Max.   :65526.00                                                           
##  on_antithyroid_meds     sick             pregnant         thyroid_surgery   
##  Length:9172         Length:9172        Length:9172        Length:9172       
##  Class :character    Class :character   Class :character   Class :character  
##  Mode  :character    Mode  :character   Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##  I131_treatment     query_hypothyroid  query_hyperthyroid   lithium         
##  Length:9172        Length:9172        Length:9172        Length:9172       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     goitre             tumor           hypopituitary         psych          
##  Length:9172        Length:9172        Length:9172        Length:9172       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  TSH_measured            TSH          T3_measured              T3        
##  Length:9172        Min.   :  0.005   Length:9172        Min.   : 0.050  
##  Class :character   1st Qu.:  0.590   Class :character   1st Qu.: 1.700  
##  Mode  :character   Median :  1.600   Mode  :character   Median : 1.971  
##                     Mean   :  5.218                      Mean   : 1.971  
##                     3rd Qu.:  3.700                      3rd Qu.: 2.200  
##                     Max.   :530.000                      Max.   :18.000  
##  TT4_measured            TT4        T4U_measured            T4U        
##  Length:9172        Min.   :  2.0   Length:9172        Min.   :0.1700  
##  Class :character   1st Qu.: 88.0   Class :character   1st Qu.:0.8700  
##  Mode  :character   Median :106.0   Mode  :character   Median :0.9761  
##                     Mean   :108.7                      Mean   :0.9761  
##                     3rd Qu.:124.0                      3rd Qu.:1.0500  
##                     Max.   :600.0                      Max.   :2.3300  
##  FTI_measured            FTI        TBG_measured            TBG        
##  Length:9172        Min.   :  1.4   Length:9172        Min.   :  0.10  
##  Class :character   1st Qu.: 95.0   Class :character   1st Qu.: 29.87  
##  Mode  :character   Median :112.0   Mode  :character   Median : 29.87  
##                     Mean   :113.6                      Mean   : 29.87  
##                     3rd Qu.:126.0                      3rd Qu.: 29.87  
##                     Max.   :881.0                      Max.   :200.00  
##  referral_source       target            patient_id       
##  Length:9172        Length:9172        Min.   :840801013  
##  Class :character   Class :character   1st Qu.:850409012  
##  Mode  :character   Mode  :character   Median :851004026  
##                                        Mean   :852947347  
##                                        3rd Qu.:860711023  
##                                        Max.   :870119035
# List the column names of the data frame
names(Thyroid)
##  [1] "age"                 "sex"                 "on_thyroxine"       
##  [4] "query_on_thyroxine"  "on_antithyroid_meds" "sick"               
##  [7] "pregnant"            "thyroid_surgery"     "I131_treatment"     
## [10] "query_hypothyroid"   "query_hyperthyroid"  "lithium"            
## [13] "goitre"              "tumor"               "hypopituitary"      
## [16] "psych"               "TSH_measured"        "TSH"                
## [19] "T3_measured"         "T3"                  "TT4_measured"       
## [22] "TT4"                 "T4U_measured"        "T4U"                
## [25] "FTI_measured"        "FTI"                 "TBG_measured"       
## [28] "TBG"                 "referral_source"     "target"             
## [31] "patient_id"
# Count the NA values remaining in each column.
# summarise(across(everything(), ...)) is the current dplyr form of the
# superseded summarise_all().
na_counts <- Thyroid %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
print(na_counts)
##   age sex on_thyroxine query_on_thyroxine on_antithyroid_meds sick pregnant
## 1   0   0            0                  0                   0    0        0
##   thyroid_surgery I131_treatment query_hypothyroid query_hyperthyroid lithium
## 1               0              0                 0                  0       0
##   goitre tumor hypopituitary psych TSH_measured TSH T3_measured T3 TT4_measured
## 1      0     0             0     0            0   0           0  0            0
##   TT4 T4U_measured T4U FTI_measured FTI TBG_measured TBG referral_source target
## 1   0            0   0            0   0            0   0               0      0
##   patient_id
## 1          0
# Recode the 'sick' flag to a logical: TRUE where the stored value is "t"
Thyroid <- Thyroid %>%
  mutate(sick = sick == "t")

# Subsetting

# Lab measurement columns to compare in the pairs plot (each listed once).
selected_columns <- c("TSH", "T3", "TT4", "T4U", "FTI", "TBG")
# all_of() marks selected_columns as an external character vector, which is
# the supported way to pass one to select() and avoids the tidyselect
# deprecation warning for bare external vectors.
thyroid_subset <- Thyroid %>% select(all_of(selected_columns))
# Use ggpairs to create a pairwise plot matrix of the selected lab measurements.
# suppressWarnings() wraps the ggpairs() call, so warnings raised while that
# call is evaluated are not shown.
suppressWarnings(ggpairs(thyroid_subset))

# Convert 'target' to logical: TRUE if it's 'A', 'B', 'C', or 'D'; FALSE otherwise.
# %in% already returns TRUE/FALSE (never NA), so no ifelse() wrapper is needed.
Thyroid$target <- Thyroid$target %in% c("A", "B", "C", "D")

# Feature selection: drop the patient identifier, the referral source, the six
# "*_measured" flag columns, and every column whose name starts with "query_".
Thyroid_selected <- select(
  Thyroid,
  -c(
    patient_id, referral_source,
    TSH_measured, T3_measured, TT4_measured,
    T4U_measured, FTI_measured, TBG_measured
  ),
  -starts_with("query_")
)

Data cleaning and splitting into training and testing

# Encode sex as a factor so it is handled as a categorical predictor
Thyroid_selected[["sex"]] <- factor(Thyroid_selected[["sex"]])
# No rows are removed here; the NA counts printed earlier are zero for every column.

# Hold out 20% of the rows for testing; the seed is fixed so the split is reproducible
set.seed(333)
index <- createDataPartition(
  y = Thyroid_selected[["target"]],
  p = 0.8,
  list = FALSE
)
trainData <- Thyroid_selected[index, ]
testData <- Thyroid_selected[-index, ]

# Store the outcome as a two-level factor (FALSE, TRUE) in both partitions
trainData[["target"]] <- factor(trainData[["target"]], levels = c(FALSE, TRUE))
testData[["target"]] <- factor(testData[["target"]], levels = c(FALSE, TRUE))

Random forest and prediction

# Fit a 100-tree random forest on the training partition, using every
# remaining column as a predictor of target
rf_model <- randomForest(target ~ ., data = trainData, ntree = 100)

# Score the held-out rows and pin the predicted classes to the same two levels
# as the reference factor
rf_predictions <- factor(
  predict(rf_model, newdata = testData),
  levels = c(FALSE, TRUE)
)

# Cross-tabulate predictions against the true labels.
# NOTE: no `positive` class is given, and the printed result reports
# 'Positive' Class : FALSE, so sensitivity describes the FALSE class and
# specificity describes the TRUE class.
confusionMatrix(data = rf_predictions, reference = testData$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  1793   19
##      TRUE      5   17
##                                           
##                Accuracy : 0.9869          
##                  95% CI : (0.9806, 0.9916)
##     No Information Rate : 0.9804          
##     P-Value [Acc > NIR] : 0.021463        
##                                           
##                   Kappa : 0.58            
##                                           
##  Mcnemar's Test P-Value : 0.007963        
##                                           
##             Sensitivity : 0.9972          
##             Specificity : 0.4722          
##          Pos Pred Value : 0.9895          
##          Neg Pred Value : 0.7727          
##              Prevalence : 0.9804          
##          Detection Rate : 0.9776          
##    Detection Prevalence : 0.9880          
##       Balanced Accuracy : 0.7347          
##                                           
##        'Positive' Class : FALSE           
## 
# Tune k for KNN with 10-fold cross-validation over k = 1..20
control <- trainControl(method = "cv", number = 10)
knn_grid <- expand.grid(k = seq_len(20))

# NOTE(review): no preProcess step is requested, so distances are computed on
# the raw predictor scales - consider centring/scaling.
knn_model <- train(
  target ~ .,
  data = trainData,
  method = "knn",
  trControl = control,
  tuneGrid = knn_grid
)

# k chosen by the resampling search (stored for reference; not used again in
# the code shown here)
best_k <- knn_model[["bestTune"]][["k"]]

# Predict the held-out rows with the tuned model
knn_predictions <- predict(knn_model, newdata = testData)

# Cross-tabulate predictions against the true labels
confusionMatrix(data = knn_predictions, reference = testData$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  1791   28
##      TRUE      7    8
##                                           
##                Accuracy : 0.9809          
##                  95% CI : (0.9736, 0.9867)
##     No Information Rate : 0.9804          
##     P-Value [Acc > NIR] : 0.4771748       
##                                           
##                   Kappa : 0.3057          
##                                           
##  Mcnemar's Test P-Value : 0.0007232       
##                                           
##             Sensitivity : 0.9961          
##             Specificity : 0.2222          
##          Pos Pred Value : 0.9846          
##          Neg Pred Value : 0.5333          
##              Prevalence : 0.9804          
##          Detection Rate : 0.9766          
##    Detection Prevalence : 0.9918          
##       Balanced Accuracy : 0.6092          
##                                           
##        'Positive' Class : FALSE           
## 
# The full (unsplit) data set is used below, so its target must be a factor too
Thyroid_selected[["target"]] <- factor(Thyroid_selected[["target"]])

set.seed(333)
# Bootstrap validation with 5 resamples
control <- trainControl(method = "boot", number = 5)

# Random Forest with bootstrapping (100 trees per forest)
rf_model_boot <- train(
  target ~ .,
  data = Thyroid_selected,
  method = "rf",
  trControl = control,
  ntree = 100
)

# KNN with bootstrapping, searching k = 1..20 (adjust the range as needed)
knn_grid <- expand.grid(k = seq_len(20))
knn_model_boot <- train(
  target ~ .,
  data = Thyroid_selected,
  method = "knn",
  trControl = control,
  tuneGrid = knn_grid
)

# Print model summaries: resampling results for the bootstrapped Random Forest
print(rf_model_boot)
## Random Forest 
## 
## 9172 samples
##   19 predictor
##    2 classes: 'FALSE', 'TRUE' 
## 
## No pre-processing
## Resampling: Bootstrapped (5 reps) 
## Summary of sample sizes: 9172, 9172, 9172, 9172, 9172 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9866275  0.5388136
##   11    0.9884017  0.6773798
##   20    0.9878621  0.6708864
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 11.
# Resampling results for the bootstrapped KNN model
print(knn_model_boot)
## k-Nearest Neighbors 
## 
## 9172 samples
##   19 predictor
##    2 classes: 'FALSE', 'TRUE' 
## 
## No pre-processing
## Resampling: Bootstrapped (5 reps) 
## Summary of sample sizes: 9172, 9172, 9172, 9172, 9172 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.9748919  0.3546291
##    2  0.9746539  0.3469561
##    3  0.9752613  0.3466190
##    4  0.9765161  0.3440742
##    5  0.9786036  0.3787964
##    6  0.9789042  0.3673729
##    7  0.9796190  0.3670805
##    8  0.9803906  0.3870553
##    9  0.9808669  0.4011299
##   10  0.9812818  0.4163365
##   11  0.9812245  0.4110615
##   12  0.9815225  0.4178772
##   13  0.9815799  0.4266157
##   14  0.9818150  0.4284107
##   15  0.9817537  0.4191698
##   16  0.9812784  0.4060467
##   17  0.9812204  0.4009469
##   18  0.9814008  0.4012008
##   19  0.9818808  0.4204076
##   20  0.9821807  0.4288574
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 20.

This analysis aims to identify key factors that contribute to thyroid disorders. Doing so can provide insight into early indicators of thyroid health issues and aid preventative healthcare. The findings can also help healthcare professionals understand patterns in thyroid health, supporting better diagnosis and treatment strategies. In addition to identifying the factors associated with thyroid disorders, I am looking for a model that predicts them well, whether or not it is interpretable. The dataset includes a range of variables such as age, sex, thyroid-related measurements (‘TSH’, ‘T3’, ‘TT4’, ‘T4U’, ‘FTI’, ‘TBG’), treatment and symptom indicators, and other clinical and demographic information. Several columns (those ending in ‘_measured’) are booleans that only record whether the corresponding measurement was observed; I removed them as redundant. I also removed the patient query columns, the patient ID and the referral source, as they did not pertain to the prediction. To predict hyperthyroidism, I employed two machine learning models: Random Forest and K-Nearest Neighbors (KNN). Note that both confusion matrices treat FALSE (no hyperthyroidism) as the ‘positive’ class, so sensitivity here is the fraction of non-hyperthyroid patients correctly identified, and specificity is the fraction of hyperthyroid patients correctly identified. The Random Forest model reached an accuracy of 98.69% with a sensitivity of 99.72%, meaning it almost never flags a non-hyperthyroid patient, but a specificity of only 47.22%: it detected 17 of the 36 hyperthyroid cases in the test set. The KNN model achieved a slightly lower accuracy of 98.09% and a sensitivity of 99.61%, but its specificity was notably lower at 22.22% (8 of 36 hyperthyroid cases detected), giving a balanced accuracy of 60.92% versus 73.47% for Random Forest. Because only about 2% of test patients are hyperthyroid (the no-information rate is 98.04%), the high accuracies mostly reflect correct identification of non-hyperthyroid patients. These results highlight a need to improve the detection of actual hyperthyroid cases, particularly in the KNN model.

In addition to the models I used above, I refit both Random Forest and K-Nearest Neighbors (KNN) on the full dataset, using bootstrapping for model validation. The Random Forest model, with 19 predictors and 5 bootstrap resamples, achieved its best performance at mtry = 11, with high accuracy (98.84%) and a moderate Kappa (0.68). This indicates that it classifies most patients correctly, albeit with some limitations in separating the rare hyperthyroid cases from the rest. The KNN model was best at k = 20, reaching an accuracy of 98.22% but a lower Kappa (0.43), so its agreement beyond chance is weaker than the Random Forest’s. Because k = 20 is the largest value in the grid and accuracy was still rising slightly, a wider range of k may be worth testing. The bootstrapped summaries report only accuracy and Kappa, so sensitivity and specificity cannot be compared here. Overall, both models achieve high accuracy on this imbalanced dataset, but Random Forest has the edge on both accuracy and Kappa and is the more reliable of the two for hyperthyroidism classification.