library(dplyr)
library(GGally)
library(caret)
library(randomForest)
Thyroid <- read.csv("C:\\Users\\NCC-1701D\\Downloads\\archive (4)\\thyroidDF.csv")
# Replace NA values in numeric columns with the column mean
Thyroid <- Thyroid %>%
mutate_if(is.numeric, ~ifelse(is.na(.), mean(., na.rm = TRUE), .))
summary(Thyroid)
## age sex on_thyroxine query_on_thyroxine
## Min. : 1.00 Length:9172 Length:9172 Length:9172
## 1st Qu.: 37.00 Class :character Class :character Class :character
## Median : 55.00 Mode :character Mode :character Mode :character
## Mean : 73.56
## 3rd Qu.: 68.00
## Max. :65526.00
## on_antithyroid_meds sick pregnant thyroid_surgery
## Length:9172 Length:9172 Length:9172 Length:9172
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## I131_treatment query_hypothyroid query_hyperthyroid lithium
## Length:9172 Length:9172 Length:9172 Length:9172
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## goitre tumor hypopituitary psych
## Length:9172 Length:9172 Length:9172 Length:9172
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## TSH_measured TSH T3_measured T3
## Length:9172 Min. : 0.005 Length:9172 Min. : 0.050
## Class :character 1st Qu.: 0.590 Class :character 1st Qu.: 1.700
## Mode :character Median : 1.600 Mode :character Median : 1.971
## Mean : 5.218 Mean : 1.971
## 3rd Qu.: 3.700 3rd Qu.: 2.200
## Max. :530.000 Max. :18.000
## TT4_measured TT4 T4U_measured T4U
## Length:9172 Min. : 2.0 Length:9172 Min. :0.1700
## Class :character 1st Qu.: 88.0 Class :character 1st Qu.:0.8700
## Mode :character Median :106.0 Mode :character Median :0.9761
## Mean :108.7 Mean :0.9761
## 3rd Qu.:124.0 3rd Qu.:1.0500
## Max. :600.0 Max. :2.3300
## FTI_measured FTI TBG_measured TBG
## Length:9172 Min. : 1.4 Length:9172 Min. : 0.10
## Class :character 1st Qu.: 95.0 Class :character 1st Qu.: 29.87
## Mode :character Median :112.0 Mode :character Median : 29.87
## Mean :113.6 Mean : 29.87
## 3rd Qu.:126.0 3rd Qu.: 29.87
## Max. :881.0 Max. :200.00
## referral_source target patient_id
## Length:9172 Length:9172 Min. :840801013
## Class :character Class :character 1st Qu.:850409012
## Mode :character Mode :character Median :851004026
## Mean :852947347
## 3rd Qu.:860711023
## Max. :870119035
names(Thyroid)
## [1] "age" "sex" "on_thyroxine"
## [4] "query_on_thyroxine" "on_antithyroid_meds" "sick"
## [7] "pregnant" "thyroid_surgery" "I131_treatment"
## [10] "query_hypothyroid" "query_hyperthyroid" "lithium"
## [13] "goitre" "tumor" "hypopituitary"
## [16] "psych" "TSH_measured" "TSH"
## [19] "T3_measured" "T3" "TT4_measured"
## [22] "TT4" "T4U_measured" "T4U"
## [25] "FTI_measured" "FTI" "TBG_measured"
## [28] "TBG" "referral_source" "target"
## [31] "patient_id"
na_counts <- Thyroid %>% summarise_all(~sum(is.na(.)))
print(na_counts)
## age sex on_thyroxine query_on_thyroxine on_antithyroid_meds sick pregnant
## 1 0 0 0 0 0 0 0
## thyroid_surgery I131_treatment query_hypothyroid query_hyperthyroid lithium
## 1 0 0 0 0 0
## goitre tumor hypopituitary psych TSH_measured TSH T3_measured T3 TT4_measured
## 1 0 0 0 0 0 0 0 0 0
## TT4 T4U_measured T4U FTI_measured FTI TBG_measured TBG referral_source target
## 1 0 0 0 0 0 0 0 0 0
## patient_id
## 1 0
Thyroid$sick <- Thyroid$sick == 't' # recode 't'/'f' strings to logical
# Subset the thyroid lab measurements for a pairwise overview
selected_columns <- c("TSH", "T3", "TT4", "T4U", "FTI", "TBG")
thyroid_subset <- Thyroid %>% select(all_of(selected_columns))
# Use ggpairs to create the plot
suppressWarnings(ggpairs(thyroid_subset))
# Convert 'target' to logical: TRUE for classes 'A'-'D', FALSE otherwise
Thyroid$target <- Thyroid$target %in% c('A', 'B', 'C', 'D')
# Feature selection
Thyroid_selected <- Thyroid %>%
select(-patient_id, -referral_source,
-TSH_measured, -T3_measured, -TT4_measured,
-T4U_measured, -FTI_measured, -TBG_measured,
-starts_with("query_"))
Thyroid_selected$sex <- as.factor(Thyroid_selected$sex)
# Split the data into training and testing sets
set.seed(333)
index <- createDataPartition(Thyroid_selected$target, p=0.8, list=FALSE)
trainData <- Thyroid_selected[index,]
testData <- Thyroid_selected[-index,]
trainData$target <- factor(trainData$target, levels = c(FALSE, TRUE))
testData$target <- factor(testData$target, levels = c(FALSE, TRUE))
# Train the Random Forest model
rf_model <- randomForest(target ~ ., data=trainData, ntree=100)
# Predict on the test data
rf_predictions <- predict(rf_model, newdata=testData)
rf_predictions <- factor(rf_predictions, levels = c(FALSE, TRUE))
# Evaluate model performance
confusionMatrix(rf_predictions, testData$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 1793 19
## TRUE 5 17
##
## Accuracy : 0.9869
## 95% CI : (0.9806, 0.9916)
## No Information Rate : 0.9804
## P-Value [Acc > NIR] : 0.021463
##
## Kappa : 0.58
##
## Mcnemar's Test P-Value : 0.007963
##
## Sensitivity : 0.9972
## Specificity : 0.4722
## Pos Pred Value : 0.9895
## Neg Pred Value : 0.7727
## Prevalence : 0.9804
## Detection Rate : 0.9776
## Detection Prevalence : 0.9880
## Balanced Accuracy : 0.7347
##
## 'Positive' Class : FALSE
##
# Find the optimal number of neighbors
control <- trainControl(method="cv", number=10)
knn_grid <- expand.grid(k=1:20) # Checking for k values from 1 to 20
knn_model <- train(target ~ ., data=trainData, method="knn", trControl=control, tuneGrid=knn_grid)
# Best K value
best_k <- knn_model$bestTune$k
# Predict on test data using the best K
knn_predictions <- predict(knn_model, newdata=testData)
# Evaluate model performance
confusionMatrix(knn_predictions, testData$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 1791 28
## TRUE 7 8
##
## Accuracy : 0.9809
## 95% CI : (0.9736, 0.9867)
## No Information Rate : 0.9804
## P-Value [Acc > NIR] : 0.4771748
##
## Kappa : 0.3057
##
## Mcnemar's Test P-Value : 0.0007232
##
## Sensitivity : 0.9961
## Specificity : 0.2222
## Pos Pred Value : 0.9846
## Neg Pred Value : 0.5333
## Prevalence : 0.9804
## Detection Rate : 0.9766
## Detection Prevalence : 0.9918
## Balanced Accuracy : 0.6092
##
## 'Positive' Class : FALSE
##
# Convert the target variable to a factor
Thyroid_selected$target <- as.factor(Thyroid_selected$target)
set.seed(333)
control <- trainControl(method="boot", number=5) # 5 bootstrap resamples
# Random Forest with Bootstrapping
rf_model_boot <- train(target ~ ., data=Thyroid_selected, method="rf", trControl=control, ntree=100)
# KNN with Bootstrapping
knn_grid <- expand.grid(k=1:20) # Adjust the range of k as needed
knn_model_boot <- train(target ~ ., data=Thyroid_selected, method="knn", trControl=control, tuneGrid=knn_grid)
# Print model summaries
print(rf_model_boot)
## Random Forest
##
## 9172 samples
## 19 predictor
## 2 classes: 'FALSE', 'TRUE'
##
## No pre-processing
## Resampling: Bootstrapped (5 reps)
## Summary of sample sizes: 9172, 9172, 9172, 9172, 9172
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9866275 0.5388136
## 11 0.9884017 0.6773798
## 20 0.9878621 0.6708864
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 11.
print(knn_model_boot)
## k-Nearest Neighbors
##
## 9172 samples
## 19 predictor
## 2 classes: 'FALSE', 'TRUE'
##
## No pre-processing
## Resampling: Bootstrapped (5 reps)
## Summary of sample sizes: 9172, 9172, 9172, 9172, 9172
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9748919 0.3546291
## 2 0.9746539 0.3469561
## 3 0.9752613 0.3466190
## 4 0.9765161 0.3440742
## 5 0.9786036 0.3787964
## 6 0.9789042 0.3673729
## 7 0.9796190 0.3670805
## 8 0.9803906 0.3870553
## 9 0.9808669 0.4011299
## 10 0.9812818 0.4163365
## 11 0.9812245 0.4110615
## 12 0.9815225 0.4178772
## 13 0.9815799 0.4266157
## 14 0.9818150 0.4284107
## 15 0.9817537 0.4191698
## 16 0.9812784 0.4060467
## 17 0.9812204 0.4009469
## 18 0.9814008 0.4012008
## 19 0.9818808 0.4204076
## 20 0.9821807 0.4288574
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 20.
The problem I am trying to solve aims to identify key factors that contribute to thyroid disorders. Doing so can surface early indicators of thyroid health issues and aid preventative healthcare measures, and the findings can help healthcare professionals recognize patterns in thyroid health, supporting better diagnosis and treatment strategies. Beyond identifying which factors are associated with thyroid disorders, I am also trying to find a substantive model, interpretable or not, that can predict them.

The dataset includes a range of variables such as age, sex, thyroid-related lab measurements (TSH, T3, TT4, T4U, FTI, TBG), treatment and symptom indicators, and other clinical and demographic information. Several columns were boolean "_measured" flags recording only whether a lab value had been taken; I removed these as redundant, since the lab values themselves are already in the data. I also removed the patient ID, the "query_" columns, and the referral source, as they do not bear on the prediction task.

To predict hyperthyroidism (target classes 'A' through 'D'), I employed two machine learning models: Random Forest and K-Nearest Neighbors (KNN), each yielding distinct insights. The Random Forest model reached an accuracy of 98.69%. Note that caret treats FALSE (no hyperthyroidism) as the positive class here, so the reported sensitivity of 99.72% measures how well the model identifies patients without hyperthyroidism, while the specificity of 47.22% shows that fewer than half of the true hyperthyroid cases were detected. The KNN model achieved a slightly lower accuracy of 98.09% and sensitivity of 99.61%, but its specificity of 22.22% and balanced accuracy of 60.92% indicate even greater difficulty catching the hyperthyroid cases. These results highlight both models' strength on the majority (non-hyperthyroid) class, but they also underline the need for better detection of the minority class, particularly for KNN.
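Since one stated goal is identifying the key factors, a natural follow-up is to look at variable importance from the fitted random forest. The lines below are a minimal sketch, assuming the rf_model object trained earlier is still in the workspace; importance() and varImpPlot() come from the randomForest package already loaded at the top, and rf_importance is just an illustrative name.

# Illustrative follow-up: rank the predictors of rf_model by mean decrease in Gini impurity
rf_importance <- importance(rf_model)
rf_importance[order(rf_importance[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]
# Plot the same ranking
varImpPlot(rf_model, main = "Random forest variable importance")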
In addition to the evaluations above, I re-ran both Random Forest and KNN using bootstrapping for model validation. The Random Forest model, with 19 predictors and 5 bootstrap resamples, performed best at mtry = 11, reaching an accuracy of 98.84% and a Kappa of 0.68, which indicates reasonable agreement beyond chance despite the heavy class imbalance. The KNN model improved steadily as the neighborhood grew and peaked at k = 20 with an accuracy of 98.22%, but its Kappa of 0.43 remained well below the Random Forest's. Because the non-hyperthyroid class dominates the data, raw accuracy is a weak yardstick here; judged on Kappa, which corrects for chance agreement, the bootstrapped Random Forest is clearly the stronger of the two models for this dataset.
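As a further check, caret's resamples() can put the two bootstrapped models side by side on the same summary scale. This is a minimal sketch assuming rf_model_boot and knn_model_boot from above are still available; for a strictly paired comparison, both train() calls would ideally share the same resampling indices (for example by resetting the same seed immediately before each call, or by passing index= to trainControl()).

# Collect the bootstrap results from both caret models for comparison
model_comparison <- resamples(list(RF = rf_model_boot, KNN = knn_model_boot))
summary(model_comparison) # accuracy and Kappa across the 5 bootstrap resamples
bwplot(model_comparison)  # box-and-whisker view (lattice is loaded with caret)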