# Import the dataset from the local computer.
# NOTE: the file path below is machine-specific; replace it with the actual
# location of thyroid_cancer_risk_data.csv (a CSV file) on your computer.
thyroid_cancer_risk_data1 <- read.csv(
  "C:/Users/user/Downloads/thyroid_cancer_risk_data.csv",
  header = TRUE
)
# View() opens a spreadsheet-style viewer, which is only meaningful in an
# interactive session, so it is skipped when the script runs non-interactively.
if (interactive()) {
  View(thyroid_cancer_risk_data1)
}
# Print the structure of the data: column names, types and first values.
str(thyroid_cancer_risk_data1)
## 'data.frame': 212691 obs. of 17 variables:
## $ Patient_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 66 29 86 75 35 89 89 38 17 36 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Country : chr "Russia" "Germany" "Nigeria" "India" ...
## $ Ethnicity : chr "Caucasian" "Hispanic" "Caucasian" "Asian" ...
## $ Family_History : chr "No" "No" "No" "No" ...
## $ Radiation_Exposure : chr "Yes" "Yes" "No" "No" ...
## $ Iodine_Deficiency : chr "No" "No" "No" "No" ...
## $ Smoking : chr "No" "No" "No" "No" ...
## $ Obesity : chr "No" "No" "No" "No" ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ TSH_Level : num 9.37 1.83 6.26 4.1 9.1 4 4.7 5.54 2.3 1.34 ...
## $ T3_Level : num 1.67 1.73 2.59 2.62 2.11 0.98 0.62 3.49 2.6 0.56 ...
## $ T4_Level : num 6.16 10.54 10.57 11.04 10.71 ...
## $ Nodule_Size : num 1.08 4.05 4.61 2.46 2.11 0.02 0.01 4.3 0.81 1.44 ...
## $ Thyroid_Cancer_Risk: chr "Low" "Low" "Low" "Medium" ...
## $ Diagnosis : chr "Benign" "Benign" "Benign" "Benign" ...
# List the column names of the data set.
colnames(thyroid_cancer_risk_data1)
## [1] "Patient_ID" "Age" "Gender"
## [4] "Country" "Ethnicity" "Family_History"
## [7] "Radiation_Exposure" "Iodine_Deficiency" "Smoking"
## [10] "Obesity" "Diabetes" "TSH_Level"
## [13] "T3_Level" "T4_Level" "Nodule_Size"
## [16] "Thyroid_Cancer_Risk" "Diagnosis"
# Preview the first 15 rows of the data set.
head(thyroid_cancer_risk_data1, n = 15L)
## Patient_ID Age Gender Country Ethnicity Family_History
## 1 1 66 Male Russia Caucasian No
## 2 2 29 Male Germany Hispanic No
## 3 3 86 Male Nigeria Caucasian No
## 4 4 75 Female India Asian No
## 5 5 35 Female Germany African Yes
## 6 6 89 Male UK African No
## 7 7 89 Female South Korea Asian Yes
## 8 8 38 Female India African No
## 9 9 17 Female Russia African No
## 10 10 36 Male Germany Asian No
## 11 11 67 Male Nigeria African No
## 12 12 16 Female Nigeria Asian No
## 13 13 44 Male South Korea Asian Yes
## 14 14 52 Male Brazil Asian No
## 15 15 16 Female China Asian No
## Radiation_Exposure Iodine_Deficiency Smoking Obesity Diabetes TSH_Level
## 1 Yes No No No No 9.37
## 2 Yes No No No No 1.83
## 3 No No No No No 6.26
## 4 No No No No No 4.10
## 5 Yes No No No No 9.10
## 6 No No Yes Yes No 4.00
## 7 Yes No No Yes No 4.70
## 8 No No No No No 5.54
## 9 Yes No No No Yes 2.30
## 10 No No No Yes No 1.34
## 11 Yes No No No No 9.65
## 12 No No Yes No No 0.53
## 13 No No No No Yes 6.77
## 14 No No No No No 4.91
## 15 No No No No No 6.84
## T3_Level T4_Level Nodule_Size Thyroid_Cancer_Risk Diagnosis
## 1 1.67 6.16 1.08 Low Benign
## 2 1.73 10.54 4.05 Low Benign
## 3 2.59 10.57 4.61 Low Benign
## 4 2.62 11.04 2.46 Medium Benign
## 5 2.11 10.71 2.11 High Benign
## 6 0.98 5.52 0.02 Medium Benign
## 7 0.62 11.73 0.01 High Malignant
## 8 3.49 9.47 4.30 Medium Benign
## 9 2.60 11.89 0.81 High Malignant
## 10 0.56 4.51 1.44 Low Benign
## 11 1.82 8.17 0.35 High Malignant
## 12 1.13 9.56 3.87 Medium Benign
## 13 1.37 6.13 4.15 High Malignant
## 14 0.95 6.00 0.38 Low Benign
## 15 0.62 6.80 1.68 Medium Benign
#' Classify ages into broad age groups.
#'
#' Vectorised: accepts a single age or a whole vector of ages, so it can be
#' called directly on a column as well as element by element.
#'
#' @param age A numeric vector of ages (may contain `NA`).
#' @return A character vector the same length as `age`:
#'   "Young" for ages below 30, "Middle-aged" for ages from 30 to 50
#'   (inclusive), "Senior" for ages above 50, and `NA` for missing ages.
categorize_age <- function(age) {
  # Start from all-missing so that NA ages stay NA in the result.
  category <- rep(NA_character_, length(age))
  # which() drops NA comparisons, so missing ages are never assigned a label.
  category[which(age < 30)] <- "Young"
  category[which(age >= 30 & age <= 50)] <- "Middle-aged"
  category[which(age > 50)] <- "Senior"
  category
}
# Add an Age_Category column by applying categorize_age() to every Age value.
thyroid_cancer_risk_data1[["Age_Category"]] <- sapply(
  X = thyroid_cancer_risk_data1[["Age"]],
  FUN = categorize_age
)
Now, for data manipulation, we first need to load the dplyr package (part of the tidyverse).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the patients whose risk is "High" and who are older than 50.
filtered_age <- filter(
  thyroid_cancer_risk_data1,
  Thyroid_Cancer_Risk == "High",
  Age > 50
)
# Preview the first 50 rows of the full (unfiltered) data set.
head(thyroid_cancer_risk_data1, n = 50L)
## Patient_ID Age Gender Country Ethnicity Family_History
## 1 1 66 Male Russia Caucasian No
## 2 2 29 Male Germany Hispanic No
## 3 3 86 Male Nigeria Caucasian No
## 4 4 75 Female India Asian No
## 5 5 35 Female Germany African Yes
## 6 6 89 Male UK African No
## 7 7 89 Female South Korea Asian Yes
## 8 8 38 Female India African No
## 9 9 17 Female Russia African No
## 10 10 36 Male Germany Asian No
## 11 11 67 Male Nigeria African No
## 12 12 16 Female Nigeria Asian No
## 13 13 44 Male South Korea Asian Yes
## 14 14 52 Male Brazil Asian No
## 15 15 16 Female China Asian No
## 16 16 78 Female Nigeria Caucasian Yes
## 17 17 74 Female India African Yes
## 18 18 35 Male Japan Hispanic No
## 19 19 47 Female USA Caucasian No
## 20 20 72 Female Japan Caucasian No
## 21 21 36 Male Russia Hispanic No
## 22 22 63 Female Nigeria Asian No
## 23 23 73 Male India Caucasian No
## 24 24 56 Male Nigeria Caucasian No
## 25 25 74 Female South Korea Asian No
## 26 26 29 Male Russia Asian Yes
## 27 27 76 Female Nigeria Hispanic Yes
## 28 28 76 Male Brazil Hispanic Yes
## 29 29 61 Female USA Hispanic No
## 30 30 76 Female India Asian Yes
## 31 31 65 Female India Caucasian No
## 32 32 69 Female Brazil Caucasian Yes
## 33 33 78 Male Russia African No
## 34 34 17 Male India African No
## 35 35 65 Female Germany African Yes
## 36 36 21 Female Japan Middle Eastern No
## 37 37 35 Female India Asian No
## 38 38 87 Female India African No
## 39 39 53 Male South Korea Caucasian Yes
## 40 40 32 Male China African No
## 41 41 18 Female Japan Caucasian Yes
## 42 42 74 Male Nigeria Caucasian No
## 43 43 28 Female China Caucasian No
## 44 44 23 Female India African No
## 45 45 67 Male Russia African No
## 46 46 16 Female Japan Caucasian Yes
## 47 47 74 Female Brazil Caucasian No
## 48 48 85 Male China African No
## 49 49 58 Female Russia African No
## 50 50 22 Male Nigeria Caucasian No
## Radiation_Exposure Iodine_Deficiency Smoking Obesity Diabetes TSH_Level
## 1 Yes No No No No 9.37
## 2 Yes No No No No 1.83
## 3 No No No No No 6.26
## 4 No No No No No 4.10
## 5 Yes No No No No 9.10
## 6 No No Yes Yes No 4.00
## 7 Yes No No Yes No 4.70
## 8 No No No No No 5.54
## 9 Yes No No No Yes 2.30
## 10 No No No Yes No 1.34
## 11 Yes No No No No 9.65
## 12 No No Yes No No 0.53
## 13 No No No No Yes 6.77
## 14 No No No No No 4.91
## 15 No No No No No 6.84
## 16 No Yes Yes No No 7.32
## 17 No No No No No 9.60
## 18 No No No No No 3.59
## 19 No No No No Yes 6.43
## 20 No No No No No 5.96
## 21 No Yes No Yes No 4.17
## 22 No No No Yes Yes 6.97
## 23 Yes No No Yes No 3.18
## 24 No No No No Yes 0.14
## 25 No No No No No 6.02
## 26 No No Yes No No 7.90
## 27 Yes Yes No No No 0.87
## 28 Yes No No No No 4.99
## 29 No No Yes No No 4.07
## 30 No No Yes No No 4.42
## 31 No No No Yes No 7.29
## 32 No No Yes No No 6.27
## 33 No No No No No 3.35
## 34 No Yes No Yes No 1.64
## 35 No No No No No 3.16
## 36 No No No No No 3.98
## 37 No Yes Yes No No 5.85
## 38 No No No No No 8.74
## 39 No No Yes No No 6.65
## 40 No No No Yes No 6.38
## 41 No Yes No No No 5.86
## 42 No Yes No No No 4.57
## 43 No No No No No 0.87
## 44 No No No Yes No 7.58
## 45 No Yes Yes No No 6.49
## 46 No No No Yes No 2.85
## 47 No No Yes No No 2.90
## 48 No No No No No 0.94
## 49 No No No No No 4.91
## 50 No No No Yes Yes 8.99
## T3_Level T4_Level Nodule_Size Thyroid_Cancer_Risk Diagnosis Age_Category
## 1 1.67 6.16 1.08 Low Benign Senior
## 2 1.73 10.54 4.05 Low Benign Young
## 3 2.59 10.57 4.61 Low Benign Senior
## 4 2.62 11.04 2.46 Medium Benign Senior
## 5 2.11 10.71 2.11 High Benign Middle-aged
## 6 0.98 5.52 0.02 Medium Benign Senior
## 7 0.62 11.73 0.01 High Malignant Senior
## 8 3.49 9.47 4.30 Medium Benign Middle-aged
## 9 2.60 11.89 0.81 High Malignant Young
## 10 0.56 4.51 1.44 Low Benign Middle-aged
## 11 1.82 8.17 0.35 High Malignant Senior
## 12 1.13 9.56 3.87 Medium Benign Young
## 13 1.37 6.13 4.15 High Malignant Middle-aged
## 14 0.95 6.00 0.38 Low Benign Senior
## 15 0.62 6.80 1.68 Medium Benign Young
## 16 1.90 11.82 2.86 Low Benign Senior
## 17 2.86 11.50 0.25 Low Benign Senior
## 18 1.83 4.95 4.93 Medium Benign Middle-aged
## 19 3.39 5.66 1.63 Medium Benign Middle-aged
## 20 1.26 7.89 2.27 Low Benign Senior
## 21 2.92 10.24 2.41 Low Benign Middle-aged
## 22 3.48 7.67 0.46 Low Malignant Senior
## 23 3.14 9.70 2.27 Low Malignant Senior
## 24 0.90 7.93 4.79 Low Benign Senior
## 25 2.75 11.41 3.63 Low Benign Senior
## 26 0.82 6.63 4.64 High Malignant Young
## 27 0.80 4.73 4.22 Low Benign Senior
## 28 1.56 6.48 1.54 Medium Benign Senior
## 29 2.27 10.98 4.26 Low Benign Senior
## 30 1.12 8.83 4.00 High Malignant Senior
## 31 3.02 6.52 3.06 Low Benign Senior
## 32 1.26 8.77 0.46 Medium Benign Senior
## 33 2.80 11.21 0.04 Medium Benign Senior
## 34 3.12 10.80 3.40 High Malignant Young
## 35 1.14 8.31 0.17 Low Benign Senior
## 36 1.33 6.40 0.09 Low Benign Young
## 37 1.16 7.95 2.68 High Malignant Middle-aged
## 38 2.63 5.74 3.90 Low Benign Senior
## 39 2.25 7.24 4.05 Medium Malignant Senior
## 40 2.77 6.26 3.76 Medium Benign Middle-aged
## 41 2.52 10.02 3.65 Medium Benign Young
## 42 2.14 4.65 1.14 Medium Benign Senior
## 43 1.64 5.76 3.56 Medium Benign Young
## 44 2.12 6.64 3.57 Low Benign Young
## 45 1.66 6.98 3.15 Low Benign Senior
## 46 1.29 10.85 4.21 Medium Benign Young
## 47 1.78 8.81 1.66 Low Benign Senior
## 48 3.07 7.04 0.71 Low Malignant Senior
## 49 1.24 4.84 4.93 Medium Benign Senior
## 50 2.51 10.39 4.27 Low Malignant Young
(Assuming dependent variable: Diagnosis, independent variables: Age, TSH_Level, Nodule_Size)
# Names of the outcome (dependent) column and the predictor (independent)
# columns used for the analysis.
dependent_var <- "Diagnosis"
independent_vars <- c("Age", "TSH_Level", "Nodule_Size")
# Subset the data to just those columns. Piping the data frame into c() would
# concatenate its columns with the name strings into a plain list rather than
# selecting anything, so select() with all_of() is used; all_of() also raises
# an error if any of the named columns is missing.
analysis_df <- thyroid_cancer_risk_data1 %>%
  dplyr::select(all_of(c(dependent_var, independent_vars)))
# Drop every row of the full data set that has a missing value in any column.
thyroid_cancer_risk_data1 <- na.omit(thyroid_cancer_risk_data1)
This keeps only the first row for each distinct Age value in thyroid_cancer_risk_data1 (removing rows with duplicate ages) and creates a second data set, which we named thyroid_cancer_risk_data2.
# Second data set: the first row encountered for each distinct Age, keeping
# all columns.
thyroid_cancer_risk_data2 <- distinct(
  thyroid_cancer_risk_data1,
  Age,
  .keep_all = TRUE
)

# Sort by Age, Country and TSH_Level (all descending), rename the T3/T4
# columns, and add a column equal to five times Thyroxine.
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1 %>%
  arrange(desc(Age), desc(Country), desc(TSH_Level)) %>%
  rename(Triiodothyronine = T3_Level, Thyroxine = T4_Level) %>%
  mutate(Actualpotency_Triiodothyronine = Thyroxine * 5)

# Fix the random seed so the 70% row sample below is reproducible.
set.seed(123)
train_indices <- sample(
  seq_len(nrow(thyroid_cancer_risk_data1)),
  size = 0.7 * nrow(thyroid_cancer_risk_data1)
)
# Replace the data set with the sampled rows and print summary statistics.
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1[train_indices, ]
summary(thyroid_cancer_risk_data1)
## Patient_ID Age Gender Country
## Min. : 2 Min. :15.00 Length:148883 Length:148883
## 1st Qu.: 53076 1st Qu.:33.00 Class :character Class :character
## Median :106368 Median :52.00 Mode :character Mode :character
## Mean :106303 Mean :51.92
## 3rd Qu.:159507 3rd Qu.:71.00
## Max. :212691 Max. :89.00
## Ethnicity Family_History Radiation_Exposure Iodine_Deficiency
## Length:148883 Length:148883 Length:148883 Length:148883
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Smoking Obesity Diabetes TSH_Level
## Length:148883 Length:148883 Length:148883 Min. : 0.100
## Class :character Class :character Class :character 1st Qu.: 2.570
## Mode :character Mode :character Mode :character Median : 5.040
## Mean : 5.047
## 3rd Qu.: 7.530
## Max. :10.000
## Triiodothyronine Thyroxine Nodule_Size Thyroid_Cancer_Risk
## Min. :0.500 Min. : 4.500 Min. :0.000 Length:148883
## 1st Qu.:1.250 1st Qu.: 6.360 1st Qu.:1.250 Class :character
## Median :2.000 Median : 8.240 Median :2.510 Mode :character
## Mean :2.001 Mean : 8.246 Mean :2.502
## 3rd Qu.:2.750 3rd Qu.:10.120 3rd Qu.:3.750
## Max. :3.500 Max. :12.000 Max. :5.000
## Diagnosis Age_Category Actualpotency_Triiodothyronine
## Length:148883 Length:148883 Min. :22.50
## Class :character Class :character 1st Qu.:31.80
## Mode :character Mode :character Median :41.20
## Mean :41.23
## 3rd Qu.:50.60
## Max. :60.00
# Descriptive statistics for the TSH_Level column.
Mean_TSH_stats <- mean(thyroid_cancer_risk_data1$TSH_Level)
Median_TSH_stats <- median(thyroid_cancer_risk_data1$TSH_Level)
# Mode: the most frequently occurring TSH value. Base R has no statistical
# mode function, so count the occurrences of each distinct value and take the
# most common one (the first encountered wins when there is a tie).
Mode_TSH_stats <- local({
  tsh <- thyroid_cancer_risk_data1$TSH_Level
  distinct_tsh <- unique(tsh)
  distinct_tsh[which.max(tabulate(match(tsh, distinct_tsh)))]
})
# Range: a length-2 vector holding the minimum and maximum TSH value.
Range_TSH_stats <- range(thyroid_cancer_risk_data1$TSH_Level)
# Collect the statistics in one list; elements are named but keep their
# original order, so positional access (e.g. TSH_stats[[1]]) still works.
TSH_stats <- list(
  Mean = Mean_TSH_stats,
  Median = Median_TSH_stats,
  Mode = Mode_TSH_stats,
  Range = Range_TSH_stats
)
Now, for visualization, we load the ggplot2 package. I have used the smaller data set with 75 values, because plotting the very large number of values in the full data set did not produce a meaningful diagram. I created a jitter plot instead of a scatter plot, as a plain scatter plot does not suit the analysis of this data set.
Analysis: The plot shows the distribution of individuals across different ages for each thyroid cancer risk category. High-risk individuals appear sparsely across the age range, with only a few individuals in this category. Low-risk individuals are more prevalent and are scattered across the age spectrum. There might be a slight concentration of low-risk individuals in the middle age range. Medium-risk individuals also appear throughout the age range and seem to be the most frequent category. There is no visually apparent strong linear relationship between age and thyroid cancer risk category. The points are scattered, suggesting that age alone might not be a strong predictor of thyroid cancer risk.
library(ggplot2)
# Jitter plot of Age against Thyroid_Cancer_Risk, coloured by risk category.
ggplot(
  data = thyroid_cancer_risk_data2,
  mapping = aes(
    x = Age,
    y = Thyroid_Cancer_Risk,
    color = Thyroid_Cancer_Risk
  )
) +
  geom_jitter(width = 0.2, height = 0.5)
I have used the ggplot2 package for data visualization. I used the 75-observation data set to draw a bar graph of two variables, 'Country' and 'Thyroid_Cancer_Risk'.
India has the highest overall count of thyroid cancer risk cases, with a significant proportion classified as Low risk (green) and Medium risk (blue). Nigeria and the USA also show high case counts, with varying distributions across the three risk levels. Japan, China, and Russia have relatively lower case counts, but still contain individuals in all risk categories. The presence of Medium risk (blue) and High risk (orange) varies across countries, indicating differences in thyroid cancer risk distribution globally.
# Stacked bar chart: number of rows per Country, filled by risk category.
ggplot(
  data = thyroid_cancer_risk_data2,
  mapping = aes(x = Country, fill = Thyroid_Cancer_Risk)
) +
  geom_bar(position = "stack")
(No meaningful correlation was found; value = 0.08.) #A value of 0.08 suggests a very weak positive linear relationship: as age increases, there is little to no tendency for TSH level to consistently increase or decrease.
# Pearson correlation coefficient between Age and TSH_Level.
cormethod <- cor(
  thyroid_cancer_risk_data2[["Age"]],
  thyroid_cancer_risk_data2[["TSH_Level"]],
  method = "pearson"
)