Group 4
2025-11-30
#import dataset from computer
# Load the thyroid cancer risk CSV; header = TRUE takes the column names
# from the first row of the file.
thyroid_cancer_risk_data1 <- read.csv(
  file = "C:/Users/samip/Downloads/thyroid_cancer_risk_data.csv",
  header = TRUE
)
#view the dataset
#print the structure of your dataset
## 'data.frame': 212691 obs. of 17 variables:
## $ Patient_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 66 29 86 75 35 89 89 38 17 36 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Country : chr "Russia" "Germany" "Nigeria" "India" ...
## $ Ethnicity : chr "Caucasian" "Hispanic" "Caucasian" "Asian" ...
## $ Family_History : chr "No" "No" "No" "No" ...
## $ Radiation_Exposure : chr "Yes" "Yes" "No" "No" ...
## $ Iodine_Deficiency : chr "No" "No" "No" "No" ...
## $ Smoking : chr "No" "No" "No" "No" ...
## $ Obesity : chr "No" "No" "No" "No" ...
## $ Diabetes : chr "No" "No" "No" "No" ...
## $ TSH_Level : num 9.37 1.83 6.26 4.1 9.1 4 4.7 5.54 2.3 1.34 ...
## $ T3_Level : num 1.67 1.73 2.59 2.62 2.11 0.98 0.62 3.49 2.6 0.56 ...
## $ T4_Level : num 6.16 10.54 10.57 11.04 10.71 ...
## $ Nodule_Size : num 1.08 4.05 4.61 2.46 2.11 0.02 0.01 4.3 0.81 1.44 ...
## $ Thyroid_Cancer_Risk: chr "Low" "Low" "Low" "Medium" ...
## $ Diagnosis : chr "Benign" "Benign" "Benign" "Benign" ...
#List the variables in your dataset.
## [1] "Patient_ID" "Age" "Gender"
## [4] "Country" "Ethnicity" "Family_History"
## [7] "Radiation_Exposure" "Iodine_Deficiency" "Smoking"
## [10] "Obesity" "Diabetes" "TSH_Level"
## [13] "T3_Level" "T4_Level" "Nodule_Size"
## [16] "Thyroid_Cancer_Risk" "Diagnosis"
#Print the top 15 rows of your dataset.
## Patient_ID Age Gender Country Ethnicity Family_History
## 1 1 66 Male Russia Caucasian No
## 2 2 29 Male Germany Hispanic No
## 3 3 86 Male Nigeria Caucasian No
## 4 4 75 Female India Asian No
## 5 5 35 Female Germany African Yes
## 6 6 89 Male UK African No
## 7 7 89 Female South Korea Asian Yes
## 8 8 38 Female India African No
## 9 9 17 Female Russia African No
## 10 10 36 Male Germany Asian No
## 11 11 67 Male Nigeria African No
## 12 12 16 Female Nigeria Asian No
## 13 13 44 Male South Korea Asian Yes
## 14 14 52 Male Brazil Asian No
## 15 15 16 Female China Asian No
## Radiation_Exposure Iodine_Deficiency Smoking Obesity Diabetes TSH_Level
## 1 Yes No No No No 9.37
## 2 Yes No No No No 1.83
## 3 No No No No No 6.26
## 4 No No No No No 4.10
## 5 Yes No No No No 9.10
## 6 No No Yes Yes No 4.00
## 7 Yes No No Yes No 4.70
## 8 No No No No No 5.54
## 9 Yes No No No Yes 2.30
## 10 No No No Yes No 1.34
## 11 Yes No No No No 9.65
## 12 No No Yes No No 0.53
## 13 No No No No Yes 6.77
## 14 No No No No No 4.91
## 15 No No No No No 6.84
## T3_Level T4_Level Nodule_Size Thyroid_Cancer_Risk Diagnosis
## 1 1.67 6.16 1.08 Low Benign
## 2 1.73 10.54 4.05 Low Benign
## 3 2.59 10.57 4.61 Low Benign
## 4 2.62 11.04 2.46 Medium Benign
## 5 2.11 10.71 2.11 High Benign
## 6 0.98 5.52 0.02 Medium Benign
## 7 0.62 11.73 0.01 High Malignant
## 8 3.49 9.47 4.30 Medium Benign
## 9 2.60 11.89 0.81 High Malignant
## 10 0.56 4.51 1.44 Low Benign
## 11 1.82 8.17 0.35 High Malignant
## 12 1.13 9.56 3.87 Medium Benign
## 13 1.37 6.13 4.15 High Malignant
## 14 0.95 6.00 0.38 Low Benign
## 15 0.62 6.80 1.68 Medium Benign
# Keep only patients older than 50 whose Thyroid_Cancer_Risk is "High".
# The column has to be referenced as a bare name: writing it in quotes
# ('Thyroid_Cancer_Risk' == "High") compares two string literals, which is
# always FALSE and makes filter() return zero rows.
filtered_age <- thyroid_cancer_risk_data1 %>%
  filter(Thyroid_Cancer_Risk == "High", Age > 50)
# NOTE: the "<0 rows>" output recorded below was produced by the earlier,
# quoted version of this filter; re-run the chunk to refresh it.
print(filtered_age)## [1] Patient_ID Age Gender
## [4] Country Ethnicity Family_History
## [7] Radiation_Exposure Iodine_Deficiency Smoking
## [10] Obesity Diabetes TSH_Level
## [13] T3_Level T4_Level Nodule_Size
## [16] Thyroid_Cancer_Risk Diagnosis
## <0 rows> (or 0-length row.names)
(Assuming dependent variable: Diagnosis, independent variables: Age, TSH_Level, Nodule_Size)
#Create new dataframe with selected variables
# Sort the rows in descending order: by Age first, then Country, then
# TSH_Level to break ties. The sorted result replaces the original object.
thyroid_cancer_risk_data1 <- arrange(
  thyroid_cancer_risk_data1,
  desc(Age),
  desc(Country),
  desc(TSH_Level)
)
#Rename some column
## Patient_ID Age Gender Country
## Min. : 2 Min. :15.00 Length:148883 Length:148883
## 1st Qu.: 53076 1st Qu.:33.00 Class :character Class :character
## Median :106368 Median :52.00 Mode :character Mode :character
## Mean :106303 Mean :51.92
## 3rd Qu.:159507 3rd Qu.:71.00
## Max. :212691 Max. :89.00
## Ethnicity Family_History Radiation_Exposure Iodine_Deficiency
## Length:148883 Length:148883 Length:148883 Length:148883
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Smoking Obesity Diabetes TSH_Level
## Length:148883 Length:148883 Length:148883 Min. : 0.100
## Class :character Class :character Class :character 1st Qu.: 2.570
## Mode :character Mode :character Mode :character Median : 5.040
## Mean : 5.047
## 3rd Qu.: 7.530
## Max. :10.000
## Triiodothyronine Thyroxine Nodule_Size Thyroid_Cancer_Risk
## Min. :0.500 Min. : 4.500 Min. :0.000 Length:148883
## 1st Qu.:1.250 1st Qu.: 6.360 1st Qu.:1.250 Class :character
## Median :2.000 Median : 8.240 Median :2.510 Mode :character
## Mean :2.001 Mean : 8.246 Mean :2.502
## 3rd Qu.:2.750 3rd Qu.:10.120 3rd Qu.:3.750
## Max. :3.500 Max. :12.000 Max. :5.000
## Diagnosis Actualpotency_Triiodothyronine
## Length:148883 Min. :22.50
## Class :character 1st Qu.:31.80
## Mode :character Median :41.20
## Mean :41.23
## 3rd Qu.:50.60
## Max. :60.00
#Calculate Mean, Median, Mode & Range
# Descriptive statistics for TSH_Level. Missing values are dropped up front
# so that a single NA does not turn every statistic into NA.
tsh_values <- thyroid_cancer_risk_data1$TSH_Level
tsh_values <- tsh_values[!is.na(tsh_values)]

Mean_TSH_stats <- mean(tsh_values)
Median_TSH_stats <- median(tsh_values)

# Mode = the most frequent value(s). Base R has no statistical mode function
# (mode() reports the storage type), so count each distinct value and keep
# every value that reaches the highest count; ties yield several modes.
tsh_distinct <- unique(tsh_values)
tsh_counts <- tabulate(match(tsh_values, tsh_distinct))
Mode_TSH_stats <- tsh_distinct[tsh_counts == max(tsh_counts)]

# range() returns c(min, max).
Range_TSH_stats <- range(tsh_values)

# Collected in the same order as before (mean, median, mode, range) so
# positional access keeps working; the names allow TSH_stats$Mean etc.
TSH_stats <- list(
  Mean = Mean_TSH_stats,
  Median = Median_TSH_stats,
  Mode = Mode_TSH_stats,
  Range = Range_TSH_stats
)

# Jittered scatter of Age against Thyroid_Cancer_Risk, coloured by risk
# level. This must be its own statement: appended directly after the list()
# call on the same line it is a parse error.
ggplot(
  thyroid_cancer_risk_data1,
  aes(x = Age, y = Thyroid_Cancer_Risk, color = Thyroid_Cancer_Risk)
) +
  geom_jitter(width = 0.2, height = 0.5)