Assignment 1 - R Programming

Group 4

2025-11-30

R Markdown File

#import dataset from computer

# Load the thyroid cancer risk CSV; header = TRUE takes column names from the first line.
thyroid_cancer_risk_data1 <- read.csv(
  file = "C:/Users/samip/Downloads/thyroid_cancer_risk_data.csv",
  header = TRUE
)

#view the dataset

# Open the spreadsheet-style viewer only in interactive sessions: View() has
# nothing useful to show (and can fail) when the document is knitted or run
# through Rscript.
if (interactive()) {
  View(thyroid_cancer_risk_data1)
}

#print the structure of your dataset

# Compact overview: dimensions plus each column's type and first few values.
utils::str(thyroid_cancer_risk_data1)

## 'data.frame':    212691 obs. of  17 variables:
##  $ Patient_ID         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                : int  66 29 86 75 35 89 89 38 17 36 ...
##  $ Gender             : chr  "Male" "Male" "Male" "Female" ...
##  $ Country            : chr  "Russia" "Germany" "Nigeria" "India" ...
##  $ Ethnicity          : chr  "Caucasian" "Hispanic" "Caucasian" "Asian" ...
##  $ Family_History     : chr  "No" "No" "No" "No" ...
##  $ Radiation_Exposure : chr  "Yes" "Yes" "No" "No" ...
##  $ Iodine_Deficiency  : chr  "No" "No" "No" "No" ...
##  $ Smoking            : chr  "No" "No" "No" "No" ...
##  $ Obesity            : chr  "No" "No" "No" "No" ...
##  $ Diabetes           : chr  "No" "No" "No" "No" ...
##  $ TSH_Level          : num  9.37 1.83 6.26 4.1 9.1 4 4.7 5.54 2.3 1.34 ...
##  $ T3_Level           : num  1.67 1.73 2.59 2.62 2.11 0.98 0.62 3.49 2.6 0.56 ...
##  $ T4_Level           : num  6.16 10.54 10.57 11.04 10.71 ...
##  $ Nodule_Size        : num  1.08 4.05 4.61 2.46 2.11 0.02 0.01 4.3 0.81 1.44 ...
##  $ Thyroid_Cancer_Risk: chr  "Low" "Low" "Low" "Medium" ...
##  $ Diagnosis          : chr  "Benign" "Benign" "Benign" "Benign" ...

#List the variables in your dataset.

# Column names of the data frame, in column order.
colnames(thyroid_cancer_risk_data1)

##  [1] "Patient_ID"          "Age"                 "Gender"             
##  [4] "Country"             "Ethnicity"           "Family_History"     
##  [7] "Radiation_Exposure"  "Iodine_Deficiency"   "Smoking"            
## [10] "Obesity"             "Diabetes"            "TSH_Level"          
## [13] "T3_Level"            "T4_Level"            "Nodule_Size"        
## [16] "Thyroid_Cancer_Risk" "Diagnosis"

#Print the top 15 rows of your dataset.

# First 15 rows of the data frame.
head(thyroid_cancer_risk_data1, n = 15)

##    Patient_ID Age Gender     Country Ethnicity Family_History
## 1           1  66   Male      Russia Caucasian             No
## 2           2  29   Male     Germany  Hispanic             No
## 3           3  86   Male     Nigeria Caucasian             No
## 4           4  75 Female       India     Asian             No
## 5           5  35 Female     Germany   African            Yes
## 6           6  89   Male          UK   African             No
## 7           7  89 Female South Korea     Asian            Yes
## 8           8  38 Female       India   African             No
## 9           9  17 Female      Russia   African             No
## 10         10  36   Male     Germany     Asian             No
## 11         11  67   Male     Nigeria   African             No
## 12         12  16 Female     Nigeria     Asian             No
## 13         13  44   Male South Korea     Asian            Yes
## 14         14  52   Male      Brazil     Asian             No
## 15         15  16 Female       China     Asian             No
##    Radiation_Exposure Iodine_Deficiency Smoking Obesity Diabetes TSH_Level
## 1                 Yes                No      No      No       No      9.37
## 2                 Yes                No      No      No       No      1.83
## 3                  No                No      No      No       No      6.26
## 4                  No                No      No      No       No      4.10
## 5                 Yes                No      No      No       No      9.10
## 6                  No                No     Yes     Yes       No      4.00
## 7                 Yes                No      No     Yes       No      4.70
## 8                  No                No      No      No       No      5.54
## 9                 Yes                No      No      No      Yes      2.30
## 10                 No                No      No     Yes       No      1.34
## 11                Yes                No      No      No       No      9.65
## 12                 No                No     Yes      No       No      0.53
## 13                 No                No      No      No      Yes      6.77
## 14                 No                No      No      No       No      4.91
## 15                 No                No      No      No       No      6.84
##    T3_Level T4_Level Nodule_Size Thyroid_Cancer_Risk Diagnosis
## 1      1.67     6.16        1.08                 Low    Benign
## 2      1.73    10.54        4.05                 Low    Benign
## 3      2.59    10.57        4.61                 Low    Benign
## 4      2.62    11.04        2.46              Medium    Benign
## 5      2.11    10.71        2.11                High    Benign
## 6      0.98     5.52        0.02              Medium    Benign
## 7      0.62    11.73        0.01                High Malignant
## 8      3.49     9.47        4.30              Medium    Benign
## 9      2.60    11.89        0.81                High Malignant
## 10     0.56     4.51        1.44                 Low    Benign
## 11     1.82     8.17        0.35                High Malignant
## 12     1.13     9.56        3.87              Medium    Benign
## 13     1.37     6.13        4.15                High Malignant
## 14     0.95     6.00        0.38                 Low    Benign
## 15     0.62     6.80        1.68              Medium    Benign

Write a user-defined function using any of the variables from the dataset

# Classify ages into life-stage labels.
#
# Arguments:
#   age: numeric vector of ages; a single value works exactly as before, and a
#        whole column (e.g. the Age column) can now be passed in one call.
# Returns:
#   Character vector the same length as `age`: "Young" for ages below 30,
#   "Middle-aged" for 30 to 50 inclusive, "Senior" above 50, and NA wherever
#   the age is missing.
categorize_age <- function(age) {
  # Start from NA so missing ages stay missing without a separate branch.
  category <- rep(NA_character_, length(age))
  # Masking with `known` keeps NA out of the logical indices below.
  known <- !is.na(age)
  category[known & age < 30] <- "Young"
  category[known & age >= 30 & age <= 50] <- "Middle-aged"
  category[known & age > 50] <- "Senior"
  category
}

Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset

# Keep patients older than 50 whose risk category is "High".
# Thyroid_Cancer_Risk has to be a bare column name: written in quotes it is a
# string literal, so the comparison "Thyroid_Cancer_Risk" == "High" is always
# FALSE and every row is dropped.
filtered_age <- thyroid_cancer_risk_data1 %>%
  filter(Thyroid_Cancer_Risk == "High", Age > 50)
print(filtered_age)

##  [1] Patient_ID          Age                 Gender             
##  [4] Country             Ethnicity           Family_History     
##  [7] Radiation_Exposure  Iodine_Deficiency   Smoking            
## [10] Obesity             Diabetes            TSH_Level          
## [13] T3_Level            T4_Level            Nodule_Size        
## [16] Thyroid_Cancer_Risk Diagnosis          
## <0 rows> (or 0-length row.names)

Identify the dependent & independent variables and use reshaping techniques

(Assuming dependent variable: Diagnosis, independent variables: Age, TSH_Level, Nodule_Size)

# Outcome column name and the predictor column names used for the analysis.
dependent_var <- "Diagnosis"
independent_vars <- c("Age", "TSH_Level", "Nodule_Size")

# Create new dataframe with selected variables (outcome first, then predictors)

analysis_df <- select(
  thyroid_cancer_risk_data1,
  all_of(c(dependent_var, independent_vars))
)

Remove missing values

# Drop every row that contains an NA in any column.
thyroid_cancer_risk_data1 <- stats::na.omit(thyroid_cancer_risk_data1)

Identify and Remove Duplicated data

# Keep only the first row for each Patient_ID, retaining all other columns.
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1 %>%
  distinct(Patient_ID, .keep_all = TRUE)

Reorder rows - descending order

# Sort by Age, then Country, then TSH_Level, each from highest to lowest.
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1 %>%
  arrange(
    desc(Age),
    desc(Country),
    desc(TSH_Level)
  )

#Rename some columns

# Replace the T3_Level / T4_Level column names with the full hormone names.
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1 %>%
  rename(
    Triiodothyronine = T3_Level,
    Thyroxine = T4_Level
  )

Add new variable

# Derived column: five times the Thyroxine value.
# NOTE(review): the new column is named after Triiodothyronine but is computed
# from Thyroxine - confirm this is the intended source column.
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1 %>%
  mutate(Actualpotency_Triiodothyronine = Thyroxine * 5)

Create training set

 set.seed(123)
# Fixed seed above makes the draw reproducible. Sample 70% of the row positions
# without replacement (a fractional size is truncated to a whole number of rows).
train_indices <- sample.int(
  nrow(thyroid_cancer_risk_data1),
  size = 0.7 * nrow(thyroid_cancer_risk_data1)
)
# The sampled rows replace the full dataset, so every later step works on the
# training subset only.
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1[train_indices, ]

Print the summary statistics of your dataset

# Per-column summary: quartiles and mean for numeric columns, length/class for
# character columns.
summary(object = thyroid_cancer_risk_data1)

##    Patient_ID          Age           Gender            Country         
##  Min.   :     2   Min.   :15.00   Length:148883      Length:148883     
##  1st Qu.: 53076   1st Qu.:33.00   Class :character   Class :character  
##  Median :106368   Median :52.00   Mode  :character   Mode  :character  
##  Mean   :106303   Mean   :51.92                                        
##  3rd Qu.:159507   3rd Qu.:71.00                                        
##  Max.   :212691   Max.   :89.00                                        
##   Ethnicity         Family_History     Radiation_Exposure Iodine_Deficiency 
##  Length:148883      Length:148883      Length:148883      Length:148883     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Smoking            Obesity            Diabetes           TSH_Level     
##  Length:148883      Length:148883      Length:148883      Min.   : 0.100  
##  Class :character   Class :character   Class :character   1st Qu.: 2.570  
##  Mode  :character   Mode  :character   Mode  :character   Median : 5.040  
##                                                           Mean   : 5.047  
##                                                           3rd Qu.: 7.530  
##                                                           Max.   :10.000  
##  Triiodothyronine   Thyroxine       Nodule_Size    Thyroid_Cancer_Risk
##  Min.   :0.500    Min.   : 4.500   Min.   :0.000   Length:148883      
##  1st Qu.:1.250    1st Qu.: 6.360   1st Qu.:1.250   Class :character   
##  Median :2.000    Median : 8.240   Median :2.510   Mode  :character   
##  Mean   :2.001    Mean   : 8.246   Mean   :2.502                      
##  3rd Qu.:2.750    3rd Qu.:10.120   3rd Qu.:3.750                      
##  Max.   :3.500    Max.   :12.000   Max.   :5.000                      
##   Diagnosis         Actualpotency_Triiodothyronine
##  Length:148883      Min.   :22.50                 
##  Class :character   1st Qu.:31.80                 
##  Mode  :character   Median :41.20                 
##                     Mean   :41.23                 
##                     3rd Qu.:50.60                 
##                     Max.   :60.00

#Calculate Mean, Median, Mode & Range

# Central tendency and spread of TSH_Level.
Mean_TSH_stats <- mean(thyroid_cancer_risk_data1$TSH_Level)
Median_TSH_stats <- median(thyroid_cancer_risk_data1$TSH_Level)
# Mode = the most frequently occurring value. Base R has no statistical mode
# function (mode() reports the storage type), so count how often each distinct
# value occurs and keep the most common one; ties resolve to the value that
# appears first in the column. local() keeps the helper variables out of the
# global environment.
Mode_TSH_stats <- local({
  tsh_values <- thyroid_cancer_risk_data1$TSH_Level
  distinct_values <- unique(tsh_values)
  occurrences <- tabulate(match(tsh_values, distinct_values))
  distinct_values[which.max(occurrences)]
})
# range() returns c(minimum, maximum).
Range_TSH_stats <- range(thyroid_cancer_risk_data1$TSH_Level)
# Named elements make the list self-describing; positional access still works.
TSH_stats <- list(
  Mean = Mean_TSH_stats,
  Median = Median_TSH_stats,
  Mode = Mode_TSH_stats,
  Range = Range_TSH_stats
)

Plot a scatter plot for any 2 variables in your dataset.

# Jittered point plot of risk category against age, coloured by risk category.
# The jitter offsets spread out points that would otherwise overlap.
ggplot(
  data = thyroid_cancer_risk_data1,
  mapping = aes(x = Age, y = Thyroid_Cancer_Risk, color = Thyroid_Cancer_Risk)
) +
  geom_jitter(width = 0.2, height = 0.5)

Plot a bar plot for any 2 variables in your dataset

# Row counts per country, with each bar split (stacked) by risk category.
ggplot(
  data = thyroid_cancer_risk_data1,
  mapping = aes(x = Country, fill = Thyroid_Cancer_Risk)
) +
  geom_bar(position = "stack")

#Find the correlation between any 2 variables by applying a least-squares linear regression model. (No correlation was found; value = 0.08)

# Pearson correlation coefficient between Age and TSH_Level.
cormethod <- with(
  thyroid_cancer_risk_data1,
  cor(Age, TSH_Level, method = "pearson")
)