R Markdown File

#Import the dataset from your computer. The file path "C:/Users/Owner/Downloads/thyroid_cancer_risk_data.csv" used here will likely not exist on your machine and needs to be replaced with the actual path to the CSV file on your computer.

# Load the CSV (first line holds the column names) into a data frame
thyroid_cancer_risk_data1 <- read.csv(
  file = "C:/Users/Owner/Downloads/thyroid_cancer_risk_data.csv",
  header = TRUE
)

#view the dataset

# View() opens a spreadsheet-style data viewer, so only call it when a user
# is present; skipping it keeps non-interactive runs from failing here.
if (interactive()) {
  View(thyroid_cancer_risk_data1)
}

#print the structure of your dataset

# Compact overview: dimensions plus the type and first values of each column
str(object = thyroid_cancer_risk_data1)
## 'data.frame':    212691 obs. of  17 variables:
##  $ Patient_ID         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                : int  66 29 86 75 35 89 89 38 17 36 ...
##  $ Gender             : chr  "Male" "Male" "Male" "Female" ...
##  $ Country            : chr  "Russia" "Germany" "Nigeria" "India" ...
##  $ Ethnicity          : chr  "Caucasian" "Hispanic" "Caucasian" "Asian" ...
##  $ Family_History     : chr  "No" "No" "No" "No" ...
##  $ Radiation_Exposure : chr  "Yes" "Yes" "No" "No" ...
##  $ Iodine_Deficiency  : chr  "No" "No" "No" "No" ...
##  $ Smoking            : chr  "No" "No" "No" "No" ...
##  $ Obesity            : chr  "No" "No" "No" "No" ...
##  $ Diabetes           : chr  "No" "No" "No" "No" ...
##  $ TSH_Level          : num  9.37 1.83 6.26 4.1 9.1 4 4.7 5.54 2.3 1.34 ...
##  $ T3_Level           : num  1.67 1.73 2.59 2.62 2.11 0.98 0.62 3.49 2.6 0.56 ...
##  $ T4_Level           : num  6.16 10.54 10.57 11.04 10.71 ...
##  $ Nodule_Size        : num  1.08 4.05 4.61 2.46 2.11 0.02 0.01 4.3 0.81 1.44 ...
##  $ Thyroid_Cancer_Risk: chr  "Low" "Low" "Low" "Medium" ...
##  $ Diagnosis          : chr  "Benign" "Benign" "Benign" "Benign" ...

#List the variables in your dataset.

# Column (variable) names of the data frame
names(x = thyroid_cancer_risk_data1)
##  [1] "Patient_ID"          "Age"                 "Gender"             
##  [4] "Country"             "Ethnicity"           "Family_History"     
##  [7] "Radiation_Exposure"  "Iodine_Deficiency"   "Smoking"            
## [10] "Obesity"             "Diabetes"            "TSH_Level"          
## [13] "T3_Level"            "T4_Level"            "Nodule_Size"        
## [16] "Thyroid_Cancer_Risk" "Diagnosis"

#Print the top 15 rows of your dataset.

# First 15 rows of the data frame
head(thyroid_cancer_risk_data1, n = 15)
##    Patient_ID Age Gender     Country Ethnicity Family_History
## 1           1  66   Male      Russia Caucasian             No
## 2           2  29   Male     Germany  Hispanic             No
## 3           3  86   Male     Nigeria Caucasian             No
## 4           4  75 Female       India     Asian             No
## 5           5  35 Female     Germany   African            Yes
## 6           6  89   Male          UK   African             No
## 7           7  89 Female South Korea     Asian            Yes
## 8           8  38 Female       India   African             No
## 9           9  17 Female      Russia   African             No
## 10         10  36   Male     Germany     Asian             No
## 11         11  67   Male     Nigeria   African             No
## 12         12  16 Female     Nigeria     Asian             No
## 13         13  44   Male South Korea     Asian            Yes
## 14         14  52   Male      Brazil     Asian             No
## 15         15  16 Female       China     Asian             No
##    Radiation_Exposure Iodine_Deficiency Smoking Obesity Diabetes TSH_Level
## 1                 Yes                No      No      No       No      9.37
## 2                 Yes                No      No      No       No      1.83
## 3                  No                No      No      No       No      6.26
## 4                  No                No      No      No       No      4.10
## 5                 Yes                No      No      No       No      9.10
## 6                  No                No     Yes     Yes       No      4.00
## 7                 Yes                No      No     Yes       No      4.70
## 8                  No                No      No      No       No      5.54
## 9                 Yes                No      No      No      Yes      2.30
## 10                 No                No      No     Yes       No      1.34
## 11                Yes                No      No      No       No      9.65
## 12                 No                No     Yes      No       No      0.53
## 13                 No                No      No      No      Yes      6.77
## 14                 No                No      No      No       No      4.91
## 15                 No                No      No      No       No      6.84
##    T3_Level T4_Level Nodule_Size Thyroid_Cancer_Risk Diagnosis
## 1      1.67     6.16        1.08                 Low    Benign
## 2      1.73    10.54        4.05                 Low    Benign
## 3      2.59    10.57        4.61                 Low    Benign
## 4      2.62    11.04        2.46              Medium    Benign
## 5      2.11    10.71        2.11                High    Benign
## 6      0.98     5.52        0.02              Medium    Benign
## 7      0.62    11.73        0.01                High Malignant
## 8      3.49     9.47        4.30              Medium    Benign
## 9      2.60    11.89        0.81                High Malignant
## 10     0.56     4.51        1.44                 Low    Benign
## 11     1.82     8.17        0.35                High Malignant
## 12     1.13     9.56        3.87              Medium    Benign
## 13     1.37     6.13        4.15                High Malignant
## 14     0.95     6.00        0.38                 Low    Benign
## 15     0.62     6.80        1.68              Medium    Benign

Write a user defined function using any of the variables from the data set

#' Categorize ages into broad life-stage groups
#'
#' Vectorized: accepts a single age or a whole vector/column of ages, so it
#' can be called directly on a data-frame column as well as element by element.
#'
#' @param age Numeric vector of ages; may contain `NA` and may be empty.
#' @return Character vector the same length as `age`: "Young" for ages below
#'   30, "Middle-aged" for ages from 30 to 50 inclusive, "Senior" for ages
#'   above 50, and `NA` where `age` is missing.
categorize_age <- function(age) {
  # Default every element to NA so missing ages stay missing
  category <- rep(NA_character_, length(age))
  # Guard each mask with `known` so NA comparisons never reach the index
  known <- !is.na(age)
  category[known & age < 30] <- "Young"
  category[known & age >= 30 & age <= 50] <- "Middle-aged"
  category[known & age > 50] <- "Senior"
  category
}
# Label every Age value. vapply() guarantees a character result even for
# empty or all-missing input, where sapply() would change type; as.character()
# keeps a plain NA result compatible with the character(1) template.
thyroid_cancer_risk_data1$Age_Category <- vapply(
  thyroid_cancer_risk_data1$Age,
  function(age) as.character(categorize_age(age)),
  FUN.VALUE = character(1)
)

Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset

#For data manipulation we first need to load the dplyr package (part of the tidyverse).

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Rows for patients older than 50 whose risk category is "High"
# (comma-separated filter() conditions are combined with a logical AND)
filtered_age <- filter(
  thyroid_cancer_risk_data1,
  Thyroid_Cancer_Risk == "High",
  Age > 50
)
# NOTE(review): this previews the unfiltered data frame; use
# head(filtered_age, 50) to inspect the filter result instead.
head(thyroid_cancer_risk_data1, n = 50)
##    Patient_ID Age Gender     Country      Ethnicity Family_History
## 1           1  66   Male      Russia      Caucasian             No
## 2           2  29   Male     Germany       Hispanic             No
## 3           3  86   Male     Nigeria      Caucasian             No
## 4           4  75 Female       India          Asian             No
## 5           5  35 Female     Germany        African            Yes
## 6           6  89   Male          UK        African             No
## 7           7  89 Female South Korea          Asian            Yes
## 8           8  38 Female       India        African             No
## 9           9  17 Female      Russia        African             No
## 10         10  36   Male     Germany          Asian             No
## 11         11  67   Male     Nigeria        African             No
## 12         12  16 Female     Nigeria          Asian             No
## 13         13  44   Male South Korea          Asian            Yes
## 14         14  52   Male      Brazil          Asian             No
## 15         15  16 Female       China          Asian             No
## 16         16  78 Female     Nigeria      Caucasian            Yes
## 17         17  74 Female       India        African            Yes
## 18         18  35   Male       Japan       Hispanic             No
## 19         19  47 Female         USA      Caucasian             No
## 20         20  72 Female       Japan      Caucasian             No
## 21         21  36   Male      Russia       Hispanic             No
## 22         22  63 Female     Nigeria          Asian             No
## 23         23  73   Male       India      Caucasian             No
## 24         24  56   Male     Nigeria      Caucasian             No
## 25         25  74 Female South Korea          Asian             No
## 26         26  29   Male      Russia          Asian            Yes
## 27         27  76 Female     Nigeria       Hispanic            Yes
## 28         28  76   Male      Brazil       Hispanic            Yes
## 29         29  61 Female         USA       Hispanic             No
## 30         30  76 Female       India          Asian            Yes
## 31         31  65 Female       India      Caucasian             No
## 32         32  69 Female      Brazil      Caucasian            Yes
## 33         33  78   Male      Russia        African             No
## 34         34  17   Male       India        African             No
## 35         35  65 Female     Germany        African            Yes
## 36         36  21 Female       Japan Middle Eastern             No
## 37         37  35 Female       India          Asian             No
## 38         38  87 Female       India        African             No
## 39         39  53   Male South Korea      Caucasian            Yes
## 40         40  32   Male       China        African             No
## 41         41  18 Female       Japan      Caucasian            Yes
## 42         42  74   Male     Nigeria      Caucasian             No
## 43         43  28 Female       China      Caucasian             No
## 44         44  23 Female       India        African             No
## 45         45  67   Male      Russia        African             No
## 46         46  16 Female       Japan      Caucasian            Yes
## 47         47  74 Female      Brazil      Caucasian             No
## 48         48  85   Male       China        African             No
## 49         49  58 Female      Russia        African             No
## 50         50  22   Male     Nigeria      Caucasian             No
##    Radiation_Exposure Iodine_Deficiency Smoking Obesity Diabetes TSH_Level
## 1                 Yes                No      No      No       No      9.37
## 2                 Yes                No      No      No       No      1.83
## 3                  No                No      No      No       No      6.26
## 4                  No                No      No      No       No      4.10
## 5                 Yes                No      No      No       No      9.10
## 6                  No                No     Yes     Yes       No      4.00
## 7                 Yes                No      No     Yes       No      4.70
## 8                  No                No      No      No       No      5.54
## 9                 Yes                No      No      No      Yes      2.30
## 10                 No                No      No     Yes       No      1.34
## 11                Yes                No      No      No       No      9.65
## 12                 No                No     Yes      No       No      0.53
## 13                 No                No      No      No      Yes      6.77
## 14                 No                No      No      No       No      4.91
## 15                 No                No      No      No       No      6.84
## 16                 No               Yes     Yes      No       No      7.32
## 17                 No                No      No      No       No      9.60
## 18                 No                No      No      No       No      3.59
## 19                 No                No      No      No      Yes      6.43
## 20                 No                No      No      No       No      5.96
## 21                 No               Yes      No     Yes       No      4.17
## 22                 No                No      No     Yes      Yes      6.97
## 23                Yes                No      No     Yes       No      3.18
## 24                 No                No      No      No      Yes      0.14
## 25                 No                No      No      No       No      6.02
## 26                 No                No     Yes      No       No      7.90
## 27                Yes               Yes      No      No       No      0.87
## 28                Yes                No      No      No       No      4.99
## 29                 No                No     Yes      No       No      4.07
## 30                 No                No     Yes      No       No      4.42
## 31                 No                No      No     Yes       No      7.29
## 32                 No                No     Yes      No       No      6.27
## 33                 No                No      No      No       No      3.35
## 34                 No               Yes      No     Yes       No      1.64
## 35                 No                No      No      No       No      3.16
## 36                 No                No      No      No       No      3.98
## 37                 No               Yes     Yes      No       No      5.85
## 38                 No                No      No      No       No      8.74
## 39                 No                No     Yes      No       No      6.65
## 40                 No                No      No     Yes       No      6.38
## 41                 No               Yes      No      No       No      5.86
## 42                 No               Yes      No      No       No      4.57
## 43                 No                No      No      No       No      0.87
## 44                 No                No      No     Yes       No      7.58
## 45                 No               Yes     Yes      No       No      6.49
## 46                 No                No      No     Yes       No      2.85
## 47                 No                No     Yes      No       No      2.90
## 48                 No                No      No      No       No      0.94
## 49                 No                No      No      No       No      4.91
## 50                 No                No      No     Yes      Yes      8.99
##    T3_Level T4_Level Nodule_Size Thyroid_Cancer_Risk Diagnosis Age_Category
## 1      1.67     6.16        1.08                 Low    Benign       Senior
## 2      1.73    10.54        4.05                 Low    Benign        Young
## 3      2.59    10.57        4.61                 Low    Benign       Senior
## 4      2.62    11.04        2.46              Medium    Benign       Senior
## 5      2.11    10.71        2.11                High    Benign  Middle-aged
## 6      0.98     5.52        0.02              Medium    Benign       Senior
## 7      0.62    11.73        0.01                High Malignant       Senior
## 8      3.49     9.47        4.30              Medium    Benign  Middle-aged
## 9      2.60    11.89        0.81                High Malignant        Young
## 10     0.56     4.51        1.44                 Low    Benign  Middle-aged
## 11     1.82     8.17        0.35                High Malignant       Senior
## 12     1.13     9.56        3.87              Medium    Benign        Young
## 13     1.37     6.13        4.15                High Malignant  Middle-aged
## 14     0.95     6.00        0.38                 Low    Benign       Senior
## 15     0.62     6.80        1.68              Medium    Benign        Young
## 16     1.90    11.82        2.86                 Low    Benign       Senior
## 17     2.86    11.50        0.25                 Low    Benign       Senior
## 18     1.83     4.95        4.93              Medium    Benign  Middle-aged
## 19     3.39     5.66        1.63              Medium    Benign  Middle-aged
## 20     1.26     7.89        2.27                 Low    Benign       Senior
## 21     2.92    10.24        2.41                 Low    Benign  Middle-aged
## 22     3.48     7.67        0.46                 Low Malignant       Senior
## 23     3.14     9.70        2.27                 Low Malignant       Senior
## 24     0.90     7.93        4.79                 Low    Benign       Senior
## 25     2.75    11.41        3.63                 Low    Benign       Senior
## 26     0.82     6.63        4.64                High Malignant        Young
## 27     0.80     4.73        4.22                 Low    Benign       Senior
## 28     1.56     6.48        1.54              Medium    Benign       Senior
## 29     2.27    10.98        4.26                 Low    Benign       Senior
## 30     1.12     8.83        4.00                High Malignant       Senior
## 31     3.02     6.52        3.06                 Low    Benign       Senior
## 32     1.26     8.77        0.46              Medium    Benign       Senior
## 33     2.80    11.21        0.04              Medium    Benign       Senior
## 34     3.12    10.80        3.40                High Malignant        Young
## 35     1.14     8.31        0.17                 Low    Benign       Senior
## 36     1.33     6.40        0.09                 Low    Benign        Young
## 37     1.16     7.95        2.68                High Malignant  Middle-aged
## 38     2.63     5.74        3.90                 Low    Benign       Senior
## 39     2.25     7.24        4.05              Medium Malignant       Senior
## 40     2.77     6.26        3.76              Medium    Benign  Middle-aged
## 41     2.52    10.02        3.65              Medium    Benign        Young
## 42     2.14     4.65        1.14              Medium    Benign       Senior
## 43     1.64     5.76        3.56              Medium    Benign        Young
## 44     2.12     6.64        3.57                 Low    Benign        Young
## 45     1.66     6.98        3.15                 Low    Benign       Senior
## 46     1.29    10.85        4.21              Medium    Benign        Young
## 47     1.78     8.81        1.66                 Low    Benign       Senior
## 48     3.07     7.04        0.71                 Low Malignant       Senior
## 49     1.24     4.84        4.93              Medium    Benign       Senior
## 50     2.51    10.39        4.27                 Low Malignant        Young

Identify the dependent & independent variables and use reshaping techniques

(Assuming dependent variable: Diagnosis, independent variables: Age, TSH_Level, Nodule_Size)

# Predictor columns used to explain the outcome
independent_vars <- c(
  "Age",
  "TSH_Level",
  "Nodule_Size"
)
# Outcome column
dependent_var <- "Diagnosis"

#Create new dataframe with selected variables

# Keep only the modelling columns. Piping into c() would concatenate the data
# frame with the name strings into a list; select(all_of(...)) picks the
# columns named in the character vectors and errors if one is missing.
analysis_df <- thyroid_cancer_risk_data1 %>%
  select(all_of(c(dependent_var, independent_vars)))

Remove missing values

# Drop every row that contains at least one missing value
thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1 %>%
  na.omit()

Identify and Remove Duplicated data

#This keeps only the first row for each distinct Age value in thyroid_cancer_risk_data1 (all other columns are retained) and stores the result as a second, smaller dataset named thyroid_cancer_risk_data2.

# One row per distinct Age value; .keep_all = TRUE retains every other column
thyroid_cancer_risk_data2 <- thyroid_cancer_risk_data1 %>%
  distinct(Age, .keep_all = TRUE)

Reorder rows- descending order

# Sort descending by Age, breaking ties by Country and then TSH_Level
thyroid_cancer_risk_data1 <- arrange(
  thyroid_cancer_risk_data1,
  desc(Age),
  desc(Country),
  desc(TSH_Level)
)

#Rename some column

# Give the T3/T4 hormone columns their full names (new_name = old_name)
thyroid_cancer_risk_data1 <- rename(
  thyroid_cancer_risk_data1,
  Triiodothyronine = T3_Level,
  Thyroxine = T4_Level
)

Add new variable

# Derived column: the Thyroxine value multiplied by 5
thyroid_cancer_risk_data1 <- mutate(
  thyroid_cancer_risk_data1,
  Actualpotency_Triiodothyronine = Thyroxine * 5
)

Create training set

 # Fix the RNG seed so the same 70% sample is drawn on every run
 set.seed(123)
 n_rows <- nrow(thyroid_cancer_risk_data1)
 # seq_len() stays correct for a zero-row data frame (1:0 would yield c(1, 0));
 # floor() makes the whole-number sample size explicit.
 train_indices <- sample(seq_len(n_rows), size = floor(0.7 * n_rows))
 # NOTE: this overwrites the full data set with the sampled training rows only
 thyroid_cancer_risk_data1 <- thyroid_cancer_risk_data1[train_indices, ]

Plot a scatter plot for any 2 variables in your dataset.

#For visualization we now load the ggplot2 package. #We used the smaller dataset with 75 values because, with the very large number of values in the full dataset, the diagram did not make any sense. #We created a jitter plot instead of a scatter plot because we could not get a presentable graph for analysis from a plain scatter plot. #Analysis: The plot shows the distribution of individuals across different age groups for each thyroid cancer risk category. #High-risk individuals appear sparsely across the age range, with only a few individuals in this category. Low-risk individuals are more prevalent and are scattered across the age spectrum. There might be a slight concentration of low-risk individuals in the middle age range. Medium-risk individuals also appear throughout the age range and seem to be the most frequent category. There is no visually apparent strong linear relationship between age and thyroid cancer risk category. The points are scattered, suggesting that age alone might not be a strong predictor of thyroid cancer risk.

library(ggplot2)
# Jittered points of risk category against age, coloured by risk category
ggplot(
  data = thyroid_cancer_risk_data2,
  mapping = aes(x = Age, y = Thyroid_Cancer_Risk, color = Thyroid_Cancer_Risk)
) +
  geom_jitter(width = 0.2, height = 0.5)

Plot a bar plot for any 2 variables in your dataset

#We used the ggplot2 package for data visualization. #Using the 75-observation dataset, we drew a bar graph of two variables, 'Country' and 'Thyroid_Cancer_Risk'. #India has the highest overall count of thyroid cancer risk cases, with a significant proportion classified as Low risk (green) and Medium risk (blue). #Nigeria and the USA also show high case counts, with varying distributions across the three risk levels. #Japan, China, and Russia have relatively lower case counts, but still contain individuals in all risk categories. #The presence of Medium risk (blue) and High risk (orange) varies across countries, indicating differences in thyroid cancer risk distribution globally.

# Stacked bar chart: row count per country, split by risk category
ggplot(
  data = thyroid_cancer_risk_data2,
  mapping = aes(x = Country, fill = Thyroid_Cancer_Risk)
) +
  geom_bar(position = "stack")

#Find the correlation between any 2 variables by applying a least squares linear regression model (computed here as the Pearson correlation coefficient between Age and TSH_Level). Essentially no correlation was found (value = 0.08). #A value of 0.08 suggests a very weak positive linear relationship: as age increases, there is little to no tendency for TSH level to consistently increase or decrease.

# Pearson correlation coefficient between Age and TSH_Level
cormethod <- cor(
  x = thyroid_cancer_risk_data2$Age,
  y = thyroid_cancer_risk_data2$TSH_Level,
  method = "pearson"
)