DSA_406_001_SP25_project_ebbopp

Gentle introduction

Author

DSA_406_001_SP25_project_ebbopp

Reading in the Dataset

# Setting up the libraries needed for this project
library(data.table)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Load the brain tumor CSV from the project's data folder into a data frame
# NOTE(review): read.csv() sanitizes headers by default (check.names = TRUE),
# which is presumably why one column prints as `Survival_Rate...` -- confirm
# against the raw CSV header
brain_tumor_dt <- read.csv(file = "data/Brain_Tumor_Prediction_Dataset.csv")

Brief Data Descriptions

# Getting a basic frame of reference for the dataset with intro
# analysis tools
# summary() prints per-column statistics: min/quartiles/mean/max for numeric
# columns, and length/class/mode for character columns
summary(brain_tumor_dt)
      Age           Gender            Country            Tumor_Size    
 Min.   : 5.00   Length:250000      Length:250000      Min.   : 0.500  
 1st Qu.:26.00   Class :character   Class :character   1st Qu.: 2.870  
 Median :47.00   Mode  :character   Mode  :character   Median : 5.260  
 Mean   :46.96                                         Mean   : 5.252  
 3rd Qu.:68.00                                         3rd Qu.: 7.630  
 Max.   :89.00                                         Max.   :10.000  
 Tumor_Location     MRI_Findings        Genetic_Risk Smoking_History   
 Length:250000      Length:250000      Min.   :  0   Length:250000     
 Class :character   Class :character   1st Qu.: 25   Class :character  
 Mode  :character   Mode  :character   Median : 50   Mode  :character  
                                       Mean   : 50                     
                                       3rd Qu.: 75                     
                                       Max.   :100                     
 Alcohol_Consumption Radiation_Exposure Head_Injury_History Chronic_Illness   
 Length:250000       Length:250000      Length:250000       Length:250000     
 Class :character    Class :character   Class :character    Class :character  
 Mode  :character    Mode  :character   Mode  :character    Mode  :character  
                                                                              
                                                                              
                                                                              
 Blood_Pressure       Diabetes          Tumor_Type        Treatment_Received
 Length:250000      Length:250000      Length:250000      Length:250000     
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
 Survival_Rate... Tumor_Growth_Rate  Family_History     Symptom_Severity  
 Min.   :10.00    Length:250000      Length:250000      Length:250000     
 1st Qu.:32.00    Class :character   Class :character   Class :character  
 Median :55.00    Mode  :character   Mode  :character   Mode  :character  
 Mean   :54.48                                                            
 3rd Qu.:77.00                                                            
 Max.   :99.00                                                            
 Brain_Tumor_Present
 Length:250000      
 Class :character   
 Mode  :character   
                    
                    
                    
# Report the dimensions of the data frame as (rows, columns)
dim(brain_tumor_dt)
[1] 250000     21
# Preview the first rows of the data frame to see example values per column
head(brain_tumor_dt)
  Age Gender   Country Tumor_Size Tumor_Location MRI_Findings Genetic_Risk
1  66  Other     China       8.70     Cerebellum       Severe           81
2  87 Female Australia       8.14       Temporal       Normal           65
3  41   Male    Canada       6.02      Occipital       Severe          100
4  52   Male     Japan       7.26      Occipital       Normal           19
5  84 Female    Brazil       7.94       Temporal     Abnormal           47
6  29   Male   Germany       7.97        Frontal     Abnormal           70
  Smoking_History Alcohol_Consumption Radiation_Exposure Head_Injury_History
1              No                 Yes             Medium                  No
2              No                 Yes             Medium                  No
3             Yes                  No                Low                 Yes
4             Yes                 Yes               High                 Yes
5              No                 Yes             Medium                  No
6             Yes                 Yes             Medium                  No
  Chronic_Illness Blood_Pressure Diabetes Tumor_Type Treatment_Received
1             Yes         122/88       No  Malignant               None
2              No        126/119       No  Malignant               None
3              No         118/65       No     Benign       Chemotherapy
4              No        165/119      Yes     Benign          Radiation
5             Yes         156/97      Yes  Malignant               None
6              No          95/85       No  Malignant            Surgery
  Survival_Rate... Tumor_Growth_Rate Family_History Symptom_Severity
1               58              Slow            Yes             Mild
2               13             Rapid            Yes           Severe
3               67              Slow            Yes         Moderate
4               85          Moderate             No         Moderate
5               17          Moderate             No         Moderate
6               65             Rapid            Yes           Severe
  Brain_Tumor_Present
1                  No
2                  No
3                 Yes
4                 Yes
5                  No
6                  No

As we can see from inspecting the data above, this dataset contains 250,000 observations, each with 21 variables. The dataset is made up of qualitative and quantitative values, with data types including integers, floats, Yes/No (boolean-like) values, and categorical strings.

I retrieved this dataset from Kaggle at the following link: https://www.kaggle.com/datasets/ankushpanday1/brain-tumor-prediction-dataset

Questions to Answer

  • What is the dataset about?

    • The Brain Tumor Prediction Dataset is a comprehensive collection of medical records aimed at facilitating research and development in brain tumor diagnosis and prediction. It comprises 250,000 patient records, each encompassing 21 medical features (matching the 21 columns reported by dim() above). The dataset includes MRI scan results, demographic information, medical history, and other relevant attributes.
  • Where did the data come from?

    • I got this data online while browsing numerous dataset websites. I eventually found this specific dataset on Kaggle.
  • Is there a website or publication to cite the authors? If yes, include it.

    • https://www.kaggle.com/datasets/ankushpanday1/brain-tumor-prediction-dataset

Motivation

  • What are your motivations for exploring this dataset?

    • For many years now I have been very interested in improving medical diagnoses, and finding meaningful ways to help people and communities. For many years I wanted to be a radiologist looking at medical images and finding meaningful diagnoses within those images to help my patients. Just before college I learned about the potentials of computer science, data science, and computer algorithms within the medical space. This led me to want to dive into the computer algorithm space to develop software capable of outperforming the systems we have today to provide even greater widespread impacts on people and communities. This dataset strongly aligns with these goals and is a great step into the world of medical diagnoses algorithms, and methodologies to find an important interplay between the two fields.
  • What questions do you want to answer? (This involves identifying and articulating the key questions that you aim to explore through your analysis of the dataset. These questions set the direction for your research and data exploration. They are typically broad and open-ended, aimed at uncovering insights or patterns within the data. IE: What are the main factors that affect customer satisfaction?)

    • What are the main factors that affect the presence of brain tumors?

    • What factors influence the size and/or location of brain tumors?

    • Are there factors that correlate with the severity of a brain tumor?

    • Are there any factors commonly associated with tumors that do not actually impact the presence, location, or severity of a tumor?

  • Provide a hypothesis about the dataset. (Formulating a hypothesis involves making a specific, testable statement based on your initial understanding or assumptions about the data. A hypothesis is more focused than a general question and often predicts a relationship between variables that you can test through your analysis. IE: Customers with shorter wait times report higher satisfaction levels, suggesting a significant negative correlation between wait time and satisfaction.)

    • Patients with a family history of brain tumors and a high genetic risk score are more likely to develop malignant tumors than those without a family history and low genetic risk.

Ethical Considerations

  • What are some ethical considerations? 

    • There are many ethical considerations, including patient privacy & data sensitivity, bias in data collection, assumptions in medical diagnosis, and fair treatment & algorithmic bias.

      • Even though this dataset may be anonymized, medical data is highly sensitive. Ensuring compliance with HIPAA, GDPR, or similar regulations is critical if used in real-world applications.

      • The dataset might not represent all demographics equally, and if the data is skewed toward certain regions, ethnic groups, or medical histories, models trained on it could be less effective for underrepresented populations.

      • AI models built on this data should be used as decision-support tools, not replacements for medical professionals. False positives/negatives could have severe consequences, leading to unnecessary stress, procedures, or missed diagnoses.

      • If the dataset disproportionately represents certain tumor types, age groups, or genders, the AI model might generalize poorly. Addressing bias before deploying AI in healthcare is crucial to ensure fair treatment for all patients.

  • Do you have any bias coming into this analysis?

    1. Such as do you assume certain things already (we all have internal bias that we should recognize)
    • I don’t know that I have many biases coming into this. I believe there is likely to be a strong correlation between some of the factors, such as family history, history of illness, age, etc., which may impact my views on the analysis and is something I need to keep in mind when performing this analysis.

    • Additionally, I am making assumptions that this data has been collected appropriately in an ethical way that equally encompasses all populations. This likely is not the case in reality and will need to be considered further.

Table Creation/Data Dictionary

# Human-readable descriptions, one per column, listed in the same order as
# the columns of brain_tumor_dt
descriptions <- c(
  "Patient's age in years",
  "Gender of the patient (Male, Female, Other)",
  "Patient's country of residence",
  "Size of the tumor in cm",
  "Brain lobe affected (e.g., Frontal, Temporal, Parietal)",
  "Severity of MRI results (e.g., Normal, Abnormal, Severe)",
  "Score indicating genetic risk (0–100 scale)",
  "Whether the patient has a history of smoking (Yes/No)",
  "Whether the patient consumes alcohol (Yes/No)",
  "Level of radiation exposure (Low, Medium, High)",
  "History of head injury (Yes/No)",
  "Presence of chronic illnesses (Yes/No)",
  "Systolic/Diastolic values (e.g., 120/80)",
  "Presence of diabetes (Yes/No)",
  "Classification of tumor (Benign/Malignant)",
  "Type of treatment received (e.g., Chemotherapy, Radiation, None)",
  "Estimated 5-year survival probability",
  "Rate of tumor growth (Slow, Moderate, Rapid)",
  "Whether the patient has a family history of tumors (Yes/No)",
  "Severity of symptoms (Mild, Moderate, Severe)",
  "Whether the patient has a brain tumor (Yes/No)"
)
# Guard against the descriptions drifting out of sync with the dataset:
# without this check data.frame() would either fail with a less specific
# error or silently recycle a shorter vector
stopifnot(length(descriptions) == ncol(brain_tumor_dt))
# Create the data dictionary using R functions
data_dictionary <- data.frame(
  Variable_Name = colnames(brain_tumor_dt),
  # vapply() guarantees exactly one class label per column (the primary
  # class), whereas sapply() would return a list for multi-class columns
  Class = vapply(
    brain_tumor_dt,
    function(column) class(column)[1],
    character(1)
  ),
  # Numeric (integer or double) columns are labelled "Continuous";
  # everything else is labelled "Discrete"
  Continuity = ifelse(
    vapply(brain_tumor_dt, is.numeric, logical(1)),
    "Continuous",
    "Discrete"
  ),
  Description = descriptions
)
# Print out table
data_dictionary
                          Variable_Name     Class Continuity
Age                                 Age   integer Continuous
Gender                           Gender character   Discrete
Country                         Country character   Discrete
Tumor_Size                   Tumor_Size   numeric Continuous
Tumor_Location           Tumor_Location character   Discrete
MRI_Findings               MRI_Findings character   Discrete
Genetic_Risk               Genetic_Risk   integer Continuous
Smoking_History         Smoking_History character   Discrete
Alcohol_Consumption Alcohol_Consumption character   Discrete
Radiation_Exposure   Radiation_Exposure character   Discrete
Head_Injury_History Head_Injury_History character   Discrete
Chronic_Illness         Chronic_Illness character   Discrete
Blood_Pressure           Blood_Pressure character   Discrete
Diabetes                       Diabetes character   Discrete
Tumor_Type                   Tumor_Type character   Discrete
Treatment_Received   Treatment_Received character   Discrete
Survival_Rate...       Survival_Rate...   integer Continuous
Tumor_Growth_Rate     Tumor_Growth_Rate character   Discrete
Family_History           Family_History character   Discrete
Symptom_Severity       Symptom_Severity character   Discrete
Brain_Tumor_Present Brain_Tumor_Present character   Discrete
                                                                         Description
Age                                                           Patient's age in years
Gender                                   Gender of the patient (Male, Female, Other)
Country                                               Patient's country of residence
Tumor_Size                                                   Size of the tumor in cm
Tumor_Location               Brain lobe affected (e.g., Frontal, Temporal, Parietal)
MRI_Findings               Severity of MRI results (e.g., Normal, Abnormal, Severe))
Genetic_Risk                             Score indicating genetic risk (0–100 scale)
Smoking_History                Whether the patient has a history of smoking (Yes/No)
Alcohol_Consumption                    Whether the patient consumes alcohol (Yes/No)
Radiation_Exposure                   Level of radiation exposure (Low, Medium, High)
Head_Injury_History                                  History of head injury (Yes/No)
Chronic_Illness                               Presence of chronic illnesses (Yes/No)
Blood_Pressure                              Systolic/Diastolic values (e.g., 120/80)
Diabetes                                               Presence of diabetes (Yes/No)
Tumor_Type                                Classification of tumor (Benign/Malignant)
Treatment_Received  Type of treatment received (e.g., Chemotherapy, Radiation, None)
Survival_Rate...                               Estimated 5-year survival probability
Tumor_Growth_Rate                       Rate of tumor growth (Slow, Moderate, Rapid)
Family_History           Whether the patient has a family history of tumors (Yes/No)
Symptom_Severity                       Severity of symptoms (Mild, Moderate, Severe)
Brain_Tumor_Present                   Whether the patient has a brain tumor (Yes/No)