# Read the brain tumor dataset from CSV and preview its first six rows

brain_tumor <- read.csv(file = "/cloud/project/tumor/brain_tumor_dataset.csv")

head(brain_tumor, n = 6)
##          Tumor.Type       Location Size..cm. Grade Patient.Age Gender
## 1 Oligodendroglioma Occipital Lobe      9.23     I          48 Female
## 2        Ependymoma Occipital Lobe      0.87    II          47   Male
## 3        Meningioma Occipital Lobe      2.33    II          12 Female
## 4        Ependymoma Occipital Lobe      1.45   III          38 Female
## 5        Ependymoma      Brainstem      6.45     I          35 Female
## 6       Astrocytoma      Brainstem      2.82   III          46   Male
# load packages

# Install tidyverse only when it is missing, so re-running the script does not
# reinstall packages (and require network access) on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

# tidyverse attaches dplyr and ggplot2 (see the attach message below), so they
# do not need to be installed or loaded separately.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# What is the distribution of tumor size?

# Sizes are in centimetres and the previewed values lie between roughly 1 and
# 9, so a 5 cm bin would squeeze the data into two or three bars. A 0.5 cm bin
# keeps the shape of the distribution visible.
ggplot(data = brain_tumor, mapping = aes(x = Size..cm.)) +
  geom_histogram(binwidth = 0.5, fill = "grey", color = "black") +
  labs(
    title = "Tumor Size Distribution",
    x = "Tumor Size (cm)",
    y = "Frequency"
  ) +
  theme_minimal()

## How do tumor types vary by location?
library(dplyr)
library(ggplot2)

# Tally the number of records for each tumor type and list the most frequent
# type first. Note: the grouping here is by Tumor.Type only.
tumor_location <- summarise(
  group_by(brain_tumor, Tumor.Type),
  location_count = n(),
  .groups = "drop"
)
tumor_location <- arrange(tumor_location, desc(location_count))
print(tumor_location)
## # A tibble: 5 × 2
##   Tumor.Type        location_count
##   <chr>                      <int>
## 1 Glioblastoma                 210
## 2 Oligodendroglioma            206
## 3 Ependymoma                   204
## 4 Astrocytoma                  190
## 5 Meningioma                   190
# visualization
# Stack each tumor type's bar by Location so the chart shows how the types are
# spread across locations. geom_bar() counts the rows itself, and the total
# height of each bar is still the number of cases of that type.
ggplot(data = brain_tumor, mapping = aes(x = Tumor.Type, fill = Location)) +
  geom_bar(color = "black") +
  labs(
    title = "Tumor Types by Location",
    x = "Tumor Type",
    y = "Number of Cases"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## What is the average tumor size, and how does it vary by tumor type?

# Mean size per tumor type, largest average first. na.rm is given as the
# logical TRUE (not the string 'TRUE') so missing sizes are dropped explicitly
# rather than through implicit string-to-logical coercion.
tumorsize_type <- brain_tumor %>%
  group_by(Tumor.Type) %>%
  summarise(avg_tumorsize = mean(Size..cm., na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(avg_tumorsize))
print(tumorsize_type)
## # A tibble: 5 × 2
##   Tumor.Type        avg_tumorsize
##   <chr>                     <dbl>
## 1 Meningioma                 5.39
## 2 Glioblastoma               5.37
## 3 Astrocytoma                5.15
## 4 Ependymoma                 5.11
## 5 Oligodendroglioma          5.10
## What is the distribution of tumor grades across different tumor types?

# One row per (tumor type, grade) pair holding the number of matching records
tumorgrade_type <- summarise(
  group_by(brain_tumor, Tumor.Type, Grade),
  grade_count = n(),
  .groups = "drop"
)
print(tumorgrade_type)
## # A tibble: 20 × 3
##    Tumor.Type        Grade grade_count
##    <chr>             <chr>       <int>
##  1 Astrocytoma       I              41
##  2 Astrocytoma       II             44
##  3 Astrocytoma       III            55
##  4 Astrocytoma       IV             50
##  5 Ependymoma        I              65
##  6 Ependymoma        II             49
##  7 Ependymoma        III            47
##  8 Ependymoma        IV             43
##  9 Glioblastoma      I              58
## 10 Glioblastoma      II             48
## 11 Glioblastoma      III            41
## 12 Glioblastoma      IV             63
## 13 Meningioma        I              49
## 14 Meningioma        II             52
## 15 Meningioma        III            42
## 16 Meningioma        IV             47
## 17 Oligodendroglioma I              51
## 18 Oligodendroglioma II             43
## 19 Oligodendroglioma III            59
## 20 Oligodendroglioma IV             53
# Visualization
# Stacked bars: one bar per tumor type, split by grade; x labels are tilted
# 45 degrees so the type names do not overlap.
ggplot(tumorgrade_type, aes(x = Tumor.Type, y = grade_count, fill = Grade)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Grade by Tumor Types",
    x = "Tumor Type",
    y = "Grade Count"
  )

## What is the distribution of tumor grade IV across different tumor types?

# Keep only the grade IV records, then count them per tumor type.
# Note: this assigns to tumorgrade_type again, replacing any earlier value.
tumorgrade_type <- filter(brain_tumor, Grade %in% "IV")
tumorgrade_type <- summarise(
  group_by(tumorgrade_type, Tumor.Type, Grade),
  GradeIV_count = n(),
  .groups = "drop"
)
print(tumorgrade_type)
## # A tibble: 5 × 3
##   Tumor.Type        Grade GradeIV_count
##   <chr>             <chr>         <int>
## 1 Astrocytoma       IV               50
## 2 Ependymoma        IV               43
## 3 Glioblastoma      IV               63
## 4 Meningioma        IV               47
## 5 Oligodendroglioma IV               53
# visualization
# One bar per tumor type showing its grade IV count, coloured by tumor type
ggplot(
  tumorgrade_type,
  aes(x = Tumor.Type, y = GradeIV_count, fill = Tumor.Type)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Grade IV by Tumor Type",
    x = "Tumor Type",
    y = "Grade-IV Count"
  )

## HOW DOES TUMOR SIZE VARY WITH PATIENT AGE?
# Bin patient ages into five labelled bands. include.lowest = TRUE closes the
# first band on the left so an age of exactly 1 is kept; ages outside the
# breaks become NA.
age_categories <- cut(
  brain_tumor$Patient.Age,
  breaks = c(1, 17, 34, 49, 69, 89),
  labels = c("1-17", "18-34", "35-49", "50-69", "70-89"),
  include.lowest = TRUE
)

# Attach the bands to the data as a new Age_Category column
brain_tumor <- mutate(brain_tumor, Age_Category = age_categories)

# Establish relationship
# Mean tumor size within each age band (missing sizes ignored), sorted from
# the largest average to the smallest.
size_age <- summarise(
  group_by(brain_tumor, Age_Category),
  avg_size = mean(Size..cm., na.rm = TRUE),
  .groups = "drop"
)
size_age <- arrange(size_age, desc(avg_size))
print(size_age)
## # A tibble: 5 × 2
##   Age_Category avg_size
##   <fct>           <dbl>
## 1 50-69            5.35
## 2 18-34            5.28
## 3 1-17             5.22
## 4 35-49            5.16
## 5 70-89            5.06
# Visualization
# One outlined bar per age band, height = average tumor size in that band
ggplot(size_age, aes(x = Age_Category, y = avg_size, fill = Age_Category)) +
  geom_col(color = "black") +
  theme_minimal() +
  labs(
    title = "How Tumor Size Varies by Age",
    x = "Age Category",
    y = "Average Size"
  )

## What is the gender distribution of patients?

# Bar chart of the number of patients recorded for each gender
ggplot(data = brain_tumor, mapping = aes(x = Gender)) +
  geom_bar(fill = "lightblue", color = "black") +
  labs(
    title = "Gender distribution",
    x = "Gender",
    y = "Number of Patients"
  ) +
  theme_minimal()

## What is the age distribution of patients?

# Histogram of patient ages in 5-year bins
ggplot(data = brain_tumor, mapping = aes(x = Patient.Age)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black") +
  labs(
    title = "Age distribution",
    x = "Patient Age",
    y = "Frequency"
  ) +
  theme_minimal()

## Are there any significant differences in tumor size between male and female patients?

# Calculate the mean tumor size by gender.
# na.rm = TRUE matches the other summaries in this script and the t-test
# below, whose formula interface omits missing values by default; without it
# a single missing size would turn a group's mean into NA.
tumor_gender <- brain_tumor %>%
  group_by(Gender) %>%
  summarise(avg_size = mean(Size..cm., na.rm = TRUE), .groups = "drop")
print(tumor_gender)
## # A tibble: 2 × 2
##   Gender avg_size
##   <chr>     <dbl>
## 1 Female     5.23
## 2 Male       5.21
# calculate t.test
# Welch two-sample t-test of tumor size between the two gender groups
t_test_results <- t.test(Size..cm. ~ Gender, data = brain_tumor)
print(t_test_results)
## 
##  Welch Two Sample t-test
## 
## data:  Size..cm. by Gender
## t = 0.11321, df = 983.15, p-value = 0.9099
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -0.3315745  0.3721744
## sample estimates:
## mean in group Female   mean in group Male 
##             5.231934             5.211634
## Are certain tumor grades more prevalent in specific locations?
# Count records for every (location, grade) pair. The grouping is dropped
# before the proportion is computed, so each proportion is the pair's share of
# all records, not its share of the records in that location.
location_grade <- summarise(
  group_by(brain_tumor, Location, Grade),
  Grade_count = n(),
  .groups = "drop"
)
location_grade <- mutate(
  location_grade,
  Grade_proportion = Grade_count / sum(Grade_count)
)
print(location_grade)
## # A tibble: 24 × 4
##    Location     Grade Grade_count Grade_proportion
##    <chr>        <chr>       <int>            <dbl>
##  1 Brainstem    I              40            0.04 
##  2 Brainstem    II             43            0.043
##  3 Brainstem    III            41            0.041
##  4 Brainstem    IV             46            0.046
##  5 Cerebellum   I              49            0.049
##  6 Cerebellum   II             37            0.037
##  7 Cerebellum   III            35            0.035
##  8 Cerebellum   IV             40            0.04 
##  9 Frontal Lobe I              50            0.05 
## 10 Frontal Lobe II             25            0.025
## # ℹ 14 more rows
# Visualization
# Stacked bars of grade proportions per location. The label argument must be
# `title` (not `titles`), otherwise ggplot2 silently drops the plot title.
ggplot(
  data = location_grade,
  mapping = aes(x = Location, y = Grade_proportion, fill = Grade)
) +
  geom_col() +
  labs(
    title = "Prevalence of Tumor Grades across Locations",
    x = "Location",
    y = "Grade Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Correlation between tumor size and patient age
# Uses only rows where both size and age are present
size_age_cor <- with(
  brain_tumor,
  cor(Size..cm., Patient.Age, use = "complete.obs")
)
print(size_age_cor)
## [1] -0.004660974