# Read the brain tumor dataset from CSV and preview its first six rows
brain_tumor <- read.csv(
  file = "/cloud/project/tumor/brain_tumor_dataset.csv"
)
head(x = brain_tumor, n = 6L)
## Tumor.Type Location Size..cm. Grade Patient.Age Gender
## 1 Oligodendroglioma Occipital Lobe 9.23 I 48 Female
## 2 Ependymoma Occipital Lobe 0.87 II 47 Male
## 3 Meningioma Occipital Lobe 2.33 II 12 Female
## 4 Ependymoma Occipital Lobe 1.45 III 38 Female
## 5 Ependymoma Brainstem 6.45 I 35 Female
## 6 Astrocytoma Brainstem 2.82 III 46 Male
# load packages
# dplyr and ggplot2 are core tidyverse packages (see the attach message
# below), so attaching tidyverse is enough; separate install/library calls
# for them are not needed.
# Install only when tidyverse is missing, so re-running the script does not
# reinstall packages on every execution.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# What is the distribution of tumor size?
# Histogram of Size..cm. with 1 cm bins. The sizes visible in the data
# preview lie between roughly 0 and 10 cm (TODO confirm against the full
# data), so 5 cm bins would collapse the distribution into two or three bars.
ggplot(data = brain_tumor, mapping = aes(x = Size..cm.)) +
  geom_histogram(binwidth = 1, fill = "grey", color = "black") +
  labs(
    title = "Tumor Size Distribution",
    x = "Size..cm.",
    y = "Frequency"
  ) +
  theme_minimal()

## How do tumor types vary by location?
library(dplyr)
library(ggplot2)
# Count tumors for every Tumor.Type / Location pair, most frequent first.
# Grouping by Tumor.Type alone only yields the overall number of each tumor
# type and says nothing about location, so Location is part of the grouping.
tumor_location <- brain_tumor %>%
  group_by(Tumor.Type, Location) %>%
  summarise(location_count = n(), .groups = "drop") %>%
  arrange(desc(location_count))
print(tumor_location)
# visualization
# One cluster of bars per location, one bar per tumor type within it
ggplot(
  data = tumor_location,
  mapping = aes(x = Location, y = location_count, fill = Tumor.Type)
) +
  geom_col(position = "dodge", color = "black") +
  labs(
    title = "Tumor Types by Location",
    x = "Location",
    y = "Count",
    fill = "Tumor Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Average tumor size and how it varies by tumor type?
# Mean tumor size per tumor type, largest mean first.
# na.rm is given as the logical TRUE rather than the string 'TRUE': mean()
# expects a logical, and the string form only works through implicit coercion.
tumorsize_type <- brain_tumor %>%
  group_by(Tumor.Type) %>%
  summarise(
    avg_tumorsize = mean(Size..cm., na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_tumorsize))
print(tumorsize_type)
## # A tibble: 5 × 2
## Tumor.Type avg_tumorsize
## <chr> <dbl>
## 1 Meningioma 5.39
## 2 Glioblastoma 5.37
## 3 Astrocytoma 5.15
## 4 Ependymoma 5.11
## 5 Oligodendroglioma 5.10
## What is the distribution of tumor grades across different tumor types?
# Number of rows for each Tumor.Type / Grade combination, returned ungrouped
tumorgrade_type <- summarise(
  group_by(brain_tumor, Tumor.Type, Grade),
  grade_count = n(),
  .groups = "drop"
)
print(tumorgrade_type)
## # A tibble: 20 × 3
## Tumor.Type Grade grade_count
## <chr> <chr> <int>
## 1 Astrocytoma I 41
## 2 Astrocytoma II 44
## 3 Astrocytoma III 55
## 4 Astrocytoma IV 50
## 5 Ependymoma I 65
## 6 Ependymoma II 49
## 7 Ependymoma III 47
## 8 Ependymoma IV 43
## 9 Glioblastoma I 58
## 10 Glioblastoma II 48
## 11 Glioblastoma III 41
## 12 Glioblastoma IV 63
## 13 Meningioma I 49
## 14 Meningioma II 52
## 15 Meningioma III 42
## 16 Meningioma IV 47
## 17 Oligodendroglioma I 51
## 18 Oligodendroglioma II 43
## 19 Oligodendroglioma III 59
## 20 Oligodendroglioma IV 53
# Visualization
# Bar chart of grade counts: one bar per tumor type, coloured by grade,
# with the x-axis labels rotated 45 degrees
ggplot(
  tumorgrade_type,
  aes(x = Tumor.Type, y = grade_count, fill = Grade)
) +
  geom_col() +
  labs(
    y = "Grade Count",
    x = "Tumor Type",
    title = "Grade by Tumor Types"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

## What is the distribution of tumor grade IV across different tumor types?
# Keep only the grade IV rows, then count them per tumor type.
# Note: this assigns to tumorgrade_type again, replacing any earlier value.
tumorgrade_type <- brain_tumor %>%
  filter(Grade %in% "IV") %>%
  group_by(Tumor.Type, Grade) %>%
  summarise(GradeIV_count = n(), .groups = "drop")
print(tumorgrade_type)
## # A tibble: 5 × 3
## Tumor.Type Grade GradeIV_count
## <chr> <chr> <int>
## 1 Astrocytoma IV 50
## 2 Ependymoma IV 43
## 3 Glioblastoma IV 63
## 4 Meningioma IV 47
## 5 Oligodendroglioma IV 53
# visualization
# Grade IV counts: one bar per tumor type, each type in its own fill colour,
# with the x-axis labels rotated 45 degrees
ggplot(
  tumorgrade_type,
  aes(x = Tumor.Type, y = GradeIV_count, fill = Tumor.Type)
) +
  geom_col() +
  labs(
    y = "Grade-IV Count",
    x = "Tumor Type",
    title = "Grade IV by Tumor Type"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

## HOW DOES TUMOR SIZE VARY WITH PATIENT AGE?
# create age category
# Bin Patient.Age into five labelled bands. Intervals are closed on the
# right, and include.lowest = TRUE additionally places the value 1 in the
# first band. Ages outside the 1-89 range become NA.
age_categories <- cut(
  x = brain_tumor[["Patient.Age"]],
  breaks = c(1, 17, 34, 49, 69, 89),
  labels = c("1-17", "18-34", "35-49", "50-69", "70-89"),
  right = TRUE,
  include.lowest = TRUE
)
# store the bands on the data frame as the new Age_Category column
brain_tumor <- mutate(brain_tumor, Age_Category = age_categories)
# Establish relationship
# Mean tumor size (missing sizes ignored) per age band ...
size_age <- brain_tumor %>%
  group_by(Age_Category) %>%
  summarise(avg_size = mean(Size..cm., na.rm = TRUE), .groups = "drop")
# ... ordered from the largest mean to the smallest
size_age <- arrange(size_age, desc(avg_size))
print(size_age)
## # A tibble: 5 × 2
## Age_Category avg_size
## <fct> <dbl>
## 1 50-69 5.35
## 2 18-34 5.28
## 3 1-17 5.22
## 4 35-49 5.16
## 5 70-89 5.06
# Visualization
# Average tumor size per age band, one black-outlined bar per band
ggplot(
  size_age,
  aes(x = Age_Category, y = avg_size, fill = Age_Category)
) +
  geom_col(color = "black") +
  labs(
    y = "Average Size",
    x = "Age Category",
    title = "How Tumor Size Varies by Age"
  ) +
  theme_minimal()

## What is the age distribution of patients?
# Histogram of Patient.Age using bins of width 5
ggplot(brain_tumor, aes(x = Patient.Age)) +
  geom_histogram(binwidth = 5, color = "black", fill = "lightblue") +
  labs(
    y = "Frequency",
    x = "Patient Age",
    title = "Age distribution"
  ) +
  theme_minimal()

## Are there any significant differences in tumor size between male and female patients?
# Calculate the mean tumor size by gender
# na.rm = TRUE matches the other size summaries in this script, so a single
# missing size cannot turn a whole group's mean into NA.
tumor_gender <- brain_tumor %>%
  group_by(Gender) %>%
  summarise(avg_size = mean(Size..cm., na.rm = TRUE), .groups = "drop")
print(tumor_gender)
## # A tibble: 2 × 2
## Gender avg_size
## <chr> <dbl>
## 1 Female 5.23
## 2 Male 5.21
# calculate t.test
# Two-sided Welch (unequal-variance) two-sample t-test of tumor size by
# gender; the extra arguments only spell out t.test()'s default settings.
t_test_results <- t.test(
  Size..cm. ~ Gender,
  data = brain_tumor,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
print(t_test_results)
##
## Welch Two Sample t-test
##
## data: Size..cm. by Gender
## t = 0.11321, df = 983.15, p-value = 0.9099
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -0.3315745 0.3721744
## sample estimates:
## mean in group Female mean in group Male
## 5.231934 5.211634
## Are certain tumor grades more prevalent in specific locations?
# Count rows per Location / Grade pair and drop the grouping.
location_grade <- brain_tumor %>%
  group_by(Location, Grade) %>%
  summarise(Grade_count = n(), .groups = "drop")
# Because the data is ungrouped here, each proportion is the pair's share of
# all counted rows, not its share within the location.
location_grade <- mutate(
  location_grade,
  Grade_proportion = Grade_count / sum(Grade_count)
)
print(location_grade)
## # A tibble: 24 × 4
## Location Grade Grade_count Grade_proportion
## <chr> <chr> <int> <dbl>
## 1 Brainstem I 40 0.04
## 2 Brainstem II 43 0.043
## 3 Brainstem III 41 0.041
## 4 Brainstem IV 46 0.046
## 5 Cerebellum I 49 0.049
## 6 Cerebellum II 37 0.037
## 7 Cerebellum III 35 0.035
## 8 Cerebellum IV 40 0.04
## 9 Frontal Lobe I 50 0.05
## 10 Frontal Lobe II 25 0.025
## # ℹ 14 more rows
# Visualization
# Bars per location, filled by grade, with x-axis labels rotated 45 degrees.
# The plot title has to be passed as `title`; `titles` is not a labs()
# argument, so with that spelling the title was never displayed.
ggplot(
  data = location_grade,
  mapping = aes(x = Location, y = Grade_proportion, fill = Grade)
) +
  geom_col() +
  labs(
    title = "Prevalence of Tumor Grades across Locations",
    x = "Location",
    y = "Grade Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Correlation between tumor size and patient age
# Pearson correlation (the cor() default), computed only on rows where both
# values are present
size_age_cor <- cor(
  x = brain_tumor[["Size..cm."]],
  y = brain_tumor[["Patient.Age"]],
  use = "complete.obs",
  method = "pearson"
)
print(size_age_cor)
## [1] -0.004660974