# Load the CSV file picked interactively by the user into `dataset`.
# Blank cells are read as NA (in addition to the literal "NA") so that missing
# values in character columns are represented the same way as in numeric ones
# instead of being kept as empty strings.
file_path <- file.choose()
dataset <- read.csv(file_path, na.strings = c("", "NA"))
# Preview the first six rows to confirm the file parsed as expected.
head(dataset)
## Patient_ID Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage
## 1 TCGA-D8-A1XD 36 FEMALE 0.080353 0.42638 0.547150 0.273680 III
## 2 TCGA-EW-A1OX 43 FEMALE -0.420320 0.57807 0.614470 -0.031505 II
## 3 TCGA-A8-A079 69 FEMALE 0.213980 1.31140 -0.327470 -0.234260 III
## 4 TCGA-D8-A1XR 56 FEMALE 0.345090 -0.21147 -0.193040 0.124270 II
## 5 TCGA-BH-A0BF 56 FEMALE 0.221550 1.90680 0.520450 -0.311990 II
## 6 TCGA-AO-A1KQ 84 MALE -0.081872 1.72410 -0.057335 0.043025 III
## Histology ER.status PR.status HER2.status
## 1 Infiltrating Ductal Carcinoma Positive Positive Negative
## 2 Mucinous Carcinoma Positive Positive Negative
## 3 Infiltrating Ductal Carcinoma Positive Positive Negative
## 4 Infiltrating Ductal Carcinoma Positive Positive Negative
## 5 Infiltrating Ductal Carcinoma Positive Positive Negative
## 6 Infiltrating Ductal Carcinoma Positive Positive Negative
## Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
## 1 Modified Radical Mastectomy 15-Jan-17 19-Jun-17 Alive
## 2 Lumpectomy 26-Apr-17 09-Nov-18 Dead
## 3 Other 08-Sep-17 09-Jun-18 Alive
## 4 Modified Radical Mastectomy 25-Jan-17 12-Jul-17 Alive
## 5 Other 06-May-17 27-Jun-19 Dead
## 6 Modified Radical Mastectomy 18-Sep-17 15-Nov-21 Alive
# Compact overview of every column: its type and first few values.
utils::str(dataset)
## 'data.frame': 341 obs. of 16 variables:
## $ Patient_ID : chr "TCGA-D8-A1XD" "TCGA-EW-A1OX" "TCGA-A8-A079" "TCGA-D8-A1XR" ...
## $ Age : int 36 43 69 56 56 84 53 50 77 40 ...
## $ Gender : chr "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
## $ Protein1 : num 0.0804 -0.4203 0.214 0.3451 0.2215 ...
## $ Protein2 : num 0.426 0.578 1.311 -0.211 1.907 ...
## $ Protein3 : num 0.547 0.614 -0.327 -0.193 0.52 ...
## $ Protein4 : num 0.2737 -0.0315 -0.2343 0.1243 -0.312 ...
## $ Tumour_Stage : chr "III" "II" "III" "II" ...
## $ Histology : chr "Infiltrating Ductal Carcinoma" "Mucinous Carcinoma" "Infiltrating Ductal Carcinoma" "Infiltrating Ductal Carcinoma" ...
## $ ER.status : chr "Positive" "Positive" "Positive" "Positive" ...
## $ PR.status : chr "Positive" "Positive" "Positive" "Positive" ...
## $ HER2.status : chr "Negative" "Negative" "Negative" "Negative" ...
## $ Surgery_type : chr "Modified Radical Mastectomy" "Lumpectomy" "Other" "Modified Radical Mastectomy" ...
## $ Date_of_Surgery : chr "15-Jan-17" "26-Apr-17" "08-Sep-17" "25-Jan-17" ...
## $ Date_of_Last_Visit: chr "19-Jun-17" "09-Nov-18" "09-Jun-18" "12-Jul-17" ...
## $ Patient_Status : chr "Alive" "Dead" "Alive" "Alive" ...
# Per-column descriptive summary of the loaded data frame.
summary(object = dataset)
## Patient_ID Age Gender Protein1
## Length:341 Min. :29.00 Length:341 Min. :-2.340900
## Class :character 1st Qu.:49.00 Class :character 1st Qu.:-0.358888
## Mode :character Median :58.00 Mode :character Median : 0.006129
## Mean :58.89 Mean :-0.029991
## 3rd Qu.:68.00 3rd Qu.: 0.343598
## Max. :90.00 Max. : 1.593600
## NA's :7 NA's :7
## Protein2 Protein3 Protein4 Tumour_Stage
## Min. :-0.9787 Min. :-1.6274 Min. :-2.025500 Length:341
## 1st Qu.: 0.3622 1st Qu.:-0.5137 1st Qu.:-0.377090 Class :character
## Median : 0.9928 Median :-0.1732 Median : 0.041768 Mode :character
## Mean : 0.9469 Mean :-0.0902 Mean : 0.009819
## 3rd Qu.: 1.6279 3rd Qu.: 0.2784 3rd Qu.: 0.425630
## Max. : 3.4022 Max. : 2.1934 Max. : 1.629900
## NA's :7 NA's :7 NA's :7
## Histology ER.status PR.status HER2.status
## Length:341 Length:341 Length:341 Length:341
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
## Length:341 Length:341 Length:341 Length:341
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
# Number of rows followed by number of columns.
c(nrow(dataset), ncol(dataset))
## [1] 341 16
# Quick base-graphics look at the raw data: the distribution of Age, then a
# scatter plot of the first two protein measurements against each other.
hist(x = dataset$Age)

plot(x = dataset$Protein1, y = dataset$Protein2)

# Work on a copy (BRCA) in which Tumour_Stage is a factor; with a factor on x
# and a numeric y, plot() draws one box plot of Age per stage.
BRCA <- dataset
BRCA[["Tumour_Stage"]] <- as.factor(BRCA[["Tumour_Stage"]])
plot(x = BRCA$Tumour_Stage, y = BRCA$Age)

library(ggplot2)
# Overlaid histograms of Age in 5-year bins, filled by Patient_Status.
# position = "identity" draws the groups on top of each other rather than
# stacking them, so the bars are made partly transparent.
ggplot(BRCA, aes(x = Age, fill = Patient_Status)) +
  geom_histogram(position = "identity", binwidth = 5, alpha = 0.7) +
  theme_minimal() +
  labs(
    y = "Frequency",
    x = "Age (Years)",
    title = "Histogram of Age by Patient Status"
  )
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Bar chart of record counts per Tumour_Stage, split by Patient_Status.
ggplot(BRCA, aes(x = Tumour_Stage, fill = Patient_Status)) +
  # "dodge" draws the status bars next to each other instead of stacking them
  geom_bar(position = "dodge") +
  labs(title = "Tumour Stage by Patient Status") +
  theme_light() +
  # must come after theme_light(): tilt the x-axis labels 45 degrees and
  # right-align them
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
