# Ask the user to pick the CSV file, then load it into a data frame.
# NOTE(review): file.choose() presumably requires an interactive session;
# confirm before running this script in batch mode.
file_path <- file.choose()
dataset <- read.csv(file_path)
# Preview the first rows to sanity-check the import.
head(dataset)
##     Patient_ID Age Gender  Protein1 Protein2  Protein3  Protein4 Tumour_Stage
## 1 TCGA-D8-A1XD  36 FEMALE  0.080353  0.42638  0.547150  0.273680          III
## 2 TCGA-EW-A1OX  43 FEMALE -0.420320  0.57807  0.614470 -0.031505           II
## 3 TCGA-A8-A079  69 FEMALE  0.213980  1.31140 -0.327470 -0.234260          III
## 4 TCGA-D8-A1XR  56 FEMALE  0.345090 -0.21147 -0.193040  0.124270           II
## 5 TCGA-BH-A0BF  56 FEMALE  0.221550  1.90680  0.520450 -0.311990           II
## 6 TCGA-AO-A1KQ  84   MALE -0.081872  1.72410 -0.057335  0.043025          III
##                       Histology ER.status PR.status HER2.status
## 1 Infiltrating Ductal Carcinoma  Positive  Positive    Negative
## 2            Mucinous Carcinoma  Positive  Positive    Negative
## 3 Infiltrating Ductal Carcinoma  Positive  Positive    Negative
## 4 Infiltrating Ductal Carcinoma  Positive  Positive    Negative
## 5 Infiltrating Ductal Carcinoma  Positive  Positive    Negative
## 6 Infiltrating Ductal Carcinoma  Positive  Positive    Negative
##                  Surgery_type Date_of_Surgery Date_of_Last_Visit Patient_Status
## 1 Modified Radical Mastectomy       15-Jan-17          19-Jun-17          Alive
## 2                  Lumpectomy       26-Apr-17          09-Nov-18           Dead
## 3                       Other       08-Sep-17          09-Jun-18          Alive
## 4 Modified Radical Mastectomy       25-Jan-17          12-Jul-17          Alive
## 5                       Other       06-May-17          27-Jun-19           Dead
## 6 Modified Radical Mastectomy       18-Sep-17          15-Nov-21          Alive
# Show the compact structure of the data frame: each column's type and its
# first few values.
str(dataset)
## 'data.frame':    341 obs. of  16 variables:
##  $ Patient_ID        : chr  "TCGA-D8-A1XD" "TCGA-EW-A1OX" "TCGA-A8-A079" "TCGA-D8-A1XR" ...
##  $ Age               : int  36 43 69 56 56 84 53 50 77 40 ...
##  $ Gender            : chr  "FEMALE" "FEMALE" "FEMALE" "FEMALE" ...
##  $ Protein1          : num  0.0804 -0.4203 0.214 0.3451 0.2215 ...
##  $ Protein2          : num  0.426 0.578 1.311 -0.211 1.907 ...
##  $ Protein3          : num  0.547 0.614 -0.327 -0.193 0.52 ...
##  $ Protein4          : num  0.2737 -0.0315 -0.2343 0.1243 -0.312 ...
##  $ Tumour_Stage      : chr  "III" "II" "III" "II" ...
##  $ Histology         : chr  "Infiltrating Ductal Carcinoma" "Mucinous Carcinoma" "Infiltrating Ductal Carcinoma" "Infiltrating Ductal Carcinoma" ...
##  $ ER.status         : chr  "Positive" "Positive" "Positive" "Positive" ...
##  $ PR.status         : chr  "Positive" "Positive" "Positive" "Positive" ...
##  $ HER2.status       : chr  "Negative" "Negative" "Negative" "Negative" ...
##  $ Surgery_type      : chr  "Modified Radical Mastectomy" "Lumpectomy" "Other" "Modified Radical Mastectomy" ...
##  $ Date_of_Surgery   : chr  "15-Jan-17" "26-Apr-17" "08-Sep-17" "25-Jan-17" ...
##  $ Date_of_Last_Visit: chr  "19-Jun-17" "09-Nov-18" "09-Jun-18" "12-Jul-17" ...
##  $ Patient_Status    : chr  "Alive" "Dead" "Alive" "Alive" ...
# Per-column summary: quartiles, mean and NA counts for numeric columns;
# length, class and mode for character columns.
summary(dataset)
##   Patient_ID             Age           Gender             Protein1        
##  Length:341         Min.   :29.00   Length:341         Min.   :-2.340900  
##  Class :character   1st Qu.:49.00   Class :character   1st Qu.:-0.358888  
##  Mode  :character   Median :58.00   Mode  :character   Median : 0.006129  
##                     Mean   :58.89                      Mean   :-0.029991  
##                     3rd Qu.:68.00                      3rd Qu.: 0.343598  
##                     Max.   :90.00                      Max.   : 1.593600  
##                     NA's   :7                          NA's   :7          
##     Protein2          Protein3          Protein4         Tumour_Stage      
##  Min.   :-0.9787   Min.   :-1.6274   Min.   :-2.025500   Length:341        
##  1st Qu.: 0.3622   1st Qu.:-0.5137   1st Qu.:-0.377090   Class :character  
##  Median : 0.9928   Median :-0.1732   Median : 0.041768   Mode  :character  
##  Mean   : 0.9469   Mean   :-0.0902   Mean   : 0.009819                     
##  3rd Qu.: 1.6279   3rd Qu.: 0.2784   3rd Qu.: 0.425630                     
##  Max.   : 3.4022   Max.   : 2.1934   Max.   : 1.629900                     
##  NA's   :7         NA's   :7         NA's   :7                             
##   Histology          ER.status          PR.status         HER2.status       
##  Length:341         Length:341         Length:341         Length:341        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Surgery_type       Date_of_Surgery    Date_of_Last_Visit Patient_Status    
##  Length:341         Length:341         Length:341         Length:341        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
## 
# Report the dimensions of the data frame as (rows, columns).
dim(dataset)
## [1] 341  16
# Distribution of patient age. Explicit labels keep the raw expression
# `dataset$Age` out of the figure title and axis.
hist(dataset$Age, main = "Histogram of Age", xlab = "Age (Years)")

# Scatter plot of the first two protein measurements against each other.
plot(
  dataset$Protein1, dataset$Protein2,
  xlab = "Protein1", ylab = "Protein2"
)

# Work on a copy so `dataset` keeps its columns exactly as they were read.
BRCA <- dataset
# Tumour stage is categorical; store it as a factor so plots group by level.
BRCA$Tumour_Stage <- as.factor(BRCA$Tumour_Stage)
# With a factor on x and a numeric on y, plot() draws one box plot per level.
plot(
  BRCA$Tumour_Stage, BRCA$Age,
  xlab = "Tumour Stage", ylab = "Age (Years)"
)

library(ggplot2)
# Age distribution split by patient status: one histogram per status, drawn
# on top of each other (position = "identity") and made translucent so that
# overlapping bars remain visible. Bins are 5 years wide.
ggplot(BRCA, aes(Age, fill = Patient_Status)) +
  geom_histogram(position = "identity", alpha = 0.7, binwidth = 5) +
  theme_minimal() +
  ggtitle("Histogram of Age by Patient Status") +
  xlab("Age (Years)") +
  ylab("Frequency")
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Patient counts per tumour stage, with the bars for each patient status
# placed next to each other instead of stacked.
ggplot(BRCA, aes(Tumour_Stage, fill = Patient_Status)) +
  geom_bar(position = "dodge") +
  ggtitle("Tumour Stage by Patient Status") +
  theme_light() +
  # Tilt the x-axis tick labels by 45 degrees and right-align them.
  theme(axis.text.x = element_text(hjust = 1, angle = 45))