# Specify the file path (update if necessary)
file_path <- "lgggbm_tcga_pub.tar.gz"

# Use untar() to extract the contents
untar(file_path, exdir = ".")

# Check the files extracted
list.files("extracted_files")
## character(0)
# Define the file path (you should update this to the correct file path)
file_path <- "lgggbm_tcga_pub/data_clinical_patient.txt"

# Read the data into a data frame called 'pts'
pts <- read.table(file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, comment.char = "#")

# Check the first few rows to ensure it was imported correctly
head(pts)
##     PATIENT_ID                    STUDY BCR_STATUS HISTOLOGICAL_DIAGNOSIS AGE
## 1 TCGA-CS-4938 Brain Lower Grade Glioma        IGC            astrocytoma  31
## 2 TCGA-CS-4941 Brain Lower Grade Glioma        IGC            astrocytoma  67
## 3 TCGA-CS-4942 Brain Lower Grade Glioma        IGC            astrocytoma  44
## 4 TCGA-CS-4943 Brain Lower Grade Glioma        IGC            astrocytoma  37
## 5 TCGA-CS-4944 Brain Lower Grade Glioma        IGC            astrocytoma  50
## 6 TCGA-CS-5390 Brain Lower Grade Glioma        IGC      oligodendroglioma  47
##      SEX OS_MONTHS  OS_STATUS KARNOFSKY_PERFORMANCE_SCORE
## 1 Female       4.7   0:LIVING                          90
## 2   Male       7.7 1:DECEASED                          90
## 3 Female      43.9 1:DECEASED                          90
## 4   Male      18.1   0:LIVING                          50
## 5   Male      10.6   0:LIVING                          90
## 6 Female      64.6   0:LIVING                         100
# Install required packages if you don't have them yet
# install.packages(c("dplyr", "kableExtra", "gt"))

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(gt)
# install.packages("summarytools")
summarytools::dfSummary(pts)
## Data Frame Summary  
## pts  
## Dimensions: 1122 x 9  
## Duplicates: 0  
## 
## ---------------------------------------------------------------------------------------------------------------------------------
## No   Variable                      Stats / Values                Freqs (% of Valid)    Graph                 Valid      Missing  
## ---- ----------------------------- ----------------------------- --------------------- --------------------- ---------- ---------
## 1    PATIENT_ID                    1. TCGA-02-0001                  1 ( 0.1%)                                1122       0        
##      [character]                   2. TCGA-02-0003                  1 ( 0.1%)                                (100.0%)   (0.0%)   
##                                    3. TCGA-02-0004                  1 ( 0.1%)                                                    
##                                    4. TCGA-02-0006                  1 ( 0.1%)                                                    
##                                    5. TCGA-02-0007                  1 ( 0.1%)                                                    
##                                    6. TCGA-02-0009                  1 ( 0.1%)                                                    
##                                    7. TCGA-02-0010                  1 ( 0.1%)                                                    
##                                    8. TCGA-02-0011                  1 ( 0.1%)                                                    
##                                    9. TCGA-02-0014                  1 ( 0.1%)                                                    
##                                    10. TCGA-02-0015                 1 ( 0.1%)                                                    
##                                    [ 1112 others ]               1112 (99.1%)          IIIIIIIIIIIIIIIIIII                       
## 
## 2    STUDY                         1. Brain Lower Grade Glioma   516 (46.0%)           IIIIIIIII             1122       0        
##      [character]                   2. Glioblastoma multiforme    606 (54.0%)           IIIIIIIIII            (100.0%)   (0.0%)   
## 
## 3    BCR_STATUS                    1. IGC                        951 (84.8%)           IIIIIIIIIIIIIIII      1122       0        
##      [character]                   2. NCH                        171 (15.2%)           III                   (100.0%)   (0.0%)   
## 
## 4    HISTOLOGICAL_DIAGNOSIS        1. (Empty string)              75 ( 6.7%)           I                     1122       0        
##      [character]                   2. astrocytoma                169 (15.1%)           III                   (100.0%)   (0.0%)   
##                                    3. glioblastoma               590 (52.6%)           IIIIIIIIII                                
##                                    4. oligoastrocytoma           114 (10.2%)           II                                        
##                                    5. oligodendroglioma          174 (15.5%)           III                                       
## 
## 5    AGE                           Mean (sd) : 51.4 (15.8)       75 distinct values              : :         1047       75       
##      [numeric]                     min < med < max:                                          : . : :         (93.3%)    (6.7%)   
##                                    10 < 52 < 89                                            : : : : : :                           
##                                    IQR (CV) : 24 (0.3)                                     : : : : : : :                         
##                                                                                          : : : : : : : : .                       
## 
## 6    SEX                           1. (Empty string)              75 ( 6.7%)           I                     1122       0        
##      [character]                   2. Female                     429 (38.2%)           IIIIIII               (100.0%)   (0.0%)   
##                                    3. Male                       618 (55.1%)           IIIIIIIIIII                               
## 
## 7    OS_MONTHS                     Mean (sd) : 19.1 (24.2)       417 distinct values   :                     1047       75       
##      [numeric]                     min < med < max:                                    :                     (93.3%)    (6.7%)   
##                                    0 < 11.7 < 211                                      :                                         
##                                    IQR (CV) : 15.7 (1.3)                               :                                         
##                                                                                        : : .                                     
## 
## 8    OS_STATUS                     1. (Empty string)              76 ( 6.8%)           I                     1122       0        
##      [character]                   2. 0:LIVING                   522 (46.5%)           IIIIIIIII             (100.0%)   (0.0%)   
##                                    3. 1:DECEASED                 524 (46.7%)           IIIIIIIII                                 
## 
## 9    KARNOFSKY_PERFORMANCE_SCORE   Mean (sd) : 81.3 (14.5)       20 :   2 ( 0.3%)                            697        425      
##      [numeric]                     min < med < max:              40 :  18 ( 2.6%)                            (62.1%)    (37.9%)  
##                                    20 < 80 < 100                 50 :   4 ( 0.6%)                                                
##                                    IQR (CV) : 10 (0.2)           60 :  95 (13.6%)      II                                        
##                                                                  70 :  26 ( 3.7%)                                                
##                                                                  80 : 289 (41.5%)      IIIIIIII                                  
##                                                                  90 : 123 (17.6%)      III                                       
##                                                                  100 : 140 (20.1%)     IIII                                      
## ---------------------------------------------------------------------------------------------------------------------------------
# Install the gtsummary package if you don't have it installed yet
# install.packages("gtsummary")

library(gtsummary)
library(dplyr)
# Create a summary table excluding PATIENT_ID column
table_summary <- pts %>%
  select(-PATIENT_ID) %>%  # Exclude the PATIENT_ID column
  tbl_summary(by = "HISTOLOGICAL_DIAGNOSIS")

# Print the summary table
table_summary
Characteristic

N = 751

astrocytoma
N = 169
1
glioblastoma
N = 590
1
oligoastrocytoma
N = 114
1
oligodendroglioma
N = 174
1
STUDY




    Brain Lower Grade Glioma 59 (79%) 169 (100%) 0 (0%) 114 (100%) 174 (100%)
    Glioblastoma multiforme 16 (21%) 0 (0%) 590 (100%) 0 (0%) 0 (0%)
BCR_STATUS




    IGC 34 (45%) 113 (67%) 585 (99%) 88 (77%) 131 (75%)
    NCH 41 (55%) 56 (33%) 5 (0.8%) 26 (23%) 43 (25%)
AGE NA (NA, NA) 39 (33, 51) 59 (50, 68) 38 (30, 51) 46 (34, 54)
    Unknown 75 0 0 0 0
SEX




     75 (100%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    Female 0 (0%) 74 (44%) 228 (39%) 51 (45%) 76 (44%)
    Male 0 (0%) 95 (56%) 362 (61%) 63 (55%) 98 (56%)
OS_MONTHS NA (NA, NA) 12 (6, 22) 11 (5, 18) 16 (6, 30) 16 (5, 34)
    Unknown 75 0 0 0 0
OS_STATUS




     75 (100%) 0 (0%) 1 (0.2%) 0 (0%) 0 (0%)
    0:LIVING 0 (0%) 136 (80%) 145 (25%) 96 (84%) 145 (83%)
    1:DECEASED 0 (0%) 33 (20%) 444 (75%) 18 (16%) 29 (17%)
KARNOFSKY_PERFORMANCE_SCORE




    20 0 (NA%) 0 (0%) 2 (0.5%) 0 (0%) 0 (0%)
    40 0 (NA%) 1 (1.0%) 16 (3.7%) 0 (0%) 1 (1.1%)
    50 0 (NA%) 2 (1.9%) 0 (0%) 0 (0%) 2 (2.2%)
    60 0 (NA%) 4 (3.9%) 85 (20%) 2 (2.9%) 4 (4.4%)
    70 0 (NA%) 6 (5.8%) 7 (1.6%) 3 (4.4%) 10 (11%)
    80 0 (NA%) 20 (19%) 248 (57%) 10 (15%) 11 (12%)
    90 0 (NA%) 41 (40%) 12 (2.8%) 37 (54%) 33 (36%)
    100 0 (NA%) 29 (28%) 65 (15%) 16 (24%) 30 (33%)
    Unknown 75 66 155 46 83
1 n (%); Median (Q1, Q3)
pts %>%
  filter(!is.na(HISTOLOGICAL_DIAGNOSIS)) %>% 
  filter(HISTOLOGICAL_DIAGNOSIS!="") %>% 
  select(-PATIENT_ID) %>% 
  tbl_summary(by=HISTOLOGICAL_DIAGNOSIS ) %>% 
  add_n() %>% 
  add_p()
## The following errors were returned during `add_p()`:
## ✖ For variable `KARNOFSKY_PERFORMANCE_SCORE` (`HISTOLOGICAL_DIAGNOSIS`) and
##   "estimate", "p.value", "conf.low", and "conf.high" statistics:
##   FEXACT[f3xact()] error: hash key 6e+09 > INT_MAX, kyy=434, it[i (= nco = 5)]=
##   0. Rather set 'simulate.p.value=TRUE'
## ✖ For variable `OS_STATUS` (`HISTOLOGICAL_DIAGNOSIS`) and "estimate",
##   "p.value", "conf.low", and "conf.high" statistics: FEXACT error 7(location).
##   LDSTP=18480 is too small for this problem, (pastp=153.626,
##   ipn_0:=ipoin[itp=140]=549, stp[ipn_0]=153.701). Increase workspace or
##   consider using 'simulate.p.value=TRUE'
Characteristic N astrocytoma
N = 169
1
glioblastoma
N = 590
1
oligoastrocytoma
N = 114
1
oligodendroglioma
N = 174
1
p-value2
STUDY 1,047



<0.001
    Brain Lower Grade Glioma
169 (100%) 0 (0%) 114 (100%) 174 (100%)
    Glioblastoma multiforme
0 (0%) 590 (100%) 0 (0%) 0 (0%)
BCR_STATUS 1,047



<0.001
    IGC
113 (67%) 585 (99%) 88 (77%) 131 (75%)
    NCH
56 (33%) 5 (0.8%) 26 (23%) 43 (25%)
AGE 1,047 39 (33, 51) 59 (50, 68) 38 (30, 51) 46 (34, 54) <0.001
SEX 1,047



0.4
    Female
74 (44%) 228 (39%) 51 (45%) 76 (44%)
    Male
95 (56%) 362 (61%) 63 (55%) 98 (56%)
OS_MONTHS 1,047 12 (6, 22) 11 (5, 18) 16 (6, 30) 16 (5, 34) <0.001
OS_STATUS 1,047




    
0 (0%) 1 (0.2%) 0 (0%) 0 (0%)
    0:LIVING
136 (80%) 145 (25%) 96 (84%) 145 (83%)
    1:DECEASED
33 (20%) 444 (75%) 18 (16%) 29 (17%)
KARNOFSKY_PERFORMANCE_SCORE 697




    20
0 (0%) 2 (0.5%) 0 (0%) 0 (0%)
    40
1 (1.0%) 16 (3.7%) 0 (0%) 1 (1.1%)
    50
2 (1.9%) 0 (0%) 0 (0%) 2 (2.2%)
    60
4 (3.9%) 85 (20%) 2 (2.9%) 4 (4.4%)
    70
6 (5.8%) 7 (1.6%) 3 (4.4%) 10 (11%)
    80
20 (19%) 248 (57%) 10 (15%) 11 (12%)
    90
41 (40%) 12 (2.8%) 37 (54%) 33 (36%)
    100
29 (28%) 65 (15%) 16 (24%) 30 (33%)
    Unknown
66 155 46 83
1 n (%); Median (Q1, Q3)
2 Pearson’s Chi-squared test; Kruskal-Wallis rank sum test

“Differential Genetic and Clinical Profiles of Low-Grade Gliomas and Glioblastoma: Implications for Diagnosis and Treatment”

Tables and Figures to Include: Tables:

Demographic and Clinical Characteristics of Patients:

Table including age, sex, diagnosis, treatment history, and clinical outcomes for both LGG and GBM patients. Frequency of Genetic Mutations:

Table listing key mutations (e.g., IDH1, TP53, ATRX) and their frequencies in LGG versus GBM. Copy-Number Alterations Comparison:

A detailed table showing the frequency of specific CNAs like CDKN2A/B loss or EGFR amplification in LGG and GBM. Survival Data:

Summary table of median survival times, along with the results from Kaplan-Meier analysis and Cox model outputs, stratified by LGG and GBM. Treatment Response Rates:

Table summarizing treatment types and response rates (complete response, partial response, stable disease, progression) for each tumor type. Figures:

Kaplan-Meier Survival Curves:

A figure displaying survival curves for LGG and GBM, highlighting significant differences in overall survival. Bar Graph of Mutational Frequencies:

A graph illustrating the percentage of LGG and GBM cases with mutations in key genes. Heatmap of Gene Expression Profiles:

Display differential gene expression patterns between LGG and GBM, focusing on pathways involved in cell cycle, apoptosis, and tumor progression. CNA Profile Plots:

Stacked bar charts or line plots showing the frequency of major CNAs across chromosome locations for both LGG and GBM. Box Plots of Biomarker Levels:

Compare the expression levels of potential biomarkers (e.g., proteins, RNA) in LGG versus GBM, using box plots to show distribution and outliers. Pie Charts of Treatment Distribution:

Pie charts showing the proportion of different treatment modalities used in LGG and GBM patients. Scatter Plots of Prognostic Biomarkers vs. Survival:

Scatter plots correlating key prognostic biomarkers with patient survival times, differentiated by tumor type. Additional Visual Aipts:

Pathway Diagrams: Diagrams illustrating key signaling pathways that are differently regulated in LGG and GBM, potentially based on transcriptomic data. Cluster Analysis: Dendrograms or other clustering figures to show how LGG and GBM group based on genetic or molecular profiles.