Do remove the knitr::opts_chunk$set()

One hashtag

Two

Three

  • Bullet1
  • Bullet 2

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

ds <- mtcars
class(ds)
## [1] "data.frame"
dim(ds)
## [1] 32 11
head(ds)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
colnames(ds)
##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"
summary(ds)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
# View(ds)
str(ds)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Average miles per gal for cares with 4 cylinders

# Calculate the average mpg for cars with 4 cylinders
avg_mpg_4_cyl <- mean(ds$mpg[ds$cyl == 4])

# Print the result
print(avg_mpg_4_cyl)
## [1] 26.66364
for (n in 1:10){
  print("hello")
  print(n*3+5)
  
}
## [1] "hello"
## [1] 8
## [1] "hello"
## [1] 11
## [1] "hello"
## [1] 14
## [1] "hello"
## [1] 17
## [1] "hello"
## [1] 20
## [1] "hello"
## [1] 23
## [1] "hello"
## [1] 26
## [1] "hello"
## [1] 29
## [1] "hello"
## [1] 32
## [1] "hello"
## [1] 35
multiply_and_add <- function(x){
  x*3 + 5
}
multiply_and_add(6)
## [1] 23
for (n in 1:10){print(multiply_and_add(n))}
## [1] 8
## [1] 11
## [1] 14
## [1] 17
## [1] 20
## [1] 23
## [1] 26
## [1] 29
## [1] 32
## [1] 35
vec<- 1:10
lapply(vec, multiply_and_add)
## [[1]]
## [1] 8
## 
## [[2]]
## [1] 11
## 
## [[3]]
## [1] 14
## 
## [[4]]
## [1] 17
## 
## [[5]]
## [1] 20
## 
## [[6]]
## [1] 23
## 
## [[7]]
## [1] 26
## 
## [[8]]
## [1] 29
## 
## [[9]]
## [1] 32
## 
## [[10]]
## [1] 35
ds <- read.csv("/cloud/project/students/Lung_Cancer_Patients_clean.csv")
dim(ds)
## [1] 100  15
head(ds)
##   Patient.ID Date.of.Birth Date.of.Diagnosis Nationality
## 1     PT-001     3/24/1968         3/26/2019      German
## 2     PT-002    12/21/1943        11/24/2013    Canadian
## 3     PT-003      5/1/1964          5/4/2017     Chinese
## 4     PT-004     8/17/1969        11/25/2021     Chinese
## 5     PT-005     9/21/1940         6/13/2011    American
## 6     PT-006    12/19/1964         2/14/2023            
##   Hemoglobin.at.Diagnosis Weight..kg.          Histopathology     Stage
## 1                    15.3        53.6 Squamous Cell Carcinoma  Stage II
## 2                    14.7        97.9          Adenocarcinoma   Stage I
## 3                    16.0        53.0          Adenocarcinoma   Stage I
## 4                    15.8        51.1  Small Cell Lung Cancer Stage III
## 5                    11.2        59.8          Adenocarcinoma  Stage II
## 6                    16.3        88.7    Large Cell Carcinoma Stage IV 
##   Surgical.Resection Chemotherapy Immunotherapy       Event Date.of.Event Dead
## 1           Complete           No            No        None                 No
## 2            Partial           No            No  Recurrence     3/26/2015   No
## 3           Complete           No           Yes Progression     1/31/2021  Yes
## 4           Complete          Yes           Yes        None                 No
## 5           Complete          Yes            No  Recurrence      6/8/2016  Yes
## 6           Complete          Yes            No  Recurrence     4/24/2024  Yes
##   Date.of.Death.or.Last.Follow.up
## 1                       9/28/2024
## 2                       1/17/2024
## 3                       9/17/2024
## 4                       8/14/2024
## 5                       3/22/2024
## 6                       9/15/2024
colnames(ds)
##  [1] "Patient.ID"                      "Date.of.Birth"                  
##  [3] "Date.of.Diagnosis"               "Nationality"                    
##  [5] "Hemoglobin.at.Diagnosis"         "Weight..kg."                    
##  [7] "Histopathology"                  "Stage"                          
##  [9] "Surgical.Resection"              "Chemotherapy"                   
## [11] "Immunotherapy"                   "Event"                          
## [13] "Date.of.Event"                   "Dead"                           
## [15] "Date.of.Death.or.Last.Follow.up"
summary(ds)
##   Patient.ID        Date.of.Birth      Date.of.Diagnosis  Nationality       
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Hemoglobin.at.Diagnosis  Weight..kg.    Histopathology        Stage          
##  Min.   : 10.10          Min.   :50.50   Length:100         Length:100        
##  1st Qu.: 11.90          1st Qu.:59.15   Class :character   Class :character  
##  Median : 13.70          Median :73.40   Mode  :character   Mode  :character  
##  Mean   : 15.25          Mean   :74.08                                        
##  3rd Qu.: 15.85          3rd Qu.:87.50                                        
##  Max.   :150.00          Max.   :99.20                                        
##  NA's   :1               NA's   :1                                            
##  Surgical.Resection Chemotherapy       Immunotherapy         Event          
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Date.of.Event          Dead           Date.of.Death.or.Last.Follow.up
##  Length:100         Length:100         Length:100                     
##  Class :character   Class :character   Class :character               
##  Mode  :character   Mode  :character   Mode  :character               
##                                                                       
##                                                                       
##                                                                       
## 
str(ds)
## 'data.frame':    100 obs. of  15 variables:
##  $ Patient.ID                     : chr  "PT-001" "PT-002" "PT-003" "PT-004" ...
##  $ Date.of.Birth                  : chr  "3/24/1968" "12/21/1943" "5/1/1964" "8/17/1969" ...
##  $ Date.of.Diagnosis              : chr  "3/26/2019" "11/24/2013" "5/4/2017" "11/25/2021" ...
##  $ Nationality                    : chr  "German" "Canadian" "Chinese" "Chinese" ...
##  $ Hemoglobin.at.Diagnosis        : num  15.3 14.7 16 15.8 11.2 16.3 14.5 16 17.4 10.6 ...
##  $ Weight..kg.                    : num  53.6 97.9 53 51.1 59.8 88.7 73.8 78.7 98.1 97.2 ...
##  $ Histopathology                 : chr  "Squamous Cell Carcinoma" "Adenocarcinoma" "Adenocarcinoma" "Small Cell Lung Cancer" ...
##  $ Stage                          : chr  "Stage II" "Stage I" "Stage I" "Stage III" ...
##  $ Surgical.Resection             : chr  "Complete" "Partial" "Complete" "Complete" ...
##  $ Chemotherapy                   : chr  "No" "No" "No" "Yes" ...
##  $ Immunotherapy                  : chr  "No" "No" "Yes" "Yes" ...
##  $ Event                          : chr  "None" "Recurrence" "Progression" "None" ...
##  $ Date.of.Event                  : chr  "" "3/26/2015" "1/31/2021" "" ...
##  $ Dead                           : chr  "No" "No" "Yes" "No" ...
##  $ Date.of.Death.or.Last.Follow.up: chr  "9/28/2024" "1/17/2024" "9/17/2024" "8/14/2024" ...
janitor::clean_names(ds)->ds
colnames(ds)
##  [1] "patient_id"                      "date_of_birth"                  
##  [3] "date_of_diagnosis"               "nationality"                    
##  [5] "hemoglobin_at_diagnosis"         "weight_kg"                      
##  [7] "histopathology"                  "stage"                          
##  [9] "surgical_resection"              "chemotherapy"                   
## [11] "immunotherapy"                   "event"                          
## [13] "date_of_event"                   "dead"                           
## [15] "date_of_death_or_last_follow_up"
str(ds)
## 'data.frame':    100 obs. of  15 variables:
##  $ patient_id                     : chr  "PT-001" "PT-002" "PT-003" "PT-004" ...
##  $ date_of_birth                  : chr  "3/24/1968" "12/21/1943" "5/1/1964" "8/17/1969" ...
##  $ date_of_diagnosis              : chr  "3/26/2019" "11/24/2013" "5/4/2017" "11/25/2021" ...
##  $ nationality                    : chr  "German" "Canadian" "Chinese" "Chinese" ...
##  $ hemoglobin_at_diagnosis        : num  15.3 14.7 16 15.8 11.2 16.3 14.5 16 17.4 10.6 ...
##  $ weight_kg                      : num  53.6 97.9 53 51.1 59.8 88.7 73.8 78.7 98.1 97.2 ...
##  $ histopathology                 : chr  "Squamous Cell Carcinoma" "Adenocarcinoma" "Adenocarcinoma" "Small Cell Lung Cancer" ...
##  $ stage                          : chr  "Stage II" "Stage I" "Stage I" "Stage III" ...
##  $ surgical_resection             : chr  "Complete" "Partial" "Complete" "Complete" ...
##  $ chemotherapy                   : chr  "No" "No" "No" "Yes" ...
##  $ immunotherapy                  : chr  "No" "No" "Yes" "Yes" ...
##  $ event                          : chr  "None" "Recurrence" "Progression" "None" ...
##  $ date_of_event                  : chr  "" "3/26/2015" "1/31/2021" "" ...
##  $ dead                           : chr  "No" "No" "Yes" "No" ...
##  $ date_of_death_or_last_follow_up: chr  "9/28/2024" "1/17/2024" "9/17/2024" "8/14/2024" ...
# Step 1: Convert date columns to Date format
ds$date_of_birth <- as.Date(ds$date_of_birth, format = "%m/%d/%Y")
ds$date_of_diagnosis <- as.Date(ds$date_of_diagnosis, format = "%m/%d/%Y")
ds$date_of_event <- as.Date(ds$date_of_event, format = "%m/%d/%Y")
ds$date_of_death_or_last_follow_up <- as.Date(ds$date_of_death_or_last_follow_up, format = "%m/%d/%Y")

# Step 2: Handle missing values (example: impute hemoglobin)
ds$hemoglobin_at_diagnosis[is.na(ds$hemoglobin_at_diagnosis)] <- mean(ds$hemoglobin_at_diagnosis, na.rm = TRUE)
# one value of hemoglobin was missing and was imputed using the mean of the column

# text 
## make all small or capital letter
# Convert selected columns to lowercase (or use toupper() for uppercase)
ds$nationality <- tolower(ds$nationality)  # or toupper(ds$nationality)
ds$stage <- tolower(ds$stage)              # or toupper(ds$stage)
ds$event <- tolower(ds$event)              # or toupper(ds$event)
ds$dead <- tolower(ds$dead)                # or toupper(ds$dead)

## remove stage from stages column
# Remove the word "Stage" from the stage column
ds$stage <- gsub("Stage ", "", ds$stage, ignore.case = TRUE)
ds$stage <- gsub("Stge ", "", ds$stage, ignore.case = TRUE)

## remove all spaces from stage, event, death, nationality
# Step 3: Remove all spaces from stage, event, dead, and nationality columns
ds$stage <- gsub(" ", "", ds$stage)
ds$event <- gsub(" ", "", ds$event)
ds$dead <- gsub(" ", "", ds$dead)
ds$nationality <- gsub(" ", "", ds$nationality)


# Step 3: Convert categorical variables to factors
ds$nationality <- as.factor(ds$nationality)
ds$histopathology <- as.factor(ds$histopathology)
ds$stage <- as.factor(ds$stage)
ds$surgical_resection <- as.factor(ds$surgical_resection)
ds$chemotherapy <- as.factor(ds$chemotherapy)
ds$immunotherapy <- as.factor(ds$immunotherapy)
ds$dead <- as.factor(ds$dead)
ds$event<- as.factor(ds$event)

# check for duplication
# Check for duplicated rows
duplicated_rows <- duplicated(ds$patient_id)
print(sum(duplicated_rows))
## [1] 1
# Display TRUE if there are duplicated rows, FALSE otherwise
ds[which(duplicated_rows), "patient_id"]<- "PT-021"


# we should convert all text to upper or lower case

# we did not check for outliers
summary(ds)
##   patient_id        date_of_birth        date_of_diagnosis       nationality
##  Length:100         Min.   :1935-04-19   Min.   :1992-12-04   canadian :15  
##  Class :character   1st Qu.:1943-08-21   1st Qu.:2007-11-23   french   :13  
##  Mode  :character   Median :1953-01-13   Median :2014-11-22   italian  :11  
##                     Mean   :1953-05-09   Mean   :2013-07-09   brazilian:10  
##                     3rd Qu.:1962-01-15   3rd Qu.:2019-12-22   british  : 9  
##                     Max.   :1973-12-15   Max.   :2033-01-26   chinese  : 9  
##                     NA's   :1                                 (Other)  :33  
##  hemoglobin_at_diagnosis   weight_kg                     histopathology
##  Min.   : 10.10          Min.   :50.50   Adenocarcinoma         :23    
##  1st Qu.: 11.95          1st Qu.:59.15   Large Cell Carcinoma   :28    
##  Median : 13.80          Median :73.40   Small Cell Lung Cancer :27    
##  Mean   : 15.25          Mean   :74.08   Squamous Cell Carcinoma:22    
##  3rd Qu.: 15.82          3rd Qu.:87.50                                 
##  Max.   :150.00          Max.   :99.20                                 
##                          NA's   :1                                     
##  stage    surgical_resection chemotherapy immunotherapy         event   
##  i  :28   Complete:29        No :53       No :42        none       :36  
##  ii :28   None    :39        Yes:45       Yes:57        progression:34  
##  iii:23   Partial :32        YES: 2       YES: 1        recurrence :30  
##  iv :21                                                                 
##                                                                         
##                                                                         
##                                                                         
##  date_of_event         dead    date_of_death_or_last_follow_up
##  Min.   :1994-03-13   no :47   Min.   :2024-01-05             
##  1st Qu.:2010-08-09   yes:53   1st Qu.:2024-03-15             
##  Median :2016-08-17            Median :2024-05-16             
##  Mean   :2015-05-30            Mean   :2024-05-25             
##  3rd Qu.:2021-08-03            3rd Qu.:2024-08-12             
##  Max.   :2027-03-21            Max.   :2024-10-27             
##  NA's   :36
summarytools::dfSummary(ds)
## Data Frame Summary  
## ds  
## Dimensions: 100 x 15  
## Duplicates: 0  
## 
## -----------------------------------------------------------------------------------------------------------------------------------
## No   Variable                          Stats / Values               Freqs (% of Valid)   Graph                 Valid      Missing  
## ---- --------------------------------- ---------------------------- -------------------- --------------------- ---------- ---------
## 1    patient_id                        1. PT-001                     1 ( 1.0%)                                 100        0        
##      [character]                       2. PT-002                     1 ( 1.0%)                                 (100.0%)   (0.0%)   
##                                        3. PT-003                     1 ( 1.0%)                                                     
##                                        4. PT-004                     1 ( 1.0%)                                                     
##                                        5. PT-005                     1 ( 1.0%)                                                     
##                                        6. PT-006                     1 ( 1.0%)                                                     
##                                        7. PT-007                     1 ( 1.0%)                                                     
##                                        8. PT-008                     1 ( 1.0%)                                                     
##                                        9. PT-009                     1 ( 1.0%)                                                     
##                                        10. PT-010                    1 ( 1.0%)                                                     
##                                        [ 90 others ]                90 (90.0%)           IIIIIIIIIIIIIIIIII                        
## 
## 2    date_of_birth                     min : 1935-04-19             99 distinct values     : .     .           99         1        
##      [Date]                            med : 1953-01-13                                    : : : : : :         (99.0%)    (1.0%)   
##                                        max : 1973-12-15                                    : : : : : :                             
##                                        range : 38y 7m 26d                                  : : : : : : :                           
##                                                                                          : : : : : : : :                           
## 
## 3    date_of_diagnosis                 min : 1992-12-04             99 distinct values           : :           100        0        
##      [Date]                            med : 2014-11-22                                          : :           (100.0%)   (0.0%)   
##                                        max : 2033-01-26                                      : : : :                               
##                                        range : 40y 1m 22d                                    : : : :                               
##                                                                                          : : : : : :                               
## 
## 4    nationality                       1. (Empty string)             2 ( 2.0%)                                 100        0        
##      [factor]                          2. american                   7 ( 7.0%)           I                     (100.0%)   (0.0%)   
##                                        3. brazilian                 10 (10.0%)           II                                        
##                                        4. british                    9 ( 9.0%)           I                                         
##                                        5. canadian                  15 (15.0%)           III                                       
##                                        6. chinese                    9 ( 9.0%)           I                                         
##                                        7. french                    13 (13.0%)           II                                        
##                                        8. german                     7 ( 7.0%)           I                                         
##                                        9. indian                     9 ( 9.0%)           I                                         
##                                        10. italian                  11 (11.0%)           II                                        
##                                        11. japanese                  8 ( 8.0%)           I                                         
## 
## 5    hemoglobin_at_diagnosis           Mean (sd) : 15.3 (13.8)      56 distinct values   :                     100        0        
##      [numeric]                         min < med < max:                                  :                     (100.0%)   (0.0%)   
##                                        10.1 < 13.8 < 150                                 :                                         
##                                        IQR (CV) : 3.9 (0.9)                              :                                         
##                                                                                          :                                         
## 
## 6    weight_kg                         Mean (sd) : 74.1 (15.4)      91 distinct values   :                 .   99         1        
##      [numeric]                         min < med < max:                                  : :     .     .   :   (99.0%)    (1.0%)   
##                                        50.5 < 73.4 < 99.2                                : : . : :   . : . :                       
##                                        IQR (CV) : 28.4 (0.2)                             : : : : : . : : : :                       
##                                                                                          : : : : : : : : : :                       
## 
## 7    histopathology                    1. Adenocarcinoma            23 (23.0%)           IIII                  100        0        
##      [factor]                          2. Large Cell Carcinoma      28 (28.0%)           IIIII                 (100.0%)   (0.0%)   
##                                        3. Small Cell Lung Cancer    27 (27.0%)           IIIII                                     
##                                        4. Squamous Cell Carcinoma   22 (22.0%)           IIII                                      
## 
## 8    stage                             1. i                         28 (28.0%)           IIIII                 100        0        
##      [factor]                          2. ii                        28 (28.0%)           IIIII                 (100.0%)   (0.0%)   
##                                        3. iii                       23 (23.0%)           IIII                                      
##                                        4. iv                        21 (21.0%)           IIII                                      
## 
## 9    surgical_resection                1. Complete                  29 (29.0%)           IIIII                 100        0        
##      [factor]                          2. None                      39 (39.0%)           IIIIIII               (100.0%)   (0.0%)   
##                                        3. Partial                   32 (32.0%)           IIIIII                                    
## 
## 10   chemotherapy                      1. No                        53 (53.0%)           IIIIIIIIII            100        0        
##      [factor]                          2. Yes                       45 (45.0%)           IIIIIIIII             (100.0%)   (0.0%)   
##                                        3. YES                        2 ( 2.0%)                                                     
## 
## 11   immunotherapy                     1. No                        42 (42.0%)           IIIIIIII              100        0        
##      [factor]                          2. Yes                       57 (57.0%)           IIIIIIIIIII           (100.0%)   (0.0%)   
##                                        3. YES                        1 ( 1.0%)                                                     
## 
## 12   event                             1. none                      36 (36.0%)           IIIIIII               100        0        
##      [factor]                          2. progression               34 (34.0%)           IIIIII                (100.0%)   (0.0%)   
##                                        3. recurrence                30 (30.0%)           IIIIII                                    
## 
## 13   date_of_event                     min : 1994-03-13             64 distinct values             :           64         36       
##      [Date]                            med : 2016-08-17                                        . : :           (64.0%)    (36.0%)  
##                                        max : 2027-03-21                                        : : :                               
##                                        range : 33y 0m 8d                                   . : : : :                               
##                                                                                          : : : : : : .                             
## 
## 14   dead                              1. no                        47 (47.0%)           IIIIIIIII             100        0        
##      [factor]                          2. yes                       53 (53.0%)           IIIIIIIIII            (100.0%)   (0.0%)   
## 
## 15   date_of_death_or_last_follow_up   min : 2024-01-05             84 distinct values     :       .           100        0        
##      [Date]                            med : 2024-05-16                                  : : :   : :           (100.0%)   (0.0%)   
##                                        max : 2024-10-27                                  : : : : : :                               
##                                        range : 9m 22d                                    : : : : : :                               
##                                                                                          : : : : : :                               
## -----------------------------------------------------------------------------------------------------------------------------------
gptr::get_response(user_input = "What are redox reactions?",
                        system_specification = "You are a knowledgeable and helpful chemist \n
                        who will answer any questions in English",
                   model="gpt-4o-mini",
                   api_key="make your key")
## $error
## $error$message
## [1] "Incorrect API key provided: make you* key. You can find your API key at https://platform.openai.com/account/api-keys."
## 
## $error$type
## [1] "invalid_request_error"
## 
## $error$param
## NULL
## 
## $error$code
## [1] "invalid_api_key"
# you have to learn data transformation using dplyr/tidyr,  ask gpt to teach you
# you have to learn to pipe
ds %>% 
  dplyr::select(-patient_id) %>% 
  tbl_summary() 
Characteristic N = 1001
date_of_birth 1935-04-19 to 1973-12-15
    Unknown 1
date_of_diagnosis 1992-12-04 to 2033-01-26
nationality
     2 (2.0%)
    american 7 (7.0%)
    brazilian 10 (10%)
    british 9 (9.0%)
    canadian 15 (15%)
    chinese 9 (9.0%)
    french 13 (13%)
    german 7 (7.0%)
    indian 9 (9.0%)
    italian 11 (11%)
    japanese 8 (8.0%)
hemoglobin_at_diagnosis 13.80 (11.95, 15.83)
weight_kg 73 (59, 88)
    Unknown 1
histopathology
    Adenocarcinoma 23 (23%)
    Large Cell Carcinoma 28 (28%)
    Small Cell Lung Cancer 27 (27%)
    Squamous Cell Carcinoma 22 (22%)
stage
    i 28 (28%)
    ii 28 (28%)
    iii 23 (23%)
    iv 21 (21%)
surgical_resection
    Complete 29 (29%)
    None 39 (39%)
    Partial 32 (32%)
chemotherapy
    No 53 (53%)
    Yes 45 (45%)
    YES 2 (2.0%)
immunotherapy
    No 42 (42%)
    Yes 57 (57%)
    YES 1 (1.0%)
event
    none 36 (36%)
    progression 34 (34%)
    recurrence 30 (30%)
date_of_event 1994-03-13 to 2027-03-21
    Unknown 36
dead 53 (53%)
date_of_death_or_last_follow_up 2024-01-05 to 2024-10-27
1 Range; n (%); Median (IQR)
library(dplyr)
library(gtsummary)

ds %>%
  mutate(stage=toupper(stage)) %>% 
  # Exclude columns that have 'date' in their names
  dplyr::select(-contains("date")) %>%
  # Exclude 'patient_id'
  dplyr::select(-patient_id) %>%
  # Create summary table grouped by the 'stage' column
  tbl_summary(by = stage)
Characteristic I, N = 281 II, N = 281 III, N = 231 IV, N = 211
nationality



     0 (0%) 0 (0%) 0 (0%) 2 (9.5%)
    american 2 (7.1%) 4 (14%) 0 (0%) 1 (4.8%)
    brazilian 3 (11%) 2 (7.1%) 3 (13%) 2 (9.5%)
    british 4 (14%) 2 (7.1%) 2 (8.7%) 1 (4.8%)
    canadian 3 (11%) 6 (21%) 2 (8.7%) 4 (19%)
    chinese 4 (14%) 1 (3.6%) 3 (13%) 1 (4.8%)
    french 2 (7.1%) 2 (7.1%) 7 (30%) 2 (9.5%)
    german 2 (7.1%) 5 (18%) 0 (0%) 0 (0%)
    indian 2 (7.1%) 2 (7.1%) 1 (4.3%) 4 (19%)
    italian 3 (11%) 2 (7.1%) 4 (17%) 2 (9.5%)
    japanese 3 (11%) 2 (7.1%) 1 (4.3%) 2 (9.5%)
hemoglobin_at_diagnosis 13.75 (12.30, 15.78) 13.55 (11.95, 15.58) 13.40 (11.75, 15.50) 14.60 (12.30, 16.00)
weight_kg 72 (63, 83) 69 (58, 82) 85 (64, 93) 71 (59, 86)
    Unknown 0 1 0 0
histopathology



    Adenocarcinoma 9 (32%) 7 (25%) 3 (13%) 4 (19%)
    Large Cell Carcinoma 9 (32%) 5 (18%) 7 (30%) 7 (33%)
    Small Cell Lung Cancer 6 (21%) 8 (29%) 6 (26%) 7 (33%)
    Squamous Cell Carcinoma 4 (14%) 8 (29%) 7 (30%) 3 (14%)
surgical_resection



    Complete 7 (25%) 11 (39%) 5 (22%) 6 (29%)
    None 13 (46%) 6 (21%) 13 (57%) 7 (33%)
    Partial 8 (29%) 11 (39%) 5 (22%) 8 (38%)
chemotherapy



    No 12 (43%) 19 (68%) 11 (48%) 11 (52%)
    Yes 14 (50%) 9 (32%) 12 (52%) 10 (48%)
    YES 2 (7.1%) 0 (0%) 0 (0%) 0 (0%)
immunotherapy



    No 14 (50%) 13 (46%) 7 (30%) 8 (38%)
    Yes 13 (46%) 15 (54%) 16 (70%) 13 (62%)
    YES 1 (3.6%) 0 (0%) 0 (0%) 0 (0%)
event



    none 10 (36%) 14 (50%) 7 (30%) 5 (24%)
    progression 10 (36%) 8 (29%) 10 (43%) 6 (29%)
    recurrence 8 (29%) 6 (21%) 6 (26%) 10 (48%)
dead 17 (61%) 14 (50%) 12 (52%) 10 (48%)
1 n (%); Median (IQR)
ds %>%
  mutate(stage=toupper(stage)) %>% 
  # Exclude columns that have 'date' in their names
  dplyr::select(-contains("date")) %>%
  # Exclude 'patient_id'
  dplyr::select(-patient_id) %>%
  # Create summary table grouped by the 'stage' column
  tbl_summary(by = stage) %>% 
  add_p() %>% 
  add_overall()
Characteristic Overall, N = 1001 I, N = 281 II, N = 281 III, N = 231 IV, N = 211 p-value2
nationality





     2 (2.0%) 0 (0%) 0 (0%) 0 (0%) 2 (9.5%)
    american 7 (7.0%) 2 (7.1%) 4 (14%) 0 (0%) 1 (4.8%)
    brazilian 10 (10%) 3 (11%) 2 (7.1%) 3 (13%) 2 (9.5%)
    british 9 (9.0%) 4 (14%) 2 (7.1%) 2 (8.7%) 1 (4.8%)
    canadian 15 (15%) 3 (11%) 6 (21%) 2 (8.7%) 4 (19%)
    chinese 9 (9.0%) 4 (14%) 1 (3.6%) 3 (13%) 1 (4.8%)
    french 13 (13%) 2 (7.1%) 2 (7.1%) 7 (30%) 2 (9.5%)
    german 7 (7.0%) 2 (7.1%) 5 (18%) 0 (0%) 0 (0%)
    indian 9 (9.0%) 2 (7.1%) 2 (7.1%) 1 (4.3%) 4 (19%)
    italian 11 (11%) 3 (11%) 2 (7.1%) 4 (17%) 2 (9.5%)
    japanese 8 (8.0%) 3 (11%) 2 (7.1%) 1 (4.3%) 2 (9.5%)
hemoglobin_at_diagnosis 13.80 (11.95, 15.83) 13.75 (12.30, 15.78) 13.55 (11.95, 15.58) 13.40 (11.75, 15.50) 14.60 (12.30, 16.00) >0.9
weight_kg 73 (59, 88) 72 (63, 83) 69 (58, 82) 85 (64, 93) 71 (59, 86) 0.3
    Unknown 1 0 1 0 0
histopathology




0.7
    Adenocarcinoma 23 (23%) 9 (32%) 7 (25%) 3 (13%) 4 (19%)
    Large Cell Carcinoma 28 (28%) 9 (32%) 5 (18%) 7 (30%) 7 (33%)
    Small Cell Lung Cancer 27 (27%) 6 (21%) 8 (29%) 6 (26%) 7 (33%)
    Squamous Cell Carcinoma 22 (22%) 4 (14%) 8 (29%) 7 (30%) 3 (14%)
surgical_resection




0.3
    Complete 29 (29%) 7 (25%) 11 (39%) 5 (22%) 6 (29%)
    None 39 (39%) 13 (46%) 6 (21%) 13 (57%) 7 (33%)
    Partial 32 (32%) 8 (29%) 11 (39%) 5 (22%) 8 (38%)
chemotherapy




0.3
    No 53 (53%) 12 (43%) 19 (68%) 11 (48%) 11 (52%)
    Yes 45 (45%) 14 (50%) 9 (32%) 12 (52%) 10 (48%)
    YES 2 (2.0%) 2 (7.1%) 0 (0%) 0 (0%) 0 (0%)
immunotherapy




0.5
    No 42 (42%) 14 (50%) 13 (46%) 7 (30%) 8 (38%)
    Yes 57 (57%) 13 (46%) 15 (54%) 16 (70%) 13 (62%)
    YES 1 (1.0%) 1 (3.6%) 0 (0%) 0 (0%) 0 (0%)
event




0.4
    none 36 (36%) 10 (36%) 14 (50%) 7 (30%) 5 (24%)
    progression 34 (34%) 10 (36%) 8 (29%) 10 (43%) 6 (29%)
    recurrence 30 (30%) 8 (29%) 6 (21%) 6 (26%) 10 (48%)
dead 53 (53%) 17 (61%) 14 (50%) 12 (52%) 10 (48%) 0.8
1 n (%); Median (IQR)
2 Kruskal-Wallis rank sum test; Fisher’s exact test; Pearson’s Chi-squared test
library(dplyr)
library(gtsummary)

ds %>%
  mutate(stage = toupper(stage)) %>% 
  # Exclude columns that have 'date' in their names
  dplyr::select(-contains("date")) %>%
  # Exclude 'patient_id'
  dplyr::select(-patient_id) %>%
  # Create summary table grouped by the 'stage' column, use mean (SD) for numeric data
  tbl_summary(
    by = stage,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",  # Set mean and SD for continuous variables
      all_categorical() ~ "{n} ({p}%)"     # Default counts and percentages for categorical variables
    )
  ) %>% 
  add_p() %>%    # Add p-values for comparisons
  add_overall()  # Add an overall summary column
Characteristic Overall, N = 1001 I, N = 281 II, N = 281 III, N = 231 IV, N = 211 p-value2
nationality





     2 (2.0%) 0 (0%) 0 (0%) 0 (0%) 2 (9.5%)
    american 7 (7.0%) 2 (7.1%) 4 (14%) 0 (0%) 1 (4.8%)
    brazilian 10 (10%) 3 (11%) 2 (7.1%) 3 (13%) 2 (9.5%)
    british 9 (9.0%) 4 (14%) 2 (7.1%) 2 (8.7%) 1 (4.8%)
    canadian 15 (15%) 3 (11%) 6 (21%) 2 (8.7%) 4 (19%)
    chinese 9 (9.0%) 4 (14%) 1 (3.6%) 3 (13%) 1 (4.8%)
    french 13 (13%) 2 (7.1%) 2 (7.1%) 7 (30%) 2 (9.5%)
    german 7 (7.0%) 2 (7.1%) 5 (18%) 0 (0%) 0 (0%)
    indian 9 (9.0%) 2 (7.1%) 2 (7.1%) 1 (4.3%) 4 (19%)
    italian 11 (11%) 3 (11%) 2 (7.1%) 4 (17%) 2 (9.5%)
    japanese 8 (8.0%) 3 (11%) 2 (7.1%) 1 (4.3%) 2 (9.5%)
hemoglobin_at_diagnosis 15.25 (13.79) 18.61 (25.82) 13.98 (2.24) 13.70 (2.34) 14.19 (2.38) >0.9
weight_kg 74 (15) 74 (15) 71 (15) 79 (17) 73 (15) 0.3
    Unknown 1 0 1 0 0
histopathology




0.7
    Adenocarcinoma 23 (23%) 9 (32%) 7 (25%) 3 (13%) 4 (19%)
    Large Cell Carcinoma 28 (28%) 9 (32%) 5 (18%) 7 (30%) 7 (33%)
    Small Cell Lung Cancer 27 (27%) 6 (21%) 8 (29%) 6 (26%) 7 (33%)
    Squamous Cell Carcinoma 22 (22%) 4 (14%) 8 (29%) 7 (30%) 3 (14%)
surgical_resection




0.3
    Complete 29 (29%) 7 (25%) 11 (39%) 5 (22%) 6 (29%)
    None 39 (39%) 13 (46%) 6 (21%) 13 (57%) 7 (33%)
    Partial 32 (32%) 8 (29%) 11 (39%) 5 (22%) 8 (38%)
chemotherapy




0.3
    No 53 (53%) 12 (43%) 19 (68%) 11 (48%) 11 (52%)
    Yes 45 (45%) 14 (50%) 9 (32%) 12 (52%) 10 (48%)
    YES 2 (2.0%) 2 (7.1%) 0 (0%) 0 (0%) 0 (0%)
immunotherapy




0.5
    No 42 (42%) 14 (50%) 13 (46%) 7 (30%) 8 (38%)
    Yes 57 (57%) 13 (46%) 15 (54%) 16 (70%) 13 (62%)
    YES 1 (1.0%) 1 (3.6%) 0 (0%) 0 (0%) 0 (0%)
event




0.4
    none 36 (36%) 10 (36%) 14 (50%) 7 (30%) 5 (24%)
    progression 34 (34%) 10 (36%) 8 (29%) 10 (43%) 6 (29%)
    recurrence 30 (30%) 8 (29%) 6 (21%) 6 (26%) 10 (48%)
dead 53 (53%) 17 (61%) 14 (50%) 12 (52%) 10 (48%) 0.8
1 n (%); Mean (SD)
2 Kruskal-Wallis rank sum test; Fisher’s exact test; Pearson’s Chi-squared test
# Load required libraries
library(dplyr)
library(lubridate)

# Assuming the dataset is already cleaned and the date columns are in Date format
ds <- ds %>%
  # Step 1: Calculate age at diagnosis
  mutate(
    date_of_birth = as.Date(date_of_birth, format = "%m/%d/%Y"),
    date_of_diagnosis = as.Date(date_of_diagnosis, format = "%m/%d/%Y"),
    date_of_event = as.Date(date_of_event, format = "%m/%d/%Y"),
    date_of_death_or_last_follow_up = as.Date(date_of_death_or_last_follow_up, format = "%m/%d/%Y"),
    
    # Calculate age at diagnosis
    age_at_diagnosis = as.numeric(difftime(date_of_diagnosis, date_of_birth, units = "days")) / 365.25,
    
    # Step 2: Calculate event-free survival
    event_free_survival = if_else(
      !is.na(date_of_event), 
      as.numeric(difftime(date_of_event, date_of_diagnosis, units = "days")) / 365.25, 
      as.numeric(difftime(date_of_death_or_last_follow_up, date_of_diagnosis, units = "days")) / 365.25
    ),
    
    # Step 3: Calculate overall survival
    overall_survival = as.numeric(difftime(date_of_death_or_last_follow_up, date_of_diagnosis, units = "days")) / 365.25
  )

# # View the updated dataset with new columns
head(ds)
##   patient_id date_of_birth date_of_diagnosis nationality
## 1     PT-001    1968-03-24        2019-03-26      german
## 2     PT-002    1943-12-21        2013-11-24    canadian
## 3     PT-003    1964-05-01        2017-05-04     chinese
## 4     PT-004    1969-08-17        2021-11-25     chinese
## 5     PT-005    1940-09-21        2011-06-13    american
## 6     PT-006    1964-12-19        2023-02-14            
##   hemoglobin_at_diagnosis weight_kg          histopathology stage
## 1                    15.3      53.6 Squamous Cell Carcinoma    ii
## 2                    14.7      97.9          Adenocarcinoma     i
## 3                    16.0      53.0          Adenocarcinoma     i
## 4                    15.8      51.1  Small Cell Lung Cancer   iii
## 5                    11.2      59.8          Adenocarcinoma    ii
## 6                    16.3      88.7    Large Cell Carcinoma    iv
##   surgical_resection chemotherapy immunotherapy       event date_of_event dead
## 1           Complete           No            No        none          <NA>   no
## 2            Partial           No            No  recurrence    2015-03-26   no
## 3           Complete           No           Yes progression    2021-01-31  yes
## 4           Complete          Yes           Yes        none          <NA>   no
## 5           Complete          Yes            No  recurrence    2016-06-08  yes
## 6           Complete          Yes            No  recurrence    2024-04-24  yes
##   date_of_death_or_last_follow_up age_at_diagnosis event_free_survival
## 1                      2024-09-28         51.00342            5.511294
## 2                      2024-01-17         69.92745            1.333333
## 3                      2024-09-17         53.00753            3.745380
## 4                      2024-08-14         52.27379            2.718686
## 5                      2024-03-22         70.72416            4.988364
## 6                      2024-09-15         58.15469            1.190965
##   overall_survival
## 1         5.511294
## 2        10.146475
## 3         7.373032
## 4         2.718686
## 5        12.774812
## 6         1.585216
# Step 1: Install and load necessary libraries
# install.packages("finalfit")
# install.packages("survival")

# Load the required libraries
library(finalfit)
library(survival)
library(dplyr)
ds$dead<- ifelse(ds$dead=="yes",1,0)
outcome <- "Surv(overall_survival, dead==1)"  # Survival outcome in days
explanatory <- c("age_at_diagnosis", "chemotherapy", "immunotherapy", "weight_kg", "hemoglobin_at_diagnosis")


finalfit::finalfit (dependent=outcome, explanatory=explanatory, .data=ds)
##  Dependent: Surv(overall_survival, dead==1)                   all
##                            age_at_diagnosis Mean (SD)  60.1 (8.4)
##                                chemotherapy        No   53 (53.0)
##                                                   Yes   45 (45.0)
##                                                   YES     2 (2.0)
##                               immunotherapy        No   42 (42.0)
##                                                   Yes   57 (57.0)
##                                                   YES     1 (1.0)
##                                   weight_kg Mean (SD) 74.1 (15.4)
##                     hemoglobin_at_diagnosis Mean (SD) 15.3 (13.8)
##            HR (univariable)          HR (multivariable)
##   1.03 (0.99-1.07, p=0.096)   1.04 (0.99-1.08, p=0.087)
##                           -                           -
##   1.48 (0.82-2.65, p=0.189)   1.50 (0.81-2.77, p=0.200)
##  7.24 (1.63-32.21, p=0.009) 10.06 (2.11-47.88, p=0.004)
##                           -                           -
##   1.60 (0.90-2.85, p=0.112)   1.98 (1.07-3.68, p=0.030)
##  2.08 (0.27-15.79, p=0.479)  1.82 (0.21-15.50, p=0.584)
##   0.98 (0.96-1.00, p=0.087)   0.98 (0.96-1.00, p=0.105)
##   1.01 (0.99-1.02, p=0.280)   1.01 (0.99-1.02, p=0.405)
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
# Assuming your dataset is named 'ds' and contains the columns 'stage', 'chemotherapy', and 'immunotherapy'

# Prepare the data: summarizing the count of patients receiving chemotherapy and immunotherapy by stage
data_plot <- ds %>%
  group_by(stage) %>%
  dplyr::summarize(
    chemotherapy_count = sum(chemotherapy == "Yes", na.rm = TRUE),
    immunotherapy_count = sum(immunotherapy == "Yes", na.rm = TRUE)
  ) %>%
  pivot_longer(cols = c(chemotherapy_count, immunotherapy_count),
               names_to = "treatment_type", values_to = "count")

# Create the plot
ggplot(data_plot, aes(x = stage, y = count, fill = treatment_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    x = "Stage",
    y = "Count of Patients",
    fill = "Treatment Type"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("blue", "red"))