This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
ds <- mtcars
class(ds)
## [1] "data.frame"
dim(ds)
## [1] 32 11
head(ds)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
colnames(ds)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
summary(ds)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# View(ds)
str(ds)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Average miles per gallon for cars with 4 cylinders
# with() evaluates the expression inside ds, so columns are referenced by name.
avg_mpg_4_cyl <- with(ds, mean(mpg[cyl == 4]))
# Show the result
print(avg_mpg_4_cyl)
## [1] 26.66364
# Ten passes: print a greeting, then 3n + 5 for the current n.
for (n in seq_len(10)) {
  print("hello")
  print(3 * n + 5)
}
## [1] "hello"
## [1] 8
## [1] "hello"
## [1] 11
## [1] "hello"
## [1] 14
## [1] "hello"
## [1] 17
## [1] "hello"
## [1] 20
## [1] "hello"
## [1] 23
## [1] "hello"
## [1] 26
## [1] "hello"
## [1] 29
## [1] "hello"
## [1] 32
## [1] "hello"
## [1] 35
# Triple the input and add five.
# x: a numeric vector (the arithmetic is vectorised).
# Returns a numeric vector of the same length as x holding 3 * x + 5.
multiply_and_add <- function(x) {
  tripled <- 3 * x
  tripled + 5
}
multiply_and_add(6)
## [1] 23
# Print multiply_and_add(n) for n = 1..10, one value per line.
for (n in seq_len(10)) {
  print(multiply_and_add(n))
}
## [1] 8
## [1] 11
## [1] 14
## [1] 17
## [1] 20
## [1] 23
## [1] 26
## [1] 29
## [1] 32
## [1] 35
# Apply the function to each of 1..10; lapply() returns a list with one
# result per element.
vec <- seq_len(10)
lapply(vec, multiply_and_add)
## [[1]]
## [1] 8
##
## [[2]]
## [1] 11
##
## [[3]]
## [1] 14
##
## [[4]]
## [1] 17
##
## [[5]]
## [1] 20
##
## [[6]]
## [1] 23
##
## [[7]]
## [1] 26
##
## [[8]]
## [1] 29
##
## [[9]]
## [1] 32
##
## [[10]]
## [1] 35
# Load the lung cancer patient table from CSV into ds.
# NOTE(review): the path is absolute — presumably specific to the original workspace; confirm before running elsewhere
ds <- read.csv("/cloud/project/students/Lung_Cancer_Patients_clean.csv")
dim(ds)
## [1] 100 15
head(ds)
## Patient.ID Date.of.Birth Date.of.Diagnosis Nationality
## 1 PT-001 3/24/1968 3/26/2019 German
## 2 PT-002 12/21/1943 11/24/2013 Canadian
## 3 PT-003 5/1/1964 5/4/2017 Chinese
## 4 PT-004 8/17/1969 11/25/2021 Chinese
## 5 PT-005 9/21/1940 6/13/2011 American
## 6 PT-006 12/19/1964 2/14/2023
## Hemoglobin.at.Diagnosis Weight..kg. Histopathology Stage
## 1 15.3 53.6 Squamous Cell Carcinoma Stage II
## 2 14.7 97.9 Adenocarcinoma Stage I
## 3 16.0 53.0 Adenocarcinoma Stage I
## 4 15.8 51.1 Small Cell Lung Cancer Stage III
## 5 11.2 59.8 Adenocarcinoma Stage II
## 6 16.3 88.7 Large Cell Carcinoma Stage IV
## Surgical.Resection Chemotherapy Immunotherapy Event Date.of.Event Dead
## 1 Complete No No None No
## 2 Partial No No Recurrence 3/26/2015 No
## 3 Complete No Yes Progression 1/31/2021 Yes
## 4 Complete Yes Yes None No
## 5 Complete Yes No Recurrence 6/8/2016 Yes
## 6 Complete Yes No Recurrence 4/24/2024 Yes
## Date.of.Death.or.Last.Follow.up
## 1 9/28/2024
## 2 1/17/2024
## 3 9/17/2024
## 4 8/14/2024
## 5 3/22/2024
## 6 9/15/2024
colnames(ds)
## [1] "Patient.ID" "Date.of.Birth"
## [3] "Date.of.Diagnosis" "Nationality"
## [5] "Hemoglobin.at.Diagnosis" "Weight..kg."
## [7] "Histopathology" "Stage"
## [9] "Surgical.Resection" "Chemotherapy"
## [11] "Immunotherapy" "Event"
## [13] "Date.of.Event" "Dead"
## [15] "Date.of.Death.or.Last.Follow.up"
summary(ds)
## Patient.ID Date.of.Birth Date.of.Diagnosis Nationality
## Length:100 Length:100 Length:100 Length:100
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Hemoglobin.at.Diagnosis Weight..kg. Histopathology Stage
## Min. : 10.10 Min. :50.50 Length:100 Length:100
## 1st Qu.: 11.90 1st Qu.:59.15 Class :character Class :character
## Median : 13.70 Median :73.40 Mode :character Mode :character
## Mean : 15.25 Mean :74.08
## 3rd Qu.: 15.85 3rd Qu.:87.50
## Max. :150.00 Max. :99.20
## NA's :1 NA's :1
## Surgical.Resection Chemotherapy Immunotherapy Event
## Length:100 Length:100 Length:100 Length:100
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Date.of.Event Dead Date.of.Death.or.Last.Follow.up
## Length:100 Length:100 Length:100
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
str(ds)
## 'data.frame': 100 obs. of 15 variables:
## $ Patient.ID : chr "PT-001" "PT-002" "PT-003" "PT-004" ...
## $ Date.of.Birth : chr "3/24/1968" "12/21/1943" "5/1/1964" "8/17/1969" ...
## $ Date.of.Diagnosis : chr "3/26/2019" "11/24/2013" "5/4/2017" "11/25/2021" ...
## $ Nationality : chr "German" "Canadian" "Chinese" "Chinese" ...
## $ Hemoglobin.at.Diagnosis : num 15.3 14.7 16 15.8 11.2 16.3 14.5 16 17.4 10.6 ...
## $ Weight..kg. : num 53.6 97.9 53 51.1 59.8 88.7 73.8 78.7 98.1 97.2 ...
## $ Histopathology : chr "Squamous Cell Carcinoma" "Adenocarcinoma" "Adenocarcinoma" "Small Cell Lung Cancer" ...
## $ Stage : chr "Stage II" "Stage I" "Stage I" "Stage III" ...
## $ Surgical.Resection : chr "Complete" "Partial" "Complete" "Complete" ...
## $ Chemotherapy : chr "No" "No" "No" "Yes" ...
## $ Immunotherapy : chr "No" "No" "Yes" "Yes" ...
## $ Event : chr "None" "Recurrence" "Progression" "None" ...
## $ Date.of.Event : chr "" "3/26/2015" "1/31/2021" "" ...
## $ Dead : chr "No" "No" "Yes" "No" ...
## $ Date.of.Death.or.Last.Follow.up: chr "9/28/2024" "1/17/2024" "9/17/2024" "8/14/2024" ...
# Standardise the column names (lower case, words separated by underscores).
ds <- janitor::clean_names(ds)
colnames(ds)
## [1] "patient_id" "date_of_birth"
## [3] "date_of_diagnosis" "nationality"
## [5] "hemoglobin_at_diagnosis" "weight_kg"
## [7] "histopathology" "stage"
## [9] "surgical_resection" "chemotherapy"
## [11] "immunotherapy" "event"
## [13] "date_of_event" "dead"
## [15] "date_of_death_or_last_follow_up"
str(ds)
## 'data.frame': 100 obs. of 15 variables:
## $ patient_id : chr "PT-001" "PT-002" "PT-003" "PT-004" ...
## $ date_of_birth : chr "3/24/1968" "12/21/1943" "5/1/1964" "8/17/1969" ...
## $ date_of_diagnosis : chr "3/26/2019" "11/24/2013" "5/4/2017" "11/25/2021" ...
## $ nationality : chr "German" "Canadian" "Chinese" "Chinese" ...
## $ hemoglobin_at_diagnosis : num 15.3 14.7 16 15.8 11.2 16.3 14.5 16 17.4 10.6 ...
## $ weight_kg : num 53.6 97.9 53 51.1 59.8 88.7 73.8 78.7 98.1 97.2 ...
## $ histopathology : chr "Squamous Cell Carcinoma" "Adenocarcinoma" "Adenocarcinoma" "Small Cell Lung Cancer" ...
## $ stage : chr "Stage II" "Stage I" "Stage I" "Stage III" ...
## $ surgical_resection : chr "Complete" "Partial" "Complete" "Complete" ...
## $ chemotherapy : chr "No" "No" "No" "Yes" ...
## $ immunotherapy : chr "No" "No" "Yes" "Yes" ...
## $ event : chr "None" "Recurrence" "Progression" "None" ...
## $ date_of_event : chr "" "3/26/2015" "1/31/2021" "" ...
## $ dead : chr "No" "No" "Yes" "No" ...
## $ date_of_death_or_last_follow_up: chr "9/28/2024" "1/17/2024" "9/17/2024" "8/14/2024" ...
# Parse the four date columns, stored as month/day/year text, into Date objects.
# Values that do not match the format (including empty strings) become NA.
date_cols <- c(
  "date_of_birth",
  "date_of_diagnosis",
  "date_of_event",
  "date_of_death_or_last_follow_up"
)
ds[date_cols] <- lapply(ds[date_cols], as.Date, format = "%m/%d/%Y")
# Fill missing hemoglobin values with the mean of the observed values.
# One hemoglobin value was missing in this dataset and is imputed this way.
# NOTE(review): the summaries report a maximum of 150 for this column, which
# pulls the mean upward — confirm whether that value is a data-entry error.
ds$hemoglobin_at_diagnosis <- replace(
  ds$hemoglobin_at_diagnosis,
  is.na(ds$hemoglobin_at_diagnosis),
  mean(ds$hemoglobin_at_diagnosis, na.rm = TRUE)
)
# Normalise letter case so one answer is not split across several categories.
# Free-text columns are lower-cased (use toupper() instead for upper case).
ds$nationality <- tolower(ds$nationality)
ds$stage <- tolower(ds$stage)
ds$event <- tolower(ds$event)
ds$dead <- tolower(ds$dead)
# The treatment flags contain both "Yes" and "YES". Collapse them to a single
# capitalised spelling ("Yes" / "No") so they form one category each and later
# comparisons against "Yes" keep working. With perl = TRUE, "\\U" upper-cases
# the captured first letter; NA values stay NA.
ds$chemotherapy <- sub(
  "^([a-z])", "\\U\\1", tolower(ds$chemotherapy),
  perl = TRUE
)
ds$immunotherapy <- sub(
  "^([a-z])", "\\U\\1", tolower(ds$immunotherapy),
  perl = TRUE
)
# Drop the "stage " prefix from the stage column, leaving only the numeral.
# The misspelling "stge " is removed as well; matching ignores case.
ds$stage <- gsub(
  "Stge ", "",
  gsub("Stage ", "", ds$stage, ignore.case = TRUE),
  ignore.case = TRUE
)
# Remove every space character from stage, event, dead and nationality.
for (col in c("stage", "event", "dead", "nationality")) {
  ds[[col]] <- gsub(" ", "", ds[[col]])
}
# Treat blank nationality entries as missing. Left as "", they would become a
# factor level of their own and be counted as a real category in every table.
# %in% is used so that existing NA values do not produce NA subscripts.
ds$nationality[ds$nationality %in% ""] <- NA
# Convert the categorical variables to factors.
factor_cols <- c(
  "nationality", "histopathology", "stage", "surgical_resection",
  "chemotherapy", "immunotherapy", "dead", "event"
)
ds[factor_cols] <- lapply(ds[factor_cols], as.factor)
# Check for duplicated patient IDs
# duplicated() flags every patient_id that repeats an ID seen in an earlier row
duplicated_rows <- duplicated(ds$patient_id)
# Print how many rows carry a repeated ID
print(sum(duplicated_rows))
## [1] 1
# Overwrite the ID of each flagged row with "PT-021".
# NOTE(review): the replacement ID is hard-coded; presumably the repeated ID was a typo for PT-021 in this dataset — confirm against the source data
ds[which(duplicated_rows), "patient_id"]<- "PT-021"
# we should convert all text to upper or lower case
# we did not check for outliers
summary(ds)
## patient_id date_of_birth date_of_diagnosis nationality
## Length:100 Min. :1935-04-19 Min. :1992-12-04 canadian :15
## Class :character 1st Qu.:1943-08-21 1st Qu.:2007-11-23 french :13
## Mode :character Median :1953-01-13 Median :2014-11-22 italian :11
## Mean :1953-05-09 Mean :2013-07-09 brazilian:10
## 3rd Qu.:1962-01-15 3rd Qu.:2019-12-22 british : 9
## Max. :1973-12-15 Max. :2033-01-26 chinese : 9
## NA's :1 (Other) :33
## hemoglobin_at_diagnosis weight_kg histopathology
## Min. : 10.10 Min. :50.50 Adenocarcinoma :23
## 1st Qu.: 11.95 1st Qu.:59.15 Large Cell Carcinoma :28
## Median : 13.80 Median :73.40 Small Cell Lung Cancer :27
## Mean : 15.25 Mean :74.08 Squamous Cell Carcinoma:22
## 3rd Qu.: 15.82 3rd Qu.:87.50
## Max. :150.00 Max. :99.20
## NA's :1
## stage surgical_resection chemotherapy immunotherapy event
## i :28 Complete:29 No :53 No :42 none :36
## ii :28 None :39 Yes:45 Yes:57 progression:34
## iii:23 Partial :32 YES: 2 YES: 1 recurrence :30
## iv :21
##
##
##
## date_of_event dead date_of_death_or_last_follow_up
## Min. :1994-03-13 no :47 Min. :2024-01-05
## 1st Qu.:2010-08-09 yes:53 1st Qu.:2024-03-15
## Median :2016-08-17 Median :2024-05-16
## Mean :2015-05-30 Mean :2024-05-25
## 3rd Qu.:2021-08-03 3rd Qu.:2024-08-12
## Max. :2027-03-21 Max. :2024-10-27
## NA's :36
summarytools::dfSummary(ds)
## Data Frame Summary
## ds
## Dimensions: 100 x 15
## Duplicates: 0
##
## -----------------------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- --------------------------------- ---------------------------- -------------------- --------------------- ---------- ---------
## 1 patient_id 1. PT-001 1 ( 1.0%) 100 0
## [character] 2. PT-002 1 ( 1.0%) (100.0%) (0.0%)
## 3. PT-003 1 ( 1.0%)
## 4. PT-004 1 ( 1.0%)
## 5. PT-005 1 ( 1.0%)
## 6. PT-006 1 ( 1.0%)
## 7. PT-007 1 ( 1.0%)
## 8. PT-008 1 ( 1.0%)
## 9. PT-009 1 ( 1.0%)
## 10. PT-010 1 ( 1.0%)
## [ 90 others ] 90 (90.0%) IIIIIIIIIIIIIIIIII
##
## 2 date_of_birth min : 1935-04-19 99 distinct values : . . 99 1
## [Date] med : 1953-01-13 : : : : : : (99.0%) (1.0%)
## max : 1973-12-15 : : : : : :
## range : 38y 7m 26d : : : : : : :
## : : : : : : : :
##
## 3 date_of_diagnosis min : 1992-12-04 99 distinct values : : 100 0
## [Date] med : 2014-11-22 : : (100.0%) (0.0%)
## max : 2033-01-26 : : : :
## range : 40y 1m 22d : : : :
## : : : : : :
##
## 4 nationality 1. (Empty string) 2 ( 2.0%) 100 0
## [factor] 2. american 7 ( 7.0%) I (100.0%) (0.0%)
## 3. brazilian 10 (10.0%) II
## 4. british 9 ( 9.0%) I
## 5. canadian 15 (15.0%) III
## 6. chinese 9 ( 9.0%) I
## 7. french 13 (13.0%) II
## 8. german 7 ( 7.0%) I
## 9. indian 9 ( 9.0%) I
## 10. italian 11 (11.0%) II
## 11. japanese 8 ( 8.0%) I
##
## 5 hemoglobin_at_diagnosis Mean (sd) : 15.3 (13.8) 56 distinct values : 100 0
## [numeric] min < med < max: : (100.0%) (0.0%)
## 10.1 < 13.8 < 150 :
## IQR (CV) : 3.9 (0.9) :
## :
##
## 6 weight_kg Mean (sd) : 74.1 (15.4) 91 distinct values : . 99 1
## [numeric] min < med < max: : : . . : (99.0%) (1.0%)
## 50.5 < 73.4 < 99.2 : : . : : . : . :
## IQR (CV) : 28.4 (0.2) : : : : : . : : : :
## : : : : : : : : : :
##
## 7 histopathology 1. Adenocarcinoma 23 (23.0%) IIII 100 0
## [factor] 2. Large Cell Carcinoma 28 (28.0%) IIIII (100.0%) (0.0%)
## 3. Small Cell Lung Cancer 27 (27.0%) IIIII
## 4. Squamous Cell Carcinoma 22 (22.0%) IIII
##
## 8 stage 1. i 28 (28.0%) IIIII 100 0
## [factor] 2. ii 28 (28.0%) IIIII (100.0%) (0.0%)
## 3. iii 23 (23.0%) IIII
## 4. iv 21 (21.0%) IIII
##
## 9 surgical_resection 1. Complete 29 (29.0%) IIIII 100 0
## [factor] 2. None 39 (39.0%) IIIIIII (100.0%) (0.0%)
## 3. Partial 32 (32.0%) IIIIII
##
## 10 chemotherapy 1. No 53 (53.0%) IIIIIIIIII 100 0
## [factor] 2. Yes 45 (45.0%) IIIIIIIII (100.0%) (0.0%)
## 3. YES 2 ( 2.0%)
##
## 11 immunotherapy 1. No 42 (42.0%) IIIIIIII 100 0
## [factor] 2. Yes 57 (57.0%) IIIIIIIIIII (100.0%) (0.0%)
## 3. YES 1 ( 1.0%)
##
## 12 event 1. none 36 (36.0%) IIIIIII 100 0
## [factor] 2. progression 34 (34.0%) IIIIII (100.0%) (0.0%)
## 3. recurrence 30 (30.0%) IIIIII
##
## 13 date_of_event min : 1994-03-13 64 distinct values : 64 36
## [Date] med : 2016-08-17 . : : (64.0%) (36.0%)
## max : 2027-03-21 : : :
## range : 33y 0m 8d . : : : :
## : : : : : : .
##
## 14 dead 1. no 47 (47.0%) IIIIIIIII 100 0
## [factor] 2. yes 53 (53.0%) IIIIIIIIII (100.0%) (0.0%)
##
## 15 date_of_death_or_last_follow_up min : 2024-01-05 84 distinct values : . 100 0
## [Date] med : 2024-05-16 : : : : : (100.0%) (0.0%)
## max : 2024-10-27 : : : : : :
## range : 9m 22d : : : : : :
## : : : : : :
## -----------------------------------------------------------------------------------------------------------------------------------
# Ask the chat model a question through gptr.
# The API key is read from the OPENAI_API_KEY environment variable instead of
# being written into the document, so the secret never appears in the source
# or in the rendered output. Set it beforehand, e.g. in ~/.Renviron.
# If the variable is unset, Sys.getenv() returns "" and the API reports an
# invalid key in the returned list.
gptr::get_response(
  user_input = "What are redox reactions?",
  system_specification = paste0(
    "You are a knowledgeable and helpful chemist \n\n",
    "who will answer any questions in English"
  ),
  model = "gpt-4o-mini",
  api_key = Sys.getenv("OPENAI_API_KEY")
)
## $error
## $error$message
## [1] "Incorrect API key provided: make you* key. You can find your API key at https://platform.openai.com/account/api-keys."
##
## $error$type
## [1] "invalid_request_error"
##
## $error$param
## NULL
##
## $error$code
## [1] "invalid_api_key"
# you have to learn data transformation using dplyr/tidyr, ask gpt to teach you
# you have to learn to pipe
# Descriptive summary table of every column except the patient identifier.
tbl_summary(dplyr::select(ds, -patient_id))
| Characteristic | N = 100¹ |
|---|---|
| date_of_birth | 1935-04-19 to 1973-12-15 |
| Unknown | 1 |
| date_of_diagnosis | 1992-12-04 to 2033-01-26 |
| nationality | |
| | 2 (2.0%) |
| american | 7 (7.0%) |
| brazilian | 10 (10%) |
| british | 9 (9.0%) |
| canadian | 15 (15%) |
| chinese | 9 (9.0%) |
| french | 13 (13%) |
| german | 7 (7.0%) |
| indian | 9 (9.0%) |
| italian | 11 (11%) |
| japanese | 8 (8.0%) |
| hemoglobin_at_diagnosis | 13.80 (11.95, 15.83) |
| weight_kg | 73 (59, 88) |
| Unknown | 1 |
| histopathology | |
| Adenocarcinoma | 23 (23%) |
| Large Cell Carcinoma | 28 (28%) |
| Small Cell Lung Cancer | 27 (27%) |
| Squamous Cell Carcinoma | 22 (22%) |
| stage | |
| i | 28 (28%) |
| ii | 28 (28%) |
| iii | 23 (23%) |
| iv | 21 (21%) |
| surgical_resection | |
| Complete | 29 (29%) |
| None | 39 (39%) |
| Partial | 32 (32%) |
| chemotherapy | |
| No | 53 (53%) |
| Yes | 45 (45%) |
| YES | 2 (2.0%) |
| immunotherapy | |
| No | 42 (42%) |
| Yes | 57 (57%) |
| YES | 1 (1.0%) |
| event | |
| none | 36 (36%) |
| progression | 34 (34%) |
| recurrence | 30 (30%) |
| date_of_event | 1994-03-13 to 2027-03-21 |
| Unknown | 36 |
| dead | 53 (53%) |
| date_of_death_or_last_follow_up | 2024-01-05 to 2024-10-27 |
| ¹ Range; n (%); Median (IQR) | |
library(dplyr)
library(gtsummary)
# Summarise the non-date variables separately for each stage.
# Stage labels are upper-cased for display; patient_id and every column whose
# name contains "date" are left out of the table.
ds %>%
  mutate(stage = toupper(stage)) %>%
  dplyr::select(-contains("date"), -patient_id) %>%
  tbl_summary(by = stage)
| Characteristic | I, N = 28¹ | II, N = 28¹ | III, N = 23¹ | IV, N = 21¹ |
|---|---|---|---|---|
| nationality | ||||
| | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9.5%) |
| american | 2 (7.1%) | 4 (14%) | 0 (0%) | 1 (4.8%) |
| brazilian | 3 (11%) | 2 (7.1%) | 3 (13%) | 2 (9.5%) |
| british | 4 (14%) | 2 (7.1%) | 2 (8.7%) | 1 (4.8%) |
| canadian | 3 (11%) | 6 (21%) | 2 (8.7%) | 4 (19%) |
| chinese | 4 (14%) | 1 (3.6%) | 3 (13%) | 1 (4.8%) |
| french | 2 (7.1%) | 2 (7.1%) | 7 (30%) | 2 (9.5%) |
| german | 2 (7.1%) | 5 (18%) | 0 (0%) | 0 (0%) |
| indian | 2 (7.1%) | 2 (7.1%) | 1 (4.3%) | 4 (19%) |
| italian | 3 (11%) | 2 (7.1%) | 4 (17%) | 2 (9.5%) |
| japanese | 3 (11%) | 2 (7.1%) | 1 (4.3%) | 2 (9.5%) |
| hemoglobin_at_diagnosis | 13.75 (12.30, 15.78) | 13.55 (11.95, 15.58) | 13.40 (11.75, 15.50) | 14.60 (12.30, 16.00) |
| weight_kg | 72 (63, 83) | 69 (58, 82) | 85 (64, 93) | 71 (59, 86) |
| Unknown | 0 | 1 | 0 | 0 |
| histopathology | ||||
| Adenocarcinoma | 9 (32%) | 7 (25%) | 3 (13%) | 4 (19%) |
| Large Cell Carcinoma | 9 (32%) | 5 (18%) | 7 (30%) | 7 (33%) |
| Small Cell Lung Cancer | 6 (21%) | 8 (29%) | 6 (26%) | 7 (33%) |
| Squamous Cell Carcinoma | 4 (14%) | 8 (29%) | 7 (30%) | 3 (14%) |
| surgical_resection | ||||
| Complete | 7 (25%) | 11 (39%) | 5 (22%) | 6 (29%) |
| None | 13 (46%) | 6 (21%) | 13 (57%) | 7 (33%) |
| Partial | 8 (29%) | 11 (39%) | 5 (22%) | 8 (38%) |
| chemotherapy | ||||
| No | 12 (43%) | 19 (68%) | 11 (48%) | 11 (52%) |
| Yes | 14 (50%) | 9 (32%) | 12 (52%) | 10 (48%) |
| YES | 2 (7.1%) | 0 (0%) | 0 (0%) | 0 (0%) |
| immunotherapy | ||||
| No | 14 (50%) | 13 (46%) | 7 (30%) | 8 (38%) |
| Yes | 13 (46%) | 15 (54%) | 16 (70%) | 13 (62%) |
| YES | 1 (3.6%) | 0 (0%) | 0 (0%) | 0 (0%) |
| event | ||||
| none | 10 (36%) | 14 (50%) | 7 (30%) | 5 (24%) |
| progression | 10 (36%) | 8 (29%) | 10 (43%) | 6 (29%) |
| recurrence | 8 (29%) | 6 (21%) | 6 (26%) | 10 (48%) |
| dead | 17 (61%) | 14 (50%) | 12 (52%) | 10 (48%) |
| ¹ n (%); Median (IQR) | ||||
# Stage-stratified summary of the non-date variables, extended with a p-value
# column comparing the stages and an overall column for the whole cohort.
# patient_id and every column whose name contains "date" are excluded.
ds %>%
  mutate(stage = toupper(stage)) %>%
  dplyr::select(-contains("date"), -patient_id) %>%
  tbl_summary(by = stage) %>%
  add_p() %>%
  add_overall()
| Characteristic | Overall, N = 100¹ | I, N = 28¹ | II, N = 28¹ | III, N = 23¹ | IV, N = 21¹ | p-value² |
|---|---|---|---|---|---|---|
| nationality | ||||||
| | 2 (2.0%) | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9.5%) | |
| american | 7 (7.0%) | 2 (7.1%) | 4 (14%) | 0 (0%) | 1 (4.8%) | |
| brazilian | 10 (10%) | 3 (11%) | 2 (7.1%) | 3 (13%) | 2 (9.5%) | |
| british | 9 (9.0%) | 4 (14%) | 2 (7.1%) | 2 (8.7%) | 1 (4.8%) | |
| canadian | 15 (15%) | 3 (11%) | 6 (21%) | 2 (8.7%) | 4 (19%) | |
| chinese | 9 (9.0%) | 4 (14%) | 1 (3.6%) | 3 (13%) | 1 (4.8%) | |
| french | 13 (13%) | 2 (7.1%) | 2 (7.1%) | 7 (30%) | 2 (9.5%) | |
| german | 7 (7.0%) | 2 (7.1%) | 5 (18%) | 0 (0%) | 0 (0%) | |
| indian | 9 (9.0%) | 2 (7.1%) | 2 (7.1%) | 1 (4.3%) | 4 (19%) | |
| italian | 11 (11%) | 3 (11%) | 2 (7.1%) | 4 (17%) | 2 (9.5%) | |
| japanese | 8 (8.0%) | 3 (11%) | 2 (7.1%) | 1 (4.3%) | 2 (9.5%) | |
| hemoglobin_at_diagnosis | 13.80 (11.95, 15.83) | 13.75 (12.30, 15.78) | 13.55 (11.95, 15.58) | 13.40 (11.75, 15.50) | 14.60 (12.30, 16.00) | >0.9 |
| weight_kg | 73 (59, 88) | 72 (63, 83) | 69 (58, 82) | 85 (64, 93) | 71 (59, 86) | 0.3 |
| Unknown | 1 | 0 | 1 | 0 | 0 | |
| histopathology | | | | | | 0.7 |
| Adenocarcinoma | 23 (23%) | 9 (32%) | 7 (25%) | 3 (13%) | 4 (19%) | |
| Large Cell Carcinoma | 28 (28%) | 9 (32%) | 5 (18%) | 7 (30%) | 7 (33%) | |
| Small Cell Lung Cancer | 27 (27%) | 6 (21%) | 8 (29%) | 6 (26%) | 7 (33%) | |
| Squamous Cell Carcinoma | 22 (22%) | 4 (14%) | 8 (29%) | 7 (30%) | 3 (14%) | |
| surgical_resection | | | | | | 0.3 |
| Complete | 29 (29%) | 7 (25%) | 11 (39%) | 5 (22%) | 6 (29%) | |
| None | 39 (39%) | 13 (46%) | 6 (21%) | 13 (57%) | 7 (33%) | |
| Partial | 32 (32%) | 8 (29%) | 11 (39%) | 5 (22%) | 8 (38%) | |
| chemotherapy | | | | | | 0.3 |
| No | 53 (53%) | 12 (43%) | 19 (68%) | 11 (48%) | 11 (52%) | |
| Yes | 45 (45%) | 14 (50%) | 9 (32%) | 12 (52%) | 10 (48%) | |
| YES | 2 (2.0%) | 2 (7.1%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| immunotherapy | | | | | | 0.5 |
| No | 42 (42%) | 14 (50%) | 13 (46%) | 7 (30%) | 8 (38%) | |
| Yes | 57 (57%) | 13 (46%) | 15 (54%) | 16 (70%) | 13 (62%) | |
| YES | 1 (1.0%) | 1 (3.6%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| event | | | | | | 0.4 |
| none | 36 (36%) | 10 (36%) | 14 (50%) | 7 (30%) | 5 (24%) | |
| progression | 34 (34%) | 10 (36%) | 8 (29%) | 10 (43%) | 6 (29%) | |
| recurrence | 30 (30%) | 8 (29%) | 6 (21%) | 6 (26%) | 10 (48%) | |
| dead | 53 (53%) | 17 (61%) | 14 (50%) | 12 (52%) | 10 (48%) | 0.8 |
| ¹ n (%); Median (IQR) | ||||||
| ² Kruskal-Wallis rank sum test; Fisher’s exact test; Pearson’s Chi-squared test | ||||||
library(dplyr)
library(gtsummary)
# Stage-stratified summary reporting mean (SD) for continuous variables and
# n (%) for categorical ones, with a p-value column and an overall column.
# patient_id and every column whose name contains "date" are excluded.
ds %>%
  mutate(stage = toupper(stage)) %>%
  dplyr::select(-contains("date"), -patient_id) %>%
  tbl_summary(
    by = stage,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  add_p() %>%
  add_overall()
| Characteristic | Overall, N = 100¹ | I, N = 28¹ | II, N = 28¹ | III, N = 23¹ | IV, N = 21¹ | p-value² |
|---|---|---|---|---|---|---|
| nationality | ||||||
| | 2 (2.0%) | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9.5%) | |
| american | 7 (7.0%) | 2 (7.1%) | 4 (14%) | 0 (0%) | 1 (4.8%) | |
| brazilian | 10 (10%) | 3 (11%) | 2 (7.1%) | 3 (13%) | 2 (9.5%) | |
| british | 9 (9.0%) | 4 (14%) | 2 (7.1%) | 2 (8.7%) | 1 (4.8%) | |
| canadian | 15 (15%) | 3 (11%) | 6 (21%) | 2 (8.7%) | 4 (19%) | |
| chinese | 9 (9.0%) | 4 (14%) | 1 (3.6%) | 3 (13%) | 1 (4.8%) | |
| french | 13 (13%) | 2 (7.1%) | 2 (7.1%) | 7 (30%) | 2 (9.5%) | |
| german | 7 (7.0%) | 2 (7.1%) | 5 (18%) | 0 (0%) | 0 (0%) | |
| indian | 9 (9.0%) | 2 (7.1%) | 2 (7.1%) | 1 (4.3%) | 4 (19%) | |
| italian | 11 (11%) | 3 (11%) | 2 (7.1%) | 4 (17%) | 2 (9.5%) | |
| japanese | 8 (8.0%) | 3 (11%) | 2 (7.1%) | 1 (4.3%) | 2 (9.5%) | |
| hemoglobin_at_diagnosis | 15.25 (13.79) | 18.61 (25.82) | 13.98 (2.24) | 13.70 (2.34) | 14.19 (2.38) | >0.9 |
| weight_kg | 74 (15) | 74 (15) | 71 (15) | 79 (17) | 73 (15) | 0.3 |
| Unknown | 1 | 0 | 1 | 0 | 0 | |
| histopathology | | | | | | 0.7 |
| Adenocarcinoma | 23 (23%) | 9 (32%) | 7 (25%) | 3 (13%) | 4 (19%) | |
| Large Cell Carcinoma | 28 (28%) | 9 (32%) | 5 (18%) | 7 (30%) | 7 (33%) | |
| Small Cell Lung Cancer | 27 (27%) | 6 (21%) | 8 (29%) | 6 (26%) | 7 (33%) | |
| Squamous Cell Carcinoma | 22 (22%) | 4 (14%) | 8 (29%) | 7 (30%) | 3 (14%) | |
| surgical_resection | | | | | | 0.3 |
| Complete | 29 (29%) | 7 (25%) | 11 (39%) | 5 (22%) | 6 (29%) | |
| None | 39 (39%) | 13 (46%) | 6 (21%) | 13 (57%) | 7 (33%) | |
| Partial | 32 (32%) | 8 (29%) | 11 (39%) | 5 (22%) | 8 (38%) | |
| chemotherapy | | | | | | 0.3 |
| No | 53 (53%) | 12 (43%) | 19 (68%) | 11 (48%) | 11 (52%) | |
| Yes | 45 (45%) | 14 (50%) | 9 (32%) | 12 (52%) | 10 (48%) | |
| YES | 2 (2.0%) | 2 (7.1%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| immunotherapy | | | | | | 0.5 |
| No | 42 (42%) | 14 (50%) | 13 (46%) | 7 (30%) | 8 (38%) | |
| Yes | 57 (57%) | 13 (46%) | 15 (54%) | 16 (70%) | 13 (62%) | |
| YES | 1 (1.0%) | 1 (3.6%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| event | | | | | | 0.4 |
| none | 36 (36%) | 10 (36%) | 14 (50%) | 7 (30%) | 5 (24%) | |
| progression | 34 (34%) | 10 (36%) | 8 (29%) | 10 (43%) | 6 (29%) | |
| recurrence | 30 (30%) | 8 (29%) | 6 (21%) | 6 (26%) | 10 (48%) | |
| dead | 53 (53%) | 17 (61%) | 14 (50%) | 12 (52%) | 10 (48%) | 0.8 |
| ¹ n (%); Mean (SD) | ||||||
| ² Kruskal-Wallis rank sum test; Fisher’s exact test; Pearson’s Chi-squared test | ||||||
# Load required libraries
library(dplyr)
library(lubridate)
# Derive age at diagnosis and the two follow-up durations, all in years.

# Elapsed time from `start` to `end`, in years (days / 365.25).
years_between <- function(start, end) {
  as.numeric(difftime(end, start, units = "days")) / 365.25
}

ds <- ds %>%
  mutate(
    # Parse the date columns from month/day/year text.
    date_of_birth = as.Date(date_of_birth, format = "%m/%d/%Y"),
    date_of_diagnosis = as.Date(date_of_diagnosis, format = "%m/%d/%Y"),
    date_of_event = as.Date(date_of_event, format = "%m/%d/%Y"),
    date_of_death_or_last_follow_up = as.Date(
      date_of_death_or_last_follow_up,
      format = "%m/%d/%Y"
    ),
    # Age when the diagnosis was made.
    age_at_diagnosis = years_between(date_of_birth, date_of_diagnosis),
    # Event-free survival: time from diagnosis to the recorded event, or to
    # death / last follow-up when no event date is present.
    event_free_survival = if_else(
      is.na(date_of_event),
      years_between(date_of_diagnosis, date_of_death_or_last_follow_up),
      years_between(date_of_diagnosis, date_of_event)
    ),
    # Overall survival: time from diagnosis to death or last follow-up.
    overall_survival = years_between(
      date_of_diagnosis,
      date_of_death_or_last_follow_up
    )
  )
# # View the updated dataset with new columns
head(ds)
## patient_id date_of_birth date_of_diagnosis nationality
## 1 PT-001 1968-03-24 2019-03-26 german
## 2 PT-002 1943-12-21 2013-11-24 canadian
## 3 PT-003 1964-05-01 2017-05-04 chinese
## 4 PT-004 1969-08-17 2021-11-25 chinese
## 5 PT-005 1940-09-21 2011-06-13 american
## 6 PT-006 1964-12-19 2023-02-14
## hemoglobin_at_diagnosis weight_kg histopathology stage
## 1 15.3 53.6 Squamous Cell Carcinoma ii
## 2 14.7 97.9 Adenocarcinoma i
## 3 16.0 53.0 Adenocarcinoma i
## 4 15.8 51.1 Small Cell Lung Cancer iii
## 5 11.2 59.8 Adenocarcinoma ii
## 6 16.3 88.7 Large Cell Carcinoma iv
## surgical_resection chemotherapy immunotherapy event date_of_event dead
## 1 Complete No No none <NA> no
## 2 Partial No No recurrence 2015-03-26 no
## 3 Complete No Yes progression 2021-01-31 yes
## 4 Complete Yes Yes none <NA> no
## 5 Complete Yes No recurrence 2016-06-08 yes
## 6 Complete Yes No recurrence 2024-04-24 yes
## date_of_death_or_last_follow_up age_at_diagnosis event_free_survival
## 1 2024-09-28 51.00342 5.511294
## 2 2024-01-17 69.92745 1.333333
## 3 2024-09-17 53.00753 3.745380
## 4 2024-08-14 52.27379 2.718686
## 5 2024-03-22 70.72416 4.988364
## 6 2024-09-15 58.15469 1.190965
## overall_survival
## 1 5.511294
## 2 10.146475
## 3 7.373032
## 4 2.718686
## 5 12.774812
## 6 1.585216
# Step 1: Install and load necessary libraries
# install.packages("finalfit")
# install.packages("survival")
# Load the required libraries
library(finalfit)
library(survival)
library(dplyr)
# Recode death as 1 when dead is "yes" and 0 otherwise.
ds$dead <- as.numeric(ds$dead == "yes")
# Survival outcome: overall_survival (in years, computed as days / 365.25)
# with death as the event indicator.
outcome <- "Surv(overall_survival, dead==1)"
# Covariates entered into the univariable and multivariable models.
explanatory <- c(
  "age_at_diagnosis",
  "chemotherapy",
  "immunotherapy",
  "weight_kg",
  "hemoglobin_at_diagnosis"
)
finalfit::finalfit(.data = ds, dependent = outcome, explanatory = explanatory)
## Dependent: Surv(overall_survival, dead==1) all
## age_at_diagnosis Mean (SD) 60.1 (8.4)
## chemotherapy No 53 (53.0)
## Yes 45 (45.0)
## YES 2 (2.0)
## immunotherapy No 42 (42.0)
## Yes 57 (57.0)
## YES 1 (1.0)
## weight_kg Mean (SD) 74.1 (15.4)
## hemoglobin_at_diagnosis Mean (SD) 15.3 (13.8)
## HR (univariable) HR (multivariable)
## 1.03 (0.99-1.07, p=0.096) 1.04 (0.99-1.08, p=0.087)
## - -
## 1.48 (0.82-2.65, p=0.189) 1.50 (0.81-2.77, p=0.200)
## 7.24 (1.63-32.21, p=0.009) 10.06 (2.11-47.88, p=0.004)
## - -
## 1.60 (0.90-2.85, p=0.112) 1.98 (1.07-3.68, p=0.030)
## 2.08 (0.27-15.79, p=0.479) 1.82 (0.21-15.50, p=0.584)
## 0.98 (0.96-1.00, p=0.087) 0.98 (0.96-1.00, p=0.105)
## 1.01 (0.99-1.02, p=0.280) 1.01 (0.99-1.02, p=0.405)
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
# Count, for each stage, how many patients received chemotherapy and how many
# received immunotherapy, then reshape to one row per stage/treatment pair.
# Expects ds to contain the columns stage, chemotherapy and immunotherapy.
# The comparison is case-insensitive because the treatment columns contain
# both "Yes" and "YES"; matching only "Yes" silently drops the upper-case
# records from the counts.
data_plot <- ds %>%
  group_by(stage) %>%
  dplyr::summarize(
    chemotherapy_count = sum(toupper(chemotherapy) == "YES", na.rm = TRUE),
    immunotherapy_count = sum(toupper(immunotherapy) == "YES", na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = c(chemotherapy_count, immunotherapy_count),
    names_to = "treatment_type",
    values_to = "count"
  )
# Grouped bar chart: one pair of side-by-side bars per stage, coloured by
# treatment type, with bar height equal to the precomputed count.
ggplot(data_plot, aes(x = stage, y = count, fill = treatment_type)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("blue", "red")) +
  labs(x = "Stage", y = "Count of Patients", fill = "Treatment Type") +
  theme_minimal()