A) Data import. Import the data into R, using your favorite function(s). Each CSV file can be kept as its own data frame. Since the CSV files were exported directly from a database, missing data are encoded by the string “NULL”. You will want to recode the NULLs into the NAs conventionally used by R for missing data (there are multiple ways to do this). Depending on which function(s) you use to import the data, it may also be necessary to convert date and time columns so that they are recognized as such by R, such as by using the appropriate functions in base R or the lubridate package.
library(readr)
departments <- read_csv("departments.csv")
na = "NULL"
library(readr)
disease_types <- read_csv("disease_types.csv")
na = "NULL"
library(readr)
diseases <- read_csv("diseases.csv")
na = "NULL"
library(readr)
encounters <- read_csv("encounters.csv")
na = "NULL"
library(readr)
medication_types <- read_csv("medication_types.csv")
na = "NULL"
library(readr)
medications <- read_csv("medications.csv")
na = "NULL"
library(readr)
patients <- read_csv("patients.csv")
na = "NULL"
library(readr)
providers <- read_csv("providers.csv")
na = "NULL"
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
patients$birth_date <- ymd(patients$birth_date)
medications$date <- ymd_hms(medications$date)
encounters$admit_date <- ymd_hms(encounters$admit_date)
encounters$discharge_date <- ymd_hms(encounters$discharge_date)
## Warning: 1601615 failed to parse.
diseases$date <- ymd_hms(diseases$date)
B) Data summary. Summarize select parts of the data as follows:
nrow(patients)
## [1] 500000
#Answer: 500,000 patients are located in the data set
nrow(encounters)
## [1] 8673082
#Answer: 8,673,082 encounters are located in the data set
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
partb1c <- count(medication_types, medication_name)
nrow(medication_types)
## [1] 2330
#Answer: 2,330 medication types are located in the data set
partb1d <- count(medications, medication_id, patient_id)
nrow(partb1d)
## [1] 313745
#Answer: 313,745 specific meds were given to specific patients
nrow(disease_types)
## [1] 9175
#Answer: 9,175 different potential disease types are located in the data
partb1f <- count(diseases, disease_id, encounter_id)
nrow(partb1f)
## [1] 9687836
#Answer: 9,687,836 specific diseases of specific patients are located in the data
nrow(departments)
## [1] 74
#Answer: 74 departments are located in the data
nrow(providers)
## [1] 6097
#Answer: 6,097 providers are located in the data
partb1i <- count(patients, gender, race, marital_status)
print(partb1i)
## # A tibble: 126 x 4
## gender race marital_status n
## <chr> <chr> <chr> <int>
## 1 F AFRICAN AMERICAN DIVORCED 392
## 2 F AFRICAN AMERICAN LIFE PARTNER 2
## 3 F AFRICAN AMERICAN MARRIED 1043
## 4 F AFRICAN AMERICAN SEPARATED 175
## 5 F AFRICAN AMERICAN SINGLE 3565
## 6 F AFRICAN AMERICAN UNKNOWN 56
## 7 F AFRICAN AMERICAN WIDOWED 339
## 8 F ASIAN DIVORCED 46
## 9 F ASIAN MARRIED 1318
## 10 F ASIAN SEPARATED 14
## # ... with 116 more rows
2.Within the set of encounters, what are the ten most common: a. Medications (i.e., the most common medications prescribed to patients)?
topmedications <- merge(medications, medication_types, by="medication_id")
topmedicationstable <- count(topmedications, medication_name, sort=TRUE)
topmedicationstable <- top_n(topmedicationstable, 10)
## Selecting by n
print(topmedicationstable)
## medication_name n
## 1 HEPARIN 165533
## 2 SODIUM CHLORIDE 128494
## 3 ACETAMINOPHEN 98849
## 4 DOCUSATE SODIUM 80953
## 5 MORPHINE 69788
## 6 CHLORHEXIDINE GLUCONATE 61846
## 7 FAT EMULSION 59350
## 8 0.9% SODIUM CHLORIDE 56427
## 9 0.5 NORMAL SALINE WITH POTASSIUM CHLORIDE 54295
## 10 INSULIN REGULAR 52199
topdiseases <- merge(diseases, disease_types, by="disease_id")
topdiseasestable <-count(topdiseases, name, sort=TRUE)
topdiseasestable <- top_n(topdiseasestable, 10)
## Selecting by n
print(topdiseasestable)
## name
## 1 UNSPECIFIED ESSENTIAL HYPERTENSION
## 2 DIABETES MELLITUS WITHOUT MENTION OF COMPLICATION, TYPE II OR UNSPECIFIED TYPE, NOT STATED AS UNCONTROLLED
## 3 SUPERVISION OF OTHER NORMAL PREGNANCY
## 4 SCREENING EXAMINATION FOR PULMONARY TUBERCULOSIS
## 5 PAIN IN LIMB
## 6 LUMBAGO
## 7 OTHER AND UNSPECIFIED HYPERLIPIDEMIA
## 8 ASTHMA, UNSPECIFIED TYPE, UNSPECIFIED
## 9 DEPRESSIVE DISORDER, NOT ELSEWHERE CLASSIFIED
## 10 ROUTINE INFANT OR CHILD HEALTH CHECK
## n
## 1 199128
## 2 105793
## 3 96188
## 4 77709
## 5 74145
## 6 56882
## 7 56880
## 8 55351
## 9 54355
## 10 52834
topdepartments <- merge(departments, encounters, by="department_id")
topdepartmentstable <-count(topdepartments, department_name, sort=TRUE)
topdepartmentstable <- top_n(topdepartmentstable, 10)
## Selecting by n
print(topdepartmentstable)
## department_name n
## 1 INTERNAL MEDICINE 6161008
## 2 RADIOLOGY - GENERAL 274655
## 3 FAMILY PRACTICE 256618
## 4 OPHTHALMOLOGY 205301
## 5 ORTHOPAEDIC 152366
## 6 PEDIATRICS - GENERAL 127959
## 7 OBG - GENERAL 125574
## 8 CANCER CENTER 99552
## 9 PSYCHIATRY 98276
## 10 EMERGENCY MEDICINE 97810
topproviders <-merge(providers, encounters, by="provider_id")
topproviderstable <- count(topproviders, last_name, sort=TRUE)
topproviderstable <- top_n(topproviderstable, 10)
## Selecting by n
print(topdepartmentstable)
## department_name n
## 1 INTERNAL MEDICINE 6161008
## 2 RADIOLOGY - GENERAL 274655
## 3 FAMILY PRACTICE 256618
## 4 OPHTHALMOLOGY 205301
## 5 ORTHOPAEDIC 152366
## 6 PEDIATRICS - GENERAL 127959
## 7 OBG - GENERAL 125574
## 8 CANCER CENTER 99552
## 9 PSYCHIATRY 98276
## 10 EMERGENCY MEDICINE 97810
C) Data manipulation. Using the height and weight data that are available, calculate BMIs and report their means in three ways: by sex/gender; by race/ethnicity; and by sex/gender and race/ethnicity simultaneously (i.e., three different tables). Note that a patient’s height and weight are obtained at each encounter, so that a single patient may have many heights and weights, and thus many calculated BMIs in the data set. For purposes of this problem, you may ignore this issue, and just look at all means, duplicated or not. (See below for an optional challenge to filter out and only report the latest BMIs for patients.)
encounters <- mutate(encounters, BMI = (as.numeric(weight) * 703)/(as.numeric(height)^2))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
C1 <- merge(encounters, patients, by="patient_id")
patBMI <- merge(encounters, patients, by="patient_id")
patientrace <- select(patBMI, race, BMI)
patientgender<- select(patBMI, gender, BMI)
raceandgender <- select(patBMI, race, gender, BMI)
patientrace <- group_by(patientrace, race)
raceBMI <- summarize(patientrace, averageBMI=mean(BMI, na.rm=TRUE))
print(raceBMI)
## # A tibble: 10 x 2
## race averageBMI
## <chr> <dbl>
## 1 AFRICAN AMERICAN 46.1
## 2 ASIAN 27.5
## 3 CAUCASIAN 243.
## 4 DECLINED 26.5
## 5 HISPANIC 81.9
## 6 MULTIRACIAL 25.1
## 7 NATIVE AMERICAN 80.9
## 8 OTHER 30.2
## 9 PACIFIC ISLANDER 25.7
## 10 UNKNOWN 34.1
patientgender <- group_by(patientgender, gender)
genderBMI <-summarize(patientgender, averageBMI=mean(BMI, na.rm=TRUE))
print(genderBMI)
## # A tibble: 2 x 2
## gender averageBMI
## <chr> <dbl>
## 1 F 58.5
## 2 M 367.
patientBMImerged <- select (patBMI, race, gender, BMI)
patientBMImerged <- group_by(patientBMImerged, race, gender)
patientBMImergedavg <- summarize(patientBMImerged, average= mean(BMI, na.rm=TRUE))
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.
print(patientBMImergedavg)
## # A tibble: 20 x 3
## # Groups: race [10]
## race gender average
## <chr> <chr> <dbl>
## 1 AFRICAN AMERICAN F 40.0
## 2 AFRICAN AMERICAN M 52.3
## 3 ASIAN F 28.6
## 4 ASIAN M 25.8
## 5 CAUCASIAN F 62.6
## 6 CAUCASIAN M 454.
## 7 DECLINED F 26.8
## 8 DECLINED M 26.1
## 9 HISPANIC F 108.
## 10 HISPANIC M 54.6
## 11 MULTIRACIAL F 21.6
## 12 MULTIRACIAL M 28.6
## 13 NATIVE AMERICAN F 88.0
## 14 NATIVE AMERICAN M 70.4
## 15 OTHER F 29.5
## 16 OTHER M 30.9
## 17 PACIFIC ISLANDER F 26.2
## 18 PACIFIC ISLANDER M 25.3
## 19 UNKNOWN F 32.5
## 20 UNKNOWN M 35.9
LOS <- mutate(encounters, LOS=(discharge_date-admit_date))
LOS<- mutate(LOS, LOSday=floor(as.numeric(LOS)/86400))
LOS <- filter(LOS, LOSday >1)
LOSoverall<- summarize(LOS, average=mean(LOSday, nar.rm=TRUE))
print(LOSoverall)
## # A tibble: 1 x 1
## average
## <dbl>
## 1 9.98
LOStotal <- merge(LOS, patients, by="patient_id")
LOSgender <- select(LOStotal, LOSday, gender)
LOSgender <-group_by(LOSgender, gender)
LOSgenderavg <- summarize (LOSgender, average= mean(LOSday, na.rm=TRUE))
print(LOSgenderavg)
## # A tibble: 2 x 2
## gender average
## <chr> <dbl>
## 1 F 8.95
## 2 M 10.9
LOSoverall <- merge(LOS, departments, by="department_id")
LOSdepartment <- select(LOSoverall, LOSday, department_name)
LOSdepartment <- group_by(LOSdepartment, department_name)
LOSdepartmentavg <- summarize(LOSdepartment, average= mean(LOSday, na.rm=TRUE))
print(LOSdepartmentavg)
## # A tibble: 68 x 2
## department_name average
## <chr> <dbl>
## 1 ALLERGY AND IMMUNOLOGY 3.75
## 2 ANESTHESIOLOGY 6.61
## 3 AUDIOLOGY 121.
## 4 BEHAVIORAL HEALTH 10.3
## 5 BURN UNIT 15.7
## 6 CANCER CENTER 5.95
## 7 CARDIAC REHABILITATION 6.5
## 8 CARDIOLOGY 6.67
## 9 CLINICAL RESEARCH 5.29
## 10 DENTISTRY 4.89
## # ... with 58 more rows
LOSlarge <- merge(LOSoverall, patients, by = "patient_id")
LOSmerge <- select(LOSlarge, gender, department_name, LOSday)
LOSlarge <- merge(LOSoverall, patients, by = "patient_id")
LOSmerge <- select(LOSlarge, gender, department_name, LOSday)
LOSmerge<-group_by(LOSmerge, gender, department_name)
LOSmergeavg <- summarize(LOSmerge, average=mean(LOSday, na.rm=TRUE))
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
print(LOSmergeavg)
## # A tibble: 126 x 3
## # Groups: gender [2]
## gender department_name average
## <chr> <chr> <dbl>
## 1 F ALLERGY AND IMMUNOLOGY 4.33
## 2 F ANESTHESIOLOGY 6.7
## 3 F AUDIOLOGY 7
## 4 F BEHAVIORAL HEALTH 9.63
## 5 F BURN UNIT 18.0
## 6 F CANCER CENTER 5.5
## 7 F CARDIAC REHABILITATION 2
## 8 F CARDIOLOGY 6.46
## 9 F CLINICAL RESEARCH 6.5
## 10 F DENTISTRY 5
## # ... with 116 more rows
D) Data visualization. Create the following figures: 1. A histogram of patient age in years at time of encounter.
hist(encounters$age)
encounter_number <- count(encounters, patient_id)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v stringr 1.4.0
## v tidyr 1.2.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
ggplot (encounter_number, aes(x=n)) +
geom_bar()+
xlab("Patient Encounters")+
ylab("Freqeuncy")+
xlim(0,1000)
## Warning: Removed 62 rows containing non-finite values (stat_count).
library(tidyverse)
ggplot(patBMI, aes(x=age, y=BMI, color=gender))+
geom_point()+
xlim(0,100)+
ylim(0,70)
## Warning: Removed 8138896 rows containing missing values (geom_point).
ggplot(patBMI, aes(x=age, y=BMI, color=gender))+
geom_point()+
facet_wrap(~race, scale="free_y")+
xlim(0,100)+
ylim(0,100)
## Warning: Removed 8138264 rows containing missing values (geom_point).
E) Missing values. Some variables may have missing data. What are the approximate rates of missing data? Would any of these have an impact on data analysis? You may keep this discussion mostly qualitative, and you don’t have to report on every single variable – just the ones you think might be most important in typical analyses. Please discuss at least four variables.
nasummary <- summary(patBMI)
print(nasummary)
## patient_id encounter_id admit_date
## Min. : 4 Min. : 1 Min. :1943-04-25 11:33:41
## 1st Qu.: 303091 1st Qu.: 6060620 1st Qu.:1998-02-05 23:45:13
## Median : 571442 Median :12054864 Median :2004-10-27 09:16:15
## Mean : 624275 Mean :12074997 Mean :2003-08-30 21:24:19
## 3rd Qu.: 936300 3rd Qu.:18102795 3rd Qu.:2009-12-28 08:34:32
## Max. :1393915 Max. :24163384 Max. :2015-04-02 06:45:48
##
## discharge_date department_id provider_id age
## Min. :1947-06-21 17:17:58 Min. : 1.0 Min. : 1.0 Min. : 0.00
## 1st Qu.:1997-07-27 10:37:35 1st Qu.:22.0 1st Qu.: 141.2 1st Qu.: 24.00
## Median :2004-03-14 10:08:14 Median :22.0 Median :2492.0 Median : 41.00
## Mean :2003-03-14 10:01:59 Mean :25.3 Mean :2682.7 Mean : 41.24
## 3rd Qu.:2009-07-12 23:28:57 3rd Qu.:22.0 3rd Qu.:5155.0 3rd Qu.: 58.00
## Max. :2014-08-22 05:15:55 Max. :74.0 Max. :6225.0 Max. :111.70
## NA's :1601615
## bp_systolic bp_diastolic temperature pulse
## Length:8673082 Length:8673082 Length:8673082 Length:8673082
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## height weight BMI first_name
## Length:8673082 Length:8673082 Min. : 0 Length:8673082
## Class :character Class :character 1st Qu.: 20 Class :character
## Mode :character Mode :character Median : 25 Mode :character
## Mean : 201
## 3rd Qu.: 31
## Max. :39434397
## NA's :8134992
## middle_initial last_name gender race
## Length:8673082 Length:8673082 Length:8673082 Length:8673082
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## religion birth_date marital_status language
## Length:8673082 Min. :1895-04-21 Length:8673082 Length:8673082
## Class :character 1st Qu.:1945-08-31 Class :character Class :character
## Mode :character Median :1962-06-22 Mode :character Mode :character
## Mean :1962-05-20
## 3rd Qu.:1979-12-28
## Max. :2014-08-10
##
## occupation street city state
## Length:8673082 Length:8673082 Length:8673082 Length:8673082
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## zip
## Min. :24598
## 1st Qu.:27560
## Median :28086
## Mean :28034
## 3rd Qu.:28501
## Max. :37601
##
nadischarge <- 1601615/8673082
print(nadischarge)
## [1] 0.184665
naweight <- 7665346/8673082
print(naweight)
## [1] 0.8838088
naheight <- 8128158/8673082
print(naheight)
## [1] 0.9371707
naBMI <- 8134992/8673082
print(naBMI)
## [1] 0.9379586
The rate of missing data for discharges, weight, height, and BMI are 18.47%, 88.38%, 93.72%, and 93.80% respectively. Having such large portions of the data missing could affect the spread of our analyses quite significantly.
F) Data validation. Are there any unreasonable values for the data? That is, are there certain measurements that do not make any sense, given what you know or can find out about normal values? (Hint: Look at the ranges of observed values for different variables.) What would you recommend doing with these values, if found? Are there any data which appear to be duplicated? Again, you don’t have to report on every variable – just discuss the ones you think might be the most important in typical analyses. Again, please discuss at least four variables.
weightrange <- select(encounters, weight)
weightrange <- subset(weightrange, weight != "NULL")
weightrange <- as.numeric(weightrange$weight)
range(weightrange)
## [1] 0.01 1260.00
heightrange <- select(encounters, height)
heightrange <- subset(heightrange, height != "NULL")
heightrange <- as.numeric(heightrange$height)
range(heightrange)
## [1] 0.06 131.88
BMIrange <- select(encounters, BMI)
BMIrange <- subset(BMIrange, BMI != "NULL")
BMIrange <- as.numeric(BMIrange$BMI)
range(BMIrange)
## [1] 1.750052e-03 3.943440e+07
temprange <- select(encounters, temperature)
temprange <- subset(temprange, temperature != "NULL")
temprange <- as.numeric(temprange$temperature)
range(temprange)
## [1] 32.90 107.96
Regarding weight, the extremes were 0.01 and 1260.00. Removing such extreme outliers would aid the fidelity of the data.
Regarding height, the extremes were 0.06 and 131.88 inches. A possible solution may be to filter by a known average range and gender, excluding data that is extreme or abnormal.
Regarding BMI, the extremes were 1.75 e-03 and 3.94 e+07. One solution may be to exclude data that is beyond 3 standard deviations of the average BMI, essentially removing any outliers. However, it may be of interest to know what these outliers are so that follow up can be performed. It’s important that we don’t ignore our outliers completely, as they can be indicative of an issue in our reporting or the potential health of our patient.
Regarding temperature, the extremes were 32.90 and 107.96 degrees. It’s possible that creating a normal distribution could be helpful, and then if temperatures approach extremes, a warning could be activated to either correct the input or escalate care accordingly.
Part 2
Choose two of the following three options and add their analyses to your writeup:
part2BMI <- filter(patBMI, !is.na(patBMI$BMI))
part2BMIlast <- arrange(part2BMI, part2BMI$patient_id, part2BMI$admit_date)
part2BMIlast <- group_by(part2BMIlast, patient_id)
part2BMIlast <- summarize(part2BMIlast, across(everything(),last))
ageval <- as.Date(patBMI$admit_date)
ageval <- mutate(patBMI, "admit_date2" = as.Date(patBMI$admit_date) )
ageval <- mutate(ageval, "Age_Check" = (admit_date2-birth_date))
ageval <- mutate(ageval, "Age_Check" = as.numeric((admit_date2-birth_date)/365.25))
ageval <- mutate(ageval, "Age_Diff" = (Age_Check - age))
agevalcnt <- data.frame(count(ageval, Age_Diff >= 0.00274))
agevalcnt <- mutate(agevalcnt, "Percentage" = (n/8673082))
library(knitr)
kable(agevalcnt)
| Age_Diff….0.00274 | n | Percentage |
|---|---|---|
| FALSE | 3991642 | 0.4602334 |
| TRUE | 4681440 | 0.5397666 |