# CRP241
# Assignment 1 KeY
# last updated 09-06-21
# data dictionary for assignment1-demo-data
# id Patient ID code
# age age of patient in years
# sex 0 is male; 1 is female
# migraine 0 no migraine diagnosis; 1 migraine diagnosis
# chronicity chronicity of headaches in years
# group treatment group 0 is control and 1 is acupuncture
# data dictionary for assignment1-form-data
# p2cgp number of GP visits at 3 months
# p3cgp number of GP visits at 6 months
# p4cgp number of GP visits at 9 months
# p5cgp number of GP visits at 12 months
# Import Data
# Import assignment1-form-data.xlsx
# You may need to install the package
# install.packages("readxl")
library(readxl)
url <- "http://people.duke.edu/~sgrambow/crp241data/assignment1_form_data-2021.xlsx"
destfile <- "assignment1_form_data.xlsx"
curl::curl_download(url, destfile)
assignment1_form_data <- read_excel(destfile)
View(assignment1_form_data)
# Import assignment1-demo-data
url <- "http://people.duke.edu/~sgrambow/crp241data/assignment1_demo_data-2021.xlsx"
destfile <- "assignment1_demo_data.xlsx"
curl::curl_download(url, destfile)
assignment1_demo_data <- read_excel(destfile)
View(assignment1_demo_data)
# Problem 1a -- primary key for form data
# sort form-data & demo-data by id
# in order to sort you need to identify the primary
# key for each data frame.
# for the form data, it is id+form
# for the demo data, it is simply id
sorted_form <- assignment1_form_data[order(assignment1_form_data$id,
assignment1_form_data$form),]
sorted_demo <- assignment1_demo_data[order(assignment1_demo_data$id),]
# Problem 1b
# Figure 1 displays data values for how many unique study participants
# use the unique function
unique.participants <- unique(sorted_form$id)
length(unique.participants) # 4 participants
## [1] 7
# Problem 1c
# In Figure 1, how many of those unique participants
# have at least one missing data value
# use subset to select only the rows with missing values
missing_atleast1 <- subset(sorted_form, value=='NA')
# Now use unique to find how many unique participants
unique(missing_atleast1$id)
## [1] 401 414
# This gives IDs 150 and 151 so 2 of the 7 uniques
# You can compute this number using length function
length(unique(missing_atleast1$id))
## [1] 2
# which yields the answer 2
# Problem 1d
# Referring again to Figure 1, if you were
# interested in creating a data file that included
# only the score values for the 'p2cspec' form and participant id
#
# lets create a new data frame that
# take the original figure 1 data
# and subsets on FORM=p2cgp
new.subset <- subset(assignment1_form_data,form=="p2cgp")
new.subset$p2cgp_num <- new.subset$form
new.subset$form <- NULL
# Problem #1 e
# Suppose that an analyst on the study merged the
# form-data file shown in Figure 1 with the demo-data
# file shown in Figure 2.
# merge sorted_form & sorted_demo by id variable
test <- merge(sorted_form,sorted_demo,by="id")
# check merge using a temporary data frame object
str(test)
## 'data.frame': 28 obs. of 8 variables:
## $ id : num 401 401 401 401 404 404 404 404 406 406 ...
## $ form : chr "p2cgp" "p3cgp" "p4cgp" "p5cgp" ...
## $ value : chr "2" "NA" "1" "1" ...
## $ age : num 57 57 57 57 50 50 50 50 45 45 ...
## $ sex : num 0 0 0 0 0 0 0 0 1 1 ...
## $ migraine : num 0 0 0 0 0 0 0 0 1 1 ...
## $ chronicity: num 20 20 20 20 8 8 8 8 5 5 ...
## $ group : num 1 1 1 1 0 0 0 0 0 0 ...
View(test)
# rename to final data frame
combined <- test
# write combined data frame back to a .csv file
# completely reproducible process.
write.csv(combined,'combined.csv')