# CRP241 
# Assignment 1 KeY
# last updated 09-06-21

# data dictionary for assignment1-demo-data
# id           Patient ID code
# age          age of patient in years
# sex          0 is male; 1 is female
# migraine     0 no migraine diagnosis; 1 migraine diagnosis
# chronicity   chronicity of headaches in years
# group        treatment group 0 is control and 1 is acupuncture

# data dictionary for assignment1-form-data
# p2cgp      number of GP visits at 3 months
# p3cgp      number of GP visits at 6 months
# p4cgp      number of GP visits at 9 months
# p5cgp      number of GP visits at 12 months
# Import Data
# Import assignment1-form-data.xlsx
# You may need to install the package
# install.packages("readxl")
library(readxl)
url <- "http://people.duke.edu/~sgrambow/crp241data/assignment1_form_data-2021.xlsx"
destfile <- "assignment1_form_data.xlsx"
curl::curl_download(url, destfile)
assignment1_form_data <- read_excel(destfile)
View(assignment1_form_data)

# Import assignment1-demo-data
url <- "http://people.duke.edu/~sgrambow/crp241data/assignment1_demo_data-2021.xlsx"
destfile <- "assignment1_demo_data.xlsx"
curl::curl_download(url, destfile)
assignment1_demo_data <- read_excel(destfile)
View(assignment1_demo_data)

# Problem 1a -- primary key for form data
# sort form-data & demo-data by id
# in order to sort you need to identify the primary
# key for each data frame.
# for the form data, it is id+form
# for the demo data, it is simply id
sorted_form <- assignment1_form_data[order(assignment1_form_data$id,
                                           assignment1_form_data$form),]
sorted_demo <- assignment1_demo_data[order(assignment1_demo_data$id),]

# Problem 1b
# Figure 1 displays data values for how many unique study participants
# use the unique function
unique.participants <- unique(sorted_form$id)
length(unique.participants) # 4 participants
## [1] 7
# Problem 1c
# In Figure 1, how many of those unique participants 
# have at least one missing data value
# use subset to select only the rows with missing values
missing_atleast1 <- subset(sorted_form, value=='NA')
# Now use unique to find how many unique participants
unique(missing_atleast1$id)
## [1] 401 414
# This gives IDs 150 and 151 so 2 of the 7 uniques
# You can compute this number using length function
length(unique(missing_atleast1$id))
## [1] 2
# which yields the answer 2

# Problem 1d
# Referring again to Figure 1, if you were 
# interested in creating a data file that included 
# only the score values for the 'p2cspec' form and participant id
# 
# lets create a new data frame that 
# take the original figure 1 data
# and subsets on FORM=p2cgp
new.subset <- subset(assignment1_form_data,form=="p2cgp")
new.subset$p2cgp_num <- new.subset$form
new.subset$form <- NULL

# Problem #1 e
# Suppose that an analyst on the study merged the 
# form-data file shown in Figure 1 with the demo-data 
# file shown in Figure 2.  
# merge sorted_form & sorted_demo by id variable
test <- merge(sorted_form,sorted_demo,by="id")

# check merge using a temporary data frame object
str(test)
## 'data.frame':    28 obs. of  8 variables:
##  $ id        : num  401 401 401 401 404 404 404 404 406 406 ...
##  $ form      : chr  "p2cgp" "p3cgp" "p4cgp" "p5cgp" ...
##  $ value     : chr  "2" "NA" "1" "1" ...
##  $ age       : num  57 57 57 57 50 50 50 50 45 45 ...
##  $ sex       : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ migraine  : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ chronicity: num  20 20 20 20 8 8 8 8 5 5 ...
##  $ group     : num  1 1 1 1 0 0 0 0 0 0 ...
View(test)

# rename to final data frame
combined <- test

# write combined data frame back to a .csv file
# completely reproducible process.
write.csv(combined,'combined.csv')