require(plyr)
## Loading required package: plyr
require(psych)
## Loading required package: psych
# Read the case/control table; the first row of the CSV supplies the column names.
df <- read.csv(file = "matching_data.csv", header = TRUE)
# Auto-print the freshly loaded data frame for a visual check.
df
## ID COPD V1 V2 Gender Age
## 1 A04271 cont 0 0 1 41
## 2 A23215 cont 0 0 1 64
## 3 A23216 cont 0 0 0 73
## 4 A38144 case 0 0 1 41
## 5 A38686 cont 0 0 0 45
## 6 B16848 case 0 0 1 64
## 7 C12345 case 0 0 1 45
## 8 D12345 cont 0 0 1 41
## 9 E12345 case 0 0 1 41
## 10 F12345 case 0 0 0 41
## 11 G12345 cont 0 0 0 41
## 12 H12345 case 0 0 0 41
## 13 I2345 case 0 0 0 41
## 14 G23454 cont 0 0 0 41
## 15 F12345 cont 0 0 0 41
## 16 F12345 cont 0 0 0 41
## 17 F12345 cont 0 0 0 41
## 18 F12345 cont 0 0 0 41
# Sort the rows by Age, then Gender, then COPD, so that every Age/Gender
# stratum forms one contiguous run of rows. arrange() also resets the row names.
df <- plyr::arrange(df, Age, Gender, COPD)
# Auto-print the sorted data frame.
df
## ID COPD V1 V2 Gender Age
## 1 F12345 case 0 0 0 41
## 2 H12345 case 0 0 0 41
## 3 I2345 case 0 0 0 41
## 4 G12345 cont 0 0 0 41
## 5 G23454 cont 0 0 0 41
## 6 F12345 cont 0 0 0 41
## 7 F12345 cont 0 0 0 41
## 8 F12345 cont 0 0 0 41
## 9 F12345 cont 0 0 0 41
## 10 A38144 case 0 0 1 41
## 11 E12345 case 0 0 1 41
## 12 A04271 cont 0 0 1 41
## 13 D12345 cont 0 0 1 41
## 14 A38686 cont 0 0 0 45
## 15 C12345 case 0 0 1 45
## 16 B16848 case 0 0 1 64
## 17 A23215 cont 0 0 1 64
## 18 A23216 cont 0 0 0 73
# Number the Age/Gender strata so the data can be subset one stratum at a time.
#
# df2 gets one row per observed Age/Gender combination; its `freq` column is the
# number of rows of df in that combination and `seq` is the stratum number.
# The call is namespace-qualified so a later-attached package cannot change
# which count() is used.
df2 <- plyr::count(df, c("Age", "Gender"))
# seq_len() stays empty for an empty df2, where 1:nrow(df2) would give c(1, 0).
df2$seq <- seq_len(nrow(df2))
# Repeat each stratum number once per row of that stratum, in a single
# vectorised call instead of growing the vector inside a loop.
# NOTE(review): the labels are assigned purely by position, so this relies on
# df already being sorted by Age and Gender and on count() returning the
# combinations in that same order -- confirm if the sort step ever changes.
Group <- rep(df2$seq, times = df2$freq)
df$Group <- Group
# Pair cases with controls inside every Age/Gender stratum (df$Group).
#
# Within each stratum the cases and the controls are shuffled independently.
# Shuffled cases receive the odd slots no = 1, 3, 5, ... and shuffled controls
# the even slots no = 2, 4, 6, ..., so after sorting by `no` the k-th case
# (no = 2k - 1) sits directly above the k-th control (no = 2k). Rows without a
# partner keep their slot number, which leaves gaps in `no`.
#
# The pairing depends on the random number generator state; call set.seed()
# beforehand if it has to be reproducible.
group_ids <- unique(df$Group)
# One slot per stratum; everything is bound together once after the loop
# instead of re-copying df_new with rbind() on every iteration.
matched_groups <- vector("list", length(group_ids))
for (k in seq_along(group_ids)) {
  i <- group_ids[k]
  df_subset <- subset(df, Group == i)

  # Cases: random order, odd slot numbers. An empty set is left untouched.
  df_subset_case <- subset(df_subset, COPD == "case")
  n_case <- nrow(df_subset_case)
  if (n_case > 0) {
    df_subset_case <- df_subset_case[sample.int(n_case), , drop = FALSE]
    df_subset_case$no <- 2 * seq_len(n_case) - 1
  }

  # Controls: random order, even slot numbers. An empty set is left untouched.
  df_subset_cont <- subset(df_subset, COPD == "cont")
  n_cont <- nrow(df_subset_cont)
  if (n_cont > 0) {
    df_subset_cont <- df_subset_cont[sample.int(n_cont), , drop = FALSE]
    df_subset_cont$no <- 2 * seq_len(n_cont)
  }

  # rbind() drops zero-row data frames, so a stratum holding only cases or only
  # controls still binds cleanly although the empty side has no `no` column.
  df_subset <- rbind(df_subset_case, df_subset_cont)
  # Sorting by slot number interleaves the cases with the controls.
  df_subset <- plyr::arrange(df_subset, no)
  matched_groups[[k]] <- df_subset
}
# Stack the strata in group order; this yields NULL when there are no groups.
df_new <- do.call(rbind, matched_groups)
df_new
## ID COPD V1 V2 Gender Age Group no
## 1 I2345 case 0 0 0 41 1 1
## 2 G23454 cont 0 0 0 41 1 2
## 3 H12345 case 0 0 0 41 1 3
## 4 F12345 cont 0 0 0 41 1 4
## 5 F12345 case 0 0 0 41 1 5
## 6 G12345 cont 0 0 0 41 1 6
## 7 F12345 cont 0 0 0 41 1 8
## 8 F12345 cont 0 0 0 41 1 10
## 9 F12345 cont 0 0 0 41 1 12
## 10 A38144 case 0 0 1 41 2 1
## 11 D12345 cont 0 0 1 41 2 2
## 12 E12345 case 0 0 1 41 2 3
## 13 A04271 cont 0 0 1 41 2 4
## 14 A38686 cont 0 0 0 45 3 2
## 15 C12345 case 0 0 1 45 4 1
## 16 B16848 case 0 0 1 64 5 1
## 17 A23215 cont 0 0 1 64 5 2
## 18 A23216 cont 0 0 0 73 6 2