R Markdown

require(ggplot2)

## Loading required package: ggplot2

require(reshape2)

## Loading required package: reshape2

# Load the yearly counts (a Year column plus, presumably, one column per
# software package -- TODO confirm against the downloaded file).
Data_R_SAS_SPSS_Pubs <- read.csv(
  "https://umich.instructure.com/files/2361245/download?download_frd=1",
  header = TRUE
)

# convert to long format (http://www.cookbook-r.com/Manipulating_data/Converting_data_between_wide_and_long_format/)
# read.csv() already returns a data frame, so it is melted directly: Year is
# kept as the identifier and every other column is stacked into Software/value.
df <- melt(Data_R_SAS_SPSS_Pubs, id.vars = "Year", variable.name = "Software")

# One thick line per Software level. A default-width geom_line() layer drawn
# underneath the size-4 line would be hidden by it, so only one layer is used.
ggplot(
  data = df,
  aes(x = Year, y = value, color = Software, group = Software)
) +
  geom_line(size = 4) +
  labs(x = "Year", y = "Citations")

rawdata_wide <- read.table(header=TRUE, text='
CaseID Gender Age Condition1 Condition2
1 M 5 13 10.5
2 F 6 16 11.2
3 F 8 10 18.3
4 M 9 9.5 18.1
5 M 10 12.1 19
')

# Add a factor copy of CaseID as a new column named subject
# (the CaseID column itself is left unchanged)
rawdata_wide$subject <- factor(rawdata_wide$CaseID)
rawdata_wide

##   CaseID Gender Age Condition1 Condition2 subject
## 1      1      M   5       13.0       10.5       1
## 2      2      F   6       16.0       11.2       2
## 3      3      F   8       10.0       18.3       3
## 4      4      M   9        9.5       18.1       4
## 5      5      M  10       12.1       19.0       5

library(reshape2)
# Specify id.vars: the variables to keep (don't split apart on!)
# measure.vars is not given here, so every remaining column is stacked into
# the variable/value pair, including the factor column subject.
melt(rawdata_wide, id.vars=c("CaseID", "Gender"))

## Warning: attributes are not identical across measure variables; they will
## be dropped

##    CaseID Gender   variable value
## 1       1      M        Age     5
## 2       2      F        Age     6
## 3       3      F        Age     8
## 4       4      M        Age     9
## 5       5      M        Age    10
## 6       1      M Condition1    13
## 7       2      F Condition1    16
## 8       3      F Condition1    10
## 9       4      M Condition1   9.5
## 10      5      M Condition1  12.1
## 11      1      M Condition2  10.5
## 12      2      F Condition2  11.2
## 13      3      F Condition2  18.3
## 14      4      M Condition2  18.1
## 15      5      M Condition2    19
## 16      1      M    subject     1
## 17      2      F    subject     2
## 18      3      F    subject     3
## 19      4      M    subject     4
## 20      5      M    subject     5

# Reshape rawdata_wide to long format, stacking only the three listed
# measurement columns (any other non-id column is left out of the result).
data_long <- melt(
  rawdata_wide,
  # Identifier columns: carried along on every output row, never stacked
  id.vars = c("CaseID", "Gender"),
  # Source columns whose values are stacked into one column
  measure.vars = c("Age", "Condition1", "Condition2"),
  # Output column recording which source column each value came from
  variable.name = "Feature",
  # Output column holding the stacked values themselves
  value.name = "Measurement"
)
data_long

##    CaseID Gender    Feature Measurement
## 1       1      M        Age         5.0
## 2       2      F        Age         6.0
## 3       3      F        Age         8.0
## 4       4      M        Age         9.0
## 5       5      M        Age        10.0
## 6       1      M Condition1        13.0
## 7       2      F Condition1        16.0
## 8       3      F Condition1        10.0
## 9       4      M Condition1         9.5
## 10      5      M Condition1        12.1
## 11      1      M Condition2        10.5
## 12      2      F Condition2        11.2
## 13      3      F Condition2        18.3
## 14      4      M Condition2        18.1
## 15      5      M Condition2        19.0

Popular data generation functions are c(), seq(), rep(), and data.frame(). Sometimes, we may also use list() and array() to generate data.

a<-c(1, 2, 3, 5, 6, 7, 10, 1, 4)
a

## [1]  1  2  3  5  6  7 10  1  4

c(list(A = c(Z = 1, Y = 2), B = c(X = 7), C = c(W = 7, V=3, U=-1.9)), recursive = TRUE)

##  A.Z  A.Y  B.X  C.W  C.V  C.U 
##  1.0  2.0  7.0  7.0  3.0 -1.9

seq(1, 20, by=0.5)

##  [1]  1.0  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5
## [15]  8.0  8.5  9.0  9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5
## [29] 15.0 15.5 16.0 16.5 17.0 17.5 18.0 18.5 19.0 19.5 20.0

# Nine equally spaced values from 1 to 20. The argument is spelled out as
# length.out instead of relying on partial matching of "length".
seq(1, 20, length.out = 9)

## [1]  1.000  3.375  5.750  8.125 10.500 12.875 15.250 17.625 20.000

# Index sequence 1, 2, ..., length(v) for the given vector. seq_along() is the
# dedicated helper and avoids partial matching of seq()'s along.with argument.
seq_along(c(5, 4, 5, 6))

## [1] 1 2 3 4

rep(c(1, 2, 3), 4)

##  [1] 1 2 3 1 2 3 1 2 3 1 2 3

rep(c(1, 2, 3), each=4)

##  [1] 1 1 1 1 2 2 2 2 3 3 3 3

# X holds the indices 1..3 of the vector; replicate() then evaluates X + 1
# four times and binds the results as the columns of a 3 x 4 matrix.
X <- seq_along(c(1, 2, 3))
replicate(4, X + 1)

##      [,1] [,2] [,3] [,4]
## [1,]    2    2    2    2
## [2,]    3    3    3    3
## [3,]    4    4    4    4

data.frame(v=1:4, ch=c("a", "B", "C", "d"), n=c(10, 11))

##   v ch  n
## 1 1  a 10
## 2 2  B 11
## 3 3  C 10
## 4 4  d 11

l<-list(a=c(1, 2), b="hi", c=-3+3i)
l

## $a
## [1] 1 2
## 
## $b
## [1] "hi"
## 
## $c
## [1] -3+3i

l$a[[2]]

## [1] 2

l$b

## [1] "hi"

ar <- array(1:24, dim=c(3, 4, 2)); ar

## , , 1
## 
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12
## 
## , , 2
## 
##      [,1] [,2] [,3] [,4]
## [1,]   13   16   19   22
## [2,]   14   17   20   23
## [3,]   15   18   21   24

ar[2, 3, 1]

## [1] 8

ar[2, ,1]

## [1]  2  5  8 11

x <- seq(1, 10, by=0.5)
y <- list(a = 1, b = TRUE, c = "oops")
save(x, y, file="xy.RData")
load("xy.RData")

data("iris")
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Read the whitespace-delimited table (01a_data.txt). as.is = TRUE keeps text
# columns as character instead of converting them to factors.
data.txt <- read.table(
  "https://umich.instructure.com/files/1628628/download?download_frd=1",
  header = TRUE,
  as.is = TRUE
) # 01a_data.txt
summary(data.txt)

##      Name               Team             Position             Height    
##  Length:1034        Length:1034        Length:1034        Min.   :67.0  
##  Class :character   Class :character   Class :character   1st Qu.:72.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :74.0  
##                                                           Mean   :73.7  
##                                                           3rd Qu.:75.0  
##                                                           Max.   :83.0  
##      Weight           Age       
##  Min.   :150.0   Min.   :20.90  
##  1st Qu.:187.0   1st Qu.:25.44  
##  Median :200.0   Median :27.93  
##  Mean   :201.7   Mean   :28.74  
##  3rd Qu.:215.0   3rd Qu.:31.23  
##  Max.   :290.0   Max.   :48.52

# Read the comma-separated file (01_hdp.csv); the first row holds column names.
data.csv <- read.csv(
  "https://umich.instructure.com/files/1628650/download?download_frd=1",
  header = TRUE
) # 01_hdp.csv
summary(data.csv)

##    tumorsize           co2             pain           wound      
##  Min.   : 33.97   Min.   :1.222   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 62.49   1st Qu.:1.519   1st Qu.:4.000   1st Qu.:5.000  
##  Median : 70.07   Median :1.601   Median :5.000   Median :6.000  
##  Mean   : 70.88   Mean   :1.605   Mean   :5.473   Mean   :5.732  
##  3rd Qu.: 79.02   3rd Qu.:1.687   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :116.46   Max.   :2.128   Max.   :9.000   Max.   :9.000  
##     mobility       ntumors        nmorphine        remission     
##  Min.   :1.00   Min.   :0.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:5.00   1st Qu.:1.000   1st Qu.: 2.000   1st Qu.:0.0000  
##  Median :6.00   Median :3.000   Median : 3.000   Median :0.0000  
##  Mean   :6.08   Mean   :3.066   Mean   : 3.624   Mean   :0.2957  
##  3rd Qu.:7.00   3rd Qu.:5.000   3rd Qu.: 5.000   3rd Qu.:1.0000  
##  Max.   :9.00   Max.   :9.000   Max.   :18.000   Max.   :1.0000  
##   lungcapacity          Age           Married    FamilyHx     SmokingHx   
##  Min.   :0.01612   Min.   :26.32   Min.   :0.0   no :6820   current:1705  
##  1st Qu.:0.67647   1st Qu.:46.69   1st Qu.:0.0   yes:1705   former :1705  
##  Median :0.81560   Median :50.93   Median :1.0              never  :5115  
##  Mean   :0.77409   Mean   :50.97   Mean   :0.6                            
##  3rd Qu.:0.91150   3rd Qu.:55.27   3rd Qu.:1.0                            
##  Max.   :0.99980   Max.   :74.48   Max.   :1.0                            
##      Sex       CancerStage  LengthofStay         WBC            RBC       
##  female:5115   I  :2558    Min.   : 1.000   Min.   :2131   Min.   :3.919  
##  male  :3410   II :3409    1st Qu.: 5.000   1st Qu.:5323   1st Qu.:4.802  
##                III:1705    Median : 5.000   Median :6007   Median :4.994  
##                IV : 853    Mean   : 5.492   Mean   :5998   Mean   :4.995  
##                            3rd Qu.: 6.000   3rd Qu.:6663   3rd Qu.:5.190  
##                            Max.   :10.000   Max.   :9776   Max.   :6.065  
##       BMI             IL6                CRP               DID       
##  Min.   :18.38   Min.   : 0.03521   Min.   : 0.0451   Min.   :  1.0  
##  1st Qu.:24.20   1st Qu.: 1.93039   1st Qu.: 2.6968   1st Qu.:100.0  
##  Median :27.73   Median : 3.34400   Median : 4.3330   Median :199.0  
##  Mean   :29.07   Mean   : 4.01698   Mean   : 4.9730   Mean   :203.3  
##  3rd Qu.:32.54   3rd Qu.: 5.40551   3rd Qu.: 6.5952   3rd Qu.:309.0  
##  Max.   :58.00   Max.   :23.72777   Max.   :28.7421   Max.   :407.0  
##    Experience        School        Lawsuits          HID       
##  Min.   : 7.00   average:6405   Min.   :0.000   Min.   : 1.00  
##  1st Qu.:15.00   top    :2120   1st Qu.:1.000   1st Qu.: 9.00  
##  Median :18.00                  Median :2.000   Median :17.00  
##  Mean   :17.64                  Mean   :1.866   Mean   :17.76  
##  3rd Qu.:21.00                  3rd Qu.:3.000   3rd Qu.:27.00  
##  Max.   :29.00                  Max.   :9.000   Max.   :35.00  
##     Medicaid     
##  Min.   :0.1416  
##  1st Qu.:0.3369  
##  Median :0.5215  
##  Mean   :0.5125  
##  3rd Qu.:0.7083  
##  Max.   :0.8187

match(c(1, 2, 4, 5), c(1, 4, 4, 5, 6, 7))

## [1]  1 NA  2  4

length(x) gives us the number of elements in x.

x<-c(1, 3, 10, 23, 1, 3)
length(x)

## [1] 6

dim(x) retrieves or sets the dimension of an object.

x<-1:12
dim(x)<-c(3, 4)
x

##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12

dimnames(x) retrieves or sets the dimension names of an object. For higher-dimensional objects such as matrices or arrays, the names are supplied as a list with one character vector per dimension.

dimnames(x)<-list(c("R1", "R2", "R3"), c("C1", "C2", "C3", "C4")); x

##    C1 C2 C3 C4
## R1  1  4  7 10
## R2  2  5  8 11
## R3  3  6  9 12

nrow(x) number of rows; ncol(x) number of columns.

nrow(x)

## [1] 3

ncol(x)

## [1] 4

class(x) get or set the class of x. Note that we can use unclass(x) to remove the class attribute of x.

class(x)

## [1] "matrix"

class(x)<-"myclass"
x<-unclass(x)
x

##    C1 C2 C3 C4
## R1  1  4  7 10
## R2  2  5  8 11
## R3  3  6  9 12

attr(x, "class")

## NULL

attr(x, "dim")<-c(2, 6)
x

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    3    5    7    9   11
## [2,]    2    4    6    8   10   12

attributes(x) <- list(mycomment = "really special", dim = 3:4, 
                      dimnames = list(LETTERS[1:3], 
                                      letters[1:4]), 
                      names = paste(1:12))
x

##   a b c  d
## A 1 4 7 10
## B 2 5 8 11
## C 3 6 9 12
## attr(,"mycomment")
## [1] "really special"
## attr(,"names")
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"

Data selection and manipulation

x<-c(1, 5, 2, 1, 10, 40, 3)
which.max(x)

## [1] 6

which.min(x)

## [1] 1

rev(x)

## [1]  3 40 10  1  2  5  1

sort(x)

## [1]  1  1  2  3  5 10 40

rev(sort(x))

## [1] 40 10  5  3  2  1  1

cut(x, breaks) divides the range of x into intervals and codes each value by the interval it falls into, returning a factor. breaks is either the number of equal-width intervals or a vector of cut points.

## [1]  1  5  2  1 10 40  3

cut(x, 3)

## [1] (0.961,14] (0.961,14] (0.961,14] (0.961,14] (0.961,14] (27,40]   
## [7] (0.961,14]
## Levels: (0.961,14] (14,27] (27,40]

cut(x, c(0, 5, 20, 30))

## [1] (0,5]  (0,5]  (0,5]  (0,5]  (5,20] <NA>   (0,5] 
## Levels: (0,5] (5,20] (20,30]

which(x == a) returns a vector of the indices of x for which the comparison is TRUE.

## [1]  1  5  2  1 10 40  3

which(x==2)

## [1] 3

na.omit(x) suppresses the observations with missing data (NA). It suppresses the corresponding line if x is a matrix or a data frame.

df<-data.frame(a=1:5, b=c(1, 3, NA, 9, 8)); df

##   a  b
## 1 1  1
## 2 2  3
## 3 3 NA
## 4 4  9
## 5 5  8

na.omit(df)

##   a b
## 1 1 1
## 2 2 3
## 4 4 9
## 5 5 8

unique(x) If x is a vector or a data frame, it returns a similar object but with the duplicate elements suppressed.

df1<-data.frame(a=c(1, 1, 7, 6, 8), b=c(1, 1, NA, 9, 8))
df1

##   a  b
## 1 1  1
## 2 1  1
## 3 7 NA
## 4 6  9
## 5 8  8

unique(df1)

##   a  b
## 1 1  1
## 3 7 NA
## 4 6  9
## 5 8  8

table(x) returns a table with the different values of x and their frequencies (typically for integers or factors).

v<-c(1, 2, 4, 2, 2, 5, 6, 4, 7, 8, 8)
table(v)

## v
## 1 2 4 5 6 7 8 
## 1 3 2 1 1 1 2

subset(x, …) returns a selection of x with respect to criteria

sub<-subset(df1, df1$a>5); sub

##   a  b
## 3 7 NA
## 4 6  9
## 5 8  8

sub<-subset(df1, select=-a)
sub

##    b
## 1  1
## 2  1
## 3 NA
## 4  9
## 5  8

sample(x, size) randomly draws size elements from the vector x without replacement; the option replace = TRUE samples with replacement instead.

##  [1] 1 2 4 2 2 5 6 4 7 8 8

# Draw 20 values from column a; sampling with replacement is required here
# because the column has fewer than 20 entries.
sample(df1$a, 20, replace = TRUE)

##  [1] 7 7 7 8 7 1 1 1 8 6 7 7 8 1 1 1 8 8 8 1

prop.table(x, margin = ) expresses table entries as fractions of the marginal table.

prop.table(table(v))

## v
##          1          2          4          5          6          7 
## 0.09090909 0.27272727 0.18181818 0.09090909 0.09090909 0.09090909 
##          8 
## 0.18181818

Matrix Operations

mat1 <- cbind(c(1, -1/5), c(-1/3, 1))
mat1.inv <- solve(mat1)

mat1.identity <- mat1.inv %*% mat1
mat1.identity

##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1

b <- c(1, 2)
x <- solve (mat1, b)
x

## [1] 1.785714 2.357143

Advanced Data Processing

apply(X, INDEX, FUN = ) returns a vector, array, or list of values obtained by applying a function FUN to the margins (INDEX = 1 means rows, INDEX = 2 means columns) of X.

df1

##   a  b
## 1 1  1
## 2 1  1
## 3 7 NA
## 4 6  9
## 5 8  8

# Mean of each column (margin 2), ignoring missing values
apply(df1, 2, mean, na.rm = TRUE)

##    a    b 
## 4.60 4.75

lapply(X, FUN) apply FUN to each member of the list X. If X is a data frame, it will apply the FUN to each column and return a list.

# Mean of each column, returned as a list; na.rm = TRUE is passed on to mean()
lapply(df1, mean, na.rm = TRUE)

## $a
## [1] 4.6
## 
## $b
## [1] 4.75

lapply(list(a=c(1, 23, 5, 6, 1), b=c(9, 90, 999)), median)

## $a
## [1] 5
## 
## $b
## [1] 90

tapply(X, INDEX, FUN = ) applies FUN to each cell of a ragged array given by X, with indexes equal to INDEX. Note that X is an atomic object, typically a vector.

##  [1] 1 2 4 2 2 5 6 4 7 8 8

# Grouping factor of length 11 that cycles through the levels 1, 2, 3.
# length.out is written in full instead of relying on partial matching.
fac <- factor(rep(1:3, length.out = 11), levels = 1:3)
table(fac)

## fac
## 1 2 3 
## 4 4 3

tapply(v, fac, sum)

##  1  2  3 
## 17 16 16

by(data, INDEX, FUN) apply FUN to data frame data subsetted by INDEX.

by(df1, df1[, 1], sum)

## df1[, 1]: 1
## [1] 4
## -------------------------------------------------------- 
## df1[, 1]: 6
## [1] 15
## -------------------------------------------------------- 
## df1[, 1]: 7
## [1] NA
## -------------------------------------------------------- 
## df1[, 1]: 8
## [1] 16

merge(a, b) merge two data frames by common columns or row names. We can use option by = to specify the index column.

df2<-data.frame(a=c(1, 1, 7, 6, 8), c=1:5)
df2

##   a c
## 1 1 1
## 2 1 2
## 3 7 3
## 4 6 4
## 5 8 5

df3<-merge(df1, df2, by="a")
df3

##   a  b c
## 1 1  1 1
## 2 1  1 2
## 3 1  1 1
## 4 1  1 2
## 5 6  9 4
## 6 7 NA 3
## 7 8  8 5

xtabs(a ~ b, data = x) creates a contingency table from cross-classifying factors.

DF <- as.data.frame(UCBAdmissions)
## 'DF' is a data frame with a grid of the factors and the counts in variable 'Freq'.
DF

##       Admit Gender Dept Freq
## 1  Admitted   Male    A  512
## 2  Rejected   Male    A  313
## 3  Admitted Female    A   89
## 4  Rejected Female    A   19
## 5  Admitted   Male    B  353
## 6  Rejected   Male    B  207
## 7  Admitted Female    B   17
## 8  Rejected Female    B    8
## 9  Admitted   Male    C  120
## 10 Rejected   Male    C  205
## 11 Admitted Female    C  202
## 12 Rejected Female    C  391
## 13 Admitted   Male    D  138
## 14 Rejected   Male    D  279
## 15 Admitted Female    D  131
## 16 Rejected Female    D  244
## 17 Admitted   Male    E   53
## 18 Rejected   Male    E  138
## 19 Admitted Female    E   94
## 20 Rejected Female    E  299
## 21 Admitted   Male    F   22
## 22 Rejected   Male    F  351
## 23 Admitted Female    F   24
## 24 Rejected Female    F  317

## Nice for taking margins ...
xtabs(Freq ~ Gender + Admit, DF)

##         Admit
## Gender   Admitted Rejected
##   Male       1198     1493
##   Female      557     1278

## And for testing independence ...
summary(xtabs(Freq ~ ., DF))

## Call: xtabs(formula = Freq ~ ., data = DF)
## Number of cases in table: 4526 
## Number of factors: 3 
## Test for independence of all factors:
##  Chisq = 2000.3, df = 16, p-value = 0

aggregate(x, by, FUN) splits the data frame x into subsets, computes summary statistics for each, and returns the result in a convenient form. by is a list of grouping elements, that each have the same length as the variables in x.

list(rep(1:3, length=7))

## [[1]]
## [1] 1 2 3 1 2 3 1

aggregate(df3, by=list(rep(1:3, length=7)), sum)

##   Group.1  a  b c
## 1       1 10 10 8
## 2       2  7 10 6
## 3       3  8 NA 4

stack(x, …) transform data, stored as separate columns in a data frame or a list, into a single column and unstack(x, …) is the inverse of stack().

stack(df3)

##    values ind
## 1       1   a
## 2       1   a
## 3       1   a
## 4       1   a
## 5       6   a
## 6       7   a
## 7       8   a
## 8       1   b
## 9       1   b
## 10      1   b
## 11      1   b
## 12      9   b
## 13     NA   b
## 14      8   b
## 15      1   c
## 16      2   c
## 17      1   c
## 18      2   c
## 19      4   c
## 20      3   c
## 21      5   c

unstack(stack(df3))

##   a  b c
## 1 1  1 1
## 2 1  1 2
## 3 1  1 1
## 4 1  1 2
## 5 6  9 4
## 6 7 NA 3
## 7 8  8 5

reshape(x, …) reshapes a data frame between “wide” format with repeated measurements in separate columns of the same record and “long” format with the repeated measurements in separate records. Use direction = “wide” or direction = “long”.

df4 <- data.frame(school = rep(1:3, each = 4), class = rep(9:10, 6), 
                  time = rep(c(1, 1, 2, 2), 3), 
                  score = rnorm(12))
wide <- reshape(df4, 
                idvar = c("school", "class"), 
                direction = "wide")
wide

##    school class    score.1    score.2
## 1       1     9 -0.3709484  0.8752090
## 2       1    10 -0.7630987  1.2038424
## 5       2     9  1.1932622 -1.7511004
## 6       2    10  0.3346709  0.1487124
## 9       3     9 -0.8803714  1.5405514
## 10      3    10  1.9414497  1.4827459

long <- reshape(wide, idvar = c("school", "class"), direction = "long")
long

##        school class time    score.1
## 1.9.1       1     9    1 -0.3709484
## 1.10.1      1    10    1 -0.7630987
## 2.9.1       2     9    1  1.1932622
## 2.10.1      2    10    1  0.3346709
## 3.9.1       3     9    1 -0.8803714
## 3.10.1      3    10    1  1.9414497
## 1.9.2       1     9    2  0.8752090
## 1.10.2      1    10    2  1.2038424
## 2.9.2       2     9    2 -1.7511004
## 2.10.2      2    10    2  0.1487124
## 3.9.2       3     9    2  1.5405514
## 3.10.2      3    10    2  1.4827459

strings

paste(…) concatenates vectors after converting to character. It has a few options: sep = is the string to separate terms (a single space is the default), and collapse = is an optional string to separate “collapsed” results.

a<-"today"
b<-"is a good day"
paste(a, b)

## [1] "today is a good day"

paste(a, b, sep=", ")

## [1] "today, is a good day"

substr(x, start, stop) substrings in a character vector. It can also assign values (with the same length) to part of a string, as substr(x, start, stop) <- value.

a<-"When the going gets tough, the tough get going!"
substr(a, 10, 40)

## [1] "going gets tough, the tough get"

substr(a, 1, 9)<-"........."
a

## [1] ".........going gets tough, the tough get going!"

strsplit(x, split) splits x according to the substring split. Use fixed = TRUE for non-regular expressions.

strsplit("a.b.c", ".", fixed = TRUE)

## [[1]]
## [1] "a" "b" "c"

grep(pattern, x) searches for matches to pattern within x. It will return a vector of the indices of the elements of x that yielded a match.

letters

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"

grep("[a-z]", letters)

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26

gsub(pattern, replacement, x) replacement of matches determined by regular expression matching. sub() is the same but only replaces the first occurrence.

a<-c("e", 0, "kj", 10, ";")
gsub("[a-z]", "letters", a)

## [1] "letters"        "0"              "lettersletters" "10"            
## [5] ";"

sub("[a-z]", "letters", a)

## [1] "letters"  "0"        "lettersj" "10"       ";"

match(x, table) returns a vector of the positions of first matches for the elements of x among table, with a shorthand x %in% table, which returns a logical vector. pmatch(x, table) returns partial matches for the elements of x among table.

x<-c(1, 2, 10, 19, 29)
match(x, c(1, 10))

## [1]  1 NA  2 NA NA

x %in% c(1, 10)

## [1]  TRUE FALSE  TRUE FALSE FALSE

pmatch("m", c("mean", "median", "mode")) # returns NA

## [1] NA

pmatch("med", c("mean", "median", "mode")) # returns 2

## [1] 2

plotting

#QQ Normal Probability Plot

X <- rnorm(1000)
Y <- rcauchy(1500)
# compare X to StdNormal distribution
qqnorm(X, 
       main="Normal Q-Q Plot of the data", 
       xlab="Theoretical Quantiles of the Normal", 
       ylab="Sample Quantiles of the X (Normal) Data")
qqline(X)

qqplot(X, Y)

# Y against StdNormal
qqnorm(Y, 
       main="Normal Q-Q Plot of the data", 
       xlab="Theoretical Quantiles of the Normal", 
       ylab="Sample Quantiles of the Y (Cauchy) Data", ylim= range(-4, 4))
# Why is the y-range specified here?
qqline(Y)

# Q-Q plot data (X) vs. simulation(Y)
# Draw a two-sample Q-Q plot of x against y on fixed, identical axes.
#
# x, y: numeric samples to compare.
# ...:  further arguments forwarded to qqplot() (e.g. main, xlab, plot.it).
#       xlim and ylim are set here and must not be supplied again.
# Returns whatever qqplot() returns (a list holding the sorted x and y values).
myQQ <- function(x, y, ...) {
  # Both axes are fixed to [-4, 4] instead of range(x, y, na.rm = TRUE),
  # presumably so that a few extreme values do not compress the display.
  rang <- c(-4, 4)
  qqplot(x, y, xlim = rang, ylim = rang, ...)
}
myQQ(X, Y) # where the Y is the newly simulated data for X
qqline(X)

# Subsampling
x <- matrix(rnorm(100), ncol = 5)
y <- c(1, seq(19))
z <- cbind(x, y)
z.df <- data.frame(z)
z.df

##             V1          V2           V3         V4          V5  y
## 1  -1.22509468  0.82955659 -1.092971055  0.5878067  1.86982070  1
## 2   0.99245033  0.32499812 -0.436002389  1.5217486  0.98448510  1
## 3   0.97150642  0.80425693  3.062870216  0.4735737 -0.68743995  2
## 4  -0.25781948 -0.32469366  1.469630084 -0.2721595 -0.69735728  3
## 5  -0.49775853  0.04524268 -0.878116986  0.1905110 -0.48990186  4
## 6   1.61538522  0.27183953  0.072936409  0.6678561  0.58405016  5
## 7   2.29011285  0.08029258 -0.956966761  0.6911037 -0.56486222  6
## 8  -1.32503061 -0.63084685 -0.192148196 -1.1622565 -0.58877876  7
## 9   0.60489299 -0.69430928  1.069026019 -0.7955919  0.44487870  8
## 10 -0.57948663  0.18461281  0.001370878  0.3544221  0.56358930  9
## 11  1.18262147 -0.67733420  0.029742922 -1.1426595  0.40858927 10
## 12 -0.05163258  1.78901696 -0.250644104  1.2256983  0.22124809 11
## 13  2.21563405  0.61208738  0.406798982 -0.6885104 -0.82273583 12
## 14 -1.73642202 -0.95839720 -0.427841126 -0.6460477  0.60175773 13
## 15 -0.37743926  0.15708628  2.320528256  0.8858242 -0.38251903 14
## 16  0.41046832 -0.18634933  1.206240937 -1.0122277 -0.07589161 15
## 17 -2.55631033  0.20541021  0.502399456 -0.4503726  0.05881044 16
## 18  0.03435792  1.36978932  0.330742552  0.6657285  0.67296838 17
## 19  0.67374652  0.62210218 -0.843005080  1.2102319 -0.34578502 18
## 20 -0.17829250 -0.28242693  1.550986760  1.6681200 -1.32492210 19

names(z.df)

## [1] "V1" "V2" "V3" "V4" "V5" "y"

# subsetting rows
z.sub <- subset(z.df, y > 2 & (y<10 | V1>0))
z.sub

##             V1          V2           V3         V4          V5  y
## 4  -0.25781948 -0.32469366  1.469630084 -0.2721595 -0.69735728  3
## 5  -0.49775853  0.04524268 -0.878116986  0.1905110 -0.48990186  4
## 6   1.61538522  0.27183953  0.072936409  0.6678561  0.58405016  5
## 7   2.29011285  0.08029258 -0.956966761  0.6911037 -0.56486222  6
## 8  -1.32503061 -0.63084685 -0.192148196 -1.1622565 -0.58877876  7
## 9   0.60489299 -0.69430928  1.069026019 -0.7955919  0.44487870  8
## 10 -0.57948663  0.18461281  0.001370878  0.3544221  0.56358930  9
## 11  1.18262147 -0.67733420  0.029742922 -1.1426595  0.40858927 10
## 13  2.21563405  0.61208738  0.406798982 -0.6885104 -0.82273583 12
## 16  0.41046832 -0.18634933  1.206240937 -1.0122277 -0.07589161 15
## 18  0.03435792  1.36978932  0.330742552  0.6657285  0.67296838 17
## 19  0.67374652  0.62210218 -0.843005080  1.2102319 -0.34578502 18

z.sub1 <- z.df[z.df$y == 1, ]
z.sub1

##           V1        V2         V3        V4        V5 y
## 1 -1.2250947 0.8295566 -1.0929711 0.5878067 1.8698207 1
## 2  0.9924503 0.3249981 -0.4360024 1.5217486 0.9844851 1

z.sub2 <- z.df[z.df$y %in% c(1, 4), ]
z.sub2

##           V1         V2         V3        V4         V5 y
## 1 -1.2250947 0.82955659 -1.0929711 0.5878067  1.8698207 1
## 2  0.9924503 0.32499812 -0.4360024 1.5217486  0.9844851 1
## 5 -0.4977585 0.04524268 -0.8781170 0.1905110 -0.4899019 4

# Subsetting columns: keep every row, but only the first two columns
z.sub6 <- z.df[, c(1, 2)]
z.sub6

##             V1          V2
## 1  -1.22509468  0.82955659
## 2   0.99245033  0.32499812
## 3   0.97150642  0.80425693
## 4  -0.25781948 -0.32469366
## 5  -0.49775853  0.04524268
## 6   1.61538522  0.27183953
## 7   2.29011285  0.08029258
## 8  -1.32503061 -0.63084685
## 9   0.60489299 -0.69430928
## 10 -0.57948663  0.18461281
## 11  1.18262147 -0.67733420
## 12 -0.05163258  1.78901696
## 13  2.21563405  0.61208738
## 14 -1.73642202 -0.95839720
## 15 -0.37743926  0.15708628
## 16  0.41046832 -0.18634933
## 17 -2.55631033  0.20541021
## 18  0.03435792  1.36978932
## 19  0.67374652  0.62210218
## 20 -0.17829250 -0.28242693

Programming

The standard syntax for defining a new function is: function.name <- function(x) { expr; return(value) }, where x is the function's parameter, expr is an expression that may use x, and value is the result handed back to the caller.

# Add two values (scalars or numeric vectors); both arguments default to 0
adding <- function(x = 0, y = 0) {
  total <- x + y
  return(total)
}
adding(x = 5, y = 10)

## [1] 15

Conditional statements: if (cond) {expr} or if (cond) cons.expr else alt.expr

x <- 10
# if/else used as an expression: 10 is not strictly greater than 10, so the
# else branch supplies the value
z <- if (x > 10) "T" else "F"
z

## [1] "F"

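Alternatively, ifelse(test, yes, no) is a vectorized and efficient conditional: it evaluates the condition element-wise over a whole vector and returns the matching yes or no value for each element, with no explicit loop. For example, ifelse(c(5, 15) > 10, "T", "F") returns "F" "T". Vectorized operations like this are one of the main advantages of R.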

For loop: for (var in seq) expr

# Fill a vector with the integers 1..10, one element per iteration.
# The vector is preallocated so the loop does not re-grow it on every step.
x <- integer(10)
for (i in seq_len(10)) {
  x[i] <- i
}
x

##  [1]  1  2  3  4  5  6  7  8  9 10

Data simulation

# Load the case-study data. The URL must stay on a single line: a line break
# inside the string literal would become part of the URL.
# as.is = TRUE keeps text columns as character instead of converting them.
data_1 <- read.csv(
  "https://umich.instructure.com/files/1628625/download?download_frd=1",
  as.is = TRUE, header = TRUE
)
# data_1 = read.csv(file.choose( ))
attach(data_1)
# to ensure all variables are accessible within R, e.g., using "age" instead of data_1$age
# NOTE(review): attach() can silently mask other objects; prefer data_1$age
# (as used further down) in new code.

# Data dictionary:
# i2: maximum number of drinks (standard units) consumed per day (in the past 30 days, range 0-184); see also i1
# treat: randomization group (0=usual care, 1=HELP clinic)
# pcs: SF-36 Physical Component Score (range 14-75)
# mcs: SF-36 Mental Component Score (range 7-62)
# cesd: Center for Epidemiologic Studies Depression scale (range 0-60)
# indtot: Inventory of Drug Use Consequences (InDUC) total score (range 4-45)
# pss_fr: perceived social supports (friends, range 0-14); see also dayslink
# drugrisk: Risk-Assessment Battery (RAB) drug risk score (range 0-21)
# satreat: any BSAS substance abuse treatment at baseline (0=no, 1=yes)

summary(data_1)

##        ID               i2              age            treat       
##  Min.   :  1.00   Min.   :  0.00   Min.   : 3.00   Min.   :0.0000  
##  1st Qu.: 24.25   1st Qu.:  1.00   1st Qu.:27.00   1st Qu.:0.0000  
##  Median : 50.50   Median : 15.50   Median :34.00   Median :0.0000  
##  Mean   : 50.29   Mean   : 27.08   Mean   :34.31   Mean   :0.1222  
##  3rd Qu.: 74.75   3rd Qu.: 39.00   3rd Qu.:43.00   3rd Qu.:0.0000  
##  Max.   :100.00   Max.   :137.00   Max.   :65.00   Max.   :2.0000  
##     homeless           pcs             mcs             cesd      
##  Min.   :0.0000   Min.   : 6.00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:0.0000   1st Qu.:41.25   1st Qu.:20.25   1st Qu.:17.25  
##  Median :0.0000   Median :48.50   Median :29.00   Median :30.00  
##  Mean   :0.1444   Mean   :47.61   Mean   :30.49   Mean   :30.21  
##  3rd Qu.:0.0000   3rd Qu.:57.00   3rd Qu.:39.75   3rd Qu.:43.00  
##  Max.   :1.0000   Max.   :76.00   Max.   :93.00   Max.   :68.00  
##      indtot          pss_fr          drugrisk         sexrisk      
##  Min.   : 0.00   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:31.25   1st Qu.: 2.000   1st Qu.: 0.000   1st Qu.: 1.250  
##  Median :36.00   Median : 6.000   Median : 0.000   Median : 5.000  
##  Mean   :37.03   Mean   : 6.533   Mean   : 2.578   Mean   : 4.922  
##  3rd Qu.:45.00   3rd Qu.:10.000   3rd Qu.: 3.000   3rd Qu.: 7.750  
##  Max.   :60.00   Max.   :20.000   Max.   :23.000   Max.   :13.000  
##     satreat            female         substance           racegrp         
##  Min.   :0.00000   Min.   :0.00000   Length:90          Length:90         
##  1st Qu.:0.00000   1st Qu.:0.00000   Class :character   Class :character  
##  Median :0.00000   Median :0.00000   Mode  :character   Mode  :character  
##  Mean   :0.07778   Mean   :0.05556                                        
##  3rd Qu.:0.00000   3rd Qu.:0.00000                                        
##  Max.   :1.00000   Max.   :1.00000

# Draw 200 values from a normal distribution with mean 10 and standard
# deviation 20 (argument names spelled out in full; no partial matching)
x.norm <- rnorm(n = 200, mean = 10, sd = 20)
hist(x.norm, main = "N(10, 20) Histogram")

# Sample mean of the observed ages
mean(data_1[["age"]])

## [1] 34.31111

# Sample standard deviation of the observed ages
sd(data_1[["age"]])

## [1] 11.68947

Next, we will simulate new synthetic data to match the properties/characteristics of the observed data (using Uniform, Normal, and Poisson distributions).

# Properties of the observed variables (ranges, moments, or category sets):
# i2 [0: 184]
# age m=34, sd=12
# treat {0, 1}
# homeless {0, 1}
# pcs 14-75
# mcs 7-62
# cesd 0-60
# indtot 4-45
# pss_fr 0-14
# drugrisk 0-21
# sexrisk
# satreat (0=no, 1=yes)
# female (0=no, 1=yes)
# racegrp (black, white, other)
# Demographics variables
# Define number of subjects
NumSubj <- 282
# Number of repeated rows per subject (each per-subject value is repeated
# NumTime times when the data set is assembled further down)
NumTime <- 4
# Define data elements
# Cases: subject identifiers, listed in increasing order; the vector holds
# 282 entries, i.e. one per subject
Cases <- c(2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 20, 21, 22, 23, 24,
25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 37, 41, 42, 43, 44, 45, 53, 55, 58,
60, 62, 67, 69, 71, 72, 74, 79, 80, 85, 87, 90, 95, 97, 99, 100, 101, 106,
107, 109, 112, 120, 123, 125, 128, 129, 132, 134, 136, 139, 142, 147, 149,
153, 158, 160, 162, 163, 167, 172, 174, 178, 179, 180, 182, 192, 195, 201,
208, 211, 215, 217, 223, 227, 228, 233, 235, 236, 240, 245, 248, 250, 251,
254, 257, 259, 261, 264, 268, 269, 272, 273, 275, 279, 288, 289, 291, 296,
298, 303, 305, 309, 314, 318, 324, 325, 326, 328, 331, 332, 333, 334, 336,
338, 339, 341, 344, 346, 347, 350, 353, 354, 359, 361, 363, 364, 366, 367,
368, 369, 370, 371, 372, 374, 375, 376, 377, 378, 381, 382, 384, 385, 386,
387, 389, 390, 393, 395, 398, 400, 410, 421, 423, 428, 433, 435, 443, 447,
449, 450, 451, 453, 454, 455, 456, 457, 458, 459, 460, 461, 465, 466, 467,
470, 471, 472, 476, 477, 478, 479, 480, 481, 483, 484, 485, 486, 487, 488,
489, 492, 493, 494, 496, 498, 501, 504, 507, 510, 513, 515, 528, 530, 533,
537, 538, 542, 545, 546, 549, 555, 557, 559, 560, 566, 572, 573, 576, 582,
586, 590, 592, 597, 603, 604, 611, 619, 621, 623, 624, 625, 631, 633, 634,
635, 637, 640, 641, 643, 644, 645, 646, 647, 648, 649, 650, 652, 654, 656,
658, 660, 664, 665, 670, 673, 677, 678, 679, 680, 682, 683, 686, 687, 688,
689, 690, 692)
# Imaging Biomarkers: one Poisson draw per subject for every region-of-interest
# measure; lambda is the target mean of that measure. The statements stay in
# their original order so the random-number stream is consumed identically.
L_caudate_ComputeArea <- rpois(n = NumSubj, lambda = 600)
L_caudate_Volume <- rpois(n = NumSubj, lambda = 800)
R_caudate_ComputeArea <- rpois(n = NumSubj, lambda = 893)
R_caudate_Volume <- rpois(n = NumSubj, lambda = 1000)
L_putamen_ComputeArea <- rpois(n = NumSubj, lambda = 900)
L_putamen_Volume <- rpois(n = NumSubj, lambda = 1400)
R_putamen_ComputeArea <- rpois(n = NumSubj, lambda = 1300)
R_putamen_Volume <- rpois(n = NumSubj, lambda = 3000)
L_hippocampus_ComputeArea <- rpois(n = NumSubj, lambda = 1300)
L_hippocampus_Volume <- rpois(n = NumSubj, lambda = 3200)
R_hippocampus_ComputeArea <- rpois(n = NumSubj, lambda = 1500)
R_hippocampus_Volume <- rpois(n = NumSubj, lambda = 3800)
cerebellum_ComputeArea <- rpois(n = NumSubj, lambda = 16700)
cerebellum_Volume <- rpois(n = NumSubj, lambda = 14000)
L_lingual_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3300)
L_lingual_gyrus_Volume <- rpois(n = NumSubj, lambda = 11000)
R_lingual_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3300)
R_lingual_gyrus_Volume <- rpois(n = NumSubj, lambda = 12000)
L_fusiform_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3600)
L_fusiform_gyrus_Volume <- rpois(n = NumSubj, lambda = 11000)
R_fusiform_gyrus_ComputeArea <- rpois(n = NumSubj, lambda = 3300)
R_fusiform_gyrus_Volume <- rpois(n = NumSubj, lambda = 10000)

# Demographics: a 0/1 sex indicator (1 when the uniform draw is at least 0.5)
# and normal draws for weight and age, truncated to integers
Sex <- as.numeric(runif(n = NumSubj) >= 0.5)
Weight <- as.integer(rnorm(n = NumSubj, mean = 80, sd = 10))
Age <- as.integer(rnorm(n = NumSubj, mean = 62, sd = 10))

# Diagnostic labels (DX): subjects are ordered PD first, then HC, then SWEDD.
# The group sizes are defined once here and reused by every group-specific
# draw below; they must add up to the number of subjects (282).
nPD <- 100
nHC <- 100
nSWEDD <- 82
Dx <- rep(c("PD", "HC", "SWEDD"), times = c(nPD, nHC, nSWEDD))

# Genetics traits: Bernoulli trials (one per subject) whose probability of a
# 0 differs by diagnostic group
chr12_rs34637584_GT <- c(
  ifelse(runif(nPD) < .3, 0, 1),
  ifelse(runif(nHC) < .6, 0, 1),
  ifelse(runif(nSWEDD) < .4, 0, 1)
)
chr17_rs11868035_GT <- c(
  ifelse(runif(nPD) < .7, 0, 1),
  ifelse(runif(nHC) < .4, 0, 1),
  ifelse(runif(nSWEDD) < .5, 0, 1)
)

# Clinical # rpois(NumSubj, 15) + rpois(NumSubj, 6)
# UPDRS part I: sum of two Bernoulli trials per subject, so values are 0, 1 or 2
UPDRS_part_I <- c(
  ifelse(runif(nPD) < .7, 0, 1) + ifelse(runif(nPD) < .7, 0, 1),
  ifelse(runif(nHC) < .6, 0, 1) + ifelse(runif(nHC) < .6, 0, 1),
  ifelse(runif(nSWEDD) < .4, 0, 1) + ifelse(runif(nSWEDD) < .4, 0, 1)
)

# UPDRS parts II and III: integers drawn uniformly from 1..max, where the
# maximum depends on the diagnostic group
UPDRS_part_II <- c(
  sample.int(20, nPD, replace = TRUE),
  sample.int(14, nHC, replace = TRUE),
  sample.int(18, nSWEDD, replace = TRUE)
)
UPDRS_part_III <- c(
  sample.int(30, nPD, replace = TRUE),
  sample.int(20, nHC, replace = TRUE),
  sample.int(25, nSWEDD, replace = TRUE)
)

# Time: VisitTime - done automatically below in aggregator
# Data (putting all components together): every per-subject vector is repeated
# NumTime times so each subject contributes one row per visit, and the visit
# times cycle once per subject. The columns are named directly in the cbind()
# call. Because Dx is character, cbind() returns a character matrix.
sim_PD_Data <- cbind(
  Cases = rep(Cases, each = NumTime),
  L_caudate_ComputeArea = rep(L_caudate_ComputeArea, each = NumTime), # Imaging
  Sex = rep(Sex, each = NumTime), # Demographics
  Weight = rep(Weight, each = NumTime),
  Age = rep(Age, each = NumTime),
  Dx = rep(Dx, each = NumTime), # Dx
  chr12_rs34637584_GT = rep(chr12_rs34637584_GT, each = NumTime), # Genetics
  chr17_rs11868035_GT = rep(chr17_rs11868035_GT, each = NumTime),
  UPDRS_part_I = rep(UPDRS_part_I, each = NumTime), # Clinical
  UPDRS_part_II = rep(UPDRS_part_II, each = NumTime),
  UPDRS_part_III = rep(UPDRS_part_III, each = NumTime),
  Time = rep(c(0, 6, 12, 18), times = NumSubj) # Time
)

# some QC
summary(sim_PD_Data)

##      Cases      L_caudate_ComputeArea Sex         Weight         Age     
##  10     :   4   604    : 40           0:620   80     : 64   61     : 68  
##  100    :   4   592    : 32           1:508   81     : 64   59     : 56  
##  101    :   4   594    : 28                   84     : 56   60     : 56  
##  106    :   4   597    : 28                   72     : 52   63     : 56  
##  107    :   4   575    : 24                   76     : 52   56     : 52  
##  109    :   4   580    : 24                   70     : 48   58     : 52  
##  (Other):1104   (Other):952                   (Other):792   (Other):788  
##      Dx      chr12_rs34637584_GT chr17_rs11868035_GT UPDRS_part_I
##  HC   :400   0:468               0:616               0:380       
##  PD   :400   1:660               1:512               1:548       
##  SWEDD:328                                           2:200       
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##  UPDRS_part_II UPDRS_part_III Time    
##  10     : 80   12     : 84    0 :282  
##  3      : 80   17     : 64    12:282  
##  13     : 76   10     : 60    18:282  
##  1      : 72   1      : 56    6 :282  
##  12     : 72   16     : 52            
##  14     : 72   8      : 52            
##  (Other):676   (Other):760

# Dimensions (rows, columns): one row per subject-visit, one column per variable
dim(sim_PD_Data)

## [1] 1128   12

# Peek at the first six rows
head(sim_PD_Data, n = 6L)

##      Cases L_caudate_ComputeArea Sex Weight Age  Dx   chr12_rs34637584_GT
## [1,] "2"   "592"                 "0" "94"   "62" "PD" "1"                
## [2,] "2"   "592"                 "0" "94"   "62" "PD" "1"                
## [3,] "2"   "592"                 "0" "94"   "62" "PD" "1"                
## [4,] "2"   "592"                 "0" "94"   "62" "PD" "1"                
## [5,] "3"   "628"                 "0" "73"   "43" "PD" "1"                
## [6,] "3"   "628"                 "0" "73"   "43" "PD" "1"                
##      chr17_rs11868035_GT UPDRS_part_I UPDRS_part_II UPDRS_part_III Time
## [1,] "0"                 "1"          "14"          "3"            "0" 
## [2,] "0"                 "1"          "14"          "3"            "6" 
## [3,] "0"                 "1"          "14"          "3"            "12"
## [4,] "0"                 "1"          "14"          "3"            "18"
## [5,] "0"                 "0"          "18"          "17"           "0" 
## [6,] "0"                 "0"          "18"          "17"           "6"

# Compare the observed age distribution (histogram on the density scale) with
# the simulated ages (kernel density curve).
hist(data_1$age, freq = FALSE, right = FALSE, ylim = c(0, 0.05))
# Read Age straight from the character matrix. Going through as.data.frame()
# can turn the column into a factor (the default before R 4.0), and
# as.numeric() on a factor returns level codes instead of the ages.
lines(density(as.numeric(sim_PD_Data[, "Age"])), lwd = 2, col = "blue")
legend("topright", c("Raw Data", "Simulated Data"), fill = c("black", "blue"))

# Save Results
# Write out (save) the result to a file that can be shared.
# The separator is a bare comma: with ", " every quoted field would be
# preceded by a space, and CSV readers such as read.csv() do not recognise a
# quote that is not at the very start of a field.
write.table(
  sim_PD_Data, "output_data.csv",
  sep = ",", row.names = FALSE, col.names = TRUE
)

HTML SOCR Data Import

SOCR Datasets can automatically be downloaded into the R environment using the following protocol.

library(rvest)

## Loading required package: xml2

# Download and parse the SOCR wiki page (wiki_url holds the parsed document,
# not the URL string), then select the node with id "content"
wiki_url <- read_html(
  "http://wiki.socr.umich.edu/index.php/SOCR_Data_PD_BiomedBigMetadata"
)
html_nodes(wiki_url, css = "#content")

## {xml_nodeset (1)}
## [1] <div id="content" class="mw-body-primary" role="main">\n\t<a id="top ...

# Convert the first <table> element on the page into a data frame, then
# inspect its first rows and per-column summaries
pd_data <- html_table(html_nodes(wiki_url, css = "table")[[1]])
head(pd_data)
summary(pd_data)

##   Cases L_caudate_ComputeArea L_caudate_Volume R_caudate_ComputeArea
## 1     2                   597              767                   855
## 2     2                   597              767                   855
## 3     2                   597              767                   855
## 4     2                   597              767                   855
## 5     3                   604              873                   935
## 6     3                   604              873                   935
##   R_caudate_Volume L_putamen_ComputeArea L_putamen_Volume
## 1              968                   842             1357
## 2              968                   842             1357
## 3              968                   842             1357
## 4              968                   842             1357
## 5             1043                   892             1366
## 6             1043                   892             1366
##   R_putamen_ComputeArea R_putamen_Volume L_hippocampus_ComputeArea
## 1                  1285             3052                      1306
## 2                  1285             3052                      1306
## 3                  1285             3052                      1306
## 4                  1285             3052                      1306
## 5                  1305             2920                      1292
## 6                  1305             2920                      1292
##   L_hippocampus_Volume R_hippocampus_ComputeArea R_hippocampus_Volume
## 1                 3238                      1513                 3759
## 2                 3238                      1513                 3759
## 3                 3238                      1513                 3759
## 4                 3238                      1513                 3759
## 5                 3079                      1516                 3827
## 6                 3079                      1516                 3827
##   cerebellum_ComputeArea cerebellum_Volume L_lingual_gyrus_ComputeArea
## 1                  16845             13949                        3268
## 2                  16845             13949                        3268
## 3                  16845             13949                        3268
## 4                  16845             13949                        3268
## 5                  16698             14076                        3243
## 6                  16698             14076                        3243
##   L_lingual_gyrus_Volume R_lingual_gyrus_ComputeArea
## 1                  11130                        3294
## 2                  11130                        3294
## 3                  11130                        3294
## 4                  11130                        3294
## 5                  11033                        3190
## 6                  11033                        3190
##   R_lingual_gyrus_Volume L_fusiform_gyrus_ComputeArea
## 1                  12221                         3625
## 2                  12221                         3625
## 3                  12221                         3625
## 4                  12221                         3625
## 5                  12187                         3631
## 6                  12187                         3631
##   L_fusiform_gyrus_Volume R_fusiform_gyrus_ComputeArea
## 1                   11087                         3232
## 2                   11087                         3232
## 3                   11087                         3232
## 4                   11087                         3232
## 5                   11116                         3302
## 6                   11116                         3302
##   R_fusiform_gyrus_Volume Sex Weight Age Dx chr12_rs34637584_GT
## 1                   10122   1     84  67 PD                   1
## 2                   10122   1     84  67 PD                   1
## 3                   10122   1     84  67 PD                   1
## 4                   10122   1     84  67 PD                   1
## 5                   10162   0     97  39 PD                   1
## 6                   10162   0     97  39 PD                   1
##   chr17_rs11868035_GT UPDRS_part_I UPDRS_part_II UPDRS_part_III Time
## 1                   0            1            12              1    0
## 2                   0            1            12              1    6
## 3                   0            1            12              1   12
## 4                   0            1            12              1   18
## 5                   1            0            19             22    0
## 6                   1            0            19             22    6

##      Cases       L_caudate_ComputeArea L_caudate_Volume
##  Min.   :  2.0   Min.   :525.0         Min.   :719.0   
##  1st Qu.:158.0   1st Qu.:582.0         1st Qu.:784.0   
##  Median :363.5   Median :600.0         Median :800.0   
##  Mean   :346.1   Mean   :600.4         Mean   :800.3   
##  3rd Qu.:504.0   3rd Qu.:619.0         3rd Qu.:819.0   
##  Max.   :692.0   Max.   :667.0         Max.   :890.0   
##  R_caudate_ComputeArea R_caudate_Volume L_putamen_ComputeArea
##  Min.   :795.0         Min.   : 916     Min.   : 815.0       
##  1st Qu.:875.0         1st Qu.: 979     1st Qu.: 879.0       
##  Median :897.0         Median : 998     Median : 897.5       
##  Mean   :894.5         Mean   :1001     Mean   : 898.9       
##  3rd Qu.:916.0         3rd Qu.:1022     3rd Qu.: 919.0       
##  Max.   :977.0         Max.   :1094     Max.   :1003.0       
##  L_putamen_Volume R_putamen_ComputeArea R_putamen_Volume
##  Min.   :1298     Min.   :1198          Min.   :2846    
##  1st Qu.:1376     1st Qu.:1276          1st Qu.:2959    
##  Median :1400     Median :1302          Median :3000    
##  Mean   :1400     Mean   :1300          Mean   :3000    
##  3rd Qu.:1427     3rd Qu.:1321          3rd Qu.:3039    
##  Max.   :1507     Max.   :1392          Max.   :3148    
##  L_hippocampus_ComputeArea L_hippocampus_Volume R_hippocampus_ComputeArea
##  Min.   :1203              Min.   :3036         Min.   :1414             
##  1st Qu.:1277              1st Qu.:3165         1st Qu.:1479             
##  Median :1300              Median :3200         Median :1504             
##  Mean   :1302              Mean   :3198         Mean   :1504             
##  3rd Qu.:1325              3rd Qu.:3228         3rd Qu.:1529             
##  Max.   :1422              Max.   :3381         Max.   :1602             
##  R_hippocampus_Volume cerebellum_ComputeArea cerebellum_Volume
##  Min.   :3634         Min.   :16378          Min.   :13680    
##  1st Qu.:3761         1st Qu.:16617          1st Qu.:13933    
##  Median :3802         Median :16699          Median :13996    
##  Mean   :3799         Mean   :16700          Mean   :14002    
##  3rd Qu.:3833         3rd Qu.:16784          3rd Qu.:14077    
##  Max.   :4013         Max.   :17096          Max.   :14370    
##  L_lingual_gyrus_ComputeArea L_lingual_gyrus_Volume
##  Min.   :3136                Min.   :10709         
##  1st Qu.:3262                1st Qu.:10943         
##  Median :3299                Median :11007         
##  Mean   :3300                Mean   :11010         
##  3rd Qu.:3333                3rd Qu.:11080         
##  Max.   :3469                Max.   :11488         
##  R_lingual_gyrus_ComputeArea R_lingual_gyrus_Volume
##  Min.   :3135                Min.   :11679         
##  1st Qu.:3258                1st Qu.:11935         
##  Median :3294                Median :12001         
##  Mean   :3296                Mean   :12008         
##  3rd Qu.:3338                3rd Qu.:12079         
##  Max.   :3490                Max.   :12324         
##  L_fusiform_gyrus_ComputeArea L_fusiform_gyrus_Volume
##  Min.   :3446                 Min.   :10682          
##  1st Qu.:3554                 1st Qu.:10947          
##  Median :3594                 Median :11016          
##  Mean   :3598                 Mean   :11011          
##  3rd Qu.:3637                 3rd Qu.:11087          
##  Max.   :3763                 Max.   :11394          
##  R_fusiform_gyrus_ComputeArea R_fusiform_gyrus_Volume      Sex        
##  Min.   :3094                 Min.   : 9736           Min.   :0.0000  
##  1st Qu.:3260                 1st Qu.: 9928           1st Qu.:0.0000  
##  Median :3296                 Median : 9994           Median :1.0000  
##  Mean   :3299                 Mean   : 9996           Mean   :0.5851  
##  3rd Qu.:3332                 3rd Qu.:10058           3rd Qu.:1.0000  
##  Max.   :3443                 Max.   :10235           Max.   :1.0000  
##      Weight            Age             Dx            chr12_rs34637584_GT
##  Min.   : 51.00   Min.   :31.00   Length:1128        Min.   :0.000      
##  1st Qu.: 71.00   1st Qu.:54.00   Class :character   1st Qu.:0.000      
##  Median : 78.50   Median :61.00   Mode  :character   Median :1.000      
##  Mean   : 78.45   Mean   :60.64                      Mean   :0.539      
##  3rd Qu.: 84.00   3rd Qu.:68.00                      3rd Qu.:1.000      
##  Max.   :109.00   Max.   :87.00                      Max.   :1.000      
##  chr17_rs11868035_GT  UPDRS_part_I   UPDRS_part_II    UPDRS_part_III 
##  Min.   :0.0000      Min.   :0.000   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:0.0000      1st Qu.:0.000   1st Qu.: 5.000   1st Qu.: 6.00  
##  Median :0.0000      Median :1.000   Median : 9.000   Median :13.00  
##  Mean   :0.4184      Mean   :0.773   Mean   : 8.879   Mean   :13.02  
##  3rd Qu.:1.0000      3rd Qu.:1.000   3rd Qu.:13.000   3rd Qu.:18.00  
##  Max.   :1.0000      Max.   :2.000   Max.   :20.000   Max.   :30.00  
##       Time     
##  Min.   : 0.0  
##  1st Qu.: 4.5  
##  Median : 9.0  
##  Mean   : 9.0  
##  3rd Qu.:13.5  
##  Max.   :18.0

Predictive Analytics

Kushan De Silva

4 March 2019