Day2 - Practise

library(reshape)
## Loading required package: plyr
## 
## Attaching package: 'reshape'
## 
## The following objects are masked from 'package:plyr':
## 
##     rename, round_any
library(ggplot2)

# Load the GSE7390 clinical annotation table (tab-separated, with a header row).
# stringsAsFactors is set explicitly so that text columns are read as factors
# on every R version (the default became FALSE in R 4.0.0); the str() and
# summary() output below relies on factor columns.
clindat <- read.table("~/scripts/day2/GSE7390.clindat.txt", header = TRUE, sep = "\t",
    stringsAsFactors = TRUE)
# Keep only the rows that have no missing value in any column.
clindat <- clindat[complete.cases(clindat), ]

str(clindat)
## 'data.frame':    80 obs. of  28 variables:
##  $ Patient     : Factor w/ 198 levels "GSM177885","GSM177886",..: 1 2 3 4 5 6 7 8 10 11 ...
##  $ AOL_os_10y  : num  62.7 69 66.2 84.9 80.2 83.1 80.6 82.1 81.5 77.8 ...
##  $ Angioinv    : int  1 1 0 1 1 1 1 1 2 1 ...
##  $ Histtype    : int  1 2 1 1 1 2 1 2 2 1 ...
##  $ Lymp_infil  : int  2 3 2 3 2 2 3 2 3 2 ...
##  $ NPI         : num  4.6 4.6 4.5 4.36 3.6 3.4 4.4 2.5 3.5 4.4 ...
##  $ Surgery_type: int  0 0 1 0 0 0 1 1 0 1 ...
##  $ age         : int  57 57 48 42 46 58 44 58 38 59 ...
##  $ e.dmfs      : int  1 0 1 1 1 0 0 1 1 1 ...
##  $ e.os        : int  1 0 1 1 1 0 0 1 1 1 ...
##  $ e.rfs       : int  1 1 1 1 1 0 1 1 1 1 ...
##  $ e.tdm       : int  1 0 1 0 1 0 0 0 1 1 ...
##  $ er          : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ filename    : Factor w/ 198 levels "1-4105_GUYA.CEL.gz",..: 2 140 169 191 116 68 59 135 160 99 ...
##  $ grade       : int  3 3 3 3 2 2 3 1 2 3 ...
##  $ hospital    : Factor w/ 5 levels "GUY","IGR","JRH",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ id          : Factor w/ 197 levels "1000","1006",..: 142 143 144 145 146 147 148 149 151 152 ...
##  $ node        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ risk_AOL    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ risknpi     : int  2 2 2 2 2 2 2 1 2 2 ...
##  $ risksg      : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ samplename  : Factor w/ 198 levels "VDXGUYU_4002",..: 1 2 3 4 5 6 7 8 10 11 ...
##  $ size        : num  3 3 2.5 1.8 3 2 2 2.5 2.5 2 ...
##  $ t.dmfs      : int  723 6591 524 6255 3822 6507 5947 5816 1233 1136 ...
##  $ t.os        : int  937 6591 922 6255 4133 6507 5947 5816 1484 1911 ...
##  $ t.rfs       : int  723 183 524 2192 3822 6507 709 5816 422 1136 ...
##  $ t.tdm       : int  723 6591 524 6255 3822 6507 5947 5816 1233 1136 ...
##  $ veridex_risk: Factor w/ 2 levels "Good","Poor": 2 2 2 2 2 1 2 1 2 2 ...
summary(clindat)
##       Patient     AOL_os_10y      Angioinv       Histtype      Lymp_infil 
##  GSM177885: 1   Min.   :62.1   Min.   :0.00   Min.   :1.00   Min.   :1.0  
##  GSM177886: 1   1st Qu.:74.6   1st Qu.:1.00   1st Qu.:1.00   1st Qu.:1.0  
##  GSM177887: 1   Median :81.3   Median :1.00   Median :1.00   Median :2.0  
##  GSM177888: 1   Mean   :80.2   Mean   :1.24   Mean   :1.48   Mean   :1.9  
##  GSM177889: 1   3rd Qu.:87.2   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:2.0  
##  GSM177890: 1   Max.   :94.3   Max.   :3.00   Max.   :9.00   Max.   :3.0  
##  (Other)  :74                                                             
##       NPI        Surgery_type        age           e.dmfs     
##  Min.   :2.16   Min.   :0.000   Min.   :30.0   Min.   :0.000  
##  1st Qu.:3.30   1st Qu.:0.000   1st Qu.:42.0   1st Qu.:0.000  
##  Median :3.51   Median :0.000   Median :46.0   Median :0.000  
##  Mean   :3.67   Mean   :0.263   Mean   :46.2   Mean   :0.338  
##  3rd Qu.:4.40   3rd Qu.:1.000   3rd Qu.:50.2   3rd Qu.:1.000  
##  Max.   :4.90   Max.   :1.000   Max.   :59.0   Max.   :1.000  
##                                                               
##       e.os           e.rfs           e.tdm             er       
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :0.000   Median :1.000  
##  Mean   :0.338   Mean   :0.487   Mean   :0.287   Mean   :0.688  
##  3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :1.000  
##                                                                 
##                 filename      grade      hospital       id          node  
##  1-4105_GUYA.CEL.gz : 1   Min.   :1.00   GUY:34   166154 : 1   Min.   :0  
##  10-4002_GUYA.CEL.gz: 1   1st Qu.:2.00   IGR:45   171150 : 1   1st Qu.:0  
##  11-4076_GUYA.CEL.gz: 1   Median :2.00   JRH: 0   171260 : 1   Median :0  
##  12-4073_GUYA.CEL.gz: 1   Mean   :2.21   KAR: 0   171558 : 1   Mean   :0  
##  14-4088_GUYA.CEL.gz: 1   3rd Qu.:3.00   RH : 1   171620 : 1   3rd Qu.:0  
##  15-4074_GUYA.CEL.gz: 1   Max.   :3.00            172086 : 1   Max.   :0  
##  (Other)            :74                           (Other):74              
##     risk_AOL        risknpi         risksg            samplename
##  Min.   :0.000   Min.   :1.00   Min.   :1.00   VDXGUYU_4002: 1  
##  1st Qu.:1.000   1st Qu.:1.00   1st Qu.:2.00   VDXGUYU_4008: 1  
##  Median :1.000   Median :2.00   Median :2.00   VDXGUYU_4011: 1  
##  Mean   :0.812   Mean   :1.73   Mean   :1.91   VDXGUYU_4014: 1  
##  3rd Qu.:1.000   3rd Qu.:2.00   3rd Qu.:2.00   VDXGUYU_4022: 1  
##  Max.   :1.000   Max.   :2.00   Max.   :2.00   VDXGUYU_4033: 1  
##                                                (Other)     :74  
##       size          t.dmfs          t.os          t.rfs     
##  Min.   :0.60   Min.   : 421   Min.   : 667   Min.   : 121  
##  1st Qu.:1.90   1st Qu.:1679   1st Qu.:2123   1st Qu.:1007  
##  Median :2.20   Median :4146   Median :4422   Median :3064  
##  Mean   :2.31   Mean   :3705   Mean   :3924   Mean   :3039  
##  3rd Qu.:2.62   3rd Qu.:5454   3rd Qu.:5542   3rd Qu.:4969  
##  Max.   :4.50   Max.   :7057   Max.   :7057   Max.   :6507  
##                                                             
##      t.tdm      veridex_risk
##  Min.   : 421   Good:25     
##  1st Qu.:1679   Poor:55     
##  Median :4146               
##  Mean   :3705               
##  3rd Qu.:5454               
##  Max.   :7057               
## 

How many rows are there in clindat, and what does each row represent?

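198 rows in the raw file; after removing incomplete cases with complete.cases(), 80 rows remain (see the str() output above). Each row describes one patient sample.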

How many columns in clindat?

28 variables

The column age denotes the age of the patients. Which patient is the oldest in the experiment?

60

How old is he/she? Which one is the youngest? How old is he/she? 24

How many patients are older than 55 years?

11

### Which patients are older than 59 years? Please show the sample name, which is given in the column samplename. Hint: function which()

# Sample names of all patients whose age exceeds 59: pick the matching
# positions first, then index the samplename column directly.
clindat$samplename[which(clindat$age > 59)]
## factor(0)
## 198 Levels: VDXGUYU_4002 VDXGUYU_4008 VDXGUYU_4011 ... VDXRHU_535

The column hospital tells us which hospital supplied each sample. So how many samples are from hospital IGR?

# Count the samples supplied by hospital IGR (each TRUE counts as 1;
# missing comparisons are ignored, as which() would do).
sum(clindat$hospital == "IGR", na.rm = TRUE)
## [1] 45

Each sample is graded before the MA experiment, as shown in the column grade. So how many different grades are there among our patients? Hint: function unique()

# Distinct grade values, in order of first appearance.
unique(clindat[["grade"]])
## [1] 3 2 1

The column e.tdm means “to distant metastasis”. This column contains the binary values 0 and 1, denoting non-metastasis and metastasis samples, respectively.

How many samples are metastases?

# Number of samples flagged as metastasis (e.tdm equal to 1).
sum(clindat$e.tdm == 1, na.rm = TRUE)
## [1] 23

How many samples are non-metastases?

# Number of samples flagged as non-metastasis (e.tdm equal to 0).
sum(clindat$e.tdm == 0, na.rm = TRUE)
## [1] 57

Use scatter plots to explore the relation between quantitative variables.

# Scatter plot of age (y axis) against size (x axis).
plot(age ~ size, data = clindat)

plot of chunk unnamed-chunk-7

# Scatter-plot matrix of the quantitative columns. The columns are selected by
# name instead of by position (2, 6, 8, 23:27) so the call keeps working if the
# column order of the input table changes.
quant_cols <- c("AOL_os_10y", "NPI", "age", "size",
    "t.dmfs", "t.os", "t.rfs", "t.tdm")
pairs(clindat[, quant_cols])

plot of chunk unnamed-chunk-7

# Spearman rank correlation between the four quantitative variables.
clindat.mx <- cor(clindat[, c("age", "AOL_os_10y", "NPI", "size")], method = "spearman")
# Shrink the outer margins so the heatmap labels fit inside the device.
par(omd = c(0.1, 0.9, 0.1, 0.9))

# A correlation matrix is symmetric: symm = TRUE orders rows and columns the
# same way and turns off the default row scaling, which would otherwise
# re-centre every row and make the colours no longer reflect the correlations.
heatmap(clindat.mx, symm = TRUE)

plot of chunk unnamed-chunk-8

Use boxplots to explore the relation between quantitative variables and categorical or ordinal variables.

# Gather the grouping factor (hospital) and the variables to compare.
df2 <- with(clindat, data.frame(Surgery_type = Surgery_type, hospital = hospital,
    age = age, NPI = NPI, AOL_os_10y = AOL_os_10y, grade = grade))
# Long format: one row per (hospital, variable, value) triple.
df2.melt <- melt(df2, id.vars = "hospital")
# One box per variable and hospital on a shared log10 y axis.
# Note: log10(0) is -Inf, so rows whose value is 0 (e.g. Surgery_type == 0)
# are dropped by the log scale with a warning.
hospital_boxes <- ggplot(df2.melt, aes(x = variable, y = value, fill = hospital))
hospital_boxes + geom_boxplot() + scale_y_log10()
## Warning: Removed 59 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-9




# Distribution of patient age within each hospital.
boxplot(age ~ hospital, data = df2)

plot of chunk unnamed-chunk-9

# Melt the whole table; melt() picks the id variables itself and reports them.
meltData <- melt(clindat)
## Using Patient, filename, hospital, id, samplename, veridex_risk as id variables
# One box per melted variable, all drawn on a common y axis.
boxplot(value ~ variable, data = meltData)

plot of chunk unnamed-chunk-9

# One panel per melted variable, each with its own axis ranges so that
# variables on very different scales stay readable.
p <- ggplot(meltData, aes(factor(variable), value))
# The argument is spelled out as `scales` rather than relying on partial
# matching of `scale`.
p + geom_boxplot() + facet_wrap(~variable, scales = "free")

plot of chunk unnamed-chunk-9