## Converting data

## Attach the survival package and load its pbc dataset
library("survival")
## Loading required package: splines
data("pbc")
## Only the first 6 rows are kept, for demonstration purposes.
pbcHead <- head(pbc, n = 6L)
## Inspect the dimensions (rows, columns)
dim(pbcHead)
## [1]  6 20

## The class of an object determines how the object is displayed.
class(pbcHead)
## [1] "data.frame"
## Typing the object name alone triggers an implicit print(); the table
## below is the same one an explicit print(pbcHead) produces.
pbcHead
##   id time status trt   age sex ascites hepato spiders edema bili chol
## 1  1  400      2   1 58.77   f       1      1       1   1.0 14.5  261
## 2  2 4500      0   1 56.45   f       0      1       1   0.0  1.1  302
## 3  3 1012      2   1 70.07   m       0      0       0   0.5  1.4  176
## 4  4 1925      2   1 54.74   f       0      1       1   0.5  1.8  244
## 5  5 1504      1   2 38.11   f       0      1       1   0.0  3.4  279
## 6  6 2503      2   2 66.26   f       0      1       0   0.0  0.8  248
##   albumin copper alk.phos    ast trig platelet protime stage
## 1    2.60    156     1718 137.95  172      190    12.2     4
## 2    4.14     54     7395 113.52   88      221    10.6     3
## 3    3.48    210      516  96.10   55      151    12.0     4
## 4    2.54     64     6122  60.63   92      183    10.3     4
## 5    3.53    143      671 113.15   72      136    10.9     3
## 6    3.98     50      944  93.00   63       NA    11.0     3
## print.default() ignores the "data.frame" class and shows the raw data:
## a list with one vector per variable, followed by the class attribute.
print.default(pbcHead)
## $id
## [1] 1 2 3 4 5 6
## 
## $time
## [1]  400 4500 1012 1925 1504 2503
## 
## $status
## [1] 2 0 2 2 1 2
## 
## $trt
## [1] 1 1 1 1 2 2
## 
## $age
## [1] 58.77 56.45 70.07 54.74 38.11 66.26
## 
## $sex
## [1] f f m f f f
## Levels: m f
## 
## $ascites
## [1] 1 0 0 0 0 0
## 
## $hepato
## [1] 1 1 0 1 1 1
## 
## $spiders
## [1] 1 1 0 1 1 0
## 
## $edema
## [1] 1.0 0.0 0.5 0.5 0.0 0.0
## 
## $bili
## [1] 14.5  1.1  1.4  1.8  3.4  0.8
## 
## $chol
## [1] 261 302 176 244 279 248
## 
## $albumin
## [1] 2.60 4.14 3.48 2.54 3.53 3.98
## 
## $copper
## [1] 156  54 210  64 143  50
## 
## $alk.phos
## [1] 1718 7395  516 6122  671  944
## 
## $ast
## [1] 137.95 113.52  96.10  60.63 113.15  93.00
## 
## $trig
## [1] 172  88  55  92  72  63
## 
## $platelet
## [1] 190 221 151 183 136  NA
## 
## $protime
## [1] 12.2 10.6 12.0 10.3 10.9 11.0
## 
## $stage
## [1] 4 3 4 4 3 3
## 
## attr(,"class")
## [1] "data.frame"
## print() respects the class: for a "data.frame" object it actually calls
## print.data.frame(), which produces the tabular layout.
print(pbcHead)
##   id time status trt   age sex ascites hepato spiders edema bili chol
## 1  1  400      2   1 58.77   f       1      1       1   1.0 14.5  261
## 2  2 4500      0   1 56.45   f       0      1       1   0.0  1.1  302
## 3  3 1012      2   1 70.07   m       0      0       0   0.5  1.4  176
## 4  4 1925      2   1 54.74   f       0      1       1   0.5  1.8  244
## 5  5 1504      1   2 38.11   f       0      1       1   0.0  3.4  279
## 6  6 2503      2   2 66.26   f       0      1       0   0.0  0.8  248
##   albumin copper alk.phos    ast trig platelet protime stage
## 1    2.60    156     1718 137.95  172      190    12.2     4
## 2    4.14     54     7395 113.52   88      221    10.6     3
## 3    3.48    210      516  96.10   55      151    12.0     4
## 4    2.54     64     6122  60.63   92      183    10.3     4
## 5    3.53    143      671 113.15   72      136    10.9     3
## 6    3.98     50      944  93.00   63       NA    11.0     3
## print.data.frame() is the function that actually does the job, so
## calling it directly gives the same output as print().
print.data.frame(pbcHead)
##   id time status trt   age sex ascites hepato spiders edema bili chol
## 1  1  400      2   1 58.77   f       1      1       1   1.0 14.5  261
## 2  2 4500      0   1 56.45   f       0      1       1   0.0  1.1  302
## 3  3 1012      2   1 70.07   m       0      0       0   0.5  1.4  176
## 4  4 1925      2   1 54.74   f       0      1       1   0.5  1.8  244
## 5  5 1504      1   2 38.11   f       0      1       1   0.0  3.4  279
## 6  6 2503      2   2 66.26   f       0      1       0   0.0  0.8  248
##   albumin copper alk.phos    ast trig platelet protime stage
## 1    2.60    156     1718 137.95  172      190    12.2     4
## 2    4.14     54     7395 113.52   88      221    10.6     3
## 3    3.48    210      516  96.10   55      151    12.0     4
## 4    2.54     64     6122  60.63   92      183    10.3     4
## 5    3.53    143      671 113.15   72      136    10.9     3
## 6    3.98     50      944  93.00   63       NA    11.0     3

## As the print.default() output shows, each element of the data frame is
## a vector
pbcHead[["time"]]
## [1]  400 4500 1012 1925 1504 2503
## Every such vector has a class of its own
class(pbcHead[["time"]])
## [1] "integer"
## sex is displayed differently
pbcHead[["sex"]]
## [1] f f m f f f
## Levels: m f
## because it is a factor, i.e., a categorical variable
class(pbcHead[["sex"]])
## [1] "factor"
## print.default() shows the 'raw' data, which is stored as numbers
print.default(pbcHead[["sex"]])
## [1] 2 2 1 2 2 2
## attributes() reveals the levels assigned to these numbers
attributes(pbcHead[["sex"]])
## $levels
## [1] "m" "f"
## 
## $class
## [1] "factor"
## levels() retrieves them more directly
levels(pbcHead[["sex"]])
## [1] "m" "f"

## Factors are summarized as tables of counts
summary(pbcHead$sex)
## m f 
## 1 5
## Numbers get a numerical summary, although this status variable is
## really categorical
summary(pbcHead$status)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.25    2.00    1.50    2.00    2.00
## You can force a categorical summary by converting the variable to a
## factor on the fly.  Going through factor() and the summary() generic is
## safer than calling the summary.factor() method directly on a vector
## that is not actually a factor.
summary(factor(pbcHead$status))
## 0 1 2 
## 1 1 4
## List the variable names
colnames(pbcHead)
##  [1] "id"       "time"     "status"   "trt"      "age"      "sex"     
##  [7] "ascites"  "hepato"   "spiders"  "edema"    "bili"     "chol"    
## [13] "albumin"  "copper"   "alk.phos" "ast"      "trig"     "platelet"
## [19] "protime"  "stage"
## Checking the class of one variable at a time works, but it is tedious.
class(pbcHead[["status"]])
## [1] "integer"
## Recall that the dataset is really a list.
print.default(pbcHead)
## $id
## [1] 1 2 3 4 5 6
## 
## $time
## [1]  400 4500 1012 1925 1504 2503
## 
## $status
## [1] 2 0 2 2 1 2
## 
## $trt
## [1] 1 1 1 1 2 2
## 
## $age
## [1] 58.77 56.45 70.07 54.74 38.11 66.26
## 
## $sex
## [1] f f m f f f
## Levels: m f
## 
## $ascites
## [1] 1 0 0 0 0 0
## 
## $hepato
## [1] 1 1 0 1 1 1
## 
## $spiders
## [1] 1 1 0 1 1 0
## 
## $edema
## [1] 1.0 0.0 0.5 0.5 0.0 0.0
## 
## $bili
## [1] 14.5  1.1  1.4  1.8  3.4  0.8
## 
## $chol
## [1] 261 302 176 244 279 248
## 
## $albumin
## [1] 2.60 4.14 3.48 2.54 3.53 3.98
## 
## $copper
## [1] 156  54 210  64 143  50
## 
## $alk.phos
## [1] 1718 7395  516 6122  671  944
## 
## $ast
## [1] 137.95 113.52  96.10  60.63 113.15  93.00
## 
## $trig
## [1] 172  88  55  92  72  63
## 
## $platelet
## [1] 190 221 151 183 136  NA
## 
## $protime
## [1] 12.2 10.6 12.0 10.3 10.9 11.0
## 
## $stage
## [1] 4 3 4 4 3 3
## 
## attr(,"class")
## [1] "data.frame"
## lapply() ("List APPLY") calls class() on each variable in turn and
## returns the results as a list.
lapply(pbcHead, class)
## $id
## [1] "integer"
## 
## $time
## [1] "integer"
## 
## $status
## [1] "integer"
## 
## $trt
## [1] "integer"
## 
## $age
## [1] "numeric"
## 
## $sex
## [1] "factor"
## 
## $ascites
## [1] "integer"
## 
## $hepato
## [1] "integer"
## 
## $spiders
## [1] "integer"
## 
## $edema
## [1] "numeric"
## 
## $bili
## [1] "numeric"
## 
## $chol
## [1] "integer"
## 
## $albumin
## [1] "numeric"
## 
## $copper
## [1] "integer"
## 
## $alk.phos
## [1] "numeric"
## 
## $ast
## [1] "numeric"
## 
## $trig
## [1] "integer"
## 
## $platelet
## [1] "integer"
## 
## $protime
## [1] "numeric"
## 
## $stage
## [1] "integer"
## These 7 variables are categorical variables coded as numbers; let's
## convert them.
varsReallyCategorical <- c(
  "status", "trt", "ascites", "hepato", "spiders", "edema", "stage"
)
## The vector just created holds the names of these variables
varsReallyCategorical
## [1] "status"  "trt"     "ascites" "hepato"  "spiders" "edema"   "stage"
## Part of the dataset can be accessed with [row, column] indexing. An
## empty index means "all".
pbcHead[, varsReallyCategorical]
##   status trt ascites hepato spiders edema stage
## 1      2   1       1      1       1   1.0     4
## 2      0   1       0      1       1   0.0     3
## 3      2   1       0      0       0   0.5     4
## 4      2   1       0      1       1   0.5     4
## 5      1   2       0      1       1   0.0     3
## 6      2   2       0      1       0   0.0     3
## Recheck the classes; they are all numbers (integer/numeric).
## drop = FALSE keeps the selection a data frame even when only one
## variable is named, so lapply() always iterates over columns.
lapply(pbcHead[, varsReallyCategorical, drop = FALSE], class)
## $status
## [1] "integer"
## 
## $trt
## [1] "integer"
## 
## $ascites
## [1] "integer"
## 
## $hepato
## [1] "integer"
## 
## $spiders
## [1] "integer"
## 
## $edema
## [1] "numeric"
## 
## $stage
## [1] "integer"
## You can convert one variable at a time, which is also tedious
pbcHead$status <- factor(pbcHead$status)
## Now it's a factor
class(pbcHead$status)
## [1] "factor"
## lapply() can convert all of them at once.  drop = FALSE keeps the
## selection a data frame even if only one variable is named; without it
## a single column would collapse to a plain vector and lapply() would
## iterate over its elements instead of over columns.
pbcHead[, varsReallyCategorical] <- lapply(
  pbcHead[, varsReallyCategorical, drop = FALSE],
  factor
)
## Now check the class of all variables
lapply(pbcHead, class)
## $id
## [1] "integer"
## 
## $time
## [1] "integer"
## 
## $status
## [1] "factor"
## 
## $trt
## [1] "factor"
## 
## $age
## [1] "numeric"
## 
## $sex
## [1] "factor"
## 
## $ascites
## [1] "factor"
## 
## $hepato
## [1] "factor"
## 
## $spiders
## [1] "factor"
## 
## $edema
## [1] "factor"
## 
## $bili
## [1] "numeric"
## 
## $chol
## [1] "integer"
## 
## $albumin
## [1] "numeric"
## 
## $copper
## [1] "integer"
## 
## $alk.phos
## [1] "numeric"
## 
## $ast
## [1] "numeric"
## 
## $trig
## [1] "integer"
## 
## $platelet
## [1] "integer"
## 
## $protime
## [1] "numeric"
## 
## $stage
## [1] "factor"