## Converting data
## Attach the survival package and load its pbc dataset
library(survival)
## Loading required package: splines
data(pbc)
## Keep only the first 6 rows (head()'s default count, spelled out here)
## for demonstration.
pbcHead <- head(pbc, n = 6L)
## Check dimension
dim(pbcHead)
## [1] 6 20
## The class determines how the object is displayed.
class(pbcHead)
## [1] "data.frame"
## Implicit print()
pbcHead
## id time status trt age sex ascites hepato spiders edema bili chol
## 1 1 400 2 1 58.77 f 1 1 1 1.0 14.5 261
## 2 2 4500 0 1 56.45 f 0 1 1 0.0 1.1 302
## 3 3 1012 2 1 70.07 m 0 0 0 0.5 1.4 176
## 4 4 1925 2 1 54.74 f 0 1 1 0.5 1.8 244
## 5 5 1504 1 2 38.11 f 0 1 1 0.0 3.4 279
## 6 6 2503 2 2 66.26 f 0 1 0 0.0 0.8 248
## albumin copper alk.phos ast trig platelet protime stage
## 1 2.60 156 1718 137.95 172 190 12.2 4
## 2 4.14 54 7395 113.52 88 221 10.6 3
## 3 3.48 210 516 96.10 55 151 12.0 4
## 4 2.54 64 6122 60.63 92 183 10.3 4
## 5 3.53 143 671 113.15 72 136 10.9 3
## 6 3.98 50 944 93.00 63 NA 11.0 3
## print.default ignores the class, and shows the raw data (a list of
## variables)
print.default(pbcHead)
## $id
## [1] 1 2 3 4 5 6
##
## $time
## [1] 400 4500 1012 1925 1504 2503
##
## $status
## [1] 2 0 2 2 1 2
##
## $trt
## [1] 1 1 1 1 2 2
##
## $age
## [1] 58.77 56.45 70.07 54.74 38.11 66.26
##
## $sex
## [1] f f m f f f
## Levels: m f
##
## $ascites
## [1] 1 0 0 0 0 0
##
## $hepato
## [1] 1 1 0 1 1 1
##
## $spiders
## [1] 1 1 0 1 1 0
##
## $edema
## [1] 1.0 0.0 0.5 0.5 0.0 0.0
##
## $bili
## [1] 14.5 1.1 1.4 1.8 3.4 0.8
##
## $chol
## [1] 261 302 176 244 279 248
##
## $albumin
## [1] 2.60 4.14 3.48 2.54 3.53 3.98
##
## $copper
## [1] 156 54 210 64 143 50
##
## $alk.phos
## [1] 1718 7395 516 6122 671 944
##
## $ast
## [1] 137.95 113.52 96.10 60.63 113.15 93.00
##
## $trig
## [1] 172 88 55 92 72 63
##
## $platelet
## [1] 190 221 151 183 136 NA
##
## $protime
## [1] 12.2 10.6 12.0 10.3 10.9 11.0
##
## $stage
## [1] 4 3 4 4 3 3
##
## attr(,"class")
## [1] "data.frame"
## print() actually calls print.data.frame respecting the class
print(pbcHead)
## id time status trt age sex ascites hepato spiders edema bili chol
## 1 1 400 2 1 58.77 f 1 1 1 1.0 14.5 261
## 2 2 4500 0 1 56.45 f 0 1 1 0.0 1.1 302
## 3 3 1012 2 1 70.07 m 0 0 0 0.5 1.4 176
## 4 4 1925 2 1 54.74 f 0 1 1 0.5 1.8 244
## 5 5 1504 1 2 38.11 f 0 1 1 0.0 3.4 279
## 6 6 2503 2 2 66.26 f 0 1 0 0.0 0.8 248
## albumin copper alk.phos ast trig platelet protime stage
## 1 2.60 156 1718 137.95 172 190 12.2 4
## 2 4.14 54 7395 113.52 88 221 10.6 3
## 3 3.48 210 516 96.10 55 151 12.0 4
## 4 2.54 64 6122 60.63 92 183 10.3 4
## 5 3.53 143 671 113.15 72 136 10.9 3
## 6 3.98 50 944 93.00 63 NA 11.0 3
## print.data.frame() actually does the job, thus same output
print.data.frame(pbcHead)
## id time status trt age sex ascites hepato spiders edema bili chol
## 1 1 400 2 1 58.77 f 1 1 1 1.0 14.5 261
## 2 2 4500 0 1 56.45 f 0 1 1 0.0 1.1 302
## 3 3 1012 2 1 70.07 m 0 0 0 0.5 1.4 176
## 4 4 1925 2 1 54.74 f 0 1 1 0.5 1.8 244
## 5 5 1504 1 2 38.11 f 0 1 1 0.0 3.4 279
## 6 6 2503 2 2 66.26 f 0 1 0 0.0 0.8 248
## albumin copper alk.phos ast trig platelet protime stage
## 1 2.60 156 1718 137.95 172 190 12.2 4
## 2 4.14 54 7395 113.52 88 221 10.6 3
## 3 3.48 210 516 96.10 55 151 12.0 4
## 4 2.54 64 6122 60.63 92 183 10.3 4
## 5 3.53 143 671 113.15 72 136 10.9 3
## 6 3.98 50 944 93.00 63 NA 11.0 3
## As you can see in the print.default() output, each element is a vector
pbcHead$time
## [1] 400 4500 1012 1925 1504 2503
## Each vector also has class
class(pbcHead$time)
## [1] "integer"
## sex appears differently
pbcHead$sex
## [1] f f m f f f
## Levels: m f
## because it is a factor, i.e., categorical variable
class(pbcHead$sex)
## [1] "factor"
## print.default() shows the 'raw' data, which is stored as numbers
print.default(pbcHead$sex)
## [1] 2 2 1 2 2 2
## You can see the levels assigned to these numbers by attributes()
attributes(pbcHead$sex)
## $levels
## [1] "m" "f"
##
## $class
## [1] "factor"
## or by levels() more directly
levels(pbcHead$sex)
## [1] "m" "f"
## factors are summarized in tables
summary(pbcHead$sex)
## m f
## 1 5
## Numbers are summarized in numerical summaries, although this status
## variable is really categorical
summary(pbcHead$status)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.25 2.00 1.50 2.00 2.00
## You can force it to be summarized in a categorical way by converting it
## to a factor first; summary() then dispatches to the factor method and
## returns counts per level. (Calling summary.factor() directly on a
## non-factor bypasses dispatch and is best avoided.)
summary(factor(pbcHead$status))
## 0 1 2
## 1 1 4
## Check the variable names
names(pbcHead)
## [1] "id" "time" "status" "trt" "age" "sex"
## [7] "ascites" "hepato" "spiders" "edema" "bili" "chol"
## [13] "albumin" "copper" "alk.phos" "ast" "trig" "platelet"
## [19] "protime" "stage"
## We can check class one variable at a time, but it's tedious.
class(pbcHead$status)
## [1] "integer"
## Remember the dataset is a list.
print.default(pbcHead)
## $id
## [1] 1 2 3 4 5 6
##
## $time
## [1] 400 4500 1012 1925 1504 2503
##
## $status
## [1] 2 0 2 2 1 2
##
## $trt
## [1] 1 1 1 1 2 2
##
## $age
## [1] 58.77 56.45 70.07 54.74 38.11 66.26
##
## $sex
## [1] f f m f f f
## Levels: m f
##
## $ascites
## [1] 1 0 0 0 0 0
##
## $hepato
## [1] 1 1 0 1 1 1
##
## $spiders
## [1] 1 1 0 1 1 0
##
## $edema
## [1] 1.0 0.0 0.5 0.5 0.0 0.0
##
## $bili
## [1] 14.5 1.1 1.4 1.8 3.4 0.8
##
## $chol
## [1] 261 302 176 244 279 248
##
## $albumin
## [1] 2.60 4.14 3.48 2.54 3.53 3.98
##
## $copper
## [1] 156 54 210 64 143 50
##
## $alk.phos
## [1] 1718 7395 516 6122 671 944
##
## $ast
## [1] 137.95 113.52 96.10 60.63 113.15 93.00
##
## $trig
## [1] 172 88 55 92 72 63
##
## $platelet
## [1] 190 221 151 183 136 NA
##
## $protime
## [1] 12.2 10.6 12.0 10.3 10.9 11.0
##
## $stage
## [1] 4 3 4 4 3 3
##
## attr(,"class")
## [1] "data.frame"
## Use List APPLY, to apply the class() function for each variable
## sequentially.
lapply(X = pbcHead, FUN = class)
## $id
## [1] "integer"
##
## $time
## [1] "integer"
##
## $status
## [1] "integer"
##
## $trt
## [1] "integer"
##
## $age
## [1] "numeric"
##
## $sex
## [1] "factor"
##
## $ascites
## [1] "integer"
##
## $hepato
## [1] "integer"
##
## $spiders
## [1] "integer"
##
## $edema
## [1] "numeric"
##
## $bili
## [1] "numeric"
##
## $chol
## [1] "integer"
##
## $albumin
## [1] "numeric"
##
## $copper
## [1] "integer"
##
## $alk.phos
## [1] "numeric"
##
## $ast
## [1] "numeric"
##
## $trig
## [1] "integer"
##
## $platelet
## [1] "integer"
##
## $protime
## [1] "numeric"
##
## $stage
## [1] "integer"
## Seven variables are stored as numbers but really encode categories.
## Collect their names so they can be converted together.
varsReallyCategorical <- c(
  "status", "trt", "ascites", "hepato", "spiders", "edema", "stage"
)
## The result is a character vector holding the names of these variables
varsReallyCategorical
## [1] "status" "trt" "ascites" "hepato" "spiders" "edema" "stage"
## You can access part of the dataset using [row,column] indexing. An empty
## index implies all.
pbcHead[, varsReallyCategorical]
## status trt ascites hepato spiders edema stage
## 1 2 1 1 1 1 1.0 4
## 2 0 1 0 1 1 0.0 3
## 3 2 1 0 0 0 0.5 4
## 4 2 1 0 1 1 0.5 4
## 5 1 2 0 1 1 0.0 3
## 6 2 2 0 1 0 0.0 3
## Recheck the class, they are all numbers (integers/numeric). A single
## (list-style) index keeps the selection a data frame even when only one
## name is given, so lapply() always iterates over columns; [, cols] would
## collapse a single column to a plain vector.
lapply(pbcHead[varsReallyCategorical], class)
## $status
## [1] "integer"
##
## $trt
## [1] "integer"
##
## $ascites
## [1] "integer"
##
## $hepato
## [1] "integer"
##
## $spiders
## [1] "integer"
##
## $edema
## [1] "numeric"
##
## $stage
## [1] "integer"
## You can convert one at a time, which is also tedious
pbcHead$status <- factor(pbcHead$status)
## Now it's a factor
class(pbcHead$status)
## [1] "factor"
## lapply() can do it at once. Selecting columns with a single (list-style)
## index always yields a data frame, even for a single name, whereas
## [, cols] would collapse one column to a plain vector and lapply() would
## then loop over its elements instead of over columns.
pbcHead[varsReallyCategorical] <- lapply(pbcHead[varsReallyCategorical],
                                         factor)
## Now check the class of all variables
lapply(pbcHead, class)
## $id
## [1] "integer"
##
## $time
## [1] "integer"
##
## $status
## [1] "factor"
##
## $trt
## [1] "factor"
##
## $age
## [1] "numeric"
##
## $sex
## [1] "factor"
##
## $ascites
## [1] "factor"
##
## $hepato
## [1] "factor"
##
## $spiders
## [1] "factor"
##
## $edema
## [1] "factor"
##
## $bili
## [1] "numeric"
##
## $chol
## [1] "integer"
##
## $albumin
## [1] "numeric"
##
## $copper
## [1] "integer"
##
## $alk.phos
## [1] "numeric"
##
## $ast
## [1] "numeric"
##
## $trig
## [1] "integer"
##
## $platelet
## [1] "integer"
##
## $protime
## [1] "numeric"
##
## $stage
## [1] "factor"