Reading data in and basic data manipulation in R

Corresponding slides

Reading various data files into R

CSV files

blood.csv <- read.csv("BLOOD.DAT.txt", quote = "'")
head(blood.csv)
      ID matchid case curpmh ageblood estradol estrone testost prolactn
1 100013  164594    0      1       46       57      65      25    11.12
2 100241  107261    0      0       65       11      26     999     2.80
3 100696  110294    0      1       66        3     999       8    38.00
4 101266  101266    1      0       57        4      18       6     8.90
5 101600  101600    1      0       66        6      18      25     6.90
6 102228  155717    0      1       57       10     999      31    13.94

Excel file using xlsx package

library(xlsx)
nhefs.xls <- read.xlsx("nhefs_book.xls", sheetIndex = 1)
head(nhefs.xls)
  seqn qsmk death yrdth sbp dbp sex age race income marital school       ht  wt71      wt82    wt82_71 birthplace
1  233    0     0    NA 175  96   0  42    1     19       2      7 174.1875 79.04  68.94604 -10.093960         47
2  235    0     0    NA 123  80   0  36    0     18       2      9 159.3750 58.63  61.23497   2.604970         42
3  244    0     0    NA 115  75   1  56    1     15       3     11 168.5000 56.81  66.22449   9.414486         51
4  245    0     1    85 148  78   0  68    1     15       3      5 170.1875 59.42  64.41012   4.990117         37
5  252    0     0    NA 118  77   0  40    0     18       2     11 181.8750 87.09  92.07925   4.989251         42
6  257    0     0    NA 141  83   1  43    1     11       4      9 162.1875 99.00 103.41906   4.419060         34
  smokeintensity smkintensity82_71 smokeyrs asthma bronch tb hf hbp pepticulcer colitis hepatitis chroniccough
1             30               -10       29      0      0  0  0   1           1       0         0            0
2             20               -10       24      0      0  0  0   0           0       0         0            0
3             20               -14       26      0      0  0  0   0           0       0         0            0
4              3                 4       53      0      0  0  0   1           0       0         0            0
5             20                 0       19      0      0  0  0   0           0       0         0            0
6             10                10       21      0      0  0  0   0           0       0         0            0
  hayfever diabetes polio tumor nervousbreak alcoholpy alcoholfreq alcoholtype alcoholhowmuch pica headache
1        0        1     0     0            0         1           1           3              7    0        1
2        0        0     0     0            0         1           0           1              4    0        1
3        1        0     0     1            0         1           3           4             NA    0        1
4        0        0     0     0            0         1           2           3              4    0        0
5        0        0     0     0            0         1           2           1              2    0        1
6        0        0     0     0            0         1           3           2              1    0        1
  otherpain weakheart allergies nerves lackpep hbpmed boweltrouble wtloss infection active exercise birthcontrol
1         0         0         0      0       0      1            0      0         0      0        2            2
2         0         0         0      0       0      0            0      0         1      0        0            2
3         1         0         0      1       0      0            0      0         0      0        2            0
4         1         1         0      0       0      0            0      0         0      1        2            2
5         0         0         0      0       0      0            1      0         0      1        1            2
6         0         0         0      0       0      0            0      0         0      1        1            0
  pregnancies cholesterol hightax82  price71  price82     tax71     tax82 price71_82  tax71_82
1          NA         197         0 2.183594 1.739990 1.1022949 0.4619751 0.44378662 0.6403809
2          NA         301         0 2.346680 1.797363 1.3649902 0.5718994 0.54931641 0.7929688
3           2         157         0 1.569580 1.513428 0.5512695 0.2309875 0.05619812 0.3202515
4          NA         174         0 1.506592 1.451904 0.5249023 0.2199707 0.05479431 0.3049927
5          NA         216         0 2.346680 1.797363 1.3649902 0.5718994 0.54931641 0.7929688
6           1         212         1 2.209961 2.025879 1.1547852 0.7479248 0.18408203 0.4069824

SAS native file using sas7bdat package

library(sas7bdat)
nhefs.sas <- read.sas7bdat("nhefs_book.sas7bdat")
head(nhefs.sas)
  seqn qsmk death yrdth sbp dbp sex age race income marital school       ht  wt71      wt82    wt82_71 birthplace
1  233    0     0   NaN 175  96   0  42    1     19       2      7 174.1875 79.04  68.94604 -10.093960         47
2  235    0     0   NaN 123  80   0  36    0     18       2      9 159.3750 58.63  61.23497   2.604970         42
3  244    0     0   NaN 115  75   1  56    1     15       3     11 168.5000 56.81  66.22449   9.414486         51
4  245    0     1    85 148  78   0  68    1     15       3      5 170.1875 59.42  64.41012   4.990117         37
5  252    0     0   NaN 118  77   0  40    0     18       2     11 181.8750 87.09  92.07925   4.989251         42
6  257    0     0   NaN 141  83   1  43    1     11       4      9 162.1875 99.00 103.41906   4.419060         34
  smokeintensity smkintensity82_71 smokeyrs asthma bronch tb hf hbp pepticulcer colitis hepatitis chroniccough
1             30               -10       29      0      0  0  0   1           1       0         0            0
2             20               -10       24      0      0  0  0   0           0       0         0            0
3             20               -14       26      0      0  0  0   0           0       0         0            0
4              3                 4       53      0      0  0  0   1           0       0         0            0
5             20                 0       19      0      0  0  0   0           0       0         0            0
6             10                10       21      0      0  0  0   0           0       0         0            0
  hayfever diabetes polio tumor nervousbreak alcoholpy alcoholfreq alcoholtype alcoholhowmuch pica headache
1        0        1     0     0            0         1           1           3              7    0        1
2        0        0     0     0            0         1           0           1              4    0        1
3        1        0     0     1            0         1           3           4            NaN    0        1
4        0        0     0     0            0         1           2           3              4    0        0
5        0        0     0     0            0         1           2           1              2    0        1
6        0        0     0     0            0         1           3           2              1    0        1
  otherpain weakheart allergies nerves lackpep hbpmed boweltrouble wtloss infection active exercise birthcontrol
1         0         0         0      0       0      1            0      0         0      0        2            2
2         0         0         0      0       0      0            0      0         1      0        0            2
3         1         0         0      1       0      0            0      0         0      0        2            0
4         1         1         0      0       0      0            0      0         0      1        2            2
5         0         0         0      0       0      0            1      0         0      1        1            2
6         0         0         0      0       0      0            0      0         0      1        1            0
  pregnancies cholesterol hightax82  price71  price82     tax71     tax82 price71_82  tax71_82
1         NaN         197         0 2.183594 1.739990 1.1022949 0.4619751 0.44378662 0.6403809
2         NaN         301         0 2.346680 1.797363 1.3649902 0.5718994 0.54931641 0.7929688
3           2         157         0 1.569580 1.513428 0.5512695 0.2309875 0.05619812 0.3202515
4         NaN         174         0 1.506592 1.451904 0.5249023 0.2199707 0.05479431 0.3049927
5         NaN         216         0 2.346680 1.797363 1.3649902 0.5718994 0.54931641 0.7929688
6           1         212         1 2.209961 2.025879 1.1547852 0.7479248 0.18408203 0.4069824

Stata file using foreign package

library(foreign)
blood.dta <- read.dta("BLOOD.DAT.dta")
head(blood.dta)
      id matchid case curpmh ageblood estradol estrone testost prolactn
1 100013  164594    0      1       46       57      65      25    11.12
2 100241  107261    0      0       65       11      26     999     2.80
3 100696  110294    0      1       66        3     999       8    38.00
4 101266  101266    1      0       57        4      18       6     8.90
5 101600  101600    1      0       66        6      18      25     6.90
6 102228  155717    0      1       57       10     999      31    13.94

Vectors: objects that can contain only one class of data values

## A numeric vector created by combining 4 numbers
vec1 <- c(2013, 2, 15, -10)
vec1
[1] 2013    2   15  -10

## integers 1 to 16
vec2 <- 1:16
vec2
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16

Arrays and Matrices: Vectors with dimensions

## Create a vector (single dimension)
vec3 <- 1:16
vec3
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16

## Give a 4 x 4 two-dimensional structure
dim(vec3) <- c(4, 4)
vec3
     [,1] [,2] [,3] [,4]
[1,]    1    5    9   13
[2,]    2    6   10   14
[3,]    3    7   11   15
[4,]    4    8   12   16

## Give a 2 x 2 x 4 three-dimensional structure
dim(vec3) <- c(2, 2, 4)
vec3
, , 1

     [,1] [,2]
[1,]    1    3
[2,]    2    4

, , 2

     [,1] [,2]
[1,]    5    7
[2,]    6    8

, , 3

     [,1] [,2]
[1,]    9   11
[2,]   10   12

, , 4

     [,1] [,2]
[1,]   13   15
[2,]   14   16

## Directly create an array
arr1 <- array(1:60, dim = c(3,4,5))
arr1
, , 1

     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12

, , 2

     [,1] [,2] [,3] [,4]
[1,]   13   16   19   22
[2,]   14   17   20   23
[3,]   15   18   21   24

, , 3

     [,1] [,2] [,3] [,4]
[1,]   25   28   31   34
[2,]   26   29   32   35
[3,]   27   30   33   36

, , 4

     [,1] [,2] [,3] [,4]
[1,]   37   40   43   46
[2,]   38   41   44   47
[3,]   39   42   45   48

, , 5

     [,1] [,2] [,3] [,4]
[1,]   49   52   55   58
[2,]   50   53   56   59
[3,]   51   54   57   60

Lists: objects that can contain multiple classes of data values

## List of a vector and a matrix
list1 <- list(first = 1:17, second = matrix(letters, 13,2))
list1
$first
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17

$second
      [,1] [,2]
 [1,] "a"  "n" 
 [2,] "b"  "o" 
 [3,] "c"  "p" 
 [4,] "d"  "q" 
 [5,] "e"  "r" 
 [6,] "f"  "s" 
 [7,] "g"  "t" 
 [8,] "h"  "u" 
 [9,] "i"  "v" 
[10,] "j"  "w" 
[11,] "k"  "x" 
[12,] "l"  "y" 
[13,] "m"  "z" 

## List of two vectors of the same length
list2 <- list(alpha = c(1,4,5,7), beta = c("h","s","p","h"))
list2
$alpha
[1] 1 4 5 7

$beta
[1] "h" "s" "p" "h"

Data frames: lists of same-length vectors vertically aligned

## Convert a list to a data frame
df1 <- data.frame(list2)
df1
  alpha beta
1     1    h
2     4    s
3     5    p
4     7    h

## Create a list with vectors of different classes
list3 <- list(small = letters, large = LETTERS, number = 1:26)
list3
$small
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"

$large
 [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y" "Z"

$number
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26

## Convert to a data frame
df2 <- data.frame(list3)
df2
   small large number
1      a     A      1
2      b     B      2
3      c     C      3
4      d     D      4
5      e     E      5
6      f     F      6
7      g     G      7
8      h     H      8
9      i     I      9
10     j     J     10
11     k     K     11
12     l     L     12
13     m     M     13
14     n     N     14
15     o     O     15
16     p     P     16
17     q     Q     17
18     r     R     18
19     s     S     19
20     t     T     20
21     u     U     21
22     v     V     22
23     w     W     23
24     x     X     24
25     y     Y     25
26     z     Z     26

## Directly create a data frame
df3 <- data.frame(small = letters, large = LETTERS, number = 1:26)
df3
   small large number
1      a     A      1
2      b     B      2
3      c     C      3
4      d     D      4
5      e     E      5
6      f     F      6
7      g     G      7
8      h     H      8
9      i     I      9
10     j     J     10
11     k     K     11
12     l     L     12
13     m     M     13
14     n     N     14
15     o     O     15
16     p     P     16
17     q     Q     17
18     r     R     18
19     s     S     19
20     t     T     20
21     u     U     21
22     v     V     22
23     w     W     23
24     x     X     24
25     y     Y     25
26     z     Z     26

Indexing by numbers

letters
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"
letters[3] # 1-dimensional object
[1] "c"

arr1
, , 1

     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12

, , 2

     [,1] [,2] [,3] [,4]
[1,]   13   16   19   22
[2,]   14   17   20   23
[3,]   15   18   21   24

, , 3

     [,1] [,2] [,3] [,4]
[1,]   25   28   31   34
[2,]   26   29   32   35
[3,]   27   30   33   36

, , 4

     [,1] [,2] [,3] [,4]
[1,]   37   40   43   46
[2,]   38   41   44   47
[3,]   39   42   45   48

, , 5

     [,1] [,2] [,3] [,4]
[1,]   49   52   55   58
[2,]   50   53   56   59
[3,]   51   54   57   60
arr1[1,2,3] # 3-dimensional object
[1] 28
arr1[1, ,3] # implies 1,(all),3
[1] 25 28 31 34

df1
  alpha beta
1     1    h
2     4    s
3     5    p
4     7    h
df1[2, ] # implies 2,(all)
  alpha beta
2     4    s

list1
$first
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17

$second
      [,1] [,2]
 [1,] "a"  "n" 
 [2,] "b"  "o" 
 [3,] "c"  "p" 
 [4,] "d"  "q" 
 [5,] "e"  "r" 
 [6,] "f"  "s" 
 [7,] "g"  "t" 
 [8,] "h"  "u" 
 [9,] "i"  "v" 
[10,] "j"  "w" 
[11,] "k"  "x" 
[12,] "l"  "y" 
[13,] "m"  "z" 
list1[[1]] # list needs [[ ]]
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17

Naming elements

list3
$small
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"

$large
 [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y" "Z"

$number
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
list3$small
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"
list3[["small"]]
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"

df1
  alpha beta
1     1    h
2     4    s
3     5    p
4     7    h
df1$alpha
[1] 1 4 5 7
df1[, "beta"]
[1] h s p h
Levels: h p s