Hi all, this section is about data preparation using R code. The examples cover:

  1. Forming a vector
  2. Creating a matrix / data frame
  3. Structure of an Arbitrary R Object
  4. Writing and reading files
  5. NA values
  6. Taking subset and creating new variable(column)
  7. Recoding
  8. Other useful functions

1. Forming a vector

# the colon operator builds the integer sequence 1, 2, ..., 11
x1 <- 1:11
# the same values, spelled out one by one with c()
x2 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
# element-wise comparison: one TRUE/FALSE per position
x1 == x2
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# rep() repeats a value: here the number 1, eleven times
y <- rep(1, times = 11)

2. Creating a matrix / data frame

# create a 4 by 5 matrix, filled column by column with 1 to 20
m <- matrix(1:20, nrow = 4, ncol = 5)

# one vector per column of the data frame
id <- c(1, 2, 3, 4, 5)
manager <- c(1, 2, 3, 4, 5)
country <- c("US", "US", "UK", "UK", "UK")
age <- c(32, 45, 25, 39, 99)
# random draw with replacement; no seed is set, so gender differs between
# runs and any printed output shows just one possible draw
gender <- sample(c("Male", "Female"), 5, replace = TRUE)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
# stringsAsFactors = TRUE stores country and gender as factors. This was the
# default before R 4.0.0; passing it explicitly gives the same column types
# on newer R versions, where the default is FALSE.
leadership <- data.frame(
  id, manager, country, gender, age, q1, q2, q3, q4, q5,
  stringsAsFactors = TRUE
)

3. Structure of an Arbitrary R Object

# str() gives a compact overview of every column;
# country and gender show up as "Factor"
str(leadership)
## 'data.frame':    5 obs. of  10 variables:
##  $ id     : num  1 2 3 4 5
##  $ manager: num  1 2 3 4 5
##  $ country: Factor w/ 2 levels "UK","US": 2 2 1 1 1
##  $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 2
##  $ age    : num  32 45 25 39 99
##  $ q1     : num  5 3 3 3 2
##  $ q2     : num  4 5 5 3 2
##  $ q3     : num  5 2 5 4 1
##  $ q4     : num  5 5 5 NA 2
##  $ q5     : num  5 5 2 NA 1
# turn the numeric manager column into a factor
leadership[["manager"]] <- factor(leadership[["manager"]])
str(leadership)
## 'data.frame':    5 obs. of  10 variables:
##  $ id     : num  1 2 3 4 5
##  $ manager: Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5
##  $ country: Factor w/ 2 levels "UK","US": 2 2 1 1 1
##  $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 2
##  $ age    : num  32 45 25 39 99
##  $ q1     : num  5 3 3 3 2
##  $ q2     : num  4 5 5 3 2
##  $ q3     : num  5 2 5 4 1
##  $ q4     : num  5 5 5 NA 2
##  $ q5     : num  5 5 2 NA 1

4. Writing and reading files

# write the data frame as comma-separated text (row names are written too)
write.table(leadership, "leadership.txt", sep = ",")
# read it back, keeping text columns as character instead of factor
leadership1 <- read.table("leadership.txt", header = TRUE, sep = ",",
                          stringsAsFactors = FALSE)
# same round trip through a .csv file
write.table(leadership1, "leadership.csv", sep = ",")
# read.csv() is read.table() with header = TRUE and sep = "," as defaults;
# read the .csv file that was just written
leadership2 <- read.csv("leadership.csv")

5. NA values

# logical matrix flagging every missing (NA) cell
is.na(leadership)
##         id manager country gender   age    q1    q2    q3    q4    q5
## [1,] FALSE   FALSE   FALSE  FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE   FALSE   FALSE  FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE   FALSE   FALSE  FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE   FALSE   FALSE  FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [5,] FALSE   FALSE   FALSE  FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# total number of missing cells; two statements can also share one line
# when separated by ";"
sum(is.na(leadership))
## [1] 2
# "!" negates a logical value, so this counts the non-missing cells
sum(!is.na(leadership))
## [1] 48
# proportion of cells in the data frame that are NA
mean(is.na(leadership))
## [1] 0.04
# one logical per row: TRUE when the row is complete (i.e. has no NA)
complete.cases(leadership)
## [1]  TRUE  TRUE  TRUE FALSE  TRUE
# number of complete rows
sum(complete.cases(leadership))
## [1] 4
# drop every row that contains an NA
leadershipNoMissing <- na.omit(leadership)

6. Taking subset and creating new variable(column)

# taking row 1
leadership[1, ]
##   id manager country gender age q1 q2 q3 q4 q5
## 1  1       1      US   Male  32  5  4  5  5  5
# taking col 2
leadership[, 2]
## [1] 1 2 3 4 5
## Levels: 1 2 3 4 5
# taking row 3 and column 4
leadership[3, 4]
## [1] Female
## Levels: Female Male
# taking first 3 rows
leadership[1:3, ]
##   id manager country gender age q1 q2 q3 q4 q5
## 1  1       1      US   Male  32  5  4  5  5  5
## 2  2       2      US Female  45  3  5  2  5  5
## 3  3       3      UK Female  25  3  5  5  5  2
# taking first 4 columns
leadership[, 1:4]
##   id manager country gender
## 1  1       1      US   Male
## 2  2       2      US Female
## 3  3       3      UK Female
## 4  4       4      UK   Male
## 5  5       5      UK   Male
# extracts the column named "manager".
# "$" works on data frames and, more generally, on any list
leadership$manager
## [1] 1 2 3 4 5
## Levels: 1 2 3 4 5
# another way of taking a subset: inside subset() the columns of the data
# frame can be referred to by name, without the "leadership$" prefix
s1 <- subset(leadership, country == "UK")
# similarly, with a logical row index
s2 <- leadership[leadership$country == "UK", ]
# more subsetting: keep the UK rows and only the columns q1 to q5
s3 <- subset(leadership, country == "UK", select = q1:q5)

# creating new variable(column)
# with() evaluates the expression using the columns of leadership itself.
# attach() is avoided: when global vectors with the same names exist, they
# mask the attached columns and the calculation silently uses the globals.
leadership$mean <- with(leadership, (q1 + q2 + q3 + q4 + q5) / 5)
# row 4 has NA in q4 and q5, so its mean is NA as well
leadership
##   id manager country gender age q1 q2 q3 q4 q5 mean
## 1  1       1      US   Male  32  5  4  5  5  5  4.8
## 2  2       2      US Female  45  3  5  2  5  5  4.0
## 3  3       3      UK Female  25  3  5  5  5  2  4.0
## 4  4       4      UK   Male  39  3  3  4 NA NA   NA
## 5  5       5      UK   Male  99  2  2  1  2  1  1.6

7. Recoding

# recoding 99 to NA: 99 is treated as a missing-value code, so it is replaced
# in the age column itself and can never be binned as a real age
leadership$age[leadership$age == 99] <- NA
# derive an age category; start from NA so rows with unknown age stay NA
leadership$agecat <- NA_character_
# which() drops the NA comparisons and keeps only the rows that really match
leadership$agecat[which(leadership$age <= 55)] <- "Young"
leadership$agecat[which(leadership$age > 55 &
                          leadership$age <= 75)] <- "Middle Aged"
leadership$agecat[which(leadership$age > 75)] <- "Elder"
leadership
##   id manager country gender age q1 q2 q3 q4 q5 mean agecat
## 1  1       1      US   Male  32  5  4  5  5  5  4.8  Young
## 2  2       2      US Female  45  3  5  2  5  5  4.0  Young
## 3  3       3      UK Female  25  3  5  5  5  2  4.0  Young
## 4  4       4      UK   Male  39  3  3  4 NA NA   NA  Young
## 5  5       5      UK   Male  NA  2  2  1  2  1  1.6   <NA>

8. Other useful functions

# row binds: stacks the rows of the second data frame under the first
dfrow <- rbind(leadership, leadership)
# column binds: puts the columns side by side (column names get duplicated)
dfcol <- cbind(leadership, leadership)

# check dimension (rows, then columns)
dim(dfrow); dim(dfcol)
## [1] 10 12
## [1]  5 24
# first 6 lines
head(dfrow)
##   id manager country gender age q1 q2 q3 q4 q5 mean agecat
## 1  1       1      US   Male  32  5  4  5  5  5  4.8  Young
## 2  2       2      US Female  45  3  5  2  5  5  4.0  Young
## 3  3       3      UK Female  25  3  5  5  5  2  4.0  Young
## 4  4       4      UK   Male  39  3  3  4 NA NA   NA  Young
## 5  5       5      UK   Male  NA  2  2  1  2  1  1.6   <NA>
## 6  1       1      US   Male  32  5  4  5  5  5  4.8  Young
# first 3 lines
head(dfrow, 3)
##   id manager country gender age q1 q2 q3 q4 q5 mean agecat
## 1  1       1      US   Male  32  5  4  5  5  5  4.8  Young
## 2  2       2      US Female  45  3  5  2  5  5  4.0  Young
## 3  3       3      UK Female  25  3  5  5  5  2  4.0  Young
# last 6 lines (dfcol has only 5 rows, so all of them are shown)
tail(dfcol)
##   id manager country gender age q1 q2 q3 q4 q5 mean agecat id manager
## 1  1       1      US   Male  32  5  4  5  5  5  4.8  Young  1       1
## 2  2       2      US Female  45  3  5  2  5  5  4.0  Young  2       2
## 3  3       3      UK Female  25  3  5  5  5  2  4.0  Young  3       3
## 4  4       4      UK   Male  39  3  3  4 NA NA   NA  Young  4       4
## 5  5       5      UK   Male  NA  2  2  1  2  1  1.6   <NA>  5       5
##   country gender age q1 q2 q3 q4 q5 mean agecat
## 1      US   Male  32  5  4  5  5  5  4.8  Young
## 2      US Female  45  3  5  2  5  5  4.0  Young
## 3      UK Female  25  3  5  5  5  2  4.0  Young
## 4      UK   Male  39  3  3  4 NA NA   NA  Young
## 5      UK   Male  NA  2  2  1  2  1  1.6   <NA>

Return to contents page