Hi all, this section is about data preparation using R code. The topics covered are:
- Forming a vector
- Creating a matrix / data frame
- Structure of an Arbitrary R Object
- Writing and reading files
- NA values
- Taking subset and creating new variable(column)
- Recoding
- Other useful functions
2. Creating a matrix / data frame
# Create a 4 x 5 matrix; the values 1..20 are filled in column by column.
m <- matrix(1:20, nrow = 4, ncol = 5)

# Column vectors for a small five-row survey data set.
id <- c(1, 2, 3, 4, 5)
manager <- c(1, 2, 3, 4, 5)
country <- c("US", "US", "UK", "UK", "UK")
age <- c(32, 45, 25, 39, 99)
# Draw 5 values with replacement. No seed is set, so the drawn genders
# differ from run to run.
gender <- sample(c("Male", "Female"), 5, replace = TRUE)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)

# Combine the vectors into a data frame, one column per vector.
# stringsAsFactors = TRUE is spelled out because its default changed to FALSE
# in R 4.0.0; with it, country and gender are stored as factors on every
# R version.
leadership <- data.frame(
  id, manager, country, gender, age, q1, q2, q3, q4, q5,
  stringsAsFactors = TRUE
)
3. Structure of an Arbitrary R Object
# compactly display the structure of the data frame.
# note country & gender are "Factor" in the output shown here; since R 4.0.0
# data.frame() keeps strings as character unless stringsAsFactors = TRUE is passed
str(leadership)
## 'data.frame': 5 obs. of 10 variables:
## $ id : num 1 2 3 4 5
## $ manager: num 1 2 3 4 5
## $ country: Factor w/ 2 levels "UK","US": 2 2 1 1 1
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 2
## $ age : num 32 45 25 39 99
## $ q1 : num 5 3 3 3 2
## $ q2 : num 4 5 5 3 2
## $ q3 : num 5 2 5 4 1
## $ q4 : num 5 5 5 NA 2
## $ q5 : num 5 5 2 NA 1
# Recode the numeric manager IDs as a categorical variable (factor),
# then inspect the structure again to confirm the new column type.
leadership[["manager"]] <- factor(leadership[["manager"]])
str(leadership)
## 'data.frame': 5 obs. of 10 variables:
## $ id : num 1 2 3 4 5
## $ manager: Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5
## $ country: Factor w/ 2 levels "UK","US": 2 2 1 1 1
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 2
## $ age : num 32 45 25 39 99
## $ q1 : num 5 3 3 3 2
## $ q2 : num 4 5 5 3 2
## $ q3 : num 5 2 5 4 1
## $ q4 : num 5 5 5 NA 2
## $ q5 : num 5 5 2 NA 1
4. Writing and reading files
- write.table()
- read.table()
- read.csv()
# Write the data frame to a comma-separated text file (row names included).
write.table(leadership, "leadership.txt", sep = ",")
# Read it back: header = TRUE takes the column names from the first line and
# stringsAsFactors = FALSE keeps text columns as character vectors.
leadership1 <- read.table(
  "leadership.txt",
  header = TRUE, sep = ",", stringsAsFactors = FALSE
)
# Write the re-read copy out again, this time with a .csv extension.
write.table(leadership1, "leadership.csv", sep = ",")
# Read the .csv file that was just written; read.csv() already defaults to
# header = TRUE and sep = ",".
leadership2 <- read.csv("leadership.csv")
5. NA values
- is.na()
- !is.na()
- complete.cases()
- na.omit()
# logical matrix flagging every NA cell, followed by the total number of NAs.
# ";" separates two statements written on the same line
is.na(leadership); sum(is.na(leadership))
## id manager country gender age q1 q2 q3 q4 q5
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [5,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [1] 2
# "!" negates a logical value, so this counts the cells that are NOT missing
sum(!is.na(leadership))
## [1] 48
# proportion of NA cells in the data frame (the mean of a logical is the share of TRUE)
mean(is.na(leadership))
## [1] 0.04
# one logical value per row: TRUE when the row is complete (i.e. contains no NA)
complete.cases(leadership)
## [1] TRUE TRUE TRUE FALSE TRUE
# number of complete observations (rows)
sum(complete.cases(leadership))
## [1] 4
# drop every row that contains at least one NA value
leadershipNoMissing <- na.omit(leadership)
6. Taking subset and creating new variable(column)
- square brackets
- subset()
- $
# taking row 1 (all columns); the result is a one-row data frame
leadership[1,]
## id manager country gender age q1 q2 q3 q4 q5
## 1 1 1 US Male 32 5 4 5 5 5
# taking col 2 (all rows); a single column comes back as a vector (here a factor)
leadership[,2]
## [1] 1 2 3 4 5
## Levels: 1 2 3 4 5
# taking the single value at row 3 and column 4
leadership[3,4]
## [1] Female
## Levels: Female Male
# taking first 3 rows (all columns)
leadership[1:3,]
## id manager country gender age q1 q2 q3 q4 q5
## 1 1 1 US Male 32 5 4 5 5 5
## 2 2 2 US Female 45 3 5 2 5 5
## 3 3 3 UK Female 25 3 5 5 5 2
# taking first 4 columns (1:4), all rows
leadership[,1:4]
## id manager country gender
## 1 1 1 US Male
## 2 2 2 US Female
## 3 3 3 UK Female
## 4 4 4 UK Male
## 5 5 5 UK Male
# extracts the column named "manager" as a vector.
# "$" works on data frames and lists, but not on matrices or atomic vectors
leadership$manager
## [1] 1 2 3 4 5
## Levels: 1 2 3 4 5
# subset() keeps the rows where the condition is TRUE; column names can be
# used directly inside the call.
s1 <- subset(leadership, country == "UK")
# Similarly, the same rows selected with a logical index in square brackets.
s2 <- leadership[leadership[["country"]] == "UK", ]
# Filter the rows and keep only the columns q1 through q5.
s3 <- subset(leadership, country == "UK", select = q1:q5)
# Create a new variable (column): the mean of the five question scores.
# with() evaluates the expression using the data frame's own columns. This
# avoids attach(), whose columns are masked by same-named objects in the
# global environment, so the sum would silently use the global vectors.
# A row with a missing answer gets NA.
leadership$mean <- with(leadership, (q1 + q2 + q3 + q4 + q5) / 5)
leadership
## id manager country gender age q1 q2 q3 q4 q5 mean
## 1 1 1 US Male 32 5 4 5 5 5 4.8
## 2 2 2 US Female 45 3 5 2 5 5 4.0
## 3 3 3 UK Female 25 3 5 5 5 2 4.0
## 4 4 4 UK Male 39 3 3 4 NA NA NA
## 5 5 5 UK Male 99 2 2 1 2 1 1.6
7. Recoding
- $, square brackets, <, <=, ==, >
# Bin age into categories stored in a new character column "agecat".
# Start from an all-NA column, then fill each age band with a logical index.
# NOTE(review): age 99 ends up as "Elder" because 99 > 75. If 99 is meant to
# be a missing-value code, recode it first with
#   leadership$age[leadership$age == 99] <- NA
# so that agecat stays NA for that row.
leadership$agecat <- NA_character_
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age > 55 & leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age <= 55] <- "Young"
leadership
## id manager country gender age q1 q2 q3 q4 q5 mean agecat
## 1 1 1 US Male 32 5 4 5 5 5 4.8 Young
## 2 2 2 US Female 45 3 5 2 5 5 4.0 Young
## 3 3 3 UK Female 25 3 5 5 5 2 4.0 Young
## 4 4 4 UK Male 39 3 3 4 NA NA NA Young
## 5 5 5 UK Male 99 2 2 1 2 1 1.6 Elder
8. Other useful functions
- rbind()
- cbind()
- dim()
- head()
- tail()
# Stack the data frame on top of itself: twice the rows, same columns.
dfrow <- rbind(leadership, leadership)
# Place the data frame beside itself: same rows, twice the columns
# (the column names are duplicated).
dfcol <- cbind(leadership, leadership)
# Dimensions (rows, columns) of each result.
dim(dfrow)
dim(dfcol)
## [1] 10 12
## [1] 5 24
# First six rows (six is the default n for head()).
head(dfrow, n = 6)
## id manager country gender age q1 q2 q3 q4 q5 mean agecat
## 1 1 1 US Male 32 5 4 5 5 5 4.8 Young
## 2 2 2 US Female 45 3 5 2 5 5 4.0 Young
## 3 3 3 UK Female 25 3 5 5 5 2 4.0 Young
## 4 4 4 UK Male 39 3 3 4 NA NA NA Young
## 5 5 5 UK Male 99 2 2 1 2 1 1.6 Elder
## 6 1 1 US Male 32 5 4 5 5 5 4.8 Young
# First three rows only.
head(dfrow, n = 3)
## id manager country gender age q1 q2 q3 q4 q5 mean agecat
## 1 1 1 US Male 32 5 4 5 5 5 4.8 Young
## 2 2 2 US Female 45 3 5 2 5 5 4.0 Young
## 3 3 3 UK Female 25 3 5 5 5 2 4.0 Young
# Last six rows; dfcol has only five rows, so all of them are printed.
tail(dfcol, n = 6)
## id manager country gender age q1 q2 q3 q4 q5 mean agecat id manager
## 1 1 1 US Male 32 5 4 5 5 5 4.8 Young 1 1
## 2 2 2 US Female 45 3 5 2 5 5 4.0 Young 2 2
## 3 3 3 UK Female 25 3 5 5 5 2 4.0 Young 3 3
## 4 4 4 UK Male 39 3 3 4 NA NA NA Young 4 4
## 5 5 5 UK Male 99 2 2 1 2 1 1.6 Elder 5 5
## country gender age q1 q2 q3 q4 q5 mean agecat
## 1 US Male 32 5 4 5 5 5 4.8 Young
## 2 US Female 45 3 5 2 5 5 4.0 Young
## 3 UK Female 25 3 5 5 5 2 4.0 Young
## 4 UK Male 39 3 3 4 NA NA NA Young
## 5 UK Male 99 2 2 1 2 1 1.6 Elder