Note: The purpose of this document is to remind me of some of the concepts, code, and commands in R that I was interested in when I first learned R. I also intend to use this document as a guide if I were to help out a co-worker or friend with unfamiliar R concepts.
Figure 1.1, 1.2, 1.3,1.4
A dataset is a rectangular array of data with rows representing observations and columns representing variables.
** A matrix has two dimensions; an array can have more than two dimensions.
Creating matrix with names:
# byrow = FALSE (the default) fills the matrix column by column.
y <- matrix(1:30, nrow = 5, ncol = 6, byrow = FALSE)
# Store the labels under names that do not mask base::colnames()/rownames().
col_labels <- c("One", "Two", "Three", "Four", "Five", "Six")
row_labels <- c("yi", "er", "san", "si", "wu")
# dimnames only accepts a list: row names first, then column names.
y <- matrix(1:30, nrow = 5, ncol = 6, byrow = FALSE,
            dimnames = list(row_labels, col_labels))
print(y)
## One Two Three Four Five Six
## yi 1 6 11 16 21 26
## er 2 7 12 17 22 27
## san 3 8 13 18 23 28
## si 4 9 14 19 24 29
## wu 5 10 15 20 25 30
Creating array with names:
# One label vector per dimension: 2 rows, 3 columns, 4 layers.
dim1 <- paste0("A", 1:2)
dim2 <- paste0("B", 1:3)
dim3 <- paste0("C", 1:4)
# Values fill the first dimension fastest, then the second, then the third.
z <- array(
  1:24,
  dim = c(2, 3, 4),
  dimnames = list(dim1, dim2, dim3)
)
z
## , , C1
##
## B1 B2 B3
## A1 1 3 5
## A2 2 4 6
##
## , , C2
##
## B1 B2 B3
## A1 7 9 11
## A2 8 10 12
##
## , , C3
##
## B1 B2 B3
## A1 13 15 17
## A2 14 16 18
##
## , , C4
##
## B1 B2 B3
## A1 19 21 23
## A2 20 22 24
Creating a data frame:
A data frame lets you create something like an Excel sheet; the row numbers are shown on the left. You can set the column names by naming the variables at the time you create the data frame.
# Three equal-length vectors become the three columns of the data frame.
col1 <- as.numeric(1:5)
col2 <- col1 * 11
col3 <- paste0("Type", c(1, 2, 1, 1, 2))
# Naming the arguments explicitly gives the same column names as before.
mydataframe <- data.frame(col1 = col1, col2 = col2, col3 = col3)
mydataframe
## col1 col2 col3
## 1 1 11 Type1
## 2 2 22 Type2
## 3 3 33 Type1
## 4 4 44 Type1
## 5 5 55 Type2
Creating a factor:
diabetes <- c("T1", "T2", "T3")
# `levels` must list the values that actually occur in the data. The earlier
# levels = c(1, 2, 3) matched none of "T1"/"T2"/"T3", so every element was NA.
# ordered = TRUE makes the levels comparable (T1 < T2 < T3).
FactorDiabetes <- factor(diabetes, levels = c("T1", "T2", "T3"), ordered = TRUE)
FactorDiabetes
## [1] T1 T2 T3
## Levels: T1 < T2 < T3
Creating a list:
# General form: mylist <- list(object1, object2, object3, ...)
# Elements can be named (title, ages) or left unnamed (reached by position).
# TRUE is used instead of T because T is an ordinary variable that can be
# reassigned.
if (TRUE) {
  g <- "My first List"
  h <- c(23, 23, 44, 33)
  j <- matrix(1:12, nrow = 4)
  k <- c("one", "two", "three")
  mylist <- list(title = g, ages = h, j, k)
  mylist
}
## $title
## [1] "My first List"
##
## $ages
## [1] 23 23 44 33
##
## [[3]]
## [,1] [,2] [,3]
## [1,] 1 5 9
## [2,] 2 6 10
## [3,] 3 7 11
## [4,] 4 8 12
##
## [[4]]
## [1] "one" "two" "three"
# You can wrap code in if (FALSE) { ... } to keep R from running it.
Creating a table to manually enter data:
# Start from an empty data frame whose columns already have the right types.
mydata <- data.frame(
  age = numeric(0),
  gender = character(0),
  weight = numeric(0)
)
# Uncomment the next line to open the interactive editor when rerunning.
# mydata <- edit(mydata)
mydata
## [1] age gender weight
## <0 rows> (or 0-length row.names)
# you can also use fix(mydata) if you need to update anything
Importing Data:
# read.table()
# read.csv()
# read.xlsx()
# you need to install.packages("xlsx")
# read.spss()
# library(Hmisc)
# read.ssd() and sas.get() for SAS data
Figure 2.2, Table 2.2
# Show the current working directory.
getwd()
## [1] "C:/Users/yduan3/Desktop"
# Change the working directory. The path is a hard-coded absolute path, so it
# has to be edited before running this on another machine.
setwd("C:/Users/yduan3/Desktop")
# Everything drawn between pdf() and dev.off() is written to the PDF file.
pdf("Ethan.pdf")
# with() looks up wt and mpg inside mtcars without attach(), so nothing is
# left on the search path and global variables cannot mask the columns.
with(mtcars, {
  plot(wt, mpg)
  abline(lm(mpg ~ wt))
  title("Regression of MPG on Weight")
})
# Close the device so the file is flushed to disk.
dev.off()
## png
## 2
# win.metafile(), png(), jpeg(), bmp(), tiff(), xfig() and postscript() are all used the same way.
dose <- c(20, 30, 40, 45, 60)
drugA <- c(16, 20, 27, 40, 60)
drugB <- c(15, 18, 25, 31, 40)

# Option 1: change the graphical parameters globally, plot, then restore the
# saved settings.
opar <- par(no.readonly = TRUE)
par(lty = 2, pch = 17)
plot(dose, drugA, type = "b")
par(opar)

# Option 2: pass the graphical parameters straight to plot(), which affects
# only this one figure.
plot(
  dose, drugA,
  type = "b", lty = 2, pch = 17,
  col.axis = "red", col.lab = "Blue", col.main = "purple",
  fg = "pink", bg = "yellow"
)
Table 3.2, figure 3.4, 3.5. Table 3.4, 3.5
# This chapter also included how to add a line and adding texts on different
# areas of the graph. I finished chapter 3 but didn't take notes on R studio.
# Please refer to the book.
Now I am following one of the examples in the book:
# One vector per survey column; they are combined into a data frame below.
manager <- as.numeric(1:5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
# Questionnaire items q1..q5; manager 4 did not answer q4 and q5.
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
# stringsAsFactors = FALSE keeps date, country and gender as character.
leadership <- data.frame(
  manager = manager, date = date, country = country,
  gender = gender, age = age,
  q1 = q1, q2 = q2, q3 = q3, q4 = q4, q5 = q5,
  stringsAsFactors = FALSE
)
leadership
## manager date country gender age q1 q2 q3 q4 q5
## 1 1 10/24/08 US M 32 5 4 5 5 5
## 2 2 10/28/08 US F 45 3 5 2 5 5
## 3 3 10/1/08 UK F 25 3 5 5 5 2
## 4 4 10/12/08 UK M 39 3 3 4 NA NA
## 5 5 5/1/09 UK F 99 2 2 1 2 1
New variables are created with statements of the form: variable <- expression
mydata <- data.frame(
  x1 = c(2, 2, 6, 4),
  x2 = c(3, 4, 2, 8)
)

# Option 1: spell out the data frame on every column reference.
mydata$sumx <- mydata$x1 + mydata$x2
mydata$meanx <- (mydata$x1 + mydata$x2) / 2
mydata
## x1 x2 sumx meanx
## 1 2 3 5 2.5
## 2 2 4 6 3.0
## 3 6 2 8 4.0
## 4 4 8 12 6.0

# Option 2: with() evaluates the expression inside mydata. It replaces the
# attach()/detach() pair, which leaves the data frame on the search path if an
# error occurs in between and lets same-named global variables mask columns.
mydata$sumx <- with(mydata, x1 + x2)
mydata$meanx <- with(mydata, (x1 + x2) / 2)

# Option 3: transform() creates or replaces the columns in one call.
mydata <- transform(mydata,
                    sumx = x1 + x2,
                    meanx = (x1 + x2) / 2)
# An age of 99 is treated as a missing-value code: store NA in its place.
leadership$age <- replace(leadership$age, leadership$age == 99, NA)
leadership
## manager date country gender age q1 q2 q3 q4 q5
## 1 1 10/24/08 US M 32 5 4 5 5 5
## 2 2 10/28/08 US F 45 3 5 2 5 5
## 3 3 10/1/08 UK F 25 3 5 5 5 2
## 4 4 10/12/08 UK M 39 3 3 4 NA NA
## 5 5 5/1/09 UK F NA 2 2 1 2 1
Recoding variables under a certain condition. *The statement variable[condition] <- expression will only make the assignment when condition is TRUE.
# Build the character column agecat one age band at a time; each assignment
# only touches the rows whose condition is TRUE.
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age >= 55 &
leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
# A row whose age is NA satisfies none of the conditions, so its agecat stays NA.
leadership
## manager date country gender age q1 q2 q3 q4 q5 agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
Here’s a more creative way for recoding variables:
# Same recoding as above; inside within() the columns can be referenced by
# bare name, and the modified copy of the data frame is returned.
leadership <- within(leadership, {
  agecat <- NA
  agecat[age > 75] <- "Elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young"
})
leadership
## manager date country gender age q1 q2 q3 q4 q5 agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
Renaming variables:
# fix(leadership) would open the interactive editor for renaming instead.
# Programmatic route: assign into names() by column position.
names(leadership)[2] <- "testDate"
names(leadership)[6:10] <- paste0("item", 1:5)
leadership
## manager testDate country gender age item1 item2 item3 item4 item5 agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
Missing values:
# is.na() returns TRUE wherever a value is missing.
y <- c(1, 2, 3, NA)
is.na(y)
# Applied to columns 6 to 10 of the data frame, it returns a logical matrix.
is.na(leadership[6:10])
## item1 item2 item3 item4 item5
## [1,] FALSE FALSE FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE TRUE TRUE
## [5,] FALSE FALSE FALSE FALSE FALSE
Recoding 99 values to NA
# %in% never yields NA, so the mask is plain TRUE/FALSE even when age already
# contains missing values.
leadership$age[leadership$age %in% 99] <- NA
# Arithmetic propagates NA: because x[3] is NA, both y and z come out as NA.
x <- c(1, 2, NA, 3)
y <- x[1] + x[2] + x[3] + x[4]
z <- sum(x)
# na.rm = TRUE removes the NA values before summing, so y is 6 here.
x <- c(1, 2, NA, 3)
y <- sum(x, na.rm=TRUE)
# na.omit() deletes every row that contains missing data; this is called
# listwise deletion.
leadership
## manager testDate country gender age item1 item2 item3 item4 item5 agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
# Rows 4 and 5 each contain at least one NA, so only rows 1 to 3 remain.
newdata <- na.omit(leadership)
newdata
## manager testDate country gender age item1 item2 item3 item4 item5 agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 Young
Now onto date values:
# Strings already in yyyy-mm-dd layout need no format specification.
mydates <- as.Date(c("2007-06-22", "2004-02-13"))
mydates
## [1] "2007-06-22" "2004-02-13"
# Any other layout has to be described with an explicit format string.
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, format = "%m/%d/%Y")
strDates
## [1] "01/05/1965" "08/16/1975"
dates
## [1] "1965-01-05" "1975-08-16"
# Sys.Date() gives today's date; date() gives the current date and time as text.
Sys.Date()
## [1] "2019-06-13"
date()
## [1] "Thu Jun 13 15:47:22 2019"
Type conversion:
Sorting Data:
# Sort the rows by ascending age; order() places the NA age last.
newdata <- with(leadership, leadership[order(age), ])
newdata
## manager testDate country gender age item1 item2 item3 item4 item5 agecat
## 3 3 10/1/08 UK F 25 3 5 5 5 2 Young
## 1 1 10/24/08 US M 32 5 4 5 5 5 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
# Sort by gender, then by age within each gender.
# with() resolves gender and age to the columns of leadership. attach() did
# not: global vectors named age, country, gender and manager already exist, so
# they masked the attached columns and order() sorted on the stale global age
# (which still holds 99 instead of NA).
newdata <- with(leadership, leadership[order(gender, age), ])
newdata
## manager testDate country gender age item1 item2 item3 item4 item5 agecat
## 3 3 10/1/08 UK F 25 3 5 5 5 2 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
## 1 1 10/24/08 US M 32 5 4 5 5 5 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA Young
Merging datasets:
Adding columns to a dataframe
# total <- merge(dataframeA, dataframeB, by="ID")
# total <- merge(dataframeA, dataframeB, by=c("ID","Country"))
# sqrt() is vectorized: it returns the square root of every element.
sqrt(c(4, 9, 25))
## [1] 2 3 5
"1. x equals c(1, 2, 3, 4, 5, 6, 7, 8), and mean x equals 4.5 (length(x) returns the number of elements in x).
2. (x – meanx) subtracts 4.5 from each element of x, resulting in
c(-3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5)
3. (x – meanx)^2 squares each element of (x - meanx), resulting in
c(12.25, 6.25, 2.25, 0.25, 0.25, 2.25, 6.25, 12.25)
4. sum((x - meanx)^2) sums each of the elements of (x - meanx)^2), resulting in 42."
## [1] "1. x equals c(1, 2, 3, 4, 5, 6, 7, 8), and mean x equals 4.5 (length(x) returns the number of elements in x).\n\n2. (x – meanx) subtracts 4.5 from each element of x, resulting in\n\nc(-3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5)\n\n3. (x – meanx)^2 squares each element of (x - meanx), resulting in\n\nc(12.25, 6.25, 2.25, 0.25, 0.25, 2.25, 6.25, 12.25)\n\n4. sum((x - meanx)^2) sums each of the elements of (x - meanx)^2), resulting in 42."
yy <- c(2, 4, 5, 3, 5, 3, 6, 2)
mean(yy)
## [1] 3.75
sd(yy)
## [1] 1.488048
# Rescale to a desired mean (4) and standard deviation (5).
# transform() is meant for data frames; given the bare vector it produced a
# spurious extra column named X_data. Wrap yy in a data frame first, and use
# as.vector() because scale() returns a one-column matrix.
new_yy <- transform(data.frame(yy), yy = as.vector(scale(yy)) * 5 + 4)
new_yy
## yy
## 1 -1.880188
## 2 4.840027
## 3 8.200134
## 4 1.479919
## 5 8.200134
## 6 1.479919
## 7 11.560242
## 8 -1.880188
"Each time you generate pseudo-random deviates, a different seed, and therefore different results, are produced. To make your results reproducible, you can specify the seed explicitly, using the set.seed() function. An example is given in the next listing. Here, the runif() function is used to generate pseudo-random numbers from a uniform distribution on the interval 0 to 1."
## [1] "Each time you generate pseudo-random deviates, a different seed, and therefore different results, are produced. To make your results reproducible, you can specify the seed explicitly, using the set.seed() function. An example is given in the next listing. Here, the runif() function is used to generate pseudo-random numbers from a uniform distribution on the interval 0 to 1."
# Without a fixed seed, each call to runif() returns different numbers. The
# plain "#" lines below hold the values noted down from an earlier run.
runif(5)
## [1] 0.063806718 0.004451789 0.833637670 0.599869327 0.395219954
# 0.8725344 0.3962501 0.6826534 0.3667821 0.9255909
runif(5)
## [1] 0.9396374 0.2974506 0.2667988 0.2592743 0.5693475
# 0.4273903 0.2641101 0.3550058 0.3233044 0.6584988
# Setting the same seed before each call reproduces the same five numbers,
# both across the two calls here and across the earlier run.
set.seed(1234)
runif(5)
## [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
# 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
set.seed(1234)
runif(5)
## [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
# 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
Character functions: