This tutorial provides hands-on practice in R programming. It assumes that you have basic familiarity with programming in some other language. Little theory is provided; if you have questions about a function, use ?functionName to read its documentation.

1 Getting Started

1.1 Installing and Loading Packages

# install.packages("ggplot2")
# install.packages("coefplot")

I’ve commented out the installation commands because I already have these packages installed.

library(ggplot2)

require(coefplot)

## Loading required package: coefplot

require(useful)

## Loading required package: useful

ggplot

## function (data = NULL, ...) 
## UseMethod("ggplot")
## <environment: namespace:ggplot2>

1.2 Simple Calculations

1 + 1

## [1] 2

2 * 3

## [1] 6

3 / 4

## [1] 0.75

3 %% 2

## [1] 1

5 - 3

## [1] 2

8 / (3 + 4)

## [1] 1.142857

2 ** 3

## [1] 8

1 + 2i

## [1] 1+2i

(1  + 2i) - (3 + 4i)

## [1] -2-2i

R follows the standard order of operations (PEMDAS), which should be familiar from mathematics.

1.3 Variables

x <- 2

x + 1

## [1] 3

y <- 3

x * y

## [1] 6

a <- b <- 5

a

## [1] 5

b

## [1] 5

assign(x = "var", value = 10)

var

## [1] 10

rm(a)

rm(list = ls())

Using = for assignment isn’t recommended in the R community, although it works equally well in most situations.

a = 23

R is case-sensitive

hero <- 1

# Hero

# Error : object 'Hero' not found

rm(list = ls())

1.4 Data Types

R is dynamically-typed.

x <- 2

class(x)

## [1] "numeric"

is.numeric(x)

## [1] TRUE

i <- 5L

class(i)

## [1] "integer"

is.integer(i)

## [1] TRUE

is.numeric(i)

## [1] TRUE

class(4L)

## [1] "integer"

4L * 2.8

## [1] 11.2

class(4L * 2.8)

## [1] "numeric"

5L / 2L

## [1] 2.5

class(5L / 2L)

## [1] "numeric"

x <- "data"

x

## [1] "data"

class(x)

## [1] "character"

y <- factor("data")

y

## [1] data
## Levels: data

class(y)

## [1] "factor"

nchar(x)

## [1] 4

nchar("hello")

## [1] 5

nchar(3)

## [1] 1

nchar(452)

## [1] 3

Note: nchar() does not work on factors; convert a factor with as.character() first.

date1 <- as.Date("2012-06-23")

date1

## [1] "2012-06-23"

class(date1)

## [1] "Date"

as.numeric(date1)

## [1] 15514

date2 <- as.POSIXct(x = "2012-06-23 17:32")

date2

## [1] "2012-06-23 17:32:00 IST"

class(date2)

## [1] "POSIXct" "POSIXt"

as.numeric(date2)

## [1] 1340452920

TRUE

## [1] TRUE

FALSE

## [1] FALSE

TRUE * 5

## [1] 5

FALSE * 5

## [1] 0

as.numeric(TRUE)

## [1] 1

as.numeric(FALSE)

## [1] 0

class(TRUE)

## [1] "logical"

is.logical(TRUE)

## [1] TRUE

T

## [1] TRUE

F

## [1] FALSE

T <- 1

T <- 100

T

## [1] 100

class(T)

## [1] "numeric"

T <- TRUE

T

## [1] TRUE

2 == 2

## [1] TRUE

2 == 3

## [1] FALSE

2 != 3

## [1] TRUE

2 < 3

## [1] TRUE

2 > 3

## [1] FALSE

2 <= 3

## [1] TRUE

2 >= 3

## [1] FALSE

"data" == "data"

## [1] TRUE

"data" == "Data"

## [1] FALSE

"data" < "Data"

## [1] TRUE

rm(list = ls())

1.5 Vectors

x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

x * 3

##  [1]  3  6  9 12 15 18 21 24 27 30

x + 2

##  [1]  3  4  5  6  7  8  9 10 11 12

x - 4

##  [1] -3 -2 -1  0  1  2  3  4  5  6

x / 8

##  [1] 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000 1.125 1.250

x ** 6

##  [1]       1      64     729    4096   15625   46656  117649  262144
##  [9]  531441 1000000

sqrt(x)

##  [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751
##  [8] 2.828427 3.000000 3.162278

1:10

##  [1]  1  2  3  4  5  6  7  8  9 10

10:1

##  [1] 10  9  8  7  6  5  4  3  2  1

-2:5

## [1] -2 -1  0  1  2  3  4  5

5:-6

##  [1]  5  4  3  2  1  0 -1 -2 -3 -4 -5 -6

x <- 1:10

y <- -5:4

x + y

##  [1] -4 -2  0  2  4  6  8 10 12 14

x - y

##  [1] 6 6 6 6 6 6 6 6 6 6

x * y

##  [1] -5 -8 -9 -8 -5  0  7 16 27 40

x / y

##  [1] -0.2 -0.5 -1.0 -2.0 -5.0  Inf  7.0  4.0  3.0  2.5

x ** y

##  [1] 1.000000e+00 6.250000e-02 3.703704e-02 6.250000e-02 2.000000e-01
##  [6] 1.000000e+00 7.000000e+00 6.400000e+01 7.290000e+02 1.000000e+04

x ^ y

##  [1] 1.000000e+00 6.250000e-02 3.703704e-02 6.250000e-02 2.000000e-01
##  [6] 1.000000e+00 7.000000e+00 6.400000e+01 7.290000e+02 1.000000e+04

length(x)

## [1] 10

length(y)

## [1] 10

length(x + y)

## [1] 10

x + c(1, 3)

##  [1]  2  5  4  7  6  9  8 11 10 13

x + c(1, 3, 5)

## Warning in x + c(1, 3, 5): longer object length is not a multiple of
## shorter object length

##  [1]  2  5  8  5  8 11  8 11 14 11

x <= 4

##  [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE

x[x <= 4]

## [1] 1 2 3 4

x > y

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

y > x

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

x <- 10:1

y <- -4:5

x < y

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE

any(x < y)

## [1] TRUE

all(x < y)

## [1] FALSE

sports <- c("Hockey", "Basketball", "Football", "Cricket", "Badminton", "Table Tennis", "Rugby", "Baseball", "Lawn Tennis", "Soccer")

sports

##  [1] "Hockey"       "Basketball"   "Football"     "Cricket"     
##  [5] "Badminton"    "Table Tennis" "Rugby"        "Baseball"    
##  [9] "Lawn Tennis"  "Soccer"

nchar(sports)

##  [1]  6 10  8  7  9 12  5  8 11  6

number <- 7

number

## [1] 7

x

##  [1] 10  9  8  7  6  5  4  3  2  1

x[1]

## [1] 10

x[c(1, 2)]

## [1] 10  9

x[1:2]

## [1] 10  9

x[c(1, 3, 5, 9)]

## [1] 10  8  6  2

a <- c(One = "a", Two = "y", Three = "r")

names(a)

## [1] "One"   "Two"   "Three"

w <- 1:3

names(w)

## NULL

names(w) <- c("One", "Two", "Three")

w

##   One   Two Three 
##     1     2     3

names(w)

## [1] "One"   "Two"   "Three"

sports2 <- c(sports, "Hockey", "Badminton", "Cricket", "Football", "Hockey", "Water Polo")

sports2

##  [1] "Hockey"       "Basketball"   "Football"     "Cricket"     
##  [5] "Badminton"    "Table Tennis" "Rugby"        "Baseball"    
##  [9] "Lawn Tennis"  "Soccer"       "Hockey"       "Badminton"   
## [13] "Cricket"      "Football"     "Hockey"       "Water Polo"

sports2.factor <- factor(sports2)

sports2.factor

##  [1] Hockey       Basketball   Football     Cricket      Badminton   
##  [6] Table Tennis Rugby        Baseball     Lawn Tennis  Soccer      
## [11] Hockey       Badminton    Cricket      Football     Hockey      
## [16] Water Polo  
## 11 Levels: Badminton Baseball Basketball Cricket Football ... Water Polo

table(sports2.factor)

## sports2.factor
##    Badminton     Baseball   Basketball      Cricket     Football 
##            2            1            1            2            2 
##       Hockey  Lawn Tennis        Rugby       Soccer Table Tennis 
##            3            1            1            1            1 
##   Water Polo 
##            1

class(sports2.factor)

## [1] "factor"

as.numeric(sports2.factor)

##  [1]  6  3  5  4  1 10  8  2  7  9  6  1  4  5  6 11

rm(list = ls())

z <- c(1, 2, 3, NA, 8, 9, 10, NA)

z

## [1]  1  2  3 NA  8  9 10 NA

is.na(z)

## [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE

z[!is.na(z)]

## [1]  1  2  3  8  9 10

z.character <- c("Hockey", NA, "Cricket")

is.na(z.character)

## [1] FALSE  TRUE FALSE

z <- c(1, NULL, 3)

z

## [1] 1 3

d <- NULL

is.null(d)

## [1] TRUE

is.null(z)

## [1] FALSE

rm(list = ls())

1.6 Function Calls

x <- 1:10

mean(x)

## [1] 5.5

sum(x)

## [1] 55

nchar(x)

##  [1] 1 1 1 1 1 1 1 1 1 2

x <- 1:1000

x[200:300] <- NA

mean(x)

## [1] NA

mean(x = x, na.rm = TRUE)

## [1] 528.6429

mean(x = x, trim = 0.1 , na.rm = TRUE)

## [1] 535.5908

2 Data Structures

2.1 Data Frames

x <- 10:1

y <- -4:5

q <- c("Hockey", "Basketball", "Cricket", "Billiards", "Chess", "Table Tennis", "Rugby", "Water Polo", "Lawn Tennis", "Football")

df <- data.frame(x, y, q)

df

##     x  y            q
## 1  10 -4       Hockey
## 2   9 -3   Basketball
## 3   8 -2      Cricket
## 4   7 -1    Billiards
## 5   6  0        Chess
## 6   5  1 Table Tennis
## 7   4  2        Rugby
## 8   3  3   Water Polo
## 9   2  4  Lawn Tennis
## 10  1  5     Football

df <- data.frame(First = x, Second = y, Sports = q)

df

##    First Second       Sports
## 1     10     -4       Hockey
## 2      9     -3   Basketball
## 3      8     -2      Cricket
## 4      7     -1    Billiards
## 5      6      0        Chess
## 6      5      1 Table Tennis
## 7      4      2        Rugby
## 8      3      3   Water Polo
## 9      2      4  Lawn Tennis
## 10     1      5     Football

class(df$Sports)

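## [1] "factor"

Note: this output is from R versions before 4.0.0, where stringsAsFactors defaulted to TRUE. From R 4.0.0 onwards the default is FALSE, so class(df$Sports) returns "character" unless stringsAsFactors = TRUE is passed.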

df <- data.frame(First = x, Second = y, Sports = q, stringsAsFactors = FALSE)

class(df$Sports)

## [1] "character"

nrow(df)

## [1] 10

ncol(df)

## [1] 3

dim(df)

## [1] 10  3

NROW(x = df)

## [1] 10

NCOL(x = df)

## [1] 3

x

##  [1] 10  9  8  7  6  5  4  3  2  1

nrow(x)

## NULL

NROW(x)

## [1] 10

length(x)

## [1] 10

ncol(x)

## NULL

NCOL(x)

## [1] 1

names(df)

## [1] "First"  "Second" "Sports"

names(df)[3]

## [1] "Sports"

rownames(df)

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

rownames(df) <- letters[1:10]

df

##   First Second       Sports
## a    10     -4       Hockey
## b     9     -3   Basketball
## c     8     -2      Cricket
## d     7     -1    Billiards
## e     6      0        Chess
## f     5      1 Table Tennis
## g     4      2        Rugby
## h     3      3   Water Polo
## i     2      4  Lawn Tennis
## j     1      5     Football

rownames(df)

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"

rownames(df) <- NULL

rownames(df)

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

head(df)

##   First Second       Sports
## 1    10     -4       Hockey
## 2     9     -3   Basketball
## 3     8     -2      Cricket
## 4     7     -1    Billiards
## 5     6      0        Chess
## 6     5      1 Table Tennis

tail(df)

##    First Second       Sports
## 5      6      0        Chess
## 6      5      1 Table Tennis
## 7      4      2        Rugby
## 8      3      3   Water Polo
## 9      2      4  Lawn Tennis
## 10     1      5     Football

head(x = df, n = 7)

##   First Second       Sports
## 1    10     -4       Hockey
## 2     9     -3   Basketball
## 3     8     -2      Cricket
## 4     7     -1    Billiards
## 5     6      0        Chess
## 6     5      1 Table Tennis
## 7     4      2        Rugby

tail(x = df, n = 7)

##    First Second       Sports
## 4      7     -1    Billiards
## 5      6      0        Chess
## 6      5      1 Table Tennis
## 7      4      2        Rugby
## 8      3      3   Water Polo
## 9      2      4  Lawn Tennis
## 10     1      5     Football

class(df)

## [1] "data.frame"

df$Sports

##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"

df[3, 2]

## [1] -2

df[4, 3]

## [1] "Billiards"

df[3:5, 2:3]

##   Second    Sports
## 3     -2   Cricket
## 4     -1 Billiards
## 5      0     Chess

df[c(3, 6), 2]

## [1] -2  1

df[, c(1, 3)]

##    First       Sports
## 1     10       Hockey
## 2      9   Basketball
## 3      8      Cricket
## 4      7    Billiards
## 5      6        Chess
## 6      5 Table Tennis
## 7      4        Rugby
## 8      3   Water Polo
## 9      2  Lawn Tennis
## 10     1     Football

df[, 3]

##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"

class(df[, 3])

## [1] "character"

df[, 3, drop = FALSE]

##          Sports
## 1        Hockey
## 2    Basketball
## 3       Cricket
## 4     Billiards
## 5         Chess
## 6  Table Tennis
## 7         Rugby
## 8    Water Polo
## 9   Lawn Tennis
## 10     Football

class(df[, 3, drop = FALSE])

## [1] "data.frame"

df[2, ]

##   First Second     Sports
## 2     9     -3 Basketball

class(df[2, ])

## [1] "data.frame"

df[2:4, ]

##   First Second     Sports
## 2     9     -3 Basketball
## 3     8     -2    Cricket
## 4     7     -1  Billiards

df[, "Sports"]

##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"

df[, c("First", "Sports")]

##    First       Sports
## 1     10       Hockey
## 2      9   Basketball
## 3      8      Cricket
## 4      7    Billiards
## 5      6        Chess
## 6      5 Table Tennis
## 7      4        Rugby
## 8      3   Water Polo
## 9      2  Lawn Tennis
## 10     1     Football

df[, c("Sports", "First")]

##          Sports First
## 1        Hockey    10
## 2    Basketball     9
## 3       Cricket     8
## 4     Billiards     7
## 5         Chess     6
## 6  Table Tennis     5
## 7         Rugby     4
## 8    Water Polo     3
## 9   Lawn Tennis     2
## 10     Football     1

df[, "Sports", drop = FALSE]

##          Sports
## 1        Hockey
## 2    Basketball
## 3       Cricket
## 4     Billiards
## 5         Chess
## 6  Table Tennis
## 7         Rugby
## 8    Water Polo
## 9   Lawn Tennis
## 10     Football

df["Sports"]

##          Sports
## 1        Hockey
## 2    Basketball
## 3       Cricket
## 4     Billiards
## 5         Chess
## 6  Table Tennis
## 7         Rugby
## 8    Water Polo
## 9   Lawn Tennis
## 10     Football

df[["Sports"]]

##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"

df[c("First", "Sports")]

##    First       Sports
## 1     10       Hockey
## 2      9   Basketball
## 3      8      Cricket
## 4      7    Billiards
## 5      6        Chess
## 6      5 Table Tennis
## 7      4        Rugby
## 8      3   Water Polo
## 9      2  Lawn Tennis
## 10     1     Football

rm(list = ls())

2.2 Lists

list1 <- list(1, 2, 3)

list1

## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3

list2 <- list(c(1, 2, 3))

list2

## [[1]]
## [1] 1 2 3

list3 <- list(c(1, 2, 3), 3:7)

list3

## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] 3 4 5 6 7

df <- data.frame(First = 1:5, Second = 5:1, Sport = c("Hockey", "Cricket", "Football", "Rugby", "Badminton"), stringsAsFactors = FALSE)

df

##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton

list4 <- list(df, 1:10)

list4

## [[1]]
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10

list5 <- list(df, 1:10, list3)

list5

## [[1]]
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## [[3]]
## [[3]][[1]]
## [1] 1 2 3
## 
## [[3]][[2]]
## [1] 3 4 5 6 7

names(list5)

## NULL

names(list5) <- c("data.frame", "vector", "list")

names(list5)

## [1] "data.frame" "vector"     "list"

list5

## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7

list6 <- list(DataFrame = df, Vector = 1:10, List = list3)

names(list6)

## [1] "DataFrame" "Vector"    "List"

list6

## $DataFrame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $Vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $List
## $List[[1]]
## [1] 1 2 3
## 
## $List[[2]]
## [1] 3 4 5 6 7

empty.list <- vector(mode = "list", length = 4L)

empty.list

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL

empty.list[[1]] <- 5

empty.list

## [[1]]
## [1] 5
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL

list5[1]

## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton

list5[[1]]

##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton

list5[["data.frame"]]

##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton

list5$data.frame

##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton

list5[["data.frame"]]$First

## [1] 1 2 3 4 5

list5[[1]][2]

##   Second
## 1      5
## 2      4
## 3      3
## 4      2
## 5      1

list5[[1]][[2]]

## [1] 5 4 3 2 1

list5[[1]][, "Second", drop = FALSE]

##   Second
## 1      5
## 2      4
## 3      3
## 4      2
## 5      1

length(list5)

## [1] 3

NROW(list5)

## [1] 3

list5[[4]] <- 2 # memory and processor inefficient

list5

## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7
## 
## 
## [[4]]
## [1] 2

list5[["new.element"]] <- 3:7

list5

## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7
## 
## 
## [[4]]
## [1] 2
## 
## $new.element
## [1] 3 4 5 6 7

names(list5)

## [1] "data.frame"  "vector"      "list"        ""            "new.element"

list5[1:3]

## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7

rm(list = ls())

2.3 Matrices

A <- matrix(data = 1:10, nrow = 5, ncol = 2)

A

##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10

B <- matrix(data = 21:30, nrow = 5, ncol = 2)

B

##      [,1] [,2]
## [1,]   21   26
## [2,]   22   27
## [3,]   23   28
## [4,]   24   29
## [5,]   25   30

C <- matrix(data = 21:40, nrow = 2, ncol = 10)

C

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]   21   23   25   27   29   31   33   35   37    39
## [2,]   22   24   26   28   30   32   34   36   38    40

nrow(A)

## [1] 5

ncol(A)

## [1] 2

dim(A)

## [1] 5 2

A + B

##      [,1] [,2]
## [1,]   22   32
## [2,]   24   34
## [3,]   26   36
## [4,]   28   38
## [5,]   30   40

A * B

##      [,1] [,2]
## [1,]   21  156
## [2,]   44  189
## [3,]   69  224
## [4,]   96  261
## [5,]  125  300

A == B

##       [,1]  [,2]
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] FALSE FALSE
## [4,] FALSE FALSE
## [5,] FALSE FALSE

ncol(A)

## [1] 2

nrow(B)

## [1] 5

t(B)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]   21   22   23   24   25
## [2,]   26   27   28   29   30

A %*% t(B)

##      [,1] [,2] [,3] [,4] [,5]
## [1,]  177  184  191  198  205
## [2,]  224  233  242  251  260
## [3,]  271  282  293  304  315
## [4,]  318  331  344  357  370
## [5,]  365  380  395  410  425

A %*% C

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]  153  167  181  195  209  223  237  251  265   279
## [2,]  196  214  232  250  268  286  304  322  340   358
## [3,]  239  261  283  305  327  349  371  393  415   437
## [4,]  282  308  334  360  386  412  438  464  490   516
## [5,]  325  355  385  415  445  475  505  535  565   595

colnames(A)

## NULL

rownames(A)

## NULL

colnames(A) <- c("Left", "Right")

rownames(A) <- c("First", "Second", "Third", "Fourth", "Fifth")

A

##        Left Right
## First     1     6
## Second    2     7
## Third     3     8
## Fourth    4     9
## Fifth     5    10

colnames(B) <- c("First", "Second")

rownames(B) <- c("One", "Two", "Three", "Four", "Five")

B

##       First Second
## One      21     26
## Two      22     27
## Three    23     28
## Four     24     29
## Five     25     30

letters

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"

LETTERS

##  [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q"
## [18] "R" "S" "T" "U" "V" "W" "X" "Y" "Z"

colnames(C) <- LETTERS[1:10] 

rownames(C) <- c("Top", "Bottom")

C

##         A  B  C  D  E  F  G  H  I  J
## Top    21 23 25 27 29 31 33 35 37 39
## Bottom 22 24 26 28 30 32 34 36 38 40

A

##        Left Right
## First     1     6
## Second    2     7
## Third     3     8
## Fourth    4     9
## Fifth     5    10

t(A)

##       First Second Third Fourth Fifth
## Left      1      2     3      4     5
## Right     6      7     8      9    10

A %*% C

##          A   B   C   D   E   F   G   H   I   J
## First  153 167 181 195 209 223 237 251 265 279
## Second 196 214 232 250 268 286 304 322 340 358
## Third  239 261 283 305 327 349 371 393 415 437
## Fourth 282 308 334 360 386 412 438 464 490 516
## Fifth  325 355 385 415 445 475 505 535 565 595

2.4 Arrays

arr <- array(data = 1:12, dim = c(2, 3, 2))

arr

## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12

arr[1, , ]

##      [,1] [,2]
## [1,]    1    7
## [2,]    3    9
## [3,]    5   11

arr[, 2, ]

##      [,1] [,2]
## [1,]    3    9
## [2,]    4   10

arr[, , 2]

##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12

arr[1, , 1]

## [1] 1 3 5

arr[, , 1]

##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

rm(list = ls())

3 Reading Data

3.1 Reading CSV files

fileURL <- "http://www.jaredlander.com/data/Tomato%20First.csv"

tomato <- read.table(file = fileURL, header = TRUE, sep = ",")

head(tomato)

##   Round             Tomato Price      Source Sweet Acid Color Texture
## 1     1         Simpson SM  3.99 Whole Foods   2.8  2.8   3.7     3.4
## 2     1  Tuttorosso (blue)  2.99     Pioneer   3.3  2.8   3.4     3.0
## 3     1 Tuttorosso (green)  0.99     Pioneer   2.8  2.6   3.3     2.8
## 4     1     La Fede SM DOP  3.99   Shop Rite   2.6  2.8   3.0     2.3
## 5     2       Cento SM DOP  5.49  D Agostino   3.3  3.1   2.9     2.8
## 6     2      Cento Organic  4.99  D Agostino   3.2  2.9   2.9     3.1
##   Overall Avg.of.Totals Total.of.Avg
## 1     3.4          16.1         16.1
## 2     2.9          15.3         15.3
## 3     2.9          14.3         14.3
## 4     2.8          13.4         13.4
## 5     3.1          14.4         15.2
## 6     2.9          15.5         15.1

class(tomato)

## [1] "data.frame"

class(tomato$Tomato)

## [1] "factor"

tomato <- read.table(file = fileURL, header = TRUE, sep = ",", stringsAsFactors = FALSE)

head(tomato)

##   Round             Tomato Price      Source Sweet Acid Color Texture
## 1     1         Simpson SM  3.99 Whole Foods   2.8  2.8   3.7     3.4
## 2     1  Tuttorosso (blue)  2.99     Pioneer   3.3  2.8   3.4     3.0
## 3     1 Tuttorosso (green)  0.99     Pioneer   2.8  2.6   3.3     2.8
## 4     1     La Fede SM DOP  3.99   Shop Rite   2.6  2.8   3.0     2.3
## 5     2       Cento SM DOP  5.49  D Agostino   3.3  3.1   2.9     2.8
## 6     2      Cento Organic  4.99  D Agostino   3.2  2.9   2.9     3.1
##   Overall Avg.of.Totals Total.of.Avg
## 1     3.4          16.1         16.1
## 2     2.9          15.3         15.3
## 3     2.9          14.3         14.3
## 4     2.8          13.4         13.4
## 5     3.1          14.4         15.2
## 6     2.9          15.5         15.1

class(tomato$Tomato)

## [1] "character"

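Also see ?read.csv and ?read.csv2, which are wrappers around read.table with defaults suited to comma-separated and semicolon-separated files respectively.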

3.2 Reading from Databases

# library(RODBC)

# RShowDoc("RODBC", package = "RODBC")

Uncomment and run the commands above to read the RODBC vignette for more information.

3.3 Reading from Foreign Statistical Software

Read the documentation of the foreign package, which is available from CRAN.

Read about the following functions:

read.spss
read.dta
read.ssd
read.octave
read.mtp
read.systat

3.4 Working with Binary Data

fileURL <- "http://www.jaredlander.com/data/Tomato%20First.csv"

tomato <- read.csv(file = fileURL, stringsAsFactors = FALSE)

save(list = c("tomato"), file = "tomato.rdata")

rm(tomato)

load(file = "tomato.rdata")

head(tomato)

##   Round             Tomato Price      Source Sweet Acid Color Texture
## 1     1         Simpson SM  3.99 Whole Foods   2.8  2.8   3.7     3.4
## 2     1  Tuttorosso (blue)  2.99     Pioneer   3.3  2.8   3.4     3.0
## 3     1 Tuttorosso (green)  0.99     Pioneer   2.8  2.6   3.3     2.8
## 4     1     La Fede SM DOP  3.99   Shop Rite   2.6  2.8   3.0     2.3
## 5     2       Cento SM DOP  5.49  D Agostino   3.3  3.1   2.9     2.8
## 6     2      Cento Organic  4.99  D Agostino   3.2  2.9   2.9     3.1
##   Overall Avg.of.Totals Total.of.Avg
## 1     3.4          16.1         16.1
## 2     2.9          15.3         15.3
## 3     2.9          14.3         14.3
## 4     2.8          13.4         13.4
## 5     3.1          14.4         15.2
## 6     2.9          15.5         15.1

n <- 20

r <- 1:10

w <- data.frame(n, r) 

w

##     n  r
## 1  20  1
## 2  20  2
## 3  20  3
## 4  20  4
## 5  20  5
## 6  20  6
## 7  20  7
## 8  20  8
## 9  20  9
## 10 20 10

save(list = c("n", "r", "w"), file = "multiple.rdata")

rm(list = c("n", "r", "w"))

load(file = "multiple.rdata")

n

## [1] 20

r

##  [1]  1  2  3  4  5  6  7  8  9 10

w

##     n  r
## 1  20  1
## 2  20  2
## 3  20  3
## 4  20  4
## 5  20  5
## 6  20  6
## 7  20  7
## 8  20  8
## 9  20  9
## 10 20 10

rm(list = ls())

3.5 Working with data supplied with R

library(ggplot2)

data("diamonds")

head(diamonds)

##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48

data(list = c("tips"), package = "reshape2")

head(tips)

##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01 3.50   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4

data()

rm(list = ls())

3.6 Web Scraping

library(XML)

fileURL <- "http://www.w3schools.com/html/html_tables.asp"

myTable <- readHTMLTable(doc = fileURL, which = 1, header = TRUE, stringsAsFactors = FALSE)

myTable

##   Number First Name Last Name Points
## 1      1        Eve   Jackson     94
## 2      2       John       Doe     80
## 3      3       Adam   Johnson     67
## 4      4       Jill     Smith     50

rm(list = ls())

4 Statistical Graphs

4.1 `diamonds` dataset

library(ggplot2)

data("diamonds")

head(diamonds)

##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48

4.2 Base R Graphics

4.2.1 Histograms

hist(x = diamonds$carat)

hist(x = diamonds$carat, main = "Carat Histogram", xlab = "Carat")

4.2.2 Scatterplots

plot(x = diamonds$carat, y = diamonds$price)

plot(formula = price ~ carat, data = diamonds)

plot(formula = price ~ carat, data = diamonds, main = "Price vs Carat")

4.2.3 Boxplots

boxplot(x = diamonds$carat)

4.3 `ggplot2`

4.3.1 Histograms and Densities

library(ggplot2)

data(diamonds)

ggplot(data = diamonds) + geom_histogram(mapping = aes(x = diamonds$carat))

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

ggplot(data = diamonds) + geom_histogram(mapping = aes(x = diamonds$carat), binwidth = 0.5)

ggplot(data = diamonds) + geom_histogram(mapping = aes(x = diamonds$carat), binwidth = 0.1)

ggplot(data = diamonds) + geom_density(mapping = aes(x = carat))

ggplot(data = diamonds) + geom_density(mapping = aes(x = carat), fill = "grey50")

4.3.2 Scatterplots

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point()

g <- ggplot(data = diamonds, aes(x = carat, y = price))

g + geom_point()

g + geom_point(aes(color = color))

g + geom_point(aes(color = color, shape = clarity))

## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 8.
## Consider specifying shapes manually if you must have them.

## Warning: Removed 5445 rows containing missing values (geom_point).

## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 8.
## Consider specifying shapes manually if you must have them.

g + geom_point(aes(color = color, shape = cut))

4.3.3 Box Plots and Violin Plots

ggplot(data = diamonds, aes(y = carat, x = 1)) + geom_boxplot()

ggplot(data = diamonds, aes(y = carat, x = cut)) + geom_boxplot()

ggplot(data = diamonds, aes(y = carat, x = cut)) + geom_violin()

g <- ggplot(data = diamonds, aes(y = carat, x = cut))

g + geom_point() + geom_violin()

g + geom_violin() + geom_point()

g + geom_jitter()

g + geom_jitter() + geom_violin()

g + geom_jitter(aes(color = color)) + geom_violin()

rm(list = ls())

4.3.4 Line Plots

data("economics")

head(economics)

##         date   pce    pop psavert uempmed unemploy
## 1 1967-06-30 507.8 198712     9.8     4.5     2944
## 2 1967-07-31 510.9 198911     9.8     4.7     2945
## 3 1967-08-31 516.7 199113     9.0     4.6     2958
## 4 1967-09-30 513.3 199311     9.8     4.9     3143
## 5 1967-10-31 518.5 199498     9.7     4.7     3066
## 6 1967-11-30 526.2 199657     9.4     4.8     3018

ggplot(data = economics, mapping = aes(x = date, y = pop)) + geom_line()

library(lubridate)

# Derive numeric year and month columns from the date column.
economics[["year"]] <- year(economics[["date"]])

economics[["month"]] <- month(economics[["date"]])

head(economics)

##         date   pce    pop psavert uempmed unemploy year month
## 1 1967-06-30 507.8 198712     9.8     4.5     2944 1967     6
## 2 1967-07-31 510.9 198911     9.8     4.7     2945 1967     7
## 3 1967-08-31 516.7 199113     9.0     4.6     2958 1967     8
## 4 1967-09-30 513.3 199311     9.8     4.9     3143 1967     9
## 5 1967-10-31 518.5 199498     9.7     4.7     3066 1967    10
## 6 1967-11-30 526.2 199657     9.4     4.8     3018 1967    11

# Keep only the rows from the year 2000 onwards.
econ2000 <- economics[which(economics[["year"]] >= 2000), ]

# Row counts before and after filtering.
nrow(economics)

## [1] 478

nrow(econ2000)

## [1] 87

head(econ2000)

##           date    pce    pop psavert uempmed unemploy year month
## 392 2000-01-31 6618.5 281190     2.4     6.1     5858 2000     1
## 393 2000-02-29 6685.3 281409     2.0     6.0     5733 2000     2
## 394 2000-03-31 6664.2 281653     2.4     6.1     5481 2000     3
## 395 2000-04-30 6688.0 281891     2.4     5.8     5758 2000     4
## 396 2000-05-31 6712.1 282156     2.5     5.7     5651 2000     5
## 397 2000-06-30 6745.8 282430     2.9     6.0     5747 2000     6

# Overwrite the numeric month with its label (Jan, Feb, ...).
econ2000[["month"]] <- month(econ2000[["date"]], label = TRUE)

head(econ2000)

##           date    pce    pop psavert uempmed unemploy year month
## 392 2000-01-31 6618.5 281190     2.4     6.1     5858 2000   Jan
## 393 2000-02-29 6685.3 281409     2.0     6.0     5733 2000   Feb
## 394 2000-03-31 6664.2 281653     2.4     6.1     5481 2000   Mar
## 395 2000-04-30 6688.0 281891     2.4     5.8     5758 2000   Apr
## 396 2000-05-31 6712.1 282156     2.5     5.7     5651 2000   May
## 397 2000-06-30 6745.8 282430     2.9     6.0     5747 2000   Jun

# scales provides the `comma` label formatter used below.
library(scales)

# One line per year: month on the x-axis, population on the y-axis.
# `group = year` tells geom_line() which points belong to the same line.
g <- ggplot(econ2000, aes(x = month, y = pop)) +
  geom_line(aes(color = factor(year), group = year))

g

# Give the colour legend a readable title.
g <- g + scale_color_discrete(name = "Year")

g

# Print y-axis values with thousands separators.
g <- g + scale_y_continuous(labels = comma)

g

# Plot title and axis labels.
g <- g +
  labs(title = "Population Growth", x = "Month", y = "Population")

g

# Rotate the x-axis tick labels by 90 degrees.
g <- g +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

g

4.3.5 Faceting

# Base plot shared by the two faceted scatterplots below.
g <- ggplot(diamonds, aes(x = carat, y = price))

# One panel per diamond colour.
g +
  geom_point(aes(color = color)) +
  facet_wrap(~color)

# Grid of panels: rows by cut, columns by clarity.
g +
  geom_point(aes(color = color)) +
  facet_grid(cut ~ clarity)

# Histogram of carat, one panel per colour.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram() +
  facet_wrap(~color)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

4.3.6 Colors and Shapes

# Map three extra variables onto point shape, size and colour at once.
ggplot(
  diamonds,
  aes(x = carat, y = price, shape = cut, size = depth, color = color)
) +
  geom_point()

4.3.7 Themes

# ggthemes supplies the extra themes and colour scales used below.
library(ggthemes)

# Base scatterplot that each theme is applied to.
g <- ggplot(diamonds, aes(x = carat, y = price, color = color)) +
  geom_point()

g + theme_wsj()

g +
  theme_economist() +
  scale_color_economist()

g + theme_tufte()

g +
  theme_excel() +
  scale_color_excel()

# Remove every object from the global environment.
rm(list = ls())

4.4 `ggplot2`: Digging Deeper

4.4.1 `qplot` function

library(ggplot2)

# Load the `mpg` data set and inspect its structure.
data(mpg)

str(mpg)

## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: Factor w/ 15 levels "audi","chevrolet",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ model       : Factor w/ 38 levels "4runner 4wd",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : Factor w/ 10 levels "auto(av)","auto(l3)",..: 4 9 10 1 4 9 1 9 4 10 ...
##  $ drv         : Factor w/ 3 levels "4","f","r": 2 2 2 2 2 2 2 1 1 1 ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : Factor w/ 5 levels "c","d","e","p",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ class       : Factor w/ 7 levels "2seater","compact",..: 2 2 2 2 2 2 2 2 2 2 ...

# Scatterplot of highway mileage against engine displacement.
qplot(displ, hwy, data = mpg)

# Same scatterplot, coloured by drive type.
qplot(displ, hwy, data = mpg, color = drv)

# Points plus a smoother.
qplot(displ, hwy, data = mpg, geom = c("point", "smooth"))

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# Points plus one smoother per drive type.
qplot(displ, hwy, data = mpg, color = drv, geom = c("point", "smooth"))

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

# With only x supplied, qplot() draws a histogram.
qplot(hwy, data = mpg)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Histogram with bars filled by drive type.
qplot(hwy, data = mpg, fill = drv)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Facets: one column of panels per drive type.
qplot(displ, hwy, data = mpg, facets = . ~ drv)

# Facets: one row of panels per drive type, with an explicit bin width.
qplot(hwy, data = mpg, facets = drv ~ ., binwidth = 2)

# The data file can be made available upon request
# Loading it creates the `maacs` data frame used in the rest of this section.
load("maacs.Rda")

head(maacs)

##   id eno duBedMusM   pm25 mopos
## 1  1 141      2423 15.560   yes
## 2  2 124      2793 34.370   yes
## 3  3 126      3055 38.953   yes
## 4  4 164       775 33.249   yes
## 5  5  99      1634 27.060   yes
## 6  6  68       939 18.890   yes

str(maacs)

## 'data.frame':    750 obs. of  5 variables:
##  $ id       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ eno      : num  141 124 126 164 99 68 41 50 12 30 ...
##  $ duBedMusM: num  2423 2793 3055 775 1634 ...
##  $ pm25     : num  15.6 34.4 39 33.2 27.1 ...
##  $ mopos    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...

# Histogram of log(eno).
qplot(log(eno), data = maacs)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Histogram of log(eno), bars filled by mopos.
qplot(log(eno), data = maacs, fill = mopos)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Density curve of log(eno).
qplot(log(eno), data = maacs, geom = "density")

## Warning: Removed 108 rows containing non-finite values (stat_density).

# One density curve per mopos group.
qplot(log(eno), data = maacs, geom = "density", color = mopos)

## Warning: Removed 49 rows containing non-finite values (stat_density).

## Warning: Removed 59 rows containing non-finite values (stat_density).

# Scatterplot of log(eno) against log(pm25).
qplot(log(pm25), log(eno), data = maacs)

## Warning: Removed 184 rows containing missing values (geom_point).

# Distinguish the mopos groups by point shape ...
qplot(log(pm25), log(eno), data = maacs, shape = mopos)

## Warning: Removed 184 rows containing missing values (geom_point).

# ... or by point colour ...
qplot(log(pm25), log(eno), data = maacs, color = mopos)

## Warning: Removed 184 rows containing missing values (geom_point).

# ... or by putting each group in its own panel.
qplot(log(pm25), log(eno), data = maacs, facets = . ~ mopos)

## Warning: Removed 86 rows containing missing values (geom_point).

## Warning: Removed 98 rows containing missing values (geom_point).

# Add a linear-model smoother per colour group.
qplot(
  log(pm25), log(eno),
  data = maacs, color = mopos,
  geom = c("point", "smooth"), method = "lm"
)

## Warning: Removed 86 rows containing missing values (stat_smooth).

## Warning: Removed 98 rows containing missing values (stat_smooth).

## Warning: Removed 184 rows containing missing values (geom_point).

# Add a linear-model smoother per panel.
qplot(
  log(pm25), log(eno),
  data = maacs, facets = . ~ mopos,
  geom = c("point", "smooth"), method = "lm"
)

## Warning: Removed 86 rows containing missing values (stat_smooth).

## Warning: Removed 98 rows containing missing values (stat_smooth).

## Warning: Removed 86 rows containing missing values (geom_point).

## Warning: Removed 98 rows containing missing values (geom_point).

4.4.2 `ggplot` function

# Faceted scatterplot with a linear fit per panel, built with qplot().
qplot(
  log(pm25), eno,
  data = maacs, facets = . ~ mopos,
  geom = c("point", "smooth"), method = "lm"
)

## Warning: Removed 86 rows containing missing values (stat_smooth).

## Warning: Removed 98 rows containing missing values (stat_smooth).

## Warning: Removed 86 rows containing missing values (geom_point).

## Warning: Removed 98 rows containing missing values (geom_point).

# With ggplot(), start from data + aesthetic mapping only; layers are added
# afterwards with `+`.
g <- ggplot(maacs, aes(x = log(pm25), y = eno))

summary(g)

## data: id, eno, duBedMusM, pm25, mopos [750x5]
## mapping:  x = log(pm25), y = eno
## faceting: facet_null()

# Point layer.
g + geom_point()

## Warning: Removed 184 rows containing missing values (geom_point).

# Points plus the default smoother.
g +
  geom_point() +
  geom_smooth()

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

## Warning: Removed 184 rows containing missing values (stat_smooth).

## Warning: Removed 184 rows containing missing values (geom_point).

# Points plus a linear-model fit.
g +
  geom_point() +
  geom_smooth(method = "lm")

## Warning: Removed 184 rows containing missing values (stat_smooth).

## Warning: Removed 184 rows containing missing values (geom_point).

# Same, split into one panel per mopos level.
g +
  geom_point() +
  facet_grid(facets = . ~ mopos) +
  geom_smooth(method = "lm")

## Warning: Removed 86 rows containing missing values (stat_smooth).

## Warning: Removed 98 rows containing missing values (stat_smooth).

## Warning: Removed 86 rows containing missing values (geom_point).

## Warning: Removed 98 rows containing missing values (geom_point).

# Constant aesthetics are set outside aes() ...
g + geom_point(color = "steelblue", size = 4, alpha = 0.5)

## Warning: Removed 184 rows containing missing values (geom_point).

# ... while data-driven aesthetics are mapped inside aes().
g + geom_point(aes(color = mopos), size = 4, alpha = 0.5)

## Warning: Removed 184 rows containing missing values (geom_point).

# Add axis labels and a title.
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS")

## Warning: Removed 184 rows containing missing values (geom_point).

# Add a thick dotted linear fit (with its confidence band).
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS") +
  geom_smooth(size = 4, linetype = 3, method = "lm")

## Warning: Removed 184 rows containing missing values (stat_smooth).

## Warning: Removed 184 rows containing missing values (geom_point).

# Same fit without the confidence band.
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS") +
  geom_smooth(size = 4, linetype = 3, method = "lm", se = FALSE)

## Warning: Removed 184 rows containing missing values (stat_smooth).

## Warning: Removed 184 rows containing missing values (geom_point).

# NOTE(review): this call is identical to the previous one - confirm whether
# an additional layer or theme was meant to be added here.
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS") +
  geom_smooth(size = 4, linetype = 3, method = "lm", se = FALSE)

## Warning: Removed 184 rows containing missing values (stat_smooth).

## Warning: Removed 184 rows containing missing values (geom_point).

# Simulated series: 100 standard-normal draws indexed by x.
# No seed is set, so the values printed below differ from run to run.
testData <- data.frame(x = 1:100, y = rnorm(100))

head(testData)

##   x           y
## 1 1  1.03867599
## 2 2 -0.82879997
## 3 3 -0.74745592
## 4 4 -0.96679162
## 5 5  0.08761936
## 6 6 -0.96893042

# Setting Outlier
testData[50, "y"] <- 100

# Base graphics: restrict the y-axis to [-3, 3].
plot(testData$x, testData$y, type = "l", ylim = c(-3, 3))

g <- ggplot(testData, aes(x = x, y = y))

# Default limits: the axis stretches to fit the outlier.
g + geom_line()

# Outlier Missing
# ylim() sets scale limits; values outside them are dropped from the plot.
g +
  geom_line() +
  ylim(-3, 3)

# Outlier Included
# coord_cartesian() zooms the view without discarding any data.
g +
  geom_line() +
  coord_cartesian(ylim = c(-3, 3))

# Quartile boundaries of duBedMusM, ignoring missing values.
# quantile() takes its probabilities through `probs`; an argument under any
# other name (such as `breaks`) is absorbed by `...` and silently ignored.
# Five evenly spaced probabilities give the 0%, 25%, 50%, 75% and 100% points.
cutpoints <- quantile(
  x = maacs$duBedMusM,
  probs = seq(0, 1, length.out = 5),
  na.rm = TRUE
)

cutpoints

##        0%       25%       50%       75%      100% 
##      0.01    308.00   1151.00   3881.00 124919.00

# Bin duBedMusM into the four intervals delimited by the cutpoints.
# NOTE(review): cut() intervals are left-open by default, so an observation
# equal to the lowest cutpoint (the minimum) is assigned NA. Passing
# include.lowest = TRUE would keep it, at the cost of changing the first level
# label shown below - confirm which behaviour is intended.
maacs$newCol <- cut(x = maacs$duBedMusM, breaks = cutpoints)

levels(maacs$newCol)

## [1] "(0.01,308]"          "(308,1.15e+03]"      "(1.15e+03,3.88e+03]"
## [4] "(3.88e+03,1.25e+05]"

library(ggthemes)

g <- ggplot(maacs, aes(x = log(pm25), y = eno))

# One panel per (newCol, mopos) combination, each with semi-transparent
# points and a linear fit without confidence band.
g +
  geom_point(alpha = 1/3) +
  facet_wrap(facets = newCol ~ mopos) +
  geom_smooth(method = "lm", se = FALSE, col = "steelblue") +
  theme_bw(base_size = 10) +
  labs(x = expression("log " * PM[2.5]), title = "MAACS")

## Warning: Removed 8 rows containing missing values (stat_smooth).

## Warning: Removed 8 rows containing missing values (stat_smooth).

## Warning: Removed 9 rows containing missing values (stat_smooth).

## Warning: Removed 10 rows containing missing values (stat_smooth).

## Warning: Removed 15 rows containing missing values (stat_smooth).

## Warning: Removed 6 rows containing missing values (stat_smooth).

## Warning: Removed 14 rows containing missing values (stat_smooth).

## Warning: Removed 7 rows containing missing values (stat_smooth).

## Warning: Removed 40 rows containing missing values (stat_smooth).

## Warning: Removed 67 rows containing missing values (stat_smooth).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 9 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 6 rows containing missing values (geom_point).

## Warning: Removed 14 rows containing missing values (geom_point).

## Warning: Removed 7 rows containing missing values (geom_point).

## Warning: Removed 40 rows containing missing values (geom_point).

## Warning: Removed 67 rows containing missing values (geom_point).

# Remove every object from the global environment.
rm(list = ls())

R Programming: Basics to Advanced (Part 1), First Edition

Naimish Agarwal

1 Getting Started

1.1 Installing and Loading Packages

1.2 Simple Calculations

1.3 Variables

1.4 Data Types

1.5 Vectors

1.6 Function Calls

2 Data Structures

2.1 Data Frames

2.2 Lists

2.3 Matrices

2.4 Arrays

3 Reading Data

3.1 Reading CSV files

3.2 Reading from Databases

3.3 Reading from Foreign Statistical Softwares

3.4 Working with Binary Data

3.5 Working with data supplied with R

3.6 Web Scraping

4 Statistical Graphs

4.1 `diamonds` dataset

4.2 Base R Graphics

4.2.1 Histograms

4.2.2 Scatterplots

4.2.3 Boxplots

4.3 `ggplot2`

4.3.1 Histograms and Densities

4.3.2 Scatterplots

4.3.3 Box Plots and Violin Plots

4.3.4 Line Plots

4.3.5 Faceting

4.3.6 Colors and Shapes

4.3.7 Themes

4.4 `ggplot2`: Digging Deeper

4.4.1 `qplot` function

4.4.2 `ggplot` function

R Programming: Basics to Advanced (Part 1), First Edition

Naimish Agarwal

1 Getting Started

1.1 Installing and Loading Packages

1.2 Simple Calculations

1.3 Variables

1.4 Data Types

1.5 Vectors

1.6 Function Calls

2 Data Structures

2.1 Data Frames

2.2 Lists

2.3 Matrices

2.4 Arrays

3 Reading Data

3.1 Reading CSV files

3.2 Reading from Databases

3.3 Reading from Foreign Statistical Softwares

3.4 Working with Binary Data

3.5 Working with data supplied with R

3.6 Web Scraping

4 Statistical Graphs

4.1 diamonds dataset

4.2 Base R Graphics

4.2.1 Histograms

4.2.2 Scatterplots

4.2.3 Boxplots

4.3 ggplot2

4.3.1 Histograms and Densities

4.3.2 Scatterplots

4.3.3 Box Plots and Violin Plots

4.3.4 Line Plots

4.3.5 Faceting

4.3.6 Colors and Shapes

4.3.7 Themes

4.4 ggplot2: Digging Deeper

4.4.1 qplot function

4.4.2 ggplot function

4.1 `diamonds` dataset

4.3 `ggplot2`

4.4 `ggplot2`: Digging Deeper

4.4.1 `qplot` function

4.4.2 `ggplot` function