This tutorial provides hands-on practice in R programming. It assumes basic familiarity with programming in at least one other language. Very little theory is covered; if you have questions about a function, run ?functionName to open its help page.

1 Getting Started

1.1 Installing and Loading Packages

# install.packages("ggplot2")
# install.packages("coefplot")

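I’ve commented out the install commands because I already have these packages installed; uncomment and run them if you don’t. The useful package loaded below has to be installed the same way.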

library(ggplot2)

require(coefplot)
## Loading required package: coefplot
require(useful)
## Loading required package: useful
ggplot
## function (data = NULL, ...) 
## UseMethod("ggplot")
## <environment: namespace:ggplot2>

1.2 Simple Calculations

1 + 1
## [1] 2
2 * 3
## [1] 6
3 / 4
## [1] 0.75
3 %% 2
## [1] 1
5 - 3
## [1] 2
8 / (3 + 4)
## [1] 1.142857
2 ** 3
## [1] 8
1 + 2i
## [1] 1+2i
(1  + 2i) - (3 + 4i) 
## [1] -2-2i

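R follows the standard PEMDAS order of operations (parentheses, exponents, multiplication and division, addition and subtraction). Note that ** is accepted as a synonym for the exponent operator ^.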

1.3 Variables

x <- 2

x + 1
## [1] 3
y <- 3

x * y
## [1] 6
a <- b <- 5

a
## [1] 5
b
## [1] 5
assign(x = "var", value = 10)

var
## [1] 10
rm(a)

rm(list = ls())

Using = for assignment is not recommended in the R community, although it works equally well in most situations.

a = 23

R is case-sensitive: hero and Hero are different names.

hero <- 1

# Hero

# Error: object 'Hero' not found

rm(list = ls())

1.4 Data Types

R is dynamically-typed.

x <- 2

class(x)
## [1] "numeric"
is.numeric(x)
## [1] TRUE
i <- 5L

class(i)
## [1] "integer"
is.integer(i)
## [1] TRUE
is.numeric(i)
## [1] TRUE
class(4L)
## [1] "integer"
4L * 2.8
## [1] 11.2
class(4L * 2.8)
## [1] "numeric"
5L / 2L
## [1] 2.5
class(5L / 2L)
## [1] "numeric"
x <- "data"

x
## [1] "data"
class(x)
## [1] "character"
y <- factor("data")

y
## [1] data
## Levels: data
class(y)
## [1] "factor"
nchar(x)
## [1] 4
nchar("hello")
## [1] 5
nchar(3)
## [1] 1
nchar(452)
## [1] 3

Note: nchar() does not work on factors. Calling nchar(y) raises an error, so convert the factor with as.character() first.

date1 <- as.Date("2012-06-23")

date1
## [1] "2012-06-23"
class(date1)
## [1] "Date"
as.numeric(date1)
## [1] 15514
date2 <- as.POSIXct(x = "2012-06-23 17:32")

date2
## [1] "2012-06-23 17:32:00 IST"
class(date2)
## [1] "POSIXct" "POSIXt"
as.numeric(date2)
## [1] 1340452920
TRUE
## [1] TRUE
FALSE
## [1] FALSE
TRUE * 5
## [1] 5
FALSE * 5
## [1] 0
as.numeric(TRUE)
## [1] 1
as.numeric(FALSE)
## [1] 0
class(TRUE)
## [1] "logical"
is.logical(TRUE)
## [1] TRUE
T
## [1] TRUE
F
## [1] FALSE
T <- 1

T <- 100

T
## [1] 100
class(T)
## [1] "numeric"
T <- TRUE

T
## [1] TRUE
2 == 2
## [1] TRUE
2 == 3
## [1] FALSE
2 != 3
## [1] TRUE
2 < 3
## [1] TRUE
2 > 3
## [1] FALSE
2 <= 3
## [1] TRUE
2 >= 3
## [1] FALSE
"data" == "data"
## [1] TRUE
"data" == "Data"
## [1] FALSE
"data" < "Data"
## [1] TRUE
rm(list = ls())

1.5 Vectors

x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

x * 3
##  [1]  3  6  9 12 15 18 21 24 27 30
x + 2
##  [1]  3  4  5  6  7  8  9 10 11 12
x - 4
##  [1] -3 -2 -1  0  1  2  3  4  5  6
x / 8
##  [1] 0.125 0.250 0.375 0.500 0.625 0.750 0.875 1.000 1.125 1.250
x ** 6
##  [1]       1      64     729    4096   15625   46656  117649  262144
##  [9]  531441 1000000
sqrt(x)
##  [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751
##  [8] 2.828427 3.000000 3.162278
1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
10:1
##  [1] 10  9  8  7  6  5  4  3  2  1
-2:5
## [1] -2 -1  0  1  2  3  4  5
5:-6
##  [1]  5  4  3  2  1  0 -1 -2 -3 -4 -5 -6
x <- 1:10

y <- -5:4

x + y
##  [1] -4 -2  0  2  4  6  8 10 12 14
x - y
##  [1] 6 6 6 6 6 6 6 6 6 6
x * y
##  [1] -5 -8 -9 -8 -5  0  7 16 27 40
x / y
##  [1] -0.2 -0.5 -1.0 -2.0 -5.0  Inf  7.0  4.0  3.0  2.5
x ** y
##  [1] 1.000000e+00 6.250000e-02 3.703704e-02 6.250000e-02 2.000000e-01
##  [6] 1.000000e+00 7.000000e+00 6.400000e+01 7.290000e+02 1.000000e+04
x ^ y
##  [1] 1.000000e+00 6.250000e-02 3.703704e-02 6.250000e-02 2.000000e-01
##  [6] 1.000000e+00 7.000000e+00 6.400000e+01 7.290000e+02 1.000000e+04
length(x)
## [1] 10
length(y)
## [1] 10
length(x + y)
## [1] 10
x + c(1, 3)
##  [1]  2  5  4  7  6  9  8 11 10 13
x + c(1, 3, 5)
## Warning in x + c(1, 3, 5): longer object length is not a multiple of
## shorter object length
##  [1]  2  5  8  5  8 11  8 11 14 11
x <= 4
##  [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
x[x <= 4]
## [1] 1 2 3 4
x > y
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
y > x
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
x <- 10:1

y <- -4:5

x < y
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
any(x < y)
## [1] TRUE
all(x < y)
## [1] FALSE
sports <- c("Hockey", "Basketball", "Football", "Cricket", "Badminton", "Table Tennis", "Rugby", "Baseball", "Lawn Tennis", "Soccer")

sports
##  [1] "Hockey"       "Basketball"   "Football"     "Cricket"     
##  [5] "Badminton"    "Table Tennis" "Rugby"        "Baseball"    
##  [9] "Lawn Tennis"  "Soccer"
nchar(sports)
##  [1]  6 10  8  7  9 12  5  8 11  6
number <- 7

number
## [1] 7
x
##  [1] 10  9  8  7  6  5  4  3  2  1
x[1]
## [1] 10
x[c(1, 2)]
## [1] 10  9
x[1:2]
## [1] 10  9
x[c(1, 3, 5, 9)]
## [1] 10  8  6  2
a <- c(One = "a", Two = "y", Three = "r")

names(a)
## [1] "One"   "Two"   "Three"
w <- 1:3

names(w)
## NULL
names(w) <- c("One", "Two", "Three")

w
##   One   Two Three 
##     1     2     3
names(w)
## [1] "One"   "Two"   "Three"
sports2 <- c(sports, "Hockey", "Badminton", "Cricket", "Football", "Hockey", "Water Polo")

sports2
##  [1] "Hockey"       "Basketball"   "Football"     "Cricket"     
##  [5] "Badminton"    "Table Tennis" "Rugby"        "Baseball"    
##  [9] "Lawn Tennis"  "Soccer"       "Hockey"       "Badminton"   
## [13] "Cricket"      "Football"     "Hockey"       "Water Polo"
sports2.factor <- factor(sports2)

sports2.factor
##  [1] Hockey       Basketball   Football     Cricket      Badminton   
##  [6] Table Tennis Rugby        Baseball     Lawn Tennis  Soccer      
## [11] Hockey       Badminton    Cricket      Football     Hockey      
## [16] Water Polo  
## 11 Levels: Badminton Baseball Basketball Cricket Football ... Water Polo
table(sports2.factor)
## sports2.factor
##    Badminton     Baseball   Basketball      Cricket     Football 
##            2            1            1            2            2 
##       Hockey  Lawn Tennis        Rugby       Soccer Table Tennis 
##            3            1            1            1            1 
##   Water Polo 
##            1
class(sports2.factor)
## [1] "factor"
as.numeric(sports2.factor)
##  [1]  6  3  5  4  1 10  8  2  7  9  6  1  4  5  6 11
rm(list = ls())

z <- c(1, 2, 3, NA, 8, 9, 10, NA)

z
## [1]  1  2  3 NA  8  9 10 NA
is.na(z)
## [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
z[!is.na(z)]
## [1]  1  2  3  8  9 10
z.character <- c("Hockey", NA, "Cricket")

is.na(z.character)
## [1] FALSE  TRUE FALSE
z <- c(1, NULL, 3)

z
## [1] 1 3
d <- NULL

is.null(d)
## [1] TRUE
is.null(z)
## [1] FALSE
rm(list = ls())

1.6 Function Calls

x <- 1:10

mean(x)
## [1] 5.5
sum(x)
## [1] 55
nchar(x)
##  [1] 1 1 1 1 1 1 1 1 1 2
x <- 1:1000

x[200:300] <- NA

mean(x)
## [1] NA
mean(x = x, na.rm = TRUE)
## [1] 528.6429
mean(x = x, trim = 0.1 , na.rm = TRUE)
## [1] 535.5908

2 Data Structures

2.1 Data Frames

x <- 10:1

y <- -4:5

q <- c("Hockey", "Basketball", "Cricket", "Billiards", "Chess", "Table Tennis", "Rugby", "Water Polo", "Lawn Tennis", "Football")

df <- data.frame(x, y, q)

df
##     x  y            q
## 1  10 -4       Hockey
## 2   9 -3   Basketball
## 3   8 -2      Cricket
## 4   7 -1    Billiards
## 5   6  0        Chess
## 6   5  1 Table Tennis
## 7   4  2        Rugby
## 8   3  3   Water Polo
## 9   2  4  Lawn Tennis
## 10  1  5     Football
df <- data.frame(First = x, Second = y, Sports = q)

df
##    First Second       Sports
## 1     10     -4       Hockey
## 2      9     -3   Basketball
## 3      8     -2      Cricket
## 4      7     -1    Billiards
## 5      6      0        Chess
## 6      5      1 Table Tennis
## 7      4      2        Rugby
## 8      3      3   Water Polo
## 9      2      4  Lawn Tennis
## 10     1      5     Football
class(df$Sports)
## [1] "factor"
df <- data.frame(First = x, Second = y, Sports = q, stringsAsFactors = FALSE)

class(df$Sports)
## [1] "character"
nrow(df)
## [1] 10
ncol(df)
## [1] 3
dim(df)
## [1] 10  3
NROW(x = df)
## [1] 10
NCOL(x = df)
## [1] 3
x
##  [1] 10  9  8  7  6  5  4  3  2  1
nrow(x)
## NULL
NROW(x)
## [1] 10
length(x)
## [1] 10
ncol(x)
## NULL
NCOL(x)
## [1] 1
names(df)
## [1] "First"  "Second" "Sports"
names(df)[3]
## [1] "Sports"
rownames(df)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
rownames(df) <- letters[1:10]

df
##   First Second       Sports
## a    10     -4       Hockey
## b     9     -3   Basketball
## c     8     -2      Cricket
## d     7     -1    Billiards
## e     6      0        Chess
## f     5      1 Table Tennis
## g     4      2        Rugby
## h     3      3   Water Polo
## i     2      4  Lawn Tennis
## j     1      5     Football
rownames(df)
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
rownames(df) <- NULL

rownames(df)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
head(df)
##   First Second       Sports
## 1    10     -4       Hockey
## 2     9     -3   Basketball
## 3     8     -2      Cricket
## 4     7     -1    Billiards
## 5     6      0        Chess
## 6     5      1 Table Tennis
tail(df)
##    First Second       Sports
## 5      6      0        Chess
## 6      5      1 Table Tennis
## 7      4      2        Rugby
## 8      3      3   Water Polo
## 9      2      4  Lawn Tennis
## 10     1      5     Football
head(x = df, n = 7)
##   First Second       Sports
## 1    10     -4       Hockey
## 2     9     -3   Basketball
## 3     8     -2      Cricket
## 4     7     -1    Billiards
## 5     6      0        Chess
## 6     5      1 Table Tennis
## 7     4      2        Rugby
tail(x = df, n = 7)
##    First Second       Sports
## 4      7     -1    Billiards
## 5      6      0        Chess
## 6      5      1 Table Tennis
## 7      4      2        Rugby
## 8      3      3   Water Polo
## 9      2      4  Lawn Tennis
## 10     1      5     Football
class(df)
## [1] "data.frame"
df$Sports
##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"
df[3, 2]
## [1] -2
df[4, 3]
## [1] "Billiards"
df[3:5, 2:3]
##   Second    Sports
## 3     -2   Cricket
## 4     -1 Billiards
## 5      0     Chess
df[c(3, 6), 2]
## [1] -2  1
df[, c(1, 3)]
##    First       Sports
## 1     10       Hockey
## 2      9   Basketball
## 3      8      Cricket
## 4      7    Billiards
## 5      6        Chess
## 6      5 Table Tennis
## 7      4        Rugby
## 8      3   Water Polo
## 9      2  Lawn Tennis
## 10     1     Football
df[, 3]
##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"
class(df[, 3])
## [1] "character"
df[, 3, drop = FALSE]
##          Sports
## 1        Hockey
## 2    Basketball
## 3       Cricket
## 4     Billiards
## 5         Chess
## 6  Table Tennis
## 7         Rugby
## 8    Water Polo
## 9   Lawn Tennis
## 10     Football
class(df[, 3, drop = FALSE])
## [1] "data.frame"
df[2, ]
##   First Second     Sports
## 2     9     -3 Basketball
class(df[2, ])
## [1] "data.frame"
df[2:4, ]
##   First Second     Sports
## 2     9     -3 Basketball
## 3     8     -2    Cricket
## 4     7     -1  Billiards
df[, "Sports"]
##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"
df[, c("First", "Sports")]
##    First       Sports
## 1     10       Hockey
## 2      9   Basketball
## 3      8      Cricket
## 4      7    Billiards
## 5      6        Chess
## 6      5 Table Tennis
## 7      4        Rugby
## 8      3   Water Polo
## 9      2  Lawn Tennis
## 10     1     Football
df[, c("Sports", "First")]
##          Sports First
## 1        Hockey    10
## 2    Basketball     9
## 3       Cricket     8
## 4     Billiards     7
## 5         Chess     6
## 6  Table Tennis     5
## 7         Rugby     4
## 8    Water Polo     3
## 9   Lawn Tennis     2
## 10     Football     1
df[, "Sports", drop = FALSE]
##          Sports
## 1        Hockey
## 2    Basketball
## 3       Cricket
## 4     Billiards
## 5         Chess
## 6  Table Tennis
## 7         Rugby
## 8    Water Polo
## 9   Lawn Tennis
## 10     Football
df["Sports"]
##          Sports
## 1        Hockey
## 2    Basketball
## 3       Cricket
## 4     Billiards
## 5         Chess
## 6  Table Tennis
## 7         Rugby
## 8    Water Polo
## 9   Lawn Tennis
## 10     Football
df[["Sports"]]
##  [1] "Hockey"       "Basketball"   "Cricket"      "Billiards"   
##  [5] "Chess"        "Table Tennis" "Rugby"        "Water Polo"  
##  [9] "Lawn Tennis"  "Football"
df[c("First", "Sports")]
##    First       Sports
## 1     10       Hockey
## 2      9   Basketball
## 3      8      Cricket
## 4      7    Billiards
## 5      6        Chess
## 6      5 Table Tennis
## 7      4        Rugby
## 8      3   Water Polo
## 9      2  Lawn Tennis
## 10     1     Football
rm(list = ls())

2.2 Lists

list1 <- list(1, 2, 3)

list1
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3
list2 <- list(c(1, 2, 3))

list2
## [[1]]
## [1] 1 2 3
list3 <- list(c(1, 2, 3), 3:7)

list3
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] 3 4 5 6 7
df <- data.frame(First = 1:5, Second = 5:1, Sport = c("Hockey", "Cricket", "Football", "Rugby", "Badminton"), stringsAsFactors = FALSE)

df
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
list4 <- list(df, 1:10)

list4
## [[1]]
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10
list5 <- list(df, 1:10, list3)

list5
## [[1]]
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## [[3]]
## [[3]][[1]]
## [1] 1 2 3
## 
## [[3]][[2]]
## [1] 3 4 5 6 7
names(list5)
## NULL
names(list5) <- c("data.frame", "vector", "list")

names(list5)
## [1] "data.frame" "vector"     "list"
list5
## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7
list6 <- list(DataFrame = df, Vector = 1:10, List = list3)

names(list6)
## [1] "DataFrame" "Vector"    "List"
list6
## $DataFrame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $Vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $List
## $List[[1]]
## [1] 1 2 3
## 
## $List[[2]]
## [1] 3 4 5 6 7
empty.list <- vector(mode = "list", length = 4L)

empty.list
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
empty.list[[1]] <- 5

empty.list
## [[1]]
## [1] 5
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
list5[1]
## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
list5[[1]]
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
list5[["data.frame"]]
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
list5$data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
list5[["data.frame"]]$First
## [1] 1 2 3 4 5
list5[[1]][2]
##   Second
## 1      5
## 2      4
## 3      3
## 4      2
## 5      1
list5[[1]][[2]]
## [1] 5 4 3 2 1
list5[[1]][, "Second", drop = FALSE]
##   Second
## 1      5
## 2      4
## 3      3
## 4      2
## 5      1
length(list5)
## [1] 3
NROW(list5)
## [1] 3
list5[[4]] <- 2 # memory and processor inefficient

list5
## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7
## 
## 
## [[4]]
## [1] 2
list5[["new.element"]] <- 3:7

list5
## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7
## 
## 
## [[4]]
## [1] 2
## 
## $new.element
## [1] 3 4 5 6 7
names(list5)
## [1] "data.frame"  "vector"      "list"        ""            "new.element"
list5[1:3]
## $data.frame
##   First Second     Sport
## 1     1      5    Hockey
## 2     2      4   Cricket
## 3     3      3  Football
## 4     4      2     Rugby
## 5     5      1 Badminton
## 
## $vector
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $list
## $list[[1]]
## [1] 1 2 3
## 
## $list[[2]]
## [1] 3 4 5 6 7
rm(list = ls())

2.3 Matrices

A <- matrix(data = 1:10, nrow = 5, ncol = 2)

A
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
B <- matrix(data = 21:30, nrow = 5, ncol = 2)

B
##      [,1] [,2]
## [1,]   21   26
## [2,]   22   27
## [3,]   23   28
## [4,]   24   29
## [5,]   25   30
C <- matrix(data = 21:40, nrow = 2, ncol = 10)

C
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]   21   23   25   27   29   31   33   35   37    39
## [2,]   22   24   26   28   30   32   34   36   38    40
nrow(A)
## [1] 5
ncol(A)
## [1] 2
dim(A)
## [1] 5 2
A + B
##      [,1] [,2]
## [1,]   22   32
## [2,]   24   34
## [3,]   26   36
## [4,]   28   38
## [5,]   30   40
A * B
##      [,1] [,2]
## [1,]   21  156
## [2,]   44  189
## [3,]   69  224
## [4,]   96  261
## [5,]  125  300
A == B
##       [,1]  [,2]
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] FALSE FALSE
## [4,] FALSE FALSE
## [5,] FALSE FALSE
ncol(A)
## [1] 2
nrow(B)
## [1] 5
t(B)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]   21   22   23   24   25
## [2,]   26   27   28   29   30
A %*% t(B)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  177  184  191  198  205
## [2,]  224  233  242  251  260
## [3,]  271  282  293  304  315
## [4,]  318  331  344  357  370
## [5,]  365  380  395  410  425
A %*% C
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]  153  167  181  195  209  223  237  251  265   279
## [2,]  196  214  232  250  268  286  304  322  340   358
## [3,]  239  261  283  305  327  349  371  393  415   437
## [4,]  282  308  334  360  386  412  438  464  490   516
## [5,]  325  355  385  415  445  475  505  535  565   595
colnames(A)
## NULL
rownames(A)
## NULL
colnames(A) <- c("Left", "Right")

rownames(A) <- c("First", "Second", "Third", "Fourth", "Fifth")

A
##        Left Right
## First     1     6
## Second    2     7
## Third     3     8
## Fourth    4     9
## Fifth     5    10
colnames(B) <- c("First", "Second")

rownames(B) <- c("One", "Two", "Three", "Four", "Five")

B
##       First Second
## One      21     26
## Two      22     27
## Three    23     28
## Four     24     29
## Five     25     30
letters
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
LETTERS
##  [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q"
## [18] "R" "S" "T" "U" "V" "W" "X" "Y" "Z"
colnames(C) <- LETTERS[1:10] 

rownames(C) <- c("Top", "Bottom")

C
##         A  B  C  D  E  F  G  H  I  J
## Top    21 23 25 27 29 31 33 35 37 39
## Bottom 22 24 26 28 30 32 34 36 38 40
A
##        Left Right
## First     1     6
## Second    2     7
## Third     3     8
## Fourth    4     9
## Fifth     5    10
t(A)
##       First Second Third Fourth Fifth
## Left      1      2     3      4     5
## Right     6      7     8      9    10
A %*% C
##          A   B   C   D   E   F   G   H   I   J
## First  153 167 181 195 209 223 237 251 265 279
## Second 196 214 232 250 268 286 304 322 340 358
## Third  239 261 283 305 327 349 371 393 415 437
## Fourth 282 308 334 360 386 412 438 464 490 516
## Fifth  325 355 385 415 445 475 505 535 565 595

2.4 Arrays

arr <- array(data = 1:12, dim = c(2, 3, 2))

arr
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12
arr[1, , ]
##      [,1] [,2]
## [1,]    1    7
## [2,]    3    9
## [3,]    5   11
arr[, 2, ]
##      [,1] [,2]
## [1,]    3    9
## [2,]    4   10
arr[, , 2]
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12
arr[1, , 1]
## [1] 1 3 5
arr[, , 1]
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
rm(list = ls())

3 Reading Data

3.1 Reading CSV files

fileURL <- "http://www.jaredlander.com/data/Tomato%20First.csv"

tomato <- read.table(file = fileURL, header = TRUE, sep = ",")

head(tomato)
##   Round             Tomato Price      Source Sweet Acid Color Texture
## 1     1         Simpson SM  3.99 Whole Foods   2.8  2.8   3.7     3.4
## 2     1  Tuttorosso (blue)  2.99     Pioneer   3.3  2.8   3.4     3.0
## 3     1 Tuttorosso (green)  0.99     Pioneer   2.8  2.6   3.3     2.8
## 4     1     La Fede SM DOP  3.99   Shop Rite   2.6  2.8   3.0     2.3
## 5     2       Cento SM DOP  5.49  D Agostino   3.3  3.1   2.9     2.8
## 6     2      Cento Organic  4.99  D Agostino   3.2  2.9   2.9     3.1
##   Overall Avg.of.Totals Total.of.Avg
## 1     3.4          16.1         16.1
## 2     2.9          15.3         15.3
## 3     2.9          14.3         14.3
## 4     2.8          13.4         13.4
## 5     3.1          14.4         15.2
## 6     2.9          15.5         15.1
class(tomato)
## [1] "data.frame"
class(tomato$Tomato)
## [1] "factor"
tomato <- read.table(file = fileURL, header = TRUE, sep = ",", stringsAsFactors = FALSE)

head(tomato)
##   Round             Tomato Price      Source Sweet Acid Color Texture
## 1     1         Simpson SM  3.99 Whole Foods   2.8  2.8   3.7     3.4
## 2     1  Tuttorosso (blue)  2.99     Pioneer   3.3  2.8   3.4     3.0
## 3     1 Tuttorosso (green)  0.99     Pioneer   2.8  2.6   3.3     2.8
## 4     1     La Fede SM DOP  3.99   Shop Rite   2.6  2.8   3.0     2.3
## 5     2       Cento SM DOP  5.49  D Agostino   3.3  3.1   2.9     2.8
## 6     2      Cento Organic  4.99  D Agostino   3.2  2.9   2.9     3.1
##   Overall Avg.of.Totals Total.of.Avg
## 1     3.4          16.1         16.1
## 2     2.9          15.3         15.3
## 3     2.9          14.3         14.3
## 4     2.8          13.4         13.4
## 5     3.1          14.4         15.2
## 6     2.9          15.5         15.1
class(tomato$Tomato)
## [1] "character"

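Also refer to ?read.csv and ?read.csv2, which are convenience wrappers around read.table with defaults suited to comma-separated and semicolon-separated files respectively.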

3.2 Reading from Databases

# library(RODBC)

# RShowDoc("RODBC", package = "RODBC")

Uncomment and run the lines above to open the RODBC vignette for more information.

3.3 Reading from Foreign Statistical Software

Download and read the documentation of the foreign package from CRAN.

Read about the following functions:

  • read.spss
  • read.dta
  • read.ssd
  • read.octave
  • read.mtp
  • read.systat

3.4 Working with Binary Data

fileURL <- "http://www.jaredlander.com/data/Tomato%20First.csv"

tomato <- read.csv(file = fileURL, stringsAsFactors = FALSE)

save(list = c("tomato"), file = "tomato.rdata")

rm(tomato)

load(file = "tomato.rdata")

head(tomato)
##   Round             Tomato Price      Source Sweet Acid Color Texture
## 1     1         Simpson SM  3.99 Whole Foods   2.8  2.8   3.7     3.4
## 2     1  Tuttorosso (blue)  2.99     Pioneer   3.3  2.8   3.4     3.0
## 3     1 Tuttorosso (green)  0.99     Pioneer   2.8  2.6   3.3     2.8
## 4     1     La Fede SM DOP  3.99   Shop Rite   2.6  2.8   3.0     2.3
## 5     2       Cento SM DOP  5.49  D Agostino   3.3  3.1   2.9     2.8
## 6     2      Cento Organic  4.99  D Agostino   3.2  2.9   2.9     3.1
##   Overall Avg.of.Totals Total.of.Avg
## 1     3.4          16.1         16.1
## 2     2.9          15.3         15.3
## 3     2.9          14.3         14.3
## 4     2.8          13.4         13.4
## 5     3.1          14.4         15.2
## 6     2.9          15.5         15.1
n <- 20

r <- 1:10

w <- data.frame(n, r) 

w
##     n  r
## 1  20  1
## 2  20  2
## 3  20  3
## 4  20  4
## 5  20  5
## 6  20  6
## 7  20  7
## 8  20  8
## 9  20  9
## 10 20 10
save(list = c("n", "r", "w"), file = "multiple.rdata")

rm(list = c("n", "r", "w"))

load(file = "multiple.rdata")

n
## [1] 20
r
##  [1]  1  2  3  4  5  6  7  8  9 10
w
##     n  r
## 1  20  1
## 2  20  2
## 3  20  3
## 4  20  4
## 5  20  5
## 6  20  6
## 7  20  7
## 8  20  8
## 9  20  9
## 10 20 10
rm(list = ls())

3.5 Working with data supplied with R

library(ggplot2)

data("diamonds")

head(diamonds)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
data(list = c("tips"), package = "reshape2")

head(tips)
##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01 3.50   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4
data()

rm(list = ls())

3.6 Web Scraping

library(XML)

fileURL <- "http://www.w3schools.com/html/html_tables.asp"

myTable <- readHTMLTable(doc = fileURL, which = 1, header = TRUE, stringsAsFactors = FALSE)

myTable
##   Number First Name Last Name Points
## 1      1        Eve   Jackson     94
## 2      2       John       Doe     80
## 3      3       Adam   Johnson     67
## 4      4       Jill     Smith     50
rm(list = ls())

4 Statistical Graphs

4.1 diamonds dataset

library(ggplot2)

data("diamonds")

head(diamonds)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48

4.2 Base R Graphics

4.2.1 Histograms

hist(x = diamonds$carat)
hist(x = diamonds$carat, main = "Carat Histogram", xlab = "Carat")

4.2.2 Scatterplots

plot(x = diamonds$carat, y = diamonds$price)
plot(formula = price ~ carat, data = diamonds)
plot(formula = price ~ carat, data = diamonds, main = "Price vs Carat")

4.2.3 Boxplots

boxplot(x = diamonds$carat)

4.3 ggplot2

4.3.1 Histograms and Densities

library(ggplot2)

data(diamonds)

ggplot(data = diamonds) + geom_histogram(mapping = aes(x = diamonds$carat))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(data = diamonds) + geom_histogram(mapping = aes(x = diamonds$carat), binwidth = 0.5)
ggplot(data = diamonds) + geom_histogram(mapping = aes(x = diamonds$carat), binwidth = 0.1)
ggplot(data = diamonds) + geom_density(mapping = aes(x = carat))
ggplot(data = diamonds) + geom_density(mapping = aes(x = carat), fill = "grey50")

4.3.2 Scatterplots

ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point()
g <- ggplot(data = diamonds, aes(x = carat, y = price))

g + geom_point()
g + geom_point(aes(color = color))
g + geom_point(aes(color = color, shape = clarity))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 8.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 5445 rows containing missing values (geom_point).
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 8.
## Consider specifying shapes manually if you must have them.
g + geom_point(aes(color = color, shape = cut))

4.3.3 Box Plots and Violin Plots

ggplot(data = diamonds, aes(y = carat, x = 1)) + geom_boxplot()
ggplot(data = diamonds, aes(y = carat, x = cut)) + geom_boxplot()
ggplot(data = diamonds, aes(y = carat, x = cut)) + geom_violin()
g <- ggplot(data = diamonds, aes(y = carat, x = cut))

g + geom_point() + geom_violin()
g + geom_violin() + geom_point()
g + geom_jitter()
g + geom_jitter() + geom_violin()
g + geom_jitter(aes(color = color)) + geom_violin()
rm(list = ls())

4.3.4 Line Plots

data("economics")

head(economics)
##         date   pce    pop psavert uempmed unemploy
## 1 1967-06-30 507.8 198712     9.8     4.5     2944
## 2 1967-07-31 510.9 198911     9.8     4.7     2945
## 3 1967-08-31 516.7 199113     9.0     4.6     2958
## 4 1967-09-30 513.3 199311     9.8     4.9     3143
## 5 1967-10-31 518.5 199498     9.7     4.7     3066
## 6 1967-11-30 526.2 199657     9.4     4.8     3018
ggplot(data = economics, mapping = aes(x = date, y = pop)) + geom_line()
library(lubridate)

economics$year <- year(economics$date)

economics$month <- month(economics$date)

head(economics)
##         date   pce    pop psavert uempmed unemploy year month
## 1 1967-06-30 507.8 198712     9.8     4.5     2944 1967     6
## 2 1967-07-31 510.9 198911     9.8     4.7     2945 1967     7
## 3 1967-08-31 516.7 199113     9.0     4.6     2958 1967     8
## 4 1967-09-30 513.3 199311     9.8     4.9     3143 1967     9
## 5 1967-10-31 518.5 199498     9.7     4.7     3066 1967    10
## 6 1967-11-30 526.2 199657     9.4     4.8     3018 1967    11
econ2000 <- economics[which(economics$year >= 2000), ]

nrow(economics)
## [1] 478
nrow(econ2000)
## [1] 87
head(econ2000)
##           date    pce    pop psavert uempmed unemploy year month
## 392 2000-01-31 6618.5 281190     2.4     6.1     5858 2000     1
## 393 2000-02-29 6685.3 281409     2.0     6.0     5733 2000     2
## 394 2000-03-31 6664.2 281653     2.4     6.1     5481 2000     3
## 395 2000-04-30 6688.0 281891     2.4     5.8     5758 2000     4
## 396 2000-05-31 6712.1 282156     2.5     5.7     5651 2000     5
## 397 2000-06-30 6745.8 282430     2.9     6.0     5747 2000     6
econ2000$month <- month(econ2000$date, label = TRUE)

head(econ2000)
##           date    pce    pop psavert uempmed unemploy year month
## 392 2000-01-31 6618.5 281190     2.4     6.1     5858 2000   Jan
## 393 2000-02-29 6685.3 281409     2.0     6.0     5733 2000   Feb
## 394 2000-03-31 6664.2 281653     2.4     6.1     5481 2000   Mar
## 395 2000-04-30 6688.0 281891     2.4     5.8     5758 2000   Apr
## 396 2000-05-31 6712.1 282156     2.5     5.7     5651 2000   May
## 397 2000-06-30 6745.8 282430     2.9     6.0     5747 2000   Jun
library(scales)

# Month on the x axis and `pop` on the y axis, one line per year;
# the year is mapped to colour as a discrete factor.
g <- ggplot(econ2000, aes(x = month, y = pop)) +
  geom_line(aes(group = year, color = factor(year)))

g
# Give the colour legend a readable title.
g <- g + scale_color_discrete(name = "Year")

g
# Format the y-axis labels with scales::comma.
g <- g + scale_y_continuous(labels = comma)

g
# Add a plot title and axis titles.
g <- g + labs(title = "Population Growth", x = "Month", y = "Population")

g
# Rotate the x-axis tick labels by 90 degrees.
g <- g + theme(axis.text.x = element_text(angle = 90, hjust = 1))

g

4.3.5 Faceting

# Shared base plot for the `diamonds` data: carat on x, price on y.
g <- ggplot(diamonds, aes(x = carat, y = price))

# Wrapped panels, one per level of `color`.
g +
  geom_point(aes(color = color)) +
  facet_wrap(~color)
# Panel grid: `cut` defines the rows, `clarity` the columns.
g +
  geom_point(aes(color = color)) +
  facet_grid(cut ~ clarity)
# Histogram of carat, one wrapped panel per level of `color`.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram() +
  facet_wrap(~color)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

4.3.6 Colors and Shapes

# Scatter plot mapping three extra columns at once:
# `cut` to point shape, `depth` to point size and `color` to point colour.
ggplot(
  diamonds,
  aes(x = carat, y = price, shape = cut, size = depth, color = color)
) +
  geom_point()

4.3.7 Themes

library(ggthemes)

# Base scatter plot that each theme below is applied to.
g <- ggplot(diamonds, aes(x = carat, y = price, color = color)) +
  geom_point()

# The same plot rendered with four different ggthemes themes; where the
# package ships a matching colour scale, it is applied as well.
g +
  theme_wsj()
g +
  theme_economist() +
  scale_color_economist()
g +
  theme_tufte()
g +
  theme_excel() +
  scale_color_excel()
# Clear every object from the workspace before the next section.
rm(list = ls())

4.4 ggplot2: Digging Deeper

4.4.1 qplot function

library(ggplot2)

data("mpg")

# Inspect the column types of `mpg` before plotting.
str(mpg)
## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: Factor w/ 15 levels "audi","chevrolet",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ model       : Factor w/ 38 levels "4runner 4wd",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : Factor w/ 10 levels "auto(av)","auto(l3)",..: 4 9 10 1 4 9 1 9 4 10 ...
##  $ drv         : Factor w/ 3 levels "4","f","r": 2 2 2 2 2 2 2 1 1 1 ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : Factor w/ 5 levels "c","d","e","p",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ class       : Factor w/ 7 levels "2seater","compact",..: 2 2 2 2 2 2 2 2 2 2 ...
# Two variables: scatter plot of hwy against displ.
qplot(displ, hwy, data = mpg)
# Same scatter plot, coloured by the `drv` factor.
qplot(displ, hwy, data = mpg, color = drv)
# Points plus a smoother layer.
qplot(displ, hwy, data = mpg, geom = c("point", "smooth"))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Points plus a smoother layer, coloured by `drv`.
qplot(displ, hwy, data = mpg, color = drv, geom = c("point", "smooth"))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# One variable: qplot draws a histogram (see the stat_bin message below).
qplot(hwy, data = mpg)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Histogram with bars filled by `drv`.
qplot(hwy, data = mpg, fill = drv)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Facets: one column of panels per `drv` level ...
qplot(displ, hwy, data = mpg, facets = . ~ drv)
# ... and one row of panels per `drv` level, with an explicit bin width.
qplot(hwy, data = mpg, facets = drv ~ ., binwidth = 2)
# The data file can be made available upon request

# Restores the objects saved in maacs.Rda; the code below uses `maacs`.
load("maacs.Rda")

head(maacs)
##   id eno duBedMusM   pm25 mopos
## 1  1 141      2423 15.560   yes
## 2  2 124      2793 34.370   yes
## 3  3 126      3055 38.953   yes
## 4  4 164       775 33.249   yes
## 5  5  99      1634 27.060   yes
## 6  6  68       939 18.890   yes
str(maacs)
## 'data.frame':    750 obs. of  5 variables:
##  $ id       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ eno      : num  141 124 126 164 99 68 41 50 12 30 ...
##  $ duBedMusM: num  2423 2793 3055 775 1634 ...
##  $ pm25     : num  15.6 34.4 39 33.2 27.1 ...
##  $ mopos    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
# Histograms of log(eno): overall, then with bars filled by `mopos`.
qplot(log(eno), data = maacs)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
qplot(log(eno), data = maacs, fill = mopos)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Density curves of log(eno): overall, then one curve per `mopos` level.
qplot(log(eno), data = maacs, geom = "density")
## Warning: Removed 108 rows containing non-finite values (stat_density).
qplot(log(eno), data = maacs, geom = "density", color = mopos)
## Warning: Removed 49 rows containing non-finite values (stat_density).
## Warning: Removed 59 rows containing non-finite values (stat_density).
# Scatter plots of log(eno) against log(pm25); `mopos` is shown in turn
# through point shape, point colour and separate panels.
qplot(log(pm25), log(eno), data = maacs)
## Warning: Removed 184 rows containing missing values (geom_point).
qplot(log(pm25), log(eno), data = maacs, shape = mopos)
## Warning: Removed 184 rows containing missing values (geom_point).
qplot(log(pm25), log(eno), data = maacs, color = mopos)
## Warning: Removed 184 rows containing missing values (geom_point).
qplot(log(pm25), log(eno), data = maacs, facets = . ~ mopos)
## Warning: Removed 86 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).
# Add a linear-model smoother, split by colour ...
qplot(
  log(pm25), log(eno),
  data = maacs,
  color = mopos,
  geom = c("point", "smooth"),
  method = "lm"
)
## Warning: Removed 86 rows containing missing values (stat_smooth).
## Warning: Removed 98 rows containing missing values (stat_smooth).
## Warning: Removed 184 rows containing missing values (geom_point).
# ... and split by panel.
qplot(
  log(pm25), log(eno),
  data = maacs,
  facets = . ~ mopos,
  geom = c("point", "smooth"),
  method = "lm"
)
## Warning: Removed 86 rows containing missing values (stat_smooth).
## Warning: Removed 98 rows containing missing values (stat_smooth).
## Warning: Removed 86 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).

4.4.2 ggplot function

# Starting point: the qplot() call that the ggplot() layers below rebuild.
qplot(
  log(pm25), eno,
  data = maacs,
  facets = . ~ mopos,
  geom = c("point", "smooth"),
  method = "lm"
)
## Warning: Removed 86 rows containing missing values (stat_smooth).
## Warning: Removed 98 rows containing missing values (stat_smooth).
## Warning: Removed 86 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).
# Base object: data and aesthetic mapping only, no layers yet.
g <- ggplot(maacs, aes(x = log(pm25), y = eno))

summary(g)
## data: id, eno, duBedMusM, pm25, mopos [750x5]
## mapping:  x = log(pm25), y = eno
## faceting: facet_null()
# Add layers one at a time: points, default smoother, linear smoother, facets.
g +
  geom_point()
## Warning: Removed 184 rows containing missing values (geom_point).
g +
  geom_point() +
  geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## Warning: Removed 184 rows containing missing values (stat_smooth).
## Warning: Removed 184 rows containing missing values (geom_point).
g +
  geom_point() +
  geom_smooth(method = "lm")
## Warning: Removed 184 rows containing missing values (stat_smooth).
## Warning: Removed 184 rows containing missing values (geom_point).
g +
  geom_point() +
  facet_grid(facets = . ~ mopos) +
  geom_smooth(method = "lm")
## Warning: Removed 86 rows containing missing values (stat_smooth).
## Warning: Removed 98 rows containing missing values (stat_smooth).
## Warning: Removed 86 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).
# Constant aesthetics are set outside aes() ...
g +
  geom_point(color = "steelblue", size = 4, alpha = 0.5)
## Warning: Removed 184 rows containing missing values (geom_point).
# ... while data-driven aesthetics are mapped inside aes().
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5)
## Warning: Removed 184 rows containing missing values (geom_point).
# Axis labels and a title.
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS")
## Warning: Removed 184 rows containing missing values (geom_point).
# Customised linear smoother, drawn with its confidence band ...
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS") +
  geom_smooth(size = 4, linetype = 3, method = "lm")
## Warning: Removed 184 rows containing missing values (stat_smooth).
## Warning: Removed 184 rows containing missing values (geom_point).
# ... and without it (se = FALSE).
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS") +
  geom_smooth(size = 4, linetype = 3, method = "lm", se = FALSE)
## Warning: Removed 184 rows containing missing values (stat_smooth).
## Warning: Removed 184 rows containing missing values (geom_point).
# Same call as the previous plot, rendered once more.
g +
  geom_point(aes(color = mopos), size = 4, alpha = 0.5) +
  labs(x = "Log of PM25", y = "ENO", title = "MAACS") +
  geom_smooth(size = 4, linetype = 3, method = "lm", se = FALSE)
## Warning: Removed 184 rows containing missing values (stat_smooth).
## Warning: Removed 184 rows containing missing values (geom_point).
# Toy series: x runs from 1 to 100, y is 100 standard-normal draws.
# No seed is set, so the y values differ from run to run.
testData <- data.frame(x = seq_len(100), y = rnorm(n = 100))

head(testData)
##   x           y
## 1 1  1.03867599
## 2 2 -0.82879997
## 3 3 -0.74745592
## 4 4 -0.96679162
## 5 5  0.08761936
## 6 6 -0.96893042
# Setting Outlier
testData[50, "y"] <- 100

# Base graphics: the y axis is limited to [-3, 3].
plot(testData$x, testData$y, type = "l", ylim = c(-3, 3))
# The same series drawn with ggplot2, using three treatments of the y range.
g <- ggplot(testData, aes(x = x, y = y))

g +
  geom_line()
# Outlier Missing
g +
  geom_line() +
  ylim(c(-3, 3))
# Outlier Included
g +
  geom_line() +
  coord_cartesian(ylim = c(-3, 3))
# Quartile cutpoints of duBedMusM (0%, 25%, 50%, 75%, 100%), ignoring NAs.
# quantile() takes its probabilities through `probs` and has no `breaks`
# argument: the earlier `breaks = seq(0, 1, length.out = 4)` was absorbed by
# `...` and ignored, so the default quartiles were returned. The quartile
# probabilities are now passed explicitly, matching the output shown below.
cutpoints <- quantile(x = maacs$duBedMusM, probs = seq(0, 1, length.out = 5), na.rm = TRUE)

cutpoints
##        0%       25%       50%       75%      100% 
##      0.01    308.00   1151.00   3881.00 124919.00
# Bin duBedMusM into the four intervals delimited by the cutpoints.
# NOTE(review): cut() defaults to include.lowest = FALSE with right-closed
# intervals, so values equal to the smallest cutpoint fall outside every bin
# and become NA. Pass include.lowest = TRUE if they should land in the first
# bin (the first level label would then read "[0.01,308]").
maacs$newCol <- cut(x = maacs$duBedMusM, breaks = cutpoints)

levels(maacs$newCol)
## [1] "(0.01,308]"          "(308,1.15e+03]"      "(1.15e+03,3.88e+03]"
## [4] "(3.88e+03,1.25e+05]"
library(ggthemes)

# Base object: eno against log(pm25) for the `maacs` data.
g <- ggplot(maacs, aes(x = log(pm25), y = eno))

# Translucent points with a linear fit (no confidence band), one wrapped
# panel per newCol/mopos combination, black-and-white theme, and a
# plotmath x-axis label.
g +
  geom_point(alpha = 1/3) +
  facet_wrap(facets = newCol ~ mopos) +
  geom_smooth(method = "lm", se = FALSE, col = "steelblue") +
  theme_bw(base_size = 10) +
  labs(x = expression("log " * PM[2.5]), title = "MAACS")
## Warning: Removed 8 rows containing missing values (stat_smooth).
## Warning: Removed 8 rows containing missing values (stat_smooth).
## Warning: Removed 9 rows containing missing values (stat_smooth).
## Warning: Removed 10 rows containing missing values (stat_smooth).
## Warning: Removed 15 rows containing missing values (stat_smooth).
## Warning: Removed 6 rows containing missing values (stat_smooth).
## Warning: Removed 14 rows containing missing values (stat_smooth).
## Warning: Removed 7 rows containing missing values (stat_smooth).
## Warning: Removed 40 rows containing missing values (stat_smooth).
## Warning: Removed 67 rows containing missing values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing missing values (geom_point).
# Clear every object from the workspace before the next section.
rm(list = ls())