Data Science with R

Lema LOGAMOU SEKNEWNA, PhD.

Operators

  • Addition: +
1+1
[1] 2
  • Subtraction: -
1-1
[1] 0
  • Multiplication: *
1*1
[1] 1

  • Division: /
1/1
[1] 1
  • Exponent: ^ or **
2 ^ 10 # or 2 ** 10
[1] 1024
  • Modulus (Remainder from division): %%
1035 %% 2
[1] 1
  • Integer Division: %/%
1035 %/% 3
[1] 345

Logical operators

  • Less than: <
1 < 1
[1] FALSE
  • Less than or equal to: <=
1 <= 1
[1] TRUE
  • Greater than: >
1 > 1
[1] FALSE

  • Greater than or equal to: >=
1 >= 1
[1] TRUE
  • Exactly equal to: ==
"R" == "r"
[1] FALSE

R is case sensitive !!!

  • Not equal to: !=
1 != 1
[1] FALSE

  • Negation/NOT: !
!TRUE # or !T
[1] FALSE
  • AND: &
TRUE & TRUE; TRUE & FALSE; FALSE & FALSE
[1] TRUE
[1] FALSE
[1] FALSE
  • OR: |
TRUE | TRUE; TRUE | FALSE; FALSE | FALSE
[1] TRUE
[1] TRUE
[1] FALSE

R object and assignment

In R we can use <-, = (single equal sign !) and -> to assign a value to a variable.

A variable name:

  • can begin with a letter or a dot (a leading dot must not be followed by a digit). Ex: a <- 1, 0 -> .a
  • must not contain spaces. Replace each space with _.
v rsion <- 4.3.2
Error: <text>:1:3: unexpected symbol
1: v rsion
      ^
  • can contain numbers. Ex: a1 <- 1.
a <- 1
0 -> .a
a1 = .a

R variables

  • Numeric - (10.5, 55, 787)
PI <- pi; class(PI); typeof(PI)
[1] "numeric"
[1] "double"
n <- 55; class(n); typeof(n)
[1] "numeric"
[1] "double"
  • Integer - (1L, 55L, 100L, where the letter “L” declares this as an integer).
# Check the class of n <- 55L. What do you see?
  • Complex - (9 + 3i, where “i” is the imaginary unit and 3i is the imaginary part)
z <- 9 + 3i; class(z); #typeof(z)
[1] "complex"

  • Character/string - (“R”, “[R]eading week”, “FALSE”, “11.5”)
string <- "[R]eading week"; class(string)
[1] "character"
  • Logical/Boolean - (TRUE or FALSE)
TRUE # or T
[1] TRUE
FALSE # or F
[1] FALSE

A logical value can also be the outcome of a test. Example: checking whether "[R]eading week" == "[R]eading Week"

R Data types/Structure

  • Scalars: any number in N, Z, D, Q, R, or C (Quantum Mechanics)
  • Vectors: collection of elements of the same type (mixed inputs are coerced to a common type); can be a sequence;
v <- c(1, "R", T, FALSE, NA)
# print v

# what is the class of v?

# sequence
(x <- seq(0, 2*pi, length.out = 50))
 [1] 0.0000000 0.1282283 0.2564565 0.3846848 0.5129131 0.6411414 0.7693696
 [8] 0.8975979 1.0258262 1.1540544 1.2822827 1.4105110 1.5387393 1.6669675
[15] 1.7951958 1.9234241 2.0516523 2.1798806 2.3081089 2.4363372 2.5645654
[22] 2.6927937 2.8210220 2.9492502 3.0774785 3.2057068 3.3339351 3.4621633
[29] 3.5903916 3.7186199 3.8468481 3.9750764 4.1033047 4.2315330 4.3597612
[36] 4.4879895 4.6162178 4.7444460 4.8726743 5.0009026 5.1291309 5.2573591
[43] 5.3855874 5.5138157 5.6420439 5.7702722 5.8985005 6.0267288 6.1549570
[50] 6.2831853

# repetition
rep(c(0:1), c(50, 50))
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# sampling
sample(0:1, size = 100, replace = TRUE, prob = c(0.5, 0.5)) -> y
y
  [1] 1 0 0 1 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 0 1 1 0 1 0 1
 [38] 1 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 1 0 0
 [75] 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 0 1 0 1 0 0
sum(y == 0); sum(!y == 1)
[1] 53
[1] 53

table(y)
y
 0  1 
53 47 
prop.table(table(y))
y
   0    1 
0.53 0.47 
# converting a continuous variable into a categorical variable
set.seed(123)
age <- sample(0:120, size = 100)
(brks <- seq(0, 120, by = 10))
 [1]   0  10  20  30  40  50  60  70  80  90 100 110 120
# (brks <- seq(min(age), max(age), le = 40))
age_groups <- cut(age, breaks = brks, include.lowest = TRUE, right = )
# checking for missing values
(which(is.na(age_groups)) -> id_missing)
integer(0)

# conversion
age_factor <- factor(age_groups) # not necessary!
identical(age_groups, age_factor)
[1] TRUE
# count in each class/group
frequencies <- table(age_groups)
# plotting
barplot(frequencies)

pie(frequencies)

  • Matrices: two dimensional data set with columns and rows.
(A <- matrix(1:25, nrow = 5, ncol = 5))
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    6   11   16   21
[2,]    2    7   12   17   22
[3,]    3    8   13   18   23
[4,]    4    9   14   19   24
[5,]    5   10   15   20   25
(B <- matrix(1:25, nrow = 5, ncol = 5, byrow = T))
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    2    3    4    5
[2,]    6    7    8    9   10
[3,]   11   12   13   14   15
[4,]   16   17   18   19   20
[5,]   21   22   23   24   25

Matrix operations

(A <- matrix(c(1, 0, 2, 5, 2, 1, 4, 2, 0), nrow = 3))
     [,1] [,2] [,3]
[1,]    1    5    4
[2,]    0    2    2
[3,]    2    1    0
B <- matrix(c(2, 5, 2, 3, 1, 1, 0, 1, 1), nrow = 3)

# Transpose
(A_T <- t(A))
     [,1] [,2] [,3]
[1,]    1    0    2
[2,]    5    2    1
[3,]    4    2    0
# Addition
A + B
     [,1] [,2] [,3]
[1,]    3    8    4
[2,]    5    3    3
[3,]    4    2    1

# Subtraction
A - B
     [,1] [,2] [,3]
[1,]   -1    2    4
[2,]   -5    1    1
[3,]    0    0   -1
# Multiplication
A %*% B
     [,1] [,2] [,3]
[1,]   35   12    9
[2,]   14    4    4
[3,]    9    7    1
# Inverse
(A_inv <- solve(A))
     [,1] [,2] [,3]
[1,]   -1  2.0    1
[2,]    2 -4.0   -1
[3,]   -2  4.5    1

# Division: multiply a matrix by the inverse of another. B/A = BA_inv
B %*% A_inv
     [,1] [,2] [,3]
[1,]    4 -8.0   -1
[2,]   -5 10.5    5
[3,]   -2  4.5    2
# Eigenvalues/eigenvectors
ev <- eigen(A)
ev$values
[1]  4.7664355 -1.4836116 -0.2828239
ev$vectors
           [,1]       [,2]       [,3]
[1,] -0.8535725 -0.3668743  0.2177685
[2,] -0.3052279 -0.4631774 -0.6431613
[3,] -0.4221966  0.8067651  0.7341120

  • Arrays: can have more than two dimensions.
(aRray <- array(1:24, dim = c(3, 4, 2)))
, , 1

     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12

, , 2

     [,1] [,2] [,3] [,4]
[1,]   13   16   19   22
[2,]   14   17   20   23
[3,]   15   18   21   24
class(aRray)
[1] "array"

dim(aRray)
[1] 3 4 2
# 
aRray[1, 1, 2] # element at i=1, j=1 from the second matrix
[1] 13

Elements extraction:

[row position, column position, matrix level]

  • Lists: collection of objects that may be of different types. The elements may also have different sizes.
list("matrix" = A, "sequence" = x)
$matrix
     [,1] [,2] [,3]
[1,]    1    5    4
[2,]    0    2    2
[3,]    2    1    0

$sequence
 [1] 0.0000000 0.1282283 0.2564565 0.3846848 0.5129131 0.6411414 0.7693696
 [8] 0.8975979 1.0258262 1.1540544 1.2822827 1.4105110 1.5387393 1.6669675
[15] 1.7951958 1.9234241 2.0516523 2.1798806 2.3081089 2.4363372 2.5645654
[22] 2.6927937 2.8210220 2.9492502 3.0774785 3.2057068 3.3339351 3.4621633
[29] 3.5903916 3.7186199 3.8468481 3.9750764 4.1033047 4.2315330 4.3597612
[36] 4.4879895 4.6162178 4.7444460 4.8726743 5.0009026 5.1291309 5.2573591
[43] 5.3855874 5.5138157 5.6420439 5.7702722 5.8985005 6.0267288 6.1549570
[50] 6.2831853

  • Data frame: also a list, but one in which all elements (columns) have the same length. A data frame in R is a table.
iris
    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1            5.1         3.5          1.4         0.2     setosa
2            4.9         3.0          1.4         0.2     setosa
3            4.7         3.2          1.3         0.2     setosa
4            4.6         3.1          1.5         0.2     setosa
5            5.0         3.6          1.4         0.2     setosa
6            5.4         3.9          1.7         0.4     setosa
7            4.6         3.4          1.4         0.3     setosa
8            5.0         3.4          1.5         0.2     setosa
9            4.4         2.9          1.4         0.2     setosa
10           4.9         3.1          1.5         0.1     setosa
11           5.4         3.7          1.5         0.2     setosa
12           4.8         3.4          1.6         0.2     setosa
13           4.8         3.0          1.4         0.1     setosa
14           4.3         3.0          1.1         0.1     setosa
15           5.8         4.0          1.2         0.2     setosa
16           5.7         4.4          1.5         0.4     setosa
17           5.4         3.9          1.3         0.4     setosa
18           5.1         3.5          1.4         0.3     setosa
19           5.7         3.8          1.7         0.3     setosa
20           5.1         3.8          1.5         0.3     setosa
21           5.4         3.4          1.7         0.2     setosa
22           5.1         3.7          1.5         0.4     setosa
23           4.6         3.6          1.0         0.2     setosa
24           5.1         3.3          1.7         0.5     setosa
25           4.8         3.4          1.9         0.2     setosa
26           5.0         3.0          1.6         0.2     setosa
27           5.0         3.4          1.6         0.4     setosa
28           5.2         3.5          1.5         0.2     setosa
29           5.2         3.4          1.4         0.2     setosa
30           4.7         3.2          1.6         0.2     setosa
31           4.8         3.1          1.6         0.2     setosa
32           5.4         3.4          1.5         0.4     setosa
33           5.2         4.1          1.5         0.1     setosa
34           5.5         4.2          1.4         0.2     setosa
35           4.9         3.1          1.5         0.2     setosa
36           5.0         3.2          1.2         0.2     setosa
37           5.5         3.5          1.3         0.2     setosa
38           4.9         3.6          1.4         0.1     setosa
39           4.4         3.0          1.3         0.2     setosa
40           5.1         3.4          1.5         0.2     setosa
41           5.0         3.5          1.3         0.3     setosa
42           4.5         2.3          1.3         0.3     setosa
43           4.4         3.2          1.3         0.2     setosa
44           5.0         3.5          1.6         0.6     setosa
45           5.1         3.8          1.9         0.4     setosa
46           4.8         3.0          1.4         0.3     setosa
47           5.1         3.8          1.6         0.2     setosa
48           4.6         3.2          1.4         0.2     setosa
49           5.3         3.7          1.5         0.2     setosa
50           5.0         3.3          1.4         0.2     setosa
51           7.0         3.2          4.7         1.4 versicolor
52           6.4         3.2          4.5         1.5 versicolor
53           6.9         3.1          4.9         1.5 versicolor
54           5.5         2.3          4.0         1.3 versicolor
55           6.5         2.8          4.6         1.5 versicolor
56           5.7         2.8          4.5         1.3 versicolor
57           6.3         3.3          4.7         1.6 versicolor
58           4.9         2.4          3.3         1.0 versicolor
59           6.6         2.9          4.6         1.3 versicolor
60           5.2         2.7          3.9         1.4 versicolor
61           5.0         2.0          3.5         1.0 versicolor
62           5.9         3.0          4.2         1.5 versicolor
63           6.0         2.2          4.0         1.0 versicolor
64           6.1         2.9          4.7         1.4 versicolor
65           5.6         2.9          3.6         1.3 versicolor
66           6.7         3.1          4.4         1.4 versicolor
67           5.6         3.0          4.5         1.5 versicolor
68           5.8         2.7          4.1         1.0 versicolor
69           6.2         2.2          4.5         1.5 versicolor
70           5.6         2.5          3.9         1.1 versicolor
71           5.9         3.2          4.8         1.8 versicolor
72           6.1         2.8          4.0         1.3 versicolor
73           6.3         2.5          4.9         1.5 versicolor
74           6.1         2.8          4.7         1.2 versicolor
75           6.4         2.9          4.3         1.3 versicolor
76           6.6         3.0          4.4         1.4 versicolor
77           6.8         2.8          4.8         1.4 versicolor
78           6.7         3.0          5.0         1.7 versicolor
79           6.0         2.9          4.5         1.5 versicolor
80           5.7         2.6          3.5         1.0 versicolor
81           5.5         2.4          3.8         1.1 versicolor
82           5.5         2.4          3.7         1.0 versicolor
83           5.8         2.7          3.9         1.2 versicolor
84           6.0         2.7          5.1         1.6 versicolor
85           5.4         3.0          4.5         1.5 versicolor
86           6.0         3.4          4.5         1.6 versicolor
87           6.7         3.1          4.7         1.5 versicolor
88           6.3         2.3          4.4         1.3 versicolor
89           5.6         3.0          4.1         1.3 versicolor
90           5.5         2.5          4.0         1.3 versicolor
91           5.5         2.6          4.4         1.2 versicolor
92           6.1         3.0          4.6         1.4 versicolor
93           5.8         2.6          4.0         1.2 versicolor
94           5.0         2.3          3.3         1.0 versicolor
95           5.6         2.7          4.2         1.3 versicolor
96           5.7         3.0          4.2         1.2 versicolor
97           5.7         2.9          4.2         1.3 versicolor
98           6.2         2.9          4.3         1.3 versicolor
99           5.1         2.5          3.0         1.1 versicolor
100          5.7         2.8          4.1         1.3 versicolor
101          6.3         3.3          6.0         2.5  virginica
102          5.8         2.7          5.1         1.9  virginica
103          7.1         3.0          5.9         2.1  virginica
104          6.3         2.9          5.6         1.8  virginica
105          6.5         3.0          5.8         2.2  virginica
106          7.6         3.0          6.6         2.1  virginica
107          4.9         2.5          4.5         1.7  virginica
108          7.3         2.9          6.3         1.8  virginica
109          6.7         2.5          5.8         1.8  virginica
110          7.2         3.6          6.1         2.5  virginica
111          6.5         3.2          5.1         2.0  virginica
112          6.4         2.7          5.3         1.9  virginica
113          6.8         3.0          5.5         2.1  virginica
114          5.7         2.5          5.0         2.0  virginica
115          5.8         2.8          5.1         2.4  virginica
116          6.4         3.2          5.3         2.3  virginica
117          6.5         3.0          5.5         1.8  virginica
118          7.7         3.8          6.7         2.2  virginica
119          7.7         2.6          6.9         2.3  virginica
120          6.0         2.2          5.0         1.5  virginica
121          6.9         3.2          5.7         2.3  virginica
122          5.6         2.8          4.9         2.0  virginica
123          7.7         2.8          6.7         2.0  virginica
124          6.3         2.7          4.9         1.8  virginica
125          6.7         3.3          5.7         2.1  virginica
126          7.2         3.2          6.0         1.8  virginica
127          6.2         2.8          4.8         1.8  virginica
128          6.1         3.0          4.9         1.8  virginica
129          6.4         2.8          5.6         2.1  virginica
130          7.2         3.0          5.8         1.6  virginica
131          7.4         2.8          6.1         1.9  virginica
132          7.9         3.8          6.4         2.0  virginica
133          6.4         2.8          5.6         2.2  virginica
134          6.3         2.8          5.1         1.5  virginica
135          6.1         2.6          5.6         1.4  virginica
136          7.7         3.0          6.1         2.3  virginica
137          6.3         3.4          5.6         2.4  virginica
138          6.4         3.1          5.5         1.8  virginica
139          6.0         3.0          4.8         1.8  virginica
140          6.9         3.1          5.4         2.1  virginica
141          6.7         3.1          5.6         2.4  virginica
142          6.9         3.1          5.1         2.3  virginica
143          5.8         2.7          5.1         1.9  virginica
144          6.8         3.2          5.9         2.3  virginica
145          6.7         3.3          5.7         2.5  virginica
146          6.7         3.0          5.2         2.3  virginica
147          6.3         2.5          5.0         1.9  virginica
148          6.5         3.0          5.2         2.0  virginica
149          6.2         3.4          5.4         2.3  virginica
150          5.9         3.0          5.1         1.8  virginica
class(iris)
[1] "data.frame"
is.list(iris)
[1] TRUE

We can convert a data frame into a list.

list_data <- as.list(iris)
class(list_data)
[1] "list"
# converting back to data.frame
# TODO

# converting back to a data.frame
as.data.frame(list_data)
    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1            5.1         3.5          1.4         0.2     setosa
2            4.9         3.0          1.4         0.2     setosa
3            4.7         3.2          1.3         0.2     setosa
4            4.6         3.1          1.5         0.2     setosa
5            5.0         3.6          1.4         0.2     setosa
6            5.4         3.9          1.7         0.4     setosa
7            4.6         3.4          1.4         0.3     setosa
8            5.0         3.4          1.5         0.2     setosa
9            4.4         2.9          1.4         0.2     setosa
10           4.9         3.1          1.5         0.1     setosa
11           5.4         3.7          1.5         0.2     setosa
12           4.8         3.4          1.6         0.2     setosa
13           4.8         3.0          1.4         0.1     setosa
14           4.3         3.0          1.1         0.1     setosa
15           5.8         4.0          1.2         0.2     setosa
16           5.7         4.4          1.5         0.4     setosa
17           5.4         3.9          1.3         0.4     setosa
18           5.1         3.5          1.4         0.3     setosa
19           5.7         3.8          1.7         0.3     setosa
20           5.1         3.8          1.5         0.3     setosa
21           5.4         3.4          1.7         0.2     setosa
22           5.1         3.7          1.5         0.4     setosa
23           4.6         3.6          1.0         0.2     setosa
24           5.1         3.3          1.7         0.5     setosa
25           4.8         3.4          1.9         0.2     setosa
26           5.0         3.0          1.6         0.2     setosa
27           5.0         3.4          1.6         0.4     setosa
28           5.2         3.5          1.5         0.2     setosa
29           5.2         3.4          1.4         0.2     setosa
30           4.7         3.2          1.6         0.2     setosa
31           4.8         3.1          1.6         0.2     setosa
32           5.4         3.4          1.5         0.4     setosa
33           5.2         4.1          1.5         0.1     setosa
34           5.5         4.2          1.4         0.2     setosa
35           4.9         3.1          1.5         0.2     setosa
36           5.0         3.2          1.2         0.2     setosa
37           5.5         3.5          1.3         0.2     setosa
38           4.9         3.6          1.4         0.1     setosa
39           4.4         3.0          1.3         0.2     setosa
40           5.1         3.4          1.5         0.2     setosa
41           5.0         3.5          1.3         0.3     setosa
42           4.5         2.3          1.3         0.3     setosa
43           4.4         3.2          1.3         0.2     setosa
44           5.0         3.5          1.6         0.6     setosa
45           5.1         3.8          1.9         0.4     setosa
46           4.8         3.0          1.4         0.3     setosa
47           5.1         3.8          1.6         0.2     setosa
48           4.6         3.2          1.4         0.2     setosa
49           5.3         3.7          1.5         0.2     setosa
50           5.0         3.3          1.4         0.2     setosa
51           7.0         3.2          4.7         1.4 versicolor
52           6.4         3.2          4.5         1.5 versicolor
53           6.9         3.1          4.9         1.5 versicolor
54           5.5         2.3          4.0         1.3 versicolor
55           6.5         2.8          4.6         1.5 versicolor
56           5.7         2.8          4.5         1.3 versicolor
57           6.3         3.3          4.7         1.6 versicolor
58           4.9         2.4          3.3         1.0 versicolor
59           6.6         2.9          4.6         1.3 versicolor
60           5.2         2.7          3.9         1.4 versicolor
61           5.0         2.0          3.5         1.0 versicolor
62           5.9         3.0          4.2         1.5 versicolor
63           6.0         2.2          4.0         1.0 versicolor
64           6.1         2.9          4.7         1.4 versicolor
65           5.6         2.9          3.6         1.3 versicolor
66           6.7         3.1          4.4         1.4 versicolor
67           5.6         3.0          4.5         1.5 versicolor
68           5.8         2.7          4.1         1.0 versicolor
69           6.2         2.2          4.5         1.5 versicolor
70           5.6         2.5          3.9         1.1 versicolor
71           5.9         3.2          4.8         1.8 versicolor
72           6.1         2.8          4.0         1.3 versicolor
73           6.3         2.5          4.9         1.5 versicolor
74           6.1         2.8          4.7         1.2 versicolor
75           6.4         2.9          4.3         1.3 versicolor
76           6.6         3.0          4.4         1.4 versicolor
77           6.8         2.8          4.8         1.4 versicolor
78           6.7         3.0          5.0         1.7 versicolor
79           6.0         2.9          4.5         1.5 versicolor
80           5.7         2.6          3.5         1.0 versicolor
81           5.5         2.4          3.8         1.1 versicolor
82           5.5         2.4          3.7         1.0 versicolor
83           5.8         2.7          3.9         1.2 versicolor
84           6.0         2.7          5.1         1.6 versicolor
85           5.4         3.0          4.5         1.5 versicolor
86           6.0         3.4          4.5         1.6 versicolor
87           6.7         3.1          4.7         1.5 versicolor
88           6.3         2.3          4.4         1.3 versicolor
89           5.6         3.0          4.1         1.3 versicolor
90           5.5         2.5          4.0         1.3 versicolor
91           5.5         2.6          4.4         1.2 versicolor
92           6.1         3.0          4.6         1.4 versicolor
93           5.8         2.6          4.0         1.2 versicolor
94           5.0         2.3          3.3         1.0 versicolor
95           5.6         2.7          4.2         1.3 versicolor
96           5.7         3.0          4.2         1.2 versicolor
97           5.7         2.9          4.2         1.3 versicolor
98           6.2         2.9          4.3         1.3 versicolor
99           5.1         2.5          3.0         1.1 versicolor
100          5.7         2.8          4.1         1.3 versicolor
101          6.3         3.3          6.0         2.5  virginica
102          5.8         2.7          5.1         1.9  virginica
103          7.1         3.0          5.9         2.1  virginica
104          6.3         2.9          5.6         1.8  virginica
105          6.5         3.0          5.8         2.2  virginica
106          7.6         3.0          6.6         2.1  virginica
107          4.9         2.5          4.5         1.7  virginica
108          7.3         2.9          6.3         1.8  virginica
109          6.7         2.5          5.8         1.8  virginica
110          7.2         3.6          6.1         2.5  virginica
111          6.5         3.2          5.1         2.0  virginica
112          6.4         2.7          5.3         1.9  virginica
113          6.8         3.0          5.5         2.1  virginica
114          5.7         2.5          5.0         2.0  virginica
115          5.8         2.8          5.1         2.4  virginica
116          6.4         3.2          5.3         2.3  virginica
117          6.5         3.0          5.5         1.8  virginica
118          7.7         3.8          6.7         2.2  virginica
119          7.7         2.6          6.9         2.3  virginica
120          6.0         2.2          5.0         1.5  virginica
121          6.9         3.2          5.7         2.3  virginica
122          5.6         2.8          4.9         2.0  virginica
123          7.7         2.8          6.7         2.0  virginica
124          6.3         2.7          4.9         1.8  virginica
125          6.7         3.3          5.7         2.1  virginica
126          7.2         3.2          6.0         1.8  virginica
127          6.2         2.8          4.8         1.8  virginica
128          6.1         3.0          4.9         1.8  virginica
129          6.4         2.8          5.6         2.1  virginica
130          7.2         3.0          5.8         1.6  virginica
131          7.4         2.8          6.1         1.9  virginica
132          7.9         3.8          6.4         2.0  virginica
133          6.4         2.8          5.6         2.2  virginica
134          6.3         2.8          5.1         1.5  virginica
135          6.1         2.6          5.6         1.4  virginica
136          7.7         3.0          6.1         2.3  virginica
137          6.3         3.4          5.6         2.4  virginica
138          6.4         3.1          5.5         1.8  virginica
139          6.0         3.0          4.8         1.8  virginica
140          6.9         3.1          5.4         2.1  virginica
141          6.7         3.1          5.6         2.4  virginica
142          6.9         3.1          5.1         2.3  virginica
143          5.8         2.7          5.1         1.9  virginica
144          6.8         3.2          5.9         2.3  virginica
145          6.7         3.3          5.7         2.5  virginica
146          6.7         3.0          5.2         2.3  virginica
147          6.3         2.5          5.0         1.9  virginica
148          6.5         3.0          5.2         2.0  virginica
149          6.2         3.4          5.4         2.3  virginica
150          5.9         3.0          5.1         1.8  virginica
data.frame(list_data)
    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1            5.1         3.5          1.4         0.2     setosa
2            4.9         3.0          1.4         0.2     setosa
3            4.7         3.2          1.3         0.2     setosa
4            4.6         3.1          1.5         0.2     setosa
5            5.0         3.6          1.4         0.2     setosa
6            5.4         3.9          1.7         0.4     setosa
7            4.6         3.4          1.4         0.3     setosa
8            5.0         3.4          1.5         0.2     setosa
9            4.4         2.9          1.4         0.2     setosa
10           4.9         3.1          1.5         0.1     setosa
11           5.4         3.7          1.5         0.2     setosa
12           4.8         3.4          1.6         0.2     setosa
13           4.8         3.0          1.4         0.1     setosa
14           4.3         3.0          1.1         0.1     setosa
15           5.8         4.0          1.2         0.2     setosa
16           5.7         4.4          1.5         0.4     setosa
17           5.4         3.9          1.3         0.4     setosa
18           5.1         3.5          1.4         0.3     setosa
19           5.7         3.8          1.7         0.3     setosa
20           5.1         3.8          1.5         0.3     setosa
21           5.4         3.4          1.7         0.2     setosa
22           5.1         3.7          1.5         0.4     setosa
23           4.6         3.6          1.0         0.2     setosa
24           5.1         3.3          1.7         0.5     setosa
25           4.8         3.4          1.9         0.2     setosa
26           5.0         3.0          1.6         0.2     setosa
27           5.0         3.4          1.6         0.4     setosa
28           5.2         3.5          1.5         0.2     setosa
29           5.2         3.4          1.4         0.2     setosa
30           4.7         3.2          1.6         0.2     setosa
31           4.8         3.1          1.6         0.2     setosa
32           5.4         3.4          1.5         0.4     setosa
33           5.2         4.1          1.5         0.1     setosa
34           5.5         4.2          1.4         0.2     setosa
35           4.9         3.1          1.5         0.2     setosa
36           5.0         3.2          1.2         0.2     setosa
37           5.5         3.5          1.3         0.2     setosa
38           4.9         3.6          1.4         0.1     setosa
39           4.4         3.0          1.3         0.2     setosa
40           5.1         3.4          1.5         0.2     setosa
41           5.0         3.5          1.3         0.3     setosa
42           4.5         2.3          1.3         0.3     setosa
43           4.4         3.2          1.3         0.2     setosa
44           5.0         3.5          1.6         0.6     setosa
45           5.1         3.8          1.9         0.4     setosa
46           4.8         3.0          1.4         0.3     setosa
47           5.1         3.8          1.6         0.2     setosa
48           4.6         3.2          1.4         0.2     setosa
49           5.3         3.7          1.5         0.2     setosa
50           5.0         3.3          1.4         0.2     setosa
51           7.0         3.2          4.7         1.4 versicolor
52           6.4         3.2          4.5         1.5 versicolor
53           6.9         3.1          4.9         1.5 versicolor
54           5.5         2.3          4.0         1.3 versicolor
55           6.5         2.8          4.6         1.5 versicolor
56           5.7         2.8          4.5         1.3 versicolor
57           6.3         3.3          4.7         1.6 versicolor
58           4.9         2.4          3.3         1.0 versicolor
59           6.6         2.9          4.6         1.3 versicolor
60           5.2         2.7          3.9         1.4 versicolor
61           5.0         2.0          3.5         1.0 versicolor
62           5.9         3.0          4.2         1.5 versicolor
63           6.0         2.2          4.0         1.0 versicolor
64           6.1         2.9          4.7         1.4 versicolor
65           5.6         2.9          3.6         1.3 versicolor
66           6.7         3.1          4.4         1.4 versicolor
67           5.6         3.0          4.5         1.5 versicolor
68           5.8         2.7          4.1         1.0 versicolor
69           6.2         2.2          4.5         1.5 versicolor
70           5.6         2.5          3.9         1.1 versicolor
71           5.9         3.2          4.8         1.8 versicolor
72           6.1         2.8          4.0         1.3 versicolor
73           6.3         2.5          4.9         1.5 versicolor
74           6.1         2.8          4.7         1.2 versicolor
75           6.4         2.9          4.3         1.3 versicolor
76           6.6         3.0          4.4         1.4 versicolor
77           6.8         2.8          4.8         1.4 versicolor
78           6.7         3.0          5.0         1.7 versicolor
79           6.0         2.9          4.5         1.5 versicolor
80           5.7         2.6          3.5         1.0 versicolor
81           5.5         2.4          3.8         1.1 versicolor
82           5.5         2.4          3.7         1.0 versicolor
83           5.8         2.7          3.9         1.2 versicolor
84           6.0         2.7          5.1         1.6 versicolor
85           5.4         3.0          4.5         1.5 versicolor
86           6.0         3.4          4.5         1.6 versicolor
87           6.7         3.1          4.7         1.5 versicolor
88           6.3         2.3          4.4         1.3 versicolor
89           5.6         3.0          4.1         1.3 versicolor
90           5.5         2.5          4.0         1.3 versicolor
91           5.5         2.6          4.4         1.2 versicolor
92           6.1         3.0          4.6         1.4 versicolor
93           5.8         2.6          4.0         1.2 versicolor
94           5.0         2.3          3.3         1.0 versicolor
95           5.6         2.7          4.2         1.3 versicolor
96           5.7         3.0          4.2         1.2 versicolor
97           5.7         2.9          4.2         1.3 versicolor
98           6.2         2.9          4.3         1.3 versicolor
99           5.1         2.5          3.0         1.1 versicolor
100          5.7         2.8          4.1         1.3 versicolor
101          6.3         3.3          6.0         2.5  virginica
102          5.8         2.7          5.1         1.9  virginica
103          7.1         3.0          5.9         2.1  virginica
104          6.3         2.9          5.6         1.8  virginica
105          6.5         3.0          5.8         2.2  virginica
106          7.6         3.0          6.6         2.1  virginica
107          4.9         2.5          4.5         1.7  virginica
108          7.3         2.9          6.3         1.8  virginica
109          6.7         2.5          5.8         1.8  virginica
110          7.2         3.6          6.1         2.5  virginica
111          6.5         3.2          5.1         2.0  virginica
112          6.4         2.7          5.3         1.9  virginica
113          6.8         3.0          5.5         2.1  virginica
114          5.7         2.5          5.0         2.0  virginica
115          5.8         2.8          5.1         2.4  virginica
116          6.4         3.2          5.3         2.3  virginica
117          6.5         3.0          5.5         1.8  virginica
118          7.7         3.8          6.7         2.2  virginica
119          7.7         2.6          6.9         2.3  virginica
120          6.0         2.2          5.0         1.5  virginica
121          6.9         3.2          5.7         2.3  virginica
122          5.6         2.8          4.9         2.0  virginica
123          7.7         2.8          6.7         2.0  virginica
124          6.3         2.7          4.9         1.8  virginica
125          6.7         3.3          5.7         2.1  virginica
126          7.2         3.2          6.0         1.8  virginica
127          6.2         2.8          4.8         1.8  virginica
128          6.1         3.0          4.9         1.8  virginica
129          6.4         2.8          5.6         2.1  virginica
130          7.2         3.0          5.8         1.6  virginica
131          7.4         2.8          6.1         1.9  virginica
132          7.9         3.8          6.4         2.0  virginica
133          6.4         2.8          5.6         2.2  virginica
134          6.3         2.8          5.1         1.5  virginica
135          6.1         2.6          5.6         1.4  virginica
136          7.7         3.0          6.1         2.3  virginica
137          6.3         3.4          5.6         2.4  virginica
138          6.4         3.1          5.5         1.8  virginica
139          6.0         3.0          4.8         1.8  virginica
140          6.9         3.1          5.4         2.1  virginica
141          6.7         3.1          5.6         2.4  virginica
142          6.9         3.1          5.1         2.3  virginica
143          5.8         2.7          5.1         1.9  virginica
144          6.8         3.2          5.9         2.3  virginica
145          6.7         3.3          5.7         2.5  virginica
146          6.7         3.0          5.2         2.3  virginica
147          6.3         2.5          5.0         1.9  virginica
148          6.5         3.0          5.2         2.0  virginica
149          6.2         3.4          5.4         2.3  virginica
150          5.9         3.0          5.1         1.8  virginica

Import data in R

The iris dataset already exists in the R environment. We can import data into R from different sources:

  • from a package without loading it using the library function.
data("spam", package = "kernlab")
# data structure
str(spam)
'data.frame':   4601 obs. of  58 variables:
 $ make             : num  0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
 $ address          : num  0.64 0.28 0 0 0 0 0 0 0 0.12 ...
 $ all              : num  0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
 $ num3d            : num  0 0 0 0 0 0 0 0 0 0 ...
 $ our              : num  0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
 $ over             : num  0 0.28 0.19 0 0 0 0 0 0 0.32 ...
 $ remove           : num  0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
 $ internet         : num  0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
 $ order            : num  0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
 $ mail             : num  0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
 $ receive          : num  0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
 $ will             : num  0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
 $ people           : num  0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
 $ report           : num  0 0.21 0 0 0 0 0 0 0 0 ...
 $ addresses        : num  0 0.14 1.75 0 0 0 0 0 0 0.12 ...
 $ free             : num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
 $ business         : num  0 0.07 0.06 0 0 0 0 0 0 0 ...
 $ email            : num  1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
 $ you              : num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
 $ credit           : num  0 0 0.32 0 0 0 0 0 3.53 0.06 ...
 $ your             : num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
 $ font             : num  0 0 0 0 0 0 0 0 0 0 ...
 $ num000           : num  0 0.43 1.16 0 0 0 0 0 0 0.19 ...
 $ money            : num  0 0.43 0.06 0 0 0 0 0 0.15 0 ...
 $ hp               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ hpl              : num  0 0 0 0 0 0 0 0 0 0 ...
 $ george           : num  0 0 0 0 0 0 0 0 0 0 ...
 $ num650           : num  0 0 0 0 0 0 0 0 0 0 ...
 $ lab              : num  0 0 0 0 0 0 0 0 0 0 ...
 $ labs             : num  0 0 0 0 0 0 0 0 0 0 ...
 $ telnet           : num  0 0 0 0 0 0 0 0 0 0 ...
 $ num857           : num  0 0 0 0 0 0 0 0 0 0 ...
 $ data             : num  0 0 0 0 0 0 0 0 0.15 0 ...
 $ num415           : num  0 0 0 0 0 0 0 0 0 0 ...
 $ num85            : num  0 0 0 0 0 0 0 0 0 0 ...
 $ technology       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ num1999          : num  0 0.07 0 0 0 0 0 0 0 0 ...
 $ parts            : num  0 0 0 0 0 0 0 0 0 0 ...
 $ pm               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ direct           : num  0 0 0.06 0 0 0 0 0 0 0 ...
 $ cs               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ meeting          : num  0 0 0 0 0 0 0 0 0 0 ...
 $ original         : num  0 0 0.12 0 0 0 0 0 0.3 0 ...
 $ project          : num  0 0 0 0 0 0 0 0 0 0.06 ...
 $ re               : num  0 0 0.06 0 0 0 0 0 0 0 ...
 $ edu              : num  0 0 0.06 0 0 0 0 0 0 0 ...
 $ table            : num  0 0 0 0 0 0 0 0 0 0 ...
 $ conference       : num  0 0 0 0 0 0 0 0 0 0 ...
 $ charSemicolon    : num  0 0 0.01 0 0 0 0 0 0 0.04 ...
 $ charRoundbracket : num  0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
 $ charSquarebracket: num  0 0 0 0 0 0 0 0 0 0 ...
 $ charExclamation  : num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
 $ charDollar       : num  0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
 $ charHash         : num  0 0.048 0.01 0 0 0 0 0 0.022 0 ...
 $ capitalAve       : num  3.76 5.11 9.82 3.54 3.54 ...
 $ capitalLong      : num  61 101 485 40 40 15 4 11 445 43 ...
 $ capitalTotal     : num  278 1028 2259 191 191 ...
 $ type             : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...

  • Comma Separated Value file

Exercise: import all the csv files in a list using a for loop.

data_list <- list() # creating an empty list.
dir()
 [1] "AIMSRWlogo.jpg"                       
 [2] "calc.webp"                            
 [3] "data"                                 
 [4] "Day1.zip"                             
 [5] "Day3.zip"                             
 [6] "pipe.jpeg"                            
 [7] "ReadingWeekPracticals_files"          
 [8] "ReadingWeekPracticals.html"           
 [9] "ReadingWeekPracticals.qmd"            
[10] "ReadingWeekPracticals.rmarkdown"      
[11] "ReadingWeekPracticals.Rmd"            
[12] "rsconnect"                            
[13] "Screenshot 2024-02-20 at 17.05.25.png"
[14] "title-slide.html"                     
# List the files stored in data/csv and build their relative paths.
path <- "./data/csv/"
file_names <- dir(path)
# paste0() recycles a zero-length vector to "", which would turn an empty
# folder into the bogus file name "./data/csv/"; only prefix when files exist.
if (length(file_names) > 0) {
  file_names <- paste0(path, file_names)
}
# Import everything. seq_along() runs zero iterations when there is nothing to
# read, whereas 1:length(file_names) would iterate over c(1, 0).
for (f in seq_along(file_names)) {
  data_list[[f]] <- read.csv(file_names[f])
}

# importing using lapply: one data frame per file, in the order of file_names
data_list2 <- lapply(file_names, FUN = read.csv)
# Name each element after its file, dropping the "-cancer-1.csv" or
# "-data-1.csv" suffix. The dot is escaped and the pattern anchored so that
# only a literal trailing suffix is removed; basename(file_names) keeps the
# names aligned with the files that were actually read.
names(data_list2) <- sub("-(cancer|data)-1\\.csv$", "", basename(file_names))
# check the dimension of each element of data_list2
# vapply() always returns a 2-row integer matrix (rows, columns), even when
# data_list2 is empty, so the rownames assignment below cannot fail.
dims <- vapply(data_list2, FUN = dim, FUN.VALUE = integer(2))
rownames(dims) <- c("n", "p")
dims
  colon leukemia prostate
n    62       72       79
p  2001     3572      501
# calculate k = n/p (number of rows divided by number of columns), rounded to
# two decimals; vapply() guarantees one numeric value per dataset
k <- vapply(
  data_list2,
  FUN = function(x) round(nrow(x) / ncol(x), 2),
  FUN.VALUE = numeric(1)
)

rbind(dims, k)
    colon leukemia prostate
n   62.00    72.00    79.00
p 2001.00  3572.00   501.00
k    0.03     0.02     0.16

Controls with: if / else

if (condition/Boolean expression){
  ## code to be executed
}

Example

x <- 3
if (x < 4){
  print(TRUE)
} else {
  print(FALSE)
}
[1] TRUE
# one line: ifelse() is the vectorised counterpart of if / else.
# TRUE / FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
ifelse(x < 4, TRUE, FALSE)
[1] TRUE

We can embed if in existing if and/or else statements.

# Nested conditions: an if / else can sit inside either branch of another one.
if (x < 4){
  if (x != 0){
    print("x is not equal to zero.")
  } else {
    print("x is equal to zero")
  }
  
  print("x is less than 4")
} else {
  # this branch is reached when x >= 4, so the test below is always TRUE here;
  # it is kept only to show an if / else nested inside an else
  if (x > 1){
    print("x is greater than 1.")
  } else {
    print("x is less than or equal to 1")
  }
  # x == 4 also lands in this branch, hence "or equal to"
  print("x is greater than or equal to 4.")
}
[1] "x is not equal to zero."
[1] "x is less than 4"

Loops

for

for (i in vector){
  ## code to be executed
}
m <- 6
for (i in 1:m) print(i)
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
for (i in 1:m) {
  print(i)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6

Exercise:

Write a for loop that checks, for each of the first 10 positive integers, whether it is odd or even.

## TODO

while

while (condition){
  ## code to be executed
}
# Print the integers 0 to 10 with a while loop that is exited through break.
i <- 0 # the counter must be initialised before the loop increments it
while (TRUE) { # the condition is always TRUE: without break this never stops
  print(i)
  if (i >= 10) break # stopping condition: leave the loop once i reaches 10
  i <- i+1
}
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10

repeat

i <- 0
repeat{
  print(i)
  i <- i + 1
  if (i > 10) break # repeat until condition holds.
}
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10

Apply Functions Over Array Margins

  • apply: return a vector or array or list of values obtained by applying a function to margins of an array or matrix.
apply(iris[-5], MARGIN = 2, mean) # MARGIN = 2 means column-wise
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333 
  • sapply: use ?sapply to check the documentation.
sapply(iris[-5], mean)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333 

sapply can also return a list.

sapply(iris, summary)
$Sepal.Length
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.300   5.100   5.800   5.843   6.400   7.900 

$Sepal.Width
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   2.800   3.000   3.057   3.300   4.400 

$Petal.Length
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.600   4.350   3.758   5.100   6.900 

$Petal.Width
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.100   0.300   1.300   1.199   1.800   2.500 

$Species
    setosa versicolor  virginica 
        50         50         50 

  • lapply: returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X
# lapply has no MARGIN argument: it applies mean to every column of iris[-5]
lapply(iris[-5], mean) # the result is a named list, one element per column
$Sepal.Length
[1] 5.843333

$Sepal.Width
[1] 3.057333

$Petal.Length
[1] 3.758

$Petal.Width
[1] 1.199333

tapply: check the documentation

# xx is the grouping factor: the sepal lengths are split by species and each
# group is averaged
xx <- factor(iris$Species)
tapply(iris$Sepal.Length, xx, mean)
    setosa versicolor  virginica 
     5.006      5.936      6.588 

vapply: check the documentation

vapply(X = as.list(iris[-5]), quantile, 
       c("0%" = 0, "25%" = 0, "50%" = 0, "75%" = 0, "100%" = 0))
     Sepal.Length Sepal.Width Petal.Length Petal.Width
0%            4.3         2.0         1.00         0.1
25%           5.1         2.8         1.60         0.3
50%           5.8         3.0         4.35         1.3
75%           6.4         3.3         5.10         1.8
100%          7.9         4.4         6.90         2.5

Define functions in R

function_name <- function(arg1, arg2, ...){
  # code to be executed
}
# Add one to x; the value of the last expression is what the function returns.
pp <- function(x) {
  x + 1
}

i <- 1
(i <- pp(i))
[1] 2

Exercise:

Draw the flowchart for solving the quadratic equation \(ax^2+bx+c=0\) and write an R function that gives the solutions and comments on them according to the value of the discriminant.

Solution

# Solve the quadratic equation a*x^2 + b*x + c = 0.
#
# Prints a one-line comment on the nature of the roots, chosen from the sign of
# the discriminant, and returns both roots.
#
# Arguments:
#   a  coefficient of x^2; a single finite number different from 0
#   b  coefficient of x; a single finite number
#   c  constant term; a single finite number
# Value:
#   a vector of length 2 named "x1" and "x2"; it is numeric when the
#   discriminant is non-negative and complex otherwise.
# Errors:
#   stops when a coefficient is not a single finite number or when a is 0.
solve_quadratic <- function(a, b, c){
  # validate up front: a vector, NA or non-numeric coefficient would otherwise
  # fail inside if () with an unrelated error message
  for (coefficient in list(a, b, c)) {
    if (!is.numeric(coefficient) || length(coefficient) != 1L ||
        !is.finite(coefficient)) {
      stop("a, b and c should each be a single finite number")
    }
  }
  # testing for a: with a == 0 the equation is not quadratic
  if (a == 0){
    stop("a should not be equal to 0")
  }
  # calculate the discriminant
  d <- b^2 - 4*a*c
  if (d > 0){
    cat("We have two distinct roots\n") # \n adds a new line
    x1 <- (-b + sqrt(d))/(2*a)
    x2 <- (-b - sqrt(d))/(2*a)
  } else if (d == 0){
    cat("We have two repeated roots\n")
    x1 <- x2 <- -b/(2*a)
  } else {
    cat("We have two complex roots\n")
    # d is negative here, so sqrt(-d) is real and 1i makes it imaginary
    x1 <- (-b + 1i*sqrt(-d))/(2*a)
    x2 <- (-b - 1i*sqrt(-d))/(2*a)
  }
  c("x1" = x1, "x2" = x2)
}

solve_quadratic(1, -5, 6)
We have two distinct roots
x1 x2 
 3  2 

Exercise:

  1. Set the seed to 19223
  2. Create data frame with following columns
  • IDs: using the function paste or paste0. Example: STUD01, …, STUD60
  • Age: using runif() function with minimum and maximum ages equal to 23 and 30 respectively
  • Gender: sample Female and Male with replacement for the 60 students. Convert it into factor.
  • Grade: 60 grades from a normal distribution with mean = 68 and standard deviation 22.

  3. Write a function that takes your simulated data and returns its summary and a new version of it with an additional column Grade_Cat where
  • Grade_Cat == F if Grade < 60,
  • Grade_Cat == P if Grade >= 60 & Grade < 70,
  • Grade_Cat == GP if Grade >= 70 & Grade < 80,
  • Grade_Cat == VGP if Grade >= 80 & Grade < 85,
  • else Grade_Cat == D

Your function should also return the barplot of Grade_Cat.

Solution

# 1
# Fix the random number generator so that the simulated data are reproducible.
set.seed(19223)
# 2
# Simulate one row per student. The random draws are made in the order the
# columns are written (Age, then Gender, then Grade), so reordering the
# columns would change the simulated values.
n <- 60
stud_df <- data.frame(
  # "%02d" pads the number with a leading zero to two digits: STUD01, ..., STUD60
  IDs = paste0("STUD", sprintf("%02d", 1:n)),
  # ages drawn uniformly between 23 and 30
  Age = runif(n, 23, 30),
  # gender sampled with replacement and stored as a factor
  Gender = factor(sample(c("Female", "Male"), size = n, replace = TRUE)),
  # grades drawn from a normal distribution with mean 68 and sd 22
  Grade = rnorm(n, 68, 22)
)

summary(stud_df)
     IDs                 Age           Gender       Grade      
 Length:60          Min.   :23.20   Female:29   Min.   :16.88  
 Class :character   1st Qu.:24.60   Male  :31   1st Qu.:50.57  
 Mode  :character   Median :26.21               Median :60.95  
                    Mean   :26.35               Mean   :61.73  
                    3rd Qu.:27.95               3rd Qu.:76.47  
                    Max.   :29.96               Max.   :97.75  

# 3
# Summarise a data frame of students and categorise their grades.
#
# Arguments:
#   df  a data frame with a numeric column named Grade
# Value:
#   a list with three elements:
#     summary  the result of summary(df), computed before Grade_Cat is added
#     data     df with an extra factor column Grade_Cat
#     plot     the value returned by barplot(), i.e. the positions of the bars
#              (not a plot object); the barplot itself is drawn as a side effect
# Grade_Cat is F below 60, P in [60, 70), GP in [70, 80), VGP in [80, 85) and
# D from 85 upwards. Its levels are ordered from the lowest to the highest
# category so that tables and the barplot follow the grading scale.
df_summary <- function(df){
  # fail early with a clear message when Grade is missing or not numeric
  stopifnot(is.data.frame(df), is.numeric(df[["Grade"]]))
  s <- summary(df)
  # function to categorize grades: right = FALSE makes the intervals closed on
  # the left, e.g. exactly 60 is a P; include.lowest = TRUE keeps Inf in D
  grade_cat <- function(g) {
    cut(g,
        breaks = c(-Inf, 60, 70, 80, 85, Inf),
        labels = c("F", "P", "GP", "VGP", "D"),
        right = FALSE, include.lowest = TRUE)
  }
  df$Grade_Cat <- grade_cat(df[["Grade"]])
  
  # the barplot
  p <- barplot(table(df$Grade_Cat))
  list(summary = s, data = df, plot = p)
}

ds <- df_summary(stud_df)

ds$summary
     IDs                 Age           Gender       Grade      
 Length:60          Min.   :23.20   Female:29   Min.   :16.88  
 Class :character   1st Qu.:24.60   Male  :31   1st Qu.:50.57  
 Mode  :character   Median :26.21               Median :60.95  
                    Mean   :26.35               Mean   :61.73  
                    3rd Qu.:27.95               3rd Qu.:76.47  
                    Max.   :29.96               Max.   :97.75  
ds$data
      IDs      Age Gender    Grade Grade_Cat
1  STUD01 27.92364   Male 76.20941        GP
2  STUD02 23.84776 Female 76.62890        GP
3  STUD03 23.38876   Male 56.20206         F
4  STUD04 23.69847 Female 59.36847         F
5  STUD05 25.23842 Female 91.39844         D
6  STUD06 28.81414   Male 49.87500         F
7  STUD07 29.93506   Male 59.32237         F
8  STUD08 26.32559 Female 78.38143        GP
9  STUD09 28.62278 Female 65.47434         P
10 STUD10 25.68633 Female 64.89740         P
11 STUD11 23.44684   Male 73.66076        GP
12 STUD12 23.22759 Female 16.94582         F
13 STUD13 25.59096 Female 69.00903         P
14 STUD14 25.94444   Male 64.68572         P
15 STUD15 23.28955 Female 22.30563         F
16 STUD16 25.18997   Male 51.39189         F
17 STUD17 27.79130   Male 59.72791         F
18 STUD18 25.99469   Male 50.17311         F
19 STUD19 28.66082   Male 48.75778         F
20 STUD20 25.27046 Female 16.88066         F
21 STUD21 29.59288 Female 59.45890         F
22 STUD22 26.70046   Male 80.27908       VGP
23 STUD23 24.11436   Male 53.09109         F
24 STUD24 26.11601   Male 51.67253         F
25 STUD25 29.28542 Female 86.72166         D
26 STUD26 29.55193   Male 48.17045         F
27 STUD27 28.85618 Female 63.18835         P
28 STUD28 23.68760   Male 78.80509        GP
29 STUD29 25.33464   Male 24.15672         F
30 STUD30 29.96413 Female 79.90677        GP
31 STUD31 26.87976 Female 50.94208         F
32 STUD32 24.70885 Female 76.41202        GP
33 STUD33 28.99526   Male 29.88608         F
34 STUD34 26.49617   Male 56.05513         F
35 STUD35 27.01572   Male 74.96593        GP
36 STUD36 27.36626   Male 84.32467       VGP
37 STUD37 24.53548 Female 57.49664         F
38 STUD38 23.76493 Female 85.58672         D
39 STUD39 24.46011 Female 39.49676         F
40 STUD40 29.03970 Female 88.30244         D
41 STUD41 24.62679 Female 67.29825         P
42 STUD42 27.84071   Male 56.76052         F
43 STUD43 26.34955   Male 35.91231         F
44 STUD44 23.24141   Male 43.75341         F
45 STUD45 27.84368 Female 67.83593         P
46 STUD46 26.79743   Male 75.83115        GP
47 STUD47 25.55552 Female 97.75107         D
48 STUD48 23.30528 Female 65.09067         P
49 STUD49 26.13170   Male 97.43973         D
50 STUD50 23.19650 Female 93.28693         D
51 STUD51 29.93580 Female 42.10498         F
52 STUD52 23.72384 Female 38.84387         F
53 STUD53 29.93277   Male 63.03584         P
54 STUD54 28.03977   Male 38.42682         F
55 STUD55 25.11203 Female 61.35042         P
56 STUD56 28.50399   Male 60.55292         P
57 STUD57 25.88938   Male 51.39525         F
58 STUD58 27.89994   Male 83.62630       VGP
59 STUD59 26.28096   Male 92.46526         D
60 STUD60 26.68884 Female 50.69787         F
ds$plot
     [,1]
[1,]  0.7
[2,]  1.9
[3,]  3.1
[4,]  4.3
[5,]  5.5

Packages

R packages are extensions to the R statistical programming language. R packages contain code, data, and documentation in a standardised collection format that can be installed by users of R, typically via a centralised software repository such as CRAN. Wikipedia

Most used packages in R data Science

  • dplyr: used for data manipulation tasks
  • ggplot2: ggplot2 is a powerful data visualization package known for its aesthetically pleasing and customizable plots.
  • plotly: cousin of ggplot2 that makes nice plots too.
  • tidyr: for data transformation/manipulation
  • caret: statistical models

  • lubridate: handling date
  • stringr: string processing and regular expressions.
  • tidymodels: a collection of packages for modelling and machine learning; its broom package converts model summaries into tidy tables
  • kableExtra: printing nice tables and generating html or LaTeX codes.

Package installation

Way 1

install.packages("tidyverse", dependencies = TRUE)

Way 2

Regular expressions

Read here !

Data scraping

df <- data.frame(
  X = c(
    "<Sheryl>-F_34",
  "<Kisha>-F_45", 
  "<Brandon>-N_33",
  "<Sharon>-F_38", 
  "<Penny>-F_58",
  "<Justin>-M_41", 
  "<Patricia>-F_84"
  )
)

(y <- gsub("[<>]", "", df$X))
[1] "Sheryl-F_34"   "Kisha-F_45"    "Brandon-N_33"  "Sharon-F_38"  
[5] "Penny-F_58"    "Justin-M_41"   "Patricia-F_84"
(z <- stringr::str_split(y, pattern = "[-_]"))
[[1]]
[1] "Sheryl" "F"      "34"    

[[2]]
[1] "Kisha" "F"     "45"   

[[3]]
[1] "Brandon" "N"       "33"     

[[4]]
[1] "Sharon" "F"      "38"    

[[5]]
[1] "Penny" "F"     "58"   

[[6]]
[1] "Justin" "M"      "41"    

[[7]]
[1] "Patricia" "F"        "84"      

Method 2

# Return the p-th element of x (NA when x has fewer than p elements).
extract_fun <- function(x, p) x[p]
# vapply() guarantees a character vector with exactly one entry per element of
# z, even when z is empty (sapply() would return an empty list in that case).
names <- vapply(z, extract_fun, FUN.VALUE = character(1), p = 1)
sex <- vapply(z, extract_fun, FUN.VALUE = character(1), p = 2)
age <- as.numeric(vapply(z, extract_fun, FUN.VALUE = character(1), p = 3))

l <- list(x = c(NA, 2, 4, 9, NA, 10), y = runif(6))
sapply(l, mean, na.rm = TRUE)
        x         y 
6.2500000 0.3827955 

Extract names

library(stringr)
# Build the cleaned table directly from the raw strings with stringr.
df2 <- data.frame(
  # the first run of word characters is the name ("<" is not a word character)
  names = str_extract(string = df$X, pattern = "\\w+"),
  # "-F_" -> "F": extract the letter with its delimiters, then strip "-" and "_"
  sex = gsub("[-_]", "", str_extract(string = df$X, pattern = "-?[A-Z]_")),
  # the run of digits is the age; stored as a number, as in Method 2
  age = as.numeric(str_extract(string = df$X, pattern = "[0-9]+"))
)

df2
     names sex age
1   Sheryl   F  34
2    Kisha   F  45
3  Brandon   N  33
4   Sharon   F  38
5    Penny   F  58
6   Justin   M  41
7 Patricia   F  84

(names <- sapply(z, function(x) x[[1]]))
[1] "Sheryl"   "Kisha"    "Brandon"  "Sharon"   "Penny"    "Justin"   "Patricia"
# sex <- #TODO
# age <- 

Data manipulation using tidyverse, tibble, tidyr, reshape2, kableExtra

Pipe

iris[-5] %>% sapply(mean)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333 
iris[-5] %>% colMeans() %>% barplot()
# barplot(colMeans(iris[-5]))

Data manipulation using tidyverse/dplyr

# importing data using openxlsx
library(openxlsx)
library(tidyverse)
dta <- read.xlsx("./data/excel/RwandaDistrictVegetation.xlsx", 
                 startRow = 2, check.names = TRUE)

head(dta[1:10]) # not mandatory!
    District JAN FEB MAR APR      MAY      JUN       JUL       AUG       SEP
1 Nyarugenge  NA  NA  NA  NA 127.1336 114.1947  97.70739  95.69933  88.72997
2     Gasabo  NA  NA  NA  NA 134.2744 121.7469 101.76102  96.82108  89.28289
3   Kicukiro  NA  NA  NA  NA 127.1976 112.7093  95.87081  93.06528  86.07577
4     Nyanza  NA  NA  NA  NA 130.4076 115.4761 100.63236  95.56556  92.33541
5   Gisagara  NA  NA  NA  NA 124.8776 113.0878 100.30353  97.35343  94.04728
6  Nyaruguru  NA  NA  NA  NA 129.6224 122.6007 111.28178 102.56060 104.06047
dim(dta)
[1]  30 181

Reshaping data using the reshape2 package

library(reshape2)
library(lubridate) # to convert month-year into a column of dates
# Reshape from wide (one column per month of each year) to long (one row per
# district and month), then attach the year and a date column.
dta_reshaped <- melt(dta, id.vars = "District", variable.name = "Month") %>%
  mutate(
    # each of the 15 years fills a block of 12 months x 30 districts
    Year = rep(2000:2014, each = 12 * 30),
    # regular expressions !!! drop the ".<number>" part of the column names
    Month = gsub("\\.[0-9]+", "", Month),
    # build a "month - year" label and parse it into a date with lubridate
    MontYear = my(paste(Month, Year, sep = " - "))
  )


head(dta_reshaped, 100)
      District Month value Year   MontYear
1   Nyarugenge   JAN    NA 2000 2000-01-01
2       Gasabo   JAN    NA 2000 2000-01-01
3     Kicukiro   JAN    NA 2000 2000-01-01
4       Nyanza   JAN    NA 2000 2000-01-01
5     Gisagara   JAN    NA 2000 2000-01-01
6    Nyaruguru   JAN    NA 2000 2000-01-01
7         Huye   JAN    NA 2000 2000-01-01
8    Nyamagabe   JAN    NA 2000 2000-01-01
9      Ruhango   JAN    NA 2000 2000-01-01
10     Muhanga   JAN    NA 2000 2000-01-01
11     Kamonyi   JAN    NA 2000 2000-01-01
12     Karongi   JAN    NA 2000 2000-01-01
13     Rutsiro   JAN    NA 2000 2000-01-01
14      Rubavu   JAN    NA 2000 2000-01-01
15     Nyabihu   JAN    NA 2000 2000-01-01
16   Ngororero   JAN    NA 2000 2000-01-01
17      Rusizi   JAN    NA 2000 2000-01-01
18  Nyamasheke   JAN    NA 2000 2000-01-01
19     Rulindo   JAN    NA 2000 2000-01-01
20     Gakenke   JAN    NA 2000 2000-01-01
21     Musanze   JAN    NA 2000 2000-01-01
22      Burera   JAN    NA 2000 2000-01-01
23     Gicumbi   JAN    NA 2000 2000-01-01
24   Rwamagana   JAN    NA 2000 2000-01-01
25   Nyagatare   JAN    NA 2000 2000-01-01
26     Gatsibo   JAN    NA 2000 2000-01-01
27     Kayonza   JAN    NA 2000 2000-01-01
28      Kirehe   JAN    NA 2000 2000-01-01
29       Ngoma   JAN    NA 2000 2000-01-01
30    Bugesera   JAN    NA 2000 2000-01-01
31  Nyarugenge   FEB    NA 2000 2000-02-01
32      Gasabo   FEB    NA 2000 2000-02-01
33    Kicukiro   FEB    NA 2000 2000-02-01
34      Nyanza   FEB    NA 2000 2000-02-01
35    Gisagara   FEB    NA 2000 2000-02-01
36   Nyaruguru   FEB    NA 2000 2000-02-01
37        Huye   FEB    NA 2000 2000-02-01
38   Nyamagabe   FEB    NA 2000 2000-02-01
39     Ruhango   FEB    NA 2000 2000-02-01
40     Muhanga   FEB    NA 2000 2000-02-01
41     Kamonyi   FEB    NA 2000 2000-02-01
42     Karongi   FEB    NA 2000 2000-02-01
43     Rutsiro   FEB    NA 2000 2000-02-01
44      Rubavu   FEB    NA 2000 2000-02-01
45     Nyabihu   FEB    NA 2000 2000-02-01
46   Ngororero   FEB    NA 2000 2000-02-01
47      Rusizi   FEB    NA 2000 2000-02-01
48  Nyamasheke   FEB    NA 2000 2000-02-01
49     Rulindo   FEB    NA 2000 2000-02-01
50     Gakenke   FEB    NA 2000 2000-02-01
51     Musanze   FEB    NA 2000 2000-02-01
52      Burera   FEB    NA 2000 2000-02-01
53     Gicumbi   FEB    NA 2000 2000-02-01
54   Rwamagana   FEB    NA 2000 2000-02-01
55   Nyagatare   FEB    NA 2000 2000-02-01
56     Gatsibo   FEB    NA 2000 2000-02-01
57     Kayonza   FEB    NA 2000 2000-02-01
58      Kirehe   FEB    NA 2000 2000-02-01
59       Ngoma   FEB    NA 2000 2000-02-01
60    Bugesera   FEB    NA 2000 2000-02-01
61  Nyarugenge   MAR    NA 2000 2000-03-01
62      Gasabo   MAR    NA 2000 2000-03-01
63    Kicukiro   MAR    NA 2000 2000-03-01
64      Nyanza   MAR    NA 2000 2000-03-01
65    Gisagara   MAR    NA 2000 2000-03-01
66   Nyaruguru   MAR    NA 2000 2000-03-01
67        Huye   MAR    NA 2000 2000-03-01
68   Nyamagabe   MAR    NA 2000 2000-03-01
69     Ruhango   MAR    NA 2000 2000-03-01
70     Muhanga   MAR    NA 2000 2000-03-01
71     Kamonyi   MAR    NA 2000 2000-03-01
72     Karongi   MAR    NA 2000 2000-03-01
73     Rutsiro   MAR    NA 2000 2000-03-01
74      Rubavu   MAR    NA 2000 2000-03-01
75     Nyabihu   MAR    NA 2000 2000-03-01
76   Ngororero   MAR    NA 2000 2000-03-01
77      Rusizi   MAR    NA 2000 2000-03-01
78  Nyamasheke   MAR    NA 2000 2000-03-01
79     Rulindo   MAR    NA 2000 2000-03-01
80     Gakenke   MAR    NA 2000 2000-03-01
81     Musanze   MAR    NA 2000 2000-03-01
82      Burera   MAR    NA 2000 2000-03-01
83     Gicumbi   MAR    NA 2000 2000-03-01
84   Rwamagana   MAR    NA 2000 2000-03-01
85   Nyagatare   MAR    NA 2000 2000-03-01
86     Gatsibo   MAR    NA 2000 2000-03-01
87     Kayonza   MAR    NA 2000 2000-03-01
88      Kirehe   MAR    NA 2000 2000-03-01
89       Ngoma   MAR    NA 2000 2000-03-01
90    Bugesera   MAR    NA 2000 2000-03-01
91  Nyarugenge   APR    NA 2000 2000-04-01
92      Gasabo   APR    NA 2000 2000-04-01
93    Kicukiro   APR    NA 2000 2000-04-01
94      Nyanza   APR    NA 2000 2000-04-01
95    Gisagara   APR    NA 2000 2000-04-01
96   Nyaruguru   APR    NA 2000 2000-04-01
97        Huye   APR    NA 2000 2000-04-01
98   Nyamagabe   APR    NA 2000 2000-04-01
99     Ruhango   APR    NA 2000 2000-04-01
100    Muhanga   APR    NA 2000 2000-04-01
# dropping incomplete observations (e.g. the first months of 2000, whose value
# is NA in the rows printed above)
dta_reshaped <- dta_reshaped %>% na.omit() # removes any row containing NAs

Generate the LaTeX code of a table

# Average value per district for the years 2013 and 2014.
tab <- dta_reshaped %>% 
  # Year is numeric, so compare it with numbers rather than with strings that
  # only match through implicit coercion
  filter(Year %in% c(2013L, 2014L)) %>% 
  group_by(District, Year) %>% 
  # .groups = "drop" returns an ungrouped table
  summarise(Average = mean(value), .groups = "drop")

# render the table as LaTeX code and print that code
d <- kableExtra::kable(tab, format = "latex")
print(d)

\begin{tabular}{l|r|r}
\hline
District & Year & Average\\
\hline
Bugesera & 2013 & 112.0397\\
\hline
Bugesera & 2014 & 109.2777\\
\hline
Burera & 2013 & 117.2697\\
\hline
Burera & 2014 & 118.6370\\
\hline
Gakenke & 2013 & 104.8141\\
\hline
Gakenke & 2014 & 106.6274\\
\hline
Gasabo & 2013 & 115.4867\\
\hline
Gasabo & 2014 & 115.7981\\
\hline
Gatsibo & 2013 & 118.2808\\
\hline
Gatsibo & 2014 & 118.3266\\
\hline
Gicumbi & 2013 & 117.7375\\
\hline
Gicumbi & 2014 & 119.2816\\
\hline
Gisagara & 2013 & 119.4178\\
\hline
Gisagara & 2014 & 117.2617\\
\hline
Huye & 2013 & 116.4850\\
\hline
Huye & 2014 & 115.4567\\
\hline
Kamonyi & 2013 & 116.3597\\
\hline
Kamonyi & 2014 & 119.8323\\
\hline
Karongi & 2013 & 118.4783\\
\hline
Karongi & 2014 & 119.1900\\
\hline
Kayonza & 2013 & 108.6446\\
\hline
Kayonza & 2014 & 107.6869\\
\hline
Kicukiro & 2013 & 114.2078\\
\hline
Kicukiro & 2014 & 112.7312\\
\hline
Kirehe & 2013 & 102.0931\\
\hline
Kirehe & 2014 & 101.9623\\
\hline
Muhanga & 2013 & 114.9686\\
\hline
Muhanga & 2014 & 118.3283\\
\hline
Musanze & 2013 & 117.6789\\
\hline
Musanze & 2014 & 120.1941\\
\hline
Ngoma & 2013 & 115.8536\\
\hline
Ngoma & 2014 & 115.7909\\
\hline
Ngororero & 2013 & 120.8323\\
\hline
Ngororero & 2014 & 120.0071\\
\hline
Nyabihu & 2013 & 121.6922\\
\hline
Nyabihu & 2014 & 121.9895\\
\hline
Nyagatare & 2013 & 119.7193\\
\hline
Nyagatare & 2014 & 120.4654\\
\hline
Nyamagabe & 2013 & 118.6475\\
\hline
Nyamagabe & 2014 & 120.2933\\
\hline
Nyamasheke & 2013 & 122.0184\\
\hline
Nyamasheke & 2014 & 121.7682\\
\hline
Nyanza & 2013 & 119.9416\\
\hline
Nyanza & 2014 & 119.0656\\
\hline
Nyarugenge & 2013 & 113.6571\\
\hline
Nyarugenge & 2014 & 116.3058\\
\hline
Nyaruguru & 2013 & 120.9532\\
\hline
Nyaruguru & 2014 & 120.3208\\
\hline
Rubavu & 2013 & 118.5041\\
\hline
Rubavu & 2014 & 119.1539\\
\hline
Ruhango & 2013 & 115.4667\\
\hline
Ruhango & 2014 & 119.4698\\
\hline
Rulindo & 2013 & 119.6110\\
\hline
Rulindo & 2014 & 121.0036\\
\hline
Rusizi & 2013 & 127.2511\\
\hline
Rusizi & 2014 & 125.7777\\
\hline
Rutsiro & 2013 & 122.3922\\
\hline
Rutsiro & 2014 & 120.3553\\
\hline
Rwamagana & 2013 & 117.6581\\
\hline
Rwamagana & 2014 & 115.8099\\
\hline
\end{tabular}

Spread data: reshape from long to wide

Plotting with ggplot2 and plotly

# dta_reshaped$District %>% table() %>% names() %>% edit()
selected_dist <- c("Bugesera", "Gasabo", "Ngoma", "Kicukiro", "Rulindo")
p <- dta_reshaped %>% filter(District %in% selected_dist) %>% 
  mutate(District = factor(District)) %>% 
  ggplot(aes(x = MontYear, y = value, col = District)) + 
  geom_line()
print(p)

Updating a ggplot

library(plotly)
pp <- p + facet_wrap(~District)
print(pp)

Plotly

plotly::ggplotly(p)

Violin plot

Machine Learning

Unsupervised learning

Supervised learning