CS 424 Big Data Analytics

Session 5: Basics of R

Instructor: Dr. Bob Batzinger
Academic year: 2021/2022
Semester: 1

Begins June 2021

R Studio Interface

Scalar numbers

## a = 14.92        Output: 14.92 
##  round(a,1)      Output: 14.9 
##  as.integer(a)   Output: 14 
##  is.numeric(a)   Output: TRUE 
##  is.integer(a)   Output: FALSE 
##    a * 2         Output: 29.84 
##    a / 2         Output: 7.46 
##    a + 2         Output: 16.92 
##    a - 2         Output: 12.92 
##    a ^ 2         Output: 222.6064 
##    a %% 2        Output: 0.92

Strings

## s = "PYU CS Dept"        Output: PYU CS Dept 
##  substr(s,start=5,stop=6)    Output: CS 
##  grep("PYU",s)           Output: 1 
##  gsub("PYU","Payap",s)       Output: Payap CS Dept 
##  strsplit(s, " ")        Output: PYU CS Dept 
##  paste("a","=",a)        Output: a = 14.92 
##  toupper(s)              Output: PYU CS DEPT 
##  tolower(s)              Output: pyu cs dept 
##  nchar(s)                Output: 11

Vectors

## vector:              x            2 13 5 17 11 3 
##  reversed:           rev(x)       3 11 17 5 13 2 
##  selected entries:   x[c(1,3,5)]  2 5 11 
##  number of elements: length(x)    6 
##  order map:          order(x)     1 6 3 5 2 4 
##  sorted list:        x[order(x)]  2 3 5 11 13 17 
##  scaled value:       3 * x        6 39 15 51 33 9 
##  offset value:       3 + x        5 16 8 20 14 6 
##  squared values:     x * x        4 169 25 289 121 9 
##  sequence:           5:10         5 6 7 8 9 10 
##  average:            mean(x)      8.5 
##  std dev:            sd(x)        6.058052 
##  sum:                sum(x)       51 
##  sequence:           seq(0,3,.5)  0 0.5 1 1.5 2 2.5 3 
##  repeated num:       rep(a,3)     14.92 14.92 14.92 
##  vector of Labels:   LETTERS[1:8]     A B C D E F G H 
##  vector of labels:   letters[1:8]     a b c d e f g h

Matrix

## mtxa=rbind(c(1,2,3),
##  c(6,5,4),c(7,9,8))
## mtxa -------------
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    6    5    4
## [3,]    7    9    8
## t(mtxa) ----------
##      [,1] [,2] [,3]
## [1,]    1    6    7
## [2,]    2    5    9
## [3,]    3    4    8
## colMeans(mtxa) ------
## [1] 4.666667 5.333333 5.000000
## rowSums(mtxa) ------
## [1]  6 15 24
## diag(3) ---------
##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1
## det(mtxa) -------
## [1] 21
## mtxa * mtxa -----
##      [,1] [,2] [,3]
## [1,]    1    4    9
## [2,]   36   25   16
## [3,]   49   81   64

Matrix multiplication

\[\begin{pmatrix} 1 & 2 & 3\\ 6 & 5 & 4\\ 7 & 9 & 8\\ \end{pmatrix}\begin{pmatrix} 10\\ 12\\ 20\\ \end{pmatrix} = \begin{pmatrix} 1 \cdot 10 + 2 \cdot 12 + 3 \cdot 20\\ 6 \cdot 10 + 5 \cdot 12 + 4 \cdot 20\\ 7 \cdot 10 + 9 \cdot 12 + 8 \cdot 20\\ \end{pmatrix} = \begin{pmatrix} 94\\ 200\\ 338\\ \end{pmatrix}\]

## mtxa = rbind(c(1,2,3),c(6,5,4),c(7,9,8))
## mtxb = c(10,12,20)
## mmprod = mtxa %*% mtxb
##      [,1]
## [1,]   94
## [2,]  200
## [3,]  338

Algebraic Solutions

Algebraic Equations

\[\begin{eqnarray} x + 2y + 3z & = & 10\\ 6x + 5y + 4z & = & 12\\ 7x + 9y + 8z & = & 20\\ \end{eqnarray}\]

Rendering in R

## result = solve(mtxa,mtxb)
## [1]  1.523810 -3.619048  5.238095

Matrix version

\[\begin{pmatrix} 1 & 2 & 3 \\ 6 & 5 & 4 \\ 7 & 9 & 8 \\ \end{pmatrix}\begin{pmatrix} x\\ y\\ z\\ \end{pmatrix} = \begin{pmatrix} 10\\ 12\\ 20\\ \end{pmatrix}\]

\[\begin{pmatrix} 1 & 2 & 3\\ 0 & 1 & 2\\ 0 & 0 & 1\\ \end{pmatrix}\begin{pmatrix} x\\ y\\ z\\ \end{pmatrix} = \begin{pmatrix} 10.00000\\ 6.85714\\ 5.23810\\ \end{pmatrix}\]

\[\begin{pmatrix} x\\ y\\ z\\ \end{pmatrix} = \begin{pmatrix} 1.523810\\ -3.619048\\ 5.238095\\ \end{pmatrix} \]

Markov Chains

Transition map \[\begin{matrix} From/To & Fresh &Soph & Jr& Sr & Grad & Resign\\ Fresh &0.08& 0.72& 0& 0& 0 &0.2\\ Soph &0 & 0.08& 0.72& 0.1 &0 & 0.1\\ Jr & 0 & 0 & 0.08 & 0.82 & 0 & 0.1\\ Sr & 0 & 0 & 0& 0.1& 0.85 & 0.05\\ Grad & 0 & 0 & 0& 0 & 1 & 0\\ Resign & 0 & 0 &0 & 0 & 0 & 1\\ \end{matrix}\]

Markov Chain in R

# Per-step transition probabilities between student states. Each vector is
# one "from" state; its entries are the probabilities of moving to
# Fresh, Soph, Jr, Sr, Grad, Resign (in that order), so each sums to 1.
Fresh  <- c(0.08, 0.72, 0,    0,    0,    0.20)
Soph   <- c(0,    0.08, 0.72, 0.1,  0,    0.1)
Jr     <- c(0,    0,    0.08, 0.82, 0,    0.1)
Sr     <- c(0,    0,    0,    0.1,  0.85, 0.05)
Grad   <- c(0,    0,    0,    0,    1,    0)
Resign <- c(0,    0,    0,    0,    0,    1)
markov <- rbind(Fresh, Soph, Jr, Sr, Grad, Resign)

# Starting cohort: 1000 students in the first state, none anywhere else.
students <- c(1000, 0, 0, 0, 0, 0)

# Number of transitions to simulate.
n_steps <- 8

# One row per time point (row 1 is the starting cohort), allocated up front
# rather than grown with rbind() on every pass. The first row is labelled
# "students" and the remaining rows are unlabelled.
results <- matrix(NA_real_, nrow = n_steps + 1, ncol = length(students),
                  dimnames = list(c("students", rep("", n_steps)), NULL))
results[1, ] <- students
for (i in seq_len(n_steps)) {
  # Row vector times transition matrix advances the cohort by one step.
  students <- students %*% markov
  results[i + 1, ] <- students
}

Results of Markov Chain

Converting rates

# Per-country figures, one row per country code and one column per measure.
rate_table <- matrix(
  c(14.0,    2963, 6.6,
    14.0,   10110, 6.5,
    11.1, 1402112, 7.0,
    13.0,    3214, 2.2,
    17.4, 1380004, 3.7,
     6.0,   51780, 2.7),
  ncol = 3, byrow = TRUE,
  dimnames = list(
    c("AR", "AZ", "CN", "GE", "IN", "KR"),
    c("birthrate", "population", "diff.100")
  )
)
dat <- as.data.frame(rate_table)

# Product of the three columns, scaled down by 10000 and rounded to
# three decimals; appended to the table as a fourth column.
unmatched <- round(
  dat$birthrate * dat$population * dat$diff.100 / 10000,
  3
)
dat <- cbind(dat, unmatched)
birthrate population diff.100 unmatched
AR 14.0 2963 6.6 27.378
AZ 14.0 10110 6.5 92.001
CN 11.1 1402112 7.0 10894.410
GE 13.0 3214 2.2 9.192
IN 17.4 1380004 3.7 8884.466
KR 6.0 51780 2.7 83.884

Data frame

## dim(dat) ========
## [1] 6 4
## colnames(dat) ====
## [1] "birthrate"  "population" "diff.100"   "unmatched"
## rownames(dat) ====
## [1] "AR" "AZ" "CN" "GE" "IN" "KR"
## head(dat,2) ======
##    birthrate population diff.100 unmatched
## AR        14       2963      6.6    27.378
## AZ        14      10110      6.5    92.001
## tail(dat,2) ======
##    birthrate population diff.100 unmatched
## IN      17.4    1380004      3.7  8884.466
## KR       6.0      51780      2.7    83.884

Data frame inspection

## str(dat) ======
## 'data.frame':    6 obs. of  4 variables:
##  $ birthrate : num  14 14 11.1 13 17.4 6
##  $ population: num  2963 10110 1402112 3214 1380004 ...
##  $ diff.100  : num  6.6 6.5 7 2.2 3.7 2.7
##  $ unmatched : num  27.38 92 10894.41 9.19 8884.47 ...
## 
## summary(dat) =====
##    birthrate       population         diff.100       unmatched        
##  Min.   : 6.00   Min.   :   2963   Min.   :2.200   Min.   :    9.192  
##  1st Qu.:11.57   1st Qu.:   4938   1st Qu.:2.950   1st Qu.:   41.505  
##  Median :13.50   Median :  30945   Median :5.100   Median :   87.942  
##  Mean   :12.58   Mean   : 475030   Mean   :4.783   Mean   : 3331.889  
##  3rd Qu.:14.00   3rd Qu.:1047948   3rd Qu.:6.575   3rd Qu.: 6686.350  
##  Max.   :17.40   Max.   :1402112   Max.   :7.000   Max.   :10894.410

Mean and Std Dev

# Mean and sample standard deviation of the integers 1..n, accumulated by
# hand in an explicit loop.
#
# n: how many consecutive integers, starting at 1, to summarise
#    (a non-negative count).
# Returns (invisibly) a single string "mean= <m> , std.dev= <s> \n" with
# both figures rounded to 5 decimals; wrap the call in cat() to display it.
doStat <- function(n=10000) {
    total <- 0; sq <- 0; cnt <- 0
    # seq_len(n) is empty for n = 0, whereas 1:n would count down 1, 0.
    for (i in seq_len(n)) {
        # Accumulate in double precision: i * i on the integer loop index
        # overflows to NA (with a warning) once i exceeds 46340.
        v <- as.numeric(i)
        total <- total + v
        sq <- sq + v * v
        cnt <- cnt + 1
    }
    # Keep the mean unrounded for the variance; round only for display.
    mn <- total / cnt
    # Sample variance: (sum(x^2) - cnt * mean^2) / (cnt - 1).
    std <- sqrt((sq - mn * mn * cnt) / (cnt - 1))
    invisible(paste("mean=", round(mn, 5), ", std.dev=", round(std, 5), "\n"))
}

# Mean and sample standard deviation of the integers 1..n, computed with the
# built-in vectorised mean() and sd().
#
# n: how many consecutive integers, starting at 1, to summarise
#    (a non-negative count).
# Returns (invisibly) a single string "mean= <m> , std.dev= <s> \n" with the
# figures at full precision; wrap the call in cat() to display it.
doStat2 <- function(n=10000) {
  # seq_len(n) is empty for n = 0, whereas 1:n would yield c(1, 0).
  x <- seq_len(n)
  invisible(paste("mean=", mean(x), ", std.dev=", sd(x), "\n"))
}

Comparison of Times

doStat()

## time= 0.03611 +/- 0.00215 
##  mean= 5000.5 , std.dev= 2886.89568 
## 

mean() and sd()

## time= 0.00289 +/- 0.00127 
##  mean= 5000.5 , std.dev= 2886.89567990717 
## 

Register for juliacon2021

juliacon2021 Assignment

Juliacon2021: 116 Lightning Talks

Juliacon2021: 75 Talks

Juliacon2021: 21 Reports of Experience

Juliacon2021: 6 Birds of a Feather sessions

Juliacon2021: 16 Workshops