?R

R is a language and environment for statistical computing and graphics. It is a GNU project which is similar to the S language and environment which was developed at Bell Laboratories (formerly AT&T, now Lucent Technologies) by John Chambers and colleagues. R can be considered as a different implementation of S. There are some important differences, but much code written for S runs unaltered under R.

R provides a wide variety of statistical (linear and nonlinear modelling, classical statistical tests, time-series analysis, classification, clustering, …) and graphical techniques, and is highly extensible.

https://www.r-project.org/about.html

?RStudio

RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management.

RStudio is available in open source and commercial editions and runs on the desktop (Windows, Mac, and Linux) or in a browser connected to RStudio Server or RStudio Server Pro (Debian/Ubuntu, RedHat/CentOS, and SUSE Linux).

https://www.rstudio.com/products/rstudio/

Packages and Help

Packages

  • install.packages("dplyr") Download and install
  • library(dplyr) Load
  • data(iris) Load a built-in dataset

Help

  • ?mean Help for the function mean
  • help(package = "dplyr") Help for a package

Working Directory

Where am I? Call getwd() to print the current working directory.

Table of contents

========================================================

1 2 3 . . .

a <- 5

y <- f(x)

Vectors

(v <- c(1, 2, 3, 4, 5))
## [1] 1 2 3 4 5
(v1 <- 1:5)
## [1] 1 2 3 4 5
(v2 <- seq(from = 1, to = 5, by = 1))
## [1] 1 2 3 4 5

Vectors

( v3 <- rep(1:2, times = 3) )
## [1] 1 2 1 2 1 2
( v4 <- rep(1:2, each = 3) )
## [1] 1 1 1 2 2 2

Selecting Vector Elements

v5 <- rnorm(3)
v5
## [1]  2.07436307  0.32028396 -0.09851201
v5[2]
## [1] 0.320284
v5[-2]
## [1]  2.07436307 -0.09851201

Selecting Vector Elements

v5
## [1]  2.07436307  0.32028396 -0.09851201
v5[2:3]
## [1]  0.32028396 -0.09851201
v5[-(2:3)]
## [1] 2.074363

Selecting Vector Elements

v5
## [1]  2.07436307  0.32028396 -0.09851201
v5[c(1, 2)]
## [1] 2.074363 0.320284
v5[-c(1,2)]
## [1] -0.09851201

Selecting Vector Elements

v5
## [1]  2.07436307  0.32028396 -0.09851201
v5[v5 < 0]
## [1] -0.09851201
fruits <- c("apple", "pear", "apple")
fruits[fruits == "apple"]
## [1] "apple" "apple"

seq with Date Series

t <- seq(from = as.Date("2017-01-01"), 
         to = as.Date("2017-12-31"), 
         by = "days")

t[1:3]
## [1] "2017-01-01" "2017-01-02" "2017-01-03"
t2 <- seq(from = as.Date("2017-01-01"), 
          to = as.Date("2017-12-31"), 
          by = "months")

t2[1:3]
## [1] "2017-01-01" "2017-02-01" "2017-03-01"

Data Frame

df <- data.frame(
        Fruit = c("apple", "pear", "melon"),
        kg = c(100, 250, 560),
        PrUn = c(2.0, 4.5, 1.2)
      )

df
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
## 3 melon 560  1.2

Read and write external data

Read

  • df <- read.table("file.txt")
  • df <- read.csv("file.csv")
  • df <- read.csv(file.choose())
  • df <- readxl::read_excel(file.choose(), sheet = "Sheet1")

Write

  • write.table(df, "file.txt")
  • write.csv(df, "file.csv")

Selecting Data Frame Elements

df
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
## 3 melon 560  1.2
df[1:2, ]
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
df[1, 1]
## [1] apple
## Levels: apple melon pear

Selecting Data Frame Elements

df
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
## 3 melon 560  1.2
df[ , 'Fruit']
## [1] apple pear  melon
## Levels: apple melon pear

Add column in Data Frame

df['tot'] <- df['kg'] * df['PrUn']
df
##   Fruit  kg PrUn  tot
## 1 apple 100  2.0  200
## 2  pear 250  4.5 1125
## 3 melon 560  1.2  672
df$tot <- df$kg * df$PrUn
df
##   Fruit  kg PrUn  tot
## 1 apple 100  2.0  200
## 2  pear 250  4.5 1125
## 3 melon 560  1.2  672

Delete column in Data Frame

df['tot'] <- NULL
df
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
## 3 melon 560  1.2
df$tot <- NULL
df
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
## 3 melon 560  1.2

Structure of Data Frame

df
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
## 3 melon 560  1.2
dim(df)
## [1] 3 3
names(df)
## [1] "Fruit" "kg"    "PrUn"

Structure of Data Frame

df
##   Fruit  kg PrUn
## 1 apple 100  2.0
## 2  pear 250  4.5
## 3 melon 560  1.2
str(df)
## 'data.frame':    3 obs. of  3 variables:
##  $ Fruit: Factor w/ 3 levels "apple","melon",..: 1 3 2
##  $ kg   : num  100 250 560
##  $ PrUn : num  2 4.5 1.2

head() and tail()

df1 <- data.frame(a = sample(x = 1:100), b = rnorm(100))
head(df1, 5)
##    a           b
## 1 69 -0.81252232
## 2 98  1.02755742
## 3 29 -0.27553963
## 4 99 -0.15941451
## 5 12  0.09592824
tail(df1, 5)
##      a          b
## 96  15 -0.6747514
## 97  27 -0.5045930
## 98  63  0.2304039
## 99  38  2.8594134
## 100 90 -0.1242792

plot()

head(df1)
##    a           b
## 1 69 -0.81252232
## 2 98  1.02755742
## 3 29 -0.27553963
## 4 99 -0.15941451
## 5 12  0.09592824
## 6 60 -0.66987630
plot(df1)

plot()

(df1.10 <- head(df1, 10))
##     a           b
## 1  69 -0.81252232
## 2  98  1.02755742
## 3  29 -0.27553963
## 4  99 -0.15941451
## 5  12  0.09592824
## 6  60 -0.66987630
## 7  76  1.08198439
## 8  84  1.56556105
## 9  87 -0.41985035
## 10 40  1.18134961
plot(df1.10$a, type = "l", col="red")

summary()

summary(df1)
##        a                b           
##  Min.   :  1.00   Min.   :-2.23800  
##  1st Qu.: 25.75   1st Qu.:-0.61497  
##  Median : 50.50   Median :-0.14508  
##  Mean   : 50.50   Mean   : 0.01599  
##  3rd Qu.: 75.25   3rd Qu.: 0.79671  
##  Max.   :100.00   Max.   : 3.26078

Linear Regression - lm()

lr <- lm(formula = a ~ b, data = df1)
lr
## 
## Call:
## lm(formula = a ~ b, data = df1)
## 
## Coefficients:
## (Intercept)            b  
##      50.525       -1.548

Linear Regression - diagnostic plots with plot()

par(mfrow=c(1,2))
plot(lr)

Root Mean Square with %>%

v <- c(22, 3, 2, 5, 6, 77, 8, 11, 9, 7, 57)
sqrt(mean(v^2))
## [1] 30.22792
library(magrittr)

v^2 %>% 
  mean() %>% 
  sqrt()
## [1] 30.22792

dplyr

dplyr

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df_a <- data.frame(year = rep(2013:2017, each = 12),
          month = rep(1:12, times = 5),
          num = sample(1:20),
          weight = runif(n = 60, min = 1.0, max = 10.0))

head(df_a, 4)
##   year month num   weight
## 1 2013     1   5 5.145590
## 2 2013     2   3 4.459135
## 3 2013     3   7 5.257585
## 4 2013     4  16 6.867127

dplyr - select()

df_a_select <- select(df_a, year, num, weight)

head(df_a_select)
##   year num   weight
## 1 2013   5 5.145590
## 2 2013   3 4.459135
## 3 2013   7 5.257585
## 4 2013  16 6.867127
## 5 2013   2 8.802364
## 6 2013  20 2.425743

dplyr - filter()

df_a_filter <- filter(df_a, month == 2, num > 3)
head(df_a_filter)
##   year month num   weight
## 1 2014     2  10 6.463854
## 2 2015     2  20 5.334409
## 3 2016     2  19 3.020158

dplyr - summarise()

df_a_summarise <- summarise(df_a, 
                            avg_n = mean(num),
                            avg_w = mean(weight))
head(df_a_summarise)
##   avg_n    avg_w
## 1  10.5 5.438779

dplyr - arrange()

df_a_arrange <- arrange(df_a, desc(num))
head(df_a_arrange)
##   year month num   weight
## 1 2013     6  20 2.425743
## 2 2015     2  20 5.334409
## 3 2016    10  20 7.893768
## 4 2014     6  19 4.861062
## 5 2016     2  19 3.020158
## 6 2017    10  19 7.413796

dplyr - mutate()

df_a_mutate <- mutate(df_a, tot = num * weight)
head(df_a_mutate)
##   year month num   weight       tot
## 1 2013     1   5 5.145590  25.72795
## 2 2013     2   3 4.459135  13.37741
## 3 2013     3   7 5.257585  36.80310
## 4 2013     4  16 6.867127 109.87404
## 5 2013     5   2 8.802364  17.60473
## 6 2013     6  20 2.425743  48.51485

dplyr - group_by() - summarise() with %>%

df_a_gr <- df_a %>% 
             group_by(year) %>%
             summarise(avg_n = mean(num), 
                       avg_w = mean(weight))

head(df_a_gr)
## # A tibble: 5 x 3
##    year     avg_n    avg_w
##   <int>     <dbl>    <dbl>
## 1  2013  8.666667 5.271529
## 2  2014 11.416667 5.853483
## 3  2015 10.583333 5.285630
## 4  2016  9.750000 6.315981
## 5  2017 12.083333 4.467274

dplyr - group_by() - summarise_all() with %>%

df_a_gr <- df_a %>% 
             group_by(year) %>%
             summarise_all(mean)
head(df_a_gr)
## # A tibble: 5 x 4
##    year month       num   weight
##   <int> <dbl>     <dbl>    <dbl>
## 1  2013   6.5  8.666667 5.271529
## 2  2014   6.5 11.416667 5.853483
## 3  2015   6.5 10.583333 5.285630
## 4  2016   6.5  9.750000 6.315981
## 5  2017   6.5 12.083333 4.467274

ggplot2

library(ggplot2)

t <- seq(from = as.Date("2017-01-01"), 
         to = as.Date("2017-12-31"), 
         by = "days")

val <- rnorm(365)

df <- data.frame(tempo = t, valori = val)

f <- ggplot(df,aes(x = tempo, y = valori)) +
       geom_line() +
       theme_bw()
f 

ggplot2 - examples

start=as.Date("2017-01-01")

df9 <- data.frame(date = seq(from = start, to = start+99, by="days"),
                  descr = sample(x = c("cat","dog","mouse"), size = 100, replace = TRUE),
                  val = sample(x = 1:30, size = 100, replace = TRUE))

head(df9, 40)
##          date descr val
## 1  2017-01-01 mouse   3
## 2  2017-01-02   cat  13
## 3  2017-01-03   cat  11
## 4  2017-01-04   dog  21
## 5  2017-01-05 mouse  21
## 6  2017-01-06   cat   3
## 7  2017-01-07   cat   1
## 8  2017-01-08   cat  30
## 9  2017-01-09   dog  27
## 10 2017-01-10   cat  22
## 11 2017-01-11 mouse  13
## 12 2017-01-12   cat  18
## 13 2017-01-13 mouse  18
## 14 2017-01-14   cat  14
## 15 2017-01-15   cat  23
## 16 2017-01-16   dog  30
## 17 2017-01-17   cat  16
## 18 2017-01-18   dog  26
## 19 2017-01-19   dog  22
## 20 2017-01-20 mouse  21
## 21 2017-01-21 mouse  22
## 22 2017-01-22   cat   1
## 23 2017-01-23   dog  20
## 24 2017-01-24   cat  25
## 25 2017-01-25 mouse   7
## 26 2017-01-26 mouse  28
## 27 2017-01-27   dog  10
## 28 2017-01-28   dog  19
## 29 2017-01-29   dog   1
## 30 2017-01-30 mouse  13
## 31 2017-01-31 mouse  19
## 32 2017-02-01   cat   4
## 33 2017-02-02 mouse   5
## 34 2017-02-03   dog   5
## 35 2017-02-04   cat  17
## 36 2017-02-05 mouse   1
## 37 2017-02-06   cat  26
## 38 2017-02-07   cat  30
## 39 2017-02-08 mouse  25
## 40 2017-02-09 mouse  22

ggplot2 - examples

g   <- ggplot(df9, aes(descr, val))

gg  <- ggplot(df9, aes(x=val))

gg2 <- ggplot(df9, aes(x=val, fill=descr))

gg3 <- ggplot(df9, aes(x=descr, y=val, fill=descr))

g + geom_violin()

g + geom_violin()

g + geom_violin(aes(fill=descr))

g + geom_violin(aes(fill=descr))

g + geom_violin() + geom_jitter()

g + geom_violin() + geom_jitter(height=0,width=0.1)

g + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75))

g + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) 

gg + geom_histogram()

gg + geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

gg2 + geom_histogram()

gg2 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

gg2 + geom_density(alpha = .3)

gg2 + geom_density(alpha = .3)

gg2 + geom_density(alpha=.3) + facet_grid(descr ~.)

gg2 + geom_density(alpha=.3) + facet_grid(descr ~.)

gg3 + geom_boxplot(alpha = .3)

gg3 + geom_boxplot(alpha = .3)

gg3 + geom_boxplot() + stat_summary()

gg3 + geom_boxplot(alpha = .3) + 
  stat_summary(fun = mean, geom = "point", shape = 5, size = 4)

========================================================

“Analizzare i dati con R” Linux Day a Pisa 2017

Author: Alessandra Santi

E-mail: santi.info@gmail.com

License: CC-BY-SA