Alessandra Santi
28 Ottobre 2017
Linux Day 2017 a Pisa
R is a language and environment for statistical computing and graphics. It is a GNU project which is similar to the S language and environment which was developed at Bell Laboratories (formerly AT&T, now Lucent Technologies) by John Chambers and colleagues. R can be considered as a different implementation of S. There are some important differences, but much code written for S runs unaltered under R.
R provides a wide variety of statistical (linear and nonlinear modelling, classical statistical tests, time-series analysis, classification, clustering, …) and graphical techniques, and is highly extensible.
Download R: https://cran.r-project.org/
For Linux, Mac and Windows
Linux Repository
HowTo Installation
HowTo Installation
Help Manuals in HTML, PDF, EPUB
Ubuntu → sources.list
Ubuntu → sources.list → mirror
Ubuntu → sudo apt-get update
Ubuntu → sudo apt-get install r-base
Terminal → R
R environment → install a package (e.g. dplyr)
R environment → install a package (e.g. dplyr)
RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management.
RStudio is available in open source and commercial editions and runs on the desktop (Windows, Mac, and Linux) or in a browser connected to RStudio Server or RStudio Server Pro (Debian/Ubuntu, RedHat/CentOS, and SUSE Linux).
Open Source Edition and Commercial License
Cheatsheets
Cheatsheets
Cheatsheets
Packages
Help
Where am I?
Data Frames
Libraries
(v <- c(1, 2, 3, 4, 5))
[1] 1 2 3 4 5
(v1 <- 1:5)
[1] 1 2 3 4 5
(v2 <- seq(from=1, to=5, by=1))
[1] 1 2 3 4 5
(v3 <- rep(1:2, times=3))
[1] 1 2 1 2 1 2
(v4 <- rep(1:2, each=3))
[1] 1 1 1 2 2 2
v5 <- rnorm(3)
v5
[1] 1.019617 -1.726204 -1.382875
v5[2]
[1] -1.726204
v5[-2]
[1] 1.019617 -1.382875
v5
[1] 1.019617 -1.726204 -1.382875
v5[2:3]
[1] -1.726204 -1.382875
v5[-(2:3)]
[1] 1.019617
v5
[1] 1.019617 -1.726204 -1.382875
v5[c(1, 2)]
[1] 1.019617 -1.726204
v5[-c(1,2)]
[1] -1.382875
v5
[1] 1.019617 -1.726204 -1.382875
v5[v5 < 0]
[1] -1.726204 -1.382875
fruits <- c("apple", "pear", "apple")
fruits[fruits == "apple"]
[1] "apple" "apple"
t <- seq(from = as.Date("2017-01-01"),
to = as.Date("2017-12-31"),
by = "days")
t[1:3]
[1] "2017-01-01" "2017-01-02" "2017-01-03"
t2 <- seq(from = as.Date("2017-01-01"),
to = as.Date("2017-12-31"),
by = "months")
t2[1:3]
[1] "2017-01-01" "2017-02-01" "2017-03-01"
df <- data.frame(
Fruit=c("apple","pear","melon"),
kg=c(100, 250, 560),
PrUn=c(2.0, 4.5, 1.2)
)
df
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
3 melon 560 1.2
Read
Write
df
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
3 melon 560 1.2
df[1:2, ]
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
df[1, 1]
[1] apple
Levels: apple melon pear
df
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
3 melon 560 1.2
df[ , 'Fruit']
[1] apple pear melon
Levels: apple melon pear
df['tot'] <- df['kg'] * df['PrUn']
df
Fruit kg PrUn tot
1 apple 100 2.0 200
2 pear 250 4.5 1125
3 melon 560 1.2 672
df$tot <- df$kg * df$PrUn
df
Fruit kg PrUn tot
1 apple 100 2.0 200
2 pear 250 4.5 1125
3 melon 560 1.2 672
df['tot'] <- NULL
df
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
3 melon 560 1.2
df$tot <- NULL
df
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
3 melon 560 1.2
df
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
3 melon 560 1.2
dim(df)
[1] 3 3
names(df)
[1] "Fruit" "kg" "PrUn"
df
Fruit kg PrUn
1 apple 100 2.0
2 pear 250 4.5
3 melon 560 1.2
str(df)
'data.frame': 3 obs. of 3 variables:
$ Fruit: Factor w/ 3 levels "apple","melon",..: 1 3 2
$ kg : num 100 250 560
$ PrUn : num 2 4.5 1.2
df1 <- data.frame(a=sample(x=1:100), b=rnorm(100))
head(df1, 2)
a b
1 40 -0.9167949
2 70 0.7531830
tail(df1, 2)
a b
99 73 0.5667689
100 42 -0.1077406
head(df1)
a b
1 40 -0.9167949
2 70 0.7531830
3 87 0.3538634
4 76 0.9907758
5 50 1.4268626
6 68 0.1808850
plot(df1)
(df1.10 <- head(df1, 10))
a b
1 40 -0.9167949
2 70 0.7531830
3 87 0.3538634
4 76 0.9907758
5 50 1.4268626
6 68 0.1808850
7 11 0.2417842
8 4 1.2177763
9 34 -0.7826337
10 80 -0.3418487
plot(df1.10$a,type="l", col="red")
summary(df1)
a b
Min. : 1.00 Min. :-2.11038
1st Qu.: 25.75 1st Qu.:-0.70474
Median : 50.50 Median :-0.10633
Mean : 50.50 Mean :-0.07583
3rd Qu.: 75.25 3rd Qu.: 0.53946
Max. :100.00 Max. : 2.97220
lr <- lm(formula = a ~ b, data = df1)
lr
Call:
lm(formula = a ~ b, data = df1)
Coefficients:
(Intercept) b
50.750 3.297
par(mfrow=c(2,2))
plot(lr)
v <- c(22, 3, 2, 5, 6, 77, 8, 11, 9, 7, 57)
sqrt(mean(v^2))
[1] 30.22792
library(magrittr)
v^2 %>% mean() %>% sqrt()
[1] 30.22792
library(dplyr)
df_a <- data.frame(year = rep(2013:2017,each=12),
month = rep(1:12, times=5),
num = sample(1:20),
weight = runif(n = 60, min = 1.0,
max = 10.0))
head(df_a, 4)
year month num weight
1 2013 1 20 6.249453
2 2013 2 18 4.340047
3 2013 3 12 4.000076
4 2013 4 10 7.860788
df_a_select <- select(df_a,
year, num, weight)
head(df_a_select)
year num weight
1 2013 20 6.249453
2 2013 18 4.340047
3 2013 12 4.000076
4 2013 10 7.860788
5 2013 14 1.957468
6 2013 6 9.132897
df_a_filter <- filter(df_a,
month == 2, num > 3)
head(df_a_filter)
year month num weight
1 2013 2 18 4.340047
2 2014 2 15 8.910923
3 2015 2 6 3.393272
4 2016 2 13 8.434680
5 2017 2 9 1.946008
df_a_summarise <- summarise(df_a,
avg_n=mean(num),
avg_w=mean(weight))
head(df_a_summarise)
avg_n avg_w
1 10.5 5.630199
df_a_arrange <- arrange(df_a, desc(num))
head(df_a_arrange)
year month num weight
1 2013 1 20 6.249453
2 2014 9 20 6.409159
3 2016 5 20 9.276693
4 2014 1 19 1.851002
5 2015 9 19 1.687055
6 2017 5 19 9.145018
df_a_mutate <- mutate(df_a, tot=num*weight)
head(df_a_mutate)
year month num weight tot
1 2013 1 20 6.249453 124.98906
2 2013 2 18 4.340047 78.12085
3 2013 3 12 4.000076 48.00091
4 2013 4 10 7.860788 78.60788
5 2013 5 14 1.957468 27.40455
6 2013 6 6 9.132897 54.79738
df_a_gr <- df_a %>%
group_by(year) %>%
summarise(avg_n=mean(num),
avg_w=mean(weight))
head(df_a_gr)
# A tibble: 5 x 3
year avg_n avg_w
<int> <dbl> <dbl>
1 2013 10.083333 4.946174
2 2014 12.416667 5.868181
3 2015 9.916667 4.284383
4 2016 10.333333 6.556034
5 2017 9.750000 6.496225
df_a_gr <- df_a %>%
group_by(year) %>%
summarise_all(funs(mean))
head(df_a_gr)
# A tibble: 5 x 4
year month num weight
<int> <dbl> <dbl> <dbl>
1 2013 6.5 10.083333 4.946174
2 2014 6.5 12.416667 5.868181
3 2015 6.5 9.916667 4.284383
4 2016 6.5 10.333333 6.556034
5 2017 6.5 9.750000 6.496225
library(ggplot2)
t <- seq(from=as.Date("2017-01-01"),
to=as.Date("2017-12-31"),
by="days")
val <- rnorm(365)
df <- data.frame(tempo=t, valori=val)
f <- ggplot(df,aes(x=tempo,y=valori))+
geom_line() +
theme_bw()
f
start=as.Date("2017-01-01")
df9 <- data.frame(date=seq(from=start,to=start+99,by="days"),
descr=sample(x=c("cat","dog","mouse"),100, replace=T),
val=sample(x=1:30,size=100,replace=T))
head(df9, 4)
date descr val
1 2017-01-01 mouse 20
2 2017-01-02 mouse 20
3 2017-01-03 mouse 2
4 2017-01-04 mouse 22
g <- ggplot(df9, aes(descr, val))
gg <- ggplot(df9, aes(x=val))
gg2 <- ggplot(df9, aes(x=val, fill=descr))
gg3 <- ggplot(df9, aes(x=descr, y=val, fill=descr))