R is a language and environment for statistical computing and graphics. It is a GNU project which is similar to the S language and environment which was developed at Bell Laboratories (formerly AT&T, now Lucent Technologies) by John Chambers and colleagues. R can be considered as a different implementation of S. There are some important differences, but much code written for S runs unaltered under R.
R provides a wide variety of statistical (linear and nonlinear modelling, classical statistical tests, time-series analysis, classification, clustering, …) and graphical techniques, and is highly extensible.
RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management.
RStudio is available in open source and commercial editions and runs on the desktop (Windows, Mac, and Linux) or in a browser connected to RStudio Server or RStudio Server Pro (Debian/Ubuntu, RedHat/CentOS, and SUSE Linux).
Where am I?
Data Frames
rmarkdown
========================================================
# Five ways to build a vector. Wrapping an assignment in parentheses
# both stores the value and prints it.

# Explicit list of values.
(v <- c(1, 2, 3, 4, 5))
## [1] 1 2 3 4 5
# Colon operator: consecutive integers from 1 to 5.
(v1 <- 1:5)
## [1] 1 2 3 4 5
# seq() with an explicit start, end and step.
(v2 <- seq(from = 1, to = 5, by = 1))
## [1] 1 2 3 4 5
# rep(times = ): repeat the whole vector three times.
(v3 <- rep(1:2, times = 3))
## [1] 1 2 1 2 1 2
# rep(each = ): repeat every element three times before moving on.
(v4 <- rep(1:2, each = 3))
## [1] 1 1 1 2 2 2
# Vector indexing examples.
# Three standard-normal draws; no seed is set, so the values in the recorded
# output below will differ from run to run.
v5 <- rnorm(3)
v5
## [1] 2.07436307 0.32028396 -0.09851201
# A positive index selects the element at that position (R counts from 1).
v5[2]
## [1] 0.320284
# A negative index drops that position and returns everything else.
v5[-2]
## [1] 2.07436307 -0.09851201
v5
## [1] 2.07436307 0.32028396 -0.09851201
# A range of positions selects a contiguous slice.
v5[2:3]
## [1] 0.32028396 -0.09851201
# Negating the range (note the parentheses) drops the whole slice.
v5[-(2:3)]
## [1] 2.074363
v5
## [1] 2.07436307 0.32028396 -0.09851201
# A vector of positions selects exactly those elements.
v5[c(1, 2)]
## [1] 2.074363 0.320284
# Negating that vector drops exactly those elements.
v5[-c(1,2)]
## [1] -0.09851201
v5
## [1] 2.07436307 0.32028396 -0.09851201
# A logical condition keeps only the elements for which it is TRUE.
v5[v5 < 0]
## [1] -0.09851201
# The same logical filtering works on character vectors.
fruits <- c("apple", "pear", "apple")
fruits[fruits == "apple"]
## [1] "apple" "apple"
# Daily sequence covering every day of 2017.
# NOTE(review): `t` shares its name with base::t(); calls to t() still
# resolve to the function, but a more descriptive name would read better.
t <- seq(
  from = as.Date("2017-01-01"),
  to = as.Date("2017-12-31"),
  by = "days"
)
t[1:3]
## [1] "2017-01-01" "2017-01-02" "2017-01-03"
# Same range stepped by month: one date per month, starting on the 1st.
t2 <- seq(
  from = as.Date("2017-01-01"),
  to = as.Date("2017-12-31"),
  by = "months"
)
t2[1:3]
## [1] "2017-01-01" "2017-02-01" "2017-03-01"
# Small example data frame: one row per fruit, with a quantity in kg and a
# unit price.
# stringsAsFactors is set explicitly because R 4.0.0 changed the default to
# FALSE; the output recorded further down ("Levels: apple melon pear",
# "Factor w/ 3 levels") only reproduces when Fruit is stored as a factor.
df <- data.frame(
  Fruit = c("apple", "pear", "melon"),
  kg = c(100, 250, 560),
  PrUn = c(2.0, 4.5, 1.2),
  stringsAsFactors = TRUE
)
df
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
## 3 melon 560 1.2
# Data frame indexing uses df[rows, columns]; leaving one side empty means
# "all of them".
df
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
## 3 melon 560 1.2
# First two rows, every column.
df[1:2, ]
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
# A single cell: row 1, column 1.
df[1, 1]
## [1] apple
## Levels: apple melon pear
df
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
## 3 melon 560 1.2
# Every row of one column, selected by name.
df[, "Fruit"]
## [1] apple pear melon
## Levels: apple melon pear
# Add a computed column with single-bracket (data-frame) indexing ...
df["tot"] <- df["kg"] * df["PrUn"]
df
## Fruit kg PrUn tot
## 1 apple 100 2.0 200
## 2 pear 250 4.5 1125
## 3 melon 560 1.2 672
# ... or with the `$` accessor, which works on the column vectors directly.
df$tot <- df$kg * df$PrUn
df
## Fruit kg PrUn tot
## 1 apple 100 2.0 200
## 2 pear 250 4.5 1125
## 3 melon 560 1.2 672
# Assigning NULL removes a column; bracket form first.
df["tot"] <- NULL
df
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
## 3 melon 560 1.2
# `$` form of the same removal. The column is already gone at this point,
# so the data frame is unchanged.
df$tot <- NULL
df
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
## 3 melon 560 1.2
# Inspecting the structure of a data frame.
df
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
## 3 melon 560 1.2
# Number of rows, then number of columns.
dim(df)
## [1] 3 3
# Column names.
names(df)
## [1] "Fruit" "kg" "PrUn"
df
## Fruit kg PrUn
## 1 apple 100 2.0
## 2 pear 250 4.5
## 3 melon 560 1.2
# Compact overview: the type and the first values of every column.
str(df)
## 'data.frame': 3 obs. of 3 variables:
## $ Fruit: Factor w/ 3 levels "apple","melon",..: 1 3 2
## $ kg : num 100 250 560
## $ PrUn : num 2 4.5 1.2
# Toy data set with 100 rows: `a` is a random permutation of 1..100 and `b`
# holds 100 standard-normal draws. No seed is set, so the recorded output
# below will not match a fresh run.
df1 <- data.frame(a = sample(x = 1:100), b = rnorm(100))
# First five rows.
head(df1, 5)
## a b
## 1 69 -0.81252232
## 2 98 1.02755742
## 3 29 -0.27553963
## 4 99 -0.15941451
## 5 12 0.09592824
# Last five rows.
tail(df1, 5)
## a b
## 96 15 -0.6747514
## 97 27 -0.5045930
## 98 63 0.2304039
## 99 38 2.8594134
## 100 90 -0.1242792
# Without a count, head() shows six rows.
head(df1)
## a b
## 1 69 -0.81252232
## 2 98 1.02755742
## 3 29 -0.27553963
## 4 99 -0.15941451
## 5 12 0.09592824
## 6 60 -0.66987630
# Scatter plot of the two columns.
plot(df1)
# Keep the first ten rows as a separate data frame (and print it).
(df1.10 <- head(df1, 10))
## a b
## 1 69 -0.81252232
## 2 98 1.02755742
## 3 29 -0.27553963
## 4 99 -0.15941451
## 5 12 0.09592824
## 6 60 -0.66987630
## 7 76 1.08198439
## 8 84 1.56556105
## 9 87 -0.41985035
## 10 40 1.18134961
# Red line plot of column `a` against row position.
plot(df1.10$a, type = "l", col = "red")
# Per-column summary statistics.
summary(df1)
## a b
## Min. : 1.00 Min. :-2.23800
## 1st Qu.: 25.75 1st Qu.:-0.61497
## Median : 50.50 Median :-0.14508
## Mean : 50.50 Mean : 0.01599
## 3rd Qu.: 75.25 3rd Qu.: 0.79671
## Max. :100.00 Max. : 3.26078
# Simple linear regression of `a` on `b`.
lr <- lm(formula = a ~ b, data = df1)
lr
##
## Call:
## lm(formula = a ~ b, data = df1)
##
## Coefficients:
## (Intercept) b
## 50.525 -1.548
# Draw the model's diagnostic plots two per page. par() returns the previous
# settings, which are restored afterwards so that later base-graphics plots
# are not silently left in the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))
plot(lr)
par(old_par)
# Root mean square written as nested calls: read from the inside out
# (square, then average, then square root).
v <- c(
  22, 3, 2, 5, 6, 77,
  8, 11, 9, 7, 57
)
sqrt(mean(v^2))
## [1] 30.22792
library(magrittr)
# The same root mean square as a left-to-right pipeline: square, average,
# square root. `^` binds tighter than `%>%`, so the parentheses only make
# the existing evaluation order explicit.
(v^2) %>%
  mean() %>%
  sqrt()
## [1] 30.22792
select(): select columns
filter(): filter rows
summarise(): summarise values
arrange(): re-order or arrange rows
mutate(): create new columns
group_by(): group data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Five years (2013-2017) of monthly records: 60 rows with a count (`num`)
# and a uniformly distributed weight between 1 and 10.
df_a <- data.frame(
  year = rep(2013:2017, each = 12),
  month = rep(1:12, times = 5),
  # One shuffle of 1..20 repeated three times to fill the 60 rows; this is
  # the recycling data.frame() would otherwise apply silently.
  num = rep(sample(1:20), times = 3),
  weight = runif(n = 60, min = 1.0, max = 10.0)
)
head(df_a, 4)
## year month num weight
## 1 2013 1 5 5.145590
## 2 2013 2 3 4.459135
## 3 2013 3 7 5.257585
## 4 2013 4 16 6.867127
# The dplyr single-table verbs, one at a time. Each takes the data frame as
# its first argument and returns a new data frame; df_a is never modified.

# select(): keep only the named columns.
df_a_select <- select(df_a, year, num, weight)
head(df_a_select)
## year num weight
## 1 2013 5 5.145590
## 2 2013 3 4.459135
## 3 2013 7 5.257585
## 4 2013 16 6.867127
## 5 2013 2 8.802364
## 6 2013 20 2.425743
# filter(): keep the rows that satisfy every comma-separated condition.
df_a_filter <- filter(df_a, month == 2, num > 3)
head(df_a_filter)
## year month num weight
## 1 2014 2 10 6.463854
## 2 2015 2 20 5.334409
## 3 2016 2 19 3.020158
# summarise(): collapse the whole table into one row of aggregates.
df_a_summarise <- summarise(
  df_a,
  avg_n = mean(num),
  avg_w = mean(weight)
)
head(df_a_summarise)
## avg_n avg_w
## 1 10.5 5.438779
# arrange(): sort the rows, here by num from largest to smallest.
df_a_arrange <- arrange(df_a, desc(num))
head(df_a_arrange)
## year month num weight
## 1 2013 6 20 2.425743
## 2 2015 2 20 5.334409
## 3 2016 10 20 7.893768
## 4 2014 6 19 4.861062
## 5 2016 2 19 3.020158
## 6 2017 10 19 7.413796
# mutate(): add a column computed from existing ones.
df_a_mutate <- mutate(df_a, tot = num * weight)
head(df_a_mutate)
## year month num weight tot
## 1 2013 1 5 5.145590 25.72795
## 2 2013 2 3 4.459135 13.37741
## 3 2013 3 7 5.257585 36.80310
## 4 2013 4 16 6.867127 109.87404
## 5 2013 5 2 8.802364 17.60473
## 6 2013 6 20 2.425743 48.51485
# Grouped aggregation: split the rows by year, then compute the mean count
# and mean weight within each year (one output row per year).
df_a_gr <- df_a %>%
  group_by(year) %>%
  summarise(
    avg_n = mean(num),
    avg_w = mean(weight)
  )
head(df_a_gr)
## # A tibble: 5 x 3
## year avg_n avg_w
## <int> <dbl> <dbl>
## 1 2013 8.666667 5.271529
## 2 2014 11.416667 5.853483
## 3 2015 10.583333 5.285630
## 4 2016 9.750000 6.315981
## 5 2017 12.083333 4.467274
# Per-year mean of every non-grouping column (month, num, weight) in one
# call. This overwrites the df_a_gr built by the previous example.
# The function is passed directly rather than wrapped in funs(), which has
# been deprecated since dplyr 0.8.0; a bare function is accepted by every
# dplyr release that provides summarise_all().
df_a_gr <- df_a %>%
  group_by(year) %>%
  summarise_all(mean)
head(df_a_gr)
## # A tibble: 5 x 4
## year month num weight
## <int> <dbl> <dbl> <dbl>
## 1 2013 6.5 8.666667 5.271529
## 2 2014 6.5 11.416667 5.853483
## 3 2015 6.5 10.583333 5.285630
## 4 2016 6.5 9.750000 6.315981
## 5 2017 6.5 12.083333 4.467274
library(ggplot2)
# Simulated daily series for 2017: one date per day paired with one
# standard-normal draw per day (365 days, hence rnorm(365)).
# Note that this reassigns `t` and `df`, replacing the objects of the same
# name created earlier in the script.
t <- seq(
  from = as.Date("2017-01-01"),
  to = as.Date("2017-12-31"),
  by = "days"
)
val <- rnorm(365)
df <- data.frame(tempo = t, valori = val)

# Line chart of the values over time on the black-and-white theme. The plot
# is stored in `f` and only drawn when `f` is printed.
f <- ggplot(df, aes(x = tempo, y = valori)) +
  geom_line() +
  theme_bw()
f
# Example data for the distribution plots: 100 consecutive days starting on
# 2017-01-01, each with a randomly drawn animal label and a random integer
# between 1 and 30 (both sampled with replacement).
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, and `<-` is used for assignment in line with the rest of the
# script.
start <- as.Date("2017-01-01")
df9 <- data.frame(
  date = seq(from = start, to = start + 99, by = "days"),
  descr = sample(x = c("cat", "dog", "mouse"), size = 100, replace = TRUE),
  val = sample(x = 1:30, size = 100, replace = TRUE)
)
head(df9, 40)
## date descr val
## 1 2017-01-01 mouse 3
## 2 2017-01-02 cat 13
## 3 2017-01-03 cat 11
## 4 2017-01-04 dog 21
## 5 2017-01-05 mouse 21
## 6 2017-01-06 cat 3
## 7 2017-01-07 cat 1
## 8 2017-01-08 cat 30
## 9 2017-01-09 dog 27
## 10 2017-01-10 cat 22
## 11 2017-01-11 mouse 13
## 12 2017-01-12 cat 18
## 13 2017-01-13 mouse 18
## 14 2017-01-14 cat 14
## 15 2017-01-15 cat 23
## 16 2017-01-16 dog 30
## 17 2017-01-17 cat 16
## 18 2017-01-18 dog 26
## 19 2017-01-19 dog 22
## 20 2017-01-20 mouse 21
## 21 2017-01-21 mouse 22
## 22 2017-01-22 cat 1
## 23 2017-01-23 dog 20
## 24 2017-01-24 cat 25
## 25 2017-01-25 mouse 7
## 26 2017-01-26 mouse 28
## 27 2017-01-27 dog 10
## 28 2017-01-28 dog 19
## 29 2017-01-29 dog 1
## 30 2017-01-30 mouse 13
## 31 2017-01-31 mouse 19
## 32 2017-02-01 cat 4
## 33 2017-02-02 mouse 5
## 34 2017-02-03 dog 5
## 35 2017-02-04 cat 17
## 36 2017-02-05 mouse 1
## 37 2017-02-06 cat 26
## 38 2017-02-07 cat 30
## 39 2017-02-08 mouse 25
## 40 2017-02-09 mouse 22
# Base plot objects: each one fixes the data and the aesthetic mapping; the
# layers added below decide how that mapping is drawn.
g <- ggplot(df9, aes(x = descr, y = val))
gg <- ggplot(df9, aes(x = val))
gg2 <- ggplot(df9, aes(x = val, fill = descr))
gg3 <- ggplot(df9, aes(x = descr, y = val, fill = descr))

# Violin plots of val for each descr level: plain, filled by level, with
# the raw points jittered horizontally on top, and with quartile lines.
g + geom_violin()
g + geom_violin(aes(fill = descr))
g + geom_violin() + geom_jitter(height = 0, width = 0.1)
g + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75))

# Histograms of val: overall, then with bars filled by descr.
gg + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gg2 + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Semi-transparent density curves per descr level: overlaid in one panel,
# then split into one row per level.
gg2 + geom_density(alpha = 0.3)
gg2 + geom_density(alpha = 0.3) + facet_grid(descr ~ .)

# Semi-transparent box plot of val for each descr level.
gg3 + geom_boxplot(alpha = 0.3)
# Box plot per descr level with each group's mean overlaid as an open
# diamond (shape 5).
# `fun` replaces `fun.y`, which has been deprecated since ggplot2 3.3.0 and
# raises a deprecation warning there; this form needs ggplot2 >= 3.3.0.
gg3 + geom_boxplot(alpha = .3) +
  stat_summary(fun = mean, geom = "point", shape = 5, size = 4)
========================================================
Author: Alessandra Santi
E-mail: santi.info@gmail.com
License: CC-BY-SA