from book ggplot2 essential movie examples simplified

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.4

library(ggplot2movies) # Набор данных movies вынесен из новых версий ggplot2
library(dplyr)

## 
## Attaching package: 'dplyr'

## Следующие объекты скрыты от 'package:stats':
## 
##     filter, lag

## Следующие объекты скрыты от 'package:base':
## 
##     intersect, setdiff, setequal, union

Наборы данных, используемые в примерах

Movie dataset (пакет ggplot2)

Набор данных movies переместили в отдельный пакет ggplot2movies из пакета ggplot2 в последних версиях.

Описание из пакета `ggplot2movies`:

A data frame with 28819 rows and 24 variables

title. Title of the movie.
year. Year of release.
budget. Total budget (if known) in US dollars
length. Length in minutes.
rating. Average IMDB user rating.
votes. Number of IMDB users who rated this movie.
r1-10. Multiplying by ten gives percentile (to nearest 10%) of users who rated this movie a 1.
mpaa. MPAA rating.
action, animation, comedy, drama, documentary, romance, short. Binary variables representing if movie was classified as belonging to that genre.

glimpse(movies)

## Observations: 58,788
## Variables: 24
## $ title       (chr) "$", "$1000 a Touchdown", "$21 a Day Once a Month"...
## $ year        (int) 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19...
## $ length      (int) 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10...
## $ budget      (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rating      (dbl) 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, ...
## $ votes       (int) 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44...
## $ r1          (dbl) 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2          (dbl) 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,...
## $ r3          (dbl) 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4          (dbl) 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4....
## $ r5          (dbl) 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, ...
## $ r6          (dbl) 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,...
## $ r7          (dbl) 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, ...
## $ r8          (dbl) 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4...
## $ r9          (dbl) 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4....
## $ r10         (dbl) 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24....
## $ mpaa        (chr) "", "", "", "", "", "", "R", "", "", "", "", "", "...
## $ Action      (int) 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ Animation   (int) 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy      (int) 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,...
## $ Drama       (int) 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ Documentary (int) 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance     (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Short       (int) 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...

d1 <-data.frame(movies[movies$Action==1, c("budget", "Short",
"year")])
d1$Type <- "Animation"
d2 <-data.frame(movies[movies$Animation==1, c("budget", "Short",
"year")])
d2$Type <- "Animation"
d3 <-data.frame(movies[movies$Comedy==1, c("budget", "Short",
"year")])
d3$Type <- "Comedy"
d4 <-data.frame(movies[movies$Drama==1, c("budget", "Short",
"year")])
d4$Type <- "Drama"
d5 <-data.frame(movies[movies$Documentary==1, c("budget", "Short",
"year")])
d5$Type <- "Documentary"
d6 <-data.frame(movies[movies$Romance==1, c("budget", "Short",
"year")])
d6$Type <- "Romance"
myMovieData <- rbind(d1, d2, d3, d4, d5, d6)
names(myMovieData) <- c("Budget", "Short", "Year", "Type" )

Реструктуризованный набор данных (по жанрам)

glimpse(myMovieData)

## Observations: 55,676
## Variables: 4
## $ Budget (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Short  (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ Year   (int) 2002, 1983, 1987, 1988, 1974, 1975, 1981, 1990, 1970, 1...
## $ Type   (chr) "Animation", "Animation", "Animation", "Animation", "An...

Bar plots

ggplot(myMovieData,aes(Type,fill=Type)) + geom_bar()

Порядок столбцов

Для номинальных данных прядок расположения на оси X можно задать при помощи изменения порядка уровней (levels) в столбце типа фактор.

Упорядочим уровни (factor levels) для получения нужного порядка столбцов (по убыванию количества фильмов)

# myMovieData_o_by_cnt <- mutate(myMovieData,Type ,)
newLevelsMovieType <- sort(table(myMovieData$Type),decreasing = T)
newLevelsMovieType

## 
##       Drama      Comedy   Animation     Romance Documentary 
##       21811       17271        8378        4744        3472

myMovieData_o_by_cnt <-
  mutate(myMovieData,
        Type=factor(Type,levels = names(newLevelsMovieType)))
ggplot(myMovieData_o_by_cnt,aes(Type,fill=Type)) + geom_bar()

Цвета столбцов зависят от целых значений, которыми закодированы уровни столбца-фактора и палитры шклалы fill (запалнения).

bar_brewer <- ggplot(myMovieData_o_by_cnt,aes(Type,fill=Type)) + 
  geom_bar() + scale_fill_brewer(palette="PiYG")
bar_brewer

bar_brewer + scale_fill_brewer(palette="BuPu")

## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.

ggplot(myMovieData_o_by_cnt,aes(Type,fill=
                factor(Type,levels=rev(levels(Type))))) + 
  geom_bar() + scale_fill_brewer(palette="PuBu")

Раположение прямоугольников, соответствующих одному значению X

3 варианта position_adjustment для geom_bar(position="...") :

stack – один над другим
dodge – один рядом с другим
fill – заполняют всю высоту

gbar <-  ggplot(data=myMovieData_o_by_cnt,
                aes(x=Type, fill=factor(Short)))
gbar + geom_bar(position="stack") # default

gbar + geom_bar(position="dodge")

gbar + geom_bar(position="fill")

dist <- data.frame(value=rnorm(10000, 1:4), group=1:4)
ggplot(dist, aes(x=group, y=value, color=group)) +
geom_jitter(alpha=0.2,shape=21)

ggplot(dist, aes(x=group, y=value, color=factor(group))) +
geom_jitter(alpha=0.2,shape=21,width=0.1) +scale_color_brewer(palette="YlOrRd")

Пример с Boxplot

Шкалы и статистики

Удаляем неполные строки данных (с NA)

myMovieData <- na.omit(myMovieData)
glimpse(myMovieData)

## Observations: 5,977
## Variables: 4
## $ Budget (int) 85000000, 76000000, 200000, 62000000, 1500, 28000000, 8...
## $ Short  (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...
## $ Year   (int) 1999, 2003, 1916, 2001, 1996, 2001, 2000, 2004, 1958, 2...
## $ Type   (chr) "Animation", "Animation", "Animation", "Animation", "An...

Сгруппируем по типу и подсчитаем средний бюджет

myMovieByTypeByShort <- 
myMovieData %>% 
  group_by(Type,Short) %>% 
  summarise(meanBudget=mean(Budget),sumBudget=sum(as.numeric(Budget)))

ggBudgetsByType <- 
  ggplot(myMovieByTypeByShort,aes(x=Type,y=sumBudget,fill=factor(Short))) +
  geom_bar(stat="identity",position="dodge")
ggBudgetsByType

ggBudgetsByType +scale_y_log10()

Замечание 1 geom_bar() неправильно воспринимается с логарифмической шкалой по Y.

Замечание 2 geom_bar(binwidth=1) с ныне устаревшим параметром использовался для построения столбцового графика по качественной переменной на оси X. Теперь для количественной шкалы X используется geom_histogram()

ggplot(data=myMovieData,
aes(Type,Budget)) +
 geom_jitter() + 
  geom_boxplot(alpha=I(0.6))+
  scale_y_log10()

## Warning: Removed 16 rows containing non-finite values (stat_boxplot).

ggplot(data=myMovieData,
aes(Type,Budget)) +
  geom_jitter() + 
   geom_boxplot(alpha=I(0.6))+
  scale_y_log10()

## Warning: Removed 16 rows containing non-finite values (stat_boxplot).

Цветовые шкалы

выдержка из help(package="RColorBrewer"):

ColorBrewer is Copyright (c) 2002 Cynthia Brewer, Mark Harrower, and The Pennsylvania State University. All rights reserved. The ColorBrewer palettes have been included in this R package with permission of the copyright holder. `

There are 3 types of palettes, sequential, diverging, and qualitative.

*Sequential palettes are suited to ordered data that progress from low to high. Lightness steps dominate the look of these schemes, with light colors for low data values to dark colors for high data values**.
Diverging palettes put equal emphasis on mid-range critical values and extremes at both ends of the data range. The critical class or break in the middle of the legend is emphasized with light colors and low and high extremes are emphasized with dark colors that have contrasting hues.
Qualitative palettes do not imply magnitude differences between legend classes, and hues are used to create the primary visual differences between classes. Qualitative schemes are best suited to representing nominal or categorical data.

 library(RColorBrewer)
 display.brewer.all(type = "seq")   ## количественные переменные -- правый край шкалы тёмный

 display.brewer.all(type = "div")   ## количественные переменные -- края шкалы темные

 display.brewer.all(type = "qual")  ## качественные переменные -- контрастные цвета

gbar_cnt  <-
  ggplot(myMovieData_o_by_cnt,aes(Type,fill=..count..)) +
     geom_bar()
gbar_cnt

Параметр alpha – Непрозрачность

movieByYearByType <- myMovieData %>%  group_by(Year,Type) %>% 
  summarise(meanBudget=mean(as.numeric(Budget))) %>% 
  arrange(Type,Year)
ggarea1 <- filter(movieByYearByType,Type %in% c('Drama','Comedy','Animation')) %>% 
  ggplot(data=., aes(x=Year,y=meanBudget,fill=Type)) + geom_area()
ggarea1

ggarea1+ scale_y_log10()

ggarea1+geom_area(alpha=0.5) + scale_y_log10()

ggplot(data=ToothGrowth, aes(x=dose, y=len, col=supp)) +
geom_point()+stat_smooth(method="lm",se=F)

ggplot(data=ToothGrowth, aes(x=dose, y=len, col=supp)) +
geom_point() + stat_smooth() + facet_grid(.~supp,margins=TRUE)

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.4925

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.5075

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.5661e-16

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.2726

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 0.4925

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.5075

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 1.5661e-16

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 2.2726

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.4925

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.5075

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.5661e-16

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.2726

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 0.4925

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.5075

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 1.5661e-16

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 2.2726

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.4925

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.5075

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 2.5631e-16

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.2726

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 0.4925

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.5075

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 2.5631e-16

## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 2.2726

gfacet2  <- ggplot(data=myMovieData, aes(x=Year, y=Budget)) +
geom_point(shape=21,alpha=0.1,fill="light green") + theme_bw()
gfacet2 + facet_wrap(~ Type,ncol=2,)

gfacet2

gfacet2 + facet_grid(Type~Short)

gfacet2  <- ggplot(data=myMovieData, aes(x=Year, y=Budget)) +
geom_point(shape=21,alpha=0.1,fill="light green") + theme_bw()
gfacet2 + facet_wrap(~ Type,ncol=2,)

gfacet2

gfacet2 + facet_grid(Type~Short,margins = T)

from book ‘ggplot2 essential’ movie examples simplified

Илья Кочергин

29 апреля 2016 г

Наборы данных, используемые в примерах

Movie dataset (пакет ggplot2)

Описание из пакета `ggplot2movies`:

Bar plots

Порядок столбцов

Раположение прямоугольников, соответствующих одному значению X

Пример с Boxplot

Шкалы и статистики

Цветовые шкалы

Параметр alpha – Непрозрачность

from book ‘ggplot2 essential’ movie examples simplified

Илья Кочергин

29 апреля 2016 г

Наборы данных, используемые в примерах

Movie dataset (пакет ggplot2)

Описание из пакета ggplot2movies:

Bar plots

Порядок столбцов

Раположение прямоугольников, соответствующих одному значению X

Пример с Boxplot

Шкалы и статистики

Цветовые шкалы

Параметр alpha – Непрозрачность

Описание из пакета `ggplot2movies`: