library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(ggplot2movies) # Набор данных movies вынесен из новых версий ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## Следующие объекты скрыты от 'package:stats':
##
## filter, lag
## Следующие объекты скрыты от 'package:base':
##
## intersect, setdiff, setequal, union
Набор данных movies переместили в отдельный пакет ggplot2movies из пакета ggplot2 в последних версиях.
ggplot2movies:A data frame with 28819 rows and 24 variables
title. Title of the movie.
year. Year of release.
budget. Total budget (if known) in US dollars
length. Length in minutes.
rating. Average IMDB user rating.
votes. Number of IMDB users who rated this movie.
r1-10. Multiplying by ten gives percentile (to nearest 10%) of users who rated this movie a 1.
mpaa. MPAA rating.
action, animation, comedy, drama, documentary, romance, short. Binary variables representing if movie was classified as belonging to that genre.
glimpse(movies)
## Observations: 58,788
## Variables: 24
## $ title (chr) "$", "$1000 a Touchdown", "$21 a Day Once a Month"...
## $ year (int) 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19...
## $ length (int) 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10...
## $ budget (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rating (dbl) 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, ...
## $ votes (int) 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44...
## $ r1 (dbl) 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2 (dbl) 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,...
## $ r3 (dbl) 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4 (dbl) 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4....
## $ r5 (dbl) 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, ...
## $ r6 (dbl) 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,...
## $ r7 (dbl) 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, ...
## $ r8 (dbl) 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4...
## $ r9 (dbl) 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4....
## $ r10 (dbl) 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24....
## $ mpaa (chr) "", "", "", "", "", "", "R", "", "", "", "", "", "...
## $ Action (int) 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ Animation (int) 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy (int) 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,...
## $ Drama (int) 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ Documentary (int) 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Short (int) 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...
d1 <-data.frame(movies[movies$Action==1, c("budget", "Short",
"year")])
d1$Type <- "Animation"
d2 <-data.frame(movies[movies$Animation==1, c("budget", "Short",
"year")])
d2$Type <- "Animation"
d3 <-data.frame(movies[movies$Comedy==1, c("budget", "Short",
"year")])
d3$Type <- "Comedy"
d4 <-data.frame(movies[movies$Drama==1, c("budget", "Short",
"year")])
d4$Type <- "Drama"
d5 <-data.frame(movies[movies$Documentary==1, c("budget", "Short",
"year")])
d5$Type <- "Documentary"
d6 <-data.frame(movies[movies$Romance==1, c("budget", "Short",
"year")])
d6$Type <- "Romance"
myMovieData <- rbind(d1, d2, d3, d4, d5, d6)
names(myMovieData) <- c("Budget", "Short", "Year", "Type" )
Реструктуризованный набор данных (по жанрам)
glimpse(myMovieData)
## Observations: 55,676
## Variables: 4
## $ Budget (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ Short (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ Year (int) 2002, 1983, 1987, 1988, 1974, 1975, 1981, 1990, 1970, 1...
## $ Type (chr) "Animation", "Animation", "Animation", "Animation", "An...
ggplot(myMovieData,aes(Type,fill=Type)) + geom_bar()
Для номинальных данных прядок расположения на оси X можно задать при помощи изменения порядка уровней (levels) в столбце типа фактор.
Упорядочим уровни (factor levels) для получения нужного порядка столбцов (по убыванию количества фильмов)
# myMovieData_o_by_cnt <- mutate(myMovieData,Type ,)
newLevelsMovieType <- sort(table(myMovieData$Type),decreasing = T)
newLevelsMovieType
##
## Drama Comedy Animation Romance Documentary
## 21811 17271 8378 4744 3472
myMovieData_o_by_cnt <-
mutate(myMovieData,
Type=factor(Type,levels = names(newLevelsMovieType)))
ggplot(myMovieData_o_by_cnt,aes(Type,fill=Type)) + geom_bar()
Цвета столбцов зависят от целых значений, которыми закодированы уровни столбца-фактора и палитры шклалы fill (запалнения).
bar_brewer <- ggplot(myMovieData_o_by_cnt,aes(Type,fill=Type)) +
geom_bar() + scale_fill_brewer(palette="PiYG")
bar_brewer
bar_brewer + scale_fill_brewer(palette="BuPu")
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
ggplot(myMovieData_o_by_cnt,aes(Type,fill=
factor(Type,levels=rev(levels(Type))))) +
geom_bar() + scale_fill_brewer(palette="PuBu")
3 варианта position_adjustment для geom_bar(position="...") :
stack – один над другим
dodge – один рядом с другим
fill – заполняют всю высоту
gbar <- ggplot(data=myMovieData_o_by_cnt,
aes(x=Type, fill=factor(Short)))
gbar + geom_bar(position="stack") # default
gbar + geom_bar(position="dodge")
gbar + geom_bar(position="fill")
dist <- data.frame(value=rnorm(10000, 1:4), group=1:4)
ggplot(dist, aes(x=group, y=value, color=group)) +
geom_jitter(alpha=0.2,shape=21)
ggplot(dist, aes(x=group, y=value, color=factor(group))) +
geom_jitter(alpha=0.2,shape=21,width=0.1) +scale_color_brewer(palette="YlOrRd")
Удаляем неполные строки данных (с NA)
myMovieData <- na.omit(myMovieData)
glimpse(myMovieData)
## Observations: 5,977
## Variables: 4
## $ Budget (int) 85000000, 76000000, 200000, 62000000, 1500, 28000000, 8...
## $ Short (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...
## $ Year (int) 1999, 2003, 1916, 2001, 1996, 2001, 2000, 2004, 1958, 2...
## $ Type (chr) "Animation", "Animation", "Animation", "Animation", "An...
Сгруппируем по типу и подсчитаем средний бюджет
myMovieByTypeByShort <-
myMovieData %>%
group_by(Type,Short) %>%
summarise(meanBudget=mean(Budget),sumBudget=sum(as.numeric(Budget)))
ggBudgetsByType <-
ggplot(myMovieByTypeByShort,aes(x=Type,y=sumBudget,fill=factor(Short))) +
geom_bar(stat="identity",position="dodge")
ggBudgetsByType
ggBudgetsByType +scale_y_log10()
Замечание 1 geom_bar() неправильно воспринимается с логарифмической шкалой по Y.
Замечание 2 geom_bar(binwidth=1) с ныне устаревшим параметром использовался для построения столбцового графика по качественной переменной на оси X. Теперь для количественной шкалы X используется geom_histogram()
ggplot(data=myMovieData,
aes(Type,Budget)) +
geom_jitter() +
geom_boxplot(alpha=I(0.6))+
scale_y_log10()
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).
ggplot(data=myMovieData,
aes(Type,Budget)) +
geom_jitter() +
geom_boxplot(alpha=I(0.6))+
scale_y_log10()
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).
выдержка из help(package="RColorBrewer"):
ColorBrewer is Copyright (c) 2002 Cynthia Brewer, Mark Harrower, and The Pennsylvania State University. All rights reserved. The ColorBrewer palettes have been included in this R package with permission of the copyright holder. `
There are 3 types of palettes, sequential, diverging, and qualitative.
library(RColorBrewer)
display.brewer.all(type = "seq") ## количественные переменные -- правый край шкалы тёмный
display.brewer.all(type = "div") ## количественные переменные -- края шкалы темные
display.brewer.all(type = "qual") ## качественные переменные -- контрастные цвета
gbar_cnt <-
ggplot(myMovieData_o_by_cnt,aes(Type,fill=..count..)) +
geom_bar()
gbar_cnt
movieByYearByType <- myMovieData %>% group_by(Year,Type) %>%
summarise(meanBudget=mean(as.numeric(Budget))) %>%
arrange(Type,Year)
ggarea1 <- filter(movieByYearByType,Type %in% c('Drama','Comedy','Animation')) %>%
ggplot(data=., aes(x=Year,y=meanBudget,fill=Type)) + geom_area()
ggarea1
ggarea1+ scale_y_log10()
ggarea1+geom_area(alpha=0.5) + scale_y_log10()
ggplot(data=ToothGrowth, aes(x=dose, y=len, col=supp)) +
geom_point()+stat_smooth(method="lm",se=F)
ggplot(data=ToothGrowth, aes(x=dose, y=len, col=supp)) +
geom_point() + stat_smooth() + facet_grid(.~supp,margins=TRUE)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.4925
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.5075
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.5661e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.2726
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 0.4925
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.5075
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 1.5661e-16
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 2.2726
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.4925
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.5075
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.5661e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.2726
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 0.4925
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.5075
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 1.5661e-16
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 2.2726
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 0.4925
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.5075
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 2.5631e-16
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.2726
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 0.4925
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 1.5075
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 2.5631e-16
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 2.2726
gfacet2 <- ggplot(data=myMovieData, aes(x=Year, y=Budget)) +
geom_point(shape=21,alpha=0.1,fill="light green") + theme_bw()
gfacet2 + facet_wrap(~ Type,ncol=2,)
gfacet2
gfacet2 + facet_grid(Type~Short)
gfacet2 <- ggplot(data=myMovieData, aes(x=Year, y=Budget)) +
geom_point(shape=21,alpha=0.1,fill="light green") + theme_bw()
gfacet2 + facet_wrap(~ Type,ncol=2,)
gfacet2
gfacet2 + facet_grid(Type~Short,margins = T)