# Course webpage
# Install the tidyverse only when it is not already available, so that
# re-running the script does not download and reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
##
## The downloaded binary packages are in
## /var/folders/79/1y_t9vcx3ws9shyf4nd1vblc0000gn/T//RtmpY9lUBM/downloaded_packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# dplyr::filter() masks stats::filter() once the tidyverse is attached; the
# namespace-qualified name still reaches the original. Evaluating it without
# parentheses prints the function's source.
stats::filter # the filter function from the stats package
## function (x, filter, method = c("convolution", "recursive"),
## sides = 2L, circular = FALSE, init = NULL)
## {
## method <- match.arg(method)
## x <- as.ts(x)
## storage.mode(x) <- "double"
## xtsp <- tsp(x)
## n <- as.integer(NROW(x))
## if (is.na(n))
## stop(gettextf("invalid value of %s", "NROW(x)"), domain = NA)
## nser <- NCOL(x)
## filter <- as.double(filter)
## nfilt <- as.integer(length(filter))
## if (is.na(nfilt))
## stop(gettextf("invalid value of %s", "length(filter)"),
## domain = NA)
## if (anyNA(filter))
## stop("missing values in 'filter'")
## if (method == "convolution") {
## if (nfilt > n)
## stop("'filter' is longer than time series")
## sides <- as.integer(sides)
## if (is.na(sides) || (sides != 1L && sides != 2L))
## stop("argument 'sides' must be 1 or 2")
## circular <- as.logical(circular)
## if (is.na(circular))
## stop("'circular' must be logical and not NA")
## if (is.matrix(x)) {
## y <- matrix(NA, n, nser)
## for (i in seq_len(nser)) y[, i] <- .Call(C_cfilter,
## x[, i], filter, sides, circular)
## }
## else y <- .Call(C_cfilter, x, filter, sides, circular)
## }
## else {
## if (missing(init)) {
## init <- matrix(0, nfilt, nser)
## }
## else {
## ni <- NROW(init)
## if (ni != nfilt)
## stop("length of 'init' must equal length of 'filter'")
## if (NCOL(init) != 1L && NCOL(init) != nser) {
## stop(sprintf(ngettext(nser, "'init' must have %d column",
## "'init' must have 1 or %d columns", domain = "R-stats"),
## nser), domain = NA)
## }
## if (!is.matrix(init))
## dim(init) <- c(nfilt, nser)
## }
## ind <- seq_len(nfilt)
## if (is.matrix(x)) {
## y <- matrix(NA, n, nser)
## for (i in seq_len(nser)) y[, i] <- .Call(C_rfilter,
## x[, i], filter, c(rev(init[, i]), double(n)))[-ind]
## }
## else y <- .Call(C_rfilter, x, filter, c(rev(init[, 1L]),
## double(n)))[-ind]
## }
## tsp(y) <- xtsp
## class(y) <- if (nser > 1L)
## c("mts", "ts")
## else "ts"
## y
## }
## <bytecode: 0x7fc78c925038>
## <environment: namespace:stats>
# Load the movie metadata CSV straight from its GitHub URL.
dat <- read_csv(
  "https://github.com/Godoy/imdb-5000-movie-dataset/raw/master/data/movie_metadata.csv"
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## color = col_character(),
## director_name = col_character(),
## actor_2_name = col_character(),
## genres = col_character(),
## actor_1_name = col_character(),
## movie_title = col_character(),
## actor_3_name = col_character(),
## plot_keywords = col_character(),
## movie_imdb_link = col_character(),
## language = col_character(),
## country = col_character(),
## content_rating = col_character()
## )
## See spec(...) for full column specifications.
# Evaluate the object at top level to print it and inspect what was loaded.
dat
# Print a one-line description of two values.
#
# x, y: values to show; they are pasted into the message with paste(),
#       which separates the pieces with single spaces.
# Returns (invisibly, via print()) the character string that was printed.
foo <- function(x, y) {
  described <- paste("x =", x, ", y =", y)
  print(described)
}
# A direct call and a piped call are equivalent: `%>%` passes its left-hand
# side as the first argument of the call on its right.
foo(x = 2, y = 3)
## [1] "x = 2 , y = 3"
2 %>% foo(y = 3)
## [1] "x = 2 , y = 3"
# A chain of pipes reads left to right; the nested call below computes the
# same value inside-out.
2 %>%
  sqrt() %>%
  sin()
## [1] 0.9877659
sin(sqrt(2))
## [1] 0.9877659
# Keep a handful of columns by name.
dat %>%
  select(color, director_name, duration, imdb_score)

# The same verb called directly, without the pipe.
select(dat, color, director_name, duration)

# A contiguous range of columns, from `color` through `duration`.
dat %>%
  select(color:duration)

# Everything except that range.
dat %>%
  select(-(color:duration))

# Columns whose names end with "_likes".
dat %>%
  select(ends_with("_likes"))

# Columns whose names match a regular expression (here: contain an underscore).
dat %>%
  select(matches(".*_.*"))

# Open the help page for select().
?select
# Rows must satisfy both conditions: comma-separated filter() arguments are
# combined with AND.
dat %>%
  select(color:duration) %>%
  filter(
    director_name == "James Cameron",
    duration > 170
  )

# Rows satisfying either condition: `|` is the element-wise OR.
dat %>%
  select(color:duration) %>%
  filter(
    director_name == "James Cameron" |
      duration > 300
  )

# One director's films, longest first.
dat %>%
  select(color:duration) %>%
  filter(director_name == "James Cameron") %>%
  arrange(desc(duration))

# Sort by several keys: `color` ascending, then `duration` descending.
dat %>%
  select(color, duration) %>%
  arrange(color, desc(duration))
# Per-director summary: mean duration, largest critic-review count and number
# of rows, ordered by mean duration (descending).
# mean() and max() are called without na.rm, so a group that contains a
# missing value gets NA for that statistic.
dat %>%
  select(color:duration) %>%
  group_by(director_name) %>%
  summarise(
    mean_dur = mean(duration),
    max_num_critic = max(num_critic_for_reviews),
    n = n()
  ) %>%
  arrange(desc(mean_dur))
# Add a derived column next to the existing ones.
dat %>%
  select(color:duration) %>%
  mutate(
    critics_per_minute = num_critic_for_reviews / duration
  )
# Grouped mutate(): every row keeps its place but receives per-director
# values (row count, mean duration, deviation from that mean).
# Afterwards: keep directors with more than three rows, drop the grouping,
# discard rows whose deviation is missing, and average the squared deviations.
dat %>%
  select(color:duration) %>%
  group_by(director_name) %>%
  mutate(
    n = n(),
    mean_duration = mean(duration),
    diff_from_average = duration - mean(duration)
  ) %>%
  filter(n > 3) %>%
  arrange(director_name) %>%
  ungroup() %>%
  filter(!is.na(diff_from_average)) %>% # `!` means "not"
  summarise(
    mean_diff_from_average = mean(diff_from_average^2)
  )
# mutate() keeps the selected columns and appends the new one ...
dat %>%
  select(director_name:duration) %>%
  mutate(
    critics_per_minute = num_critic_for_reviews / duration
  )

# ... whereas transmute() returns only the newly computed column.
dat %>%
  select(director_name:duration) %>%
  transmute(
    critics_per_minute = num_critic_for_reviews / duration
  )
# A small data frame whose rows are labelled by name.
new_dat <- data.frame(
  x = c(1, 2, 3),
  y = c(2, 3, 4),
  row.names = c("alpha", "beta", "gamma")
)

# Turn the row names into an ordinary column called "index".
new_dat %>%
  rownames_to_column(var = "index")
# Long-format gradebook: one row per (student, course) pair.
gradebook <- data.frame(
  first_name = c("Alice", "Alice", "Bob"),
  last_name = c("Smith", "Smith", "Johnson"),
  course = c("Algebra", "Geometry", "Algebra"),
  midterm_grade = c(3, 5, 4),
  final_grade = c(4, 3, 5)
)
gradebook

# Spread the courses into columns, one per (grade type, course) combination.
# Combinations absent from the data (Bob has no Geometry row) are filled with
# 0 for the midterm grade and 1 for the final grade.
gradebook %>%
  pivot_wider(
    names_from = course,
    values_from = c(midterm_grade, final_grade),
    values_fill = list(midterm_grade = 0, final_grade = 1)
  )
# Average each actor-likes column per director, then stack the three averaged
# columns into long format. The former column names go into a column named
# "likes"; no values_to is given, so the values column keeps its default name.
dat %>%
  select(director_name, matches("actor.*facebook_likes")) %>%
  group_by(director_name) %>%
  summarise(
    actor_1_facebook_likes = mean(actor_1_facebook_likes),
    actor_2_facebook_likes = mean(actor_2_facebook_likes),
    actor_3_facebook_likes = mean(actor_3_facebook_likes)
  ) %>%
  pivot_longer(
    cols = matches("actor.*_facebook_likes"),
    names_to = "likes"
  )
# Same gradebook with a duplicate key: Alice Smith now has two Algebra rows.
gradebook <- data.frame(
  first_name = c("Alice", "Alice", "Bob", "Alice"),
  last_name = c("Smith", "Smith", "Johnson", "Smith"),
  course = c("Algebra", "Geometry", "Algebra", "Algebra"),
  midterm_grade = c(3, 5, 4, 5),
  final_grade = c(4, 3, 5, 5)
)
gradebook

# values_fn says how to collapse several values landing in the same cell:
# the maximum for midterm grades, the mean for final grades.
# values_fill still supplies the value for cells with no data at all.
gradebook %>%
  pivot_wider(
    names_from = course,
    values_from = c(midterm_grade, final_grade),
    values_fill = list(midterm_grade = 0, final_grade = 1),
    values_fn = list(midterm_grade = max, final_grade = mean)
  )
# Two lookup tables whose key columns have different names (`id` vs `idx`)
# and only partially overlapping key values.
student_to_id <- data.frame(
  student = c("Alice", "Bob", "Claudia"),
  id = c(5, 9, 2)
)
id_to_grade <- data.frame(
  idx = c(9, 2, 10),
  grade = c(10, 8, 3)
)
student_to_id
id_to_grade

# Left join: every student is kept; a student without a matching idx gets NA.
left_join(student_to_id, id_to_grade, by = c("id" = "idx"))

# Inner join: only students whose id also appears as an idx are kept.
inner_join(student_to_id, id_to_grade, by = c("id" = "idx"))