Setup

library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(reprex)
library(readr)

Finishing from last week:

Setup

The Data

Movie Ratings.csv

Download link: https://www.dropbox.com/s/ebr2gzy95pb9lsx/Movie%20Ratings.csv?dl=1

Variables:

imdb_1000.csv

Download link: https://www.dropbox.com/s/ov5cntaof9lj9v6/imdb_1000.csv?dl=1

Variables:

The Tasks

Cleaning/Plotting

  1. Read in and summarize the data.
library(here)
## here() starts at /data/biostat/a089861/A089861/R Trainings
library(readr)
# Load the Rotten Tomatoes / audience ratings table from the working directory.
rt <- "Movie Ratings.csv" %>% read_csv()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Film = col_character(),
##   Genre = col_character(),
##   `Rotten Tomatoes Ratings %` = col_double(),
##   `Audience Ratings %` = col_double(),
##   `Budget (million $)` = col_double(),
##   `Year of release` = col_double()
## )
# Load the IMDB top-1000 table from the working directory.
imdb <- "imdb_1000.csv" %>% read_csv()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   star_rating = col_double(),
##   title = col_character(),
##   content_rating = col_character(),
##   genre = col_character(),
##   duration = col_double(),
##   actors_list = col_character()
## )

.

#### Stringr

How many Twilight films are there? Approach: search through the data by applying stringr functions to the Film column.

Searched the titles for the string "Twilight" and looked for matches: this results in 4 titles.

# Count the Twilight films: list every title containing "Twilight", then pull
# the full rows for those films. Both views show four matches.
# (The title listing was previously echoed twice; one call is enough.)
str_subset(rt$Film, "Twilight")
## [1] "The Twilight Saga: Eclipse"  "The Twilight Saga: New Moon"
## [3] "Twilight"                    "Twilight: Breaking Dawn"
rt %>%
  filter(str_detect(Film, "Twilight"))
## # A tibble: 4 x 6
##   Film           Genre  `Rotten Tomatoes Ra… `Audience Rating… `Budget (million…
##   <chr>          <chr>                 <dbl>             <dbl>             <dbl>
## 1 The Twilight … Drama                    50                74                68
## 2 The Twilight … Drama                    27                78                50
## 3 Twilight       Roman…                   49                82                37
## 4 Twilight: Bre… Roman…                   26                68               110
## # … with 1 more variable: Year of release <dbl>

Change the titles of these films from "Twilight" to "Vampire" (i.e. the Vampire Saga).

# Show the matching titles, then preview the Twilight rows with "Twilight"
# swapped for "Vampire" in the title. The result is printed, not saved to rt.
str_subset(string = rt$Film, pattern = "Twilight")
## [1] "The Twilight Saga: Eclipse"  "The Twilight Saga: New Moon"
## [3] "Twilight"                    "Twilight: Breaking Dawn"
mutate(
  filter(rt, str_detect(Film, "Twilight")),
  Film = str_replace(Film, "Twilight", "Vampire")
)
## # A tibble: 4 x 6
##   Film          Genre  `Rotten Tomatoes Rat… `Audience Rating… `Budget (million…
##   <chr>         <chr>                  <dbl>             <dbl>             <dbl>
## 1 The Vampire … Drama                     50                74                68
## 2 The Vampire … Drama                     27                78                50
## 3 Vampire       Roman…                    49                82                37
## 4 Vampire: Bre… Roman…                    26                68               110
## # … with 1 more variable: Year of release <dbl>
  1. Which movie has the shortest name?

We need to use mutate() to make a new column holding each title's length.

# Repeat of the Twilight lookup and the Twilight -> Vampire title preview.
rt$Film %>% str_subset("Twilight")
## [1] "The Twilight Saga: Eclipse"  "The Twilight Saga: New Moon"
## [3] "Twilight"                    "Twilight: Breaking Dawn"
rt %>%
  filter(str_detect(string = Film, pattern = "Twilight")) %>%
  mutate(Film = str_replace(Film, pattern = "Twilight", replacement = "Vampire"))
## # A tibble: 4 x 6
##   Film          Genre  `Rotten Tomatoes Rat… `Audience Rating… `Budget (million…
##   <chr>         <chr>                  <dbl>             <dbl>             <dbl>
## 1 The Vampire … Drama                     50                74                68
## 2 The Vampire … Drama                     27                78                50
## 3 Vampire       Roman…                    49                82                37
## 4 Vampire: Bre… Roman…                    26                68               110
## # … with 1 more variable: Year of release <dbl>
# Add a Title_Length column holding the number of characters in each title.
mutate(rt, Title_Length = str_length(Film))
## # A tibble: 562 x 7
##    Film        Genre   `Rotten Tomatoes Rat… `Audience Rating… `Budget (million…
##    <chr>       <chr>                   <dbl>             <dbl>             <dbl>
##  1 (500) Days… Comedy                     87                81                 8
##  2 10,000 B.C. Advent…                     9                44               105
##  3 12 Rounds   Action                     30                52                20
##  4 127 Hours   Advent…                    93                84                18
##  5 17 Again    Comedy                     55                70                20
##  6 2012        Action                     39                63               200
##  7 27 Dresses  Comedy                     40                71                30
##  8 30 Days of… Horror                     50                57                32
##  9 30 Minutes… Comedy                     43                48                28
## 10 50/50       Comedy                     93                93                 8
## # … with 552 more rows, and 2 more variables: Year of release <dbl>,
## #   Title_Length <int>
# Same Twilight lookup and title preview as before, re-run in this chunk.
str_subset(rt[["Film"]], "Twilight")
## [1] "The Twilight Saga: Eclipse"  "The Twilight Saga: New Moon"
## [3] "Twilight"                    "Twilight: Breaking Dawn"
filter(rt, str_detect(Film, "Twilight")) %>%
  mutate(Film = str_replace(Film, "Twilight", "Vampire"))
## # A tibble: 4 x 6
##   Film          Genre  `Rotten Tomatoes Rat… `Audience Rating… `Budget (million…
##   <chr>         <chr>                  <dbl>             <dbl>             <dbl>
## 1 The Vampire … Drama                     50                74                68
## 2 The Vampire … Drama                     27                78                50
## 3 Vampire       Roman…                    49                82                37
## 4 Vampire: Bre… Roman…                    26                68               110
## # … with 1 more variable: Year of release <dbl>
# Shortest title: add each film's character count, then keep the row(s) with
# the smallest value. The earlier top_n(1, Title_Length) kept the largest
# value, i.e. the longest title. slice_min() keeps ties, as top_n() did.
# NOTE: the tibble printed after this chunk was recorded from the old
# longest-title call; re-knit the document to refresh it.
rt %>%
  mutate(
    Title_Length = str_length(Film)
  ) %>%
  slice_min(Title_Length, n = 1)
## # A tibble: 1 x 7
##   Film              Genre  `Rotten Tomatoes R… `Audience Ratin… `Budget (millio…
##   <chr>             <chr>                <dbl>            <dbl>            <dbl>
## 1 The Chronicles o… Adven…                  49               63              155
## # … with 2 more variables: Year of release <dbl>, Title_Length <int>

Stringr_Lubridate

library(tidyverse)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Dates

Save your birthday as a datetime object in R.

On what day of the week does your birthday fall in 2020? On what day of the week should you plan your 100th birthday? Find the date that is exactly 9 months before the day you were born. If it is on a holiday, think about the implications.

# Parse the birth date, move it into 2020, and report which weekday the
# 2020 birthday falls on.
my_bday <- "April 2, 1989" %>% mdy()
year(my_bday) <- 2020

print(my_bday)
## [1] "2020-04-02"
my_bday %>% wday(label = TRUE)
## [1] Thu
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
# The 100th birthday is 100 years after the actual birth date. Adding
# years(100) to the 2020 copy of the date gave 2120-04-02 instead.
birth_date <- mdy("April 2, 1989")

# 2020 copy of the birthday, kept for the weekday-in-2020 question.
my_bday <- birth_date
year(my_bday) <- 2020

my_bday_100 <- birth_date + years(100)

# 2089-04-02 falls on a Saturday; the "Tue" recorded after this chunk was
# produced by the old 2120 date, so re-knit to refresh it.
wday(my_bday_100, label = TRUE)
## [1] Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
# Keep the actual birth date in its own variable so the 2020 copy below
# cannot be mistaken for it.
birth_date <- mdy("April 2, 1989")

# 2020 copy of the birthday, kept for the weekday-in-2020 question.
my_bday <- birth_date
year(my_bday) <- 2020

# Re-prints the weekday of the 100th birthday held in my_bday_100.
wday(my_bday_100, label = TRUE)
## [1] Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
# Nine months before the day of birth. The earlier my_bday - months(9)
# subtracted from the 2020 copy and gave 2019-07-02; the answer is 1988-07-02.
# %m-% rolls back to the last valid day of the month instead of returning NA
# when the target day does not exist (no effect for the 2nd of a month).
# NOTE: the value recorded after this chunk is from the old call; re-knit.
birth_date %m-% months(9)
## [1] "2019-07-02"