# Load the raw (error-laden) movies data set; the file is tab-separated with a
# header row. stringsAsFactors = FALSE keeps text columns as character on every
# R version, so later recoding and numeric conversion operate on the actual
# values rather than on factor level codes.
movies.errors <- read.csv(
  "http://nathanieldphillips.com/wp-content/uploads/2016/01/movies_errors.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

Q1

# Descriptive replacements for the raw column headers, in column order.
better.names <- c(
  "name", "total.boxoffice.earnings", "dvd.us", "total.movie.budget",
  "rating", "genre", "time", "year.of.release", "sequel"
)
# Rename in a single vectorized assignment; the number of columns renamed
# follows the length of better.names instead of a hard-coded 1:9 loop.
names(movies.errors)[seq_along(better.names)] <- better.names
head(movies.errors)
##                          name total.boxoffice.earnings    dvd.us
## 1                      Avatar               2783918982 230915507
## 2                     Titanic               2207615668        NA
## 3              Jurassic World               1665443635        NA
## 4                The Avengers               1519479547 109515497
## 5                   Furious 7               1516246709  14947559
## 6 The Avengers: Age of Ultron               1404705868   7312791
##   total.movie.budget rating             genre time year.of.release sequel
## 1           4.25e+08     13            Action  162            2009      0
## 2           2.00e+08   PG13 Thriller/Suspense  194            1997      0
## 3           2.15e+08     13            Action  124            2015      1
## 4           2.25e+08     13         Adventure  143            2012      0
## 5           1.90e+08   PG13            Action  137            2014      1
## 6           2.50e+08   PG13            Action  141            2015      1

Q2

total.boxoffice.earnings seems to be ok.

# Smallest and largest box-office totals, as a quick plausibility check.
with(movies.errors, range(total.boxoffice.earnings))
## [1]   12512317 2783918982

dvd.us seems to be ok.

# Five-number summary, mean and NA count of the DVD sales column.
summary(movies.errors[["dvd.us"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##      6339   7563000  15800000  27980000  30570000 540400000      3566

total.movie.budget: A budget of zero seems to be unrealistic:

# Treat a recorded budget of exactly zero as missing.
is.na(movies.errors$total.movie.budget) <- which(
  movies.errors$total.movie.budget == 0
)

rating: The same rating is spelled in several different ways; these variants can be merged into one label each:

# Frequency of every rating label as it appears in the raw data.
table(movies.errors[["rating"]])
## 
##        13         g         G   General        GP     NC-17 Not Rated 
##       452        58        46        54         1         5       196 
##        PG      PG13     PG-13         R         X 
##       699       462       457      1489         3
# Collapse the spelling variants onto one canonical label per rating.
movies.errors$rating[movies.errors$rating %in% c("13", "PG13")] <- "PG-13"
movies.errors$rating[movies.errors$rating %in% c("General", "g")] <- "G"
movies.errors$rating[movies.errors$rating %in% "GP"] <- "PG"
movies.errors$rating[movies.errors$rating %in% "X"] <- "NC-17"
# "Not Rated" carries no rating information, so record it as missing.
is.na(movies.errors$rating) <- which(movies.errors$rating %in% "Not Rated")

genre: Fix typos and inconsistent capitalization:

# Normalize misspelled or differently capitalized genre labels.
movies.errors$genre[movies.errors$genre %in% "action"] <- "Action"
movies.errors$genre[movies.errors$genre %in% c("Comdy", "comedy")] <- "Comedy"
movies.errors$genre[movies.errors$genre %in% "drama"] <- "Drama"
movies.errors$genre[movies.errors$genre %in% "musical"] <- "Musical"
movies.errors$genre[movies.errors$genre %in% "REALITY"] <- "Reality"
movies.errors$genre[
  movies.errors$genre %in% "ROMANTIC COMEDY"
] <- "Romantic Comedy"

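time: Convert the values to numeric. Conversion alone only turns non-numeric entries into NA; implausible numbers such as -20 stay as they are and must be set to NA explicitly. (If the column was read as a factor, convert it with as.character() first, otherwise as.numeric() returns the level codes.)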

# Convert running time to numeric. Going through as.character() matters:
# as.numeric() applied directly to a factor returns the internal level codes
# rather than the values shown in the data. Entries that are not numbers
# become NA (R emits a coercion warning for them, which is expected here).
movies.errors$time <- as.numeric(as.character(movies.errors$time))
# Numeric coercion keeps values such as -20, so mark non-positive running
# times as missing explicitly. which() skips entries that are already NA.
movies.errors$time[which(movies.errors$time <= 0)] <- NA

year.of.release: There are unrealistic values (before 1900 or after 2016):

# Release years outside 1900-2016 are implausible; record them as missing.
is.na(movies.errors$year.of.release) <- which(
  movies.errors$year.of.release < 1900 | movies.errors$year.of.release > 2016
)

sequel: The same value is coded in several ways ("n"/"no" for 0, "y"/"yes" for 1):

# Map the textual yes/no variants onto the 0/1 coding used by the other rows.
movies.errors$sequel[movies.errors$sequel %in% c("n", "no")] <- 0
movies.errors$sequel[movies.errors$sequel %in% c("y", "yes")] <- 1

Q3

# Bin release years into decades: left-closed intervals [1900,1910), ...,
# [2010,2020).
movies.errors$age.decade <- cut(
  x = movies.errors$year.of.release, # The raw data
  breaks = seq(1900, 2020, 10),      # The break points of the cuts
  right = FALSE,
  dig.lab = 4 # Show four-digit years; the default of 3 prints 1.9e+03
)
table(movies.errors$age.decade)
## 
## [1900,1910) [1910,1920) [1920,1930) [1930,1940) [1940,1950) [1950,1960) 
##           0           0           1           3          21          36 
## [1960,1970) [1970,1980) [1980,1990) [1990,2000) [2000,2010) [2010,2020) 
##         123         249         639        1020        1725        1113

Q4

# Group running times into 30-minute, left-closed bins from 0 up to 240.
movies.errors[["time.30"]] <- cut(
  movies.errors[["time"]],
  breaks = seq(from = 0, to = 240, by = 30),
  right = FALSE
)
# Number of movies per bin.
table(movies.errors[["time.30"]])
## 
##    [0,30)   [30,60)   [60,90)  [90,120) [120,150) [150,180) [180,210) 
##      1053       503        93        33        39        47       122 
## [210,240) 
##       510

Q5

# Lookup table mapping each rating to its target audience.
age.df <- data.frame(
  rating = c("PG-13", "PG", "G", "R", "NC-17"),
  age = c("child", "child", "child", "adult", "adult"),
  stringsAsFactors = FALSE
)
# Attach the audience to every movie. all.x = TRUE keeps movies whose rating
# is missing or not in the lookup table (their age becomes NA); the default
# inner join would silently drop those rows from the data set.
movies.errors <- merge(
  movies.errors,
  age.df,
  by = "rating",
  all.x = TRUE
)