# Load the tab-separated movies data set; the first row holds column names.
# Text columns are kept as character vectors rather than factors, so the
# cleaning steps further down can overwrite values freely and `as.numeric()`
# parses the text itself instead of returning factor level codes.
movies.errors <- read.csv(
  "http://nathanieldphillips.com/wp-content/uploads/2016/01/movies_errors.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
# Descriptive replacements for the raw column headers, in column order.
better.names <- c(
  "name", "total.boxoffice.earnings", "dvd.us", "total.movie.budget",
  "rating", "genre", "time", "year.of.release", "sequel"
)
# Rename the leading columns in one vectorised assignment; the number of
# columns renamed follows `better.names` instead of a hard-coded 9.
names(movies.errors)[seq_along(better.names)] <- better.names
# Preview the first rows to confirm the new headers.
head(movies.errors)
## name total.boxoffice.earnings dvd.us
## 1 Avatar 2783918982 230915507
## 2 Titanic 2207615668 NA
## 3 Jurassic World 1665443635 NA
## 4 The Avengers 1519479547 109515497
## 5 Furious 7 1516246709 14947559
## 6 The Avengers: Age of Ultron 1404705868 7312791
## total.movie.budget rating genre time year.of.release sequel
## 1 4.25e+08 13 Action 162 2009 0
## 2 2.00e+08 PG13 Thriller/Suspense 194 1997 0
## 3 2.15e+08 13 Action 124 2015 1
## 4 2.25e+08 13 Adventure 143 2012 0
## 5 1.90e+08 PG13 Action 137 2014 1
## 6 2.50e+08 PG13 Action 141 2015 1
total.boxoffice.earnings seems to be ok.
# Sanity check: smallest and largest value in the box-office earnings column.
range(movies.errors[["total.boxoffice.earnings"]])
## [1] 12512317 2783918982
dvd.us seems to be ok.
# Quartiles, mean and NA count of the DVD column.
summary(movies.errors[["dvd.us"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 6339 7563000 15800000 27980000 30570000 540400000 3566
total.movie.budget: A budget of zero seems to be unrealistic:
# Mark budgets recorded as exactly zero as missing. `which()` drops rows whose
# budget is already NA, so they are simply left untouched.
is.na(movies.errors$total.movie.budget) <- which(
  movies.errors$total.movie.budget == 0
)
rating: The same rating appears under several different spellings, which can be merged:
# Tabulate the raw rating labels to see which spellings occur.
table(movies.errors$rating)
##
## 13 g G General GP NC-17 Not Rated
## 452 58 46 54 1 5 196
## PG PG13 PG-13 R X
## 699 462 457 1489 3
# Collapse the variant spellings onto one label per rating. `%in%` never
# yields NA, so rows whose rating is already missing are left alone.
movies.errors$rating[movies.errors$rating %in% c("13", "PG13")] <- "PG-13"
movies.errors$rating[movies.errors$rating %in% c("General", "g")] <- "G"
movies.errors$rating[movies.errors$rating %in% "GP"] <- "PG"
movies.errors$rating[movies.errors$rating %in% "X"] <- "NC-17"
# Treat unrated titles as missing.
movies.errors$rating[movies.errors$rating %in% "Not Rated"] <- NA
genre: Typing errors:
# Normalise misspelled or differently-cased genre labels to one canonical
# spelling each; labels not listed here are left unchanged.
movies.errors$genre[movies.errors$genre %in% "action"] <- "Action"
movies.errors$genre[movies.errors$genre %in% c("Comdy", "comedy")] <- "Comedy"
movies.errors$genre[movies.errors$genre %in% "drama"] <- "Drama"
movies.errors$genre[movies.errors$genre %in% "musical"] <- "Musical"
movies.errors$genre[movies.errors$genre %in% "REALITY"] <- "Reality"
movies.errors$genre[
  movies.errors$genre %in% "ROMANTIC COMEDY"
] <- "Romantic Comedy"
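time: Converting the values to numeric turns entries that are not valid numbers into NA. Values that are numeric but impossible, like -20, are not affected by the conversion and have to be set to NA explicitly.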
# Parse the runtime as a number. Going through `as.character()` first matters
# when `time` was read in as a factor: `as.numeric()` applied directly to a
# factor returns the internal level codes, not the values shown. Entries that
# are not valid numbers become NA (R emits a coercion warning for them).
movies.errors$time <- as.numeric(as.character(movies.errors$time))
# Conversion alone keeps impossible runtimes (zero or negative, e.g. -20), so
# set those to NA explicitly. `which()` skips rows that are already NA.
movies.errors$time[which(movies.errors$time <= 0)] <- NA
year.of.release: There are unrealistic values:
# Release years before 1900 or after 2016 are treated as data-entry errors
# and set to NA; years that are already NA stay NA.
is.na(movies.errors$year.of.release) <- which(
  movies.errors$year.of.release < 1900 |
    movies.errors$year.of.release > 2016
)
sequel: The same value is coded in several different ways:
# Map the textual yes/no spellings onto the 0/1 coding used by the other rows.
# NOTE(review): if `sequel` was read in as text, it stays non-numeric after
# these replacements; convert it afterwards if a numeric column is needed.
movies.errors$sequel[movies.errors$sequel %in% c("n", "no")] <- 0
movies.errors$sequel[movies.errors$sequel %in% c("y", "yes")] <- 1
# Bin the release year into decades: [1900,1910), [1910,1920), ...
movies.errors$age.decade <- cut(
  x = movies.errors$year.of.release, # The raw data
  breaks = seq(1900, 2020, 10), # The break points of the cuts
  right = FALSE, # Intervals are closed on the left, open on the right
  dig.lab = 4 # Print four-digit years in full instead of 1.9e+03
)
# Number of movies per decade.
table(movies.errors$age.decade)
##
## [1900,1910) [1910,1920) [1920,1930) [1930,1940) [1940,1950) [1950,1960)
##           0           0           1           3          21          36
## [1960,1970) [1970,1980) [1980,1990) [1990,2000) [2000,2010) [2010,2020)
##         123         249         639        1020        1725        1113
# Bin the runtime into 30-minute classes: [0,30), [30,60), ..., [210,240).
# Intervals are closed on the left. Values outside the breaks (negative, or
# 240 and above) get NA.
# NOTE(review): the recorded output below puts 1053 titles under 30 minutes,
# which is implausible for runtimes in minutes. Presumably `time` held factor
# level codes when that output was generated. Confirm the numeric conversion
# of `time` and re-run.
movies.errors[["time.30"]] <- cut(
  movies.errors[["time"]],
  breaks = seq(from = 0, to = 240, by = 30),
  right = FALSE
)
# Number of movies per runtime class.
table(movies.errors[["time.30"]])
##
## [0,30) [30,60) [60,90) [90,120) [120,150) [150,180) [180,210)
## 1053 503 93 33 39 47 122
## [210,240)
## 510
# Lookup table: the audience group assigned to each rating.
age.df <- data.frame(
  "rating" = c("PG-13", "PG", "G", "R", "NC-17"),
  "age" = c("child", "child", "child", "adult", "adult")
)
# Attach the `age` column by rating. `all.x = TRUE` keeps every movie: with
# the default inner join, all rows whose rating is NA (e.g. the former
# "Not Rated" titles) or otherwise absent from `age.df` would be silently
# dropped from the data set. Such rows now get `age = NA` instead.
# Note that `merge()` sorts the rows by `rating` and moves that column first.
movies.errors <- merge(
  movies.errors,
  age.df,
  by = "rating",
  all.x = TRUE
)