This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Question 0

Download the data and save it as a dataframe called movies.errors

# Read the tab-delimited movies file (first row holds the column names)
# straight from its URL into a data frame.
movies.errors <- read.delim(
  file = "http://nathanieldphillips.com/wp-content/uploads/2016/01/movies_errors.txt",
  header = TRUE
)

Question 1

The column names in the dataframe are not great. Some contain some random numbers/letters, and others are too long. Change the column names of the dataframe so they make sense. I recommend making each name a single word with no capital letters. But it’s up to you.

names(movies.errors)
## [1] "movie7653.name"           "total.boxoffice.earnings"
## [3] "dvd.earnings.in.us.639c"  "total.movie.budget"      
## [5] "rating.GPGPG13RNC17"      "genreX8423"              
## [7] "TIME"                     "year.of.release"         
## [9] "sequel"
# Rename the first seven columns in a single vectorised assignment; the
# replacement names are listed in column order.
names(movies.errors)[1:7] <- c(
  "name",
  "boxoffice.total",
  "dvd.revenue",
  "budget",
  "rating",
  "genre",
  "time"
)

# The other two names (year.of.release and sequel) already make sense, so I leave them unchanged.

Question 2

Check ALL the columns (except for the first “name” column) for errors! If you find any errors in a column, correct them!

Keep the following tips in mind:

To get a quick look at the values in a numeric column with many (e.g., over 100) possible values, use summary() or hist()

To get a quick look at the values in a string column (or a numeric column with only a few possible values), use table()

In numeric columns, check for values that don’t make any sense (that is, those that are too large or too small).

In character columns, check for misspelled values. If you find values that are misspelled, correct them.

If you want to convert a character column to numeric, make sure all the values look like numbers before using as.numeric(). For example, if a numeric column has a value of “one hundred”, you’ll need to convert this to 100.

# boxoffice.total: inspect the column with a numeric summary and a histogram.
summary(movies.errors$boxoffice.total)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.251e+07 2.256e+07 4.222e+07 9.821e+07 1.023e+08 2.784e+09
hist(movies.errors$boxoffice.total)

# The summarised range (about 1.25e7 to 2.78e9) was judged plausible, so no
# values in this column are corrected.

# dvd.revenue: numeric summary of the DVD revenue column (NA count included).
summary(movies.errors[["dvd.revenue"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##      6339   7563000  15800000  27980000  30570000 540400000      3566
# Replace implausible DVD revenue values with NA.
#
# Arguments:
#   x           numeric vector of DVD revenues.
#   outlier.def unused; kept so existing calls keep working.
#   lower       smallest value treated as plausible (default 500000).
#   upper       largest value treated as plausible (default 500000000).
#
# Returns a vector of the same length as `x` in which every value below
# `lower` or above `upper` is NA. Existing NAs stay NA. Keeping the length
# unchanged is what allows the result to be assigned back to a data frame
# column.
dvd.correction <- function(x, outlier.def = 2, lower = 500000,
                           upper = 500000000) {
  # Guard with !is.na() so the mask itself never contains NA.
  is.outlier <- !is.na(x) & (x > upper | x < lower)
  x[is.outlier] <- NA
  x
}


# budget: inspect the column with a numeric summary and a plot of the values.
summary(movies.errors$budget)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 1.200e+07 1.882e+21 3.925e+07 9.770e+23
# The summary above shows a minimum and first quartile of 0 and a maximum of
# about 9.8e23, so this column contains values that need correcting.
plot(movies.errors$budget)

# Replace implausible budget values with NA.
#
# Arguments:
#   x           numeric vector of movie budgets.
#   outlier.def unused; kept so existing calls keep working.
#   lower       smallest value treated as plausible (default 200000).
#   upper       largest value treated as plausible (default 20000000).
#
# Returns a vector of the same length as `x` in which every value below
# `lower` or above `upper` is NA. Existing NAs stay NA.
#
# NOTE(review): the default upper bound (2e7) is below the third quartile
# printed by summary() for this column (3.925e7), so more than a quarter of
# the budgets are discarded -- confirm that 20000000 is the intended cutoff.
budget.correction <- function(x, outlier.def = 2, lower = 200000,
                              upper = 20000000) {
  # Guard with !is.na() so the mask itself never contains NA.
  is.outlier <- !is.na(x) & (x > upper | x < lower)
  x[is.outlier] <- NA
  x
}

#rating

# Frequency of every distinct value in the rating column.
table(movies.errors[["rating"]])
## 
##        13         g         G   General        GP     NC-17 Not Rated 
##       452        58        46        54         1         5       196 
##        PG     PG-13      PG13         R         X 
##       699       457       462      1489         3
# Replace every rating that is not in the list of accepted ratings with NA.
#
# Arguments:
#   x  character vector or factor of movie ratings.
#
# Returns `x` with the same length; values outside `true.ratings` (and
# existing NAs) are NA. For factor input, levels left without any
# observations are dropped.
#
# NOTE(review): "13" is kept as an accepted value because the original list
# contained it; it may really be a mis-entered "PG-13" -- confirm.
rating.correction <- function(x) {
  true.ratings <- c("13", "G", "PG", "PG-13", "R", "NC-17")
  # Membership test: %in% never yields NA, so the mask is safe for assignment.
  x[!(x %in% true.ratings)] <- NA
  if (is.factor(x)) {
    x <- droplevels(x)
  }
  x
}


#genre

# Frequency of every distinct value in the genre column.
table(movies.errors[["genre"]])
## 
##              action              Action           Adventure 
##                   1                 691                 485 
##        Black Comedy               Comdy              comedy 
##                  33                   1                   2 
##              Comedy Concert/Performance         Documentary 
##                1208                  14                  63 
##               drama               Drama              Horror 
##                   4                1083                 299 
##     Multiple Genres             musical             Musical 
##                   2                   2                  77 
##             Reality             REALITY     Romantic Comedy 
##                   2                   2                 248 
##     ROMANTIC COMEDY   Thriller/Suspense             Western 
##                   3                 427                  38
# Replace every genre that is not in the list of accepted genres with NA.
#
# Arguments:
#   x  character vector or factor of movie genres.
#
# Returns `x` with the same length; values that do not match an accepted
# genre exactly (apart from leading/trailing whitespace) are NA, as are
# existing NAs. Matching is case-sensitive, so variants such as "action" or
# "REALITY" become NA. For factor input, levels left without any
# observations are dropped.
genre.correction <- function(x) {
  true.genres <- c(
    "Action", "Adventure", "Black Comedy", "Comedy", "Concert/Performance",
    "Documentary", "Drama", "Horror", "Multiple Genres", "Musical",
    "Reality", "Romantic Comedy", "Thriller/Suspense", "Western"
  )
  # Compare on trimmed text so stray padding in the data cannot defeat the
  # match; %in% never yields NA, so the mask is safe for assignment.
  is.valid <- trimws(as.character(x)) %in% true.genres
  x[!is.valid] <- NA
  if (is.factor(x)) {
    x <- droplevels(x)
  }
  x
}

# Overwrite the genre column with its corrected version.
movies.errors[["genre"]] <- genre.correction(movies.errors[["genre"]])

#time

# Numeric copy of the running time. Converting through as.character() first
# matters when the column is a factor: as.numeric() on a factor returns the
# internal level codes, not the numbers that are printed. Entries that do not
# look like numbers become NA (with a warning).
movies.errors$time.n <- as.numeric(as.character(movies.errors$time))
# Replace implausible running times with NA.
#
# Arguments:
#   x           numeric vector of running times.
#   outlier.def unused; kept so existing calls keep working.
#   lower       smallest value treated as plausible (default 30).
#   upper       largest value treated as plausible (default 300).
#
# Returns a vector of the same length as `x` in which every value below
# `lower` or above `upper` is NA. Existing NAs stay NA.
time.correction <- function(x, outlier.def = 2, lower = 30, upper = 300) {
  # Guard with !is.na() so the mask itself never contains NA.
  is.outlier <- !is.na(x) & (x > upper | x < lower)
  x[is.outlier] <- NA
  x
}

# Overwrite the time column with the range-checked numeric running times.
movies.errors[["time"]] <- time.correction(movies.errors[["time.n"]])

```

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.