# Dataset ----

# Read the raw movies table from the URL below. The file is tab-separated
# with a header row; text columns are kept as character vectors (not factors).
movies.errors <- read.table(
  file = "http://nathanieldphillips.com/wp-content/uploads/2016/01/movies_errors.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Recode the values of a vector.
#
# Arguments:
#   original.vector  vector whose values are to be recoded
#   old.values       values to look for in original.vector
#   new.values       replacements, paired by position with old.values
#   others           optional value given to every element that matches none
#                    of old.values (NA elements included); when NULL,
#                    unmatched elements keep their original value
#
# Returns the recoded vector. Stops if old.values and new.values differ in
# length, because the pairing would be ambiguous.
recode.v <- function(original.vector,
                     old.values,
                     new.values,
                     others = NULL) {

  stopifnot(length(old.values) == length(new.values))

  if (is.null(others)) {
    new.vector <- original.vector
  } else {
    new.vector <- rep(others, length(original.vector))
  }

  for (i in seq_along(old.values)) {
    # Match against the untouched input, not the partially recoded result:
    # otherwise an earlier replacement can be recoded again by a later pair,
    # and with `others` set nothing would ever match.
    # which() drops NA comparisons, so NA elements are never selected.
    change.idx <- which(original.vector == old.values[i])
    new.vector[change.idx] <- new.values[i]
  }

  new.vector
}

# Question 1 ----

# Show the column names exactly as they were read from the file.
names(movies.errors)
## [1] "movie7653.name"           "total.boxoffice.earnings"
## [3] "dvd.earnings.in.us.639c"  "total.movie.budget"      
## [5] "rating.GPGPG13RNC17"      "genreX8423"              
## [7] "TIME"                     "year.of.release"         
## [9] "sequel"
# Swap the raw column names for short ones via a lookup table
# (raw name = short name). Columns not listed, such as "sequel", keep their name.
names(movies.errors) <- local({
  lookup <- c(
    movie7653.name           = "name",
    total.boxoffice.earnings = "boxoffice",
    dvd.earnings.in.us.639c  = "dvd.earnings",
    total.movie.budget       = "budget",
    rating.GPGPG13RNC17      = "rating",
    genreX8423               = "genre",
    TIME                     = "length",
    year.of.release          = "year"
  )
  current <- names(movies.errors)
  hit <- current %in% names(lookup)
  current[hit] <- unname(lookup[current[hit]])
  current
})

# Question 2 ----

# Print the column names after renaming.
names(movies.errors)
## [1] "name"         "boxoffice"    "dvd.earnings" "budget"      
## [5] "rating"       "genre"        "length"       "year"        
## [9] "sequel"
# Print a per-column summary. In the output below, length is reported as
# character rather than numeric, and budget and year have implausible maxima.
summary(movies.errors)
##      name             boxoffice          dvd.earnings      
##  Length:5000        Min.   :1.251e+07   Min.   :     6339  
##  Class :character   1st Qu.:2.256e+07   1st Qu.:  7562806  
##  Mode  :character   Median :4.222e+07   Median : 15797917  
##                     Mean   :9.821e+07   Mean   : 27981856  
##                     3rd Qu.:1.023e+08   3rd Qu.: 30566375  
##                     Max.   :2.784e+09   Max.   :540396685  
##                                         NA's   :3566       
##      budget             rating             genre          
##  Min.   :0.000e+00   Length:5000        Length:5000       
##  1st Qu.:0.000e+00   Class :character   Class :character  
##  Median :1.200e+07   Mode  :character   Mode  :character  
##  Mean   :1.882e+21                                        
##  3rd Qu.:3.925e+07                                        
##  Max.   :9.770e+23                                        
##                                                           
##     length               year         sequel         
##  Length:5000        Min.   :  17   Length:5000       
##  Class :character   1st Qu.:1991   Class :character  
##  Mode  :character   Median :2002   Mode  :character  
##                     Mean   :2015                     
##                     3rd Qu.:2009                     
##                     Max.   :3997                     
## 
#year
# Count the films per release year; the counts below include values such as
# 17 and 3997 alongside the 1925-2016 range.
table(movies.errors$year) 
## 
##   17   23   51   63   67   69   79 1925 1937 1939 1940 1941 1942 1943 1944 
##    1    1    1    1    1    1    1    1    1    2    2    1    2    2    2 
## 1945 1946 1947 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 
##    3    6    2    1    1    2    2    5    6    2    4    3    3    8    5 
## 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 
##   10    8   14    5   14   16   16   17   18   17   14   17   15   23   24 
## 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 
##   24   34   36   45   56   60   56   53   66   65   63   76   75   69   88 
## 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 
##   86   84   96  100  110  119  107  117  113  129  134  156  141  154  165 
## 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 3025 3036 3059 3109 
##  200  210  216  220  248  246  180  170  155  113    1    1    1    1    1 
## 3127 3136 3151 3163 3178 3186 3210 3240 3258 3289 3290 3296 3298 3334 3344 
##    1    1    1    1    1    1    2    1    1    1    1    1    1    1    1 
## 3349 3354 3359 3403 3430 3437 3442 3467 3479 3499 3524 3528 3536 3540 3545 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 3551 3571 3586 3595 3619 3630 3642 3661 3669 3674 3700 3707 3725 3730 3776 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 3795 3816 3826 3838 3842 3855 3862 3886 3899 3913 3977 3994 3997 
##    1    1    1    1    1    1    1    1    1    1    1    1    1
# Return x with every element outside the closed range [lb, ub] replaced
# by NA; elements inside the range are left unchanged.
recode <- function(x, lb, ub) {
  in.range <- x >= lb & x <= ub
  replace(x, !in.range, NA)
}

# Set release years outside 1925-2015 to NA.
movies.errors[["year"]] <- recode(movies.errors[["year"]], 1925, 2015)

# Histogram of the cleaned release years.
hist(movies.errors$year)

#length
# Count the distinct raw length values. The column is character (see the
# summary output above), so the table is ordered alphabetically and contains
# the non-numeric entry "not sure".
table(movies.errors$length)
## 
##       -1      -10      -11      -12      -15      -19       -2      -20 
##        1        1        2        3        2        1        1        1 
##      -21      -24      -25      -27      -28      -29       -3      -30 
##        1        1        1        1        1        1        1        1 
##      -33      -35      -36      -37      -38      -39       -4      -40 
##        2        2        1        1        1        1        1        1 
##      -41      -42      -43      -45      -46      -48      -49       -5 
##        3        2        1        3        1        1        1        1 
##      -50      -51      -54      -56      -57      -58      -59       -6 
##        1        3        5        1        1        2        1        3 
##      -65      -66      -68      -70      -71      -78       -8      -80 
##        1        3        3        1        2        1        4        1 
##      -81      -82      -83      -85      -87      -88      -89       -9 
##        1        1        1        1        4        1        1        1 
##      -90      -91      -93      -94      -95      -96      -98      -99 
##        1        1        2        3        1        3        1        1 
##        0      100      101      102      103      104      105      106 
##      108       58       43       42       44       44       59       50 
##      107      108      109      110     1108      111     1117      112 
##       47       39       47       55        1       36        1       30 
##      113      114      115      116      117      118      119      120 
##       33       33       43       38       28       48       30       53 
##      121      122      123      124      125      126      127      128 
##       36       30       36       29       29       29       32       24 
##     1284      129      130      131      132      133      134      135 
##        1       28       33       20       19       18       15       18 
##      136      137     1373      138      139      140      141      142 
##       15       13        1       17       17       15       11        8 
##      143      144      145     1454      146      147      148      149 
##       12        9        5        1       12        4        2        6 
##      150      151      152      153      154      155      156      157 
##        7        3        8        3        5        7        1        3 
##      158      159     1591      160      161      162      164      165 
##        5        2        1        5        3        2        5        6 
##      167     1689      169     1694      170      172      174      175 
##        2        1        3        1        2        1        1        2 
##     1765      177      179      181      182      187      189      191 
##        1        1        2        2        1        1        3        1 
##     1920      194      195     1957      197      200      201     2019 
##        1        1        1        1        1        1        2        1 
##     2129      220     2212     2258      240     2705     2736     2755 
##        1        1        1        1        1        1        1        1 
##     2791     2874     3053     3145     3340     3420     3457     3578 
##        1        1        1        1        1        1        1        1 
##     3751     3838       39       40     4001     4048     4098     4371 
##        1        1        1        5        1        1        1        1 
##       44     4443     4662       47     4749     4826     4963     4983 
##        1        1        1        1        1        1        1        1 
##      506      515       52       62      643       65      660       69 
##        1        2        1        1        1        1        1        2 
##       70       71      720       74       75       76       77       78 
##        1        1        1        1        4        1        2        3 
##      783      784       79       80       81       82      824      825 
##        1        1        2        4       14       15        1        1 
##       83       84      845       85       86       87       88       89 
##       11        8        1       17       19       22       27       28 
##       90       91       92       93       94       95      959       96 
##       31       46       37       33       28       52        1       42 
##       97       98       99 not sure 
##       55       52       33        3
# Replace every element of x outside the closed range [lb, ub] with NA.
# x should be numeric: on character input R compares strings alphabetically
# (for example "90" > "300"), which silently discards valid values.
recode <- function(x, lb, ub) {
  outliers <- x < lb | x > ub
  x[outliers] <- NA
  x
}

# length is read in as character because of entries such as "not sure".
# Convert to numeric BEFORE the range check so the comparison is numeric.
# Non-numeric entries become NA; that coercion warning is expected, so it is
# suppressed for this one call only.
movies.errors$length <- suppressWarnings(as.numeric(movies.errors$length))

# Set negative lengths and lengths above 300 to NA.
# NOTE(review): a length of exactly 0 passes the lb = 0 check; confirm whether
# zeros should also be treated as missing.
movies.errors$length <- recode(movies.errors$length, lb = 0, ub = 300)

#genre
# Count the films per genre label; the counts below show the same genre under
# several spellings and capitalisations (e.g. "action"/"Action", "Comdy").
table(movies.errors$genre)
## 
##              action              Action           Adventure 
##                   1                 691                 485 
##        Black Comedy               Comdy              comedy 
##                  33                   1                   2 
##              Comedy Concert/Performance         Documentary 
##                1208                  14                  63 
##               drama               Drama              Horror 
##                   4                1083                 299 
##     Multiple Genres             musical             Musical 
##                   2                   2                  77 
##             Reality             REALITY     Romantic Comedy 
##                   2                   2                 248 
##     ROMANTIC COMEDY   Thriller/Suspense             Western 
##                   3                 427                  38
# Replace values of x: each element equal to old[i] becomes new[i].
#
# Arguments:
#   x    vector to recode
#   old  values to look for
#   new  replacements, paired by position with old
#
# Pairs are applied in order to the running result, so a value produced by an
# earlier pair can be recoded again by a later one. Any number of pairs is
# accepted. NA elements of x are left untouched.
# Stops if old and new differ in length.
recode.factor <- function(x, old, new) {
  stopifnot(length(old) == length(new))

  for (i in seq_along(old)) {
    # which() drops NA comparisons, so missing values are never selected.
    x[which(x == old[i])] <- new[i]
  }

  x
}

# Collapse misspelled or differently capitalised genre labels onto one
# spelling. old and new are paired by position, so the order of the two
# vectors must agree ("ROMANTIC COMEDY" -> "Romantic Comedy",
# "REALITY" -> "Reality").
movies.errors$genre <- recode.factor(
  movies.errors$genre,
  old = c("action", "Comdy", "comedy", "drama", "musical",
          "ROMANTIC COMEDY", "REALITY"),
  new = c("Action", "Comedy", "Comedy", "Drama", "Musical",
          "Romantic Comedy", "Reality")
)

#rating

# Count the films per rating label; the counts below show several labels for
# the same rating (e.g. "13", "PG13" and "PG-13").
table(movies.errors$rating)
## 
##        13         g         G   General        GP     NC-17 Not Rated 
##       452        58        46        54         1         5       196 
##        PG     PG-13      PG13         R         X 
##       699       457       462      1489         3
# Replace values of x: each element equal to old[i] becomes new[i].
#
# Arguments:
#   x    vector to recode
#   old  values to look for
#   new  replacements, paired by position with old
#
# Pairs are applied in order to the running result. Any number of pairs is
# accepted. NA elements of x are left untouched.
# Stops if old and new differ in length.
recode.factor2 <- function(x, old, new) {
  stopifnot(length(old) == length(new))

  for (i in seq_along(old)) {
    # which() drops NA comparisons, so missing values are never selected.
    x[which(x == old[i])] <- new[i]
  }

  x
}

# Map the variant rating labels onto the standard ones; old and new are
# paired by position.
# NOTE(review): "X" is folded into "Not Rated" here; confirm that is intended.
movies.errors[["rating"]] <- recode.factor2(
  x = movies.errors[["rating"]],
  old = c("13", "g", "General", "GP", "PG13", "X"),
  new = c("PG-13", "G", "G", "PG", "PG-13", "Not Rated")
)

#budget

# Count the films per budget value; the counts below show 1899 zeros and a
# handful of values around 9e+23.
table(movies.errors$budget)
## 
##         0     25000     60000     65000    114000    140000    150000 
##      1899         1         1         1         1         1         2 
##     2e+05    245000    250000    325000    375000     4e+05    450000 
##         2         1         2         1         1         1         1 
##     5e+05    550000     6e+05    658000    777000     8e+05     9e+05 
##         4         1         2         1         1         1         1 
##     1e+06   1100000   1125000   1150000   1200000   1250000   1350000 
##        15         1         1         1         4         1         1 
##   1488000   1500000   1600000   1650000   1700000   1750000   1800000 
##         1         6         1         1         1         1         4 
##   1987650     2e+06   2100000   2200000   2250000   2280000   2500000 
##         1        18         2         2         1         1         8 
##   2600000   2700000   2777000   2800000   2883848   2900000     3e+06 
##         2         3         1         4         1         2        28 
##   3100000   3200000   3250000   3300000   3400000   3450000   3500000 
##         1         3         2         2         3         1        14 
##   3600000   3700000   3800000   3900000     4e+06   4200000   4300000 
##         1         1         1         2        23         1         1 
##   4400000   4500000   4600000   4800000   4833610   4900000     5e+06 
##         2         7         1         1         1         1        55 
##   5250000   5300000   5500000   5600000   5700000   5800000   5900000 
##         1         1         9         2         2         1         1 
##     6e+06   6200000   6250000   6400000   6500000   6537890   6800000 
##        31         1         1         2         9         1         1 
##   6900000     7e+06   7200000   7250000   7300000   7303082   7500000 
##         2        36         2         1         1         1         7 
##   7800000     8e+06   8200000   8250000   8300000   8470000   8500000 
##         1        37         2         1         2         1        12 
##   8600000   8800000   8900000     9e+06   9100000   9200000   9300000 
##         1         1         1        21         2         1         1 
##   9400000   9500000   9700000   9800000     1e+07  10100000  10350000 
##         3         3         1         1        83         1         1 
##  10500000  10600000  10700000  10750000  10800000   1.1e+07  11400000 
##         3         1         2         1         1        31         1 
##  11500000   1.2e+07  12300000  12500000  12800000   1.3e+07  13200000 
##         1        65         1        10         1        36         1 
##  13300000  13500000  13700000  13900000   1.4e+07  14400000  14500000 
##         1         5         1         1        40         1         3 
##  14600000   1.5e+07  15250000  15500000  15600000  15700000   1.6e+07 
##         1       101         1         1         1         1        36 
##  16400000  16500000   1.7e+07  17500000  17700000   1.8e+07  18500000 
##         1         8        32         7         1        58         5 
##  18900000  18975000   1.9e+07  19400000  19700000     2e+07  20500000 
##         1         1        20         1         1       141         2 
##   2.1e+07  21500000   2.2e+07  22500000  22700000   2.3e+07  23600000 
##        14         4        37         2         1        20         1 
##   2.4e+07   2.5e+07  25100000  25500000  25530000   2.6e+07   2.7e+07 
##        23       114         1         1         1        31        18 
##  27500000   2.8e+07  28500000   2.9e+07     3e+07  30250000   3.1e+07 
##         9        37         1        11       133         1         9 
##  31500000   3.2e+07  32500000   3.3e+07  33500000   3.4e+07  34800000 
##         1        24         5        10         1         4         1 
##   3.5e+07  35200000   3.6e+07  36500000   3.7e+07  37500000   3.8e+07 
##        99         1        11         1        11         2        22 
##  38600000   3.9e+07     4e+07   4.1e+07   4.2e+07  42500000   4.3e+07 
##         1         6       131         3        21         1         5 
##   4.4e+07  44500000   4.5e+07   4.6e+07   4.7e+07  47500000   4.8e+07 
##         5         1        63         6         6         1        16 
##   4.9e+07     5e+07  50200000   5.1e+07   5.2e+07  52500000   5.3e+07 
##         4        98         1         3        10         1         9 
##  53012938   5.4e+07   5.5e+07   5.6e+07   5.7e+07  57500000   5.8e+07 
##         1         6        54         3         5         3        11 
##     6e+07   6.1e+07   6.2e+07   6.3e+07  63700000   6.4e+07   6.5e+07 
##        93         4         5         4         1         2        45 
##   6.6e+07   6.7e+07  67500000   6.8e+07   6.9e+07     7e+07  70702619 
##         3         2         3         7         2        48         1 
##  71500000  71682975   7.2e+07  72500000   7.3e+07  73243106   7.4e+07 
##         1         1         6         4         3         1         2 
##   7.5e+07   7.6e+07  77500000  77600000   7.8e+07   7.9e+07  79300000 
##        38         4         1         1         6         2         1 
##     8e+07   8.1e+07   8.2e+07  82500000   8.4e+07   8.5e+07   8.6e+07 
##        66         1         3         4         3        35         1 
##   8.7e+07  87500000   8.8e+07     9e+07   9.1e+07   9.2e+07  92500000 
##         1         1         2        35         1         4         1 
##   9.3e+07   9.4e+07   9.5e+07   9.7e+07   9.8e+07     1e+08  1.02e+08 
##         2         4        11         1         1        38         3 
## 102500000  1.03e+08 103300000  1.05e+08  1.08e+08  1.09e+08   1.1e+08 
##         1         2         1         4         1         3        22 
##  1.12e+08  1.15e+08  1.17e+08   1.2e+08  1.23e+08  1.25e+08  1.27e+08 
##         1         7         1        15         1        18         1 
## 127500000   1.3e+08  1.32e+08  1.35e+08 136200000  1.37e+08 137500000 
##         2        16         2         7         1         2         1 
##  1.38e+08  1.39e+08   1.4e+08  1.42e+08  1.45e+08  1.49e+08   1.5e+08 
##         2         1         9         1         7         1        33 
##  1.51e+08 151500000  1.55e+08   1.6e+08  1.63e+08  1.65e+08   1.7e+08 
##         1         1         3         8         1         5         9 
##  1.75e+08  1.78e+08  1.79e+08   1.8e+08  1.85e+08  1.86e+08   1.9e+08 
##         8         1         1         5         3         1         6 
##  1.95e+08     2e+08  2.05e+08  2.07e+08  2.09e+08   2.1e+08  2.15e+08 
##         2        16         1         1         1         4         2 
##   2.2e+08  2.25e+08   2.3e+08  2.32e+08   2.5e+08  2.58e+08   2.6e+08 
##         1         4         1         1         6         1         1 
##  2.75e+08     3e+08  4.25e+08  9.02e+23  9.05e+23  9.15e+23  9.35e+23 
##         3         1         1         1         1         1         1 
##   9.4e+23  9.49e+23  9.58e+23  9.63e+23  9.65e+23  9.77e+23 
##         1         1         1         1         1         1
# Replace every element of x outside the closed range [lb, ub] with NA.
# ub defaults to Inf, so two-argument calls recode(x, lb) behave as before.
recode <- function(x, lb, ub = Inf) {
  outliers <- x < lb | x > ub
  x[outliers] <- NA
  x
}

# The budget table lists ten entries around 9e+23, far above the next-largest
# value (4.25e+08). A lower bound alone leaves them in place, where they
# dominate the mean. Any cap between the two groups removes them; 1e9 is used.
# NOTE(review): the 1899 zero budgets still pass lb = 0; confirm whether zero
# means "unknown" and should be set to NA as well.
movies.errors$budget <- recode(movies.errors$budget, lb = 0, ub = 1e9)

# Question 3 ----

# Bin release years into decades.
# right = FALSE makes each bin left-closed, [1920,1930), so a year ending in
# 0 is counted with the decade it starts rather than the one before it.
# dig.lab = 4 prints the four-digit break points in full instead of in
# scientific notation such as "1.92e+03".
movies.errors$decade <- cut(
  x = movies.errors$year,
  breaks = seq(1920, 2020, 10),
  right = FALSE,
  dig.lab = 4
)

# Number of films per decade.
table(movies.errors$decade)
## 
## (1.92e+03,1.93e+03] (1.93e+03,1.94e+03] (1.94e+03,1.95e+03] 
##                   1                   5                  20 
## (1.95e+03,1.96e+03] (1.96e+03,1.97e+03] (1.97e+03,1.98e+03] 
##                  40                 135                 288 
## (1.98e+03,1.99e+03]    (1.99e+03,2e+03]    (2e+03,2.01e+03] 
##                 671                1061                1844 
## (2.01e+03,2.02e+03] 
##                 864

# Question 4 ----

# Bin length into ten right-closed intervals of width 30 covering (0, 300].
movies.errors[["time.30"]] <- cut(
  movies.errors[["length"]],
  breaks = seq(from = 0, to = 300, by = 30)
)

# Number of films per length bin.
table(movies.errors[["time.30"]])
## 
##    (0,30]   (30,60]   (60,90]  (90,120] (120,150] (150,180] (180,210] 
##         0         0         0       900       549        72        14 
## (210,240] (240,270] (270,300] 
##         2         0         0

# Question 5 ----

# Derive an audience column from rating: the first three labels map to
# "child", the remaining four to "adult". Ratings not listed (for example
# "Not Rated") are carried over unchanged.
movies.errors$age <- recode.v(
  original.vector = movies.errors$rating,
  old.values = c("G", "PG", "PG-13", "N", "NC-17", "X", "R"),
  new.values = rep(c("child", "adult"), times = c(3, 4))
)

# Number of films per audience group.
table(movies.errors$age)
## 
##     adult     child Not Rated 
##      1494      2229       199

# Question 6 ----

# Read the per-year lookup table from the URL below. The file is tab-separated
# with a header row; text columns are kept as character vectors (not factors).
year.index <- read.table(
  file = "http://nathanieldphillips.com/wp-content/uploads/2016/01/year_index.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Print the columns available in the lookup table.
names(year.index)
## [1] "year"                   "economy"               
## [3] "international.conflict"
# Attach the per-year columns to each film by joining on "year". merge()
# keeps only rows whose year appears in both tables (its default).
movies.errors2 <- merge(movies.errors, year.index, by = "year")

# Print the columns of the merged table.
names(movies.errors2)
##  [1] "year"                   "name"                  
##  [3] "boxoffice"              "dvd.earnings"          
##  [5] "budget"                 "rating"                
##  [7] "genre"                  "length"                
##  [9] "sequel"                 "decade"                
## [11] "time.30"                "age"                   
## [13] "economy"                "international.conflict"
# Median boxoffice for each level of economy, computed on the merged table.
aggregate(boxoffice ~ economy, data = movies.errors2, FUN = median)
##   economy boxoffice
## 1    good  43092117
## 2      ok  42598498
## 3    poor  41276266