Dataset
# Download the tab-separated movies file and keep text columns as plain
# character vectors (no automatic factor conversion).
movies.errors <- read.table(
  file = "http://nathanieldphillips.com/wp-content/uploads/2016/01/movies_errors.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Recode the values of a vector.
#
# Arguments:
#   original.vector  vector whose values should be recoded.
#   old.values       values to look for in `original.vector`.
#   new.values       replacements; `new.values[i]` replaces `old.values[i]`.
#   others           optional single value given to every element that does
#                    not match any entry of `old.values`. When NULL (default),
#                    unmatched elements keep their original value.
#
# Returns a vector of the same length as `original.vector`.
#
# Matching is always done against `original.vector`. The previous version
# matched against the vector being built, so when `others` was supplied every
# element had already been overwritten with `others` and no old value could
# ever match.
recode.v <- function(original.vector,
                     old.values,
                     new.values,
                     others = NULL) {
  if (is.null(others)) {
    new.vector <- original.vector
  } else {
    new.vector <- rep(others, length(original.vector))
  }

  # seq_along() keeps the loop from running when `old.values` is empty
  # (1:length() would iterate over c(1, 0)).
  for (i in seq_along(old.values)) {
    # NA elements never count as a match.
    change.log <- original.vector == old.values[i] & !is.na(original.vector)
    new.vector[change.log] <- new.values[i]
  }

  new.vector
}
Question 1
# Show the raw column names before they are renamed.
names(movies.errors)
## [1] "movie7653.name" "total.boxoffice.earnings"
## [3] "dvd.earnings.in.us.639c" "total.movie.budget"
## [5] "rating.GPGPG13RNC17" "genreX8423"
## [7] "TIME" "year.of.release"
## [9] "sequel"
# Replace the messy column names with short ones in a single pass.
# Names that are not listed (e.g. "sequel") are left untouched.
names(movies.errors) <- recode.v(
  original.vector = names(movies.errors),
  old.values = c(
    "movie7653.name", "total.boxoffice.earnings", "dvd.earnings.in.us.639c",
    "total.movie.budget", "rating.GPGPG13RNC17", "genreX8423",
    "TIME", "year.of.release"
  ),
  new.values = c(
    "name", "boxoffice", "dvd.earnings",
    "budget", "rating", "genre",
    "length", "year"
  )
)
Question 2
# Confirm the column names after renaming.
names(movies.errors)
## [1] "name" "boxoffice" "dvd.earnings" "budget"
## [5] "rating" "genre" "length" "year"
## [9] "sequel"
# Per-column overview used to spot suspicious ranges and column types.
summary(movies.errors)
## name boxoffice dvd.earnings
## Length:5000 Min. :1.251e+07 Min. : 6339
## Class :character 1st Qu.:2.256e+07 1st Qu.: 7562806
## Mode :character Median :4.222e+07 Median : 15797917
## Mean :9.821e+07 Mean : 27981856
## 3rd Qu.:1.023e+08 3rd Qu.: 30566375
## Max. :2.784e+09 Max. :540396685
## NA's :3566
## budget rating genre
## Min. :0.000e+00 Length:5000 Length:5000
## 1st Qu.:0.000e+00 Class :character Class :character
## Median :1.200e+07 Mode :character Mode :character
## Mean :1.882e+21
## 3rd Qu.:3.925e+07
## Max. :9.770e+23
##
## length year sequel
## Length:5000 Min. : 17 Length:5000
## Class :character 1st Qu.:1991 Class :character
## Mode :character Median :2002 Mode :character
## Mean :2015
## 3rd Qu.:2009
## Max. :3997
##
# year
# Frequency of each release year, to reveal implausible values.
table(movies.errors$year)
##
## 17 23 51 63 67 69 79 1925 1937 1939 1940 1941 1942 1943 1944
## 1 1 1 1 1 1 1 1 1 2 2 1 2 2 2
## 1945 1946 1947 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
## 3 6 2 1 1 2 2 5 6 2 4 3 3 8 5
## 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975
## 10 8 14 5 14 16 16 17 18 17 14 17 15 23 24
## 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990
## 24 34 36 45 56 60 56 53 66 65 63 76 75 69 88
## 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005
## 86 84 96 100 110 119 107 117 113 129 134 156 141 154 165
## 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 3025 3036 3059 3109
## 200 210 216 220 248 246 180 170 155 113 1 1 1 1 1
## 3127 3136 3151 3163 3178 3186 3210 3240 3258 3289 3290 3296 3298 3334 3344
## 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## 3349 3354 3359 3403 3430 3437 3442 3467 3479 3499 3524 3528 3536 3540 3545
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3551 3571 3586 3595 3619 3630 3642 3661 3669 3674 3700 3707 3725 3730 3776
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3795 3816 3826 3838 3842 3855 3862 3886 3899 3913 3977 3994 3997
## 1 1 1 1 1 1 1 1 1 1 1 1 1
# Blank out values outside the closed range [lb, ub].
#   x   vector to clean
#   lb  smallest value that is kept
#   ub  largest value that is kept
# Returns `x` with every element below `lb` or above `ub` replaced by NA.
recode <- function(x, lb, ub) {
  replace(x, x < lb | x > ub, NA)
}
# Keep release years from 1925 through 2015; anything else becomes NA.
movies.errors$year <- recode(x = movies.errors$year, lb = 1925, ub = 2015)
# Plot the cleaned years as a visual check.
hist(x = movies.errors$year)

# length
# Frequency of each running-time entry. The column is character, so the
# table is ordered as text rather than numerically.
table(movies.errors$length)
##
## -1 -10 -11 -12 -15 -19 -2 -20
## 1 1 2 3 2 1 1 1
## -21 -24 -25 -27 -28 -29 -3 -30
## 1 1 1 1 1 1 1 1
## -33 -35 -36 -37 -38 -39 -4 -40
## 2 2 1 1 1 1 1 1
## -41 -42 -43 -45 -46 -48 -49 -5
## 3 2 1 3 1 1 1 1
## -50 -51 -54 -56 -57 -58 -59 -6
## 1 3 5 1 1 2 1 3
## -65 -66 -68 -70 -71 -78 -8 -80
## 1 3 3 1 2 1 4 1
## -81 -82 -83 -85 -87 -88 -89 -9
## 1 1 1 1 4 1 1 1
## -90 -91 -93 -94 -95 -96 -98 -99
## 1 1 2 3 1 3 1 1
## 0 100 101 102 103 104 105 106
## 108 58 43 42 44 44 59 50
## 107 108 109 110 1108 111 1117 112
## 47 39 47 55 1 36 1 30
## 113 114 115 116 117 118 119 120
## 33 33 43 38 28 48 30 53
## 121 122 123 124 125 126 127 128
## 36 30 36 29 29 29 32 24
## 1284 129 130 131 132 133 134 135
## 1 28 33 20 19 18 15 18
## 136 137 1373 138 139 140 141 142
## 15 13 1 17 17 15 11 8
## 143 144 145 1454 146 147 148 149
## 12 9 5 1 12 4 2 6
## 150 151 152 153 154 155 156 157
## 7 3 8 3 5 7 1 3
## 158 159 1591 160 161 162 164 165
## 5 2 1 5 3 2 5 6
## 167 1689 169 1694 170 172 174 175
## 2 1 3 1 2 1 1 2
## 1765 177 179 181 182 187 189 191
## 1 1 2 2 1 1 3 1
## 1920 194 195 1957 197 200 201 2019
## 1 1 1 1 1 1 2 1
## 2129 220 2212 2258 240 2705 2736 2755
## 1 1 1 1 1 1 1 1
## 2791 2874 3053 3145 3340 3420 3457 3578
## 1 1 1 1 1 1 1 1
## 3751 3838 39 40 4001 4048 4098 4371
## 1 1 1 5 1 1 1 1
## 44 4443 4662 47 4749 4826 4963 4983
## 1 1 1 1 1 1 1 1
## 506 515 52 62 643 65 660 69
## 1 2 1 1 1 1 1 2
## 70 71 720 74 75 76 77 78
## 1 1 1 1 4 1 2 3
## 783 784 79 80 81 82 824 825
## 1 1 2 4 14 15 1 1
## 83 84 845 85 86 87 88 89
## 11 8 1 17 19 22 27 28
## 90 91 92 93 94 95 959 96
## 31 46 37 33 28 52 1 42
## 97 98 99 not sure
## 55 52 33 3
# Set every element of `x` that lies below `lb` or above `ub` to NA and
# return the modified vector; elements inside [lb, ub] are unchanged.
recode <- function(x, lb, ub) {
  out.of.range <- which(x < lb | x > ub)
  x[out.of.range] <- NA
  return(x)
}
# The length column is read as character because of the "not sure" entries.
# Convert it to numeric BEFORE applying the bounds: comparing a character
# vector with a number is done as text, so e.g. "40" > "300" is TRUE (valid
# length wrongly removed) while "1108" > "300" is FALSE (outlier kept).
movies.errors$length[movies.errors$length == "not sure"] <- NA
movies.errors$length <- as.numeric(movies.errors$length)
# Running times outside 0-300 minutes are treated as data errors.
movies.errors$length <- recode(movies.errors$length, lb = 0, ub = 300)
# genre
# Frequency of each genre label, to find misspellings and case variants.
table(movies.errors$genre)
##
## action Action Adventure
## 1 691 485
## Black Comedy Comdy comedy
## 33 1 2
## Comedy Concert/Performance Documentary
## 1208 14 63
## drama Drama Horror
## 4 1083 299
## Multiple Genres musical Musical
## 2 2 77
## Reality REALITY Romantic Comedy
## 2 2 248
## ROMANTIC COMEDY Thriller/Suspense Western
## 3 427 38
# Replace labels in a vector.
#
# Arguments:
#   x    vector to recode.
#   old  labels to replace.
#   new  replacement labels; `new[i]` replaces `old[i]`. Must have the same
#        length as `old`.
#
# Returns `x` with every occurrence of `old[i]` replaced by `new[i]`.
# Replacements are applied in order, one pair at a time.
#
# Works for any number of pairs; the previous version handled exactly seven
# and silently ignored any further ones.
recode.factor <- function(x, old, new) {
  stopifnot(length(old) == length(new))
  for (i in seq_along(old)) {
    # Missing values in `x` are never treated as a match.
    x[!is.na(x) & x == old[i]] <- new[i]
  }
  x
}
# Collapse misspelled / differently-cased genre labels onto the canonical
# spelling. `old` and `new` are paired by position; the last two replacements
# were previously swapped, turning "ROMANTIC COMEDY" into "Reality" and
# "REALITY" into "Romantic Comedy".
movies.errors$genre <- recode.factor(
  movies.errors$genre,
  old = c("action", "Comdy", "comedy", "drama", "musical",
          "ROMANTIC COMEDY", "REALITY"),
  new = c("Action", "Comedy", "Comedy", "Drama", "Musical",
          "Romantic Comedy", "Reality")
)
# rating
# Frequency of each rating label, to find inconsistent spellings.
table(movies.errors$rating)
##
## 13 g G General GP NC-17 Not Rated
## 452 58 46 54 1 5 196
## PG PG-13 PG13 R X
## 699 457 462 1489 3
# Replace labels in a vector (same contract as a positional lookup:
# `old[k]` is replaced by `new[k]`).
#
# Arguments:
#   x    vector to recode.
#   old  labels to replace.
#   new  replacement labels, same length as `old`.
#
# Returns the recoded vector. Pairs are applied one after another in the
# order given.
#
# Accepts any number of pairs; the previous version was hard-coded to six
# and silently skipped the rest.
recode.factor2 <- function(x, old, new) {
  stopifnot(length(old) == length(new))
  for (k in seq_len(length(old))) {
    hit <- !is.na(x) & x == old[k]  # NA entries are left alone
    x[hit] <- new[k]
  }
  x
}
# Map the inconsistent rating labels onto one spelling per rating
# (`old` and `new` are paired by position).
# NOTE(review): "X" is folded into "Not Rated" here, although the later age
# recode lists "X" among the adult ratings - confirm this is intended.
movies.errors$rating <- recode.factor2(
  x = movies.errors$rating,
  old = c("13", "g", "General", "GP", "PG13", "X"),
  new = c("PG-13", "G", "G", "PG", "PG-13", "Not Rated")
)
# budget
# Frequency of each budget value, to reveal implausible amounts.
table(movies.errors$budget)
##
## 0 25000 60000 65000 114000 140000 150000
## 1899 1 1 1 1 1 2
## 2e+05 245000 250000 325000 375000 4e+05 450000
## 2 1 2 1 1 1 1
## 5e+05 550000 6e+05 658000 777000 8e+05 9e+05
## 4 1 2 1 1 1 1
## 1e+06 1100000 1125000 1150000 1200000 1250000 1350000
## 15 1 1 1 4 1 1
## 1488000 1500000 1600000 1650000 1700000 1750000 1800000
## 1 6 1 1 1 1 4
## 1987650 2e+06 2100000 2200000 2250000 2280000 2500000
## 1 18 2 2 1 1 8
## 2600000 2700000 2777000 2800000 2883848 2900000 3e+06
## 2 3 1 4 1 2 28
## 3100000 3200000 3250000 3300000 3400000 3450000 3500000
## 1 3 2 2 3 1 14
## 3600000 3700000 3800000 3900000 4e+06 4200000 4300000
## 1 1 1 2 23 1 1
## 4400000 4500000 4600000 4800000 4833610 4900000 5e+06
## 2 7 1 1 1 1 55
## 5250000 5300000 5500000 5600000 5700000 5800000 5900000
## 1 1 9 2 2 1 1
## 6e+06 6200000 6250000 6400000 6500000 6537890 6800000
## 31 1 1 2 9 1 1
## 6900000 7e+06 7200000 7250000 7300000 7303082 7500000
## 2 36 2 1 1 1 7
## 7800000 8e+06 8200000 8250000 8300000 8470000 8500000
## 1 37 2 1 2 1 12
## 8600000 8800000 8900000 9e+06 9100000 9200000 9300000
## 1 1 1 21 2 1 1
## 9400000 9500000 9700000 9800000 1e+07 10100000 10350000
## 3 3 1 1 83 1 1
## 10500000 10600000 10700000 10750000 10800000 1.1e+07 11400000
## 3 1 2 1 1 31 1
## 11500000 1.2e+07 12300000 12500000 12800000 1.3e+07 13200000
## 1 65 1 10 1 36 1
## 13300000 13500000 13700000 13900000 1.4e+07 14400000 14500000
## 1 5 1 1 40 1 3
## 14600000 1.5e+07 15250000 15500000 15600000 15700000 1.6e+07
## 1 101 1 1 1 1 36
## 16400000 16500000 1.7e+07 17500000 17700000 1.8e+07 18500000
## 1 8 32 7 1 58 5
## 18900000 18975000 1.9e+07 19400000 19700000 2e+07 20500000
## 1 1 20 1 1 141 2
## 2.1e+07 21500000 2.2e+07 22500000 22700000 2.3e+07 23600000
## 14 4 37 2 1 20 1
## 2.4e+07 2.5e+07 25100000 25500000 25530000 2.6e+07 2.7e+07
## 23 114 1 1 1 31 18
## 27500000 2.8e+07 28500000 2.9e+07 3e+07 30250000 3.1e+07
## 9 37 1 11 133 1 9
## 31500000 3.2e+07 32500000 3.3e+07 33500000 3.4e+07 34800000
## 1 24 5 10 1 4 1
## 3.5e+07 35200000 3.6e+07 36500000 3.7e+07 37500000 3.8e+07
## 99 1 11 1 11 2 22
## 38600000 3.9e+07 4e+07 4.1e+07 4.2e+07 42500000 4.3e+07
## 1 6 131 3 21 1 5
## 4.4e+07 44500000 4.5e+07 4.6e+07 4.7e+07 47500000 4.8e+07
## 5 1 63 6 6 1 16
## 4.9e+07 5e+07 50200000 5.1e+07 5.2e+07 52500000 5.3e+07
## 4 98 1 3 10 1 9
## 53012938 5.4e+07 5.5e+07 5.6e+07 5.7e+07 57500000 5.8e+07
## 1 6 54 3 5 3 11
## 6e+07 6.1e+07 6.2e+07 6.3e+07 63700000 6.4e+07 6.5e+07
## 93 4 5 4 1 2 45
## 6.6e+07 6.7e+07 67500000 6.8e+07 6.9e+07 7e+07 70702619
## 3 2 3 7 2 48 1
## 71500000 71682975 7.2e+07 72500000 7.3e+07 73243106 7.4e+07
## 1 1 6 4 3 1 2
## 7.5e+07 7.6e+07 77500000 77600000 7.8e+07 7.9e+07 79300000
## 38 4 1 1 6 2 1
## 8e+07 8.1e+07 8.2e+07 82500000 8.4e+07 8.5e+07 8.6e+07
## 66 1 3 4 3 35 1
## 8.7e+07 87500000 8.8e+07 9e+07 9.1e+07 9.2e+07 92500000
## 1 1 2 35 1 4 1
## 9.3e+07 9.4e+07 9.5e+07 9.7e+07 9.8e+07 1e+08 1.02e+08
## 2 4 11 1 1 38 3
## 102500000 1.03e+08 103300000 1.05e+08 1.08e+08 1.09e+08 1.1e+08
## 1 2 1 4 1 3 22
## 1.12e+08 1.15e+08 1.17e+08 1.2e+08 1.23e+08 1.25e+08 1.27e+08
## 1 7 1 15 1 18 1
## 127500000 1.3e+08 1.32e+08 1.35e+08 136200000 1.37e+08 137500000
## 2 16 2 7 1 2 1
## 1.38e+08 1.39e+08 1.4e+08 1.42e+08 1.45e+08 1.49e+08 1.5e+08
## 2 1 9 1 7 1 33
## 1.51e+08 151500000 1.55e+08 1.6e+08 1.63e+08 1.65e+08 1.7e+08
## 1 1 3 8 1 5 9
## 1.75e+08 1.78e+08 1.79e+08 1.8e+08 1.85e+08 1.86e+08 1.9e+08
## 8 1 1 5 3 1 6
## 1.95e+08 2e+08 2.05e+08 2.07e+08 2.09e+08 2.1e+08 2.15e+08
## 2 16 1 1 1 4 2
## 2.2e+08 2.25e+08 2.3e+08 2.32e+08 2.5e+08 2.58e+08 2.6e+08
## 1 4 1 1 6 1 1
## 2.75e+08 3e+08 4.25e+08 9.02e+23 9.05e+23 9.15e+23 9.35e+23
## 3 1 1 1 1 1 1
## 9.4e+23 9.49e+23 9.58e+23 9.63e+23 9.65e+23 9.77e+23
## 1 1 1 1 1 1
# Replace out-of-range values with NA.
#
# Arguments:
#   x   vector to clean.
#   lb  lower bound; elements below it become NA.
#   ub  optional upper bound; when supplied, elements above it become NA too.
#
# Returns `x` with the out-of-range elements set to NA.
#
# This definition replaces the earlier `recode(x, lb, ub)`. `ub` is kept as
# an optional argument so that calls passing an upper bound keep working;
# calls that pass only `lb` behave exactly as before.
recode <- function(x, lb, ub = NULL) {
  outliers <- x < lb
  if (!is.null(ub)) {
    outliers <- outliers | x > ub
  }
  x[outliers] <- NA
  x
}
# Negative budgets are impossible.
movies.errors$budget <- recode(movies.errors$budget, lb = 0)
# The lower bound alone removes nothing here: the table shows no negative
# budgets, but ten entries around 9e+23 while the largest other value is
# 4.25e+08. Blank out everything above one billion as well.
movies.errors$budget[which(movies.errors$budget > 1e9)] <- NA
Question 3
# Bin release years into decades.
# right = FALSE makes each bin [start, start + 10), so a year such as 1940
# is counted in the 1940s rather than at the end of the 1930s bin.
# dig.lab = 4 prints four-digit years in the labels instead of the default
# scientific notation, e.g. "[1920,1930)" rather than "(1.92e+03,1.93e+03]".
movies.errors$decade <- cut(
  x = movies.errors$year,
  breaks = seq(1920, 2020, 10),
  right = FALSE,
  dig.lab = 4
)
table(movies.errors$decade)
##
## (1.92e+03,1.93e+03] (1.93e+03,1.94e+03] (1.94e+03,1.95e+03]
## 1 5 20
## (1.95e+03,1.96e+03] (1.96e+03,1.97e+03] (1.97e+03,1.98e+03]
## 40 135 288
## (1.98e+03,1.99e+03] (1.99e+03,2e+03] (2e+03,2.01e+03]
## 671 1061 1844
## (2.01e+03,2.02e+03]
## 864
Question 4
# Group running times into 30-minute bins covering 0 to 300 minutes
# (default right-closed intervals: (0,30], (30,60], ...).
movies.errors$time.30 <- cut(movies.errors$length, breaks = 30 * 0:10)
# Count the films in each bin.
table(movies.errors$time.30)
##
## (0,30] (30,60] (60,90] (90,120] (120,150] (150,180] (180,210]
## 0 0 0 900 549 72 14
## (210,240] (240,270] (270,300]
## 2 0 0
Question 5
# Derive an audience column from the rating: G / PG / PG-13 are "child",
# N / NC-17 / X / R are "adult"; any other rating is carried over unchanged.
movies.errors$age <- recode.v(
  original.vector = movies.errors$rating,
  old.values = c("G", "PG", "PG-13", "N", "NC-17", "X", "R"),
  new.values = rep(c("child", "adult"), times = c(3, 4))
)
table(movies.errors$age)
##
## adult child Not Rated
## 1494 2229 199
Question 6
# Download the tab-separated per-year lookup table, keeping text columns as
# character vectors, then list its columns.
year.index <- read.table(
  file = "http://nathanieldphillips.com/wp-content/uploads/2016/01/year_index.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
names(year.index)
## [1] "year" "economy"
## [3] "international.conflict"
# Attach the per-year columns to each movie by joining on "year"
# (merge() defaults: only rows whose year appears in both tables are kept).
movies.errors2 <- merge(movies.errors, year.index, by = "year")
# Check which columns the merged data frame contains.
names(movies.errors2)
## [1] "year" "name"
## [3] "boxoffice" "dvd.earnings"
## [5] "budget" "rating"
## [7] "genre" "length"
## [9] "sequel" "decade"
## [11] "time.30" "age"
## [13] "economy" "international.conflict"
# Median box-office earnings for each level of `economy`.
aggregate(boxoffice ~ economy, data = movies.errors2, FUN = median)
## economy boxoffice
## 1 good 43092117
## 2 ok 42598498
## 3 poor 41276266