# import data of movies
setwd('../studyspace/数据处理与可视化/')
raw_data <- read.csv("Movies.csv")
# Load packages
library('ggplot2') # visualization
library('ggthemes') # visualization
library('scales') # visualization
library('dplyr') # data manipulation
str(raw_data)
'data.frame':   103 obs. of  5 variables:
 $ Motion.Picture    : Factor w/ 101 levels "","A History of Violence",..: 16 41 6 92 63 22 29 53 99 54 ...
 $ Opening.Gross     : num  29.17 0.15 48.75 10.9 0.06 ...
 $ Total.Gross       : num  67.25 6.65 205.28 24.47 0.23 ...
 $ Number.of.Theaters: Factor w/ 88 levels "","1,011","1,040",..: 31 15 61 14 38 55 61 57 27 58 ...
 $ Weeks.in.Top.60   : int  16 22 18 8 4 14 13 16 7 21 ...
变量名 描述
Motion.Picture 电影名
Opening.Gross 首映(预售)票房
Total.Gross 总票房
Number.of.Theaters 放映剧院数量
Weeks.in.Top.60 在前60榜单的周数

此处增加两个变量:
1. 票房增长率(increase_rate) = 总票房/预售票房
2. 剧院平均票房(avg_gross) = 总票房/放映剧院数量

raw_data$increase_rate <- raw_data$Total.Gross/raw_data$Opening.Gross
raw_data$avg_gross <- raw_data$Total.Gross/raw_data$Number.of.Theaters

对于该数据集,
首先观察Motion.Picture变量中的电影名称可以发现该数据描述的是2005年在美国上映的电影票房等相关数据,由此可以计算出美国当年的电影总产值为

sum(na.omit(raw_data)$Total.Gross)
[1] 3303.93
raw_data$Number.of.Theaters <- as.numeric(gsub(",","", raw_data$Number.of.Theaters))
par(mfrow=c(2,2))
hist(raw_data$Weeks.in.Top.60, xlab = "weeks in top 60", ylab = "frequency")
hist(raw_data$Opening.Gross, xlab = "opening gross", ylab = "frequency")
hist(raw_data$Total.Gross, xlab = "total gross", ylab = "frequency")
hist(raw_data$Number.of.Theaters, xlab = "number of theaters", ylab = "frequency")

从每一列属性的直方图可以看出,
1. weeks in top 60基本呈现出右偏分布,大部分(70-80%)数值聚集在0-10之间,说明一般上映电影基本只能在榜单持续10天左右;
2. 同样的,70%的预售票房都在0-10之间;
3. 电影的上映剧院大部分都集中在0-500, 不过有20%左右分布在3000-3500之间。从上映数量来看,当时的美国电影产业处于良性发展状态。

library(corrplot)
cor(na.omit(raw_data[2:7]))
                   Opening.Gross Total.Gross Number.of.Theaters Weeks.in.Top.60 increase_rate avg_gross
Opening.Gross          1.0000000   0.9642527          0.7143165       0.4528279    -0.1953030 0.7048103
Total.Gross            0.9642527   1.0000000          0.7098528       0.5253941    -0.1157201 0.7669799
Number.of.Theaters     0.7143165   0.7098528          1.0000000       0.5261138    -0.1663245 0.3909717
Weeks.in.Top.60        0.4528279   0.5253941          0.5261138       1.0000000     0.3853256 0.7039244
increase_rate         -0.1953030  -0.1157201         -0.1663245       0.3853256     1.0000000 0.1727042
avg_gross              0.7048103   0.7669799          0.3909717       0.7039244     0.1727042 1.0000000
corrplot(corr=cor(na.omit(raw_data[2:7])),method = "color",order = "AOE",addCoef.col = "grey")

由上图可知,有几个变量相关性非常高:例如Total.Gross && Opening.Gross, avg_gross && Weeks.in.Top.60等。

require(GGally)
ggpairs(na.omit(raw_data)[,c(2:7)])

以上几张变量间散点图说明以下几点:
1. 预售总票房与最终总票房呈现正相关,预售票房越高,可以说最终的总票房一定不会差。
2. 观察到当因变量是Number.of.Theaters,可以发现Number.of.Theaters是有阈值的,可以用这个变量的最大值来估计当时2005年美国的总活跃电影院的数量。这个数量为:

max(na.omit(raw_data)$Number.of.Theaters)
[1] 3910
  1. 平均票房(avg_gross)与其它变量均有良好的线性相关性,且都为正相关,它可以很好地用来解释其它变量。
LS0tCnRpdGxlOiAiZXhwbG9yZSBtb3ZpZXMgZGF0YSIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CiMgaW1wb3J0IGRhdGEgb2YgbW92aWVzCnNldHdkKCcuLi9zdHVkeXNwYWNlL+aVsOaNruWkhOeQhuS4juWPr+inhuWMli8nKQpyYXdfZGF0YSA8LSByZWFkLmNzdigiTW92aWVzLmNzdiIpCmBgYApgYGB7cn0KIyBMb2FkIHBhY2thZ2VzCmxpYnJhcnkoJ2dncGxvdDInKSAjIHZpc3VhbGl6YXRpb24KbGlicmFyeSgnZ2d0aGVtZXMnKSAjIHZpc3VhbGl6YXRpb24KbGlicmFyeSgnc2NhbGVzJykgIyB2aXN1YWxpemF0aW9uCmxpYnJhcnkoJ2RwbHlyJykgIyBkYXRhIG1hbmlwdWxhdGlvbgpgYGAKYGBge3J9CnN0cihyYXdfZGF0YSkKYGBgCuWPmOmHj+WQjXzmj4/ov7AKLS18Oi0tOnwtLToKTW90aW9uLlBpY3R1cmV855S15b2x5ZCNCk9wZW5pbmcuR3Jvc3N86aaW5pig77yI6aKE5ZSu77yJ56Wo5oi/ClRvdGFsLkdyb3NzfOaAu+elqOaIvwpOdW1iZXIub2YuVGhlYXRlcnN85pS+5pig5Ymn6Zmi5pWw6YePCldlZWtzLmluLlRvcC42MHzlnKjliY02MOamnOWNleeahOWRqOaVsAotLS0KICAK5q2k5aSE5aKe5Yqg5Lik5Liq5Y+Y6YeP77yaICAKMS4g56Wo5oi/5aKe6ZW/546HKGluY3JlYXNlX3JhdGUpID0g5oC756Wo5oi/L+mihOWUruelqOaIvyAgCjIuIOWJp+mZouW5s+Wdh+elqOaIvyhhdmdfZ3Jvc3MpID0g5oC756Wo5oi/L+aUvuaYoOWJp+mZouaVsOmHjwpgYGB7cn0KcmF3X2RhdGEkaW5jcmVhc2VfcmF0ZSA8LSByYXdfZGF0YSRUb3RhbC5Hcm9zcy9yYXdfZGF0YSRPcGVuaW5nLkdyb3NzCnJhd19kYXRhJGF2Z19ncm9zcyA8LSByYXdfZGF0YSRUb3RhbC5Hcm9zcy9yYXdfZGF0YSROdW1iZXIub2YuVGhlYXRlcnMKYGBgCuWvueS6juivpeaVsOaNrumbhu+8jCAgCummluWFiOinguWvn01vdGlvbi5QaWN0dXJl5Y+Y6YeP5Lit55qE55S15b2x5ZCN56ew5Y+v5Lul5Y+R546w6K+l5pWw5o2u5o+P6L+w55qE5pivMjAwNeW5tOWcqOe+juWbveS4iuaYoOeahOeUteW9seelqOaIv+etieebuOWFs+aVsOaNru+8jOeUseatpOWPr+S7peiuoeeul+WHuue+juWbveW9k+W5tOeahOeUteW9seaAu+S6p+WAvOS4ugpgYGB7cn0Kc3VtKG5hLm9taXQocmF3X2RhdGEpJFRvdGFsLkdyb3NzKQpgYGAKCgoKYGBge3J9CnJhd19kYXRhJE51bWJlci5vZi5UaGVhdGVycyA8LSBhcy5udW1lcmljKGdzdWIoIiwiLCIiLCByYXdfZGF0YSROdW1iZXIub2YuVGhlYXRlcnMpKQpwYXIobWZyb3c9YygyLDIpKQpoaXN0KHJhd19kYXRhJFdlZWtzLmluLlRvcC42MCwgeGxhYiA9ICJ3ZWVrcyBpbiB0b3AgNjAiLCB5bGFiID0gImZyZXF1ZW5jeSIpCmhpc3QocmF3X2RhdGEkT3BlbmluZy5Hcm9zcywgeGxhYiA9ICJvcGVuaW5nIGdyb3NzIiwgeWxhYiA9ICJmcmVxdWVuY3kiKQpoaXN0KHJhd19kYXRhJFRvdGFsLkdyb3NzLCB4bGFiID0gInRvdGFsIGdyb3NzIiwgeWxhYiA9ICJmcmVxdWVuY3kiKQpoaXN0KHJhd19kYXRhJE51bWJlci5vZi5UaGVhdGVycywgeGxhYiA9ICJudW1iZXIgb2YgdGhlYXRlcnMiLCB5bGFiID0gImZyZXF1ZW5jeSIpCmBgYArku47mr4/kuIDliJflsZ7mgKfnmoTnm7Tmlrnlm77lj6/ku6XnnIvlh7rvvIwgIAoxLiB3ZWVrcyBpbiB0b3AgNjDln7rmnKzlkYjnjrDlh7rlj7PlgY/liIbluIPvvIzlpKfpg6jliIYoNzAtODAlKeaVsOWAvOiBmumbhuWcqDAtMTDkuYvpl7TvvIzor7TmmI7kuIDoiKzkuIrmmKDnlLXlvbHln7rmnKzlj6rog73lnKjmppzljZXmjIHnu60xMOWkqeW3puWPs++8myAgCjIuIOWQjOagt+eahO+8jDcwJeeahOmihOWUruelqOaIv+mDveWcqDAtMTDkuYvpl7TvvJsgIAozLiDnlLXlvbHnmoTkuIrmmKDliafpmaLlpKfpg6jliIbpg73pm4bkuK3lnKgwLTUwMCwg5LiN6L+H5pyJMjAl5bem5Y+z5YiG5biD5ZyoMzAwMC0zNTAw5LmL6Ze044CC5LuO5LiK5pig5pWw6YeP5p2l55yL77yM5b2T5pe255qE576O5Zu955S15b2x5Lqn5Lia5aSE5LqO6Imv5oCn5Y+R5bGV54q25oCB44CCICAgIAoKYGBge3J9CmxpYnJhcnkoY29ycnBsb3QpCmNvcihuYS5vbWl0KHJhd19kYXRhWzI6N10pKQpjb3JycGxvdChjb3JyPWNvcihuYS5vbWl0KHJhd19kYXRhWzI6N10pKSxtZXRob2QgPSAiY29sb3IiLG9yZGVyID0gIkFPRSIsYWRkQ29lZi5jb2wgPSAiZ3JleSIpCmBgYArnlLHkuIrlm77lj6/nn6XvvIzmnInlh6DkuKrlj5jph4/nm7jlhbPmgKfpnZ7luLjpq5jvvJrkvovlpoJUb3RhbC5Hcm9zcyAmJiBPcGVuaW5nLkdyb3NzLCBhdmdfZ3Jvc3MgJiYgV2Vla3MuaW4uVG9wLjYw562J44CCCgpgYGB7cn0KcmVxdWlyZShHR2FsbHkpCmdncGFpcnMobmEub21pdChyYXdfZGF0YSlbLGMoMjo3KV0pCmBgYAoKCuS7peS4iuWHoOW8oOWPmOmHj+mXtOaVo+eCueWbvuivtOaYjuS7peS4i+WHoOeCue+8miAgCjEuIOmihOWUruaAu+elqOaIv+S4juacgOe7iOaAu+elqOaIv+WRiOeOsOato+ebuOWFs++8jOmihOWUruelqOaIv+i2iumrmO+8jOWPr+S7peivtOacgOe7iOeahOaAu+elqOaIv+S4gOWumuS4jeS8muW3ruOAgiAgCjIuIOinguWvn+WIsOW9k+WboOWPmOmHj+aYr051bWJlci5vZi5UaGVhdGVyc++8jOWPr+S7peWPkeeOsE51bWJlci5vZi5UaGVhdGVyc+aYr+aciemYiOWAvOeahO+8jOWPr+S7peeUqOi/meS4quWPmOmHj+eahOacgOWkp+WAvOadpeS8sOiuoeW9k+aXtjIwMDXlubTnvo7lm73nmoTmgLvmtLvot4PnlLXlvbHpmaLnmoTmlbDph4/jgILov5nkuKrmlbDph4/kuLrvvJoKYGBge3J9Cm1heChuYS5vbWl0KHJhd19kYXRhKSROdW1iZXIub2YuVGhlYXRlcnMpCmBgYAozLiDlubPlnYfnpajmiL8oYXZnX2dyb3NzKeS4juWFtuWug+WPmOmHj+Wdh+acieiJr+WlveeahOe6v+aAp+ebuOWFs+aAp++8jOS4lOmDveS4uuato+ebuOWFs++8jOWug+WPr+S7peW+iOWlveWcsOeUqOadpeino+mHiuWFtuWug+WPmOmHj+OAggoKCgoKCgoKCgoKCgoKCgoKCgo=