About this Project


Brief description of your project.


Resources: Packages Used in this notebook


Install required packages


# Make sure every package this notebook relies on is installed and attached.
#
# requireNamespace() is used for the availability check instead of require():
# it reports whether the package can be loaded without attaching it and,
# with quietly = TRUE, without the warning require() prints when the package
# is missing. library() is then called unconditionally, so the package is
# attached in exactly one place and a failed installation stops the notebook
# with an error.

# tidyverse: install it only when it is not already on this system
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE)
}

# Attach tidyverse
library("tidyverse")

# corrplot: install it only when it is not already on this system
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot", dependencies = TRUE)
}

# Attach corrplot
library("corrplot")

Data Collection: INDUSTRY - TITLE_OF_YOUR_DATASET


Read the CSV file into RStudio and explore the dataset (extract variables)

# Column-by-column summary of the raw dataset as it was read in
mydata %>%
  summary()
    title              genres            director            actor1         
 Length:5043        Length:5043        Length:5043        Length:5043       
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
    actor2             actor3              length          budget         
 Length:5043        Length:5043        Min.   :  7.0   Min.   :2.180e+02  
 Class :character   Class :character   1st Qu.: 93.0   1st Qu.:6.000e+06  
 Mode  :character   Mode  :character   Median :103.0   Median :2.000e+07  
                                       Mean   :107.2   Mean   :3.510e+07  
                                       3rd Qu.:118.0   3rd Qu.:4.400e+07  
                                       Max.   :511.0   Max.   :2.128e+09  
                                       NA's   :15      NA's   :496        
 director_fb_likes actor1_fb_likes  actor2_fb_likes  actor3_fb_likes  
 Min.   :    0.0   Min.   :     0   Min.   :     0   Min.   :    0.0  
 1st Qu.:    7.0   1st Qu.:   614   1st Qu.:   281   1st Qu.:  133.0  
 Median :   49.0   Median :   988   Median :   595   Median :  371.5  
 Mean   :  686.5   Mean   :  6560   Mean   :  1652   Mean   :  645.0  
 3rd Qu.:  194.5   3rd Qu.: 11000   3rd Qu.:   918   3rd Qu.:  636.0  
 Max.   :23000.0   Max.   :640000   Max.   :137000   Max.   :23000.0  
 NA's   :104       NA's   :7        NA's   :13       NA's   :23       
 total_cast_likes    fb_likes      critic_reviews  users_reviews   
 Min.   :     0   Min.   :     0   Min.   :  1.0   Min.   :   1.0  
 1st Qu.:  1411   1st Qu.:     0   1st Qu.: 50.0   1st Qu.:  65.0  
 Median :  3090   Median :   166   Median :110.0   Median : 156.0  
 Mean   :  9699   Mean   :  7526   Mean   :140.2   Mean   : 272.8  
 3rd Qu.: 13756   3rd Qu.:  3000   3rd Qu.:195.0   3rd Qu.: 326.0  
 Max.   :656730   Max.   :349000   Max.   :813.0   Max.   :5060.0  
                                   NA's   :50      NA's   :21      
  users_votes          score        aspect_ratio       gross          
 Min.   :      5   Min.   :1.600   Min.   : 1.18   Min.   :      162  
 1st Qu.:   8594   1st Qu.:5.800   1st Qu.: 1.85   1st Qu.:  5340988  
 Median :  34359   Median :6.600   Median : 2.35   Median : 25517500  
 Mean   :  83668   Mean   :6.442   Mean   : 2.22   Mean   : 48468408  
 3rd Qu.:  96309   3rd Qu.:7.200   3rd Qu.: 2.35   3rd Qu.: 62309438  
 Max.   :1689764   Max.   :9.500   Max.   :16.00   Max.   :760505847  
                                   NA's   :329     NA's   :884        
      year       genres_count  
 Min.   :1916   Min.   :1.000  
 1st Qu.:1999   1st Qu.:2.000  
 Median :2005   Median :3.000  
 Mean   :2002   Mean   :2.853  
 3rd Qu.:2011   3rd Qu.:4.000  
 Max.   :2016   Max.   :8.000  
 NA's   :108                   

## Data Preparation: Cleaning and preparing the data for analysis

Replace strings

# Number of elements held by each of the first five entries of `genres`
lengths(genres[1:5])
[1] 4 3 3 2 1
# Peek at the first ten values of the logical vector `doc_true`
doc_true[seq_len(10)]
 [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE

Save the new clean data

# Write the cleaned data frame `newdata` to data/clean_data.csv
write_csv(
  newdata,
  "data/clean_data.csv"
)

Data Analysis: Descriptive Statistics, Correlations


Basic descriptive statistics of the dataset (write down any observations)

# Column-by-column summary of the cleaned dataset
newdata %>%
  summary()
    title              genres            director            actor1         
 Length:49          Length:49          Length:49          Length:49         
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
    actor2             actor3              length           budget        
 Length:49          Length:49          Min.   : 41.00   Min.   :     218  
 Class :character   Class :character   1st Qu.: 89.00   1st Qu.:  700000  
 Mode  :character   Mode  :character   Median : 93.00   Median : 2000000  
                                       Mean   : 99.04   Mean   : 5254317  
                                       3rd Qu.:106.00   3rd Qu.: 5000000  
                                       Max.   :215.00   Max.   :60000000  
 director_fb_likes actor1_fb_likes   actor2_fb_likes actor3_fb_likes 
 Min.   :   0.0    Min.   :    0.0   Min.   :  0.0   Min.   :  0.00  
 1st Qu.:   9.0    1st Qu.:   56.0   1st Qu.:  7.0   1st Qu.:  0.00  
 Median :  23.0    Median :  191.0   Median : 44.0   Median : 13.00  
 Mean   : 153.2    Mean   :  602.6   Mean   :156.6   Mean   : 81.49  
 3rd Qu.: 120.0    3rd Qu.:  773.0   3rd Qu.:208.0   3rd Qu.: 67.00  
 Max.   :1000.0    Max.   :11000.0   Max.   :892.0   Max.   :748.00  
 total_cast_likes     fb_likes     critic_reviews   users_reviews   
 Min.   :    0.0   Min.   :    0   Min.   :  4.00   Min.   :   4.0  
 1st Qu.:   68.0   1st Qu.:    0   1st Qu.: 30.00   1st Qu.:  31.0  
 Median :  338.0   Median :    0   Median : 72.00   Median :  69.0  
 Mean   :  946.3   Mean   : 3212   Mean   : 92.82   Mean   : 143.8  
 3rd Qu.: 1252.0   3rd Qu.:  588   3rd Qu.:123.00   3rd Qu.: 134.0  
 Max.   :11218.0   Max.   :62000   Max.   :288.00   Max.   :1416.0  
  users_votes         score       aspect_ratio        gross          
 Min.   :   131   Min.   :1.60   Min.   : 1.330   Min.   :     1111  
 1st Qu.:  2482   1st Qu.:6.70   1st Qu.: 1.780   1st Qu.:   592014  
 Median :  7721   Median :7.40   Median : 1.850   Median :  3713002  
 Mean   : 21676   Mean   :6.98   Mean   : 2.133   Mean   : 17885042  
 3rd Qu.: 23836   3rd Qu.:7.70   3rd Qu.: 1.850   3rd Qu.: 21244913  
 Max.   :123090   Max.   :8.50   Max.   :16.000   Max.   :119078393  
      year       genres_count
 Min.   :1970   Min.   :1    
 1st Qu.:2004   1st Qu.:1    
 Median :2007   Median :1    
 Mean   :2006   Mean   :1    
 3rd Qu.:2010   3rd Qu.:1    
 Max.   :2013   Max.   :1    
# Adds the entire director_fb_likes column to the first row's
# actor1_fb_likes and total_cast_likes values.
# NOTE(review): the first term is not indexed with [1] like the other two;
# confirm that summing the whole column is intended.
sum(
  newdata$director_fb_likes,
  newdata$actor1_fb_likes[1],
  newdata$total_cast_likes[1]
)
[1] 7726

Correlation table ( only numeric data )

# Print the correlation matrix stored in `data_corr`
data_corr
                       length      budget director_fb_likes actor1_fb_likes
length             1.00000000  0.07155201        0.16101077     -0.10143280
budget             0.07155201  1.00000000        0.05289809      0.06737129
director_fb_likes  0.16101077  0.05289809        1.00000000      0.02609618
actor1_fb_likes   -0.10143280  0.06737129        0.02609618      1.00000000
actor2_fb_likes    0.09005790  0.12778240        0.35184349      0.17359325
actor3_fb_likes    0.12845460  0.07481820        0.26296159      0.14794872
total_cast_likes  -0.05884588  0.10133039        0.11683803      0.95142980
fb_likes           0.07960336  0.03894341        0.04583538     -0.05343623
critic_reviews     0.12889690  0.17629284        0.47632480      0.15000723
users_reviews      0.20296368  0.07294848        0.62394343      0.15510807
users_votes        0.21743822  0.14351307        0.58671621      0.15475257
score              0.11019499 -0.10990129        0.11807291     -0.02906095
aspect_ratio      -0.02948317 -0.05001503       -0.08010282     -0.01951027
gross              0.14669378  0.47931328        0.25540764      0.35986081
year              -0.67426761  0.17327838       -0.15972997     -0.02006917
                  actor2_fb_likes actor3_fb_likes total_cast_likes    fb_likes
length                0.090057898      0.12845460      -0.05884588  0.07960336
budget                0.127782398      0.07481820       0.10133039  0.03894341
director_fb_likes     0.351843492      0.26296159       0.11683803  0.04583538
actor1_fb_likes       0.173593252      0.14794872       0.95142980 -0.05343623
actor2_fb_likes       1.000000000      0.83002137       0.44505783 -0.15045265
actor3_fb_likes       0.830021372      1.00000000       0.43361069 -0.11901935
total_cast_likes      0.445057828      0.43361069       1.00000000 -0.09193344
fb_likes             -0.150452655     -0.11901935      -0.09193344  1.00000000
critic_reviews        0.134061968      0.12619893       0.18491380  0.17762914
users_reviews         0.201427943      0.19625651       0.20508101  0.07557735
users_votes           0.176643997      0.17002324       0.20628669  0.28102803
score                -0.157769992     -0.24756783      -0.09436891 -0.35490438
aspect_ratio          0.008564091     -0.01889250      -0.02383234 -0.02995331
gross                 0.290829251      0.30754525       0.43816420  0.15681634
year                 -0.011731671     -0.00547031      -0.01419498  0.19141490
                  critic_reviews users_reviews users_votes       score
length                0.12889690    0.20296368  0.21743822  0.11019499
budget                0.17629284    0.07294848  0.14351307 -0.10990129
director_fb_likes     0.47632480    0.62394343  0.58671621  0.11807291
actor1_fb_likes       0.15000723    0.15510807  0.15475257 -0.02906095
actor2_fb_likes       0.13406197    0.20142794  0.17664400 -0.15776999
actor3_fb_likes       0.12619893    0.19625651  0.17002324 -0.24756783
total_cast_likes      0.18491380    0.20508101  0.20628669 -0.09436891
fb_likes              0.17762914    0.07557735  0.28102803 -0.35490438
critic_reviews        1.00000000    0.67203644  0.76042212  0.30222208
users_reviews         0.67203644    1.00000000  0.86347911  0.09576719
users_votes           0.76042212    0.86347911  1.00000000  0.03199929
score                 0.30222208    0.09576719  0.03199929  1.00000000
aspect_ratio         -0.12132492   -0.08777527 -0.10820789 -0.01380318
gross                 0.38946540    0.56748782  0.59163181 -0.21334622
year                  0.07304008   -0.09208640 -0.06017546 -0.20168869
                  aspect_ratio       gross        year
length            -0.029483170  0.14669378 -0.67426761
budget            -0.050015032  0.47931328  0.17327838
director_fb_likes -0.080102817  0.25540764 -0.15972997
actor1_fb_likes   -0.019510275  0.35986081 -0.02006917
actor2_fb_likes    0.008564091  0.29082925 -0.01173167
actor3_fb_likes   -0.018892501  0.30754525 -0.00547031
total_cast_likes  -0.023832344  0.43816420 -0.01419498
fb_likes          -0.029953312  0.15681634  0.19141490
critic_reviews    -0.121324922  0.38946540  0.07304008
users_reviews     -0.087775268  0.56748782 -0.09208640
users_votes       -0.108207891  0.59163181 -0.06017546
score             -0.013803184 -0.21334622 -0.20168869
aspect_ratio       1.000000000 -0.08506091  0.10744772
gross             -0.085060913  1.00000000  0.04006751
year               0.107447717  0.04006751  1.00000000

Correlation Plot ( only numeric data )


Visual Analytics: Use Tableau or R to create plots



Predictive Analytics: Create a Predictive Model


Based on your hypothesis, create a predictive model

LS0tCnRpdGxlOiAiUFJPSkVDVCBUSVRMRSIKYXV0aG9yOiAiVEVBTSBOQU1FIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazogZGVmYXVsdAogIGh0bWxfZG9jdW1lbnQ6IGRlZmF1bHQKZGF0ZTogIkRBVEVfSEVSRSIKc3VidGl0bGU6ICJCdXNpbmVzcyBBbmFseXRpY3MgSG9ub3JzIC0gRmluYWwgUHJvamVjdCIKLS0tCgoKLS0tLS0tLS0tLS0tLS0KCiMjIEFib3V0IHRoaXMgUHJvamVjdAoKLS0tLS0tLS0tLS0tLS0KCkJyaWVmIGRlc2NyaXB0aW9uIG9mIHlvdXIgcHJvamVjdC4KCi0tLS0tLS0tLS0tLS0tCgojIyBSZXNvdXJjZXM6IFBhY2thZ2VzIFVzZSBvbiB0aGlzIG5vdGVib29rCgotLS0tLS0tLS0tLS0tLQoKKiB3YXRzb24KKiB0YWJsZWF1CgoqKkluc3RhbGwgcmVxdWlyZWQgcGFja2FnZXMqKgoKKiBQYWNrYWdlOiB0aWR5dmVyc2UsIGNvcnJwbG90CgpgYGB7cn0KCiMgSGVyZSB3ZSBhcmUgY2hlY2tpbmcgaWYgdGhlIHBhY2thZ2UgaXMgaW5zdGFsbGVkCmlmKCFyZXF1aXJlKCJ0aWR5dmVyc2UiKSl7CiAgCiAgIyBJZiB0aGUgcGFja2FnZSBpcyBub3QgaW4gdGhlIHN5c3RlbSB0aGVuIGl0IHdpbGwgYmUgaW5zdGFsbAogIGluc3RhbGwucGFja2FnZXMoInRpZHl2ZXJzZSIsIGRlcGVuZGVuY2llcyA9IFRSVUUpCiAgCiAgIyBIZXJlIHdlIGFyZSBsb2FkaW5nIHRoZSBwYWNrYWdlCiAgbGlicmFyeSgidGlkeXZlcnNlIikKfQoKIyBIZXJlIHdlIGFyZSBjaGVja2luZyBpZiB0aGUgcGFja2FnZSBpcyBpbnN0YWxsZWQKaWYoIXJlcXVpcmUoImNvcnJwbG90IikpewogIAogICMgSWYgdGhlIHBhY2thZ2UgaXMgbm90IGluIHRoZSBzeXN0ZW0gdGhlbiBpdCB3aWxsIGJlIGluc3RhbGwKICBpbnN0YWxsLnBhY2thZ2VzKCJjb3JycGxvdCIsIGRlcGVuZGVuY2llcyA9IFRSVUUpCiAgCiAgIyBIZXJlIHdlIGFyZSBsb2FkaW5nIHRoZSBwYWNrYWdlCiAgbGlicmFyeSgiY29ycnBsb3QiKQp9CgpgYGAKCgotLS0tLS0tLS0tLS0tLQoKIyMgRGF0YSBDb2xsZWN0aW9uOiBJTkRVU1RSWSAtIFRJVExFX09GX1lPVVJfREFUQVNFVCAKCi0tLS0tLS0tLS0tLS0tCgojIyMjIFJlYWQgdGhlIGNzdiBmaWxlIGludG8gUiBTdHVkaW8gYW5kIGV4cGxvcmUgdGhlIGRhdGFzZXQgKGV4dHJhY3QgdmFyaWFibGVzKQoKYGBge3J9CgpteWRhdGEgPC0gcmVhZF9jc3YoImRhdGEvcm90dGVudG9tYXRvZXMuY3N2IikKCmhlYWQobXlkYXRhKQpzdW1tYXJ5KGFzLmZhY3RvcihteWRhdGEkZGlyZWN0b3IpKQoKc3VtbWFyeShteWRhdGEpCmBgYAoKLS0tLS0tLS0tLS0tLS0KCiMjIERhdGEgUHJlcGFyYXRpb246IENsZWFuaW5nIGFuZCBwcmVwYXJpbmcgdGhlIGRhdGEgZm9yIGFuYWx5c2lzCi0tLS0tLS0tLS0tLS0tCgoKIyMjIyBSZXBsYWNlIHN0cmluZ3MKCmBgYHtyfQoKdGl0bGUgPC0gbXlkYXRhJHRpdGxlWzE6MTBdCgp0aXRsZSA8LSBzdHJfcmVwbGFjZSh0aXRsZSwKICAgICAgICAgICAgcGF0dGVybiA9ICLDgiIsCiAgICAgICAgICAgIHJlcGxhY2Vt
ZW50ID0gIiIpCgp0aXRsZSA8LSBzdHJfcmVwbGFjZV9hbGwodGl0bGUsCiAgICAgICAgICAgIHBhdHRlcm4gPSAiw4IiLAogICAgICAgICAgICByZXBsYWNlbWVudCA9ICIiKQoKZ2VucmVzIDwtIG15ZGF0YSRnZW5yZXMKZ2VucmVzWzE6MTBdCgpnZW5yZXMgPC0gc3RyX3NwbGl0KGdlbnJlcywgIlxcfCIpCgpnZW5yZXNfY291bnQgPC0gbGFwcGx5KGdlbnJlcywgbGVuZ3RoKSAlPiUgdW5saXN0KCkKCm15ZGF0YSRnZW5yZXNfY291bnQgPC0gZ2VucmVzX2NvdW50CgpgYGAKCmBgYHtyfQoKZG9jX3RydWUgPC0gc3RyX2RldGVjdChnZW5yZXMsICJEb2N1bWVudGFyeSIpCgpkb2NfdHJ1ZVsxOjEwXQoKZ2VucmVzW2RvY190cnVlXSA8LSAiRG9jdW1lbnRhcnkiCgpgYGAKCmBgYHtyfQoKbmV3ZGF0YSA8LSBuYS5vbWl0KG15ZGF0YVtkb2NfdHJ1ZSxdKQoKbmV3ZGF0YSR0aXRsZSA8LSBzdHJfcmVwbGFjZV9hbGwobmV3ZGF0YSR0aXRsZSwKICAgICAgICAgICAgcGF0dGVybiA9ICLDgiIsCiAgICAgICAgICAgIHJlcGxhY2VtZW50ID0gIiIpCgpuZXdkYXRhJGdlbnJlcyA8LSAiRG9jdW1lbnRhcnkiCgpoZWFkKG5ld2RhdGEpCnRhaWwobmV3ZGF0YSkKCmBgYAoKCgojIyMjIFNhdmUgdGhlIG5ldyBjbGVhbiBkYXRhCgpgYGB7cn0KCndyaXRlX2NzdiggbmV3ZGF0YSwgImRhdGEvY2xlYW5fZGF0YS5jc3YiKQoKYGBgCgoKLS0tLS0tLS0tLS0tLS0KCiMjIERhdGEgQW5hbHlzaXM6IERlc2NyaXB0aXZlIFN0YXRpc3RpY3MsIENvcnJlbGF0aW9ucwoKLS0tLS0tLS0tLS0tLS0KCiMjIyMgQmFzaWMgZGVzY3JpcHRpdmUgc3RhdGlzdGljcyBvZiB0aGUgZGF0YXNldCAod3JpdGUgZG93biBhbnkgb2JzZXJ2YXRpb25zKQoKYGBge3J9CgpzdW1tYXJ5KG5ld2RhdGEpCgpgYGAKCmBgYHtyfQoKbmV3ZGF0YVtuZXdkYXRhJGJ1ZGdldCA+IDUwMDAwMDAwLCBdCgpuZXdkYXRhW25ld2RhdGEkZ3Jvc3MgPT0gMTE5MDc4MzkzLCBdCgpuZXdkYXRhW25ld2RhdGEkeWVhciA9PSAxOTcwLCBdCgpzdW0obmV3ZGF0YSRkaXJlY3Rvcl9mYl9saWtlcyxuZXdkYXRhJGFjdG9yMV9mYl9saWtlc1sxXSwgbmV3ZGF0YSR0b3RhbF9jYXN0X2xpa2VzWzFdKQoKYGBgCgoKIyMjIyBDb3JyZWxhdGlvbiB0YWJsZSAoIG9ubHkgbnVtZXJpYyBkYXRhICkKCmBgYHtyfQoKZGF0YV9jb3JyIDwtIGNvciggbmV3ZGF0YVstYygxLDIsMyw0LDUsNiwyMildICkKZGF0YV9jb3JyCgpgYGAKCgojIyMjIENvcnJlbGF0aW9uIFBsb3QgKCBvbmx5IG51bWVyaWMgZGF0YSApCgpgYGB7cn0KCmNvcnJwbG90KGRhdGFfY29ycikKCmBgYAoKCi0tLS0tLS0tLS0tLS0tCgojIyBWaXN1YWwgQW5hbHl0aWNzOiBVc2UgVGFibGUgb3IgUiB0byBjcmVhdGUgcGxvdHMgCgotLS0tLS0tLS0tLS0tLQoKCmBgYHtyfQoKcGxvdChuZXdkYXRhWy1jKDEsMiwzLDQsNSw2LDIyKV1bMTo1XSkKCmBgYAoKCi0tLS0tLS0tLS0tLS0tCgojIyBQcmVkaWN0aXZlIEFuYWx5dGljczogQ3JlYXRlIGEgUHJlZGlj
dGl2ZSBNb2RlbAoKLS0tLS0tLS0tLS0tLS0KCkJhc2VkIG9uIHlvdXIgaHlwb3RoZXNpcyBjcmVhdGUgYSBwcmVkaWN0aXZlIG1vZGUgCgo=