Start Date: Oct. 7, 2024

Report Date: 31 October 2024

Source: Psi Chi R

knitr::opts_chunk$set(echo = T,message = F,warning = F)
library(tidyverse)

data=read.csv('data.csv')
#write.csv(data,'data.csv')
## EDA
SmartEDA::ExpData(data)
skimr::skim(data)
Data summary
Name data
Number of rows 3004
Number of columns 12
_______________________
Column type frequency:
character 9
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Title 0 1 9 62 0 2982 0
Genres 0 1 0 80 1 243 0
Release_Date 0 1 8 10 0 1252 0
Release 0 1 2 20 0 69 0
ParentalRating 0 1 0 9 1663 12 0
Plot 0 1 0 547 1 2987 0
Cast 0 1 0 308 13 2974 0
Language 0 1 0 64 69 162 0
FilmingLocations 0 1 0 125 1131 1027 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Rating 232 0.92 5.06 1.46 1 4 5 6 9.8e+00 ▁▆▇▃▁
RunTime 487 0.84 90.75 14.57 12 82 90 96 1.8e+02 ▁▂▇▁▁
Budget 2094 0.30 3433828.09 9269568.33 1 25000 475000 2500000 8.0e+07 ▇▁▁▁▁

Data processing (level 1)

### Write a script that will filter out movies that are missing a value in the ‘Budget,’ ‘RunTime,’ ‘ParentalRating’ and ‘Rating’ columns.

data1 = data %>% 
  filter(complete.cases(Budget),
         complete.cases(RunTime),
         complete.cases(ParentalRating),
         complete.cases(Rating)) %>% 
  mutate(DATE= mdy(Release_Date)) %>% 
  select(DATE,Release_Date,everything())
### Right now, the genre columns cannot be properly analyzed due to formatting (e.g., Action| Fantasy| Horror| Mystery| Romance). Write a script that will separate the column into multiple Genre categories.

data2 = data1 %>% 
  separate_wider_delim(Genres, delim = "|",
                       names_sep = "_",
                       too_few = "align_start")


# weird how Genres1 became sensible words
# data3 = data2 %>% 
#   mutate(Genres2 = strsplit(as.character(Genres1),"\\,")) %>% 
#   unnest() %>% 
#   select(Genres2,Genres1,Genres,everything())

Descriptive Statistics (level 2)

### Write a script that will provide the average, standard deviation, median, and range of the ‘Rating’ variable. Note the values you got from running the code

summary(data2$Rating,na.rm=T)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.900   4.900   4.955   6.000   9.600
sd(data2$Rating,na.rm = T)
## [1] 1.493
# Average = 5.0, Stand. Dev. = 1.5,
# Median = 4.9, Range = 1 to 9.6
### FILTER BY MAIN GENRE
### Write a script that will show the average ratings by a movie’s main genre. 

data2 %>% 
  group_by(Title,Genres_1) %>% 
  summarise(Rating = mean(Rating,na.rm = T)) %>% 
  arrange(desc(Rating))

Data visualization (level 3)

### Create a graph that shows the ‘Budget’ for the movies in the dataset over time.

data2 %>% 
  distinct(Title,DATE,.keep_all = T) %>% 
  ggplot(aes(x=DATE,y=Budget))+
  geom_line()+
  theme_bw()

# Something weird is happening with the Release_Date column showing 1905 for movies like Habit (2017) or Voidfinder (2017)
# extract year out of `Title` instead

data3 = data2 %>% 
  mutate(YEAR = str_extract(Title, "\\(\\d{4}\\)")) %>% 
  mutate(YEAR1 = str_extract(YEAR, "\\d{4}")) %>% 
  mutate(YEAR2 = as.Date(paste0(YEAR1,"-01-01"))) %>% 
  select(YEAR2,YEAR1,YEAR,everything())

# dollar format
dollars = function(x) {
  paste0("$", format(x,big.mark= ",",scientific=F))
}

data3 %>% 
  ggplot(aes(x=YEAR2,y=Budget))+
  geom_line()+
  theme_bw()+
  labs(x='Release Year',
       y='Movie Budget',
       title='From Box Office to Blockbuster',
       subtitle='A Decade-by-Decade Look into Movie Budgets')+
  theme(plot.title = element_text(hjust = .5),
        plot.subtitle = element_text(hjust = .5),
        axis.text.y=element_text(vjust=0,angle = 45))+
  scale_y_continuous(labels = dollars)

Inferential statistics (level 4)

### Do movies with bigger budgets get better ratings?
qqplot(data3$Budget,data3$Rating)

#No, there's no correlation between budget and rating.
cor.test(data3$Budget,data3$Rating,method = 'kendall')
## 
##  Kendall's rank correlation tau
## 
## data:  data3$Budget and data3$Rating
## z = 0.68657, p-value = 0.4924
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## 0.01723055
data3 %>% 
  ggplot(aes(x=Budget,y=Rating,col=Budget))+
  geom_point(size=2)+
  theme_bw()+
  scale_x_continuous(labels = dollars)

### Do movies in the Horror genre get better ratings than Mystery movies? 

#filter isn't showing anything. Odd.
#Check WS

horror= data3 %>%
  rowwise() %>%
  filter(any(c_across(starts_with("Genres_")) == " Horror"))

mystery = data3 %>%
  rowwise() %>%
  filter(any(c_across(starts_with("Genres_")) == " Mystery"))
shapiro.test(mystery$Rating)
## 
##  Shapiro-Wilk normality test
## 
## data:  mystery$Rating
## W = 0.99119, p-value = 0.7081
shapiro.test(horror$Rating)
## 
##  Shapiro-Wilk normality test
## 
## data:  horror$Rating
## W = 0.99551, p-value = 0.03097
hist(mystery$Rating)

hist(horror$Rating)

t.test(mystery$Rating,horror$Rating)
## 
##  Welch Two Sample t-test
## 
## data:  mystery$Rating and horror$Rating
## t = 3.8553, df = 153.9, p-value = 0.0001694
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2547602 0.7902126
## sample estimates:
## mean of x mean of y 
##  5.477982  4.955495
#Yes, Horror does get better ratings than Mystery.
LS0tDQp0aXRsZTogIlBzaSBDaGkgUiAtIE9jdG9iZXIgMjAyNCINCmF1dGhvcjogImJ5IEFsYW4gTGFtIg0KI2RhdGU6ICJgciBTeXMuRGF0ZSgpYCINCiNkYXRlOiAiRGF0ZTogYHIgZm9ybWF0KFN5cy5EYXRlKCksICclZCAlQiAlWScpYCIgDQpvdXRwdXQ6DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdGhlbWU6IHJlYWRhYmxlDQogICAgYWx3YXlzX2FsbG93X2h0bWw6IHllcw0KICAgIGRmX3ByaW50OiBwYWdlZA0KICAgIHRvYzogeWVzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgICBudW1iZXJfc2VjdGlvbnM6IG5vDQogICAgYW5jaG9yX3NlY3Rpb25zOiBUUlVFDQogICAgY29kZV9mb2xkaW5nOiBoaWRlDQogICAgY29kZV9kb3dubG9hZDogdHJ1ZQ0KLS0tDQpTdGFydCBEYXRlOiBPY3QuIDcsIDIwMjQNCg0KUmVwb3J0IERhdGU6IGByIGZvcm1hdChTeXMuRGF0ZSgpLCAnJWQgJUIgJVknKWANCg0KWyoqU291cmNlKio6IFBzaSBDaGkgUl0oaHR0cHM6Ly9vc2YuaW8vYWN1aGUvd2lraS9ob21lLykNCg0KDQpgYGB7ciBzZXR1cCwgd2FybmluZz1GLG1lc3NhZ2U9Rn0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVCxtZXNzYWdlID0gRix3YXJuaW5nID0gRikNCmxpYnJhcnkodGlkeXZlcnNlKQ0KDQpkYXRhPXJlYWQuY3N2KCdkYXRhLmNzdicpDQojd3JpdGUuY3N2KGRhdGEsJ2RhdGEuY3N2JykNCmBgYA0KDQoNCmBgYHtyfQ0KIyMgRURBDQpTbWFydEVEQTo6RXhwRGF0YShkYXRhKQ0KDQpza2ltcjo6c2tpbShkYXRhKQ0KDQoNCmBgYA0KDQoNCiMjIERhdGEgcHJvY2Vzc2luZyAobGV2ZWwgMSkNCg0KDQoNCmBgYHtyfQ0KIyMjIFdyaXRlIGEgc2NyaXB0IHRoYXQgd2lsbCBmaWx0ZXIgb3V0IG1vdmllcyB0aGF0IGFyZSBtaXNzaW5nIGEgdmFsdWUgaW4gdGhlIOKAmEJ1ZGdldCzigJkg4oCYUnVuVGltZSzigJkg4oCYUGFyZW50YWxSYXRpbmfigJkgYW5kIOKAmFJhdGluZ+KAmSBjb2x1bW5zLg0KDQpkYXRhMSA9IGRhdGEgJT4lIA0KICBmaWx0ZXIoY29tcGxldGUuY2FzZXMoQnVkZ2V0KSwNCiAgICAgICAgIGNvbXBsZXRlLmNhc2VzKFJ1blRpbWUpLA0KICAgICAgICAgY29tcGxldGUuY2FzZXMoUGFyZW50YWxSYXRpbmcpLA0KICAgICAgICAgY29tcGxldGUuY2FzZXMoUmF0aW5nKSkgJT4lIA0KICBtdXRhdGUoREFURT0gbWR5KFJlbGVhc2VfRGF0ZSkpICU+JSANCiAgc2VsZWN0KERBVEUsUmVsZWFzZV9EYXRlLGV2ZXJ5dGhpbmcoKSkNCmBgYA0KDQoNCg0KYGBge3J9DQojIyMgUmlnaHQgbm93LCB0aGUgZ2VucmUgY29sdW1ucyBjYW5ub3QgYmUgcHJvcGVybHkgYW5hbHl6ZWQgZHVlIHRvIGZvcm1hdHRpbmcgKGUuZy4sIEFjdGlvbnwgRmFudGFzeXwgSG9ycm9yfCBNeXN0ZXJ5fCBSb21hbmNlKS4gV3JpdGUgYSBzY3JpcHQgdGhhdCB3aWxsIHNlcGFyYXRlIHRoZSBjb2x1bW4gaW50byBtdWx0aXBsZSBHZW5yZSBjYXRlZ29yaWVzLg0KDQpkYXRhMiA9IGRhdGExICU+JSANCiAgc2VwYXJhdGVfd2lkZXJfZGVsaW0oR2VucmVzLCBkZWxpbSA9ICJ8IiwNCiAgICAgICAgICAgICAgICAgICAgICAgbmFtZXNfc2VwID0gIl8iLA0KICAgICAgICAgICAgICAgICAgICAgICB0b29fZmV3ID0gImFsaWduX3N0YXJ0IikNCg0KDQojIHdlaXJkIGhvdyBHZW5yZXMxIGJlY2FtZSBzZW5zaWJsZSB3b3Jkcw0KIyBkYXRhMyA9IGRhdGEyICU+JSANCiMgICBtdXRhdGUoR2VucmVzMiA9IHN0cnNwbGl0KGFzLmNoYXJhY3RlcihHZW5yZXMxKSwiXFwsIikpICU+JSANCiMgICB1bm5lc3QoKSAlPiUgDQojICAgc2VsZWN0KEdlbnJlczIsR2VucmVzMSxHZW5yZXMsZXZlcnl0aGluZygpKQ0KYGBgDQoNCg0KIyMgRGVzY3JpcHRpdmUgU3RhdGlzdGljcyAobGV2ZWwgMikNCg0KDQpgYGB7cn0NCiMjIyBXcml0ZSBhIHNjcmlwdCB0aGF0IHdpbGwgcHJvdmlkZSB0aGUgYXZlcmFnZSwgc3RhbmRhcmQgZGV2aWF0aW9uLCBtZWRpYW4sIGFuZCByYW5nZSBvZiB0aGUg4oCYUmF0aW5n4oCZIHZhcmlhYmxlLiBOb3RlIHRoZSB2YWx1ZXMgeW91IGdvdCBmcm9tIHJ1bm5pbmcgdGhlIGNvZGUNCg0Kc3VtbWFyeShkYXRhMiRSYXRpbmcsbmEucm09VCkNCg0Kc2QoZGF0YTIkUmF0aW5nLG5hLnJtID0gVCkNCg0KIyBBdmVyYWdlID0gNS4wLCBTdGFuZC4gRGV2LiA9IDEuNSwNCiMgTWVkaWFuID0gNC45LCBSYW5nZSA9IDEgdG8gOS42DQpgYGANCg0KDQoNCmBgYHtyfQ0KIyMjIEZJTFRFUiBCWSBNQUlOIEdFTlJFDQojIyMgV3JpdGUgYSBzY3JpcHQgdGhhdCB3aWxsIHNob3cgdGhlIGF2ZXJhZ2UgcmF0aW5ncyBieSBhIG1vdmll4oCZcyBtYWluIGdlbnJlLiANCg0KZGF0YTIgJT4lIA0KICBncm91cF9ieShUaXRsZSxHZW5yZXNfMSkgJT4lIA0KICBzdW1tYXJpc2UoUmF0aW5nID0gbWVhbihSYXRpbmcsbmEucm0gPSBUKSkgJT4lIA0KICBhcnJhbmdlKGRlc2MoUmF0aW5nKSkNCmBgYA0KDQojIyBEYXRhIHZpc3VhbGl6YXRpb24gKGxldmVsIDMpDQoNCg0KDQpgYGB7cn0NCiMjIyBDcmVhdGUgYSBncmFwaCB0aGF0IHNob3dzIHRoZSDigJhCdWRnZXTigJkgZm9yIHRoZSBtb3ZpZXMgaW4gdGhlIGRhdGFzZXQgb3ZlciB0aW1lLg0KDQpkYXRhMiAlPiUgDQogIGRpc3RpbmN0KFRpdGxlLERBVEUsLmtlZXBfYWxsID0gVCkgJT4lIA0KICBnZ3Bsb3QoYWVzKHg9REFURSx5PUJ1ZGdldCkpKw0KICBnZW9tX2xpbmUoKSsNCiAgdGhlbWVfYncoKQ0KDQojIFNvbWV0aGluZyB3ZWlyZCBpcyBoYXBwZW5pbmcgd2l0aCB0aGUgUmVsZWFzZV9EYXRlIGNvbHVtbiBzaG93aW5nIDE5MDUgZm9yIG1vdmllcyBsaWtlIEhhYml0ICgyMDE3KSBvciBWb2lkZmluZGVyICgyMDE3KQ0KYGBgDQoNCmBgYHtyfQ0KIyBleHRyYWN0IHllYXIgb3V0IG9mIGBUaXRsZWAgaW5zdGVhZA0KDQpkYXRhMyA9IGRhdGEyICU+JSANCiAgbXV0YXRlKFlFQVIgPSBzdHJfZXh0cmFjdChUaXRsZSwgIlxcKFxcZHs0fVxcKSIpKSAlPiUgDQogIG11dGF0ZShZRUFSMSA9IHN0cl9leHRyYWN0KFlFQVIsICJcXGR7NH0iKSkgJT4lIA0KICBtdXRhdGUoWUVBUjIgPSBhcy5EYXRlKHBhc3RlMChZRUFSMSwiLTAxLTAxIikpKSAlPiUgDQogIHNlbGVjdChZRUFSMixZRUFSMSxZRUFSLGV2ZXJ5dGhpbmcoKSkNCg0KIyBkb2xsYXIgZm9ybWF0DQpkb2xsYXJzID0gZnVuY3Rpb24oeCkgew0KICBwYXN0ZTAoIiQiLCBmb3JtYXQoeCxiaWcubWFyaz0gIiwiLHNjaWVudGlmaWM9RikpDQp9DQoNCmRhdGEzICU+JSANCiAgZ2dwbG90KGFlcyh4PVlFQVIyLHk9QnVkZ2V0KSkrDQogIGdlb21fbGluZSgpKw0KICB0aGVtZV9idygpKw0KICBsYWJzKHg9J1JlbGVhc2UgWWVhcicsDQogICAgICAgeT0nTW92aWUgQnVkZ2V0JywNCiAgICAgICB0aXRsZT0nRnJvbSBCb3ggT2ZmaWNlIHRvIEJsb2NrYnVzdGVyJywNCiAgICAgICBzdWJ0aXRsZT0nQSBEZWNhZGUtYnktRGVjYWRlIExvb2sgaW50byBNb3ZpZSBCdWRnZXRzJykrDQogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAuNSksDQogICAgICAgIHBsb3Quc3VidGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAuNSksDQogICAgICAgIGF4aXMudGV4dC55PWVsZW1lbnRfdGV4dCh2anVzdD0wLGFuZ2xlID0gNDUpKSsNCiAgc2NhbGVfeV9jb250aW51b3VzKGxhYmVscyA9IGRvbGxhcnMpDQpgYGANCg0KDQojIyBJbmZlcmVudGlhbCBzdGF0aXN0aWNzIChsZXZlbCA0KQ0KDQpgYGB7cn0NCiMjIyBEbyBtb3ZpZXMgd2l0aCBiaWdnZXIgYnVkZ2V0cyBnZXQgYmV0dGVyIHJhdGluZ3M/DQpxcXBsb3QoZGF0YTMkQnVkZ2V0LGRhdGEzJFJhdGluZykNCmBgYA0KDQpgYGB7cn0NCiNObywgdGhlcmUncyBubyBjb3JyZWxhdGlvbiBiZXR3ZWVuIGJ1ZGdldCBhbmQgcmF0aW5nLg0KY29yLnRlc3QoZGF0YTMkQnVkZ2V0LGRhdGEzJFJhdGluZyxtZXRob2QgPSAna2VuZGFsbCcpDQpgYGANCg0KYGBge3J9DQpkYXRhMyAlPiUgDQogIGdncGxvdChhZXMoeD1CdWRnZXQseT1SYXRpbmcsY29sPUJ1ZGdldCkpKw0KICBnZW9tX3BvaW50KHNpemU9MikrDQogIHRoZW1lX2J3KCkrDQogIHNjYWxlX3hfY29udGludW91cyhsYWJlbHMgPSBkb2xsYXJzKQ0KYGBgDQoNCg0KDQpgYGB7cn0NCg0KIyMjIERvIG1vdmllcyBpbiB0aGUgSG9ycm9yIGdlbnJlIGdldCBiZXR0ZXIgcmF0aW5ncyB0aGFuIE15c3RlcnkgbW92aWVzPyANCg0KI2ZpbHRlciBpc24ndCBzaG93aW5nIGFueXRoaW5nLiBPZGQuDQojQ2hlY2sgV1MNCg0KaG9ycm9yPSBkYXRhMyAlPiUNCiAgcm93d2lzZSgpICU+JQ0KICBmaWx0ZXIoYW55KGNfYWNyb3NzKHN0YXJ0c193aXRoKCJHZW5yZXNfIikpID09ICIgSG9ycm9yIikpDQoNCm15c3RlcnkgPSBkYXRhMyAlPiUNCiAgcm93d2lzZSgpICU+JQ0KICBmaWx0ZXIoYW55KGNfYWNyb3NzKHN0YXJ0c193aXRoKCJHZW5yZXNfIikpID09ICIgTXlzdGVyeSIpKQ0KDQpgYGANCg0KYGBge3J9DQpzaGFwaXJvLnRlc3QobXlzdGVyeSRSYXRpbmcpDQpzaGFwaXJvLnRlc3QoaG9ycm9yJFJhdGluZykNCg0KaGlzdChteXN0ZXJ5JFJhdGluZykNCmhpc3QoaG9ycm9yJFJhdGluZykNCg0KdC50ZXN0KG15c3RlcnkkUmF0aW5nLGhvcnJvciRSYXRpbmcpDQojWWVzLCBIb3Jyb3IgZG9lcyBnZXQgYmV0dGVyIHJhdGluZ3MgdGhhbiBNeXN0ZXJ5Lg0KYGBgDQoNCg0KYGBge3IgaW5jbHVkZT1GQUxTRX0NCiNiZWVwIHdoZW4gZG9uZQ0KaWYgKHJlcXVpcmUoImJlZXByIixxdWlldGx5ID0gVCkpDQogIGJlZXByOjpiZWVwKDIpDQpgYGANCg==