Start Date: Oct. 7, 2024
Report Date: 31 October 2024
Source:
Psi Chi R
knitr::opts_chunk$set(echo = T,message = F,warning = F)
library(tidyverse)
data=read.csv('data.csv')
#write.csv(data,'data.csv')
## EDA
SmartEDA::ExpData(data)
skimr::skim(data)
Data summary
Name |
data |
Number of rows |
3004 |
Number of columns |
12 |
_______________________ |
|
Column type frequency: |
|
character |
9 |
numeric |
3 |
________________________ |
|
Group variables |
None |
Variable type: character
Title |
0 |
1 |
9 |
62 |
0 |
2982 |
0 |
Genres |
0 |
1 |
0 |
80 |
1 |
243 |
0 |
Release_Date |
0 |
1 |
8 |
10 |
0 |
1252 |
0 |
Release |
0 |
1 |
2 |
20 |
0 |
69 |
0 |
ParentalRating |
0 |
1 |
0 |
9 |
1663 |
12 |
0 |
Plot |
0 |
1 |
0 |
547 |
1 |
2987 |
0 |
Cast |
0 |
1 |
0 |
308 |
13 |
2974 |
0 |
Language |
0 |
1 |
0 |
64 |
69 |
162 |
0 |
FilmingLocations |
0 |
1 |
0 |
125 |
1131 |
1027 |
0 |
Variable type: numeric
Rating |
232 |
0.92 |
5.06 |
1.46 |
1 |
4 |
5 |
6 |
9.8e+00 |
▁▆▇▃▁ |
RunTime |
487 |
0.84 |
90.75 |
14.57 |
12 |
82 |
90 |
96 |
1.8e+02 |
▁▂▇▁▁ |
Budget |
2094 |
0.30 |
3433828.09 |
9269568.33 |
1 |
25000 |
475000 |
2500000 |
8.0e+07 |
▇▁▁▁▁ |
Data processing (level 1)
### Write a script that will filter out movies that are missing a value in the ‘Budget,’ ‘RunTime,’ ‘ParentalRating’ and ‘Rating’ columns.
data1 = data %>%
filter(complete.cases(Budget),
complete.cases(RunTime),
complete.cases(ParentalRating),
complete.cases(Rating)) %>%
mutate(DATE= mdy(Release_Date)) %>%
select(DATE,Release_Date,everything())
### Right now, the genre columns cannot be properly analyzed due to formatting (e.g., Action| Fantasy| Horror| Mystery| Romance). Write a script that will separate the column into multiple Genre categories.
data2 = data1 %>%
separate_wider_delim(Genres, delim = "|",
names_sep = "_",
too_few = "align_start")
# weird how Genres1 became sensible words
# data3 = data2 %>%
# mutate(Genres2 = strsplit(as.character(Genres1),"\\,")) %>%
# unnest() %>%
# select(Genres2,Genres1,Genres,everything())
Descriptive Statistics (level 2)
### Write a script that will provide the average, standard deviation, median, and range of the ‘Rating’ variable. Note the values you got from running the code
summary(data2$Rating,na.rm=T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.900 4.900 4.955 6.000 9.600
sd(data2$Rating,na.rm = T)
## [1] 1.493
# Average = 5.0, Stand. Dev. = 1.5,
# Median = 4.9, Range = 1 to 9.6
### FILTER BY MAIN GENRE
### Write a script that will show the average ratings by a movie’s main genre.
data2 %>%
group_by(Title,Genres_1) %>%
summarise(Rating = mean(Rating,na.rm = T)) %>%
arrange(desc(Rating))
Data visualization (level 3)
### Create a graph that shows the ‘Budget’ for the movies in the dataset over time.
data2 %>%
distinct(Title,DATE,.keep_all = T) %>%
ggplot(aes(x=DATE,y=Budget))+
geom_line()+
theme_bw()

# Something weird is happening with the Release_Date column showing 1905 for movies like Habit (2017) or Voidfinder (2017)
# extract year out of `Title` instead
data3 = data2 %>%
mutate(YEAR = str_extract(Title, "\\(\\d{4}\\)")) %>%
mutate(YEAR1 = str_extract(YEAR, "\\d{4}")) %>%
mutate(YEAR2 = as.Date(paste0(YEAR1,"-01-01"))) %>%
select(YEAR2,YEAR1,YEAR,everything())
# dollar format
dollars = function(x) {
paste0("$", format(x,big.mark= ",",scientific=F))
}
data3 %>%
ggplot(aes(x=YEAR2,y=Budget))+
geom_line()+
theme_bw()+
labs(x='Release Year',
y='Movie Budget',
title='From Box Office to Blockbuster',
subtitle='A Decade-by-Decade Look into Movie Budgets')+
theme(plot.title = element_text(hjust = .5),
plot.subtitle = element_text(hjust = .5),
axis.text.y=element_text(vjust=0,angle = 45))+
scale_y_continuous(labels = dollars)

Inferential statistics (level 4)
### Do movies with bigger budgets get better ratings?
qqplot(data3$Budget,data3$Rating)

#No, there's no correlation between budget and rating.
cor.test(data3$Budget,data3$Rating,method = 'kendall')
##
## Kendall's rank correlation tau
##
## data: data3$Budget and data3$Rating
## z = 0.68657, p-value = 0.4924
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.01723055
data3 %>%
ggplot(aes(x=Budget,y=Rating,col=Budget))+
geom_point(size=2)+
theme_bw()+
scale_x_continuous(labels = dollars)

### Do movies in the Horror genre get better ratings than Mystery movies?
#filter isn't showing anything. Odd.
#Check WS
horror= data3 %>%
rowwise() %>%
filter(any(c_across(starts_with("Genres_")) == " Horror"))
mystery = data3 %>%
rowwise() %>%
filter(any(c_across(starts_with("Genres_")) == " Mystery"))
shapiro.test(mystery$Rating)
##
## Shapiro-Wilk normality test
##
## data: mystery$Rating
## W = 0.99119, p-value = 0.7081
shapiro.test(horror$Rating)
##
## Shapiro-Wilk normality test
##
## data: horror$Rating
## W = 0.99551, p-value = 0.03097
hist(mystery$Rating)

hist(horror$Rating)

t.test(mystery$Rating,horror$Rating)
##
## Welch Two Sample t-test
##
## data: mystery$Rating and horror$Rating
## t = 3.8553, df = 153.9, p-value = 0.0001694
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2547602 0.7902126
## sample estimates:
## mean of x mean of y
## 5.477982 4.955495
#Yes, Horror does get better ratings than Mystery.
LS0tDQp0aXRsZTogIlBzaSBDaGkgUiAtIE9jdG9iZXIgMjAyNCINCmF1dGhvcjogImJ5IEFsYW4gTGFtIg0KI2RhdGU6ICJgciBTeXMuRGF0ZSgpYCINCiNkYXRlOiAiRGF0ZTogYHIgZm9ybWF0KFN5cy5EYXRlKCksICclZCAlQiAlWScpYCIgDQpvdXRwdXQ6DQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdGhlbWU6IHJlYWRhYmxlDQogICAgYWx3YXlzX2FsbG93X2h0bWw6IHllcw0KICAgIGRmX3ByaW50OiBwYWdlZA0KICAgIHRvYzogeWVzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgICBudW1iZXJfc2VjdGlvbnM6IG5vDQogICAgYW5jaG9yX3NlY3Rpb25zOiBUUlVFDQogICAgY29kZV9mb2xkaW5nOiBoaWRlDQogICAgY29kZV9kb3dubG9hZDogdHJ1ZQ0KLS0tDQpTdGFydCBEYXRlOiBPY3QuIDcsIDIwMjQNCg0KUmVwb3J0IERhdGU6IGByIGZvcm1hdChTeXMuRGF0ZSgpLCAnJWQgJUIgJVknKWANCg0KWyoqU291cmNlKio6IFBzaSBDaGkgUl0oaHR0cHM6Ly9vc2YuaW8vYWN1aGUvd2lraS9ob21lLykNCg0KDQpgYGB7ciBzZXR1cCwgd2FybmluZz1GLG1lc3NhZ2U9Rn0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVCxtZXNzYWdlID0gRix3YXJuaW5nID0gRikNCmxpYnJhcnkodGlkeXZlcnNlKQ0KDQpkYXRhPXJlYWQuY3N2KCdkYXRhLmNzdicpDQojd3JpdGUuY3N2KGRhdGEsJ2RhdGEuY3N2JykNCmBgYA0KDQoNCmBgYHtyfQ0KIyMgRURBDQpTbWFydEVEQTo6RXhwRGF0YShkYXRhKQ0KDQpza2ltcjo6c2tpbShkYXRhKQ0KDQoNCmBgYA0KDQoNCiMjIERhdGEgcHJvY2Vzc2luZyAobGV2ZWwgMSkNCg0KDQoNCmBgYHtyfQ0KIyMjIFdyaXRlIGEgc2NyaXB0IHRoYXQgd2lsbCBmaWx0ZXIgb3V0IG1vdmllcyB0aGF0IGFyZSBtaXNzaW5nIGEgdmFsdWUgaW4gdGhlIOKAmEJ1ZGdldCzigJkg4oCYUnVuVGltZSzigJkg4oCYUGFyZW50YWxSYXRpbmfigJkgYW5kIOKAmFJhdGluZ+KAmSBjb2x1bW5zLg0KDQpkYXRhMSA9IGRhdGEgJT4lIA0KICBmaWx0ZXIoY29tcGxldGUuY2FzZXMoQnVkZ2V0KSwNCiAgICAgICAgIGNvbXBsZXRlLmNhc2VzKFJ1blRpbWUpLA0KICAgICAgICAgY29tcGxldGUuY2FzZXMoUGFyZW50YWxSYXRpbmcpLA0KICAgICAgICAgY29tcGxldGUuY2FzZXMoUmF0aW5nKSkgJT4lIA0KICBtdXRhdGUoREFURT0gbWR5KFJlbGVhc2VfRGF0ZSkpICU+JSANCiAgc2VsZWN0KERBVEUsUmVsZWFzZV9EYXRlLGV2ZXJ5dGhpbmcoKSkNCmBgYA0KDQoNCg0KYGBge3J9DQojIyMgUmlnaHQgbm93LCB0aGUgZ2VucmUgY29sdW1ucyBjYW5ub3QgYmUgcHJvcGVybHkgYW5hbHl6ZWQgZHVlIHRvIGZvcm1hdHRpbmcgKGUuZy4sIEFjdGlvbnwgRmFudGFzeXwgSG9ycm9yfCBNeXN0ZXJ5fCBSb21hbmNlKS4gV3JpdGUgYSBzY3JpcHQgdGhhdCB3aWxsIHNlcGFyYXRlIHRoZSBjb2x1bW4gaW50byBtdWx0aXBsZSBHZW5yZSBjYXRlZ29yaWVzLg0KDQpkYXRhMiA9IGRhdGExICU+JSANCiAgc2VwYXJhdGVfd2lkZXJfZGVsaW0oR2VucmVzLCBkZWxpbSA9ICJ8IiwNCiAgICAgICAgICAgICAgICAgICAgICAgbmFtZXNfc2VwID0gIl8iLA0KICAgICAgICAgICAgICAgICAgICAgICB0b29fZmV3ID0gImFsaWduX3N0YXJ0IikNCg0KDQojIHdlaXJkIGhvdyBHZW5yZXMxIGJlY2FtZSBzZW5zaWJsZSB3b3Jkcw0KIyBkYXRhMyA9IGRhdGEyICU+JSANCiMgICBtdXRhdGUoR2VucmVzMiA9IHN0cnNwbGl0KGFzLmNoYXJhY3RlcihHZW5yZXMxKSwiXFwsIikpICU+JSANCiMgICB1bm5lc3QoKSAlPiUgDQojICAgc2VsZWN0KEdlbnJlczIsR2VucmVzMSxHZW5yZXMsZXZlcnl0aGluZygpKQ0KYGBgDQoNCg0KIyMgRGVzY3JpcHRpdmUgU3RhdGlzdGljcyAobGV2ZWwgMikNCg0KDQpgYGB7cn0NCiMjIyBXcml0ZSBhIHNjcmlwdCB0aGF0IHdpbGwgcHJvdmlkZSB0aGUgYXZlcmFnZSwgc3RhbmRhcmQgZGV2aWF0aW9uLCBtZWRpYW4sIGFuZCByYW5nZSBvZiB0aGUg4oCYUmF0aW5n4oCZIHZhcmlhYmxlLiBOb3RlIHRoZSB2YWx1ZXMgeW91IGdvdCBmcm9tIHJ1bm5pbmcgdGhlIGNvZGUNCg0Kc3VtbWFyeShkYXRhMiRSYXRpbmcsbmEucm09VCkNCg0Kc2QoZGF0YTIkUmF0aW5nLG5hLnJtID0gVCkNCg0KIyBBdmVyYWdlID0gNS4wLCBTdGFuZC4gRGV2LiA9IDEuNSwNCiMgTWVkaWFuID0gNC45LCBSYW5nZSA9IDEgdG8gOS42DQpgYGANCg0KDQoNCmBgYHtyfQ0KIyMjIEZJTFRFUiBCWSBNQUlOIEdFTlJFDQojIyMgV3JpdGUgYSBzY3JpcHQgdGhhdCB3aWxsIHNob3cgdGhlIGF2ZXJhZ2UgcmF0aW5ncyBieSBhIG1vdmll4oCZcyBtYWluIGdlbnJlLiANCg0KZGF0YTIgJT4lIA0KICBncm91cF9ieShUaXRsZSxHZW5yZXNfMSkgJT4lIA0KICBzdW1tYXJpc2UoUmF0aW5nID0gbWVhbihSYXRpbmcsbmEucm0gPSBUKSkgJT4lIA0KICBhcnJhbmdlKGRlc2MoUmF0aW5nKSkNCmBgYA0KDQojIyBEYXRhIHZpc3VhbGl6YXRpb24gKGxldmVsIDMpDQoNCg0KDQpgYGB7cn0NCiMjIyBDcmVhdGUgYSBncmFwaCB0aGF0IHNob3dzIHRoZSDigJhCdWRnZXTigJkgZm9yIHRoZSBtb3ZpZXMgaW4gdGhlIGRhdGFzZXQgb3ZlciB0aW1lLg0KDQpkYXRhMiAlPiUgDQogIGRpc3RpbmN0KFRpdGxlLERBVEUsLmtlZXBfYWxsID0gVCkgJT4lIA0KICBnZ3Bsb3QoYWVzKHg9REFURSx5PUJ1ZGdldCkpKw0KICBnZW9tX2xpbmUoKSsNCiAgdGhlbWVfYncoKQ0KDQojIFNvbWV0aGluZyB3ZWlyZCBpcyBoYXBwZW5pbmcgd2l0aCB0aGUgUmVsZWFzZV9EYXRlIGNvbHVtbiBzaG93aW5nIDE5MDUgZm9yIG1vdmllcyBsaWtlIEhhYml0ICgyMDE3KSBvciBWb2lkZmluZGVyICgyMDE3KQ0KYGBgDQoNCmBgYHtyfQ0KIyBleHRyYWN0IHllYXIgb3V0IG9mIGBUaXRsZWAgaW5zdGVhZA0KDQpkYXRhMyA9IGRhdGEyICU+JSANCiAgbXV0YXRlKFlFQVIgPSBzdHJfZXh0cmFjdChUaXRsZSwgIlxcKFxcZHs0fVxcKSIpKSAlPiUgDQogIG11dGF0ZShZRUFSMSA9IHN0cl9leHRyYWN0KFlFQVIsICJcXGR7NH0iKSkgJT4lIA0KICBtdXRhdGUoWUVBUjIgPSBhcy5EYXRlKHBhc3RlMChZRUFSMSwiLTAxLTAxIikpKSAlPiUgDQogIHNlbGVjdChZRUFSMixZRUFSMSxZRUFSLGV2ZXJ5dGhpbmcoKSkNCg0KIyBkb2xsYXIgZm9ybWF0DQpkb2xsYXJzID0gZnVuY3Rpb24oeCkgew0KICBwYXN0ZTAoIiQiLCBmb3JtYXQoeCxiaWcubWFyaz0gIiwiLHNjaWVudGlmaWM9RikpDQp9DQoNCmRhdGEzICU+JSANCiAgZ2dwbG90KGFlcyh4PVlFQVIyLHk9QnVkZ2V0KSkrDQogIGdlb21fbGluZSgpKw0KICB0aGVtZV9idygpKw0KICBsYWJzKHg9J1JlbGVhc2UgWWVhcicsDQogICAgICAgeT0nTW92aWUgQnVkZ2V0JywNCiAgICAgICB0aXRsZT0nRnJvbSBCb3ggT2ZmaWNlIHRvIEJsb2NrYnVzdGVyJywNCiAgICAgICBzdWJ0aXRsZT0nQSBEZWNhZGUtYnktRGVjYWRlIExvb2sgaW50byBNb3ZpZSBCdWRnZXRzJykrDQogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAuNSksDQogICAgICAgIHBsb3Quc3VidGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAuNSksDQogICAgICAgIGF4aXMudGV4dC55PWVsZW1lbnRfdGV4dCh2anVzdD0wLGFuZ2xlID0gNDUpKSsNCiAgc2NhbGVfeV9jb250aW51b3VzKGxhYmVscyA9IGRvbGxhcnMpDQpgYGANCg0KDQojIyBJbmZlcmVudGlhbCBzdGF0aXN0aWNzIChsZXZlbCA0KQ0KDQpgYGB7cn0NCiMjIyBEbyBtb3ZpZXMgd2l0aCBiaWdnZXIgYnVkZ2V0cyBnZXQgYmV0dGVyIHJhdGluZ3M/DQpxcXBsb3QoZGF0YTMkQnVkZ2V0LGRhdGEzJFJhdGluZykNCmBgYA0KDQpgYGB7cn0NCiNObywgdGhlcmUncyBubyBjb3JyZWxhdGlvbiBiZXR3ZWVuIGJ1ZGdldCBhbmQgcmF0aW5nLg0KY29yLnRlc3QoZGF0YTMkQnVkZ2V0LGRhdGEzJFJhdGluZyxtZXRob2QgPSAna2VuZGFsbCcpDQpgYGANCg0KYGBge3J9DQpkYXRhMyAlPiUgDQogIGdncGxvdChhZXMoeD1CdWRnZXQseT1SYXRpbmcsY29sPUJ1ZGdldCkpKw0KICBnZW9tX3BvaW50KHNpemU9MikrDQogIHRoZW1lX2J3KCkrDQogIHNjYWxlX3hfY29udGludW91cyhsYWJlbHMgPSBkb2xsYXJzKQ0KYGBgDQoNCg0KDQpgYGB7cn0NCg0KIyMjIERvIG1vdmllcyBpbiB0aGUgSG9ycm9yIGdlbnJlIGdldCBiZXR0ZXIgcmF0aW5ncyB0aGFuIE15c3RlcnkgbW92aWVzPyANCg0KI2ZpbHRlciBpc24ndCBzaG93aW5nIGFueXRoaW5nLiBPZGQuDQojQ2hlY2sgV1MNCg0KaG9ycm9yPSBkYXRhMyAlPiUNCiAgcm93d2lzZSgpICU+JQ0KICBmaWx0ZXIoYW55KGNfYWNyb3NzKHN0YXJ0c193aXRoKCJHZW5yZXNfIikpID09ICIgSG9ycm9yIikpDQoNCm15c3RlcnkgPSBkYXRhMyAlPiUNCiAgcm93d2lzZSgpICU+JQ0KICBmaWx0ZXIoYW55KGNfYWNyb3NzKHN0YXJ0c193aXRoKCJHZW5yZXNfIikpID09ICIgTXlzdGVyeSIpKQ0KDQpgYGANCg0KYGBge3J9DQpzaGFwaXJvLnRlc3QobXlzdGVyeSRSYXRpbmcpDQpzaGFwaXJvLnRlc3QoaG9ycm9yJFJhdGluZykNCg0KaGlzdChteXN0ZXJ5JFJhdGluZykNCmhpc3QoaG9ycm9yJFJhdGluZykNCg0KdC50ZXN0KG15c3RlcnkkUmF0aW5nLGhvcnJvciRSYXRpbmcpDQojWWVzLCBIb3Jyb3IgZG9lcyBnZXQgYmV0dGVyIHJhdGluZ3MgdGhhbiBNeXN0ZXJ5Lg0KYGBgDQoNCg0KYGBge3IgaW5jbHVkZT1GQUxTRX0NCiNiZWVwIHdoZW4gZG9uZQ0KaWYgKHJlcXVpcmUoImJlZXByIixxdWlldGx5ID0gVCkpDQogIGJlZXByOjpiZWVwKDIpDQpgYGANCg==