Source: Psi Chi R Contest

Code will be revealed after the submission deadline.

#Load packages and import data

library(ggplot2)
library(dplyr)
library(lubridate)
library(readr)

# Import the raw contest data set from the working directory
october <- read.csv(file = "october_psichiR_movies.csv")

Data processing (level 1)

  1. Write a script that will filter out movies that are missing a value in the ‘Budget’ column.
# Keep only the movies whose Budget is recorded (rows with NA Budget are dropped)
octBudget <- october[!is.na(october$Budget), , drop = FALSE]
  1. Write a script that will filter out movies that are missing a value in the ‘RunTime’ column.
# From the budget-complete rows, keep only those whose RunTime is recorded
octRunTimeNA <- octBudget[!is.na(octBudget$RunTime), , drop = FALSE]

Descriptive Statistics (level 2)

  1. Write a script that will provide the average, standard deviation, median, and range of the ‘RunTime’ variable. Note the values you got.
# Descriptive statistics for RunTime.
# summary() supplies min, quartiles, median and mean; sd() and range() are
# computed on the raw column, because sd() of the summary object would only
# describe the spread of the six summary numbers, not of the data.
oct_descriptive <- summary(octRunTimeNA$RunTime)

print(oct_descriptive)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    45.0    82.0    90.0    90.3    96.0   170.0
sd(octRunTimeNA$RunTime, na.rm = TRUE)
# NOTE(review): the previously recorded value (40.84176) was the SD of the six
# summary values above, not of RunTime itself; re-run this call to record the
# true standard deviation.
range(octRunTimeNA$RunTime, na.rm = TRUE)
## [1]  45 170
# Mean = 90.3, Median = 90, Range = 45 to 170 (STD: re-run the sd() call above)
  1. Write a script that will provide the average, standard deviation, median, and range of the ‘Budget’ variable. Note the values you got.
# Descriptive statistics for Budget.
# summary() supplies min, quartiles, median and mean; sd() and range() are
# computed on the raw column, because sd() of the summary object would only
# describe the spread of the six summary numbers, not of the data.
oct_budget_description <- summary(octRunTimeNA$Budget)

print(oct_budget_description)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      100    40000   500000  3828124  3000000 70000000
sd(octRunTimeNA$Budget, na.rm = TRUE)
# NOTE(review): the previously recorded value (28022369) was the SD of the six
# summary values above, not of Budget itself; re-run this call to record the
# true standard deviation.
range(octRunTimeNA$Budget, na.rm = TRUE)
## [1]      100 70000000
# Mean = 3828124, Median = 500000, Range = 100 to 70000000
# (STD: re-run the sd() call above)
  1. Write a script that will provide the average, standard deviation, median, and range of the ‘Review’ variable. Note the values you got from running the code.
# Descriptive statistics for Review.
# Review still contains missing values (20 NA's in the summary below), so
# sd() and range() are given the raw column with na.rm = TRUE. Passing the
# summary object instead would mix the NA count in with the statistics.
oct_review_description <- summary(octRunTimeNA$Review)

print(oct_review_description)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   3.900   4.900   4.955   6.000   9.600      20
sd(octRunTimeNA$Review, na.rm = TRUE)
# NOTE(review): the previously recorded value (6.200773) was the SD of the
# seven summary entries above (including the NA count), not of the ratings;
# re-run this call to record the true standard deviation.
range(octRunTimeNA$Review, na.rm = TRUE)
# NOTE(review): the previously recorded "1 20" took the NA count (20) as the
# maximum; per the summary above the observed ratings run from 1 to 9.6.
# Mean = 4.955, Median = 4.9, Range = 1 to 9.6 (STD: re-run the sd() call above)

Data visualization (level 3)

  1. Create a graph that shows the ‘Budget’ for the movies in the dataset over time.
# Parse the release dates (month-day-year text) into Date values
octRunTimeNA[["Date"]] <- mdy(octRunTimeNA[["Release_Date"]])

# Put the raw and parsed date columns first so they are easy to compare
octRunTimeNA <- select(octRunTimeNA, Release_Date, Date, everything())

range(octRunTimeNA$Date) # Shows 1905 to 2017
## [1] "1905-07-04" "2017-10-31"

# Budget against release date: a connected line with a point per movie,
# dollar-formatted y axis, and the y axis stretched up to 80 million.
ggplot(data = octRunTimeNA, mapping = aes(x = Date, y = Budget)) +
  geom_line(colour = "darkgreen") +
  geom_point() +
  expand_limits(y = 80000000) +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(
    title = "Movie Budget Over the Years",
    subtitle = "From 1905 to 2017",
    x = "Years",
    y = " "
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )

Inferential statistics (level 4)

  1. Test if there are any associations between the following variables: ‘Review’, ‘Budget’, and ‘RunTime’. Note the key statistics you got from running the code.
# Normality checks (Shapiro-Wilk) for the three variables, run before
# choosing a correlation method.
shapiro.test(x = octRunTimeNA$Review)
## 
##  Shapiro-Wilk normality test
## 
## data:  octRunTimeNA$Review
## W = 0.99551, p-value = 0.03097
shapiro.test(x = octRunTimeNA$Budget)
## 
##  Shapiro-Wilk normality test
## 
## data:  octRunTimeNA$Budget
## W = 0.42656, p-value < 2.2e-16
shapiro.test(x = octRunTimeNA$RunTime)
## 
##  Shapiro-Wilk normality test
## 
## data:  octRunTimeNA$RunTime
## W = 0.93213, p-value < 2.2e-16
# Pairwise rank correlations (Spearman) between Review, Budget and RunTime.

# Review vs Budget
cor.test(
  x = octRunTimeNA$Review,
  y = octRunTimeNA$Budget,
  method = "spearman"
)
## 
##  Spearman's rank correlation rho
## 
## data:  octRunTimeNA$Review and octRunTimeNA$Budget
## S = 65296279, p-value = 0.5632
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.02132607
# Result: p-value = 0.5632, rho = 0.02132607

# Review vs RunTime
cor.test(
  x = octRunTimeNA$Review,
  y = octRunTimeNA$RunTime,
  method = "spearman"
)
## 
##  Spearman's rank correlation rho
## 
## data:  octRunTimeNA$Review and octRunTimeNA$RunTime
## S = 53524003, p-value = 0.00000006181
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1977713
# Result: p-value = 0.00000006181, rho = 0.1977713

# Budget vs RunTime
cor.test(
  x = octRunTimeNA$Budget,
  y = octRunTimeNA$RunTime,
  method = "spearman"
)
## 
##  Spearman's rank correlation rho
## 
## data:  octRunTimeNA$Budget and octRunTimeNA$RunTime
## S = 48588830, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3279512
# Result: p-value < 2.2e-16, rho = 0.3279512