# Set knitr's default chunk option so that code is echoed in the report.
knitr::opts_chunk$set(echo = TRUE)
library(stringr)

# Load the IMDB movies table.
# The location can be overridden with the IMDB_CSV environment variable so the
# script is not tied to a single machine; when the variable is unset or empty
# the original path is used unchanged. local() keeps the helper variable out
# of the global workspace.
movies <- local({
  csv_path <- Sys.getenv("IMDB_CSV", unset = "")
  if (!nzchar(csv_path)) {
    csv_path <- "C:/Users/Prasad/Downloads/imdb.csv"
  }
  read.csv(csv_path)
})
# Show column types and the first values of each column as a sanity check.
str(movies)
## 'data.frame': 503 obs. of 12 variables:
## $ names : chr "Creed III" "Avatar: The Way of Water" "The Super Mario Bros. Movie" "Mummies" ...
## $ date_x : chr "03-02-2023" "12/15/2022 " "04-05-2023" "01-05-2023" ...
## $ score : int 73 78 76 70 61 66 80 83 59 58 ...
## $ genre : chr "Drama,?\xffAction" "Science Fiction,?\xffAdventure,?\xffAction" "Animation,?\xffAdventure,?\xffFamily,?\xffFantasy,?\xffComedy" "Animation,?\xffComedy,?\xffFamily,?\xffAdventure,?\xffFantasy" ...
## $ overview : chr "After dominating the boxing world, Adonis Creed has been thriving in both his career and family life. When a childhood friend a "Set more than a decade after the events of the first film, learn the story of the Sully family (Jake, Neytiri, and their kids), "While working underground to fix a water main, Brooklyn plumbers\x83??and brothers\x83??Mario and Luigi are transported down a "Through a series of unfortunate events, three mummies end up in present-day London and embark on a wacky and hilarious journey ...
## $ crew : chr "Michael B. Jordan, Adonis Creed, Tessa Thompson, Bianca Taylor, Jonathan Majors, Damien Anderson, Wood Harris, Tony 'Little Duk "Sam Worthington, Jake Sully, Zoe Salda?\xf1a, Neytiri, Sigourney Weaver, Kiri / Dr. Grace Augustine, Stephen Lang, Colonel Mile "Chris Pratt, Mario (voice), Anya Taylor-Joy, Princess Peach (voice), Charlie Day, Luigi (voice), Jack Black, Bowser (voice), Ke "??scar Barber?\xadn, Thut (voice), Ana Esther Alborg, Nefer (voice), Luis P??rez Reina, Carnaby (voice), Mar??a Luisa Sol?\xad, ...
## $ orig_title: chr "Creed III" "Avatar: The Way of Water" "The Super Mario Bros. Movie" " Momias" ...
## $ status : chr " Released" " Released" " Released" " Released" ...
## $ orig_lang : chr " English" " English" " English" " Spanish, Castilian" ...
## $ budget_x : num 7.50e+07 4.60e+08 1.00e+08 1.23e+07 7.70e+07 ...
## $ revenue : num 2.72e+08 2.32e+09 7.24e+08 3.42e+07 3.41e+08 ...
## $ country : chr "AU" "AU" "AU" "AU" ...
# Per-column summary of the loaded table: quartiles, mean and NA counts for
# numeric columns; length, class and mode for character columns.
summary(movies)
## names date_x score genre
## Length:503 Length:503 Min. : 0.0 Length:503
## Class :character Class :character 1st Qu.: 64.0 Class :character
## Mode :character Mode :character Median : 70.0 Mode :character
## Mean : 68.3
## 3rd Qu.: 76.0
## Max. :100.0
## NA's :64
## overview crew orig_title status
## Length:503 Length:503 Length:503 Length:503
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## orig_lang budget_x revenue country
## Length:503 Min. : 105 Min. :0.000e+00 Length:503
## Class :character 1st Qu.: 35511113 1st Qu.:9.562e+07 Class :character
## Mode :character Median : 97200000 Median :3.441e+08 Mode :character
## Mean : 96848059 Mean :3.998e+08
## 3rd Qu.:136700000 3rd Qu.:5.671e+08
## Max. :460000000 Max. :2.924e+09
## NA's :64 NA's :64
# Reduce the multi-valued genre string to one category per movie: the first
# genre listed.
#
# The raw values are comma-separated lists whose separators contain
# mis-encoded bytes (visible in the str() output of the raw data). Deleting
# the genre names one at a time, as this step previously did, removed every
# name and left labels made only of leftover separator bytes, so grouping on
# the result compared movies by how many genres they list rather than by
# genre.
#
# Everything before the first comma is the first genre. An empty value has no
# match and becomes NA, so such rows are dropped by models that use genre.
# NOTE: recorded outputs that depend on genre were produced before this
# change and need to be regenerated.
movies$genre <- str_trim(str_extract(as.character(movies$genre), "^[^,]+"))
# Tabulate the resulting categories to check the recoding.
table(movies$genre)
##
## ?� ?�?� ?�?�?� ?�?�?�?�
## 116 105 180 68 24
## ?�?�?�?�?� ?�?�?�TV Movie ?�?�TV Movie ?�TV Movie,?�?�
## 6 1 2 1
# One-way ANOVA: test whether mean revenue differs across the levels of genre.
anova_model <- aov(revenue ~ genre, data = movies)
# Print the ANOVA table (degrees of freedom, sums of squares, F test).
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## genre 8 7.496e+17 9.370e+16 0.604 0.775
## Residuals 430 6.672e+19 1.552e+17
## 64 observations deleted due to missingness
# Simple linear regression of revenue on budget.
regression_model <- lm(revenue ~ budget_x, data = movies)
# Print the residual summary, coefficient table and fit statistics.
summary(regression_model)
##
## Call:
## lm(formula = revenue ~ budget_x, data = movies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.264e+09 -1.389e+08 -3.777e+07 1.224e+08 2.026e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.562e+07 2.413e+07 2.305 0.0216 *
## budget_x 3.554e+00 2.006e-01 17.712 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 299800000 on 437 degrees of freedom
## (64 observations deleted due to missingness)
## Multiple R-squared: 0.4179, Adjusted R-squared: 0.4166
## F-statistic: 313.7 on 1 and 437 DF, p-value: < 2.2e-16
# Scatter plot of revenue (y axis) against budget (x axis), with the fitted
# line from regression_model drawn on top in red.
with(
  movies,
  plot(x = budget_x, y = revenue, xlab = "Budget", ylab = "Revenue")
)
abline(reg = regression_model, col = "red")
# Multiple regression: revenue on budget plus genre, where genre enters as a
# categorical term (one coefficient per non-reference level).
regression_model2 <- lm(revenue ~ budget_x + genre, data = movies)
# Print the residual summary, coefficient table and fit statistics.
summary(regression_model2)
##
## Call:
## lm(formula = revenue ~ budget_x + genre, data = movies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.253e+09 -1.415e+08 -3.090e+07 1.209e+08 2.041e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.598e+07 4.464e+07 1.254 0.210
## budget_x 3.577e+00 2.049e-01 17.455 <2e-16 ***
## genre?� 2.249e+07 5.116e+07 0.440 0.660
## genre?�?� -1.917e+07 4.782e+07 -0.401 0.689
## genre?�?�?� -2.151e+07 5.594e+07 -0.384 0.701
## genre?�?�?�?� 3.751e+07 7.474e+07 0.502 0.616
## genre?�?�?�?�?� 8.430e+07 1.301e+08 0.648 0.517
## genre?�?�?�TV Movie 6.893e+07 3.048e+08 0.226 0.821
## genre?�?�TV Movie -9.544e+07 2.173e+08 -0.439 0.661
## genre?�TV Movie,?�?� 1.258e+08 3.047e+08 0.413 0.680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 301600000 on 429 degrees of freedom
## (64 observations deleted due to missingness)
## Multiple R-squared: 0.4218, Adjusted R-squared: 0.4096
## F-statistic: 34.77 on 9 and 429 DF, p-value: < 2.2e-16
# Remove only the objects created by the steps above. Clearing the whole
# workspace with rm(list = ls()) would also delete unrelated objects that
# were already present in the session.
rm(movies, anova_model, regression_model, regression_model2)