# Echo each chunk's source code in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

library(stringr)

# Load the movie data set.
# The path defaults to the original author's download location but can be
# overridden with the IMDB_CSV environment variable, so the script also runs
# on other machines. `local()` keeps the helper variable out of the workspace.
movies <- local({
  csv_path <- Sys.getenv(
    "IMDB_CSV",
    unset = "C:/Users/DELL/Downloads/imdb.csv"
  )
  if (!file.exists(csv_path)) {
    stop("IMDB data file not found: ", csv_path, call. = FALSE)
  }
  read.csv(csv_path)
})

# NOTE(review): in the recorded structure output the column named `country`
# holds film titles and the auto-named trailing column `X` holds country
# codes, so the CSV header looks shifted by one -- confirm against the file.
# The text columns also show mis-encoded bytes; the file's encoding should be
# verified (possibly via read.csv's `fileEncoding` argument).
str(movies)
## 'data.frame':    377 obs. of  12 variables:
##  $ country   : chr  "Creed III" "Avatar: The Way of Water" "The Super Mario Bros. Movie" "Mummies" ...
##  $ date_x    : chr  "03-02-2023" "12/15/2022 " "04-05-2023" "01-05-2023" ...
##  $ score     : int  73 78 76 70 61 66 80 83 59 58 ...
##  $ genre     : chr  "Drama,?\xffAction" "Science Fiction,?\xffAdventure,?\xffAction" "Animation,?\xffAdventure,?\xffFamily,?\xffFantasy,?\xffComedy" "Animation,?\xffComedy,?\xffFamily,?\xffAdventure,?\xffFantasy" ...
##  $ overview  : chr  "After dominating the boxing world, Adonis Creed has been thriving in both his career and family life. When a childhood friend a "Set more than a decade after the events of the first film, learn the story of the Sully family (Jake, Neytiri, and their kids), "While working underground to fix a water main, Brooklyn plumbers\x83??and brothers\x83??Mario and Luigi are transported down a  "Through a series of unfortunate events, three mummies end up in present-day London and embark on a wacky and hilarious journey  ...
##  $ crew      : chr  "Michael B. Jordan, Adonis Creed, Tessa Thompson, Bianca Taylor, Jonathan Majors, Damien Anderson, Wood Harris, Tony 'Little Duk "Sam Worthington, Jake Sully, Zoe Salda?\xf1a, Neytiri, Sigourney Weaver, Kiri / Dr. Grace Augustine, Stephen Lang, Colonel Mile "Chris Pratt, Mario (voice), Anya Taylor-Joy, Princess Peach (voice), Charlie Day, Luigi (voice), Jack Black, Bowser (voice), Ke "??scar Barber?\xadn, Thut (voice), Ana Esther Alborg, Nefer (voice), Luis P??rez Reina, Carnaby (voice), Mar??a Luisa Sol?\xad, ...
##  $ orig_title: chr  "Creed III" "Avatar: The Way of Water" "The Super Mario Bros. Movie" " Momias" ...
##  $ status    : chr  " Released" " Released" " Released" " Released" ...
##  $ orig_lang : chr  " English" " English" " English" " Spanish, Castilian" ...
##  $ budget_x  : num  7.50e+07 4.60e+08 1.00e+08 1.23e+07 7.70e+07 ...
##  $ revenue   : num  2.72e+08 2.32e+09 7.24e+08 3.42e+07 3.41e+08 ...
##  $ X         : chr  "AU" "AU" "AU" "AU" ...
# Column-by-column overview: length/class/mode for character columns and
# quartiles plus mean for the numeric ones.
summary(object = movies)
##    country             date_x              score           genre          
##  Length:377         Length:377         Min.   :  0.00   Length:377        
##  Class :character   Class :character   1st Qu.: 63.00   Class :character  
##  Mode  :character   Mode  :character   Median : 70.00   Mode  :character  
##                                        Mean   : 68.26                     
##                                        3rd Qu.: 76.00                     
##                                        Max.   :100.00                     
##    overview             crew            orig_title           status         
##  Length:377         Length:377         Length:377         Length:377        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   orig_lang            budget_x            revenue               X            
##  Length:377         Min.   :      105   Min.   :0.000e+00   Length:377        
##  Class :character   1st Qu.: 38211149   1st Qu.:1.020e+08   Class :character  
##  Mode  :character   Median : 92600000   Median :3.526e+08   Mode  :character  
##                     Mean   : 96816280   Mean   :4.080e+08                     
##                     3rd Qu.:136400000   3rd Qu.:5.645e+08                     
##                     Max.   :460000000   Max.   :2.924e+09
# Reduce the multi-label `genre` field to one analysable category per film.
#
# Each value lists several genres separated by a comma followed by
# mis-encoded whitespace bytes (visible in the recorded str() output).
# Deleting every known genre name, as this section did before, left only that
# separator residue: the resulting levels merely reflected how many genres a
# film listed, and "TV Movie" was never removed. Keeping the text in front of
# the first comma yields the primary (first-listed) genre and never has to
# match the garbled separator bytes.
#
# NOTE: the genre table, ANOVA and second regression output recorded in this
# file were produced from the old residue levels; re-knit to refresh them.
movies$genre <- as.character(movies$genre)

# An empty genre field has no match and becomes NA, so such rows drop out of
# the genre-based models instead of forming a blank level.
movies$genre <- str_trim(str_extract(movies$genre, "^[^,]+"))

# Frequency of each primary genre; show missing values explicitly if present.
table(movies$genre, useNA = "ifany")
## 
##                              ?�            ?�?�          ?�?�?�        ?�?�?�?� 
##              47              91             154              59              19 
##      ?�?�?�?�?�    ?�?�TV Movie ?�TV Movie,?�?� 
##               5               1               1
# One-way ANOVA of revenue across the levels of `genre`.
anova_model <- aov(formula = revenue ~ genre, data = movies)
summary(object = anova_model)
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## genre         7 6.126e+17 8.751e+16   0.532   0.81
## Residuals   369 6.066e+19 1.644e+17
# Simple linear regression: revenue explained by budget alone.
regression_model <- lm(formula = revenue ~ budget_x, data = movies)
summary(object = regression_model)
## 
## Call:
## lm(formula = revenue ~ budget_x, data = movies)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.322e+09 -1.406e+08 -3.773e+07  1.132e+08  1.989e+09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.397e+07  2.577e+07   1.706   0.0888 .  
## budget_x    3.760e+00  2.135e-01  17.617   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.99e+08 on 375 degrees of freedom
## Multiple R-squared:  0.4528, Adjusted R-squared:  0.4514 
## F-statistic: 310.4 on 1 and 375 DF,  p-value: < 2.2e-16
# Scatter plot of revenue against budget, with the fitted line from
# `regression_model` drawn on top in red.
with(
  movies,
  plot(x = budget_x, y = revenue, xlab = "Budget", ylab = "Revenue")
)
abline(reg = regression_model, col = "red")

# Multiple regression: revenue on budget, adjusting for `genre`.
regression_model2 <- lm(formula = revenue ~ budget_x + genre, data = movies)
summary(object = regression_model2)
## 
## Call:
## lm(formula = revenue ~ budget_x + genre, data = movies)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.323e+09 -1.457e+08 -3.588e+07  1.192e+08  2.011e+09 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.354e+07  4.656e+07   0.935    0.350    
## budget_x              3.804e+00  2.184e-01  17.423   <2e-16 ***
## genre?�               2.771e+07  5.406e+07   0.512    0.609    
## genre?�?�            -1.361e+07  5.063e+07  -0.269    0.788    
## genre?�?�?�          -3.250e+07  5.931e+07  -0.548    0.584    
## genre?�?�?�?�        -1.066e+07  8.192e+07  -0.130    0.897    
## genre?�?�?�?�?�       8.621e+07  1.414e+08   0.610    0.543    
## genre?�?�TV Movie    -2.884e+08  3.042e+08  -0.948    0.344    
## genre?�TV Movie,?�?�  1.077e+08  3.040e+08   0.354    0.723    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 300500000 on 368 degrees of freedom
## Multiple R-squared:  0.4575, Adjusted R-squared:  0.4457 
## F-statistic: 38.79 on 8 and 368 DF,  p-value: < 2.2e-16
# Remove only the objects this script created. A blanket rm(list = ls())
# would also wipe unrelated objects from the caller's workspace when the
# script is sourced or rendered in an interactive session.
# intersect() skips names that do not exist, so rm() never warns.
rm(list = intersect(
  c("movies", "anova_model", "regression_model", "regression_model2"),
  ls()
))