# knitr setup: make echo = TRUE the default for every chunk so the code is
# printed together with its results in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

library(stringr)
# Data import ----
# Read the IMDb export. The original author's local path stays the default so
# existing runs behave exactly as before; set the IMDB_CSV environment variable
# to point at the file when running on any other machine.
movies <- read.csv(
  Sys.getenv("IMDB_CSV", unset = "C:/Users/Prasad/Downloads/imdb.csv")
)
# Compact overview of the column types and leading values.
str(movies)
## 'data.frame':    503 obs. of  12 variables:
##  $ names     : chr  "Creed III" "Avatar: The Way of Water" "The Super Mario Bros. Movie" "Mummies" ...
##  $ date_x    : chr  "03-02-2023" "12/15/2022 " "04-05-2023" "01-05-2023" ...
##  $ score     : int  73 78 76 70 61 66 80 83 59 58 ...
##  $ genre     : chr  "Drama,?\xffAction" "Science Fiction,?\xffAdventure,?\xffAction" "Animation,?\xffAdventure,?\xffFamily,?\xffFantasy,?\xffComedy" "Animation,?\xffComedy,?\xffFamily,?\xffAdventure,?\xffFantasy" ...
##  $ overview  : chr  "After dominating the boxing world, Adonis Creed has been thriving in both his career and family life. When a childhood friend a "Set more than a decade after the events of the first film, learn the story of the Sully family (Jake, Neytiri, and their kids), "While working underground to fix a water main, Brooklyn plumbers\x83??and brothers\x83??Mario and Luigi are transported down a  "Through a series of unfortunate events, three mummies end up in present-day London and embark on a wacky and hilarious journey  ...
##  $ crew      : chr  "Michael B. Jordan, Adonis Creed, Tessa Thompson, Bianca Taylor, Jonathan Majors, Damien Anderson, Wood Harris, Tony 'Little Duk "Sam Worthington, Jake Sully, Zoe Salda?\xf1a, Neytiri, Sigourney Weaver, Kiri / Dr. Grace Augustine, Stephen Lang, Colonel Mile "Chris Pratt, Mario (voice), Anya Taylor-Joy, Princess Peach (voice), Charlie Day, Luigi (voice), Jack Black, Bowser (voice), Ke "??scar Barber?\xadn, Thut (voice), Ana Esther Alborg, Nefer (voice), Luis P??rez Reina, Carnaby (voice), Mar??a Luisa Sol?\xad, ...
##  $ orig_title: chr  "Creed III" "Avatar: The Way of Water" "The Super Mario Bros. Movie" " Momias" ...
##  $ status    : chr  " Released" " Released" " Released" " Released" ...
##  $ orig_lang : chr  " English" " English" " English" " Spanish, Castilian" ...
##  $ budget_x  : num  7.50e+07 4.60e+08 1.00e+08 1.23e+07 7.70e+07 ...
##  $ revenue   : num  2.72e+08 2.32e+09 7.24e+08 3.42e+07 3.41e+08 ...
##  $ country   : chr  "AU" "AU" "AU" "AU" ...
# Per-column summary of the data frame; the recorded output shows 64 NA values
# in each of score, budget_x and revenue.
summary(movies)
##     names              date_x              score          genre          
##  Length:503         Length:503         Min.   :  0.0   Length:503        
##  Class :character   Class :character   1st Qu.: 64.0   Class :character  
##  Mode  :character   Mode  :character   Median : 70.0   Mode  :character  
##                                        Mean   : 68.3                     
##                                        3rd Qu.: 76.0                     
##                                        Max.   :100.0                     
##                                        NA's   :64                        
##    overview             crew            orig_title           status         
##  Length:503         Length:503         Length:503         Length:503        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   orig_lang            budget_x            revenue            country         
##  Length:503         Min.   :      105   Min.   :0.000e+00   Length:503        
##  Class :character   1st Qu.: 35511113   1st Qu.:9.562e+07   Class :character  
##  Mode  :character   Median : 97200000   Median :3.441e+08   Mode  :character  
##                     Mean   : 96848059   Mean   :3.998e+08                     
##                     3rd Qu.:136700000   3rd Qu.:5.671e+08                     
##                     Max.   :460000000   Max.   :2.924e+09                     
##                     NA's   :64          NA's   :64
# Genre clean-up ----
# `genre` arrives as a comma-separated list whose separators carry mis-encoded
# bytes (e.g. "Drama,?\xffAction"). Deleting the genre names one by one, as
# this section used to do, left only that separator residue behind, so the
# levels handed to the models were runs of "?\xff" rather than genres.
# Strip the residue instead and keep the first listed genre of each title, so
# `genre` becomes a usable single-valued categorical variable.
# NOTE(review): treating the first listed genre as the primary one is an
# analysis choice -- confirm it matches the intended grouping.
# NOTE: the printed results recorded further down in this file were produced
# before this fix and will differ once the script is re-run.
movies$genre <- as.character(movies$genre)
# Keep only letters, commas and spaces; this drops the "?" and stray bytes.
movies$genre <- str_remove_all(movies$genre, "[^A-Za-z, ]")
# Cut everything from the first comma onwards, then trim the whitespace.
movies$genre <- str_trim(str_remove(movies$genre, ",.*$"))
# Titles without any genre become NA instead of an empty-string level
# (nzchar() is TRUE for NA, so existing NA values are left alone).
movies$genre[!nzchar(movies$genre)] <- NA_character_

# Frequency of each primary genre.
table(movies$genre)
## 
##                              ?�            ?�?�          ?�?�?�        ?�?�?�?� 
##             116             105             180              68              24 
##      ?�?�?�?�?�  ?�?�?�TV Movie    ?�?�TV Movie ?�TV Movie,?�?� 
##               6               1               2               1
# One-way ANOVA ----
# Fit revenue against the genre grouping and print the ANOVA table.
anova_model <- aov(
  formula = revenue ~ genre,
  data = movies
)
summary(object = anova_model)
##              Df    Sum Sq   Mean Sq F value Pr(>F)
## genre         8 7.496e+17 9.370e+16   0.604  0.775
## Residuals   430 6.672e+19 1.552e+17               
## 64 observations deleted due to missingness
# Simple linear regression ----
# Model revenue as a linear function of budget_x and print the fit summary.
regression_model <- lm(
  formula = revenue ~ budget_x,
  data = movies
)
summary(object = regression_model)
## 
## Call:
## lm(formula = revenue ~ budget_x, data = movies)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.264e+09 -1.389e+08 -3.777e+07  1.224e+08  2.026e+09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.562e+07  2.413e+07   2.305   0.0216 *  
## budget_x    3.554e+00  2.006e-01  17.712   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 299800000 on 437 degrees of freedom
##   (64 observations deleted due to missingness)
## Multiple R-squared:  0.4179, Adjusted R-squared:  0.4166 
## F-statistic: 313.7 on 1 and 437 DF,  p-value: < 2.2e-16
# Scatter plot of revenue against budget, with the fitted line from
# regression_model drawn on top in red.
plot(
  x = movies$budget_x,
  y = movies$revenue,
  xlab = "Budget",
  ylab = "Revenue"
)
abline(reg = regression_model, col = "red")

# Multiple regression ----
# Model revenue on budget_x plus genre and print the fit summary.
regression_model2 <- lm(
  formula = revenue ~ budget_x + genre,
  data = movies
)
summary(object = regression_model2)
## 
## Call:
## lm(formula = revenue ~ budget_x + genre, data = movies)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.253e+09 -1.415e+08 -3.090e+07  1.209e+08  2.041e+09 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.598e+07  4.464e+07   1.254    0.210    
## budget_x              3.577e+00  2.049e-01  17.455   <2e-16 ***
## genre?�               2.249e+07  5.116e+07   0.440    0.660    
## genre?�?�            -1.917e+07  4.782e+07  -0.401    0.689    
## genre?�?�?�          -2.151e+07  5.594e+07  -0.384    0.701    
## genre?�?�?�?�         3.751e+07  7.474e+07   0.502    0.616    
## genre?�?�?�?�?�       8.430e+07  1.301e+08   0.648    0.517    
## genre?�?�?�TV Movie   6.893e+07  3.048e+08   0.226    0.821    
## genre?�?�TV Movie    -9.544e+07  2.173e+08  -0.439    0.661    
## genre?�TV Movie,?�?�  1.258e+08  3.047e+08   0.413    0.680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 301600000 on 429 degrees of freedom
##   (64 observations deleted due to missingness)
## Multiple R-squared:  0.4218, Adjusted R-squared:  0.4096 
## F-statistic: 34.77 on 9 and 429 DF,  p-value: < 2.2e-16
# Clean up only the objects created in this script. A blanket
# `rm(list = ls())` would also wipe anything else living in the workspace the
# script is run from. intersect() with ls() skips names that do not exist, so
# no "object not found" warning is raised on a partial run.
rm(list = intersect(
  c("movies", "anova_model", "regression_model", "regression_model2"),
  ls()
))