TMDB Movie Analysis

library(tidyr)
library(ggplot2)
library(glm2)

Convert the “Status” column into a binary variable

# Load the TMDB dataset from the working directory.
tmdb_data <- read.csv("TMDB.csv")
# Encode Status as 1 for released titles and 0 otherwise. Surrounding
# whitespace is stripped before comparing, so " Released" and "Released"
# are both treated as released instead of depending on an exact leading space.
tmdb_data$Status_binary <- ifelse(trimws(tmdb_data$Status) == "Released", 1, 0)

Build a logistic regression model

# Fit a binomial GLM with a logit link, modelling the binary release status
# as a function of Score, Budget, and Revenue.
logistic_model <- glm2(
  formula = Status_binary ~ Score + Budget + Revenue,
  family = binomial(link = "logit"),
  data = tmdb_data
)

## Warning: glm.fit2: algorithm did not converge. Try increasing the maximum
## iterations

## Warning: glm.fit2: fitted probabilities numerically 0 or 1 occurred

# Print the coefficient table and deviance statistics of the fitted model.
# NOTE(review): the fit reported a non-convergence warning, so these
# estimates should be interpreted with caution.
summary(logistic_model)

## 
## Call:
## glm2(formula = Status_binary ~ Score + Budget + Revenue, family = binomial(link = "logit"), 
##     data = tmdb_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.409430   0.662907   2.126   0.0335 *  
## Score        2.081761  65.752071   0.032   0.9747    
## Budget      -0.032472   0.005766  -5.632 1.78e-08 ***
## Revenue      0.038466   0.007288   5.278 1.31e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 599.30  on 10177  degrees of freedom
## Residual deviance: 163.09  on 10174  degrees of freedom
## AIC: 171.09
## 
## Number of Fisher Scoring iterations: 25

Visualize the distributions

# Plot histograms of Score, Budget, and Revenue side by side.
# par() returns the previous settings; they are restored afterwards so the
# three-panel layout does not leak into plots drawn later.
old_par <- par(mfrow = c(1, 3))
hist(
  tmdb_data$Score,
  main = "Distribution of Score", xlab = "Score",
  col = "skyblue", border = "black"
)
hist(
  tmdb_data$Budget,
  main = "Distribution of Budget", xlab = "Budget",
  col = "salmon", border = "black"
)
hist(
  tmdb_data$Revenue,
  main = "Distribution of Revenue", xlab = "Revenue",
  col = "lightgreen", border = "black"
)
par(old_par)

Consider log transformations for “Budget” and “Revenue”

# Add log(1 + x) versions of Budget and Revenue as new columns.
tmdb_data[["Log_Budget"]] <- log1p(tmdb_data[["Budget"]])
tmdb_data[["Log_Revenue"]] <- log1p(tmdb_data[["Revenue"]])

# Scatter plots of the binary status against each log-transformed predictor.
plot(
  x = tmdb_data$Log_Budget,
  y = tmdb_data$Status_binary,
  main = "Log Budget vs Status",
  xlab = "Log Budget",
  ylab = "Status",
  col = "salmon",
  pch = 16
)

plot(
  x = tmdb_data$Log_Revenue,
  y = tmdb_data$Status_binary,
  main = "Log Revenue vs Status",
  xlab = "Log Revenue",
  ylab = "Status",
  col = "lightgreen",
  pch = 16
)