library(tidyr)
library(ggplot2)
library(glm2)
Convert the “Status” column into a binary variable
# Load the TMDB movie table from the working directory.
tmdb_data <- read.csv("TMDB.csv")

# Encode release status as 1 (released) / 0 (any other status).
# The comparison is made on the whitespace-trimmed Status value, so the
# space-padded " Released" form and a plain "Released" are both recognised
# instead of depending on one exact padding. Rows whose Status is NA remain NA.
tmdb_data$Status_binary <- ifelse(
  trimws(tmdb_data$Status) == "Released",
  1,
  0
)
Build a logistic regression model
# Fit a binomial GLM with a logit link: release status modelled on
# Score, Budget and Revenue, using glm2's fitting routine.
# NOTE(review): the output recorded with this script reports non-convergence
# and fitted probabilities of 0 or 1 for this fit -- possibly separation in
# the predictors; confirm before interpreting the coefficients.
logistic_model <- glm2(
  formula = Status_binary ~ Score + Budget + Revenue,
  family = binomial(link = "logit"),
  data = tmdb_data
)
## Warning: glm.fit2: algorithm did not converge. Try increasing the maximum
## iterations
## Warning: glm.fit2: fitted probabilities numerically 0 or 1 occurred
# Summarise the fitted model: coefficient table, null/residual deviance,
# AIC and the number of Fisher scoring iterations.
summary(logistic_model)
##
## Call:
## glm2(formula = Status_binary ~ Score + Budget + Revenue, family = binomial(link = "logit"),
## data = tmdb_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.409430 0.662907 2.126 0.0335 *
## Score 2.081761 65.752071 0.032 0.9747
## Budget -0.032472 0.005766 -5.632 1.78e-08 ***
## Revenue 0.038466 0.007288 5.278 1.31e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 599.30 on 10177 degrees of freedom
## Residual deviance: 163.09 on 10174 degrees of freedom
## AIC: 171.09
##
## Number of Fisher Scoring iterations: 25
Visualize the distributions
# Draw side-by-side histograms of Score, Budget and Revenue.
# Each column name maps to the fill colour used for its panel.
hist_colours <- c(Score = "skyblue", Budget = "salmon", Revenue = "lightgreen")

# par() returns the previous settings; keep them so the 1x3 layout does not
# leak into plots drawn after this section.
old_par <- par(mfrow = c(1, 3))
for (column in names(hist_colours)) {
  hist(
    tmdb_data[[column]],
    main = paste("Distribution of", column),
    xlab = column,
    col = hist_colours[[column]],
    border = "black"
  )
}
par(old_par)
