Read the data into movies.df and rename the column “length” to “minutes”, since length is also the name of an R function.
# Load the movies data set.
# The local copy is used when it exists. On any other machine the script
# falls back to the copy published on GitHub; read.csv() accepts a URL
# directly, so RCurl/httr are not needed. (An earlier RCurl::getURL() attempt
# failed with "SSL routines:SSL23_GET_SERVER_HELLO:tlsv1 alert protocol
# version".)
movies_path <- "C:\\Users\\arono\\CUNY\\R\\movies.csv"
movies_url <- "https://raw.githubusercontent.com/TheReallyBigApple/CunyAssignments/main/R/movies.csv"
movies_source <- if (file.exists(movies_path)) movies_path else movies_url
movies.df <- read.csv(movies_source)
# the "length" column will be confusing so rename it to "minutes"
names(movies.df)[names(movies.df) == "length"] <- "minutes"
Create summaries by year by taking the average votes, rating, minutes
# Yearly trends: mean votes, rating and running time for each release year.
yearly_trends.df <- aggregate(
  movies.df[c("votes", "rating", "minutes")],
  by = list(movies.df$year), # one group per release year
  FUN = mean
)
# Prefix the columns with "y_" to mark them as per-year values.
names(yearly_trends.df) <- c("y_year", "y_votes", "y_rating", "y_minutes")
Now show a plot of the average rating by year
# Average rating per year: a blue line on a cornsilk panel.
par(bg = "lightgray")
yt_plot <- cbind(
  yearly_trends.df[["y_year"]],
  yearly_trends.df[["y_rating"]]
)
# Empty plot first: sets up the axes without drawing data or annotations.
plot(yt_plot, type = "n", ann = FALSE)
# par("usr") holds the plot-region extents as c(x1, x2, y1, y2).
usr <- par("usr")
rect(
  xleft = usr[1], ybottom = usr[3], xright = usr[2], ytop = usr[4],
  col = "cornsilk", border = "black"
)
lines(yt_plot, col = "blue")
title(
  main = "Ratings By Year", xlab = "Year", ylab = "Avg Rating",
  font.main = 8, col.main = "red", col.lab = "red", cex.lab = 1.25
)
Now show a plot of the average votes by year
# Average number of votes per year: a blue line on a cornsilk panel.
par(bg = "lightgray")
yt_plot <- cbind(
  yearly_trends.df[["y_year"]],
  yearly_trends.df[["y_votes"]]
)
# Empty plot first: sets up the axes without drawing data or annotations.
plot(yt_plot, type = "n", ann = FALSE)
# par("usr") holds the plot-region extents as c(x1, x2, y1, y2).
usr <- par("usr")
rect(
  xleft = usr[1], ybottom = usr[3], xright = usr[2], ytop = usr[4],
  col = "cornsilk", border = "black"
)
lines(yt_plot, col = "blue")
title(
  main = "Votes By Year", xlab = "Year", ylab = "Avg Votes",
  font.main = 8, col.main = "red", col.lab = "red", cex.lab = 1.25
)
Let's take a closer look at our data, starting with comedies released prior to the year 1900
# Comedies released before 1900, reduced to the descriptive columns.
# which() drops rows where the condition is NA, as subset() does.
old_comedies <- movies.df[
  which(movies.df$year < 1900 & movies.df$Comedy == 1),
  c("year", "title", "rating", "votes", "minutes")
]
knitr::kable(old_comedies, caption = "Old Comedies")
| | year | title | rating | votes | minutes |
|---|---|---|---|---|---|
| 3392 | 1899 | Astor Tramp, The | 3.6 | 13 | 2 |
| 5849 | 1897 | Biter Bit, The | 4.7 | 25 | 1 |
| 10720 | 1898 | Come Along Do! | 4.6 | 27 | 1 |
| 33604 | 1898 | Miller and the Sweep, The | 5.5 | 28 | 1 |
| 45689 | 1897 | Seminary Girls | 5.7 | 97 | 1 |
| 54344 | 1896 | Une nuit terrible | 3.9 | 6 | 1 |
After reviewing the data, I get the sense there is some noise in it.
The year 2005 looks incomplete.
# Average votes for the years after 2000, to check how complete they are.
# which() drops rows where the condition is NA, as subset() does.
votes_per_year <- yearly_trends.df[
  which(yearly_trends.df$y_year > 2000),
  c("y_year", "y_votes")
]
knitr::kable(votes_per_year, caption = "Recent Votes Per Year")
| | y_year | y_votes |
|---|---|---|
| 109 | 2001 | 1067.1202 |
| 110 | 2002 | 1047.3270 |
| 111 | 2003 | 841.5811 |
| 112 | 2004 | 792.1450 |
| 113 | 2005 | 399.1003 |
And the early movies look like they are shorts.
# The earliest years are dominated by shorts: show the average running time
# for the years before 1900.
# NOTE: the result is stored in votes_per_year even though it holds minutes.
votes_per_year <- yearly_trends.df[
  which(yearly_trends.df$y_year < 1900),
  c("y_year", "y_minutes")
]
knitr::kable(votes_per_year, caption = "Minutes Per Year")
| y_year | y_minutes |
|---|---|
| 1893 | 1.000000 |
| 1894 | 1.000000 |
| 1895 | 1.000000 |
| 1896 | 1.307692 |
| 1897 | 1.000000 |
| 1898 | 1.000000 |
| 1899 | 2.444444 |
To answer the original question about the quality of genres over time, let's restrict our data set to popular, full-length movies from a time frame when the movie industry was mature
# Restrict to films released 1930-2004 with more than 100 votes and a running
# time above 50 minutes, keeping the columns used for the genre comparison.
relevant_movies.df <- movies.df[
  which(
    movies.df$year >= 1930 & movies.df$year <= 2004 &
      movies.df$votes > 100 & movies.df$minutes > 50
  ),
  c(
    "year", "title", "minutes", "budget", "rating", "votes",
    "Comedy", "Drama", "Romance"
  )
]

# One (year, rating) table per genre flag.
relevant_comedies.df <- relevant_movies.df[
  which(relevant_movies.df$Comedy == 1), c("year", "rating")
]
relevant_dramas.df <- relevant_movies.df[
  which(relevant_movies.df$Drama == 1), c("year", "rating")
]
relevant_romances.df <- relevant_movies.df[
  which(relevant_movies.df$Romance == 1), c("year", "rating")
]

# Mean rating per year for each genre: column 1 is the year, column 2 the mean.
yearly_rc.df <- with(
  relevant_comedies.df,
  aggregate(list(rating), by = list(year), FUN = mean)
)
yearly_rd.df <- with(
  relevant_dramas.df,
  aggregate(list(rating), by = list(year), FUN = mean)
)
yearly_rr.df <- with(
  relevant_romances.df,
  aggregate(list(rating), by = list(year), FUN = mean)
)
Now create a three-line graph
# Yearly mean rating of comedies, dramas and romances on a single chart.
par(bg = "lightgray")
plot_colors <- c("blue", "red", "green")
line_types <- 1:3
genre_trends <- list(yearly_rc.df, yearly_rd.df, yearly_rr.df)

# Size the axes from all three series. Each table holds the year in its first
# column and the mean rating in its second. Taking the limits from the comedy
# series alone would clip drama or romance values that fall outside its range.
year_range <- range(
  unlist(lapply(genre_trends, function(trend) trend[[1]]))
)
rating_range <- range(
  unlist(lapply(genre_trends, function(trend) trend[[2]]))
)

# Empty plot first: sets up the axes without drawing data or annotations.
plot(year_range, rating_range, type = "n", ann = FALSE)
# par("usr") holds the plot-region extents as c(x1, x2, y1, y2).
usr <- par("usr")
rect(usr[1], usr[3], usr[2], usr[4], col = "cornsilk", border = "black")

# Lines and legend share plot_colors / line_types so they cannot drift apart.
for (i in seq_along(genre_trends)) {
  lines(genre_trends[[i]], lty = line_types[i], col = plot_colors[i])
}
legend(
  "topright", c("Comedy", "Drama", "Romance"),
  cex = 0.8, col = plot_colors, lty = line_types, lwd = 2, bty = "n"
)
title(
  main = "Ratings By Year and Category", xlab = "Year", ylab = "Avg Rating",
  font.main = 8, col.main = "red", col.lab = "red", cex.lab = 1.25
)
Looking at this figure, I have 2 observations.
One is that comedies of the 1930s are very highly rated and comedies had the steepest decline in acclaim.
The other observation is that the further back you go, the higher the rating, which I suspect has something to do with critics' sentimentality towards older movies.