Load movie ratings csv into R data frame
# Read the survey results straight from GitHub. stringsAsFactors is set
# explicitly because its default became FALSE in R 4.0.0, while the summary
# further down tabulates Gender, Age and US.Region as factors.
movie_ratings <- read.csv(
  "https://raw.githubusercontent.com/ajbentley/cuny_ms_ds/master/607/movie_ratings.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Respondent names are labels rather than categories, so keep the
# Respondant.Name column as character instead of factor.
movie_ratings$Respondant.Name <- as.character(movie_ratings$Respondant.Name)
head(movie_ratings)
## id Respondant.Name Isle.of.Dogs The.First.Purge Christopher.Robin
## 1 1 DMH 5 1 1
## 2 2 Mea 3 1 3
## 3 3 Ryder 1 1 4
## 4 4 Kenneth 1 3 1
## 5 5 Alice 2 2 2
## 6 6 Rose 4 1 1
## Skyscraper Blockers A.Quiet.Place Gender Age US.Region
## 1 1 1 1 Female 40-49 Middle Atlantic
## 2 1 1 1 Female 30-39 West North Central
## 3 1 1 1 Female 40-49 Pacific
## 4 2 3 1 Male 40-49 East North Central
## 5 2 2 2 Female 40-49 Middle Atlantic
## 6 1 1 3 Female 60 or older East North Central
Check df dimensions and summary
# Report the number of rows and columns of the loaded data frame.
dim(movie_ratings)
## [1] 14 11
# Per-column overview: quartiles and mean for numeric columns, level counts
# for factor columns, length/class for character columns.
summary(movie_ratings)
## id Respondant.Name Isle.of.Dogs The.First.Purge
## Min. : 1.00 Length:14 Min. :1.000 Min. :1.000
## 1st Qu.: 4.25 Class :character 1st Qu.:2.000 1st Qu.:1.000
## Median : 7.50 Mode :character Median :3.000 Median :2.500
## Mean : 7.50 Mean :2.857 Mean :2.214
## 3rd Qu.:10.75 3rd Qu.:3.750 3rd Qu.:3.000
## Max. :14.00 Max. :5.000 Max. :4.000
## Christopher.Robin Skyscraper Blockers A.Quiet.Place
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.250 1st Qu.:1.250
## Median :3.000 Median :2.000 Median :2.500 Median :3.000
## Mean :2.643 Mean :2.286 Mean :2.286 Mean :2.714
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.750
## Max. :4.000 Max. :5.000 Max. :4.000 Max. :5.000
## Gender Age US.Region
## Female:11 30-39 : 2 East North Central:4
## Male : 3 40-49 :10 Middle Atlantic :5
## 60 or older: 2 New England :2
## Pacific :1
## West North Central:2
##
Make a table of ratings with an average row
# Flag the numeric columns (id plus the six movie ratings) and keep only those.
nums <- vapply(movie_ratings, is.numeric, FUN.VALUE = logical(1))
mr_ratings_only <- movie_ratings[, nums]
head(mr_ratings_only, n = 6L)
## id Isle.of.Dogs The.First.Purge Christopher.Robin Skyscraper Blockers
## 1 1 5 1 1 1 1
## 2 2 3 1 3 1 1
## 3 3 1 1 4 1 1
## 4 4 1 3 1 2 3
## 5 5 2 2 2 2 2
## 6 6 4 1 1 1 1
## A.Quiet.Place
## 1 1
## 2 1
## 3 1
## 4 1
## 5 2
## 6 3
# Mean of every column in the numeric table, rounded to two decimals.
# colMeans() is the dedicated routine for this and replaces the generic
# apply(x, 2, mean). The id column is averaged as well; that value is not a
# rating and is only carried along.
avg_rtgs <- round(colMeans(mr_ratings_only), digits = 2)
avg_rtgs
## id Isle.of.Dogs The.First.Purge Christopher.Robin
## 7.50 2.86 2.21 2.64
## Skyscraper Blockers A.Quiet.Place
## 2.29 2.29 2.71
# Append the column averages as one extra row at the bottom of the ratings
# table and round every value in the table to two decimals.
mr_ratings_only <- round(rbind(mr_ratings_only, avg_rtgs), digits = 2)
tail(mr_ratings_only, n = 6L)
## id Isle.of.Dogs The.First.Purge Christopher.Robin Skyscraper Blockers
## 10 10.0 3.00 3.00 3.00 3.00 3.00
## 11 11.0 5.00 4.00 3.00 1.00 2.00
## 12 12.0 1.00 1.00 4.00 4.00 2.00
## 13 13.0 2.00 2.00 2.00 5.00 4.00
## 14 14.0 4.00 3.00 4.00 3.00 3.00
## 15 7.5 2.86 2.21 2.64 2.29 2.29
## A.Quiet.Place
## 10 3.00
## 11 4.00
## 12 4.00
## 13 4.00
## 14 5.00
## 15 2.71
# Grow the respondent table by one all-NA row so its height matches the
# ratings table, which carries the extra averages row.
movie_ratings[nrow(movie_ratings) + 1L, ] <- NA
# Bind the ratings table on the right, then discard columns 3 to 8 (the
# original rating columns) so each movie appears only once.
movie_ratings <- cbind(movie_ratings, mr_ratings_only)[, -(3:8)]
tail(movie_ratings, n = 6L)
## id Respondant.Name Gender Age US.Region id.1
## 10 10 Casey Female 40-49 New England 10.0
## 11 11 Gil Male 60 or older West North Central 11.0
## 12 12 Inae Female 40-49 New England 12.0
## 13 13 Pete Male 40-49 East North Central 13.0
## 14 14 Kristen Female 30-39 Middle Atlantic 14.0
## 15 NA <NA> <NA> <NA> <NA> 7.5
## Isle.of.Dogs The.First.Purge Christopher.Robin Skyscraper Blockers
## 10 3.00 3.00 3.00 3.00 3.00
## 11 5.00 4.00 3.00 1.00 2.00
## 12 1.00 1.00 4.00 4.00 2.00
## 13 2.00 2.00 2.00 5.00 4.00
## 14 4.00 3.00 4.00 3.00 3.00
## 15 2.86 2.21 2.64 2.29 2.29
## A.Quiet.Place
## 10 3.00
## 11 4.00
## 12 4.00
## 13 4.00
## 14 5.00
## 15 2.71
# Label the appended averages row. The row is located with nrow() and the
# column is addressed by name, so the label still lands in the right cell if
# the number of respondents or the column order changes.
movie_ratings[nrow(movie_ratings), "Respondant.Name"] <- "Average Rating"
tail(movie_ratings)
## id Respondant.Name Gender Age US.Region id.1
## 10 10 Casey Female 40-49 New England 10.0
## 11 11 Gil Male 60 or older West North Central 11.0
## 12 12 Inae Female 40-49 New England 12.0
## 13 13 Pete Male 40-49 East North Central 13.0
## 14 14 Kristen Female 30-39 Middle Atlantic 14.0
## 15 NA Average Rating <NA> <NA> <NA> 7.5
## Isle.of.Dogs The.First.Purge Christopher.Robin Skyscraper Blockers
## 10 3.00 3.00 3.00 3.00 3.00
## 11 5.00 4.00 3.00 1.00 2.00
## 12 1.00 1.00 4.00 4.00 2.00
## 13 2.00 2.00 2.00 5.00 4.00
## 14 4.00 3.00 4.00 3.00 3.00
## 15 2.86 2.21 2.64 2.29 2.29
## A.Quiet.Place
## 10 3.00
## 11 4.00
## 12 4.00
## 13 4.00
## 14 5.00
## 15 2.71
Create new DF that compares average movie ratings for men versus women
# Average value of every numeric column for each gender, rounded to two
# decimals. Gender is matched with %in% so rows whose Gender is NA (the
# appended averages row) are never selected, and colMeans() replaces the
# generic apply(x, 2, mean).

# df with only males
males <- movie_ratings[movie_ratings$Gender %in% "Male", ]
# df with only males and only numeric columns
nums <- vapply(males, is.numeric, FUN.VALUE = logical(1))
male_rtgs_only <- males[, nums]
# average male ratings rounded to 2 digits, plus a nicer name for display
avg_male_rtgs <- round(colMeans(male_rtgs_only), digits = 2)
Male_Ratings <- avg_male_rtgs

# df with only females
females <- movie_ratings[movie_ratings$Gender %in% "Female", ]
# df with only females and only numeric columns
nums <- vapply(females, is.numeric, FUN.VALUE = logical(1))
female_rtgs_only <- females[, nums]
# average female ratings rounded to 2 digits
avg_female_rtgs <- round(colMeans(female_rtgs_only), digits = 2)
avg_female_rtgs
## id id.1 Isle.of.Dogs The.First.Purge
## 7.00 7.00 2.91 2.00
## Christopher.Robin Skyscraper Blockers A.Quiet.Place
## 2.82 2.18 2.09 2.64
# Expose the female averages under a display-friendly name.
Female_Ratings <- avg_female_rtgs
# Stack the male averages, the female averages and their difference (male
# minus female) as three rows, then drop the first two columns (id and id.1),
# which are not movie ratings.
gend_diff <- data.frame(
  rbind(Male_Ratings, Female_Ratings, Male_Ratings - Female_Ratings)
)[, -(1:2)]
gend_diff
## Isle.of.Dogs The.First.Purge Christopher.Robin Skyscraper
## Male_Ratings 2.67 3 2.00 2.67
## Female_Ratings 2.91 2 2.82 2.18
## -0.24 1 -0.82 0.49
## Blockers A.Quiet.Place
## Male_Ratings 3.00 3.00
## Female_Ratings 2.09 2.64
## 0.91 0.36
Create a graph showing male and female average ratings per movie side by side
# Flip the table so each movie is a row and each gender a column, leaving out
# the third column (the male-minus-female difference).
rtg_by_gender <- as.data.frame(t(gend_diff))[, -3]
# Repeat the movie titles (currently the row names) as a regular column.
movie <- rownames(rtg_by_gender)
rtg_by_gender <- cbind(movie, rtg_by_gender)
rtg_by_gender
## movie Male_Ratings Female_Ratings
## Isle.of.Dogs Isle.of.Dogs 2.67 2.91
## The.First.Purge The.First.Purge 3.00 2.00
## Christopher.Robin Christopher.Robin 2.00 2.82
## Skyscraper Skyscraper 2.67 2.18
## Blockers Blockers 3.00 2.09
## A.Quiet.Place A.Quiet.Place 3.00 2.64
# Reshape from wide to long: one row per movie/gender pair, with the gender
# series name in `variable` and the average in `value`, so both series can be
# plotted from a single value column.
# NOTE(review): melt() is not base R and no library() call is visible in this
# section; presumably reshape2::melt. Confirm the package is attached earlier.
rbg_melt <- melt(rtg_by_gender, id.vars='movie')
rbg_melt
## movie variable value
## 1 Isle.of.Dogs Male_Ratings 2.67
## 2 The.First.Purge Male_Ratings 3.00
## 3 Christopher.Robin Male_Ratings 2.00
## 4 Skyscraper Male_Ratings 2.67
## 5 Blockers Male_Ratings 3.00
## 6 A.Quiet.Place Male_Ratings 3.00
## 7 Isle.of.Dogs Female_Ratings 2.91
## 8 The.First.Purge Female_Ratings 2.00
## 9 Christopher.Robin Female_Ratings 2.82
## 10 Skyscraper Female_Ratings 2.18
## 11 Blockers Female_Ratings 2.09
## 12 A.Quiet.Place Female_Ratings 2.64
# Build the grouped bar chart layer by layer: movies on the x axis, average
# rating as bar height, one fill colour per gender series.
f <- ggplot(rbg_melt, aes(x = movie, y = value, fill = variable))
# stat = "identity" plots the values as given; "dodge" puts bars side by side.
f <- f + geom_bar(stat = "identity", position = "dodge")
f <- f + labs(
  title = "Movie Ratings By Gender",
  x = "Movies",
  y = "Average Rating"
)
# display the graph
f
