# The objective of Project 2 is to prepare data for downstream analysis. The following datasets were obtained from fivethirtyeight.com.
# 1. The first dataset that I worked with is used to examine the polarization of votes on Al Gore's sequel documentary, "An Inconvenient Sequel: Truth to Power." The movie's ratings, on a scale from 1 to 10, do not appear to be accurate or reliable.
# Title of the FiveThirtyEight article the dataset accompanies. This is a bare
# string literal (not assigned to anything); the line below it is the echoed
# value from a previous run.
"Al Gore’s New Movie Exposes The Big Flaw In Online Movie Ratings"
## [1] "Al Gore’s New Movie Exposes The Big Flaw In Online Movie Ratings"
# NOTE: the workspace is intentionally not wiped here. `rm(list = ls())` only
# removes objects from the global environment (attached packages and options
# persist), so it does not give a truly clean session, and it silently destroys
# the caller's objects when this script is sourced. Restart R to get a fresh
# session instead. An explicit `gc()` is likewise unnecessary because R runs
# garbage collection automatically.
# first step is to load all the needed libraries
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
# Read the raw ratings table directly from the FiveThirtyEight GitHub
# repository. The data arrive in "wide" format and are reshaped into "long"
# format further down.
myurl1 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/inconvenient-sequel/ratings.csv"
ratings_data <- read.csv(
  file = myurl1,
  stringsAsFactors = FALSE,
  header = TRUE
)
# Show the number of rows and columns that were read.
dim(ratings_data)
## [1] 80053 27
# Replace the column names with more readable ones. The per-rating vote-count
# and percentage columns are generated with paste0() so that every name follows
# exactly the same "<rating>_Votes" / "<rating>_Pct" pattern (the hand-typed
# list previously contained an inconsistent "5_pct").
names(ratings_data) <- c(
  "Timestamp", "Respondents", "Category", "Link",
  "Average", "Mean", "Median",
  paste0(1:10, "_Votes"),
  paste0(1:10, "_Pct")
)
head(ratings_data)
## Timestamp Respondents Category
## 1 2017-07-17 12:28:32.785639 402 Males
## 2 2017-07-17 12:28:33.025600 78 Females
## 3 2017-07-17 12:28:33.273919 4 Aged under 18
## 4 2017-07-17 12:28:33.495325 4 Males under 18
## 5 2017-07-17 12:28:33.722849 130 Aged 18-29
## 6 2017-07-17 12:28:33.970009 108 Males Aged 18-29
## Link Average Mean
## 1 http://www.imdb.com/title/tt6322922/ratings-male 4.6 5.0
## 2 http://www.imdb.com/title/tt6322922/ratings-female 6.9 7.7
## 3 http://www.imdb.com/title/tt6322922/ratings-age_1 4.2 4.2
## 4 http://www.imdb.com/title/tt6322922/ratings-male_age_1 4.2 4.2
## 5 http://www.imdb.com/title/tt6322922/ratings-age_2 6.3 6.5
## 6 http://www.imdb.com/title/tt6322922/ratings-male_age_2 6.2 6.2
## Median 1_Votes 2_Votes 3_Votes 4_Votes 5_Votes 6_Votes 7_Votes 8_Votes
## 1 2 197 7 7 3 7 7 11 8
## 2 10 16 1 0 1 1 0 3 4
## 3 3 2 0 0 0 1 0 0 0
## 4 3 2 0 0 0 1 0 0 0
## 5 9 41 0 3 1 2 3 6 4
## 6 9 37 0 3 1 1 3 6 2
## 9_Votes 10_Votes 1_Pct 2_Pct 3_Pct 4_Pct 5_Pct 6_Pct 7_Pct 8_Pct 9_Pct
## 1 20 135 49.0 1.7 1.7 0.7 1.7 1.7 2.7 2.0 5.0
## 2 3 49 20.5 1.3 0.0 1.3 1.3 0.0 3.8 5.1 3.8
## 3 0 1 50.0 0.0 0.0 0.0 25.0 0.0 0.0 0.0 0.0
## 4 0 1 50.0 0.0 0.0 0.0 25.0 0.0 0.0 0.0 0.0
## 5 6 64 31.5 0.0 2.3 0.8 1.5 2.3 4.6 3.1 4.6
## 6 6 49 34.3 0.0 2.8 0.9 0.9 2.8 5.6 1.9 5.6
## 10_Pct
## 1 33.6
## 2 62.8
## 3 25.0
## 4 25.0
## 5 49.2
## 6 45.4
# Reshape the table from wide to long: one row per original observation and
# rating value (1-10), holding that rating's vote count and vote percentage.
#
# Output columns (unchanged): Year, Month, Date, Respondents, Category,
# Average, Mean, Median, Votes (key such as "1_Votes"), Rank (vote count),
# Percent (key such as "1_Pct") and Numbers (percentage).
#
# Steps:
#   1. Drop the Link column, which is not needed for the analysis.
#   2. Keep only the date part of the timestamp and split it into Year, Month
#      and Date (day of month). This is done before reshaping so the regex
#      runs once per original row, and str_extract() returns exactly one value
#      per row (NA when there is no match) instead of relying on unlist().
#   3. Gather the vote-count columns and then the percentage columns. The
#      columns are selected by name (case-insensitively) rather than by
#      position, so the selection does not depend on column order.
#   4. Two successive gathers pair every vote column with every percentage
#      column (10 x 10 rows per observation). Keep only the pairs that refer
#      to the same rating value so counts and percentages stay aligned.
ratings_data <- ratings_data %>%
  select(-Link) %>%
  mutate(Timestamp = str_extract(Timestamp, "^[0-9-]+")) %>%
  separate(Timestamp, c("Year", "Month", "Date"), sep = "-") %>%
  gather(Votes, Rank, matches("_Votes$")) %>%
  gather(Percent, Numbers, matches("_Pct$")) %>%
  filter(str_extract(Votes, "^[0-9]+") == str_extract(Percent, "^[0-9]+"))
head(ratings_data)
## Year Month Date Respondents Category Average Mean Median Votes
## 1 2017 07 17 402 Males 4.6 5.0 2 1_Votes
## 2 2017 07 17 78 Females 6.9 7.7 10 1_Votes
## 3 2017 07 17 4 Aged under 18 4.2 4.2 3 1_Votes
## 4 2017 07 17 4 Males under 18 4.2 4.2 3 1_Votes
## 5 2017 07 17 130 Aged 18-29 6.3 6.5 9 1_Votes
## 6 2017 07 17 108 Males Aged 18-29 6.2 6.2 9 1_Votes
## Rank Percent Numbers
## 1 197 1_Pct 49.0
## 2 16 1_Pct 20.5
## 3 2 1_Pct 50.0
## 4 2 1_Pct 50.0
## 5 41 1_Pct 31.5
## 6 37 1_Pct 34.3
# The plot and the analysis show a polarization of votes, which suggests that
# this particular system of rating movies may be inefficient or invalid.
# Future research could look at voters' political affiliation, or the extent
# to which they support Al Gore, and compare those findings with the voters'
# ratings.
#
# The y axis shows `Numbers`, i.e. the percentage of votes, so the title and
# axis label describe it as a percentage rather than as a rank.
plot(
  ratings_data$Median, ratings_data$Numbers,
  main = "Plot of Median Ratings and Vote Percentages",
  xlab = "Median Ratings",
  ylab = "Percent of Votes"
)
