Project2_Dataset1

#The objective of Project 2 is to prepare data for downstream analysis. The following datasets were obtained from fivethirtyeight.com.
#1. The first dataset that I worked with is meant to show how polarized the votes are on Al Gore's sequel documentary, "An Inconvenient Sequel: Truth to Power." Movie ratings on a scale from 1 to 10 do not seem to be accurate or reliable.
# Presumably the headline of the FiveThirtyEight article that accompanies this dataset (TODO confirm). As a bare string at top level it is simply echoed to the output.
"Al Gore’s New Movie Exposes The Big Flaw In Online Movie Ratings"

## [1] "Al Gore’s New Movie Exposes The Big Flaw In Online Movie Ratings"

# NOTE: the workspace is deliberately not cleared with rm(list = ls()).
# When this document is rendered from an interactive session that call can
# delete the caller's objects, and it neither detaches packages nor resets
# options, so it does not provide a clean session anyway. Restart R to get
# a fresh state instead.

# Trigger a garbage collection; at top level the memory-usage summary that
# gc() returns is printed.
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 496354 26.6     940480 50.3   750400 40.1
## Vcells 937677  7.2    1650153 12.6  1187955  9.1

# first step is to load all the needed libraries 
library(tidyverse)

## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.1     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0

## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
#Next, look at the dataset. The data is in a "wide" format, so I will transform it into a "long" one.
# Location of the raw ratings CSV in the FiveThirtyEight data repository.
myurl1 <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/inconvenient-sequel/ratings.csv"

# Download the CSV into a data frame; the first line holds the column names
# and text columns are kept as character vectors rather than factors.
ratings_data <- read.csv(
  file = myurl1,
  header = TRUE,
  stringsAsFactors = FALSE
)

# Report the number of rows and columns that were read.
dim(ratings_data)

## [1] 80053    27

# Replace the raw headers with more readable names. The ten vote-count and ten
# vote-percentage names are generated instead of typed by hand, so both
# families are guaranteed to follow the same "<rating>_Votes" / "<rating>_Pct"
# pattern (a hand-typed list is easy to get wrong, e.g. "5_pct").
column_names <- c(
  "Timestamp", "Respondents", "Category", "Link", "Average", "Mean", "Median",
  paste0(1:10, "_Votes"),
  paste0(1:10, "_Pct")
)
# Fail fast if the downloaded file no longer has the expected number of
# columns, rather than silently mislabelling them.
stopifnot(length(column_names) == ncol(ratings_data))
names(ratings_data) <- column_names
head(ratings_data)

##                    Timestamp Respondents         Category
## 1 2017-07-17 12:28:32.785639         402            Males
## 2 2017-07-17 12:28:33.025600          78          Females
## 3 2017-07-17 12:28:33.273919           4    Aged under 18
## 4 2017-07-17 12:28:33.495325           4   Males under 18
## 5 2017-07-17 12:28:33.722849         130       Aged 18-29
## 6 2017-07-17 12:28:33.970009         108 Males Aged 18-29
##                                                     Link Average Mean
## 1       http://www.imdb.com/title/tt6322922/ratings-male     4.6  5.0
## 2     http://www.imdb.com/title/tt6322922/ratings-female     6.9  7.7
## 3      http://www.imdb.com/title/tt6322922/ratings-age_1     4.2  4.2
## 4 http://www.imdb.com/title/tt6322922/ratings-male_age_1     4.2  4.2
## 5      http://www.imdb.com/title/tt6322922/ratings-age_2     6.3  6.5
## 6 http://www.imdb.com/title/tt6322922/ratings-male_age_2     6.2  6.2
##   Median 1_Votes 2_Votes 3_Votes 4_Votes 5_Votes 6_Votes 7_Votes 8_Votes
## 1      2     197       7       7       3       7       7      11       8
## 2     10      16       1       0       1       1       0       3       4
## 3      3       2       0       0       0       1       0       0       0
## 4      3       2       0       0       0       1       0       0       0
## 5      9      41       0       3       1       2       3       6       4
## 6      9      37       0       3       1       1       3       6       2
##   9_Votes 10_Votes 1_Pct 2_Pct 3_Pct 4_Pct 5_Pct 6_Pct 7_Pct 8_Pct 9_Pct
## 1      20      135  49.0   1.7   1.7   0.7   1.7   1.7   2.7   2.0   5.0
## 2       3       49  20.5   1.3   0.0   1.3   1.3   0.0   3.8   5.1   3.8
## 3       0        1  50.0   0.0   0.0   0.0  25.0   0.0   0.0   0.0   0.0
## 4       0        1  50.0   0.0   0.0   0.0  25.0   0.0   0.0   0.0   0.0
## 5       6       64  31.5   0.0   2.3   0.8   1.5   2.3   4.6   3.1   4.6
## 6       6       49  34.3   0.0   2.8   0.9   0.9   2.8   5.6   1.9   5.6
##   10_Pct
## 1   33.6
## 2   62.8
## 3   25.0
## 4   25.0
## 5   49.2
## 6   45.4

# Reshape the vote-count and vote-percentage columns from wide to long.
# Link is dropped because it is not needed, and Timestamp is reduced to its
# calendar date and split into Year, Month and Date (the time of day is
# discarded).
#
# Gathering the two column families one after the other yields every
# combination of a *_Votes column with a *_Pct column (10 x 10 rows per
# original row), so the final filter() keeps only the rows in which both keys
# refer to the same rating (e.g. "3_Votes" paired with "3_Pct"). The result
# has one row per original row and rating, with Votes/Rank holding the vote
# count and Percent/Numbers holding the vote percentage.
ratings_data <- ratings_data %>%
  select(-Link) %>%
  # str_extract() returns exactly one value per row (NA when nothing matches),
  # so the column length can never change the way unlist(str_extract_all())
  # could. Doing this before the reshape also keeps the regex off the much
  # larger long table.
  mutate(Timestamp = str_extract(Timestamp, "^[0-9-]+")) %>%
  separate(Timestamp, c("Year", "Month", "Date"), sep = "-") %>%
  # ends_with() ignores case by default, so a column spelled "5_pct" is still
  # selected together with the other percentage columns.
  gather(Votes, Rank, ends_with("_Votes")) %>%
  gather(Percent, Numbers, ends_with("_Pct")) %>%
  # Compare the leading rating number of both keys to drop mismatched pairs.
  filter(str_extract(Votes, "^[0-9]+") == str_extract(Percent, "^[0-9]+"))


head(ratings_data)

##   Year Month Date Respondents         Category Average Mean Median   Votes
## 1 2017    07   17         402            Males     4.6  5.0      2 1_Votes
## 2 2017    07   17          78          Females     6.9  7.7     10 1_Votes
## 3 2017    07   17           4    Aged under 18     4.2  4.2      3 1_Votes
## 4 2017    07   17           4   Males under 18     4.2  4.2      3 1_Votes
## 5 2017    07   17         130       Aged 18-29     6.3  6.5      9 1_Votes
## 6 2017    07   17         108 Males Aged 18-29     6.2  6.2      9 1_Votes
##   Rank Percent Numbers
## 1  197   1_Pct    49.0
## 2   16   1_Pct    20.5
## 3    2   1_Pct    50.0
## 4    2   1_Pct    50.0
## 5   41   1_Pct    31.5
## 6   37   1_Pct    34.3

#The plot and the analysis show a polarization of votes, which suggests that this particular system of rating movies may be inefficient or invalid. Future research could look at voters' political affiliation, or at the extent to which they support Al Gore. The findings could then be compared to the voters' ratings.
# Scatter plot of the Median rating column (x) against Numbers (y), which
# holds the vote percentages gathered from the *_Pct columns. The title and
# y-axis label describe the percentages that are actually plotted.
# NOTE(review): if vote counts were intended instead, plot ratings_data$Rank
# and relabel the axis accordingly.
plot(
  ratings_data$Median, ratings_data$Numbers,
  main = "Plot of Median Ratings and Vote Percentages",
  xlab = "Median Ratings",
  ylab = "Percent of Votes"
)

Project2_Dataset1

Violeta Stoyanova

3/11/2018