# Class:        Foundations of Statistics Using R
# Title:        in-class exercises
# Session:      5
# Topic:        R - Graphs
# Last updated: 4/8/2016
# Data:         movies2014.csv
# Data description: the 50 top-grossing movies of 2014
# Show the directory R is currently working in.
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Switch to the workshop folder so the data file can be read by its bare name.
setwd(file.path("~", "Dropbox", "R_Stat_Workshop",
                "practice_exercises_in_progress"))
# Start from an empty workspace: drop every object currently defined.
rm(list = ls())
# Source: Top Grossing Movies for 2014 - dataset contains top 50
# http://www.the-numbers.com/market/2014/top-grossing-movies

# import data
# stringsAsFactors = TRUE keeps the text columns as factors. Since R 4.0 the
# default is FALSE, which would turn Genre, MPAA, etc. into plain character
# vectors; the str() output recorded in this script and the factor-based
# exercises (e.g. as.numeric() on Genre) expect factors. On older R versions
# this argument simply restates the default.
movies <- read.csv("movies2014.csv", header = TRUE, sep = ",",
                   stringsAsFactors = TRUE)

# open and look at data
#View(movies)

# how many variables (columns) does the data frame have?
print(ncol(movies))
## [1] 8
# ... and how many observations (rows)?
print(nrow(movies))
## [1] 50
# list the column names
colnames(movies)
## [1] "Rank"         "Movie"        "Release.Date" "Distributor" 
## [5] "Genre"        "MPAA"         "X2014.Gross"  "Tickets.Sold"
# compact overview: type and first few values of every column
str(movies)
## 'data.frame':    50 obs. of  8 variables:
##  $ Rank        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Movie       : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
##  $ Release.Date: Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
##  $ Distributor : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
##  $ Genre       : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
##  $ MPAA        : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
##  $ X2014.Gross : int  248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
##  $ Tickets.Sold: int  30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
# rows and columns in a single call
dim(movies)
## [1] 50  8
#print(dim(movies))

# NOTE: attach(movies) was removed on purpose. Every column in this script is
# reached through movies$..., so the attached copy was never used, and it
# would not have picked up the columns that are added to `movies` later on.

# let's do scatter plot of Tickets Sold and Gross
# (gross on the x axis, tickets sold on the y axis)
plot(movies$X2014.Gross, movies$Tickets.Sold,
     xlab = "Gross", ylab = "Tickets sold")

# let's try again with tickets sold on the x axis and gross on the y axis,
# adjusting scales: divide both variables by 1000.
# The axis labels name the variable that is actually plotted on each axis.
plot(movies$Tickets.Sold / 1000, movies$X2014.Gross / 1000,
     xlab = "Tickets sold (thousands)",
     ylab = "Gross (thousands)",
     main = "Correlation between tickets sold and gross")

# same plot, dividing by 100,000 instead
plot(movies$Tickets.Sold / 100000, movies$X2014.Gross / 100000,
     xlab = "Tickets sold (hundred thousands)",
     ylab = "Gross (hundred thousands)",
     main = "Correlation between tickets sold and gross")

# Scale by a million
movies$Tickets.Sold.millions <- movies$Tickets.Sold / 1e6
movies$X2014.Gross.millions <- movies$X2014.Gross / 1e6

# to reorder: put X2014.Gross.millions right after Rank and Movie and keep
# all remaining columns in their current order. Selecting by name (instead of
# by position) gives the same result on the first run and does not reshuffle
# the columns again if this block is re-run.
lead_cols <- c("Rank", "Movie", "X2014.Gross.millions")
movies <- movies[c(lead_cols, setdiff(names(movies), lead_cols))]

# let us try creating the scatter plot again
# (tickets sold on the x axis, gross on the y axis; labels match the data)
plot(movies$Tickets.Sold.millions, movies$X2014.Gross.millions,
     xlab = "Tickets sold (millions)",
     ylab = "Gross (millions)",
     main = "Correlation between tickets sold and gross")

#other ways to create scatter plot
# library() stops with a clear error if a package is not installed, whereas
# require() only warns and lets the script fail later on the first call.
library(ggvis)
# Columns are named bare inside the formulas: ggvis looks them up in the data
# frame piped in, and the axis titles become the plain column names.
movies %>%
  ggvis(x = ~Tickets.Sold.millions, y = ~X2014.Gross.millions) %>%
  layer_points()

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:ggvis':
## 
##     resolution

library(ggthemes)
## Warning: replacing previous import by 'grid::arrow' when loading 'ggthemes'
## Warning: replacing previous import by 'grid::unit' when loading 'ggthemes'
## Warning: replacing previous import by 'scales::alpha' when loading
## 'ggthemes'
# Inside aes() columns are also referenced by bare name; they are resolved in
# the `movies` data frame passed to ggplot().
ggplot(movies, aes(x = Tickets.Sold.millions, y = X2014.Gross.millions)) +
  geom_point() +
  theme_tufte()

ggplot(movies, aes(x = Tickets.Sold.millions, y = X2014.Gross.millions)) +
  geom_point() +
  theme_few()

# how strongly do tickets sold and gross move together?
with(movies, cor(Tickets.Sold.millions, X2014.Gross.millions))
## [1] 1
# Note:  tickets sold and gross are positively correlated,
# which is what we would expect.

# scatter plot with the points joined by line segments (type = "b")
plot(x = movies$Tickets.Sold.millions,
     y = movies$X2014.Gross.millions,
     type = "b")

# the same plot again, now with axis labels and a title
with(
  movies,
  plot(
    Tickets.Sold.millions, X2014.Gross.millions,
    type = "b",
    main = "Top 50 Grossing Films of 2014",
    xlab = "Tickets Sold (millions)",
    ylab = "Gross Sales (millions)"
  )
)

# boxplot of Gross Sales (in millions), drawn without the surrounding frame
boxplot(
  movies[["X2014.Gross.millions"]],
  frame.plot = FALSE,
  main = "Top 50 Grossing Films of 2014"
)

# describe what you see: compare the mean with the median
mean(movies[["X2014.Gross.millions"]])
## [1] 44.60988
median(movies[["X2014.Gross.millions"]])
## [1] 27.64746
# the same boxplot, this time laid out horizontally
boxplot(
  movies[["X2014.Gross.millions"]],
  frame.plot = FALSE,
  main = "Top 50 Grossing Films of 2014, Sales (millions)",
  horizontal = TRUE
)

# let's look at histograms of the types of films
# hist() only accepts numeric input, so calling it on Genre directly
# gives an error:
#hist(movies$Genre)

# Convert Genre to its integer level codes first. Going through as.factor()
# makes this work whether read.csv() returned Genre as a factor or as
# character strings (as.numeric() on plain strings would give only NA).
# The codes are just category numbers, so the bars show counts per genre.
hist(as.numeric(as.factor(movies$Genre)))

# histogram of a numeric variable: Gross Sales in millions
hist(x = movies$X2014.Gross.millions)

# Note:  gives 5 bins

# ask for roughly 10 bins instead
hist(x = movies$X2014.Gross.millions, breaks = 10)

# now ticket sales, first in raw units ...
hist(x = movies$Tickets.Sold)

hist(x = movies$Tickets.Sold, breaks = 15)

# ... and then in millions, which reads more cleanly
hist(x = movies$Tickets.Sold.millions, breaks = 15)

# labels = TRUE prints the frequency count on top of each bar
hist(
  x = movies$Tickets.Sold.millions,
  main = "Top 50 Grossing Films of 2014, Sales (millions)",
  xlab = "Tickets Sold (millions)",
  labels = TRUE,
  breaks = 15
)

# structure of the data frame after the two "millions" columns were added
str(movies)
## 'data.frame':    50 obs. of  10 variables:
##  $ Rank                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Movie                : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
##  $ X2014.Gross.millions : num  248 134 125 121 101 ...
##  $ Release.Date         : Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
##  $ Distributor          : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
##  $ Genre                : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
##  $ MPAA                 : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
##  $ X2014.Gross          : int  248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
##  $ Tickets.Sold         : int  30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
##  $ Tickets.Sold.millions: num  30.4 16.4 15.3 14.9 12.4 ...
# number of films in each genre
genre_counts <- table(movies$Genre)
genre_counts
## 
##            Action         Adventure      Black Comedy            Comedy 
##                 9                 9                 3                 7 
##             Drama            Horror   Romantic Comedy Thriller/Suspense 
##                16                 2                 2                 2
# share of films per genre, drawn as a bar plot
genre_share <- prop.table(genre_counts)
barplot(genre_share)

#where are all the categories?
# shrink the bar labels and supply shorter names so every genre is shown
genre_labels <- c("Action", "Adventure", "Black Comedy", "Comedy", "Drama",
                  "Horror", "Rom Com", "Thriller / Suspense")
barplot(genre_share, names.arg = genre_labels, cex.names = 0.5)

# share of films per distributor
distributor_share <- prop.table(table(movies$Distributor))
barplot(distributor_share)

# END