# Class: Foundations of Statistics Using R
# Title: in-class exercises
# Session: 5
# Topic: R - Graphs
# Last updated: 4/8/2016
# Data: movies2014.csv
# Data description: the 50 top-grossing movies of 2014
# Show the current working directory
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Set the working directory to the folder that holds movies2014.csv
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")
# Remove all existing objects from the workspace
rm(list = ls())
# Source: Top Grossing Movies for 2014 - dataset contains top 50
# http://www.the-numbers.com/market/2014/top-grossing-movies
# Import the data.
# stringsAsFactors = TRUE keeps the text columns (Movie, Genre, MPAA, ...) as
# factors, which is what the str() output recorded in this script shows.
# R >= 4.0.0 reads them as character by default, and the later
# hist(as.numeric(movies$Genre)) call would then only produce NAs.
movies <- read.csv("movies2014.csv", sep = ",", header = TRUE,
                   stringsAsFactors = TRUE)
# Inspect the imported data (uncomment View() for a spreadsheet-style viewer)
# View(movies)
# Number of variables: a data frame has one column per variable
print(ncol(movies))
## [1] 8
# Number of observations (rows)
print(nrow(movies))
## [1] 50
# Column names
colnames(movies)
## [1] "Rank" "Movie" "Release.Date" "Distributor"
## [5] "Genre" "MPAA" "X2014.Gross" "Tickets.Sold"
# Compact overview: type and first values of every column
str(movies)
## 'data.frame': 50 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Movie : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
## $ Release.Date: Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
## $ Distributor : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
## $ MPAA : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
## $ X2014.Gross : int 248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
## $ Tickets.Sold: int 30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
# Rows and columns in a single call
dim(movies)
## [1] 50 8
# print(dim(movies))
# The data frame is deliberately not attach()ed: every call in this script
# refers to columns explicitly as movies$..., and an attached copy would not
# pick up columns that are added to movies afterwards.

# Scatter plot with gross on the x axis and tickets sold on the y axis
plot(movies$X2014.Gross, movies$Tickets.Sold,
     xlab = "Gross", ylab = "Tickets sold")

# Try again with rescaled axes. From here on tickets sold is on the x axis and
# gross on the y axis, so the axis labels follow that order.
# Both variables divided by 1000
plot(movies$Tickets.Sold / 1000, movies$X2014.Gross / 1000,
     xlab = "Tickets sold (thousands)", ylab = "Gross (thousands)",
     main = "Correlation between tickets sold and gross")

# Both variables divided by 100000
plot(movies$Tickets.Sold / 100000, movies$X2014.Gross / 100000,
     xlab = "Tickets sold (hundred thousands)",
     ylab = "Gross (hundred thousands)",
     main = "Correlation between tickets sold and gross")

# Scale by a million
movies$Tickets.Sold.millions <- movies$Tickets.Sold / 1000000
movies$X2014.Gross.millions <- movies$X2014.Gross / 1000000
# Reorder the columns by name rather than by position: gross (millions) moves
# to third place, every other column keeps its relative order. Selecting by
# name stays correct if the input file gains or loses a column.
lead_cols <- c("Rank", "Movie", "X2014.Gross.millions")
movies <- movies[c(lead_cols, setdiff(names(movies), lead_cols))]
# Scatter plot again, now in millions (tickets sold on x, gross on y)
plot(movies$Tickets.Sold.millions, movies$X2014.Gross.millions,
     xlab = "Tickets sold (millions)", ylab = "Gross (millions)",
     main = "Correlation between tickets sold and gross")
# Other ways to create the scatter plot
# library() stops with an error when a package is not installed; require()
# only returns FALSE, so the failure would surface later at the first plot call.
library(ggvis)
# Columns are referenced by bare name inside the formulas; ggvis looks them up
# in the data frame piped in, so no movies$ prefix is needed.
movies %>%
  ggvis(x = ~Tickets.Sold.millions, y = ~X2014.Gross.millions) %>%
  layer_points()
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:ggvis':
##
## resolution

library(ggthemes)
## Warning: replacing previous import by 'grid::arrow' when loading 'ggthemes'
## Warning: replacing previous import by 'grid::unit' when loading 'ggthemes'
## Warning: replacing previous import by 'scales::alpha' when loading
## 'ggthemes'
# aes() evaluates bare column names inside the data argument. Writing
# movies$... there bypasses that lookup and leaks the prefix into the axis title.
ggplot(movies, aes(x = Tickets.Sold.millions, y = X2014.Gross.millions)) +
  geom_point() +
  theme_tufte()

# Same plot with a different ggthemes theme
ggplot(movies, aes(x = Tickets.Sold.millions, y = X2014.Gross.millions)) +
  geom_point() +
  theme_few()

# Quantify the relationship with the correlation coefficient
with(movies, cor(Tickets.Sold.millions, X2014.Gross.millions))
## [1] 1
# Note: There seems to be a positive correlation between tickets sold and gross,
# which is to be expected.
# type = "b" draws both the points and the lines connecting them
plot(
  movies$Tickets.Sold.millions,
  movies$X2014.Gross.millions,
  type = "b"
)

# Same plot with axis labels and a title
with(
  movies,
  plot(
    Tickets.Sold.millions,
    X2014.Gross.millions,
    type = "b",
    xlab = "Tickets Sold (millions)",
    ylab = "Gross Sales (millions)",
    main = "Top 50 Grossing Films of 2014"
  )
)

# Boxplot of gross sales (millions), drawn without the surrounding frame
with(
  movies,
  boxplot(
    X2014.Gross.millions,
    main = "Top 50 Grossing Films of 2014",
    frame.plot = FALSE
  )
)

# Describe what you see: compare the mean with the median
with(movies, mean(X2014.Gross.millions))
## [1] 44.60988
with(movies, median(X2014.Gross.millions))
## [1] 27.64746
# The same boxplot laid out horizontally
with(
  movies,
  boxplot(
    X2014.Gross.millions,
    horizontal = TRUE,
    main = "Top 50 Grossing Films of 2014, Sales (millions)",
    frame.plot = FALSE
  )
)

# Let's look at the distribution of film genres.
# hist() only accepts numeric input, so calling it on Genre directly gives an
# error:
# hist(movies$Genre)
# Converting Genre to its integer level codes lets hist() run. as.factor()
# makes the conversion work whether Genre was read in as a factor or as
# character. The codes only reflect the sort order of the genre names, so the
# spacing on the x axis carries no meaning; for a categorical variable a bar
# chart of table(movies$Genre) is the better tool.
hist(as.numeric(as.factor(movies$Genre)))

# Histogram of a numeric variable: gross sales in millions
hist(x = movies$X2014.Gross.millions)

# Note: gives 5 bins
# Ask for roughly 10 bins instead
hist(x = movies$X2014.Gross.millions, breaks = 10)

# Histograms of ticket sales: raw counts first
hist(x = movies$Tickets.Sold)

hist(x = movies$Tickets.Sold, breaks = 15)

# The values in millions give cleaner axis labels
hist(x = movies$Tickets.Sold.millions, breaks = 15)

# labels = TRUE prints the frequency count on top of each bar
hist(
  x = movies$Tickets.Sold.millions,
  breaks = 15,
  labels = TRUE,
  xlab = "Tickets Sold (millions)",
  main = "Top 50 Grossing Films of 2014, Sales (millions)"
)

# Structure of the data frame after adding and reordering the columns
str(movies)
## 'data.frame': 50 obs. of 10 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Movie : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
## $ X2014.Gross.millions : num 248 134 125 121 101 ...
## $ Release.Date : Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
## $ Distributor : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
## $ MPAA : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
## $ X2014.Gross : int 248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
## $ Tickets.Sold : int 30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
## $ Tickets.Sold.millions: num 30.4 16.4 15.3 14.9 12.4 ...
# Number of films per genre
genre_counts <- table(movies$Genre)
genre_counts
##
## Action Adventure Black Comedy Comedy
## 9 9 3 7
## Drama Horror Romantic Comedy Thriller/Suspense
## 16 2 2 2
# Share of films per genre, computed once and reused for both bar charts
genre_share <- prop.table(genre_counts)
barplot(genre_share)

# Where are all the categories? Long names do not fit under the bars, so shrink
# the label text and shorten the longest names. The labels are derived from
# the table itself, so each bar keeps its own genre name even if the set or
# order of genres in the data changes.
genre_labels <- names(genre_share)
genre_labels[genre_labels == "Romantic Comedy"] <- "Rom Com"
genre_labels[genre_labels == "Thriller/Suspense"] <- "Thriller / Suspense"
barplot(genre_share, cex.names = 0.5, names.arg = genre_labels)

# Share of films per distributor
barplot(prop.table(table(movies$Distributor)))

# END