# Class: Foundations of Statistics Using R
# Title: in-class exercises
# Session: 5
# Topic: R - Graphs
# Last updated: 4/8/2016
# Data: movies2014.csv
# Data description: the 50 top-grossing movies of 2014
# Show the current working directory
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Set the working directory to the folder that contains movies2014.csv
# (edit this path to match the location on your own machine)
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")
# Source: Top Grossing Movies for 2014 - dataset contains top 50
# http://www.the-numbers.com/market/2014/top-grossing-movies
# Import the data.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, but the str() output recorded in this
# script and the later as.numeric(movies$Genre) call both rely on the text
# columns being factors.
movies <- read.csv("movies2014.csv", sep = ",", header = TRUE,
                   stringsAsFactors = TRUE)
# Inspect the imported data frame
# (uncomment the next line to browse it in the interactive viewer)
# View(movies)
# Number of variables, i.e. columns, in the data frame
print(ncol(movies))
## [1] 8
# Number of observations, i.e. rows
print(nrow(movies))
## [1] 50
# Column names
colnames(movies)
## [1] "Rank" "Movie" "Release.Date" "Distributor"
## [5] "Genre" "MPAA" "X2014.Gross" "Tickets.Sold"
# Compact overview of each column's type and first few values
str(movies)
## 'data.frame': 50 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Movie : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
## $ Release.Date: Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
## $ Distributor : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
## $ MPAA : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
## $ X2014.Gross : int 248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
## $ Tickets.Sold: int 30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
# Rows and columns in a single call: dim() returns c(rows, columns)
dim(movies)
## [1] 50 8
# print(dim(movies))
# Scatter plot of Tickets Sold (x-axis) against Gross (y-axis), raw values.
# Tickets Sold stays on the x-axis in all three plots of this step so that the
# rescaled versions can be compared directly with the raw one.
plot(movies$Tickets.Sold, movies$X2014.Gross)

# Try again with adjusted scales: divide BOTH variables by 1,000 so the
# axis tick labels become shorter
plot(movies$Tickets.Sold / 1000, movies$X2014.Gross / 1000)

# ... and once more, dividing both variables by 100,000
plot(movies$Tickets.Sold / 100000, movies$X2014.Gross / 100000)

# Rescale both measures to millions and store the results as new columns
movies$Tickets.Sold.millions <- movies$Tickets.Sold / 1e6
movies$X2014.Gross.millions <- movies$X2014.Gross / 1e6
# Redraw the scatter plot from the rescaled columns
plot(x = movies$Tickets.Sold.millions, y = movies$X2014.Gross.millions)

# Correlation between tickets sold and gross (both in millions)
with(movies, cor(Tickets.Sold.millions, X2014.Gross.millions))
## [1] 1
# Note: the printed value of 1 indicates a positive, essentially perfectly
# linear relationship between tickets sold and gross, which is to be expected.
# Redraw the scatter plot with type = "b", which draws both the points and
# line segments connecting them
plot(x = movies$Tickets.Sold.millions, y = movies$X2014.Gross.millions,
     type = "b")

# Same plot again, now with axis labels and a title
plot(
  x = movies$Tickets.Sold.millions,
  y = movies$X2014.Gross.millions,
  type = "b",
  main = "Top 50 Grossing Films of 2014",
  xlab = "Tickets Sold (millions)",
  ylab = "Gross Sales (millions)"
)

# Vertical boxplot of gross sales (millions), drawn without the plot frame
boxplot(x = movies$X2014.Gross.millions,
        main = "Top 50 Grossing Films of 2014",
        frame.plot = FALSE)

# Describe what you see: compare the mean with the median
mean(x = movies$X2014.Gross.millions)
## [1] 44.60988
median(x = movies$X2014.Gross.millions)
## [1] 27.64746
# The same boxplot drawn horizontally
boxplot(
  x = movies$X2014.Gross.millions,
  horizontal = TRUE,
  main = "Top 50 Grossing Films of 2014, Sales (millions)",
  frame.plot = FALSE
)

# Histogram of the types of films (Genre).
# hist() only accepts numeric input, so passing the factor directly gives an
# error:
# hist(movies$Genre)
# Convert Genre to its integer level codes first. as.factor() makes the
# conversion work whether Genre was read in as a factor or as character.
genre_factor <- as.factor(movies$Genre)
# Centre one bin on each integer code. With the default breaks the first bin
# is closed on both sides, so the first two genres would be pooled into one bar.
hist(as.numeric(genre_factor),
     breaks = seq(0.5, nlevels(genre_factor) + 0.5, by = 1),
     xlab = "Genre (factor level code)",
     main = "Top 50 Grossing Films of 2014 by Genre")
# Histogram of a numeric variable: gross sales in millions
hist(x = movies$X2014.Gross.millions)

# Note: gives 5 bins
# Ask for roughly 10 bins instead
hist(x = movies$X2014.Gross.millions, breaks = 10)

# Histogram of ticket sales, first with the default bins ...
hist(x = movies$Tickets.Sold)

# ... then with roughly 15 bins
hist(x = movies$Tickets.Sold, breaks = 15)

# The axis is easier to read when the values are expressed in millions
hist(x = movies$Tickets.Sold.millions, breaks = 15)

# Histogram of tickets sold (millions) with the frequency count printed on
# top of each bar (labels = TRUE).
# The title names the plotted variable, tickets sold, rather than "Sales",
# which the other plots in this script use for gross sales.
hist(movies$Tickets.Sold.millions,
     breaks = 15,
     labels = TRUE,
     xlab = "Tickets Sold (millions)",
     main = "Top 50 Grossing Films of 2014, Tickets Sold (millions)")

# Re-check the structure: the two ".millions" columns have been added
str(movies)
## 'data.frame': 50 obs. of 10 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Movie : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
## $ Release.Date : Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
## $ Distributor : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
## $ Genre : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
## $ MPAA : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
## $ X2014.Gross : int 248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
## $ Tickets.Sold : int 30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
## $ Tickets.Sold.millions: num 30.4 16.4 15.3 14.9 12.4 ...
## $ X2014.Gross.millions : num 248 134 125 121 101 ...
# Bar plots of the proportion of films in each category.
# table() counts the films per category and prop.table() turns the counts
# into proportions.
# Category names are drawn perpendicular to the axis (las = 2) in a smaller
# font, with a larger bottom margin, because horizontal labels that overlap
# are silently left out. The previous margins are restored afterwards.
old_par <- par(mar = c(10, 4, 4, 2) + 0.1)
barplot(prop.table(table(movies$Genre)),
        las = 2,
        cex.names = 0.7,
        ylab = "Proportion of films",
        main = "Top 50 Grossing Films of 2014 by Genre")

barplot(prop.table(table(movies$Distributor)),
        las = 2,
        cex.names = 0.7,
        ylab = "Proportion of films",
        main = "Top 50 Grossing Films of 2014 by Distributor")
par(old_par)

# END