# Class:        Foundations of Statistics Using R
# Title:        in-class exercises
# Session:      5
# Topic:        R - Graphs
# Last updated: 4/8/2016
# Data:         movies2014.csv
# Data description: the 50 top-grossing movies of 2014
# Show the current working directory (the recorded output below comes from
# the original author's machine).
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Set the working directory to the folder that holds movies2014.csv.
# NOTE: this path is machine-specific; change it to your own course folder.
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")

# Source: Top Grossing Movies for 2014 - dataset contains top 50
# http://www.the-numbers.com/market/2014/top-grossing-movies

# Import the data.
# stringsAsFactors = TRUE is spelled out on purpose: since R 4.0.0 read.csv()
# keeps text columns as character by default, but the str() output recorded in
# this script and the later as.numeric(movies$Genre) step both rely on the
# text columns being factors.
movies <- read.csv(
  "movies2014.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Inspect the data (uncomment to open the spreadsheet-style viewer)
# View(movies)

# Number of variables in the data frame, i.e. its column count
print(ncol(movies))
## [1] 8
# Number of observations, i.e. its row count
print(nrow(movies))
## [1] 50
# Column names
colnames(movies)
## [1] "Rank"         "Movie"        "Release.Date" "Distributor" 
## [5] "Genre"        "MPAA"         "X2014.Gross"  "Tickets.Sold"
# Compact summary: type and first few values of every column
str(movies)
## 'data.frame':    50 obs. of  8 variables:
##  $ Rank        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Movie       : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
##  $ Release.Date: Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
##  $ Distributor : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
##  $ Genre       : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
##  $ MPAA        : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
##  $ X2014.Gross : int  248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
##  $ Tickets.Sold: int  30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
# Row and column counts in a single call (rows first, then columns)
dim(movies)
## [1] 50  8
# print(dim(movies))


# Scatter plot of Tickets Sold (x) against Gross (y), in raw units.
# Tickets Sold goes on the x-axis so that this plot is directly comparable
# with the rescaled versions that follow.
plot(movies$Tickets.Sold, movies$X2014.Gross)

# The raw axis values are hard to read, so rescale BOTH variables:
# first divide by 1,000 ...
plot(movies$Tickets.Sold / 1000, movies$X2014.Gross / 1000)

# ... then by 100,000
plot(movies$Tickets.Sold / 100000, movies$X2014.Gross / 100000)

# Scale by a million and keep the results as new columns of the data frame
movies$Tickets.Sold.millions <- movies$Tickets.Sold / 1e6
movies$X2014.Gross.millions <- movies$X2014.Gross / 1e6

# Scatter plot again, now using the columns expressed in millions
plot(movies$Tickets.Sold.millions, movies$X2014.Gross.millions)

# Correlation between tickets sold and gross
cor(movies$Tickets.Sold.millions, movies$X2014.Gross.millions)
## [1] 1
# Note: a correlation of 1 (to printed precision) is a perfect positive linear
# relationship, not merely a positive one. In the first rows of the data,
# X2014.Gross / Tickets.Sold is about 8.16 every time, which suggests one
# column was derived from the other using a single average ticket price
# (TODO: confirm against the data source).

# type = "b" draws both the points and line segments joining them; the
# segments connect the points in row order
plot(movies$Tickets.Sold.millions, movies$X2014.Gross.millions, type = "b")

# Same plot with axis labels and a title
plot(
  movies$Tickets.Sold.millions,
  movies$X2014.Gross.millions,
  type = "b",
  xlab = "Tickets Sold (millions)",
  ylab = "Gross Sales (millions)",
  main = "Top 50 Grossing Films of 2014"
)

# Boxplot of gross sales (in millions), drawn without the surrounding frame
boxplot(
  movies[["X2014.Gross.millions"]],
  frame.plot = FALSE,
  main = "Top 50 Grossing Films of 2014"
)

# Describe what you see: compare the two measures of centre.
# The recorded mean is well above the recorded median.
mean(movies[["X2014.Gross.millions"]])
## [1] 44.60988
median(movies[["X2014.Gross.millions"]])
## [1] 27.64746
# The same boxplot laid out horizontally
boxplot(
  movies[["X2014.Gross.millions"]],
  main = "Top 50 Grossing Films of 2014, Sales (millions)",
  frame.plot = FALSE,
  horizontal = TRUE
)

# Histograms of the types of films.
# hist() needs numeric input. Genre is categorical, so calling hist() on it
# directly stops with an error ('x' must be numeric):
# hist(movies$Genre)

# Converting Genre to its integer level codes lets hist() run. as.factor() is
# applied first so this also works when Genre was read in as character (the
# read.csv() default since R 4.0.0), where as.numeric() alone would give only
# NA values; when Genre is already a factor, as.factor() changes nothing.
# Caution: the codes are just positions in the sorted list of levels, so this
# histogram is a demonstration only; the bar plots at the end of the script
# are the appropriate chart for a categorical variable.
hist(as.numeric(as.factor(movies$Genre)))

# Histogram of a numeric variable, Gross Sales
hist(movies$X2014.Gross.millions)

# Note:  gives 5 bins

# Ask for about 10 bins. A single number for `breaks` is only a suggestion:
# hist() rounds to "pretty" break points, so the bin count may differ.
hist(movies$X2014.Gross.millions, breaks = 10)

# Histogram of ticket sales in raw units
hist(movies$Tickets.Sold)

hist(movies$Tickets.Sold, breaks = 15)

# Using the values in millions gives cleaner axis labels
hist(movies$Tickets.Sold.millions, breaks = 15)

# labels = TRUE prints the frequency count on top of each bar
hist(
  movies$Tickets.Sold.millions,
  breaks = 15,
  labels = TRUE,
  xlab = "Tickets Sold (millions)",
  main = "Top 50 Grossing Films of 2014, Sales (millions)"
)

# Structure of the data frame again: it now also lists the two *.millions
# columns added earlier in the script
str(movies)
## 'data.frame':    50 obs. of  10 variables:
##  $ Rank                 : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Movie                : Factor w/ 50 levels "12 Years a Slave",..: 42 32 22 13 3 11 23 28 43 6 ...
##  $ Release.Date         : Factor w/ 25 levels "1/10/2014","1/17/2014",..: 21 2 1 11 25 23 25 20 21 12 ...
##  $ Distributor          : Factor w/ 15 levels "20th Century Fox",..: 14 12 12 13 14 6 1 12 11 11 ...
##  $ Genre                : Factor w/ 8 levels "Action","Adventure",..: 2 4 1 2 1 2 2 1 5 3 ...
##  $ MPAA                 : Factor w/ 3 levels "PG","PG-13","R": 1 2 3 1 3 2 1 2 2 3 ...
##  $ X2014.Gross          : int  248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
##  $ Tickets.Sold         : int  30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
##  $ Tickets.Sold.millions: num  30.4 16.4 15.3 14.9 12.4 ...
##  $ X2014.Gross.millions : num  248 134 125 121 101 ...
# Bar plot of the share of films in each genre:
# table() counts the films per category, prop.table() turns counts into
# proportions
barplot(
  prop.table(
    table(movies[["Genre"]])
  )
)

# Same chart for the share of films per distributor
barplot(
  prop.table(
    table(movies[["Distributor"]])
  )
)

# END