Use the data from the Top Grossing Movies for 2014. The dataset contains the top 50 films. Source: http://www.the-numbers.com/market/2014/top-grossing-movies

1. Getting to know the data

  1. Import the data named “movies.csv”
  2. View the data
  3. Look at column names
  4. Look at dimension of data (rows and columns)

2. Scatterplots

a. do scatter plot of Tickets Sold and Gross (Is the trend expected?)

# Raw scatter plot: tickets sold on the x-axis, gross sales on the y-axis,
# both in their original (unscaled) units.
library(graphics)
plot(x = movies$Tickets_Sold, y = movies$Gross_Sales)

b. redo scatter plot, adjusting scales, divide by 1000

# Same scatter plot with both axes expressed in thousands.
plot(
  x = movies$Tickets_Sold / 1000,
  y = movies$Gross_Sales / 1000
)

c. redo scatter plot, adjusting scales, divide by 100,000

# Same scatter plot with both axes expressed in hundreds of thousands.
plot(
  x = movies$Tickets_Sold / 1e5,
  y = movies$Gross_Sales / 1e5
)

d. redo scatter plot, adjusting scales, divide by 1,000,000

# Same scatter plot with both axes expressed in millions.
plot(
  x = movies$Tickets_Sold / 1e6,
  y = movies$Gross_Sales / 1e6
)

Refine the scatterplot

# Store both measures in millions as new columns so later plots can reuse them.
movies$Tickets_Sold_Millions <- movies$Tickets_Sold / 1e6
movies$Gross_Sales_Millions <- movies$Gross_Sales / 1e6

# Scatter plot of the rescaled columns with a title, axis labels, a custom
# point colour, and no box drawn around the plotting region.
plot(
  x = movies$Tickets_Sold_Millions,
  y = movies$Gross_Sales_Millions,
  main = "Number of tickets sold and gross ticket sales",
  xlab = "Number of tickets sold (in millions)",
  ylab = "Gross ticket sales (in millions of $USD)",
  col = "#4cbea3",
  frame.plot = FALSE
)

3. What is the correlation between tickets sold and sales? Is this expected?

# Pearson correlation (cor()'s default method) between tickets sold and
# gross sales, both in millions.
with(movies, cor(x = Tickets_Sold_Millions, y = Gross_Sales_Millions))
## [1] 1
# Note: There seems to be a perfect positive correlation between tickets sold and gross, which is to be expected.

4. Scatterplots with lines

a. do scatter plot with millions scale, add a regression line

b. add label to x and y axis, add plot title label

# Scatter plot in millions with a title and axis labels (4b), followed by a
# least-squares regression line (4a). The previous type = "b" merely joined
# the points in row order; it did not draw a fitted line.
plot(
  movies$Tickets_Sold_Millions,
  movies$Gross_Sales_Millions,
  main = "Number of tickets sold and gross ticket sales",
  xlab = "Number of tickets sold (in millions)",
  ylab = "Gross ticket sales (in millions of $USD)",
  frame.plot = FALSE,
  col = "#4cbea3",
  las = 2 # draw tick labels perpendicular to each axis
)
# Fit gross sales as a linear function of tickets sold and overlay the fit.
abline(
  lm(Gross_Sales_Millions ~ Tickets_Sold_Millions, data = movies),
  col = "#333333",
  lwd = 2
)

5. Other plots

a. do boxplot

# Vertical boxplot of gross sales (in millions).
boxplot(
  x = movies$Gross_Sales_Millions,
  main = "Top 50 Grossing Films of 2014 (in millions of $USD)",
  col = "#4cbea3",
  frame.plot = FALSE
)

b. do boxplot - horizontal

# Same boxplot of gross sales (in millions), drawn horizontally.
boxplot(
  x = movies$Gross_Sales_Millions,
  main = "Top 50 Grossing Films of 2014, Sales (in millions of USD $)",
  col = "#4cbea3",
  frame.plot = FALSE,
  horizontal = TRUE
)

c. do histogram for type of films

# hist() needs numeric input, so encode each genre as its integer factor code.
movies$Genre_factor <- as.factor(movies$Genre)
movies$Genre_numeric <- as.numeric(movies$Genre_factor)

# One bin per genre code: breaks sit at the half-integers around each code.
# With hist()'s default breaks on integer codes, the first bin is closed on
# both ends and therefore merges codes 1 and 2 into a single bar.
genre_levels <- levels(movies$Genre_factor)
hist(
  movies$Genre_numeric,
  breaks = seq(0.5, length(genre_levels) + 0.5, by = 1),
  main = "Frequencies of top 50 movies by genre",
  xlab = "Genre category",
  col = "#4cbea3",
  border = "#FFFFFF",
  labels = TRUE,
  xaxt = "n" # suppress the numeric axis; genre names are added below
)
# Label each bar with its genre name instead of the bare integer code.
axis(
  side = 1,
  at = seq_along(genre_levels),
  labels = genre_levels,
  cex.axis = 0.5
)

d. do histogram of gross sales. How are the bins shown by default?

# Histogram of gross sales (in millions) using hist()'s default binning.
hist(
  x = movies$Gross_Sales_Millions,
  main = "Frequencies of top 50 movies by gross sales in millions of USD$",
  xlab = "Gross Sales",
  border = "#FFFFFF",
  col = "#4cbea3"
)

e. do histogram of gross sales with 10 bins.

# Histogram of gross sales (in millions) asking for 10 bins. hist() treats a
# single number for `breaks` as a suggestion, so the drawn count may differ.
hist(
  x = movies$Gross_Sales_Millions,
  breaks = 10,
  main = "Frequencies of top 50 movies by gross sales in millions of USD$",
  xlab = "Gross Sales",
  border = "#FFFFFF",
  col = "#4cbea3"
)

f. do histogram of tickets sold. Try different bin numbers.

# Histogram of tickets sold (in millions) with the default binning.
hist(
  x = movies$Tickets_Sold_Millions,
  main = "Number of tickets sold for the top 50 grossing films of 2014",
  xlab = "Tickets sold (millions)",
  border = "#FFFFFF",
  col = "#4cbea3"
)

# Same histogram, this time requesting 15 bins.
hist(
  x = movies$Tickets_Sold_Millions,
  main = "Number of tickets sold for the top 50 grossing films of 2014",
  xlab = "Tickets sold (millions)",
  border = "#FFFFFF",
  col = "#4cbea3",
  breaks = 15
)

g. do histogram of ticket sales (use millions unit). Add frequency count to top of bars. Add titles.

# Histogram of tickets sold (in millions), 15 bins requested, with the
# frequency count printed above each bar (labels = TRUE).
hist(
  x = movies$Tickets_Sold_Millions,
  main = "Number of tickets sold for the top 50 grossing films of 2014",
  xlab = "Tickets sold (millions)",
  border = "#FFFFFF",
  col = "#4cbea3",
  breaks = 15,
  labels = TRUE
)

h. do barplot of genre

We need to plot the table of movie genres as proportions of the whole.

# Bar chart of each genre's share of the films: table() counts the genres
# and prop.table() turns the counts into proportions.
barplot(
  height = prop.table(table(movies$Genre)),
  border = "#FFFFFF",
  col = "#4cbea3"
)

Change the font size of the labels for the x-axis and add the label names

# Print the genre counts; the order shown is the order of the bars below.
table(movies$Genre)
## 
##            Action         Adventure      Black Comedy            Comedy 
##                 9                 9                 3                 7 
##             Drama            Horror   Romantic Comedy Thriller/Suspense 
##                16                 2                 2                 2
# Proportion bar chart with smaller, hand-written bar labels. The labels must
# follow the table order printed above. Fixed the misspelled
# "Thriler/Suspense" label.
barplot(
  prop.table(table(movies$Genre)),
  cex.names = 0.5,
  names.arg = c(
    "Action", "Adventure", "Black Comedy", "Comedy",
    "Drama", "Horror", "Rom Com", "Thriller/Suspense"
  ),
  col = "#4cbea3",
  border = "#FFFFFF"
)

Advanced demonstration

Scatterplot function with boxplots

# car::scatterplot() draws an enhanced scatter plot of tickets sold (x)
# against gross sales (y), both in millions.
library(car)
scatterplot(
  movies$Tickets_Sold_Millions,
  movies$Gross_Sales_Millions,
  main = "Gross X Tickets",
  xlab = "Tickets",
  ylab = "Gross",
  col = "#4cbea3"
)

ggvis scatterplot

# library() fails loudly if ggvis is missing; require() would only return
# FALSE and let the next line fail with a less helpful error.
library(ggvis)
# Refer to columns by bare name in the formulas: ggvis looks them up in the
# piped data frame, so repeating `movies$` is unnecessary and would leak into
# the axis titles.
# NOTE(review): fill = "Movie" is a constant string, not a column mapping; if
# the points should be coloured per film, this was probably meant to be
# fill = ~Movie -- confirm that the column exists before changing it.
movies %>%
  ggvis(x = ~Tickets_Sold_Millions, y = ~Gross_Sales_Millions) %>%
  layer_points(fill = "Movie")

ggplot - basic

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:ggvis':
## 
##     resolution
# Basic ggplot2 scatter plot. aes() takes bare column names that are looked up
# in `movies`; writing movies$col inside aes() is discouraged by ggplot2 and
# breaks as soon as the plot is faceted or given different data.
ggplot(movies, aes(x = Tickets_Sold_Millions, y = Gross_Sales_Millions)) +
  geom_point(colour = "#4cbea3")

ggplot - with formatting similar to the Economist

library(ggplot2)
library(ggthemes)
# Scatter plot styled with the Economist theme from ggthemes. aes() uses bare
# column names resolved in `movies` rather than movies$col, which ggplot2
# discourages.
ggplot(movies, aes(x = Tickets_Sold_Millions, y = Gross_Sales_Millions)) +
  geom_point() +
  theme_economist() +
  labs(
    title = "Tickets sold and gross earnings",
    subtitle = "sub title here",
    caption = "Data from the-numbers.com",
    x = "Tickets sold in millions",
    y = "Gross in millions"
  )

ggplot - with formatting similar to the Wall Street Journal

library(ggplot2)
library(ggthemes)
# Scatter plot styled like the Wall Street Journal. The previous version
# applied theme_fivethirtyeight(), which does not match this section's
# heading; ggthemes ships theme_wsj() for the WSJ look. aes() uses bare column
# names resolved in `movies` rather than movies$col.
ggplot(movies, aes(x = Tickets_Sold_Millions, y = Gross_Sales_Millions)) +
  geom_point() +
  labs(
    title = "Tickets sold and gross earnings",
    subtitle = "sub title here",
    caption = "Data from the-numbers.com",
    x = "Tickets sold in millions",
    y = "Gross in millions"
  ) +
  theme_wsj()

saving your last plot

# No plot is passed, so ggsave() writes the most recently displayed ggplot;
# the .pdf extension selects the output format.
ggsave(filename = "myspecialplot.pdf")
## Saving 7 x 5 in image

Taxi Example

Taxi subset

# Keep only trips with exactly 3 passengers and a distance of at least 20.
# subset() evaluates the condition inside `taxi`, so columns are referenced by
# bare name; a misspelled column now raises an error instead of silently
# producing an empty result.
taxisub <- subset(taxi, passenger_count == 3 & trip_distance >= 20)

Plot

# library() errors immediately if a package is missing; require() would only
# return FALSE and defer the failure to the plotting call.
library(ggplot2)
library(ggthemes)
# Fare against trip distance for the filtered trips, with the Excel-style
# theme from ggthemes. aes() uses bare column names resolved in `taxisub`.
ggplot(taxisub, aes(x = trip_distance, y = fare_amount)) +
  geom_point() +
  theme_excel()