Mini Class 4

This Mini Class uses the dataset ccm2, which was produced in the first assignment.

Problem 1

Get tidyverse and the dataset loaded.

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.3.3

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Warning: package 'ggplot2' was built under R version 3.3.3

## Warning: package 'purrr' was built under R version 3.3.3

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

# Load the saved workspace (presumably the source of `ccm`, which is used
# right below), keep the columns needed for this class, and add Profit as
# gross minus budget.
load("C:/Users/senet/Desktop/CSC 463 Data Visualization Tools/R/Mini Class 4/ccm.rdata")
ccm2 <- mutate(
  select(
    ccm,
    num_critic_for_reviews,
    director_facebook_likes,
    actor_1_facebook_likes,
    gross,
    genres,
    movie_title,
    num_voted_users,
    cast_total_facebook_likes,
    num_user_for_reviews,
    language,
    country,
    content_rating,
    budget,
    title_year,
    imdb_score,
    movie_facebook_likes
  ),
  Profit = gross - budget
)
print(ccm2)

## # A tibble: 3,752 × 17
##    num_critic_for_reviews director_facebook_likes actor_1_facebook_likes
##                     <int>                   <int>                  <int>
## 1                     723                       0                   1000
## 2                     302                     563                  40000
## 3                     602                       0                  11000
## 4                     813                   22000                  27000
## 5                     462                     475                    640
## 6                     392                       0                  24000
## 7                     324                      15                    799
## 8                     635                       0                  26000
## 9                     375                     282                  25000
## 10                    673                       0                  15000
## # ... with 3,742 more rows, and 14 more variables: gross <int>,
## #   genres <chr>, movie_title <chr>, num_voted_users <int>,
## #   cast_total_facebook_likes <int>, num_user_for_reviews <int>,
## #   language <chr>, country <chr>, content_rating <chr>, budget <int>,
## #   title_year <int>, imdb_score <dbl>, movie_facebook_likes <int>,
## #   Profit <int>

# Show the type and the first few values of every column in ccm2.
str(ccm2)

## Classes 'tbl_df', 'tbl' and 'data.frame':    3752 obs. of  17 variables:
##  $ num_critic_for_reviews   : int  723 302 602 813 462 392 324 635 375 673 ...
##  $ director_facebook_likes  : int  0 563 0 22000 475 0 15 0 282 0 ...
##  $ actor_1_facebook_likes   : int  1000 40000 11000 27000 640 24000 799 26000 25000 15000 ...
##  $ gross                    : int  760505847 309404152 200074175 448130642 73058679 336530303 200807262 458991599 301956980 330249062 ...
##  $ genres                   : chr  "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
##  $ movie_title              : chr  "Avatar\xe5\xca" "Pirates of the Caribbean: At World's End\xe5\xca" "Spectre\xe5\xca" "The Dark Knight Rises\xe5\xca" ...
##  $ num_voted_users          : int  886204 471220 275868 1144337 212204 383056 294810 462669 321795 371639 ...
##  $ cast_total_facebook_likes: int  4834 48350 11700 106759 1873 46055 2036 92000 58753 24450 ...
##  $ num_user_for_reviews     : int  3054 1238 994 2701 738 1902 387 1117 973 3018 ...
##  $ language                 : chr  "English" "English" "English" "English" ...
##  $ country                  : chr  "USA" "USA" "UK" "USA" ...
##  $ content_rating           : chr  "PG-13" "PG-13" "PG-13" "PG-13" ...
##  $ budget                   : int  237000000 300000000 245000000 250000000 263700000 258000000 260000000 250000000 250000000 250000000 ...
##  $ title_year               : int  2009 2007 2015 2012 2012 2007 2010 2015 2009 2016 ...
##  $ imdb_score               : num  7.9 7.1 6.8 8.5 6.6 6.2 7.8 7.5 7.5 6.9 ...
##  $ movie_facebook_likes     : int  33000 0 85000 164000 24000 0 29000 118000 10000 197000 ...
##  $ Profit                   : int  523505847 9404152 -44925825 198130642 -190641321 78530303 -59192738 208991599 51956980 80249062 ...

Filter the data to four countries: Australia, Canada, China, and France.

# Keep only the movies whose country is one of the four being compared,
# then preview the first rows of the result.
ccm2_select <- ccm2 %>%
  dplyr::filter(country %in% c("Australia", "Canada", "China", "France"))
head(ccm2_select)

## # A tibble: 6 × 17
##   num_critic_for_reviews director_facebook_likes actor_1_facebook_likes
##                    <int>                   <int>                  <int>
## 1                    334                     420                  20000
## 2                    490                    1000                  29000
## 3                    245                       0                    309
## 4                    739                     750                  27000
## 5                    465                     188                  10000
## 6                    156                     750                  49000
## # ... with 14 more variables: gross <int>, genres <chr>,
## #   movie_title <chr>, num_voted_users <int>,
## #   cast_total_facebook_likes <int>, num_user_for_reviews <int>,
## #   language <chr>, country <chr>, content_rating <chr>, budget <int>,
## #   title_year <int>, imdb_score <dbl>, movie_facebook_likes <int>,
## #   Profit <int>

Compare two variables, budget on the x axis and gross on the y axis, using the geom_point function.

# Base scatter plot of the selected countries: budget on x, gross on y.
First <- ggplot(data = ccm2_select, mapping = aes(x = budget, y = gross)) +
  geom_point() +
  labs(
    title = "scatter plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM",
    x = "BUDGET",
    y = "GROSS"
  )
First

A scatterplot lets us compare the relationship between two continuous variables. A bubble chart serves well if we also want to understand how a categorical variable and another continuous variable relate to the two variables already used on the x and y axes.

In simpler words, bubble charts are more suitable when we have 4-dimensional data: two numeric variables (X and Y), one categorical variable (color), and one more numeric variable (size).

# Bubble plot: budget vs gross, with bubble size mapped to movie_facebook_likes.
#
# Built from a fresh ggplot() instead of `First + geom_point(...)`: `First`
# already contains a geom_point() layer, so adding a second one drew every
# movie twice. The extra fixed-size opaque dot showed through each
# semi-transparent bubble and hid the smallest ones.
second <- ggplot(ccm2_select, aes(x = budget, y = gross)) +
  geom_point(aes(size = movie_facebook_likes), alpha = 0.7) +
  labs(
    title = "Bubble plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM",
    x = "BUDGET",
    y = "GROSS"
  )
second

# Bubble plot with colour mapped to country and bubble size mapped to
# movie_facebook_likes.
#
# Built from a fresh ggplot() instead of `First + geom_point(...)`: `First`
# already contains a geom_point() layer, so every movie was drawn twice and
# the extra black dot covered the country colour of the small bubbles.
# The labels are set here explicitly; the title reads "Bubble plot" (as in
# `second`) rather than the "scatter plot" title inherited from `First`.
Third <- ggplot(ccm2_select, aes(x = budget, y = gross)) +
  geom_point(aes(color = country, size = movie_facebook_likes), alpha = 1) +
  labs(
    title = "Bubble plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM",
    x = "BUDGET",
    y = "GROSS"
  )
Third

# Bubble plot coloured by country and sized by movie_facebook_likes, split
# into one panel per country.
#
# Built from a fresh ggplot() instead of `First + geom_point(...)`: `First`
# already contains a geom_point() layer, so every movie was drawn twice with
# a fixed-size black dot under each bubble.
# `country` is a nominal variable, so a qualitative Brewer palette is used;
# the scale_color_brewer() default is a sequential palette whose lightest
# step is hard to see and which suggests an ordering between countries.
Fifth <- ggplot(ccm2_select, aes(x = budget, y = gross)) +
  geom_point(aes(color = country, size = movie_facebook_likes), alpha = 0.7) +
  scale_color_brewer(palette = "Set1") +
  facet_wrap(~country) +
  labs(
    title = "Bubble plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM",
    x = "BUDGET",
    y = "GROSS"
  )
Fifth

Since a movie's Facebook likes are unlikely to determine its profit, we can use the profit instead to see whether high-budget movies usually do better at the box office.

library(ggplot2)
# Rebuild the four-country subset used by the plots below.
ccm2_select <- dplyr::filter(
  ccm2,
  country %in% c("Australia", "Canada", "China", "France")
)



# Scatterplot
# Make the black-and-white theme the default for every plot drawn from here on.
theme_set(theme_bw())

# Shared base plot (budget on x, gross on y) with no layers yet; the point and
# smoothing layers are added where `g` is used.
# The subtitle names the plotted axes: the chart shows gross against budget,
# and Profit only enters as the bubble size, so "Profit vs Budget" mislabelled
# the axes.
g <- ggplot(ccm2_select, aes(x = budget, y = gross)) +
  labs(
    title = "Bubble chart",
    subtitle = "CCM2: Gross vs Budget (bubble size = Profit)"
  )


# Bubbles coloured by country and sized by Profit, plus one linear-model fit
# line per country drawn without its confidence band.
# `se = FALSE` is spelled out: `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
g +
  geom_point(aes(color = country, size = Profit)) +
  geom_smooth(aes(color = country), method = "lm", se = FALSE)

# Correlation between budget and gross across the four selected countries
# (cor() with its default method).
with(ccm2_select, cor(budget, gross))

## [1] 0.2997182

The bubble chart clearly distinguishes the range of gross between the countries and how the slope of lines-of-best-fit varies, providing a better visual comparison between the groups.

For Individual Countries:

# Same bubble chart split into one panel per country; the bubbles are made
# mostly transparent (alpha = .3) so overlapping movies stay distinguishable.
# `se = FALSE` is spelled out: `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
g +
  geom_point(aes(color = country, size = Profit), alpha = .3) +
  geom_smooth(aes(color = country), method = "lm", se = FALSE) +
  facet_wrap(~country)

Mini Class 4

10/5/2017

Bubble Chart about Movies made in Different Countries.

Problem 1