This Mini Class uses the dataset ccm2, which was produced in the first assignment.
Bubble charts can be considered a variation of the scatter plot, in which the data points are replaced with bubbles.
Get tidyverse and the dataset loaded.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.3
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.3
## Warning: package 'purrr' was built under R version 3.3.3
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# Restore the saved workspace objects; this is expected to provide the `ccm`
# data frame used below.
# NOTE(review): the path is absolute and machine-specific.
load("C:/Users/senet/Desktop/CSC 463 Data Visualization Tools/R/Mini Class 4/ccm.rdata")

# Keep the 16 columns of interest and derive Profit as gross minus budget.
ccm2 <- ccm %>%
  select(
    num_critic_for_reviews,
    director_facebook_likes,
    actor_1_facebook_likes,
    gross,
    genres,
    movie_title,
    num_voted_users,
    cast_total_facebook_likes,
    num_user_for_reviews,
    language,
    country,
    content_rating,
    budget,
    title_year,
    imdb_score,
    movie_facebook_likes
  ) %>%
  mutate(Profit = gross - budget)

# Print the resulting tibble.
ccm2
## # A tibble: 3,752 × 17
## num_critic_for_reviews director_facebook_likes actor_1_facebook_likes
## <int> <int> <int>
## 1 723 0 1000
## 2 302 563 40000
## 3 602 0 11000
## 4 813 22000 27000
## 5 462 475 640
## 6 392 0 24000
## 7 324 15 799
## 8 635 0 26000
## 9 375 282 25000
## 10 673 0 15000
## # ... with 3,742 more rows, and 14 more variables: gross <int>,
## # genres <chr>, movie_title <chr>, num_voted_users <int>,
## # cast_total_facebook_likes <int>, num_user_for_reviews <int>,
## # language <chr>, country <chr>, content_rating <chr>, budget <int>,
## # title_year <int>, imdb_score <dbl>, movie_facebook_likes <int>,
## # Profit <int>
# Show the structure of ccm2: one line per column with its type and first values.
str(ccm2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3752 obs. of 17 variables:
## $ num_critic_for_reviews : int 723 302 602 813 462 392 324 635 375 673 ...
## $ director_facebook_likes : int 0 563 0 22000 475 0 15 0 282 0 ...
## $ actor_1_facebook_likes : int 1000 40000 11000 27000 640 24000 799 26000 25000 15000 ...
## $ gross : int 760505847 309404152 200074175 448130642 73058679 336530303 200807262 458991599 301956980 330249062 ...
## $ genres : chr "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
## $ movie_title : chr "Avatar\xe5\xca" "Pirates of the Caribbean: At World's End\xe5\xca" "Spectre\xe5\xca" "The Dark Knight Rises\xe5\xca" ...
## $ num_voted_users : int 886204 471220 275868 1144337 212204 383056 294810 462669 321795 371639 ...
## $ cast_total_facebook_likes: int 4834 48350 11700 106759 1873 46055 2036 92000 58753 24450 ...
## $ num_user_for_reviews : int 3054 1238 994 2701 738 1902 387 1117 973 3018 ...
## $ language : chr "English" "English" "English" "English" ...
## $ country : chr "USA" "USA" "UK" "USA" ...
## $ content_rating : chr "PG-13" "PG-13" "PG-13" "PG-13" ...
## $ budget : int 237000000 300000000 245000000 250000000 263700000 258000000 260000000 250000000 250000000 250000000 ...
## $ title_year : int 2009 2007 2015 2012 2012 2007 2010 2015 2009 2016 ...
## $ imdb_score : num 7.9 7.1 6.8 8.5 6.6 6.2 7.8 7.5 7.5 6.9 ...
## $ movie_facebook_likes : int 33000 0 85000 164000 24000 0 29000 118000 10000 197000 ...
## $ Profit : int 523505847 9404152 -44925825 198130642 -190641321 78530303 -59192738 208991599 51956980 80249062 ...
Filter the data down to four countries: Australia, Canada, China, and France.
# Keep only the movies produced in the four countries being compared.
ccm2_select <- ccm2 %>%
  filter(country %in% c("Australia", "Canada", "China", "France"))

# Preview the first six rows of the subset.
head(ccm2_select)
## # A tibble: 6 × 17
## num_critic_for_reviews director_facebook_likes actor_1_facebook_likes
## <int> <int> <int>
## 1 334 420 20000
## 2 490 1000 29000
## 3 245 0 309
## 4 739 750 27000
## 5 465 188 10000
## 6 156 750 49000
## # ... with 14 more variables: gross <int>, genres <chr>,
## # movie_title <chr>, num_voted_users <int>,
## # cast_total_facebook_likes <int>, num_user_for_reviews <int>,
## # language <chr>, country <chr>, content_rating <chr>, budget <int>,
## # title_year <int>, imdb_score <dbl>, movie_facebook_likes <int>,
## # Profit <int>
Compare two variables, budget on the x axis and gross on the y axis, using the geom_point function.
# Plain scatter plot: budget on the x axis against gross on the y axis.
First <- ggplot(data = ccm2_select, mapping = aes(x = budget, y = gross)) +
  geom_point() +
  labs(
    x = "BUDGET",
    y = "GROSS",
    title = "scatter plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM"
  )

# Render the scatter plot.
First
A scatter plot lets us compare the relationship between two continuous variables, whereas a bubble chart serves well if we also want to understand the relationship of a categorical variable and another continuous variable along with the two variables already used on the x and y axes.
In simpler words, bubble charts are more suitable if we have 4-Dimensional data where two of them are numeric (X and Y) and one other categorical (color) and another numeric variable (size).
# Bubble chart: budget (x) against gross (y), with bubble size mapped to
# movie_facebook_likes.
#
# The plot is built from a fresh ggplot() call instead of
# `First + geom_point(...)`. `First` already contains a geom_point() layer, so
# adding a second one would draw every movie twice: an opaque default-size dot
# underneath each bubble, which hides the effect of `alpha = 0.7`.
second <- ggplot(ccm2_select, aes(x = budget, y = gross)) +
  geom_point(aes(size = movie_facebook_likes), alpha = 0.7) +
  labs(
    title = "Bubble plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM",
    x = "BUDGET",
    y = "GROSS"
  )

# Render the bubble chart.
second
# Bubble chart with a fourth dimension: colour encodes country, bubble size
# encodes movie_facebook_likes.
#
# Built from a fresh ggplot() call rather than on top of `First`, which already
# has a geom_point() layer (each movie would otherwise be drawn twice) and a
# "scatter plot" title that does not describe this chart.
Third <- ggplot(ccm2_select, aes(x = budget, y = gross)) +
  geom_point(aes(color = country, size = movie_facebook_likes), alpha = 1) +
  labs(
    title = "Bubble plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM",
    x = "BUDGET",
    y = "GROSS"
  )

# Render the coloured bubble chart.
Third
# Bubble chart split into one panel per country; colour encodes country and
# bubble size encodes movie_facebook_likes.
#
# Built from a fresh ggplot() call rather than on top of `First`, which already
# has a geom_point() layer (each movie would otherwise be drawn twice) and a
# "scatter plot" title that does not describe this chart.
Fifth <- ggplot(ccm2_select, aes(x = budget, y = gross)) +
  geom_point(aes(color = country, size = movie_facebook_likes), alpha = 0.7) +
  # country is an unordered category, so use a qualitative palette; the
  # default sequential palette implies an ordering and its lightest shade is
  # hard to see.
  scale_color_brewer(palette = "Dark2") +
  facet_wrap(~country) +
  labs(
    title = "Bubble plot",
    subtitle = "Relation between variable budget and gross",
    caption = "Source: Moodle CCM",
    x = "BUDGET",
    y = "GROSS"
  )

# Render the faceted bubble chart.
Fifth
Since a movie's Facebook likes are unlikely to determine its profit, we can instead use Profit as the bubble size to see whether high-budget movies usually do better at the box office.
library(ggplot2)

# Subset to the four countries being compared.
ccm2_select <- ccm2[ccm2$country %in% c("Australia", "Canada", "China", "France"), ]

theme_set(theme_bw()) # pre-set the bw theme for the plots that follow.

# Base plot: budget on the x axis, gross on the y axis. The subtitle names the
# variables that are actually plotted; Profit is shown as bubble size.
g <- ggplot(ccm2_select, aes(budget, gross)) +
  labs(
    title = "Bubble chart",
    subtitle = "CCM2: Gross vs Budget (bubble size = Profit)"
  )

# Bubbles coloured by country and sized by Profit, with one linear
# line of best fit per country and no confidence band.
g +
  geom_point(aes(color = country, size = Profit)) +
  geom_smooth(aes(col = country), method = "lm", se = FALSE)

# Pearson correlation between budget and gross across the selected movies.
cor(ccm2_select$budget, ccm2_select$gross)
## [1] 0.2997182
The bubble chart clearly distinguishes the range of gross between the countries and how the slope of lines-of-best-fit varies, providing a better visual comparison between the groups.
For Individual Countries:
# One panel per country: semi-transparent bubbles sized by Profit, each with
# its own linear line of best fit. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
g +
  geom_point(aes(col = country, size = Profit), alpha = .3) +
  geom_smooth(aes(col = country), method = "lm", se = FALSE) +
  facet_wrap(~country)