library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
Data visualization workflow:
data (CSV, database, images) -> preprocessing -> EDA (exploratory data analysis) -> visualization / plotting
`mv` is a data frame with 4803 observations and 20 variables.
# Load the TMDB movie metadata; text columns are kept as plain character
# vectors so that only the columns chosen below become factors.
mv <- read.csv(
  "data_input/tmdb_5000_movies.csv",
  stringsAsFactors = FALSE
)
# str(mv)

# Treat the two categorical columns as factors.
mv$original_language <- as.factor(mv$original_language)
mv$status <- as.factor(mv$status)

# Parse the release date strings (year-month-day order) with lubridate.
mv[["release_date"]] <- ymd(mv[["release_date"]])
Start with a question: which 10 movies have the highest revenue? Then visualize the answer.
# Sort every movie by revenue (largest first), keeping only the two columns
# needed for the chart, then retain the first ten rows.
top10 <- mv[order(mv$revenue, decreasing = TRUE), c("title", "revenue")]
top10 <- head(top10, n = 10)
top10
## title revenue
## 1 Avatar 2787965087
## 26 Titanic 1845034188
## 17 The Avengers 1519557910
## 29 Jurassic World 1513528810
## 45 Furious 7 1506249360
## 8 Avengers: Age of Ultron 1405403694
## 125 Frozen 1274219009
## 32 Iron Man 3 1215439994
## 547 Minions 1156730962
## 27 Captain America: Civil War 1153304495
rm(mv) # the full table is no longer needed once top10 has been extracted

# Turn the titles into a factor ordered by revenue so that, after
# coord_flip(), the bars are sorted with the highest-grossing film on top.
top10$title <- reorder(top10$title, as.numeric(top10$revenue))

# Format a revenue in USD as billions, e.g. 2787965087 -> "2.79 B".
# This is used only for axis/legend labels: the revenue column itself must
# stay numeric, otherwise ggplot2 treats it as a discrete variable and draws
# evenly spaced bars whose lengths no longer reflect the actual amounts.
format_billions <- function(usd) {
  paste(format(round(usd / 1e9, 2), trim = TRUE), "B")
}

# Horizontal bar chart of the ten highest-revenue movies; bar length and fill
# both encode revenue on a continuous scale labelled in billions.
ggplot(top10, aes(title, revenue)) +
  geom_col(aes(fill = revenue)) +
  scale_y_continuous(labels = format_billions) +
  scale_fill_continuous(labels = format_billions) +
  coord_flip() +
  labs(
    x = "Movie Name",
    y = "Revenues in USD",
    title = "Top 10 Movie with Most Revenues All Time"
  )
This is my first time doing data visualization with a Kaggle dataset.
What did I learn?
geom_col() + coord_flip() + labs()