#install.packages("ggplot2movies")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2movies)
# Load both example datasets with a single call: `diamonds` is the first
# dataset used below, `movies` the second.
data(list = c("diamonds", "movies"))

## diamonds ##
## ALWAYS get to know your data first ##
# Show the first six rows.
diamonds %>% head(n = 6L)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Compact structure listing: each column's type and its first few values.
diamonds %>% str()
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Column-by-column summary of the diamonds data.
diamonds %>% summary()
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
## 1-1 Number of diamonds for each cut
## 1-1-1 (geom_bar)
# Only x is mapped; geom_bar() does the counting per `cut` level.
diamonds %>%
  ggplot(mapping = aes(x = cut)) +
  geom_bar()

## Swap the X and Y axes ##
diamonds %>%
  ggplot(mapping = aes(x = cut)) +
  geom_bar() +
  coord_flip()

# Keep the basic bar chart in `ggp` so it can be reused, then print it.
ggp <- diamonds %>%
  ggplot(mapping = aes(x = cut)) +
  geom_bar()
ggp

## Axis labels, title, subtitle and caption ##
# Every text annotation is supplied through one labs() call; the labelled
# plot is stored in `ggp1` and then printed.
ggp1 <- ggp +
  labs(
    x = "Cut",
    y = "Count",
    title = "Hello ggplot",
    subtitle = "This is subtitle",
    caption = "Caption is here"
  )
ggp1

# Fill the bars with "snow" white (fill) and outline them in black (color).
# The styled geom_bar() goes on a fresh base plot: `ggp` and `ggp1` already
# contain a default geom_bar() layer, so adding another geom_bar() to them
# would draw every bar twice, with the default bars hidden underneath.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar(fill = "snow", color = "black")

# The same styled bars, carrying the axis labels, title, subtitle and
# caption that `ggp1` uses.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar(fill = "snow", color = "black") +
  labs(
    x = "Cut",
    y = "Count",
    title = "Hello ggplot",
    subtitle = "This is subtitle",
    caption = "Caption is here"
  )

# 1-1-2. (geom_col) We can do a little computation up front
# (geom_col plots the counts as is).
# Tabulate `cut` with base R; naming the table() argument labels the
# category column "Cut", and the counts go in "Freq".
diamonds_precounted <- as.data.frame(table(Cut = diamonds$cut))
diamonds_precounted
## Cut Freq
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
# dplyr version of the pre-count: one row per `cut`, with the number of
# rows in `freq`. This reassigns `diamonds_precounted`.
diamonds_precounted <- diamonds %>%
  group_by(cut) %>%
  summarise(freq = n())

# geom_col() takes the bar heights directly from the `freq` column.
ggp2 <- diamonds_precounted %>%
  ggplot(mapping = aes(x = cut, y = freq)) +
  geom_col()
ggp2

# 1-2. About stat = "identity"
# A. Rows should be unique: otherwise the values will be summed up.
# B. Missing labels are present by default: this differs from stat = "bin".
# C. Negative bars are allowed.
# Print the first five rows, which the next two plots use.
head(diamonds, n = 5L)
## # A tibble: 5 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
# Bars for the first five rows, with heights read from `depth` rather than
# counted: stat = "identity" makes geom_bar() use the y values as given.
diamonds[1:5, ] %>%
  ggplot(mapping = aes(x = cut, y = depth)) +
  geom_bar(stat = "identity")

# The same data and mapping drawn with geom_col().
diamonds[1:5, ] %>%
  ggplot(mapping = aes(x = cut, y = depth)) +
  geom_col()

# 1-3. Stack grouping
# Q: number of diamonds of each color
diamonds %>%
  ggplot(mapping = aes(x = color)) +
  geom_bar()

# Q: number of diamonds of each color, filled by cut, shown stacked
diamonds %>%
  ggplot(mapping = aes(x = color, fill = cut)) +
  geom_bar()

# Same mapping with the stacking position spelled out.
diamonds %>%
  ggplot(mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "stack")

# Q: number of diamonds of each color, filled by cut, with the cuts drawn
# side by side (dodge)
diamonds %>%
  ggplot(mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "dodge")

# Q: within each color, the share taken by each cut
diamonds %>%
  ggplot(mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "fill")

# 1-4. From bar to histogram (when x is numeric)
# 1-4-1 Bar plot of diamond price
diamonds %>%
  ggplot(mapping = aes(x = price)) +
  geom_bar()

# 1-4-2 Bar plot of diamond price, with the bars filled by cut
diamonds %>%
  ggplot(mapping = aes(x = price, fill = cut)) +
  geom_bar()

# From histogram to density plot
# 1-4-3 Density plot of diamond price, filled by cut
diamonds %>%
  ggplot(mapping = aes(x = price, fill = cut)) +
  geom_density()

# 1-4-4 Density plot of diamond price, filled by cut, plus transparency so
# that every density curve is visible
# First with position = "identity" stated explicitly ...
diamonds %>%
  ggplot(mapping = aes(x = price, fill = cut)) +
  geom_density(position = "identity")

# ... then with half-transparent fills (alpha = 0.5).
diamonds %>%
  ggplot(mapping = aes(x = price, fill = cut)) +
  geom_density(alpha = 0.5)

## Q: bar, hist and density take four positions -- 'stack', 'dodge',
## 'fill', 'identity'. How do they differ? ##
# position = "stack"
diamonds %>%
  ggplot(mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "stack")

# position = "dodge"
diamonds %>%
  ggplot(mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "dodge")

# position = "fill"
diamonds %>%
  ggplot(mapping = aes(x = color, fill = cut)) +
  geom_bar(position = "fill")

# position = "identity" (this example maps x to price, not color)
diamonds %>%
  ggplot(mapping = aes(x = price, fill = cut)) +
  geom_bar(position = "identity")

# Scatter Plot
# Basic overview of the movies data
## ALWAYS get to know your data first ##
# Show the first six rows.
movies %>% head(n = 6L)
## # A tibble: 6 × 24
## title year length budget rating votes r1 r2 r3 r4 r5 r6
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5 4.5 14.5 24.5
## 2 $1000 a … 1939 71 NA 6 20 0 14.5 4.5 24.5 14.5 14.5
## 3 $21 a Da… 1941 7 NA 8.2 5 0 0 0 0 0 24.5
## 4 $40,000 1996 70 NA 8.2 6 14.5 0 0 0 0 0
## 5 $50,000 … 1975 71 NA 3.4 17 24.5 4.5 0 14.5 14.5 4.5
## 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5 14.5 14.5 14.5
## # ℹ 12 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## # Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
# Compact structure listing: each column's type and its first few values.
movies %>% str()
## tibble [58,788 × 24] (S3: tbl_df/tbl/data.frame)
## $ title : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
## $ year : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
## $ length : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
## $ budget : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
## $ rating : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
## $ votes : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
## $ r1 : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r2 : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
## $ r3 : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
## $ r4 : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
## $ r5 : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
## $ r6 : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
## $ r7 : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
## $ r8 : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
## $ r9 : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
## $ r10 : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
## $ mpaa : chr [1:58788] "" "" "" "" ...
## $ Action : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
## $ Animation : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
## $ Comedy : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
## $ Drama : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
## $ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
## $ Romance : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
## $ Short : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...
# Column-by-column summary of the movies data.
movies %>% summary()
## title year length budget
## Length:58788 Min. :1893 Min. : 1.00 Min. : 0
## Class :character 1st Qu.:1958 1st Qu.: 74.00 1st Qu.: 250000
## Mode :character Median :1983 Median : 90.00 Median : 3000000
## Mean :1976 Mean : 82.34 Mean : 13412513
## 3rd Qu.:1997 3rd Qu.: 100.00 3rd Qu.: 15000000
## Max. :2005 Max. :5220.00 Max. :200000000
## NA's :53573
## rating votes r1 r2
## Min. : 1.000 Min. : 5.0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 5.000 1st Qu.: 11.0 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 6.100 Median : 30.0 Median : 4.500 Median : 4.500
## Mean : 5.933 Mean : 632.1 Mean : 7.014 Mean : 4.022
## 3rd Qu.: 7.000 3rd Qu.: 112.0 3rd Qu.: 4.500 3rd Qu.: 4.500
## Max. :10.000 Max. :157608.0 Max. :100.000 Max. :84.500
##
## r3 r4 r5 r6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 4.500 1st Qu.: 4.50
## Median : 4.500 Median : 4.500 Median : 4.500 Median :14.50
## Mean : 4.721 Mean : 6.375 Mean : 9.797 Mean :13.04
## 3rd Qu.: 4.500 3rd Qu.: 4.500 3rd Qu.: 14.500 3rd Qu.:14.50
## Max. :84.500 Max. :100.000 Max. :100.000 Max. :84.50
##
## r7 r8 r9 r10
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.50 1st Qu.: 4.50 1st Qu.: 4.500 1st Qu.: 4.50
## Median : 14.50 Median : 14.50 Median : 4.500 Median : 14.50
## Mean : 15.55 Mean : 13.88 Mean : 8.954 Mean : 16.85
## 3rd Qu.: 24.50 3rd Qu.: 24.50 3rd Qu.: 14.500 3rd Qu.: 24.50
## Max. :100.00 Max. :100.00 Max. :100.000 Max. :100.00
##
## mpaa Action Animation Comedy
## Length:58788 Min. :0.00000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.07974 Mean :0.06277 Mean :0.2938
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## Drama Documentary Romance Short
## Min. :0.000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.371 Mean :0.05906 Mean :0.0807 Mean :0.1609
## 3rd Qu.:1.000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## Q: relationship between budget and rating
# Uses the full movies table, including rows with no budget.
movies %>%
  ggplot(mapping = aes(x = budget, y = rating)) +
  geom_point()
## Warning: Removed 53573 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Remove the rows whose budget is NA
# `movies1` keeps only the movies with a known budget; the later scatter
# plots use it.
movies1 <- movies[!is.na(movies$budget), ]

# Plain scatter plot of the filtered data.
ggplot(movies1, aes(x = budget, y = rating)) +
  geom_point()

# 2-1. Control how the points look: shape and size
# The shape/size plot belongs under this heading, so it is placed here and
# the plain scatter plot sits directly after the NA removal.
ggplot(movies1, aes(x = budget, y = rating)) +
  geom_point(shape = 5, size = 3)

# 2-2 Grouping
# 2-2-1. Grouping: action movie or not (Action)
# Action mapped as is (it is an integer 0/1 column).
ggplot(movies1, aes(x = budget, y = rating, color = Action)) +
  geom_point()

# Action converted to a factor so that it is treated as a category; the
# legend title is set explicitly.
ggplot(movies1, aes(x = budget, y = rating, color = factor(Action))) +
  geom_point() +
  labs(color = "Action Movie?")

# 2-2-2. Multi-grouping: action movie or not (Action), and whether the
# running time exceeds 120 minutes
# The plot with two grouping aesthetics (color and shape) belongs under
# this heading, so it is placed here, not under 2-2-1.
ggplot(
  movies1,
  aes(
    x = budget,
    y = rating,
    color = factor(Action),
    shape = (length > 120)
  )
) +
  geom_point(size = 3) +
  labs(color = "Action Movie?")

# 2-3. Regression line
# Scatter plot with a linear-model fit and a 95% confidence band.
movies1 %>%
  ggplot(mapping = aes(x = budget, y = rating)) +
  geom_point() +
  stat_smooth(method = "lm", level = 0.95)
## `geom_smooth()` using formula = 'y ~ x'

# 2-3-1. Add a regression line
# No method is given here, so ggplot2 picks the smoother and reports its
# choice in a message.
movies1 %>%
  ggplot(mapping = aes(x = budget, y = rating)) +
  geom_point() +
  stat_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# 2-3-2. Fitting by group
# Color is mapped to factor(Action) in the plot-level aes, which both the
# points and the fit inherit; se = FALSE turns off the confidence band.
movies1 %>%
  ggplot(mapping = aes(x = budget, y = rating, color = factor(Action))) +
  geom_point() +
  labs(color = "Action Movie?") +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Same fit with a second grouping aesthetic: shape is mapped to whether
# the running time exceeds 120 minutes.
movies1 %>%
  ggplot(
    mapping = aes(
      x = budget,
      y = rating,
      color = factor(Action),
      shape = (length > 120)
    )
  ) +
  geom_point() +
  labs(color = "Action Movie?") +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

#### mtcars dataset ####
# Open the documentation page for the mtcars dataset.
help("mtcars")
## 開啟 httpd 求助伺服器… 好了
# Overview of mtcars: first six rows
mtcars %>% head(n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Compact structure listing: each column's type and its first few values.
mtcars %>% str()
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Column-by-column summary of mtcars.
mtcars %>% summary()
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# mpg vs hp => scatter plot
# Stored in `p1` so that statistics and facets can be added to it later.
p1 <- mtcars %>%
  ggplot(mapping = aes(x = mpg, y = hp)) +
  geom_point()
p1

## Add statistics (smooth, quantile)
# Smoothed fit on top of the scatter plot; no method is given, so ggplot2
# picks one.
p1 +
  stat_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Quantile lines on top of the scatter plot.
p1 +
  stat_quantile()
## Smoothing formula not specified. Using: y ~ x

## Add facets (by am, by cyl, or by both)
# One panel per value of am.
p1 +
  facet_wrap(facets = ~ am)

# One panel per value of cyl.
p1 +
  facet_wrap(facets = ~ cyl)

# Grid of panels: am in rows, cyl in columns.
p1 +
  facet_grid(am ~ cyl)

# cyl in rows only, with a smoothed fit drawn in each panel.
p1 +
  geom_smooth() +
  facet_grid(cyl ~ .)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# cyl vs mpg => boxplot
# cyl is numeric in mtcars, so it is wrapped in factor() to get one box
# per cylinder count.
mtcars %>%
  ggplot(mapping = aes(x = factor(cyl), y = mpg)) +
  geom_boxplot()

# Fill the boxes by "cyl"
mtcars %>%
  ggplot(mapping = aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_boxplot()

# Flip the coordinates
mtcars %>%
  ggplot(mapping = aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_boxplot() +
  coord_flip()
