# 1108.R

#install.packages("ggplot2movies")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2movies) 
# Load both example datasets into the workspace in one call:
# diamonds is the first dataset used below, movies the second.
data(list = c("diamonds", "movies"))

## ---- diamonds ----
## ALWAYS get to know your data first
# Print the first six rows.
head(diamonds, n = 6L)

## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

# Compact overview of every column: its type and its first few values.
str(object = diamonds)

## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

# Per-column summary: quantiles for numeric columns, level counts for factors.
summary(object = diamonds)

##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
##

## 1-1 Number of diamonds for each cut
## 1-1-1 geom_bar(): counts the rows of each category for us
# Layer-free base plot. Every bar chart in this section adds exactly one
# geom_bar() layer to it, so no chart ends up with two bar layers.
cut_base <- ggplot(data = diamonds, aes(x = cut))

cut_base +
  geom_bar()

## Swap the x and y axes ##
cut_base +
  geom_bar() +
  coord_flip()

ggp <- cut_base +
  geom_bar()
ggp

## Axis labels, title, subtitle and caption ##
cut_labels <- labs(
  x = "Cut",
  y = "Count",
  title = "Hello ggplot",
  subtitle = "This is subtitle",
  caption = "Caption is here"
)
ggp1 <- ggp + cut_labels
ggp1

# Bars filled with "snow" and outlined in black. These start from cut_base
# rather than from ggp / ggp1: those already carry a default geom_bar() layer,
# and adding a second one would compute the counts twice and paint the styled
# bars on top of hidden grey ones.
cut_base +
  geom_bar(fill = "snow",
           color = "black")

cut_base +
  geom_bar(fill = "snow",
           color = "black") +
  cut_labels

# 1-1-2 geom_col(): do a little counting up front, then plot the counts as is.
# Naming the table() argument labels the category column "Cut"; the counts
# land in "Freq".
diamonds_precounted <- as.data.frame(
  table(Cut = diamonds$cut)
)
diamonds_precounted

##         Cut  Freq
## 1      Fair  1610
## 2      Good  4906
## 3 Very Good 12082
## 4   Premium 13791
## 5     Ideal 21551

# Same tally with dplyr: one row per cut, with the row count stored in freq.
diamonds_precounted <- diamonds %>%
  count(cut, name = "freq")

# geom_col() uses freq directly as the bar height.
ggp2 <- diamonds_precounted %>%
  ggplot(aes(x = cut, y = freq)) +
  geom_col()
ggp2

# 1-2 Notes on stat = "identity"
# A. Rows should be unique; otherwise the values of duplicated rows are summed up.
# B. Missing labels are present by default, which differs from stat = "bin".
# C. Negative bars are allowed.

# First five rows of diamonds, used for the next two plots.
head(diamonds, n = 5L)

## # A tibble: 5 × 10
##   carat cut     color clarity depth table price     x     y     z
##   <dbl> <ord>   <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal   E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good    E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good    J     SI2      63.3    58   335  4.34  4.35  2.75

# Bar heights taken directly from depth for the first five diamonds,
# written with geom_bar(stat = "identity") ...
diamonds %>%
  head(n = 5L) %>%
  ggplot(aes(x = cut, y = depth)) +
  geom_bar(stat = "identity")

# ... and the same data drawn with geom_col().
diamonds %>%
  head(n = 5L) %>%
  ggplot(aes(x = cut, y = depth)) +
  geom_col()

# 1-3 Stacked and grouped bars
# Q: how many diamonds are there of each color?
diamonds %>%
  ggplot(aes(x = color)) +
  geom_bar()

# Q: number of diamonds per color, filled by cut and shown stacked
diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar()

diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar(position = "stack")

# Q: number of diamonds per color, filled by cut, with the cuts side by side (dodge)
diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar(position = "dodge")

# Q: within each color, the share taken by each cut
diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar(position = "fill")

# 1-4 From bar chart to histogram (when x is numeric)
# 1-4-1 Bar chart of diamond price
diamonds %>%
  ggplot(aes(x = price)) +
  geom_bar()

# 1-4-2 Bar chart of diamond price, bars filled by cut
diamonds %>%
  ggplot(aes(x = price, fill = cut)) +
  geom_bar()

# From histogram to density plot
# 1-4-3 Density plot of diamond price, filled by cut
diamonds %>%
  ggplot(aes(x = price, fill = cut)) +
  geom_density()

# 1-4-4 Density plot of diamond price filled by cut, plus a transparency
#       setting so that every density curve stays visible
diamonds %>%
  ggplot(aes(x = price, fill = cut)) +
  geom_density(position = "identity")

diamonds %>%
  ggplot(aes(x = price, fill = cut)) +
  geom_density(alpha = 0.5)

## Q: bar, histogram and density layers accept four positions --
##    "stack", "dodge", "fill" and "identity". How do they differ? ##
diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar(position = "stack")

diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar(position = "dodge")

diamonds %>%
  ggplot(aes(x = color, fill = cut)) +
  geom_bar(position = "fill")

# The "identity" example is drawn over price rather than color.
diamonds %>%
  ggplot(aes(x = price, fill = cut)) +
  geom_bar(position = "identity")

# ---- Scatter plot ----
# Basic overview of the movies data
## ALWAYS get to know your data first
# Print the first six rows.
head(movies, n = 6L)

## # A tibble: 6 × 24
##   title      year length budget rating votes    r1    r2    r3    r4    r5    r6
##   <chr>     <int>  <int>  <int>  <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 $          1971    121     NA    6.4   348   4.5   4.5   4.5   4.5  14.5  24.5
## 2 $1000 a …  1939     71     NA    6      20   0    14.5   4.5  24.5  14.5  14.5
## 3 $21 a Da…  1941      7     NA    8.2     5   0     0     0     0     0    24.5
## 4 $40,000    1996     70     NA    8.2     6  14.5   0     0     0     0     0  
## 5 $50,000 …  1975     71     NA    3.4    17  24.5   4.5   0    14.5  14.5   4.5
## 6 $pent      2000     91     NA    4.3    45   4.5   4.5   4.5  14.5  14.5  14.5
## # ℹ 12 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## #   Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

# Compact overview of every column: its type and its first few values.
str(object = movies)

## tibble [58,788 × 24] (S3: tbl_df/tbl/data.frame)
##  $ title      : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
##  $ year       : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
##  $ length     : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
##  $ budget     : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
##  $ rating     : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
##  $ votes      : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
##  $ r1         : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r2         : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
##  $ r3         : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
##  $ r4         : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
##  $ r5         : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
##  $ r6         : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
##  $ r7         : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
##  $ r8         : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
##  $ r9         : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
##  $ r10        : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
##  $ mpaa       : chr [1:58788] "" "" "" "" ...
##  $ Action     : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
##  $ Animation  : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
##  $ Comedy     : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
##  $ Drama      : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
##  $ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Romance    : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Short      : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...

# Per-column summary; the budget column also reports its number of NA's.
summary(object = movies)

##     title                year          length            budget         
##  Length:58788       Min.   :1893   Min.   :   1.00   Min.   :        0  
##  Class :character   1st Qu.:1958   1st Qu.:  74.00   1st Qu.:   250000  
##  Mode  :character   Median :1983   Median :  90.00   Median :  3000000  
##                     Mean   :1976   Mean   :  82.34   Mean   : 13412513  
##                     3rd Qu.:1997   3rd Qu.: 100.00   3rd Qu.: 15000000  
##                     Max.   :2005   Max.   :5220.00   Max.   :200000000  
##                                                      NA's   :53573      
##      rating           votes                r1                r2        
##  Min.   : 1.000   Min.   :     5.0   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.: 5.000   1st Qu.:    11.0   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median : 6.100   Median :    30.0   Median :  4.500   Median : 4.500  
##  Mean   : 5.933   Mean   :   632.1   Mean   :  7.014   Mean   : 4.022  
##  3rd Qu.: 7.000   3rd Qu.:   112.0   3rd Qu.:  4.500   3rd Qu.: 4.500  
##  Max.   :10.000   Max.   :157608.0   Max.   :100.000   Max.   :84.500  
##                                                                        
##        r3               r4                r5                r6       
##  Min.   : 0.000   Min.   :  0.000   Min.   :  0.000   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.:  0.000   1st Qu.:  4.500   1st Qu.: 4.50  
##  Median : 4.500   Median :  4.500   Median :  4.500   Median :14.50  
##  Mean   : 4.721   Mean   :  6.375   Mean   :  9.797   Mean   :13.04  
##  3rd Qu.: 4.500   3rd Qu.:  4.500   3rd Qu.: 14.500   3rd Qu.:14.50  
##  Max.   :84.500   Max.   :100.000   Max.   :100.000   Max.   :84.50  
##                                                                      
##        r7               r8               r9               r10        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.:  4.50   1st Qu.:  4.50   1st Qu.:  4.500   1st Qu.:  4.50  
##  Median : 14.50   Median : 14.50   Median :  4.500   Median : 14.50  
##  Mean   : 15.55   Mean   : 13.88   Mean   :  8.954   Mean   : 16.85  
##  3rd Qu.: 24.50   3rd Qu.: 24.50   3rd Qu.: 14.500   3rd Qu.: 24.50  
##  Max.   :100.00   Max.   :100.00   Max.   :100.000   Max.   :100.00  
##                                                                      
##      mpaa               Action          Animation           Comedy      
##  Length:58788       Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  Class :character   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Mode  :character   Median :0.00000   Median :0.00000   Median :0.0000  
##                     Mean   :0.07974   Mean   :0.06277   Mean   :0.2938  
##                     3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##                     Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##                                                                         
##      Drama        Documentary         Romance           Short       
##  Min.   :0.000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.371   Mean   :0.05906   Mean   :0.0807   Mean   :0.1609  
##  3rd Qu.:1.000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##

## Q: how are budget and rating related?
# Plotted on the full movies table, including the rows with no budget.
movies %>%
  ggplot(aes(x = budget, y = rating)) +
  geom_point()

## Warning: Removed 53573 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Drop the movies whose budget is missing (NA)
movies1 <- movies %>%
  filter(!is.na(budget))

# Same scatter plot as before, now on the filtered data.
ggplot(movies1, aes(x = budget,
                    y = rating)) +
  geom_point()

# 2-1 Controlling how points look: shape and size
ggplot(movies1, aes(x = budget,
                    y = rating)) +
  geom_point(shape = 5,
             size = 3)

# 2-2 Grouping
# 2-2-1 Grouping: action movie or not (Action)
# Action is an integer 0/1 column, so mapping it as is colors the points on
# a continuous scale ...
ggplot(movies1, aes(x = budget,
                    y = rating,
                    color = Action)) +
  geom_point()

# ... while factor(Action) turns it into two discrete color groups.
ggplot(movies1, aes(x = budget,
                    y = rating,
                    color = factor(Action))) +
  geom_point() +
  labs(color = "Action Movie?")

# 2-2-2 Multi-grouping: action movie or not (color) combined with
#       whether the movie is longer than 120 minutes (shape)
ggplot(movies1, aes(x = budget,
                    y = rating,
                    color = factor(Action),
                    shape = (length > 120))) +
  geom_point(size = 3) +
  labs(color = "Action Movie?")

# 2-3 Regression line
# Linear model fit with a 95% confidence band drawn over the points.
movies1 %>%
  ggplot(aes(x = budget, y = rating)) +
  geom_point() +
  stat_smooth(method = "lm", level = 0.95)

## `geom_smooth()` using formula = 'y ~ x'

# 2-3-1 Add a fitted line, leaving the choice of smoothing method to ggplot2
movies1 %>%
  ggplot(aes(x = budget, y = rating)) +
  geom_point() +
  stat_smooth()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# 2-3-2 Fitting by group
# One straight line per color group (action vs. non-action), without the
# confidence band.
movies1 %>%
  ggplot(aes(x = budget, y = rating, color = factor(Action))) +
  geom_point() +
  labs(color = "Action Movie?") +
  stat_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

# Same fit with a second grouping added: shape marks whether the movie is
# longer than 120 minutes.
movies1 %>%
  ggplot(aes(x = budget,
             y = rating,
             color = factor(Action),
             shape = (length > 120))) +
  geom_point() +
  labs(color = "Action Movie?") +
  stat_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

#### mtcars dataset ####
# Open the documentation page of the dataset.
help("mtcars")

## 開啟 httpd 求助伺服器… 好了

# Overview of mtcars: print the first six rows.
head(mtcars, n = 6L)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# Compact overview of every column: its type and its first few values.
str(object = mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

# Quantiles and mean of every column.
summary(object = mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

# mpg vs. hp => scatter plot
# Kept in p1 so the following sections can add layers and facets to it.
p1 <- mtcars %>%
  ggplot(aes(x = mpg, y = hp)) +
  geom_point()
p1

## Add statistics (smooth, quantile)
# Smoothed trend line on top of the scatter plot.
p1 +
  stat_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Quantile regression lines on top of the scatter plot.
p1 +
  stat_quantile()

## Smoothing formula not specified. Using: y ~ x

## Add facets (by am, by cyl, or by both)
# One panel per value of am.
p1 + facet_wrap(vars(am))

# One panel per value of cyl.
p1 + facet_wrap(vars(cyl))

# Grid of panels: am in the rows, cyl in the columns.
p1 + facet_grid(rows = vars(am), cols = vars(cyl))

# cyl in the rows only, with a smoother fitted inside each panel.
p1 +
  facet_grid(rows = vars(cyl)) +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# cyl vs. mpg => box plot
# cyl is wrapped in factor() so each cylinder count gets its own box.
mtcars %>%
  ggplot(aes(x = factor(cyl), y = mpg)) +
  geom_boxplot()

# Fill each box by its cyl group
mtcars %>%
  ggplot(aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_boxplot()

# Flip the coordinates so the boxes run horizontally
mtcars %>%
  ggplot(aes(x = factor(cyl), y = mpg, fill = factor(cyl))) +
  geom_boxplot() +
  coord_flip()

# 1108.R
#
# USER
#
# 2024-11-09