########tidy資料格式與ggplot2畫圖####
# Alt+-(用於賦值運算符<-)
####安裝套件####
#install.packages("ggplot2")
# install.packages("SportsAnalytics")
# install.packages("titanic")
# install.packages("dplyr")
# install.packages("tidyverse")
# 多行註解 『 Command + Shift + C 』,若是想要取消註解,僅需將段落反白再執行一次Command + Shift + C 即可。
####tidy資料格式常用的函數####
# 選取tibble中的幾個Variables: select()
# 依照位置選取tibble中的Observations: slice()
# 根據條件選取tibble中的Observations: filter()
# 增加或修改tibble的Variables: mutate()
# 依照某個Variable的資料數值大小,排列Observations: arrange()
# 依照某個Variable的資料數值大小,選出前幾筆Observations: top_n()
# 依照某個Variable的資料數值,將Observations分群: group_by()
# 對Observations進行彙整(加總、平均、…): summarise()
# 刪除NA值:filter(欄位名稱!="NA") 或是 filter(!is.na(欄位名稱))
####ggplot2畫圖文法####
# 資料來源(data):指定原始資料來源的 data frame。
# 美學對應(aesthetic):指定原始資料與圖形之間的對應關係,例如哪一個變數要當作 x 座標變數,而哪一個要當作 y 座標變數,還有資料繪圖時的樣式等。
# 幾何圖案(geometry):要用什麼幾何圖形繪製資料,例如點、線條、多邊形等。
# 繪圖面(facet):指定如何將資料分散在多張子圖形中繪製,以利互相比較。
# 統計轉換(statistical transformation):指定如何將資料轉換為各種統計量,例如將連續型資料轉為離散型的類別。
# 座標系統(coordinate system):指定繪圖時所使用的座標系統,除了常見的笛卡兒直角座標系統,也可以使用極坐標或地圖投影(map projection)。
# 主題(theme):控制資料以外的繪圖組件,例如座標軸、說明文字等。
library(ggplot2) ##須先安裝 install.packages("ggplot2")
#qplot()為ggplot2 “Hello, world!”,
#簡單使用qplot(x軸名稱,y軸名稱,data=使用資料)就可畫散佈圖
#library(SportsAnalytics)##須先安裝 install.packages("SportsAnalytics")
#NBA1920<-fetch_NBAPlayerStatistics("19-20") ## 讀入資料
####存取資料與讀取資料####
#存取資料檔(.csv)
#存檔
#write.csv(NBA1920, file = "NBA1920.csv", fileEncoding = "utf-8")
#讀取
#NBA1920_1 <- read.csv( file = "NBA1920.csv", fileEncoding = "utf-8")
#存取資料檔(.RData)
#存檔
#save(NBA1920, file = "NBA1920.RData")
#讀取
# Restore the objects stored in NBA1920.RData into the workspace; the
# statements below expect this to provide the NBA1920 data frame.
load("NBA1920.RData")
# Inspect the data: list the column names
names(NBA1920)
## [1] "League" "Name" "Team"
## [4] "Position" "GamesPlayed" "TotalMinutesPlayed"
## [7] "FieldGoalsMade" "FieldGoalsAttempted" "ThreesMade"
## [10] "ThreesAttempted" "FreeThrowsMade" "FreeThrowsAttempted"
## [13] "OffensiveRebounds" "TotalRebounds" "Assists"
## [16] "Steals" "Turnovers" "Blocks"
## [19] "PersonalFouls" "Disqualifications" "TotalPoints"
## [22] "Technicals" "Ejections" "FlagrantFouls"
## [25] "GamesStarted"
# Show the structure of NBA1920: each column's type and its first few values
str(NBA1920)
## 'data.frame': 529 obs. of 25 variables:
## $ League : Factor w/ 1 level "NBA": 1 1 1 1 1 1 1 1 1 1 ...
## $ Name : chr "Steven Adams" "Bam Adebayo" "Lamarcu Aldridge" "Kyle Alexander" ...
## $ Team : Factor w/ 32 levels "ATL","BOS","BRO",..: 23 17 29 17 21 16 3 22 24 3 ...
## $ Position : Factor w/ 5 levels "C","PF","PG",..: 1 2 1 2 5 5 2 5 4 4 ...
## $ GamesPlayed : int 63 72 53 2 47 38 70 10 18 10 ...
## $ TotalMinutesPlayed : int 1679 2415 1757 13 590 721 1854 118 381 108 ...
## $ FieldGoalsMade : int 283 440 391 1 98 117 302 19 25 10 ...
## $ FieldGoalsAttempted: int 478 791 794 2 266 251 465 44 86 38 ...
## $ ThreesMade : int 1 2 61 0 46 57 0 5 9 6 ...
## $ ThreesAttempted : int 3 14 157 0 133 141 6 16 36 29 ...
## $ FreeThrowsMade : int 117 264 158 0 25 39 171 7 19 2 ...
## $ FreeThrowsAttempted: int 201 382 191 0 37 45 270 11 29 4 ...
## $ OffensiveRebounds : int 208 177 104 2 9 8 216 2 24 1 ...
## $ TotalRebounds : int 582 735 392 3 84 85 671 9 88 21 ...
## $ Assists : int 146 368 129 0 89 52 110 21 21 8 ...
## $ Steals : int 51 82 36 0 17 12 40 5 18 0 ...
## $ Turnovers : int 95 203 74 1 54 33 77 8 17 4 ...
## $ Blocks : int 68 93 87 0 8 2 92 2 8 6 ...
## $ PersonalFouls : int 122 182 128 1 57 53 162 7 27 13 ...
## $ Disqualifications : int 1 2 1 0 0 0 2 0 0 0 ...
## $ TotalPoints : int 684 1146 1001 2 267 330 775 50 78 28 ...
## $ Technicals : int 2 0 0 0 0 1 1 0 0 0 ...
## $ Ejections : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FlagrantFouls : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GamesStarted : int 63 72 53 0 1 0 64 0 2 1 ...
# Per-column summary of NBA1920 (quartiles and mean for numeric columns,
# level counts for factors)
summary(NBA1920)
## League Name Team Position GamesPlayed
## NBA:529 Length:529 BRO : 24 C : 84 Min. : 1.00
## Class :character HOU : 20 PF:108 1st Qu.:21.00
## Mode :character SAC : 20 PG:112 Median :48.00
## WAS : 20 SF:116 Mean :42.18
## DAL : 19 SG:109 3rd Qu.:62.00
## MIA : 19 Max. :74.00
## (Other):407
## TotalMinutesPlayed FieldGoalsMade FieldGoalsAttempted ThreesMade
## Min. : 1.0 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 281.0 1st Qu.: 34.0 1st Qu.: 72.0 1st Qu.: 4.00
## Median : 921.0 Median :124.0 Median : 277.0 Median : 31.00
## Mean : 967.7 Mean :163.6 Mean : 355.6 Mean : 48.89
## 3rd Qu.:1594.0 3rd Qu.:252.0 3rd Qu.: 554.0 3rd Qu.: 78.00
## Max. :2559.0 Max. :685.0 Max. :1514.0 Max. :299.00
##
## ThreesAttempted FreeThrowsMade FreeThrowsAttempted OffensiveRebounds
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 14.0 1st Qu.: 10.0 1st Qu.: 16.00 1st Qu.: 9.00
## Median : 94.0 Median : 40.0 Median : 57.00 Median : 26.00
## Mean :136.6 Mean : 71.5 Mean : 92.51 Mean : 40.35
## 3rd Qu.:221.0 3rd Qu.: 93.0 3rd Qu.:122.00 3rd Qu.: 57.00
## Max. :843.0 Max. :692.0 Max. :800.00 Max. :257.00
##
## TotalRebounds Assists Steals Turnovers
## Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 49.0 1st Qu.: 18.00 1st Qu.: 7.00 1st Qu.: 14.00
## Median :140.0 Median : 60.00 Median : 25.00 Median : 43.00
## Mean :179.5 Mean : 97.64 Mean : 30.64 Mean : 55.72
## 3rd Qu.:269.0 3rd Qu.:123.00 3rd Qu.: 48.00 3rd Qu.: 80.00
## Max. :919.0 Max. :684.00 Max. :125.00 Max. :308.00
##
## Blocks PersonalFouls Disqualifications TotalPoints
## Min. : 0.00 Min. : 0.00 Min. : 0.0000 Min. : 0.0
## 1st Qu.: 4.00 1st Qu.: 31.00 1st Qu.: 0.0000 1st Qu.: 89.0
## Median : 11.00 Median : 83.00 Median : 0.0000 Median : 342.0
## Mean : 19.64 Mean : 83.17 Mean : 0.5293 Mean : 447.6
## 3rd Qu.: 26.00 3rd Qu.:128.00 3rd Qu.: 1.0000 3rd Qu.: 684.0
## Max. :196.00 Max. :278.00 Max. :10.0000 Max. :2335.0
##
## Technicals Ejections FlagrantFouls GamesStarted
## Min. : 0.000 Min. :0 Min. :0 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:0 1st Qu.:0 1st Qu.: 0.00
## Median : 0.000 Median :0 Median :0 Median : 6.00
## Mean : 1.172 Mean :0 Mean :0 Mean :20.02
## 3rd Qu.: 2.000 3rd Qu.:0 3rd Qu.:0 3rd Qu.:39.00
## Max. :14.000 Max. :0 Max. :0 Max. :73.00
##
####使用tidyverse套件的函數####
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.6 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#問題
# 1. 簡化資料,選取NBA資料中的Name, Team, Position
# 2. 選取NBA資料中前五筆及後五筆
# 3. 誰是鋼鐵人?選取NBA資料中GamesPlayed大於或等於65的紀錄
# 4. 按照GamesPlayed的值,由大到小排序NBA資料
# 5. 誰是三分王?選取NBA資料中ThreesMade值最大的五筆紀錄
# 6. 計算各球隊的球員人數
# 7. 計算各球隊的球員人數,並且依人數由大到小排序
# 8. 查看Team=="NA"和(或)Team=="CLI"的資料
# 9.過濾Team=="NA"和(或)Team=="CLI"的資料
# 10. 找出某個球隊Team的全體球員Name和場上位置Position,並且依照場上位置排序
# 11. 新建一個投籃命中率欄位,命名為FieldGoalPercentage
# 12. 依據TotalPoints,將球員分類
# 13.1 新增FieldGoalPercentage(FieldGoalsMade/FieldGoalsAttempted)
# 13.2 依據Position將資料分組
# 13.3 新增一欄位(avg_fgp),填入FieldGoalPercentage的平均數
#Ctrl+Shift+M(用於管道操作符%>%)
# 1. Keep only the Name, Team and Position columns of the NBA data
df1 <- select(NBA1920, Name, Team, Position)
# 2. Look at the first and the last rows of the NBA data
# Base R approach: head() prints the leading rows (3 of them here)
head(NBA1920, n = 3)
## League Name Team Position GamesPlayed TotalMinutesPlayed
## 1 NBA Steven Adams OKL C 63 1679
## 2 NBA Bam Adebayo MIA PF 72 2415
## 3 NBA Lamarcu Aldridge SAN C 53 1757
## FieldGoalsMade FieldGoalsAttempted ThreesMade ThreesAttempted FreeThrowsMade
## 1 283 478 1 3 117
## 2 440 791 2 14 264
## 3 391 794 61 157 158
## FreeThrowsAttempted OffensiveRebounds TotalRebounds Assists Steals Turnovers
## 1 201 208 582 146 51 95
## 2 382 177 735 368 82 203
## 3 191 104 392 129 36 74
## Blocks PersonalFouls Disqualifications TotalPoints Technicals Ejections
## 1 68 122 1 684 2 0
## 2 93 182 2 1146 0 0
## 3 87 128 1 1001 0 0
## FlagrantFouls GamesStarted
## 1 0 63
## 2 0 72
## 3 0 53
# Base R approach: tail() prints the trailing rows (10 of them here)
tail(NBA1920, n = 10)
## League Name Team Position GamesPlayed TotalMinutesPlayed
## 520 NBA Justise Winslow MIA SF 11 353
## 521 NBA Christian Wood DET PF 62 1319
## 522 NBA Delon Wright DAL SG 73 1570
## 523 NBA Wright-foreman UTA PG 4 45
## 524 NBA Thaddeus Young CHI PF 64 1594
## 525 NBA Trae Young ATL PG 60 2119
## 526 NBA Cody Zeller CHA C 58 1343
## 527 NBA Tyler Zeller SAN C 1 4
## 528 NBA Ante Zizic CLE C 22 222
## 529 NBA Ivica Zubac LAC C 72 1326
## FieldGoalsMade FieldGoalsAttempted ThreesMade ThreesAttempted
## 520 50 129 6 27
## 521 288 508 54 140
## 522 190 412 47 128
## 523 7 20 2 10
## 524 269 602 79 223
## 525 546 1249 205 568
## 526 251 480 18 75
## 527 1 4 0 0
## 528 41 72 0 0
## 529 236 387 0 2
## FreeThrowsMade FreeThrowsAttempted OffensiveRebounds TotalRebounds Assists
## 520 18 27 16 73 44
## 521 181 243 104 390 60
## 522 77 100 71 281 244
## 523 3 4 0 5 7
## 524 42 72 94 315 117
## 525 481 559 31 255 560
## 526 122 179 161 412 88
## 527 0 0 3 4 0
## 528 14 19 17 65 6
## 529 124 166 199 545 82
## Steals Turnovers Blocks PersonalFouls Disqualifications TotalPoints
## 520 8 24 5 38 0 124
## 521 34 84 55 98 0 811
## 522 84 73 22 93 0 504
## 523 2 3 0 5 0 19
## 524 92 102 23 134 1 659
## 525 65 289 8 104 0 1778
## 526 40 75 27 140 1 642
## 527 0 0 0 0 0 2
## 528 7 10 5 27 0 96
## 529 16 61 66 168 2 596
## Technicals Ejections FlagrantFouls GamesStarted
## 520 0 0 0 5
## 521 1 0 0 12
## 522 0 0 0 6
## 523 0 0 0 0
## 524 4 0 0 16
## 525 7 0 0 60
## 526 0 0 0 39
## 527 0 0 0 0
## 528 0 0 0 0
## 529 1 0 0 70
# tidyverse approach
nrow(NBA1920) # number of rows
## [1] 529
# Inside slice(), n() is the row count, so this keeps rows 1-5 plus the
# last five rows.
df2 <- NBA1920 %>% slice(c(1:5, (n() - 4):n()))
# 3. Who is the iron man? Select the records with GamesPlayed >= 65
# Get a feel for the distribution first: numeric summary, then a histogram
summary(NBA1920$GamesPlayed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 21.00 48.00 42.18 62.00 74.00
ggplot(data = NBA1920, mapping = aes(x = GamesPlayed)) +
  geom_histogram(bins = 50)

df3 <- filter(NBA1920, GamesPlayed >= 65)
# 4. Sort the NBA data by GamesPlayed in descending order, take the first
#    5 rows, and keep only the Name, Team and GamesPlayed columns
# Ascending order
df41 <- arrange(NBA1920, GamesPlayed)
# Descending order; the three columns are picked up front, which does not
# affect the ordering because GamesPlayed is one of them
df42 <- NBA1920 %>%
  select(Name, Team, GamesPlayed) %>%
  arrange(desc(GamesPlayed)) %>%
  slice(1:5)
# 5. Who is the three-point leader? Select the five records with the
#    largest ThreesMade
# tidy approach 1: top_n() keeps the rows holding the 5 largest values.
# Note that top_n() does not sort the rows it returns.
df51 <- top_n(NBA1920, n = 5, wt = ThreesMade)
# tidy approach 2: order by ThreesMade (descending) with arrange(), then
# take the first five observations with slice().
df52 <- NBA1920 %>%
  arrange(desc(ThreesMade)) %>%
  slice(seq_len(5))
# 6. Count the number of players on each team
#    (one row per Team; the count is stored in the column 人數)
df6 <- summarise(group_by(NBA1920, Team), 人數 = n())
df6
## # A tibble: 32 x 2
## Team 人數
## <fct> <int>
## 1 ATL 18
## 2 BOS 17
## 3 BRO 24
## 4 CHA 14
## 5 CHI 17
## 6 CLE 17
## 7 CLI 1
## 8 DAL 19
## 9 DEN 16
## 10 DET 17
## # … with 22 more rows
# 7. Count the number of players on each team and order the teams from
#    the largest roster to the smallest
df7 <- group_by(NBA1920, Team) %>%
  summarise(人數 = n()) %>%
  arrange(desc(人數))
df7
## # A tibble: 32 x 2
## Team 人數
## <fct> <int>
## 1 BRO 24
## 2 HOU 20
## 3 SAC 20
## 4 WAS 20
## 5 DAL 19
## 6 MIA 19
## 7 ORL 19
## 8 ATL 18
## 9 MEM 18
## 10 OKL 18
## # … with 22 more rows
# Horizontal bar chart of the roster size of each team.
# No brewer fill scale is added here: nothing is mapped to `fill`, so such a
# scale would be silently ignored, and the qualitative "Set3" palette has
# fewer colours than there are teams anyway.
ggplot(data = df7, aes(x = Team, y = 人數)) +
  geom_col() +
  coord_flip()

# Same chart with the bars ordered by roster size.
# `fill` is mapped to the count so that the sequential "PuBu" palette is
# actually applied; scale_fill_distiller() is the continuous counterpart of
# scale_fill_brewer(), and direction = 1 makes larger counts darker.
ggplot(data = df7, aes(x = reorder(Team, 人數), y = 人數, fill = 人數)) +
  geom_col() +
  scale_fill_distiller(palette = "PuBu", direction = 1) +
  coord_flip()

# Display all available ColorBrewer palettes
RColorBrewer::display.brewer.all()

# 8. Look up the records whose Team is "NA" or "CLI"
# NOTE(review): "NA" is matched as a literal string here; rows whose Team is
# a true missing value are not selected - confirm which one is intended.
df8 <- filter(NBA1920, Team %in% c("NA", "CLI"))
# 9. Drop the records whose Team is "NA" or "CLI"
#    (conditions separated by a comma are combined with AND)
df9 <- filter(NBA1920, Team != "NA", Team != "CLI")
# 10. List every player (Name) of one team together with the court
#     Position, ordered by Position; the team used here is "MIA"
df10 <- NBA1920 %>%
  filter(Team == "MIA") %>%
  arrange(Position) %>%
  select(Name, Position)
# 11. Create a field-goal percentage column named FieldGoalPercentage and
#     show the ten players with the highest value
# transmute() computes the new column and keeps only the listed columns.
df11 <- NBA1920 %>%
  transmute(
    Team,
    Name,
    FieldGoalPercentage = FieldGoalsMade / FieldGoalsAttempted
  ) %>%
  arrange(desc(FieldGoalPercentage)) %>%
  slice(1:10)
df11
## Team Name FieldGoalPercentage
## 1 LAL K Antetokounmpo 1.0000000
## 2 BOS Tacko Fall 0.7857143
## 3 HOU Tyson Chandler 0.7777778
## 4 NYK Mitche Robinson 0.7419355
## 5 LAC Johnathan Motley 0.7333333
## 6 LAL Dwight Howard 0.7292419
## 7 BOS Robert Williams 0.7272727
## 8 CHI Daniel Gafford 0.7014925
## 9 UTA Rudy Gobert 0.6929982
## 10 CLE Dean Wade 0.6923077
# 12. Classify the players by TotalPoints
# Inspect the distribution first: ggplot2 histogram, base R histogram and a
# numeric summary
ggplot(data = NBA1920, mapping = aes(x = TotalPoints)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

hist(NBA1920$TotalPoints)

summary(NBA1920$TotalPoints)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 89.0 342.0 447.6 684.0 2335.0
# Bin the players into three rating tiers by TotalPoints, using the first
# and third quartiles reported by summary() (89 and 684) as cut points:
#   [0, 89]    -> "ordinary"
#   (89, 684]  -> "excellent"
#   (684, Inf] -> "star"
# include.lowest = TRUE closes the first interval on the left, so players
# with exactly 0 points are rated "ordinary" instead of becoming NA and
# being dropped by the filter below. The open-ended upper break avoids
# hard-coding the current maximum. The column is referenced by bare name
# so that mutate() works on the piped data rather than on the global object.
df12 <- NBA1920 %>%
  mutate(
    Rating = cut(
      TotalPoints,
      breaks = c(0, 89, 684, Inf),
      labels = c("ordinary", "excellent", "star"),
      include.lowest = TRUE
    )
  ) %>%
  filter(!is.na(Rating))
# Bar chart: number of players in each Rating tier, coloured by tier
ggplot(df12, aes(Rating, fill = Rating)) +
  geom_bar()

# 13.1 Compute FieldGoalPercentage (FieldGoalsMade / FieldGoalsAttempted)
# 13.2 Group the data by Position
# 13.3 Store the mean FieldGoalPercentage of each group in avg_fgp
# Note: if NA values show up, filter on !is.na() of the percentage
# Case 1: missing values are not removed
df13 <- NBA1920 %>%
  group_by(Position) %>%
  summarise(avg_fgp = mean(FieldGoalsMade / FieldGoalsAttempted))
# Case 2: rows whose percentage is missing are removed before averaging
# (this result replaces the df13 from case 1)
df13 <- NBA1920 %>%
  filter(!is.na(FieldGoalsMade / FieldGoalsAttempted)) %>%
  group_by(Position) %>%
  summarise(avg_fgp = mean(FieldGoalsMade / FieldGoalsAttempted))
# Bar charts (geom_col) of the average field-goal percentage per Position
ggplot(df13, aes(x = Position, y = avg_fgp, fill = Position)) +
  geom_col()

# Same chart with the positions ordered from highest to lowest average
ggplot(df13, aes(x = reorder(Position, -avg_fgp), y = avg_fgp, fill = Position)) +
  geom_col()

# Exercise 1: keep the Name, GamesPlayed and GamesStarted columns
n <- select(NBA1920, Name, GamesPlayed, GamesStarted)
# Exercise 3: who is the iron man? Select the records with
# TotalMinutesPlayed >= 2000
# Numeric summary and histogram of the minutes played come first
summary(NBA1920$TotalMinutesPlayed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 281.0 921.0 967.7 1594.0 2559.0
ggplot(data = NBA1920, mapping = aes(x = TotalMinutesPlayed)) +
  geom_histogram(bins = 50)

n2 <- filter(NBA1920, TotalMinutesPlayed >= 2000)
# Exercise 4: sort the NBA data by TotalMinutesPlayed in descending order,
# take the first 10 rows, and keep only Name, Team and TotalMinutesPlayed
# n3: ascending order, all columns
n3 <- arrange(NBA1920, TotalMinutesPlayed)
# n4: descending order, top 10, three columns
n4 <- NBA1920 %>%
  select(Name, Team, TotalMinutesPlayed) %>%
  arrange(desc(TotalMinutesPlayed)) %>%
  slice(1:10)
# Exercise 5: who is the free-throw leader? Select the five records with the
# largest FreeThrowsMade, keep Name, Team and FreeThrowsMade, and plot them
# n5: ascending order, all columns
n5 <- arrange(NBA1920, FreeThrowsMade)
# n7: descending order, top 5, three columns
n7 <- NBA1920 %>%
  select(Name, Team, FreeThrowsMade) %>%
  arrange(desc(FreeThrowsMade)) %>%
  slice(1:5)
# Bar chart of the five players, coloured by team
ggplot(n7, aes(Name, FreeThrowsMade, fill = Team)) +
  geom_col()

# Same chart with the bars ordered from most to fewest free throws made
ggplot(n7, aes(reorder(Name, -FreeThrowsMade), FreeThrowsMade, fill = Team)) +
  geom_col()

# Exercise 6: count the number of players at each court Position
n6 <- summarise(group_by(NBA1920, Position), freq = n())
n6
## # A tibble: 5 x 2
## Position freq
## <fct> <int>
## 1 C 84
## 2 PF 108
## 3 PG 112
## 4 SF 116
## 5 SG 109
# Exercise 7: count the players at each court Position within each team
# .groups = "drop_last" spells out dplyr's default for this call: the result
# stays grouped by Team, and the informational message that summarise()
# prints when .groups is left unspecified is no longer emitted.
n8 <- NBA1920 %>%
  group_by(Team, Position) %>%
  summarise(freq = n(), .groups = "drop_last")
# Show the position counts of a single team
n8 %>% filter(Team == "MIA")
## # A tibble: 5 x 3
## # Groups: Team [1]
## Team Position freq
## <fct> <fct> <int>
## 1 MIA C 3
## 2 MIA PF 3
## 3 MIA PG 4
## 4 MIA SF 7
## 5 MIA SG 2
# Exercise 8_1: total number of personal fouls (sum of PersonalFouls) per
# team, ordered from most to fewest
n9 <- group_by(NBA1920, Team) %>%
  summarise(TotalFouls = sum(PersonalFouls)) %>%
  arrange(desc(TotalFouls))
# Exercise 8_2: total number of personal fouls (sum of PersonalFouls) for
# each court Position within each team
# .groups = "drop_last" spells out dplyr's default for this call: the result
# stays grouped by Team, and the informational message that summarise()
# prints when .groups is left unspecified is no longer emitted.
n10 <- NBA1920 %>%
  group_by(Team, Position) %>%
  summarise(TotalFouls = sum(PersonalFouls), .groups = "drop_last")