library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Data source:
#   https://www.kaggle.com/spscientist/students-performance-in-exams
#   http://tsf.cdrf.org.cn/Content/Detail/30/43/652
# Read the CSV (path is relative to the working directory), converting text
# columns to factors, then show the structure of the resulting data frame.
sp <- read.csv("StudentsPerformance.csv", stringsAsFactors = TRUE)
str(object = sp)
## 'data.frame':    1000 obs. of  8 variables:
##  $ gender                     : Factor w/ 2 levels "female","male": 1 1 1 2 2 1 1 2 2 1 ...
##  $ race.ethnicity             : Factor w/ 5 levels "group A","group B",..: 2 3 2 1 3 2 2 2 4 2 ...
##  $ parental.level.of.education: Factor w/ 6 levels "associate's degree",..: 2 5 4 1 5 1 5 5 3 3 ...
##  $ lunch                      : Factor w/ 2 levels "free/reduced",..: 2 2 2 1 2 2 2 1 1 1 ...
##  $ test.preparation.course    : Factor w/ 2 levels "completed","none": 2 1 2 2 2 2 1 2 1 2 ...
##  $ math.score                 : int  72 69 90 47 76 71 88 40 64 38 ...
##  $ reading.score              : int  72 90 95 57 78 83 95 43 64 60 ...
##  $ writing.score              : int  74 88 93 44 75 78 92 39 67 50 ...
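# Column names ------------------------------------------
# gender                       gender
# race.ethnicity               ethnic group
# parental.level.of.education  parents' level of education
# lunch                        lunch type (free/reduced = free or reduced-price; standard = regular)
# test.preparation.course      whether the test preparation course was taken (completed / none)
# math.score                   math score
# reading.score                reading score
# writing.score                writing score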
# 1. Math score vs. reading score
# Scatter plot with one point per student, coloured by lunch type
# (free/reduced = free or reduced-price lunch).
# Note: points are coloured through the `colour` aesthetic (not `fill`).
ggplot(
  data = sp,
  mapping = aes(math.score, reading.score, colour = lunch)
) +
  geom_point()

# 2. Correlation coefficient between math score and reading score
# cor() is called with its default method.
math_reading_cor <- cor(sp$math.score, sp$reading.score)
# Display the coefficient; an assignment alone prints nothing, so the result
# of this section would otherwise never appear in the output.
math_reading_cor

# 3. Lunch type vs. math score
# Note: free/reduced = free or reduced-price lunch.
# Note: bars drawn by geom_col() are coloured through the `fill` aesthetic.
# Start with a per-column overview of the whole data set.
summary(object = sp)
##     gender    race.ethnicity     parental.level.of.education          lunch    
##  female:518   group A: 89    associate's degree:222          free/reduced:355  
##  male  :482   group B:190    bachelor's degree :118          standard    :645  
##               group C:319    high school       :196                            
##               group D:262    master's degree   : 59                            
##               group E:140    some college      :226                            
##                              some high school  :179                            
##  test.preparation.course   math.score     reading.score    writing.score   
##  completed:358           Min.   :  0.00   Min.   : 17.00   Min.   : 10.00  
##  none     :642           1st Qu.: 57.00   1st Qu.: 59.00   1st Qu.: 57.75  
##                          Median : 66.00   Median : 70.00   Median : 69.00  
##                          Mean   : 66.09   Mean   : 69.17   Mean   : 68.05  
##                          3rd Qu.: 77.00   3rd Qu.: 79.00   3rd Qu.: 79.00  
##                          Max.   :100.00   Max.   :100.00   Max.   :100.00
# Pipe the data through a sequence of steps: group by lunch type, take the
# mean math score of each group, then draw one bar per lunch type.
sp %>%
  group_by(lunch) %>%
  summarise(math = mean(math.score)) %>%
  ggplot(mapping = aes(lunch, math, fill = lunch)) +
  geom_col()

# 4. Parental education level vs. math score
# One row per parental education level, holding the rounded mean math score,
# sorted from the highest mean to the lowest.
sp1 <- sp %>%
  group_by(parental.level.of.education) %>%
  summarise(math = round(mean(math.score))) %>%
  arrange(desc(math))

# Plot preparation: factor levels default to alphabetical order, so rebuild the
# factor with its levels ordered by descending math score. The bars in the
# plots below then appear in that order.
sp1$parental.level.of.education <- with(
  sp1,
  factor(
    parental.level.of.education,
    levels = parental.level.of.education[order(math, decreasing = TRUE)]
  )
)
# 5. Plot: parental education level vs. math score
# One bar per education level, with the mean math score printed above it.
ggplot(sp1, aes(x = parental.level.of.education, y = math,
                fill = parental.level.of.education)) +
  geom_col() +
  # Data labels for the bars. Only data-driven values (label, colour) are
  # mapped inside aes(); the fixed label offsets are set as layer parameters.
  geom_text(aes(label = math, color = parental.level.of.education),
            vjust = -0.8, hjust = 0.5) +
  # Widen the y-axis range so the labels above the tallest bar are not clipped;
  # min(..., 0) keeps zero inside the range.
  ylim(min(sp1$math, 0) * 1.1, max(sp1$math) * 1.1)

# Same chart with the legends hidden.
ggplot(sp1, aes(x = parental.level.of.education, y = math,
                fill = parental.level.of.education)) +
  # Use FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_col(show.legend = FALSE) +
  # Data labels for the bars. Only data-driven values (label, colour) are
  # mapped inside aes(); the fixed label offsets are set as layer parameters.
  geom_text(aes(label = math, color = parental.level.of.education),
            vjust = -0.8, hjust = 0.5, show.legend = FALSE) +
  # Widen the y-axis range so the labels above the tallest bar are not clipped;
  # min(..., 0) keeps zero inside the range.
  ylim(min(sp1$math, 0) * 1.1, max(sp1$math) * 1.1)

# 6. Free practice
# Bar chart: gender vs. writing score.
# geom_col() stacks every row it is given, so feeding it the raw data would
# draw the *sum* of all writing scores per gender, which mostly reflects how
# many students are in each group. Summarise to the mean first so the bars are
# comparable.
sp %>%
  group_by(gender) %>%
  summarise(writing = mean(writing.score)) %>%
  ggplot(aes(x = gender, y = writing, fill = gender)) +
  geom_col() +
  labs(x = "Gender", y = "Mean Writing Score") +
  theme_minimal()

# Box plot: distribution of writing scores for each gender.
ggplot(
  data = sp,
  mapping = aes(gender, writing.score, fill = gender)
) +
  geom_boxplot() +
  xlab("Gender") +
  ylab("Writing Score") +
  theme_minimal()