library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#資料來源
#https://www.kaggle.com/spscientist/students-performance-in-exams
#http://tsf.cdrf.org.cn/Content/Detail/30/43/652
# Load the student exam data; stringsAsFactors = TRUE reads the text columns
# in as factors.
sp <- read.csv("StudentsPerformance.csv", stringsAsFactors = TRUE)
# Show each column's type and its first few values.
str(object = sp)
## 'data.frame': 1000 obs. of 8 variables:
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 2 2 1 1 2 2 1 ...
## $ race.ethnicity : Factor w/ 5 levels "group A","group B",..: 2 3 2 1 3 2 2 2 4 2 ...
## $ parental.level.of.education: Factor w/ 6 levels "associate's degree",..: 2 5 4 1 5 1 5 5 3 3 ...
## $ lunch : Factor w/ 2 levels "free/reduced",..: 2 2 2 1 2 2 2 1 1 1 ...
## $ test.preparation.course : Factor w/ 2 levels "completed","none": 2 1 2 2 2 2 1 2 1 2 ...
## $ math.score : int 72 69 90 47 76 71 88 40 64 38 ...
## $ reading.score : int 72 90 95 57 78 83 95 43 64 60 ...
## $ writing.score : int 74 88 93 44 75 78 92 39 67 50 ...
#欄位名稱------------------------------------------
# gender 性別
# race.ethnicity 種族分群
# parental.level.of.education 父母教育程度
# lunch 營養午餐類型(free/reduced免費或減免餐費,standard為一般類別)
# test.preparation.course 考試準備課程(completed 為已完成,none 為未參加)
# math.score 數學成績
# reading.score 閱讀成績
# writing.score 寫作成績
# 1. Math score vs. reading score
# Note: a lunch value of "free/reduced" means a free or reduced-price meal.
# Note: geom_point() colours its points through the `color` aesthetic.
ggplot(data = sp) +
  geom_point(
    mapping = aes(x = math.score, y = reading.score, color = lunch)
  )

# 2. Correlation coefficient between math score and reading score
# cor() is called with its default method; the value is stored, not printed.
math_reading_cor <- with(sp, cor(x = math.score, y = reading.score))
# 3. Lunch type vs. math score
# Note: a lunch value of "free/reduced" means a free or reduced-price meal.
# Note: geom_col() colours its bars through the `fill` aesthetic.
# Overview of every column first: level counts for the factors, and
# min / quartiles / mean / max for the three scores.
summary(object = sp)
## gender race.ethnicity parental.level.of.education lunch
## female:518 group A: 89 associate's degree:222 free/reduced:355
## male :482 group B:190 bachelor's degree :118 standard :645
## group C:319 high school :196
## group D:262 master's degree : 59
## group E:140 some college :226
## some high school :179
## test.preparation.course math.score reading.score writing.score
## completed:358 Min. : 0.00 Min. : 17.00 Min. : 10.00
## none :642 1st Qu.: 57.00 1st Qu.: 59.00 1st Qu.: 57.75
## Median : 66.00 Median : 70.00 Median : 69.00
## Mean : 66.09 Mean : 69.17 Mean : 68.05
## 3rd Qu.: 77.00 3rd Qu.: 79.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00 Max. :100.00
# Bar chart of the mean math score for each lunch type.
# The %>% pipe hands each intermediate result on to the next step.
ggplot(
  data = sp %>%
    group_by(lunch) %>%
    summarise(math = mean(math.score)),
  mapping = aes(x = lunch, y = math, fill = lunch)
) +
  geom_col()

# 4. Parents' education level vs. math score
# Mean math score (rounded to a whole number) for each parental education
# level, sorted from the highest mean to the lowest.
sp1 <- sp %>%
  group_by(parental.level.of.education) %>%
  summarise(math = round(mean(math.score))) %>%
  arrange(desc(math))
# Plot preparation: factor levels default to alphabetical order, so rebuild
# the factor with its levels ranked by mean math score (highest first).
sp1$parental.level.of.education <- with(
  sp1,
  factor(
    parental.level.of.education,
    levels = parental.level.of.education[order(math, decreasing = TRUE)]
  )
)
# 5. Plot: parents' education level vs. math score
ggplot(
  data = sp1,
  mapping = aes(
    x = parental.level.of.education,
    y = math,
    fill = parental.level.of.education
  )
) +
  geom_col() +
  # Data label above each bar, coloured by education level.
  geom_text(
    mapping = aes(label = math, color = parental.level.of.education),
    vjust = -0.8,
    hjust = 0.5
  ) +
  # Stretch the y axis limits by 10% so the labels are not cut off.
  ylim(min(sp1$math, 0) * 1.1, max(sp1$math) * 1.1)

# Same chart with the legend hidden.
# show.legend is spelled FALSE rather than F: F is an ordinary variable that
# can be reassigned, FALSE is a reserved constant.
ggplot(
  sp1,
  aes(
    x = parental.level.of.education,
    y = math,
    fill = parental.level.of.education
  )
) +
  geom_col(show.legend = FALSE) +
  # Data label above each bar; the fixed offsets are set outside aes()
  # because they are constants, not data mappings.
  geom_text(
    aes(label = math, color = parental.level.of.education),
    vjust = -0.8,
    hjust = 0.5,
    show.legend = FALSE
  ) +
  # Stretch the y axis limits by 10% so the labels are not cut off.
  ylim(min(sp1$math, 0) * 1.1, max(sp1$math) * 1.1)

# 6. Free practice
# Bar chart: gender vs. writing score.
# geom_col() draws the y values as given and stacks rows that share an x
# value, so plotting the raw rows would make each bar the SUM of all writing
# scores for that gender (which depends on group size). Average per gender
# first so the bars are comparable.
sp %>%
  group_by(gender) %>%
  summarise(writing.score = mean(writing.score)) %>%
  ggplot(aes(x = gender, y = writing.score, fill = gender)) +
  geom_col() +
  labs(x = "Gender", y = "Mean Writing Score") +
  theme_minimal()

# Box plot: distribution of writing scores for each gender.
ggplot(
  data = sp,
  mapping = aes(x = gender, y = writing.score, fill = gender)
) +
  geom_boxplot() +
  theme_minimal() +
  labs(x = "Gender", y = "Writing Score")
