dta <- read.csv("D:/data_management/week4/nlsy86long.csv")
head(dta)
## id sex race time grade year month math read
## 1 2390 Female Majority 1 0 6 67 14.285714 19.047619
## 2 2560 Female Majority 1 0 6 66 20.238095 21.428571
## 3 3740 Female Majority 1 0 6 67 17.857143 21.428571
## 4 4020 Male Majority 1 0 5 60 7.142857 7.142857
## 5 6350 Male Majority 1 1 7 78 29.761905 30.952381
## 6 7030 Male Majority 1 0 5 62 14.285714 17.857143
str(dta)
## 'data.frame': 664 obs. of 9 variables:
## $ id : int 2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
## $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 1 1 1 1 1 1 1 1 1 1 ...
## $ grade: int 0 0 0 0 1 0 0 0 0 0 ...
## $ year : int 6 6 6 5 7 5 6 7 6 6 ...
## $ month: int 67 66 67 60 78 62 66 79 76 67 ...
## $ math : num 14.29 20.24 17.86 7.14 29.76 ...
## $ read : num 19.05 21.43 21.43 7.14 30.95 ...
pacman::p_load(mlmRev, HSAUR3, knitr, kableExtra,
readr, dplyr, ggplot2, tidyr, car,
magrittr, tibble, purrr, stringr)
head(dta)
## id sex race time grade year month math read
## 1 2390 Female Majority 1 0 6 67 14.285714 19.047619
## 2 2560 Female Majority 1 0 6 66 20.238095 21.428571
## 3 3740 Female Majority 1 0 6 67 17.857143 21.428571
## 4 4020 Male Majority 1 0 5 60 7.142857 7.142857
## 5 6350 Male Majority 1 1 7 78 29.761905 30.952381
## 6 7030 Male Majority 1 0 5 62 14.285714 17.857143
gather(dta, key = test_var, value= test_score, math, read) %>% head
## id sex race time grade year month test_var test_score
## 1 2390 Female Majority 1 0 6 67 math 14.285714
## 2 2560 Female Majority 1 0 6 66 math 20.238095
## 3 3740 Female Majority 1 0 6 67 math 17.857143
## 4 4020 Male Majority 1 0 5 60 math 7.142857
## 5 6350 Male Majority 1 1 7 78 math 29.761905
## 6 7030 Male Majority 1 0 5 62 math 14.285714
dta2 <- Vocab
head(dta2)
## year sex education vocabulary
## 20040001 2004 Female 9 3
## 20040002 2004 Female 14 6
## 20040003 2004 Male 14 9
## 20040005 2004 Female 17 8
## 20040008 2004 Male 14 1
## 20040010 2004 Male 14 7
畫圖看看男女性別在字彙逐年表現折線圖
dta2 %>%
rename(gender = sex) %>%
group_by(year, gender) %>%
summarize(voc_m= mean(vocabulary, na.rm = T),
voc_se= sd(vocabulary, na.rm = T)/sqrt(n())) %>%
ggplot(data = ., aes(x = year, y = voc_m, color = gender)) +
geom_point() +
geom_line(aes(group = gender)) +
geom_errorbar(aes(ymin = voc_m - 2*voc_se, ymax = voc_m + 2*voc_se), width = .1) +
labs(x = "Year", y = "Average Vocabulary score") +
theme_gray()
## Warning: package 'bindrcpp' was built under R version 3.3.3
畫每年的分數散佈圖
ggplot(dta2, aes(x = education, y = vocabulary, group = sex)) +
geom_point(shape = 1 ,aes(color = sex)) +
stat_smooth(method = "lm", se = F, aes(color = sex)) +
facet_wrap( ~ factor(year)) +
labs(x ="education", y = "vocabulary") +
theme_gray()
dta3 <- read.table("probeL.txt", header = T)
head(dta3)
## ID Response_Time Position
## 1 S01 51 1
## 2 S01 36 2
## 3 S01 50 3
## 4 S01 35 4
## 5 S01 42 5
## 6 S02 27 1
把Position的data依五種不同數據展開
dta3 %>%
mutate(Position = paste("Pos", Position, sep = "-")) %>%
spread(Position, Response_Time)
## ID Pos-1 Pos-2 Pos-3 Pos-4 Pos-5
## 1 S01 51 36 50 35 42
## 2 S02 27 20 26 17 27
## 3 S03 37 22 41 37 30
## 4 S04 42 36 32 34 27
## 5 S05 27 18 33 14 29
## 6 S06 43 32 43 35 40
## 7 S07 41 22 36 25 38
## 8 S08 38 21 31 20 16
## 9 S09 36 23 27 25 28
## 10 S10 26 31 31 32 36
## 11 S11 29 20 25 26 25
看一下兩筆資料的結構
country <-read.table("nobel_countries.txt", header = T)
winner <-read.table("nobel_winners.txt", header = T)
head(country)
## Country Year
## 1 France 2014
## 2 UK 1950
## 3 UK 2017
## 4 US 2016
## 5 Canada 2013
## 6 China 2012
head(winner)
## Name Gender Year
## 1 Patrick Modiano Male 2014
## 2 Bertrand Russell Male 1950
## 3 Kazuo Ishiguro Male 2017
## 4 Bob Dylan Male 2016
## 5 Alice Munro Female 2013
## 6 Mo Yan Male 2012
兩筆資料共同部分合併
inner_join(country, winner)
## Joining, by = "Year"
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
共同部分中僅保留country的部分
semi_join(country, winner)
## Joining, by = "Year"
## Country Year
## 1 France 2014
## 2 UK 1950
## 3 UK 2017
## 4 US 2016
## 5 Canada 2013
## 6 China 2012
country的row為索引,跟winner資料合併
left_join(country, winner)
## Joining, by = "Year"
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
由country的資料,找出兩筆資料中沒辦法合併的欄位
anti_join(country, winner)
## Joining, by = "Year"
## Country Year
## 1 Russia 2015
## 2 Sweden 2011
兩筆資料直接合併
full_join(country, winner)
## Joining, by = "Year"
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
## 9 <NA> 1938 Pearl Buck Female