##1
# Load the NLSY86 data and preview the first rows.
# NOTE(review): the path is relative, so this assumes the working directory
# contains nlsy86long.csv.
dta <- read.csv("nlsy86long.csv", header = TRUE)
head(dta)
## id sex race time grade year month math read
## 1 2390 Female Majority 1 0 6 67 14.286 19.048
## 2 2560 Female Majority 1 0 6 66 20.238 21.429
## 3 3740 Female Majority 1 0 6 67 17.857 21.429
## 4 4020 Male Majority 1 0 5 60 7.143 7.143
## 5 6350 Male Majority 1 1 7 78 29.762 30.952
## 6 7030 Male Majority 1 0 5 62 14.286 17.857
# Stack the two score columns into key/value pairs: `test_var` holds the
# source column name and `test_score` its value. The columns are selected by
# name rather than by position (8:9) so the reshape does not silently pick up
# the wrong columns if the column order of the input changes.
# NOTE(review): assumes tidyr and the %>% pipe are attached earlier - confirm.
dta %>%
  gather(key = "test_var", value = "test_score", math, read) %>%
  head()
## id sex race time grade year month test_var test_score
## 1 2390 Female Majority 1 0 6 67 math 14.286
## 2 2560 Female Majority 1 0 6 66 math 20.238
## 3 3740 Female Majority 1 0 6 67 math 17.857
## 4 4020 Male Majority 1 0 5 60 math 7.143
## 5 6350 Male Majority 1 1 7 78 math 29.762
## 6 7030 Male Majority 1 0 5 62 math 14.286
##2
# Take a working copy of Vocab, then preview its first rows.
# NOTE(review): Vocab is not defined in this script; presumably it comes from
# an attached package - confirm.
dta2 <- Vocab
head(dta2)
## year sex education vocabulary
## 20040001 2004 Female 9 3
## 20040002 2004 Female 14 6
## 20040003 2004 Male 14 9
## 20040005 2004 Female 17 8
## 20040008 2004 Male 14 1
## 20040010 2004 Male 14 7
# Show the structure (dimensions and column types) of the working copy.
str(dta2)
## 'data.frame': 21638 obs. of 4 variables:
## $ year : int 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
## $ education : int 9 14 14 17 14 14 12 10 11 9 ...
## $ vocabulary: int 3 6 9 8 1 7 6 6 5 1 ...
# Vocabulary score against years of education, coloured by sex, with one
# panel per survey year. Counts and linear fits are drawn per sex; the count
# layer is dodged by 0.5 so the two sexes do not sit on top of each other.
ggplot(
  data = dta2,
  mapping = aes(x = education, y = vocabulary, colour = sex)
) +
  geom_count(position = position_dodge(width = 0.5)) +
  stat_smooth(method = "lm") +
  facet_wrap(~ factor(year))
# Mean education and mean vocabulary by gender and year, each with error bars
# of +/- 2 standard errors. Education uses the default point shape and solid
# lines; vocabulary uses open points (shape 1) and dashed lines. Both series
# share one y axis.
Vocab %>%
  rename(Gender = sex) %>%
  group_by(Gender, year) %>%
  # The standard-error denominator counts non-missing values only, matching
  # the na.rm = TRUE used by mean() and sd(); n() would also count NA rows and
  # understate the standard error.
  summarize(
    edu_m = mean(education, na.rm = TRUE),
    edu_se = sd(education, na.rm = TRUE) / sqrt(sum(!is.na(education))),
    voc_m = mean(vocabulary, na.rm = TRUE),
    voc_se = sd(vocabulary, na.rm = TRUE) / sqrt(sum(!is.na(vocabulary)))
  ) %>%
  # summarize() only peels off the last grouping variable; drop the rest so
  # the plot receives an ungrouped table.
  ungroup() %>%
  ggplot(data = ., aes(x = year, y = edu_m, color = Gender)) +
  geom_point(position = position_dodge(.5), size = rel(2)) +
  geom_line(aes(group = Gender), position = position_dodge(.5)) +
  geom_errorbar(
    aes(ymin = edu_m - 2 * edu_se, ymax = edu_m + 2 * edu_se),
    width = .1,
    position = position_dodge(.5)
  ) +
  geom_point(
    aes(y = voc_m),
    position = position_dodge(.5),
    size = rel(2),
    pch = 1
  ) +
  geom_line(
    aes(y = voc_m, group = Gender),
    position = position_dodge(.5),
    linetype = "dashed"
  ) +
  geom_errorbar(
    aes(ymin = voc_m - 2 * voc_se, ymax = voc_m + 2 * voc_se),
    width = .1,
    position = position_dodge(.5)
  ) +
  labs(x = "Year", y = "Average Education Year and Vocabulary") +
  theme_bw()
## Warning: package 'bindrcpp' was built under R version 3.4.4
##3
# Load the probe data (one row per ID and Position) and preview it.
# NOTE(review): the path is relative, so this assumes the working directory
# contains probeL.txt.
dta3 <- read.table("probeL.txt", header = TRUE)
head(dta3)
## ID Response_Time Position
## 1 S01 51 1
## 2 S01 36 2
## 3 S01 50 3
## 4 S01 35 4
## 5 S01 42 5
## 6 S02 27 1
# Reshape to one row per ID: each Position value becomes its own column
# (Pos_1, Pos_2, ...) holding that position's Response_Time.
dta3 %>%
  mutate(Position = paste0("Pos_", Position)) %>%
  spread(key = Position, value = Response_Time)
## ID Pos_1 Pos_2 Pos_3 Pos_4 Pos_5
## 1 S01 51 36 50 35 42
## 2 S02 27 20 26 17 27
## 3 S03 37 22 41 37 30
## 4 S04 42 36 32 34 27
## 5 S05 27 18 33 14 29
## 6 S06 43 32 43 35 40
## 7 S07 41 22 36 25 38
## 8 S08 38 21 31 20 16
## 9 S09 36 23 27 25 28
## 10 S10 26 31 31 32 36
## 11 S11 29 20 25 26 25
##4
# Load the two Nobel tables, then keep only the years present in both
# (merge() defaults to an inner join).
# NOTE(review): the paths are relative, so this assumes the working directory
# contains both text files.
dtaC <- read.table("nobel_countries.txt", header = TRUE)
dtaW <- read.table("nobel_winners.txt", header = TRUE)
# The key is named explicitly so the result does not depend on which column
# names the two tables happen to share.
merge(dtaC, dtaW, by = "Year")
## Year Country Name Gender
## 1 1950 UK Bertrand Russell Male
## 2 2012 China Mo Yan Male
## 3 2013 Canada Alice Munro Female
## 4 2014 France Patrick Modiano Male
## 5 2016 US Bob Dylan Male
## 6 2017 UK Kazuo Ishiguro Male
# Full outer merge on Year: unmatched rows from either table are kept and
# their missing columns are filled with NA.
merge(dtaC, dtaW, by = "Year", all = TRUE)
## Year Country Name Gender
## 1 1938 <NA> Pearl Buck Female
## 2 1950 UK Bertrand Russell Male
## 3 2011 Sweden <NA> <NA>
## 4 2012 China Mo Yan Male
## 5 2013 Canada Alice Munro Female
## 6 2014 France Patrick Modiano Male
## 7 2015 Russia <NA> <NA>
## 8 2016 US Bob Dylan Male
## 9 2017 UK Kazuo Ishiguro Male
# Intersection: keep only the rows whose key appears in both data sets.
# Match winners to countries on Year and keep matched rows only. The key is
# given explicitly, so dplyr no longer guesses it from the shared column names
# and does not print its "Joining, by" message.
dplyr::inner_join(dtaW, dtaC, by = "Year")
## Name Gender Year Country
## 1 Patrick Modiano Male 2014 France
## 2 Bertrand Russell Male 1950 UK
## 3 Kazuo Ishiguro Male 2017 UK
## 4 Bob Dylan Male 2016 US
## 5 Alice Munro Female 2013 Canada
## 6 Mo Yan Male 2012 China
# Rows present in both data sets, keeping only the columns of the first one.
# Filter dtaW to the years that also occur in dtaC; no columns from dtaC are
# added. With the key stated explicitly, dplyr prints no "Joining, by" message.
semi_join(dtaW, dtaC, by = "Year")
## Name Gender Year
## 1 Patrick Modiano Male 2014
## 2 Bertrand Russell Male 1950
## 3 Kazuo Ishiguro Male 2017
## 4 Bob Dylan Male 2016
## 5 Alice Munro Female 2013
## 6 Mo Yan Male 2012
# Use the left data set as the index: every row of it is kept.
# Keep every winner and attach Country where the Year matches; winners without
# a match get NA. The explicit key also means dplyr prints no "Joining, by"
# message.
left_join(dtaW, dtaC, by = "Year")
## Name Gender Year Country
## 1 Patrick Modiano Male 2014 France
## 2 Bertrand Russell Male 1950 UK
## 3 Kazuo Ishiguro Male 2017 UK
## 4 Bob Dylan Male 2016 US
## 5 Alice Munro Female 2013 Canada
## 6 Mo Yan Male 2012 China
## 7 Pearl Buck Female 1938 <NA>
# From the first data set, list the rows that have no match in the second
# (the rows left_join fills with NA).
# Winners whose Year does not occur in dtaC. The key is named explicitly
# rather than inferred, so dplyr prints no "Joining, by" message.
anti_join(dtaW, dtaC, by = "Year")
## Name Gender Year
## 1 Pearl Buck Female 1938
# Union: keep the rows of both data sets, matched where possible.
# Keep all rows from both tables, matched on Year; columns missing on either
# side are filled with NA. The explicit key means dplyr prints no
# "Joining, by" message.
full_join(dtaW, dtaC, by = "Year")
## Name Gender Year Country
## 1 Patrick Modiano Male 2014 France
## 2 Bertrand Russell Male 1950 UK
## 3 Kazuo Ishiguro Male 2017 UK
## 4 Bob Dylan Male 2016 US
## 5 Alice Munro Female 2013 Canada
## 6 Mo Yan Male 2012 China
## 7 Pearl Buck Female 1938 <NA>
## 8 <NA> <NA> 2015 Russia
## 9 <NA> <NA> 2011 Sweden