##1

# Read the long-format NLSY data; the first line of the file holds the
# column names. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
dta <- read.csv("nlsy86long.csv", header = TRUE)
head(dta)
##     id    sex     race time grade year month   math   read
## 1 2390 Female Majority    1     0    6    67 14.286 19.048
## 2 2560 Female Majority    1     0    6    66 20.238 21.429
## 3 3740 Female Majority    1     0    6    67 17.857 21.429
## 4 4020   Male Majority    1     0    5    60  7.143  7.143
## 5 6350   Male Majority    1     1    7    78 29.762 30.952
## 6 7030   Male Majority    1     0    5    62 14.286 17.857
# Stack the two score columns into key/value pairs: `test_var` records which
# test a row comes from and `test_score` holds the value. The columns are
# selected by name rather than by position (8:9) so the reshape keeps
# working if the column order of the file ever changes.
dta %>%
  gather("test_var", "test_score", math, read) %>%
  head()
##     id    sex     race time grade year month test_var test_score
## 1 2390 Female Majority    1     0    6    67     math     14.286
## 2 2560 Female Majority    1     0    6    66     math     20.238
## 3 3740 Female Majority    1     0    6    67     math     17.857
## 4 4020   Male Majority    1     0    5    60     math      7.143
## 5 6350   Male Majority    1     1    7    78     math     29.762
## 6 7030   Male Majority    1     0    5    62     math     14.286

##2

# Work on a copy of the Vocab data set; show its first rows and its layout.
dta2 <- Vocab
head(dta2)
##          year    sex education vocabulary
## 20040001 2004 Female         9          3
## 20040002 2004 Female        14          6
## 20040003 2004   Male        14          9
## 20040005 2004 Female        17          8
## 20040008 2004   Male        14          1
## 20040010 2004   Male        14          7
str(dta2)
## 'data.frame':    21638 obs. of  4 variables:
##  $ year      : int  2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
##  $ education : int  9 14 14 17 14 14 12 10 11 9 ...
##  $ vocabulary: int  3 6 9 8 1 7 6 6 5 1 ...
# Vocabulary score against years of education, one panel per survey year:
# dodged count points plus a linear fit, both coloured by sex.
ggplot(
  data = dta2,
  mapping = aes(x = education, y = vocabulary, colour = sex)
) +
  geom_count(position = position_dodge(width = 0.5)) +
  stat_smooth(method = "lm") +
  facet_wrap(~ factor(year))

# Standard error of the mean, computed from the non-missing values only.
# Dividing by sqrt(n()) would count rows holding NA as well, which
# understates the standard error whenever sd() drops them via na.rm = TRUE.
std_error <- function(x) {
  sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))
}

# Mean education and mean vocabulary score per gender and survey year, each
# drawn with error bars of +/- 2 standard errors. Education uses filled
# points and solid lines; vocabulary uses open points and dashed lines.
Vocab %>%
  rename(Gender = sex) %>%
  group_by(Gender, year) %>%
  summarize(
    edu_m = mean(education, na.rm = TRUE),
    edu_se = std_error(education),
    voc_m = mean(vocabulary, na.rm = TRUE),
    voc_se = std_error(vocabulary)
  ) %>%
  ggplot(data = ., aes(x = year, y = edu_m, color = Gender)) +
  geom_point(position = position_dodge(.5), size = rel(2)) +
  geom_line(aes(group = Gender), position = position_dodge(.5)) +
  geom_errorbar(
    aes(ymin = edu_m - 2 * edu_se, ymax = edu_m + 2 * edu_se),
    width = .1,
    position = position_dodge(.5)
  ) +
  geom_point(
    aes(y = voc_m),
    position = position_dodge(.5),
    size = rel(2),
    shape = 1
  ) +
  geom_line(
    aes(y = voc_m, group = Gender),
    position = position_dodge(.5),
    linetype = "dashed"
  ) +
  geom_errorbar(
    aes(ymin = voc_m - 2 * voc_se, ymax = voc_m + 2 * voc_se),
    width = .1,
    position = position_dodge(.5)
  ) +
  labs(x = "Year", y = "Average Education Year and Vocabulary") +
  theme_bw()
## Warning: package 'bindrcpp' was built under R version 3.4.4

##3

# Read the long-format response-time data (one row per ID and position); the
# first line of the file holds the column names. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
dta3 <- read.table("probeL.txt", header = TRUE)
head(dta3)
##    ID Response_Time Position
## 1 S01            51        1
## 2 S01            36        2
## 3 S01            50        3
## 4 S01            35        4
## 5 S01            42        5
## 6 S02            27        1
# Long to wide: relabel each position as "Pos_<n>", then spread the response
# times into one column per position so that every ID occupies a single row.
dta3 %>%
  mutate(Position = paste("Pos", Position, sep = "_")) %>%
  spread(Position, Response_Time)
##     ID Pos_1 Pos_2 Pos_3 Pos_4 Pos_5
## 1  S01    51    36    50    35    42
## 2  S02    27    20    26    17    27
## 3  S03    37    22    41    37    30
## 4  S04    42    36    32    34    27
## 5  S05    27    18    33    14    29
## 6  S06    43    32    43    35    40
## 7  S07    41    22    36    25    38
## 8  S08    38    21    31    20    16
## 9  S09    36    23    27    25    28
## 10 S10    26    31    31    32    36
## 11 S11    29    20    25    26    25

##4

# Read the two Nobel tables; the first line of each file holds the column
# names. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
dtaC <- read.table("nobel_countries.txt", header = TRUE)
dtaW <- read.table("nobel_winners.txt", header = TRUE)
# Merge on Year, keeping only the years present in both tables. The key is
# named explicitly so the result cannot silently change if the two tables
# ever gain another column with a shared name.
merge(dtaC, dtaW, by = "Year")
##   Year Country              Name Gender
## 1 1950      UK Bertrand  Russell   Male
## 2 2012   China            Mo Yan   Male
## 3 2013  Canada      Alice  Munro Female
## 4 2014  France   Patrick Modiano   Male
## 5 2016      US        Bob  Dylan   Male
## 6 2017      UK    Kazuo Ishiguro   Male
# all = TRUE also keeps the years found in only one table; the columns that
# come from the other table are filled with NA.
merge(dtaC, dtaW, by = "Year", all = TRUE)
##   Year Country              Name Gender
## 1 1938    <NA>        Pearl Buck Female
## 2 1950      UK Bertrand  Russell   Male
## 3 2011  Sweden              <NA>   <NA>
## 4 2012   China            Mo Yan   Male
## 5 2013  Canada      Alice  Munro Female
## 6 2014  France   Patrick Modiano   Male
## 7 2015  Russia              <NA>   <NA>
## 8 2016      US        Bob  Dylan   Male
## 9 2017      UK    Kazuo Ishiguro   Male

# 取交集
# Intersection (inner join): keep only the rows whose Year appears in both
# tables.

# The join key is named explicitly so it cannot silently change if the
# tables ever gain another shared column; this also suppresses the
# "Joining, by" message.
dplyr::inner_join(dtaW, dtaC, by = "Year")
##                Name Gender Year Country
## 1   Patrick Modiano   Male 2014  France
## 2 Bertrand  Russell   Male 1950      UK
## 3    Kazuo Ishiguro   Male 2017      UK
## 4        Bob  Dylan   Male 2016      US
## 5      Alice  Munro Female 2013  Canada
## 6            Mo Yan   Male 2012   China

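# 兩個資料集都有的資料,僅保留第一個原來的資訊
# Semi join: keep the rows of the first table that have a match in the
# second, returning only the first table's own columns.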

# The join key is named explicitly so it cannot silently change if the
# tables ever gain another shared column; this also suppresses the
# "Joining, by" message.
semi_join(dtaW, dtaC, by = "Year")
##                Name Gender Year
## 1   Patrick Modiano   Male 2014
## 2 Bertrand  Russell   Male 1950
## 3    Kazuo Ishiguro   Male 2017
## 4        Bob  Dylan   Male 2016
## 5      Alice  Munro Female 2013
## 6            Mo Yan   Male 2012

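# 用左邊資料當作索引
# Left join: keep every row of the left (first) table; rows with no match in
# the second table get NA in the added columns.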

# The join key is named explicitly so it cannot silently change if the
# tables ever gain another shared column; this also suppresses the
# "Joining, by" message.
left_join(dtaW, dtaC, by = "Year")
##                Name Gender Year Country
## 1   Patrick Modiano   Male 2014  France
## 2 Bertrand  Russell   Male 1950      UK
## 3    Kazuo Ishiguro   Male 2017      UK
## 4        Bob  Dylan   Male 2016      US
## 5      Alice  Munro Female 2013  Canada
## 6            Mo Yan   Male 2012   China
## 7        Pearl Buck Female 1938    <NA>

# 第一個資料集,把left_join中不共有的列出。
# Anti join: list the rows of the first table that have no match in the
# second table.

# The join key is named explicitly so it cannot silently change if the
# tables ever gain another shared column; this also suppresses the
# "Joining, by" message.
anti_join(dtaW, dtaC, by = "Year")
##         Name Gender Year
## 1 Pearl Buck Female 1938

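# 取聯集
# Union (full join): keep every row from both tables, filling the columns
# that have no match with NA.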

# The join key is named explicitly so it cannot silently change if the
# tables ever gain another shared column; this also suppresses the
# "Joining, by" message.
full_join(dtaW, dtaC, by = "Year")
##                Name Gender Year Country
## 1   Patrick Modiano   Male 2014  France
## 2 Bertrand  Russell   Male 1950      UK
## 3    Kazuo Ishiguro   Male 2017      UK
## 4        Bob  Dylan   Male 2016      US
## 5      Alice  Munro Female 2013  Canada
## 6            Mo Yan   Male 2012   China
## 7        Pearl Buck Female 1938    <NA>
## 8              <NA>   <NA> 2015  Russia
## 9              <NA>   <NA> 2011  Sweden