Week 4 In-Class Excercise

D84039034 莊霈淳 Patricia Chuang

26 三月, 2018


Q1 Create a variable “test_var” to store the labels: ‘math’ and ‘read’ and a variable “test_score” to store their corresponding values and expand the data set to a long format.

dta <- read.csv("D:/data_management/week4/nlsy86long.csv")
head(dta)
##     id    sex     race time grade year month      math      read
## 1 2390 Female Majority    1     0    6    67 14.285714 19.047619
## 2 2560 Female Majority    1     0    6    66 20.238095 21.428571
## 3 3740 Female Majority    1     0    6    67 17.857143 21.428571
## 4 4020   Male Majority    1     0    5    60  7.142857  7.142857
## 5 6350   Male Majority    1     1    7    78 29.761905 30.952381
## 6 7030   Male Majority    1     0    5    62 14.285714 17.857143
str(dta)
## 'data.frame':    664 obs. of  9 variables:
##  $ id   : int  2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
##  $ sex  : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
##  $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
##  $ time : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ grade: int  0 0 0 0 1 0 0 0 0 0 ...
##  $ year : int  6 6 6 5 7 5 6 7 6 6 ...
##  $ month: int  67 66 67 60 78 62 66 79 76 67 ...
##  $ math : num  14.29 20.24 17.86 7.14 29.76 ...
##  $ read : num  19.05 21.43 21.43 7.14 30.95 ...
pacman::p_load(mlmRev, HSAUR3, knitr, kableExtra,
               readr, dplyr, ggplot2, tidyr, car,
               magrittr, tibble, purrr, stringr)
head(dta)
##     id    sex     race time grade year month      math      read
## 1 2390 Female Majority    1     0    6    67 14.285714 19.047619
## 2 2560 Female Majority    1     0    6    66 20.238095 21.428571
## 3 3740 Female Majority    1     0    6    67 17.857143 21.428571
## 4 4020   Male Majority    1     0    5    60  7.142857  7.142857
## 5 6350   Male Majority    1     1    7    78 29.761905 30.952381
## 6 7030   Male Majority    1     0    5    62 14.285714 17.857143
gather(dta, key = test_var, value= test_score, math, read) %>% head
##     id    sex     race time grade year month test_var test_score
## 1 2390 Female Majority    1     0    6    67     math  14.285714
## 2 2560 Female Majority    1     0    6    66     math  20.238095
## 3 3740 Female Majority    1     0    6    67     math  17.857143
## 4 4020   Male Majority    1     0    5    60     math   7.142857
## 5 6350   Male Majority    1     1    7    78     math  29.761905
## 6 7030   Male Majority    1     0    5    62     math  14.285714

Q2 Education and vocabulary over the years by gender

dta2 <- Vocab
head(dta2)
##          year    sex education vocabulary
## 20040001 2004 Female         9          3
## 20040002 2004 Female        14          6
## 20040003 2004   Male        14          9
## 20040005 2004 Female        17          8
## 20040008 2004   Male        14          1
## 20040010 2004   Male        14          7

畫圖看看男女性別在字彙逐年表現折線圖

dta2 %>%
  rename(gender = sex) %>%
  group_by(year, gender) %>%
  summarize(voc_m= mean(vocabulary, na.rm = T),
            voc_se= sd(vocabulary, na.rm = T)/sqrt(n())) %>%
  ggplot(data = ., aes(x = year, y = voc_m, color = gender)) +
  geom_point() +
  geom_line(aes(group = gender)) +
  geom_errorbar(aes(ymin = voc_m - 2*voc_se, ymax = voc_m + 2*voc_se), width = .1) + 
  labs(x = "Year", y = "Average Vocabulary score") +
  theme_gray()
## Warning: package 'bindrcpp' was built under R version 3.3.3

畫每年的分數散佈圖

ggplot(dta2, aes(x = education, y = vocabulary, group = sex)) +
  geom_point(shape = 1 ,aes(color = sex)) +
  stat_smooth(method = "lm",  se = F, aes(color = sex)) +
  facet_wrap( ~ factor(year)) +
  labs(x ="education", y = "vocabulary") +
  theme_gray()

Q3 Convert the data set probe words from long to wide format as described

dta3 <- read.table("probeL.txt", header = T)
head(dta3)
##    ID Response_Time Position
## 1 S01            51        1
## 2 S01            36        2
## 3 S01            50        3
## 4 S01            35        4
## 5 S01            42        5
## 6 S02            27        1

把Position的data依五種不同數據展開

dta3 %>% 
  mutate(Position = paste("Pos", Position, sep = "-")) %>% 
  spread(Position, Response_Time)
##     ID Pos-1 Pos-2 Pos-3 Pos-4 Pos-5
## 1  S01    51    36    50    35    42
## 2  S02    27    20    26    17    27
## 3  S03    37    22    41    37    30
## 4  S04    42    36    32    34    27
## 5  S05    27    18    33    14    29
## 6  S06    43    32    43    35    40
## 7  S07    41    22    36    25    38
## 8  S08    38    21    31    20    16
## 9  S09    36    23    27    25    28
## 10 S10    26    31    31    32    36
## 11 S11    29    20    25    26    25

Q4 Reverse the order of input to the series using *_join function

看一下兩筆資料的結構

country <-read.table("nobel_countries.txt", header = T)
winner <-read.table("nobel_winners.txt", header = T)
head(country)
##   Country Year
## 1  France 2014
## 2      UK 1950
## 3      UK 2017
## 4      US 2016
## 5  Canada 2013
## 6   China 2012
head(winner)
##                Name Gender Year
## 1   Patrick Modiano   Male 2014
## 2 Bertrand  Russell   Male 1950
## 3    Kazuo Ishiguro   Male 2017
## 4        Bob  Dylan   Male 2016
## 5      Alice  Munro Female 2013
## 6            Mo Yan   Male 2012

兩筆資料共同部分合併

inner_join(country, winner)
## Joining, by = "Year"
##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male

共同部分中僅保留country的部分

semi_join(country, winner)
## Joining, by = "Year"
##   Country Year
## 1  France 2014
## 2      UK 1950
## 3      UK 2017
## 4      US 2016
## 5  Canada 2013
## 6   China 2012

country的row為索引,跟winner資料合併

left_join(country, winner)
## Joining, by = "Year"
##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>

由country的資料,找出兩筆資料中沒辦法合併的欄位

anti_join(country, winner)
## Joining, by = "Year"
##   Country Year
## 1  Russia 2015
## 2  Sweden 2011

兩筆資料直接合併

full_join(country, winner)
## Joining, by = "Year"
##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>
## 9    <NA> 1938        Pearl Buck Female

end