1

# Read the data from CSV and preview the first six rows.
dta <- read.csv(
  file = "C:/Users/user/Dropbox/1062-Data_manage/0326/nlsy86long.csv"
)
head(dta, n = 6L)

##     id    sex     race time grade year month      math      read
## 1 2390 Female Majority    1     0    6    67 14.285714 19.047619
## 2 2560 Female Majority    1     0    6    66 20.238095 21.428571
## 3 3740 Female Majority    1     0    6    67 17.857143 21.428571
## 4 4020   Male Majority    1     0    5    60  7.142857  7.142857
## 5 6350   Male Majority    1     1    7    78 29.761905 30.952381
## 6 7030   Male Majority    1     0    5    62 14.285714 17.857143

# Wide to long: stack the `math` and `read` columns into one `test_score`
# column, with `test_var` recording which test each score came from.
#
# `timevar` is set explicitly. reshape()'s default is "time", and `dta` already
# has a `time` column; the default would overwrite it with the test labels and
# the original values would be lost.
# `id.1` is not an existing column, so reshape() creates it as a row identifier
# (1..nrow(dta)); `id` alone repeats across rows in the preview above.
dtal <- reshape(
  dta,
  direction = "long",
  varying = list(c("math", "read")),
  v.names = "test_score",
  timevar = "test_var",
  times = c("math", "read"),
  idvar = "id.1"
)
# Inspect the reshaped data.
head(dtal)

##          id    sex     race time grade year month test_var test_score id.1
## 1.math 2390 Female Majority    1     0    6    67     math  14.285714    1
## 2.math 2560 Female Majority    1     0    6    66     math  20.238095    2
## 3.math 3740 Female Majority    1     0    6    67     math  17.857143    3
## 4.math 4020   Male Majority    1     0    5    60     math   7.142857    4
## 5.math 6350   Male Majority    1     1    7    78     math  29.761905    5
## 6.math 7030   Male Majority    1     0    5    62     math  14.285714    6

2

# Load the Vocab data set from the car package and preview it.
dta2 <- car::Vocab
head(dta2, n = 6L)

##          year    sex education vocabulary
## 20040001 2004 Female         9          3
## 20040002 2004 Female        14          6
## 20040003 2004   Male        14          9
## 20040005 2004 Female        17          8
## 20040008 2004   Male        14          1
## 20040010 2004   Male        14          7

# Convert `year` and `sex` to factors.
dta2$year <- factor(dta2$year)
dta2$sex <- factor(dta2$sex)

# Plot mean education by year and gender, with error bars of +/- 2 standard
# errors around each mean.
dta2 %>%
  rename(gender = sex) %>%
  group_by(year, gender) %>%
  summarize(
    edu_m = mean(education, na.rm = TRUE),
    # Divide by the number of non-missing values so the standard error uses
    # the same observations as mean() and sd(); n() would also count NA rows.
    edu_se = sd(education, na.rm = TRUE) / sqrt(sum(!is.na(education)))
  ) %>%
  # Drop the leftover grouping so it does not leak into later steps.
  ungroup() %>%
  ggplot(aes(x = year, y = edu_m, color = gender)) +
  geom_point() +
  # `year` is a factor, so the lines need an explicit group to connect points.
  geom_line(aes(group = gender)) +
  geom_errorbar(
    aes(ymin = edu_m - 2 * edu_se, ymax = edu_m + 2 * edu_se),
    width = .1
  ) +
  labs(x = "Year", y = "Average Education score") +
  theme_bw()

隨著年份增加，無論男性或女性的教育分數都逐年上升，且男性分數比女性高

# Plot mean vocabulary score by year and gender, with error bars of +/- 2
# standard errors around each mean.
dta2 %>%
  rename(gender = sex) %>%
  group_by(year, gender) %>%
  summarize(
    voc_m = mean(vocabulary, na.rm = TRUE),
    # Divide by the number of non-missing values so the standard error uses
    # the same observations as mean() and sd(); n() would also count NA rows.
    voc_se = sd(vocabulary, na.rm = TRUE) / sqrt(sum(!is.na(vocabulary)))
  ) %>%
  # Drop the leftover grouping so it does not leak into later steps.
  ungroup() %>%
  ggplot(aes(x = year, y = voc_m, color = gender)) +
  geom_point() +
  # `year` is a factor, so the lines need an explicit group to connect points.
  geom_line(aes(group = gender)) +
  geom_errorbar(
    aes(ymin = voc_m - 2 * voc_se, ymax = voc_m + 2 * voc_se),
    width = .1
  ) +
  # Axis label typo fixed ("Vocabulart" -> "Vocabulary").
  labs(x = "Year", y = "Average Vocabulary score") +
  theme_bw()

女性與男性的字彙分數很接近，有些微逐年上升的趨勢

3

# Read the whitespace-delimited probel file and preview the first six rows.
probel <- read.csv(
  file = "C:/Users/user/Dropbox/1062-Data_manage/0326/probel.txt",
  sep = ""
)
head(probel, n = 6L)

##    ID Response_Time Position
## 1 S01            51        1
## 2 S01            36        2
## 3 S01            50        3
## 4 S01            35        4
## 5 S01            42        5
## 6 S02            27        1

# Reshape from long to wide: one row per ID, with one Response_Time column
# for each value of Position.
prob_l <- reshape(
  probel,
  idvar = "ID",
  timevar = "Position",
  direction = "wide"
)

# Shorten the generated "Response_Time.<position>" column names to
# "RT<position>". The pattern is anchored and the dot is escaped so that only
# the literal "Response_Time." prefix is replaced; an unescaped "." would
# match any character.
namelist <- sub("^Response_Time\\.", "RT", names(prob_l))
colnames(prob_l) <- namelist

# Preview the wide data.
head(prob_l)

##     ID RT1 RT2 RT3 RT4 RT5
## 1  S01  51  36  50  35  42
## 6  S02  27  20  26  17  27
## 11 S03  37  22  41  37  30
## 16 S04  42  36  32  34  27
## 21 S05  27  18  33  14  29
## 26 S06  43  32  43  35  40

4

# Read the two whitespace-delimited tables used in the join examples.
country <- read.csv(
  file = "C:/Users/user/Dropbox/1062-Data_manage/0326/country.txt",
  sep = ""
)
winer <- read.csv(
  file = "C:/Users/user/Dropbox/1062-Data_manage/0326/winner.txt",
  sep = ""
)

以 Year 為鍵，將 country 與 winner 中年份對得上的資料合併，只顯示配對成功的國家、年份、人名與性別

# Keep only the rows whose Year appears in both tables. The key is named
# explicitly so the join does not depend on whichever columns happen to share
# a name; this also suppresses the "Joining, by" message.
inner_join(country, winer, by = "Year")

##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male

以 Year 為鍵，只保留 country 中在 winner 裡有對應年份的列，但不加入人名與性別欄位

# Keep the rows of `country` whose Year has a match in `winer`, without adding
# any columns from `winer`. The key is named explicitly so the join does not
# depend on shared column names; this also suppresses the "Joining, by" message.
semi_join(country, winer, by = "Year")

##   Country Year
## 1  France 2014
## 2      UK 1950
## 3      UK 2017
## 4      US 2016
## 5  Canada 2013
## 6   China 2012

保留 country 的所有列，並附上 winner 中年份相符的人名與性別；對不上的列則填入遺漏值 (NA)

# Keep every row of `country` and attach the matching `winer` columns; rows
# with no match get NA. The key is named explicitly so the join does not
# depend on shared column names; this also suppresses the "Joining, by" message.
left_join(country, winer, by = "Year")

##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>

找出 country 中在 winner 裡找不到對應年份的列，只留下沒有配對成功的國家與年份

# Keep the rows of `country` whose Year has no match in `winer`. The key is
# named explicitly so the join does not depend on shared column names; this
# also suppresses the "Joining, by" message.
anti_join(country, winer, by = "Year")

##   Country Year
## 1  Russia 2015
## 2  Sweden 2011

全部配對起來，配不起來就留遺漏值

# Keep every row from both tables, filling NA wherever one side has no match.
# The key is named explicitly so the join does not depend on shared column
# names; this also suppresses the "Joining, by" message.
full_join(country, winer, by = "Year")

##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>
## 9    <NA> 1938        Pearl Buck Female

Inclass 0326

Yifang

2018-03-26

1

2

3

4

以 Year 為鍵，將 country 與 winner 中年份對得上的資料合併，只顯示配對成功的國家、年份、人名與性別

以 Year 為鍵，只保留 country 中在 winner 裡有對應年份的列，但不加入人名與性別欄位

保留 country 的所有列，並附上 winner 中年份相符的人名與性別；對不上的列則填入遺漏值 (NA)

找出 country 中在 winner 裡找不到對應年份的列，只留下沒有配對成功的國家與年份

全部配對起來，配不起來就留遺漏值