1
# Read in the data
dta <- read.csv(file = "C:/Users/user/Dropbox/1062-Data_manage/0326/nlsy86long.csv")
# Preview the first rows; the printed output follows as `##` comments
head(x = dta)
## id sex race time grade year month math read
## 1 2390 Female Majority 1 0 6 67 14.285714 19.047619
## 2 2560 Female Majority 1 0 6 66 20.238095 21.428571
## 3 3740 Female Majority 1 0 6 67 17.857143 21.428571
## 4 4020 Male Majority 1 0 5 60 7.142857 7.142857
## 5 6350 Male Majority 1 1 7 78 29.761905 30.952381
## 6 7030 Male Majority 1 0 5 62 14.285714 17.857143
# Wide to long: stack the `math` and `read` columns into a single
# `test_score` column, with `test_var` recording which test each row holds.
#
# `timevar` is set explicitly because `dta` already has a `time` column:
# reshape()'s default `timevar = "time"` would overwrite it with the test
# names and the original values would be lost.
# `idvar = "id.1"` is not an existing column, so reshape() creates it as a
# row index (visible as the last column of the output below).
dtal <- reshape(
  dta,
  direction = "long",
  varying = list(c("math", "read")),
  v.names = "test_score",
  timevar = "test_var",
  times = c("math", "read"),
  idvar = "id.1"
)
# Inspect the reshaped data
head(dtal)
## id sex race time grade year month test_var test_score id.1
## 1.math 2390 Female Majority 1 0 6 67 math 14.285714 1
## 2.math 2560 Female Majority 1 0 6 66 math 20.238095 2
## 3.math 3740 Female Majority 1 0 6 67 math 17.857143 3
## 4.math 4020 Male Majority 1 0 5 60 math 7.142857 4
## 5.math 6350 Male Majority 1 1 7 78 math 29.761905 5
## 6.math 7030 Male Majority 1 0 5 62 math 14.285714 6
2
# Load the Vocab data set from the car package and preview it
dta2 <- car::Vocab
head(x = dta2)
## year sex education vocabulary
## 20040001 2004 Female 9 3
## 20040002 2004 Female 14 6
## 20040003 2004 Male 14 9
## 20040005 2004 Female 17 8
## 20040008 2004 Male 14 1
## 20040010 2004 Male 14 7
# Convert `year` and `sex` to factors
dta2[c("year", "sex")] <- lapply(dta2[c("year", "sex")], factor)
# Plot mean education by year and gender, with error bars of +/- 2 standard
# errors around each mean.
dta2 %>%
  rename(gender = sex) %>%
  group_by(year, gender) %>%
  summarize(
    edu_m = mean(education, na.rm = TRUE),
    # Divide by the number of non-missing values so the standard error is
    # consistent with the NA-dropping sd() above.
    edu_se = sd(education, na.rm = TRUE) / sqrt(sum(!is.na(education)))
  ) %>%
  ggplot(data = ., aes(x = year, y = edu_m, color = gender)) +
  geom_point() +
  geom_line(aes(group = gender)) +
  geom_errorbar(
    aes(ymin = edu_m - 2 * edu_se, ymax = edu_m + 2 * edu_se),
    width = .1
  ) +
  labs(x = "Year", y = "Average Education score") +
  theme_bw()
# Author's reading of the plot: education scores rise year over year for both
# men and women, and men's scores are higher than women's.
# Plot mean vocabulary score by year and gender, with error bars of +/- 2
# standard errors around each mean.
dta2 %>%
  rename(gender = sex) %>%
  group_by(year, gender) %>%
  summarize(
    voc_m = mean(vocabulary, na.rm = TRUE),
    # Divide by the number of non-missing values so the standard error is
    # consistent with the NA-dropping sd() above.
    voc_se = sd(vocabulary, na.rm = TRUE) / sqrt(sum(!is.na(vocabulary)))
  ) %>%
  ggplot(data = ., aes(x = year, y = voc_m, color = gender)) +
  geom_point() +
  geom_line(aes(group = gender)) +
  geom_errorbar(
    aes(ymin = voc_m - 2 * voc_se, ymax = voc_m + 2 * voc_se),
    width = .1
  ) +
  labs(x = "Year", y = "Average Vocabulary score") +
  theme_bw()
# Author's reading of the plot: women's and men's vocabulary scores are very
# close, with a slight upward trend over the years.
3
# Read in the whitespace-separated data file
probel <- read.csv(
  file = "C:/Users/user/Dropbox/1062-Data_manage/0326/probel.txt",
  sep = ""
)
# Preview the first rows; the printed output follows as `##` comments
head(x = probel)
## ID Response_Time Position
## 1 S01 51 1
## 2 S01 36 2
## 3 S01 50 3
## 4 S01 35 4
## 5 S01 42 5
## 6 S02 27 1
# Reshape from long to wide: one row per ID, with one Response_Time column
# for each value of Position.
prob_l <- reshape(probel, idvar = "ID", timevar = "Position", direction = "wide")
# Rename the columns: "Response_Time.<k>" becomes "RT<k>".
# fixed = TRUE treats the "." as a literal dot instead of a regex wildcard.
namelist <- sub("Response_Time.", "RT", names(prob_l), fixed = TRUE)
colnames(prob_l) <- namelist
# Inspect the result
head(prob_l)
## ID RT1 RT2 RT3 RT4 RT5
## 1 S01 51 36 50 35 42
## 6 S02 27 20 26 17 27
## 11 S03 37 22 41 37 30
## 16 S04 42 36 32 34 27
## 21 S05 27 18 33 14 29
## 26 S06 43 32 43 35 40
4
# Read the two whitespace-separated data files
country <- read.csv("C:/Users/user/Dropbox/1062-Data_manage/0326/country.txt", sep = "")
winer <- read.csv("C:/Users/user/Dropbox/1062-Data_manage/0326/winner.txt", sep = "")
# Inner join: keep only the rows of `country` whose Year has a match in
# `winer`, and attach the matching Name and Gender.
# The join key is spelled out instead of relying on dplyr's guess.
inner_join(country, winer, by = "Year")
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
# Semi join: keep the rows of `country` whose Year has a match in `winer`,
# without adding the Name and Gender columns.
semi_join(country, winer, by = "Year")
## Country Year
## 1 France 2014
## 2 UK 1950
## 3 UK 2017
## 4 US 2016
## 5 Canada 2013
## 6 China 2012
# Left join: keep every row of `country` and attach the matching Name and
# Gender from `winer`; rows without a match get missing values (NA).
left_join(country, winer, by = "Year")
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
# Anti join: keep only the rows of `country` (Country and Year) whose Year
# has no match in `winer`.
anti_join(country, winer, by = "Year")
## Country Year
## 1 Russia 2015
## 2 Sweden 2011
# Full join: keep every row from both tables, matched on Year; values with
# no counterpart in the other table are filled with missing values (NA).
full_join(country, winer, by = "Year")
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
## 9 <NA> 1938 Pearl Buck Female