# Load the NLSY86 long-format data set from the course server and inspect it.
#
# NOTE(review): setwd() to an absolute, user-specific path makes the script
# non-portable. It is left in place because `passwd.txt` below is resolved
# relative to this directory.
setwd("/Users/tayloryen/Desktop/大學/成大課業/大四下/資料管理/0326")
# `passwd.txt` is executed as R code; it presumably defines `IDPW`, the
# credential fragment spliced into the URL below -- TODO confirm.
source("passwd.txt")
link1 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Data/nlsy86long.csv"
)
# stringsAsFactors is set explicitly so that `sex` and `race` are read as
# factors (as in the str() output below) whatever the R version's default is.
dta <- read.csv(link1, stringsAsFactors = TRUE)
head(dta)
## id sex race time grade year month math read
## 1 2390 Female Majority 1 0 6 67 14.285714 19.047619
## 2 2560 Female Majority 1 0 6 66 20.238095 21.428571
## 3 3740 Female Majority 1 0 6 67 17.857143 21.428571
## 4 4020 Male Majority 1 0 5 60 7.142857 7.142857
## 5 6350 Male Majority 1 1 7 78 29.761905 30.952381
## 6 7030 Male Majority 1 0 5 62 14.285714 17.857143
str(dta)
## 'data.frame': 664 obs. of 9 variables:
## $ id : int 2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
## $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 1 1 1 1 1 1 1 1 1 1 ...
## $ grade: int 0 0 0 0 1 0 0 0 0 0 ...
## $ year : int 6 6 6 5 7 5 6 7 6 6 ...
## $ month: int 67 66 67 60 78 62 66 79 76 67 ...
## $ math : num 14.29 20.24 17.86 7.14 29.76 ...
## $ read : num 19.05 21.43 21.43 7.14 30.95 ...
# Stack the `math` and `read` score columns of `dta` into one `test_score`
# column, with `test_var` recording which test each row came from.
#
# `timevar` is named explicitly: reshape()'s default name is "time", which
# would overwrite the existing `time` column of `dta` with the test labels.
# Naming it "test_var" keeps the original `time` values and makes the former
# rename-by-position step unnecessary.
#
# `idvar = "ID"` names a column that does not exist in `dta`, so reshape()
# creates it as a running row number of the input rows; `id` itself is kept
# as an ordinary column.
new_dta <- reshape(
  dta,
  direction = "long",
  idvar = "ID",
  varying = list(c("math", "read")),
  v.names = "test_score",
  timevar = "test_var",
  times = c("math", "read")
)
head(new_dta)
## Expected output (the original `time` column is now preserved):
## id sex race time grade year month test_var test_score ID
## 1.math 2390 Female Majority 1 0 6 67 math 14.285714 1
## 2.math 2560 Female Majority 1 0 6 66 math 20.238095 2
## 3.math 3740 Female Majority 1 0 6 67 math 17.857143 3
## 4.math 4020 Male Majority 1 0 5 60 math 7.142857 4
## 5.math 6350 Male Majority 1 1 7 78 math 29.761905 5
## 6.math 7030 Male Majority 1 0 5 62 math 14.285714 6
# Take a working copy of the `Vocab` data set from the car namespace and
# inspect it.
# NOTE(review): recent car releases may no longer export `Vocab` (data sets
# were moved to a companion package) -- confirm against the installed version.
dta2 <- car::Vocab
head(dta2)
## year sex education vocabulary
## 20040001 2004 Female 9 3
## 20040002 2004 Female 14 6
## 20040003 2004 Male 14 9
## 20040005 2004 Female 17 8
## 20040008 2004 Male 14 1
## 20040010 2004 Male 14 7
str(dta2)
## 'data.frame': 21638 obs. of 4 variables:
## $ year : int 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
## $ education : int 9 14 14 17 14 14 12 10 11 9 ...
## $ vocabulary: int 3 6 9 8 1 7 6 6 5 1 ...
# Re-encode both grouping variables as factors; `year` is an integer on load
# and becomes a categorical variable here.
group_cols <- c("year", "sex")
dta2[group_cols] <- lapply(dta2[group_cols], factor)
# Plot (Vocabulary): mean vocabulary score per year and sex, with error bars
# of +/- 2 standard errors.
#
# Calls are namespace-qualified so the block does not depend on dplyr or
# ggplot2 having been attached earlier in the script.
voc_summary <- dplyr::group_by(dta2, year, sex)
voc_summary <- dplyr::summarize(
  voc_summary,
  voc_m = mean(vocabulary, na.rm = TRUE),
  # The standard error divides by the number of non-missing scores, matching
  # the na.rm = TRUE used for the mean and the standard deviation.
  voc_se = sd(vocabulary, na.rm = TRUE) / sqrt(sum(!is.na(vocabulary)))
)
voc_summary <- dplyr::ungroup(voc_summary)

ggplot2::ggplot(
  data = voc_summary,
  mapping = ggplot2::aes(x = year, y = voc_m, color = sex)
) +
  ggplot2::geom_point() +
  # `year` is a factor, so an explicit group is needed to connect the points.
  ggplot2::geom_line(ggplot2::aes(group = sex)) +
  ggplot2::geom_errorbar(
    ggplot2::aes(ymin = voc_m - 2 * voc_se, ymax = voc_m + 2 * voc_se),
    width = .1
  ) +
  ggplot2::labs(x = "Year", y = "Average Vocabulary score") +
  ggplot2::theme_bw()
# 由圖可知,男性和女性的字彙分數相近,逐年呈現上下震盪的趨勢。
# Plot (Education): mean education value per year and sex, with error bars
# of +/- 2 standard errors.
#
# Calls are namespace-qualified so the block does not depend on dplyr or
# ggplot2 having been attached earlier in the script.
edu_summary <- dplyr::group_by(dta2, year, sex)
edu_summary <- dplyr::summarize(
  edu_summary,
  edu_m = mean(education, na.rm = TRUE),
  # The standard error divides by the number of non-missing values, matching
  # the na.rm = TRUE used for the mean and the standard deviation.
  edu_se = sd(education, na.rm = TRUE) / sqrt(sum(!is.na(education)))
)
edu_summary <- dplyr::ungroup(edu_summary)

ggplot2::ggplot(
  data = edu_summary,
  mapping = ggplot2::aes(x = year, y = edu_m, color = sex)
) +
  ggplot2::geom_point() +
  # `year` is a factor, so an explicit group is needed to connect the points.
  ggplot2::geom_line(ggplot2::aes(group = sex)) +
  ggplot2::geom_errorbar(
    ggplot2::aes(ymin = edu_m - 2 * edu_se, ymax = edu_m + 2 * edu_se),
    width = .1
  ) +
  ggplot2::labs(x = "Year", y = "Average Education score") +
  ggplot2::theme_bw()
# 由圖可知,男性和女性的教育分數皆逐年上升,且男性分數比女性高。
# Load the probe response-time data (long form: one row per subject ID and
# Position) from the course server and inspect it.
# `IDPW` must already be defined earlier in the script.
link1 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Data/probeL.txt"
)
# header is spelled TRUE (T is an ordinary, reassignable variable), and
# stringsAsFactors is pinned so `ID` is a factor, as in the str() output
# below, whatever the R version's default is.
dta3 <- read.table(link1, header = TRUE, sep = "", stringsAsFactors = TRUE)
head(dta3)
## ID Response_Time Position
## 1 S01 51 1
## 2 S01 36 2
## 3 S01 50 3
## 4 S01 35 4
## 5 S01 42 5
## 6 S02 27 1
str(dta3)
## 'data.frame': 55 obs. of 3 variables:
## $ ID : Factor w/ 11 levels "S01","S02","S03",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ Response_Time: int 51 36 50 35 42 27 20 26 17 27 ...
## $ Position : int 1 2 3 4 5 1 2 3 4 5 ...
# Spread the long probe data to wide form: one row per `ID`, with one
# `Response_Time.<Position>` column for each value of `Position`.
dta3_1 <- reshape(
  dta3,
  direction = "wide",
  timevar = "Position",
  idvar = "ID"
)
head(dta3_1)
## ID Response_Time.1 Response_Time.2 Response_Time.3 Response_Time.4
## 1 S01 51 36 50 35
## 6 S02 27 20 26 17
## 11 S03 37 22 41 37
## 16 S04 42 36 32 34
## 21 S05 27 18 33 14
## 26 S06 43 32 43 35
## Response_Time.5
## 1 42
## 6 27
## 11 30
## 16 27
## 21 29
## 26 40
# Download the two Nobel tables used in the join examples that follow.
# `IDPW` must already be defined earlier in the script.
# header is spelled TRUE because T is an ordinary, reassignable variable.
link2 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Rdw/data/nobel_countries.txt"
)
dta4_1 <- read.table(link2, header = TRUE)
link3 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Rdw/data/nobel_winners.txt"
)
dta4_2 <- read.table(link3, header = TRUE)
# Inner join: only rows whose `Year` occurs in both tables are matched; all
# other rows are dropped.
# The key is given explicitly instead of being guessed from shared column
# names, and the call is namespace-qualified so it does not depend on dplyr
# being attached.
dplyr::inner_join(dta4_1, dta4_2, by = "Year")
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
# Semi join: keep the rows of the first table (dta4_1) whose `Year` also
# occurs in dta4_2; only dta4_1's columns are returned.
# The key is given explicitly instead of being guessed from shared column
# names, and the call is namespace-qualified so it does not depend on dplyr
# being attached.
dplyr::semi_join(dta4_1, dta4_2, by = "Year")
## Country Year
## 1 France 2014
## 2 UK 1950
## 3 UK 2017
## 4 US 2016
## 5 Canada 2013
## 6 China 2012
# Left join: keep every row of the first table (dta4_1) and attach the
# matching columns of the second table (dta4_2); unmatched rows get NA.
# The key is given explicitly instead of being guessed from shared column
# names, and the call is namespace-qualified so it does not depend on dplyr
# being attached.
dplyr::left_join(dta4_1, dta4_2, by = "Year")
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
# Anti join: the rows of the first table (dta4_1) that have no match in the
# second table (dta4_2); only dta4_1's columns are kept.
# The key is given explicitly instead of being guessed from shared column
# names, and the call is namespace-qualified so it does not depend on dplyr
# being attached.
dplyr::anti_join(dta4_1, dta4_2, by = "Year")
## Country Year
## 1 Russia 2015
## 2 Sweden 2011
# Full join: match the two tables on their common variable and keep all rows
# from both; cells with no data are filled with the missing value <NA>.
# The key is given explicitly instead of being guessed from shared column
# names, and the call is namespace-qualified so it does not depend on dplyr
# being attached.
dplyr::full_join(dta4_1, dta4_2, by = "Year")
## Country Year Name Gender
## 1 France 2014 Patrick Modiano Male
## 2 UK 1950 Bertrand Russell Male
## 3 UK 2017 Kazuo Ishiguro Male
## 4 US 2016 Bob Dylan Male
## 5 Canada 2013 Alice Munro Female
## 6 China 2012 Mo Yan Male
## 7 Russia 2015 <NA> <NA>
## 8 Sweden 2011 <NA> <NA>
## 9 <NA> 1938 Pearl Buck Female