0326in_class Exercise

Question 1

讀取資料、查看前六筆資料、查看資料結構

# Switch to the course folder so the relative path "passwd.txt" resolves.
setwd("/Users/tayloryen/Desktop/大學/成大課業/大四下/資料管理/0326")
# NOTE(review): passwd.txt presumably defines IDPW, the credential prefix that
# is spliced into the data URLs — confirm against that file.
source("passwd.txt")
link1 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Data/nlsy86long.csv"
)
# Request factors explicitly: since R 4.0.0 read.csv() keeps strings as
# character by default, which would no longer match the factor columns shown
# in the recorded str() output for this data set.
dta <- read.csv(link1, stringsAsFactors = TRUE)
# Preview the first six rows.
head(dta)

##     id    sex     race time grade year month      math      read
## 1 2390 Female Majority    1     0    6    67 14.285714 19.047619
## 2 2560 Female Majority    1     0    6    66 20.238095 21.428571
## 3 3740 Female Majority    1     0    6    67 17.857143 21.428571
## 4 4020   Male Majority    1     0    5    60  7.142857  7.142857
## 5 6350   Male Majority    1     1    7    78 29.761905 30.952381
## 6 7030   Male Majority    1     0    5    62 14.285714 17.857143

# Compact overview of the column types in the NLSY data frame.
str(object = dta)

## 'data.frame':    664 obs. of  9 variables:
##  $ id   : int  2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
##  $ sex  : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
##  $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
##  $ time : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ grade: int  0 0 0 0 1 0 0 0 0 0 ...
##  $ year : int  6 6 6 5 7 5 6 7 6 6 ...
##  $ month: int  67 66 67 60 78 62 66 79 76 67 ...
##  $ math : num  14.29 20.24 17.86 7.14 29.76 ...
##  $ read : num  19.05 21.43 21.43 7.14 30.95 ...

使用reshape處理資料、查看前六筆資料

# Stack the math and read scores into a single long-format score column.
#  - idvar: name existing columns so reshape() does not append a synthetic
#    row-number "ID" column. Assumes each (id, time) pair occurs once in dta.
#  - timevar: by default reshape() writes its index into a column named
#    "time", which would overwrite the existing time column of dta; store the
#    test name in "test_var" instead so no rename is needed afterwards.
new_dta <- reshape(
  dta,
  idvar = c("id", "time"),
  varying = list(c("math", "read")),
  timevar = "test_var",
  times = c("math", "read"),
  v.names = "test_score",
  direction = "long"
)
# Drop the generated compound row names; they only repeat column values.
rownames(new_dta) <- NULL
head(new_dta)

##     id    sex     race time grade year month test_var test_score
## 1 2390 Female Majority    1     0    6    67     math  14.285714
## 2 2560 Female Majority    1     0    6    66     math  20.238095
## 3 3740 Female Majority    1     0    6    67     math  17.857143
## 4 4020   Male Majority    1     0    5    60     math   7.142857
## 5 6350   Male Majority    1     1    7    78     math  29.761905
## 6 7030   Male Majority    1     0    5    62     math  14.285714

Question 2

讀資料、查看資料前六項、查看資料格式、將“year”,“sex”資料格式改成Factor

# Take a working copy of the Vocab data set through the car namespace.
# NOTE(review): newer car releases may ship their data sets in carData —
# confirm that car::Vocab still resolves in the installed version.
dta2 <- car::Vocab
# Preview the first six rows.
head(x = dta2)

##          year    sex education vocabulary
## 20040001 2004 Female         9          3
## 20040002 2004 Female        14          6
## 20040003 2004   Male        14          9
## 20040005 2004 Female        17          8
## 20040008 2004   Male        14          1
## 20040010 2004   Male        14          7

# Compact overview of the column types in the Vocab copy.
str(object = dta2)

## 'data.frame':    21638 obs. of  4 variables:
##  $ year      : int  2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
##  $ education : int  9 14 14 17 14 14 12 10 11 9 ...
##  $ vocabulary: int  3 6 9 8 1 7 6 6 5 1 ...

# Turn year and sex into factors in one pass so both are treated as
# categorical variables from here on.
to_factor <- c("year", "sex")
dta2[to_factor] <- lapply(dta2[to_factor], factor)

繪圖

# Plot (Vocabulary): mean vocabulary score for each year and sex, with error
# bars spanning two standard errors on either side of the mean.
dta2 %>%
  group_by(year, sex) %>%
  summarize(
    voc_m = mean(vocabulary, na.rm = TRUE),
    # Divide by the count of non-missing scores so the denominator matches the
    # values sd() actually used; n() would also count rows whose score is NA.
    voc_se = sd(vocabulary, na.rm = TRUE) / sqrt(sum(!is.na(vocabulary)))
  ) %>%
  ggplot(data = ., aes(x = year, y = voc_m, color = sex)) +
  geom_point() +
  # year is a factor, so the lines need an explicit group to connect points.
  geom_line(aes(group = sex)) +
  geom_errorbar(
    aes(ymin = voc_m - 2 * voc_se, ymax = voc_m + 2 * voc_se),
    width = .1
  ) +
  labs(x = "Year", y = "Average Vocabulary score") +
  theme_bw()

由圖可知，男性和女性的字彙分數相近，逐年呈現上下震盪趨勢

# Plot (Education): mean education value for each year and sex, with error
# bars spanning two standard errors on either side of the mean.
dta2 %>%
  group_by(year, sex) %>%
  summarize(
    edu_m = mean(education, na.rm = TRUE),
    # Divide by the count of non-missing values so the denominator matches the
    # values sd() actually used; n() would also count rows whose value is NA.
    edu_se = sd(education, na.rm = TRUE) / sqrt(sum(!is.na(education)))
  ) %>%
  ggplot(data = ., aes(x = year, y = edu_m, color = sex)) +
  geom_point() +
  # year is a factor, so the lines need an explicit group to connect points.
  geom_line(aes(group = sex)) +
  geom_errorbar(
    aes(ymin = edu_m - 2 * edu_se, ymax = edu_m + 2 * edu_se),
    width = .1
  ) +
  labs(x = "Year", y = "Average Education score") +
  theme_bw()

由圖可知，男生和女生教育分數逐年上升，男性分數比女性高

Question 3

讀資料、查看前六筆資料、查看資料格式

# Build the URL of the probe-word data; link1 is reused from Question 1.
# IDPW must already exist in the session (it is not defined in this chunk).
link1 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Data/probeL.txt"
)
# sep = "" splits on any run of white space. Factors are requested explicitly
# because R >= 4.0.0 would otherwise read ID as character, unlike the factor
# shown in the recorded str() output. TRUE replaces the reassignable alias T.
dta3 <- read.table(link1, header = TRUE, sep = "", stringsAsFactors = TRUE)
# Preview the first six rows.
head(dta3)

##    ID Response_Time Position
## 1 S01            51        1
## 2 S01            36        2
## 3 S01            50        3
## 4 S01            35        4
## 5 S01            42        5
## 6 S02            27        1

# Compact overview of the column types in the probe data.
str(object = dta3)

## 'data.frame':    55 obs. of  3 variables:
##  $ ID           : Factor w/ 11 levels "S01","S02","S03",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ Response_Time: int  51 36 50 35 42 27 20 26 17 27 ...
##  $ Position     : int  1 2 3 4 5 1 2 3 4 5 ...

使用reshape處理資料、查看前六筆資料

# Spread the long probe data to one row per ID, with one Response_Time column
# for each value of Position.
dta3_1 <- reshape(
  data = dta3,
  direction = "wide",
  idvar = "ID",
  timevar = "Position"
)
# Preview the first six rows of the wide table.
head(x = dta3_1)

##     ID Response_Time.1 Response_Time.2 Response_Time.3 Response_Time.4
## 1  S01              51              36              50              35
## 6  S02              27              20              26              17
## 11 S03              37              22              41              37
## 16 S04              42              36              32              34
## 21 S05              27              18              33              14
## 26 S06              43              32              43              35
##    Response_Time.5
## 1               42
## 6               27
## 11              30
## 16              27
## 21              29
## 26              40

Question 4

讀取資料

# Build both URLs first, then read the two Nobel tables. Each file has a
# header row. IDPW must already exist in the session.
link2 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Rdw/data/nobel_countries.txt"
)
link3 <- paste0(
  "http://", IDPW,
  "140.116.183.121/~sheu/dataM/Rdw/data/nobel_winners.txt"
)
dta4_1 <- read.table(file = link2, header = TRUE)
dta4_2 <- read.table(file = link3, header = TRUE)

# Inner join: keep only the rows whose key is present in both tables; every
# unmatched row is dropped.
inner_join(x = dta4_1, y = dta4_2)

## Joining, by = "Year"

##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male

# Semi join: keep the rows of dta4_1 that have a match in dta4_2, returning
# only the columns of dta4_1.
semi_join(x = dta4_1, y = dta4_2)

## Joining, by = "Year"

##   Country Year
## 1  France 2014
## 2      UK 1950
## 3      UK 2017
## 4      US 2016
## 5  Canada 2013
## 6   China 2012

# Left join: keep every row of dta4_1 and attach the matching columns from
# dta4_2; rows without a match get NA in those columns.
left_join(x = dta4_1, y = dta4_2)

## Joining, by = "Year"

##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>

# Anti join: return the rows of dta4_1 that have no match in dta4_2, keeping
# only the columns of dta4_1.
anti_join(x = dta4_1, y = dta4_2)

## Joining, by = "Year"

##   Country Year
## 1  Russia 2015
## 2  Sweden 2011

# Full join: match the two tables on their shared variable and keep all rows
# from both; cells with no data are filled with NA.
full_join(x = dta4_1, y = dta4_2)

## Joining, by = "Year"

##   Country Year              Name Gender
## 1  France 2014   Patrick Modiano   Male
## 2      UK 1950 Bertrand  Russell   Male
## 3      UK 2017    Kazuo Ishiguro   Male
## 4      US 2016        Bob  Dylan   Male
## 5  Canada 2013      Alice  Munro Female
## 6   China 2012            Mo Yan   Male
## 7  Russia 2015              <NA>   <NA>
## 8  Sweden 2011              <NA>   <NA>
## 9    <NA> 1938        Pearl Buck Female

0326in_class Exercise

Li-Ping Yen

2018.3.26

Question 1

讀取資料、查看前六筆資料、查看資料結構

使用reshape處理資料、查看前六筆資料

Question 2

讀資料、查看資料前六項、查看資料格式、將“year”,“sex”資料格式改成Factor

繪圖

Question 3

讀資料、查看前六筆資料、查看資料格式

使用reshape處理資料、查看前六筆資料

Question 4

讀取資料

The End