library(dplyr)
1. Data
df=data.frame(Group=c("A","B","A","A","B","A","C","C","B","C"),
A=c(3,0,0,NA,0,5,6,NA,0,8),
B=c(9,4,4,2,3,0,NA,1,0,10),
C=c(0,3,0,6,1,7,8,NA,0,5),
D=c(6,0,7,1,0,4,NA,2,0,1),
E=c(1,5,NA,7,1,3,NA,5,0,9))
df
## Group A B C D E
## 1 A 3 9 0 6 1
## 2 B 0 4 3 0 5
## 3 A 0 4 0 7 NA
## 4 A NA 2 6 1 7
## 5 B 0 3 1 0 1
## 6 A 5 0 7 4 3
## 7 C 6 NA 8 NA NA
## 8 C NA 1 NA 2 5
## 9 B 0 0 0 0 0
## 10 C 8 10 5 1 9
2. rowSums
df1=df
df1 %>% select(A:C) %>% rowSums(na.rm=TRUE)->df1$SUM_A_to_C
df1 %>% select(A,C,E) %>% rowSums(na.rm=TRUE)->df1$SUM_A_C_E
df1
## Group A B C D E SUM_A_to_C SUM_A_C_E
## 1 A 3 9 0 6 1 12 4
## 2 B 0 4 3 0 5 7 8
## 3 A 0 4 0 7 NA 4 0
## 4 A NA 2 6 1 7 8 13
## 5 B 0 3 1 0 1 4 2
## 6 A 5 0 7 4 3 12 15
## 7 C 6 NA 8 NA NA 14 14
## 8 C NA 1 NA 2 5 1 5
## 9 B 0 0 0 0 0 0 0
## 10 C 8 10 5 1 9 23 22
3. rowMeans
df1 %>% select(A:C) %>% rowMeans(na.rm=TRUE)->df1$Mean_A_to_C
df1 %>% select(A,C,E) %>% rowMeans(na.rm=TRUE)->df1$Mean_A_C_E
df1
## Group A B C D E SUM_A_to_C SUM_A_C_E Mean_A_to_C Mean_A_C_E
## 1 A 3 9 0 6 1 12 4 4.000000 1.3333333
## 2 B 0 4 3 0 5 7 8 2.333333 2.6666667
## 3 A 0 4 0 7 NA 4 0 1.333333 0.0000000
## 4 A NA 2 6 1 7 8 13 4.000000 6.5000000
## 5 B 0 3 1 0 1 4 2 1.333333 0.6666667
## 6 A 5 0 7 4 3 12 15 4.000000 5.0000000
## 7 C 6 NA 8 NA NA 14 14 7.000000 7.0000000
## 8 C NA 1 NA 2 5 1 5 1.000000 5.0000000
## 9 B 0 0 0 0 0 0 0 0.000000 0.0000000
## 10 C 8 10 5 1 9 23 22 7.666667 7.3333333
4. colSums
df2=df[,2:6]
df2
## A B C D E
## 1 3 9 0 6 1
## 2 0 4 3 0 5
## 3 0 4 0 7 NA
## 4 NA 2 6 1 7
## 5 0 3 1 0 1
## 6 5 0 7 4 3
## 7 6 NA 8 NA NA
## 8 NA 1 NA 2 5
## 9 0 0 0 0 0
## 10 8 10 5 1 9
df2 %>% colSums(na.rm=TRUE)
## A B C D E
## 22 33 30 21 31
5. colMeans
df2=df[,2:6]
print(df2 %>% colMeans(na.rm=TRUE),digits=3)
## A B C D E
## 2.75 3.67 3.33 2.33 3.88
6. summarise
6.1 For variable A
df %>% summarise_each(funs(min(.,na.rm=TRUE),mean(.,na.rm=TRUE),max(.,na.rm=TRUE),sd(.,na.rm=TRUE)),A)
## min mean max sd
## 1 0 2.75 8 3.24037
6.2 For variable A by Group
df %>% group_by(Group) %>%summarise_each(funs(min(.,na.rm=TRUE),mean(.,na.rm=TRUE),max(.,na.rm=TRUE),sd(.,na.rm=TRUE)),A)
## # A tibble: 3 × 5
## Group min mean max sd
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 A 0 2.666667 5 2.516611
## 2 B 0 0.000000 0 0.000000
## 3 C 6 7.000000 8 1.414214
7. Replace
7.1 Replace NA in variable A by 9999
df1=df
df1=df1 %>% mutate(A=replace(A,is.na(A),9999))
df1
## Group A B C D E
## 1 A 3 9 0 6 1
## 2 B 0 4 3 0 5
## 3 A 0 4 0 7 NA
## 4 A 9999 2 6 1 7
## 5 B 0 3 1 0 1
## 6 A 5 0 7 4 3
## 7 C 6 NA 8 NA NA
## 8 C 9999 1 NA 2 5
## 9 B 0 0 0 0 0
## 10 C 8 10 5 1 9
7.2 Replace NA in variable A by mean A
df1=df
df1=df1 %>% mutate(A=replace(A,is.na(A),mean(A,na.rm=T)))
df1
## Group A B C D E
## 1 A 3.00 9 0 6 1
## 2 B 0.00 4 3 0 5
## 3 A 0.00 4 0 7 NA
## 4 A 2.75 2 6 1 7
## 5 B 0.00 3 1 0 1
## 6 A 5.00 0 7 4 3
## 7 C 6.00 NA 8 NA NA
## 8 C 2.75 1 NA 2 5
## 9 B 0.00 0 0 0 0
## 10 C 8.00 10 5 1 9
8. Recode
8.1 Recode value zero in variable A by a number
df1=df
df1=df1 %>% mutate(A=recode(A,'0'=6666))
df1
## Group A B C D E
## 1 A 3 9 0 6 1
## 2 B 6666 4 3 0 5
## 3 A 6666 4 0 7 NA
## 4 A NA 2 6 1 7
## 5 B 6666 3 1 0 1
## 6 A 5 0 7 4 3
## 7 C 6 NA 8 NA NA
## 8 C NA 1 NA 2 5
## 9 B 6666 0 0 0 0
## 10 C 8 10 5 1 9
8.2 Recode number into factor
df1=df
df1=df1 %>% mutate(A=recode(A,'0'="Zero",'3'="Three",'5'="Five",'6'="Six",'8'="Eight"))
df1
## Group A B C D E
## 1 A Three 9 0 6 1
## 2 B Zero 4 3 0 5
## 3 A Zero 4 0 7 NA
## 4 A <NA> 2 6 1 7
## 5 B Zero 3 1 0 1
## 6 A Five 0 7 4 3
## 7 C Six NA 8 NA NA
## 8 C <NA> 1 NA 2 5
## 9 B Zero 0 0 0 0
## 10 C Eight 10 5 1 9
8.2 Recode factor into number
df1=df
df1=df1%>%mutate(Group=recode(Group,"A"='1',"B"='2',"C"='3'))%>%mutate(Group=as.numeric(as.character(Group)))
str(df1)
## 'data.frame': 10 obs. of 6 variables:
## $ Group: num 1 2 1 1 2 1 3 3 2 3
## $ A : num 3 0 0 NA 0 5 6 NA 0 8
## $ B : num 9 4 4 2 3 0 NA 1 0 10
## $ C : num 0 3 0 6 1 7 8 NA 0 5
## $ D : num 6 0 7 1 0 4 NA 2 0 1
## $ E : num 1 5 NA 7 1 3 NA 5 0 9
df1
## Group A B C D E
## 1 1 3 9 0 6 1
## 2 2 0 4 3 0 5
## 3 1 0 4 0 7 NA
## 4 1 NA 2 6 1 7
## 5 2 0 3 1 0 1
## 6 1 5 0 7 4 3
## 7 3 6 NA 8 NA NA
## 8 3 NA 1 NA 2 5
## 9 2 0 0 0 0 0
## 10 3 8 10 5 1 9
9. Rename
df_rename=df1 %>% rename(Bien_A=A, Bien_C=C)
# Note: plyr / multcomp package should be loaded before dplyr (tidyverse). Otherwise it can cause a conflict.
df_rename
## Group Bien_A B Bien_C D E
## 1 1 3 9 0 6 1
## 2 2 0 4 3 0 5
## 3 1 0 4 0 7 NA
## 4 1 NA 2 6 1 7
## 5 2 0 3 1 0 1
## 6 1 5 0 7 4 3
## 7 3 6 NA 8 NA NA
## 8 3 NA 1 NA 2 5
## 9 2 0 0 0 0 0
## 10 3 8 10 5 1 9
10. Count function
df1 %>% apply(.,2,function(x)sum(!is.na(x))) # number of non-NA by column
## Group A B C D E
## 10 8 9 9 9 8
df1 %>% apply(.,2,function(x)sum(is.na(x))) # number of NA by column
## Group A B C D E
## 0 2 1 1 1 2
df1 %>% apply(.,1,function(x)sum(!is.na(x))) # number of non-NA by row
## [1] 6 6 5 5 6 6 3 4 6 6
df1 %>% apply(.,1,function(x)sum(is.na(x))) # number of NA by row
## [1] 0 0 1 1 0 0 3 2 0 0
df1 %>% count(A) # in plyr package - Similar with table(A) function
## # A tibble: 6 × 2
## A n
## <dbl> <int>
## 1 0 4
## 2 3 1
## 3 5 1
## 4 6 1
## 5 8 1
## 6 NA 2
df1 %>% count(A,B) # in plyr package - Similar with table(A,B) function
## Source: local data frame [9 x 3]
## Groups: A [?]
##
## A B n
## <dbl> <dbl> <int>
## 1 0 0 1
## 2 0 3 1
## 3 0 4 2
## 4 3 9 1
## 5 5 0 1
## 6 6 NA 1
## 7 8 10 1
## 8 NA 1 1
## 9 NA 2 1
11. Filter data with NA / non-NA value only
11.1. Remove NA
df2=df1 %>% na.omit() ;df2 # remove all rows with NA
## Group A B C D E
## 1 1 3 9 0 6 1
## 2 2 0 4 3 0 5
## 5 2 0 3 1 0 1
## 6 1 5 0 7 4 3
## 9 2 0 0 0 0 0
## 10 3 8 10 5 1 9
df2=df1 %>% filter(!is.na(A)) ;df2 # remove rows with NA of variable A only
## Group A B C D E
## 1 1 3 9 0 6 1
## 2 2 0 4 3 0 5
## 3 1 0 4 0 7 NA
## 4 2 0 3 1 0 1
## 5 1 5 0 7 4 3
## 6 3 6 NA 8 NA NA
## 7 2 0 0 0 0 0
## 8 3 8 10 5 1 9
11.2. Select NA
df2=df1 %>% filter(is.na(A)) ;df2 # Select rows with NA of variable A only
## Group A B C D E
## 1 1 NA 2 6 1 7
## 2 3 NA 1 NA 2 5
12. as.numeric function
x=c("1.39", "1.68", "1.6", "1.65", "1.69", "1.97")
ID=1:6; dat=data.frame(ID,x)
str(dat);dat
## 'data.frame': 6 obs. of 2 variables:
## $ ID: int 1 2 3 4 5 6
## $ x : Factor w/ 6 levels "1.39","1.6","1.65",..: 1 4 2 3 5 6
## ID x
## 1 1 1.39
## 2 2 1.68
## 3 3 1.6
## 4 4 1.65
## 5 5 1.69
## 6 6 1.97
dat_num1=dat %>% mutate(x=as.numeric(x));dat_num1 # numeric as factor value (Wrong way)
## ID x
## 1 1 1
## 2 2 4
## 3 3 2
## 4 4 3
## 5 5 5
## 6 6 6
dat_num2=dat %>% mutate(x=as.numeric(as.character(x)));dat_num2 # numeric as character
## ID x
## 1 1 1.39
## 2 2 1.68
## 3 3 1.60
## 4 4 1.65
## 5 5 1.69
## 6 6 1.97