In this post I describe some commonly used functions I use and beleive it will be useful for other as well. I primarily work in the area of finance and economics and hence most of the posts will be specific to them. If any errors or suggestions please write to me. I hope this might be useful for some people working in R at an basic/intermediate level. * Some coding conventions that I follow (may not be strict but I try to follow them) 1. # indicates optional code 2. ## indicates comments
There is disagreement about using rm(list=ls()) which will erase all the data in the existing global environment. Hence, I attach the command with an # prompting the users to use it only when necessary.
# rm(list=ls())
library(tidyverse); library(psych)
library(broom); library(stargazer)
conflicts()
## [1] "%>%" "%>%" "%>%" "%>%"
## [5] "add_row" "as_data_frame" "as_tibble" "data_frame"
## [9] "data_frame_" "frame_data" "glimpse" "lst"
## [13] "lst_" "tbl_sum" "tibble" "tribble"
## [17] "trunc_mat" "type_sum" "%+%" "alpha"
## [21] "enexpr" "enexprs" "enquo" "enquos"
## [25] "ensym" "ensyms" "expr" "quo"
## [29] "quo_name" "quos" "sym" "syms"
## [33] "vars" "filter" "lag" "body<-"
## [37] "intersect" "kronecker" "Position" "setdiff"
## [41] "setequal" "union"
This is one way of generating dummy data in R. Whereever necessary I create data specific to the method. Note that rerun is a command from purrr package in R which is useful for simulating data.
## Data creation
x = setNames(data.frame(rerun(5, rnorm(100,0,1)),
grp = sample(letters[1:5],100,replace = TRUE)),
c(paste0("x",1:5), "grp"))
y = setNames(data.frame(rerun(3, rnorm(100,0,1)),
grp = sample(letters[1:10],100,replace = TRUE)),
c(paste0("x",3:1), "grp"))
stargazer(x[1:10,], summary=FALSE, digits=3, type="html")
| x1 | x2 | x3 | x4 | x5 | grp | |
| 1 | -1.934 | -0.621 | 0.462 | -0.701 | -1.097 | d |
| 2 | 0.309 | -1.033 | 0.858 | -0.263 | -0.744 | d |
| 3 | -0.686 | 0.867 | 1.202 | 0.241 | -1.127 | a |
| 4 | -1.660 | 0.642 | 0.171 | -0.713 | 1.117 | c |
| 5 | -1.231 | -1.133 | 0.908 | -0.792 | 2.060 | b |
| 6 | 0.883 | -0.783 | 0.461 | -0.300 | -0.341 | a |
| 7 | 1.137 | -0.190 | 1.721 | -1.469 | 0.823 | a |
| 8 | 1.310 | -0.455 | 1.202 | -0.285 | -0.942 | d |
| 9 | 0.036 | -0.458 | -1.134 | -0.657 | 0.670 | c |
| 10 | 0.610 | 0.808 | 0.214 | -1.119 | -0.881 | b |
stargazer(y[1:10,], summary=FALSE, digits=3, type="html")
| x3 | x2 | x1 | grp | |
| 1 | -0.535 | 0.095 | 0.956 | j |
| 2 | -0.903 | -0.812 | 0.458 | j |
| 3 | 0.432 | 0.147 | -0.829 | c |
| 4 | -0.432 | -0.161 | 0.764 | b |
| 5 | 0.763 | 0.696 | -0.655 | f |
| 6 | -2.284 | -1.849 | -0.490 | d |
| 7 | -0.441 | -1.137 | -2.423 | h |
| 8 | 0.966 | -0.592 | 1.047 | b |
| 9 | 0.817 | -0.803 | 1.080 | i |
| 10 | -1.370 | 0.870 | 0.334 | d |
This code can be used for winsorising a data frame. The input can include non-numeric variables i.e character, factor variables and other types of data. It simply overlooks the non-numeric data and winsorises the numeric data.
The code has two inputs, a data frame with atleast one numeric vector and the specification of winsorisation level. The function uses winsor command from the psych package for winsoring data. And it excludes NAs in the winsor step. One can slighlty tweak the code to include original numeric variables i the output.
win.df = function(df,t,opt){
df.chr = df[!c(sapply(df,is.numeric))]
df = df[c(sapply(df, is.numeric))]
if(opt=="w"){
for(i in seq_along(names(df))){
j=paste0(names(df)[i],".l")
df[[i]] = ifelse(is.finite(df[[i]])==TRUE, df[[i]], NA)
df[[j]] = psych::winsor(df[[i]], trim = t, na.rm = TRUE)
rm(i,j)
}
df = df[grep("\\.l", names(df), value = TRUE)]
df = cbind(df.chr,df)
return(df)
}
else if (opt=="t"){
for(i in seq_along(names(df))){
j=paste0(names(df)[i],".l")
df[[i]] = ifelse(is.finite(df[[i]])==TRUE, df[[i]], NA)
lw = quantile(df[[i]],t, na.rm = TRUE)
up = quantile(df[[i]],1-t, na.rm = TRUE)
df[[j]] = ifelse(df[[i]] > lw & df[[i]] < up, df[[i]], NA)
rm(i,j)
}
df = df[grep("\\.l", names(df), value = TRUE)]
df = cbind(df.chr,df)
return(df)
}
else{print("Choose between winsorisation (opt=w) and trimming (opt=t)")}
}
trim.vl = function(df,lw,up) {
df.chr = df[!c(sapply(df,is.numeric))]
df = df[c(sapply(df, is.numeric))]
for(i in seq_along(names(df))){
j=paste0(names(df)[i],".l")
df[[i]] = ifelse(is.finite(df[[i]])==TRUE, df[[i]], NA)
df[[j]] = ifelse(df[[i]] > lw & df[[i]] < up, df[[i]], NA)
rm(i,j)
}
df = df[grep("\\.l", names(df), value = TRUE)]
df = cbind(df.chr,df)
return(df)
}
tmp = data.frame(x=1:100)
matrix(tmp$x,ncol=10,byrow = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 100
matrix(as.matrix(win.df(tmp,0.0,"w")),ncol=10,byrow = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 100
matrix(as.matrix(win.df(tmp,0.01,"w")),ncol=10,byrow = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1.99 2 3 4 5 6 7 8 9 10.00
## [2,] 11.00 12 13 14 15 16 17 18 19 20.00
## [3,] 21.00 22 23 24 25 26 27 28 29 30.00
## [4,] 31.00 32 33 34 35 36 37 38 39 40.00
## [5,] 41.00 42 43 44 45 46 47 48 49 50.00
## [6,] 51.00 52 53 54 55 56 57 58 59 60.00
## [7,] 61.00 62 63 64 65 66 67 68 69 70.00
## [8,] 71.00 72 73 74 75 76 77 78 79 80.00
## [9,] 81.00 82 83 84 85 86 87 88 89 90.00
## [10,] 91.00 92 93 94 95 96 97 98 99 99.01
matrix(as.matrix(win.df(tmp,0.05,"w")),ncol=10,byrow = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 5.95 5.95 5.95 5.95 5.95 6.00 7.00 8.00 9.00 10.00
## [2,] 11.00 12.00 13.00 14.00 15.00 16.00 17.00 18.00 19.00 20.00
## [3,] 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00
## [4,] 31.00 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00
## [5,] 41.00 42.00 43.00 44.00 45.00 46.00 47.00 48.00 49.00 50.00
## [6,] 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00
## [7,] 61.00 62.00 63.00 64.00 65.00 66.00 67.00 68.00 69.00 70.00
## [8,] 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00 80.00
## [9,] 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00
## [10,] 91.00 92.00 93.00 94.00 95.00 95.05 95.05 95.05 95.05 95.05
matrix(as.matrix(win.df(tmp,0.0,"t")),ncol=10,byrow = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] NA 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 NA
matrix(as.matrix(win.df(tmp,0.01,"t")),ncol=10,byrow = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] NA 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 NA
matrix(as.matrix(win.df(tmp,0.05,"t")),ncol=10,byrow = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] NA NA NA NA NA 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 NA NA NA NA NA
summ.df = function(df){
df = df[c(sapply(df, is.numeric))]
summ = psych::describe(df, na.rm = TRUE, skew=FALSE,
quant = c(0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95,0.99))
rnames = as.data.frame(row.names(summ))
names(rnames) = c("var.name")
summ = cbind(rnames,summ)
rownames(summ) = NULL
return(summ)
}
inspect = function(df,opt){
df = df[c(sapply(df, is.numeric))]
datalist = list()
for(i in seq_along(names(df))){
var = names(df)[i]
psts = sum(ifelse(df[[i]]>0 & df[[i]]!=Inf, 1, 0), na.rm = TRUE)
negs = sum(ifelse(df[[i]]<0 & df[[i]]!=-Inf, 1, 0), na.rm = TRUE)
zeros = sum(ifelse(df[[i]]==0, 1, 0), na.rm = TRUE)
pst.Inf = sum(ifelse(df[[i]]==Inf, 1, 0), na.rm = TRUE)
neg.Inf = sum(ifelse(df[[i]]==-Inf, 1, 0), na.rm = TRUE)
nans = sum(ifelse(is.nan(df[[i]])==TRUE, 1, 0), na.rm = TRUE)
nas = sum(ifelse(is.na(df[[i]])==TRUE, 1, 0), na.rm = TRUE)
only.nas = nas - nans
non.finite = only.nas+nans+pst.Inf+neg.Inf
finite = psts+ negs +zeros
n = finite+non.finite
datalist[[i]] = data.frame(var=var, n=n, psts=psts, negs=negs,
zeros=zeros,finite=finite,
non.finite=non.finite, only.nas=only.nas,
nans=nans, pst.Inf=pst.Inf,
neg.Inf=neg.Inf,
p.psts=psts/n, p.negs=negs/n,
p.zeros=zeros/n,p.finite=finite/n,
p.non.finite=non.finite/n, p.only.nas=only.nas/n,
p.nans=nans/n, p.pst.Inf=pst.Inf/n,
p.neg.Inf=neg.Inf/n)
rm(var,n,psts,negs,zeros,finite,non.finite,
only.nas,nans,pst.Inf,neg.Inf)
}
big_data = do.call(rbind, datalist)
pr = grep("^p\\.",names(big_data),value=TRUE)
non.pr = base::setdiff(names(big_data), pr)
if (opt==1) {
big_data = big_data[c("var", "n", pr )]
} else if (opt==2) {
big_data = big_data[non.pr]
} else {
big_data = big_data}
return(big_data)
}
diff.tests.df = function(df,t){
df = df[c(sapply(df,is.numeric))]
df = win.df(df,t)
ttest.out = data.frame(var.name = double(), estimate=double(), estimate1=double(),
estimate2=double(), statistic=double(), p.value=double(),
parameter=double(), conf.low=double(), conf.high=double(),
method=character(), alternative=character(), statistic1=double(),
p.value1=double(), method1=character(), alternative1=character(),
stringsAsFactors=FALSE)
for(i in seq_along(names(df))){
ttest = tidy(t.test(df[[i]]))
wilcox= tidy(wilcox.test(df[[i]]))
out = data.frame(var.name = names(df)[i], ttest, wilcox)
ttest.out = rbind(ttest.out, out)
rm(ttest, wilcox, out)
}
df = data.frame(summ.df(df), ttest.out)
return(df)
}
diff.tests.dfs = function(df1,df2,t){
df1 = df1[c(sapply(df1,is.numeric))]
df2 = df2[c(sapply(df2,is.numeric))]
common.vars = base::intersect(names(df1), names(df2))
df1 = df1[common.vars]
df2 = df2[common.vars]
df1 = win.df(df1,t)
df2 = win.df(df2,t)
ttest.out = data.frame(var.name = double(), estimate=double(), estimate1=double(),
estimate2=double(), statistic=double(), p.value=double(),
parameter=double(), conf.low=double(), conf.high=double(),
method=character(), alternative=character(), statistic1=double(),
p.value1=double(), method1=character(), alternative1=character(),
stringsAsFactors=FALSE)
for(i in seq_along(common.vars)){
ttest = tidy(t.test(df1[[i]], df2[[i]]))
wilcox= tidy(wilcox.test(df1[[i]],df2[[i]]))
out = data.frame(var.name = common.vars[i], ttest, wilcox)
ttest.out = rbind(ttest.out, out)
rm(ttest, wilcox, out)
}
df = data.frame(summ.df(df1), summ.df(df2), ttest.out)
return(df)
}
group.diff.tests.dfs = function(df1,df2,grp,t){
df1[["grp"]] = df1[[grp]]
df2[["grp"]] = df2[[grp]]
grp1 = names(table(df1[[grp]]))
grp2 = names(table(df2[[grp]]))
common.grp = base::intersect(grp1, grp2)
rm(grp1,grp2)
grp.ttest.out=data.frame(var.name = double(), grp.name = double(),
estimate=double(), estimate1=double(),
estimate2=double(), statistic=double(),
p.value=double(), parameter=double(),
conf.low=double(), conf.high=double(),
method=character(), alternative=character(),
statistic1=double(), p.value1=double(),
method1=character(), alternative1=character(),
stringsAsFactors=FALSE)
for(i in seq_along(common.grp)){
x = subset(df1, grp == common.grp[i])
x = x[c(sapply(x,is.numeric))]
y = subset(df2, grp == common.grp[i])
y = y[c(sapply(y,is.numeric))]
out = diff.tests.dfs(x,y,t)
out = data.frame(grp.name = rep(common.grp[i], each=dim(out)[1]), out)
grp.ttest.out = rbind(grp.ttest.out,out)
rm(x,y,out)
}
return(grp.ttest.out)
}
* to variablessign.clean = function(df,vr,pval){
out = data.frame(df, vr = df[[vr]], pval = df[[pval]])
out = out %>%
mutate(sign = case_when(pval<=0.01 ~ "*",
pval>0.01 & pval<=0.05 ~ "#",
pval>0.05 & pval<=0.10 ~ "~",
pval>0.10 ~ " "),
vr = format(round(vr,3), nsmall = 3),
vr = paste(as.character(vr), sign, sep = "")) %>%
select(-c(pval, sign)) %>%
as.data.frame()
return(out)
}
format.clean = function(df,n){
df1 = df[!c(sapply(df,is.numeric))]
df2 = df[c(sapply(df,is.numeric))]
for(i in seq_along(names(df2))){
df2[[i]] = format(round(df2[[i]],digits=n), nsmall = n)
}
df = cbind(df1, df2)
return(df)
rm(df1,df2,i,n)
}
# ## Test runs
# X = win.df(x,.01); Y = win.df(x,.01)
# tmp = summ.df(x)
# tmp = format.clean(x,4)
# tmp = sign.clean(y,vr="x1",pval="x2")
# tmp1 = diff.tests.df(x,y,0)
# tmp2 = group.diff.tests.dfs(x,y,grp ="grp", 0)
# tmp3 = as.data.frame(bind_rows(tmp1,tmp2))
# tmp3 = tmp3 %>%
# arrange(var.name, grp.name) %>%
# mutate(grp.name = grp.name)