1 Functions

In this post I describe some commonly used functions I use and beleive it will be useful for other as well. I primarily work in the area of finance and economics and hence most of the posts will be specific to them. If any errors or suggestions please write to me. I hope this might be useful for some people working in R at an basic/intermediate level. * Some coding conventions that I follow (may not be strict but I try to follow them) 1. # indicates optional code 2. ## indicates comments

2 Loading standard libraries

There is disagreement about using rm(list=ls()) which will erase all the data in the existing global environment. Hence, I attach the command with an # prompting the users to use it only when necessary.

# rm(list=ls())
library(tidyverse); library(psych)
library(broom); library(stargazer)
conflicts()
##  [1] "%>%"           "%>%"           "%>%"           "%>%"          
##  [5] "add_row"       "as_data_frame" "as_tibble"     "data_frame"   
##  [9] "data_frame_"   "frame_data"    "glimpse"       "lst"          
## [13] "lst_"          "tbl_sum"       "tibble"        "tribble"      
## [17] "trunc_mat"     "type_sum"      "%+%"           "alpha"        
## [21] "enexpr"        "enexprs"       "enquo"         "enquos"       
## [25] "ensym"         "ensyms"        "expr"          "quo"          
## [29] "quo_name"      "quos"          "sym"           "syms"         
## [33] "vars"          "filter"        "lag"           "body<-"       
## [37] "intersect"     "kronecker"     "Position"      "setdiff"      
## [41] "setequal"      "union"

3 Dummy data generation

This is one way of generating dummy data in R. Whereever necessary I create data specific to the method. Note that rerun is a command from purrr package in R which is useful for simulating data.

## Data creation
x = setNames(data.frame(rerun(5, rnorm(100,0,1)),
              grp = sample(letters[1:5],100,replace = TRUE)),
              c(paste0("x",1:5), "grp"))
y = setNames(data.frame(rerun(3, rnorm(100,0,1)),
              grp = sample(letters[1:10],100,replace = TRUE)),
              c(paste0("x",3:1), "grp"))

3.1 Sample data generated using above code

stargazer(x[1:10,], summary=FALSE, digits=3, type="html")
x1 x2 x3 x4 x5 grp
1 -1.934 -0.621 0.462 -0.701 -1.097 d
2 0.309 -1.033 0.858 -0.263 -0.744 d
3 -0.686 0.867 1.202 0.241 -1.127 a
4 -1.660 0.642 0.171 -0.713 1.117 c
5 -1.231 -1.133 0.908 -0.792 2.060 b
6 0.883 -0.783 0.461 -0.300 -0.341 a
7 1.137 -0.190 1.721 -1.469 0.823 a
8 1.310 -0.455 1.202 -0.285 -0.942 d
9 0.036 -0.458 -1.134 -0.657 0.670 c
10 0.610 0.808 0.214 -1.119 -0.881 b
stargazer(y[1:10,], summary=FALSE, digits=3, type="html")
x3 x2 x1 grp
1 -0.535 0.095 0.956 j
2 -0.903 -0.812 0.458 j
3 0.432 0.147 -0.829 c
4 -0.432 -0.161 0.764 b
5 0.763 0.696 -0.655 f
6 -2.284 -1.849 -0.490 d
7 -0.441 -1.137 -2.423 h
8 0.966 -0.592 1.047 b
9 0.817 -0.803 1.080 i
10 -1.370 0.870 0.334 d

4 Winsorisation and Trimming

This code can be used for winsorising a data frame. The input can include non-numeric variables i.e character, factor variables and other types of data. It simply overlooks the non-numeric data and winsorises the numeric data.

The code has two inputs, a data frame with atleast one numeric vector and the specification of winsorisation level. The function uses winsor command from the psych package for winsoring data. And it excludes NAs in the winsor step. One can slighlty tweak the code to include original numeric variables i the output.

win.df = function(df,t,opt){
  df.chr = df[!c(sapply(df,is.numeric))]
  df = df[c(sapply(df, is.numeric))]
  if(opt=="w"){
  for(i in seq_along(names(df))){
  j=paste0(names(df)[i],".l")
  df[[i]] = ifelse(is.finite(df[[i]])==TRUE, df[[i]], NA)
  df[[j]] = psych::winsor(df[[i]], trim = t, na.rm = TRUE)
  rm(i,j)
  }
  df = df[grep("\\.l", names(df), value = TRUE)]
  df = cbind(df.chr,df)
  return(df)
  }
  else if (opt=="t"){
    for(i in seq_along(names(df))){
      j=paste0(names(df)[i],".l")
      df[[i]] = ifelse(is.finite(df[[i]])==TRUE, df[[i]], NA)
      lw = quantile(df[[i]],t, na.rm = TRUE)
      up = quantile(df[[i]],1-t, na.rm = TRUE)
      df[[j]] = ifelse(df[[i]] > lw & df[[i]] < up, df[[i]], NA)
      rm(i,j)
    }
    df = df[grep("\\.l", names(df), value = TRUE)]
    df = cbind(df.chr,df)
    return(df)
  }
  else{print("Choose between winsorisation (opt=w) and trimming (opt=t)")}
}

trim.vl = function(df,lw,up) {
  df.chr = df[!c(sapply(df,is.numeric))]
  df = df[c(sapply(df, is.numeric))]
  for(i in seq_along(names(df))){
    j=paste0(names(df)[i],".l")
    df[[i]] = ifelse(is.finite(df[[i]])==TRUE, df[[i]], NA)
    df[[j]] = ifelse(df[[i]] > lw & df[[i]] < up, df[[i]], NA)
    rm(i,j)
  }
  df = df[grep("\\.l", names(df), value = TRUE)]
  df = cbind(df.chr,df)
  return(df)
}

4.1 Test runs and checks.

tmp = data.frame(x=1:100)
matrix(tmp$x,ncol=10,byrow = TRUE)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99   100
matrix(as.matrix(win.df(tmp,0.0,"w")),ncol=10,byrow = TRUE)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99   100
matrix(as.matrix(win.df(tmp,0.01,"w")),ncol=10,byrow = TRUE)
##        [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.99    2    3    4    5    6    7    8    9 10.00
##  [2,] 11.00   12   13   14   15   16   17   18   19 20.00
##  [3,] 21.00   22   23   24   25   26   27   28   29 30.00
##  [4,] 31.00   32   33   34   35   36   37   38   39 40.00
##  [5,] 41.00   42   43   44   45   46   47   48   49 50.00
##  [6,] 51.00   52   53   54   55   56   57   58   59 60.00
##  [7,] 61.00   62   63   64   65   66   67   68   69 70.00
##  [8,] 71.00   72   73   74   75   76   77   78   79 80.00
##  [9,] 81.00   82   83   84   85   86   87   88   89 90.00
## [10,] 91.00   92   93   94   95   96   97   98   99 99.01
matrix(as.matrix(win.df(tmp,0.05,"w")),ncol=10,byrow = TRUE)
##        [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10]
##  [1,]  5.95  5.95  5.95  5.95  5.95  6.00  7.00  8.00  9.00 10.00
##  [2,] 11.00 12.00 13.00 14.00 15.00 16.00 17.00 18.00 19.00 20.00
##  [3,] 21.00 22.00 23.00 24.00 25.00 26.00 27.00 28.00 29.00 30.00
##  [4,] 31.00 32.00 33.00 34.00 35.00 36.00 37.00 38.00 39.00 40.00
##  [5,] 41.00 42.00 43.00 44.00 45.00 46.00 47.00 48.00 49.00 50.00
##  [6,] 51.00 52.00 53.00 54.00 55.00 56.00 57.00 58.00 59.00 60.00
##  [7,] 61.00 62.00 63.00 64.00 65.00 66.00 67.00 68.00 69.00 70.00
##  [8,] 71.00 72.00 73.00 74.00 75.00 76.00 77.00 78.00 79.00 80.00
##  [9,] 81.00 82.00 83.00 84.00 85.00 86.00 87.00 88.00 89.00 90.00
## [10,] 91.00 92.00 93.00 94.00 95.00 95.05 95.05 95.05 95.05 95.05
matrix(as.matrix(win.df(tmp,0.0,"t")),ncol=10,byrow = TRUE)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]   NA    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99    NA
matrix(as.matrix(win.df(tmp,0.01,"t")),ncol=10,byrow = TRUE)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]   NA    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99    NA
matrix(as.matrix(win.df(tmp,0.05,"t")),ncol=10,byrow = TRUE)
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]   NA   NA   NA   NA   NA    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   NA   NA   NA   NA    NA

5 Summary statistics

summ.df = function(df){
  df = df[c(sapply(df, is.numeric))]
  summ = psych::describe(df, na.rm = TRUE, skew=FALSE, 
                         quant = c(0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95,0.99))
  rnames = as.data.frame(row.names(summ))
  names(rnames) = c("var.name")
  summ = cbind(rnames,summ)
  rownames(summ) = NULL
  return(summ)
}

5.1 Test runs and checks

6 Inspect numeric variables

inspect = function(df,opt){
  df = df[c(sapply(df, is.numeric))]
  datalist = list()
  for(i in seq_along(names(df))){
    var = names(df)[i]
    psts = sum(ifelse(df[[i]]>0 & df[[i]]!=Inf, 1, 0), na.rm = TRUE)
    negs = sum(ifelse(df[[i]]<0 & df[[i]]!=-Inf, 1, 0), na.rm = TRUE)
    zeros = sum(ifelse(df[[i]]==0, 1, 0), na.rm = TRUE)
    pst.Inf = sum(ifelse(df[[i]]==Inf, 1, 0), na.rm = TRUE)
    neg.Inf = sum(ifelse(df[[i]]==-Inf, 1, 0), na.rm = TRUE)
    nans = sum(ifelse(is.nan(df[[i]])==TRUE, 1, 0), na.rm = TRUE)
    nas = sum(ifelse(is.na(df[[i]])==TRUE, 1, 0), na.rm = TRUE)
    only.nas = nas - nans
    non.finite = only.nas+nans+pst.Inf+neg.Inf
    finite = psts+ negs +zeros
    n = finite+non.finite
    datalist[[i]] = data.frame(var=var, n=n, psts=psts, negs=negs,
                               zeros=zeros,finite=finite,
                               non.finite=non.finite, only.nas=only.nas,
                               nans=nans, pst.Inf=pst.Inf,
                               neg.Inf=neg.Inf,
                               p.psts=psts/n, p.negs=negs/n,
                               p.zeros=zeros/n,p.finite=finite/n,
                               p.non.finite=non.finite/n, p.only.nas=only.nas/n,
                               p.nans=nans/n, p.pst.Inf=pst.Inf/n,
                               p.neg.Inf=neg.Inf/n)
    rm(var,n,psts,negs,zeros,finite,non.finite,
       only.nas,nans,pst.Inf,neg.Inf)
  }
  big_data = do.call(rbind, datalist)
  pr = grep("^p\\.",names(big_data),value=TRUE)
  non.pr = base::setdiff(names(big_data), pr)
  if (opt==1) {
    big_data = big_data[c("var", "n", pr )]
  } else if (opt==2) {
    big_data = big_data[non.pr]
  } else { 
    big_data = big_data}
  return(big_data)
}

6.1 Test runs and checks

7 Univariate t tests

diff.tests.df = function(df,t){
  df = df[c(sapply(df,is.numeric))]
  df = win.df(df,t)
  ttest.out = data.frame(var.name = double(), estimate=double(), estimate1=double(),
                         estimate2=double(), statistic=double(), p.value=double(),
                         parameter=double(), conf.low=double(), conf.high=double(),
                         method=character(), alternative=character(), statistic1=double(),
                         p.value1=double(), method1=character(), alternative1=character(),
                         stringsAsFactors=FALSE)
  for(i in seq_along(names(df))){
    ttest = tidy(t.test(df[[i]]))
    wilcox=  tidy(wilcox.test(df[[i]]))
    out = data.frame(var.name = names(df)[i], ttest, wilcox)
    ttest.out = rbind(ttest.out, out)
    rm(ttest, wilcox, out)
  }
  df = data.frame(summ.df(df), ttest.out)
  return(df)
}

8 Bivariate t tests

diff.tests.dfs = function(df1,df2,t){
df1 = df1[c(sapply(df1,is.numeric))]
df2 = df2[c(sapply(df2,is.numeric))]
common.vars = base::intersect(names(df1), names(df2))
df1 = df1[common.vars]
df2 = df2[common.vars]
df1 = win.df(df1,t)
df2 = win.df(df2,t)
ttest.out = data.frame(var.name = double(), estimate=double(), estimate1=double(),
                       estimate2=double(), statistic=double(), p.value=double(),
                       parameter=double(), conf.low=double(), conf.high=double(),
                       method=character(), alternative=character(), statistic1=double(),
                       p.value1=double(), method1=character(), alternative1=character(),
                      stringsAsFactors=FALSE)
  for(i in seq_along(common.vars)){
    ttest = tidy(t.test(df1[[i]], df2[[i]]))
    wilcox=  tidy(wilcox.test(df1[[i]],df2[[i]]))
    out = data.frame(var.name = common.vars[i], ttest, wilcox)
    ttest.out = rbind(ttest.out, out)
    rm(ttest, wilcox, out)
  }
 df = data.frame(summ.df(df1), summ.df(df2), ttest.out)
 return(df)
}

9 Bivariate t tests by groups

group.diff.tests.dfs = function(df1,df2,grp,t){
  df1[["grp"]] = df1[[grp]]
  df2[["grp"]] = df2[[grp]]
  grp1 = names(table(df1[[grp]]))
  grp2 = names(table(df2[[grp]]))
  common.grp = base::intersect(grp1, grp2)
  rm(grp1,grp2)
  grp.ttest.out=data.frame(var.name = double(), grp.name = double(),
                           estimate=double(), estimate1=double(),
                           estimate2=double(), statistic=double(), 
                           p.value=double(), parameter=double(), 
                           conf.low=double(), conf.high=double(),
                           method=character(), alternative=character(),
                           statistic1=double(), p.value1=double(),
                           method1=character(), alternative1=character(),
                           stringsAsFactors=FALSE)
  for(i in seq_along(common.grp)){
  x = subset(df1, grp == common.grp[i])
  x = x[c(sapply(x,is.numeric))]
  y = subset(df2, grp == common.grp[i])
  y = y[c(sapply(y,is.numeric))]
  out = diff.tests.dfs(x,y,t)
  out = data.frame(grp.name = rep(common.grp[i], each=dim(out)[1]), out)
  grp.ttest.out = rbind(grp.ttest.out,out)
  rm(x,y,out)
  }
  return(grp.ttest.out)
}

10 Adding * to variables

sign.clean = function(df,vr,pval){
  out = data.frame(df, vr = df[[vr]], pval = df[[pval]]) 
  out = out %>% 
    mutate(sign = case_when(pval<=0.01 ~ "*",
                            pval>0.01 & pval<=0.05 ~ "#",
                            pval>0.05 & pval<=0.10 ~ "~",
                            pval>0.10 ~ " "),
           vr = format(round(vr,3), nsmall = 3),
           vr = paste(as.character(vr), sign, sep = "")) %>% 
    select(-c(pval, sign)) %>% 
    as.data.frame()
  return(out)
}

11 Formatting data

format.clean = function(df,n){
  df1 = df[!c(sapply(df,is.numeric))]
  df2 = df[c(sapply(df,is.numeric))]
  for(i in seq_along(names(df2))){
    df2[[i]] = format(round(df2[[i]],digits=n), nsmall = n)
  }
  df = cbind(df1, df2)
  return(df)
  rm(df1,df2,i,n)
}



# ## Test runs
# X = win.df(x,.01); Y = win.df(x,.01)
# tmp = summ.df(x)
# tmp = format.clean(x,4)
# tmp = sign.clean(y,vr="x1",pval="x2")
# tmp1 = diff.tests.df(x,y,0)
# tmp2 = group.diff.tests.dfs(x,y,grp ="grp", 0)
# tmp3 = as.data.frame(bind_rows(tmp1,tmp2))
# tmp3 = tmp3 %>%
#        arrange(var.name, grp.name) %>%
#        mutate(grp.name = grp.name)