Set Up

library(reticulate) # to use python in RStudio
library(tidyverse) # data wrangling and plotting with R

R code chunks are shown in light pink, while Python code chunks are shown in light blue.

Why Modules

In both R and Python, thousands of functions have been created and are available for everyone to use. If we imported all of them every time we used either language — I don’t know about your computer, but mine would immediately let me know how unhappy she is :P

Modules, or libraries/packages, are collections of functions that perform a set of related tasks. For example, pandas in Python is widely used for handling data frames, while ggplot2 in R is popular for data visualization. When working on a project in either R or Python, instead of loading all functions at once, we only need to load the modules that are relevant to that specific project.

Import Libraries/Packages

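Let’s import the numpy library in Python. In the version used here, its top-level namespace exposes 616 names — mostly functions, but also classes, constants, and submodules.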

import numpy
dir(numpy) # check out the functions in the package
## ['ALLOW_THREADS', 'AxisError', 'BUFSIZE', 'CLIP', 'ComplexWarning', 'DataSource', 'ERR_CALL', 'ERR_DEFAULT', 'ERR_IGNORE', 'ERR_LOG', 'ERR_PRINT', 'ERR_RAISE', 'ERR_WARN', 'FLOATING_POINT_SUPPORT', 'FPE_DIVIDEBYZERO', 'FPE_INVALID', 'FPE_OVERFLOW', 'FPE_UNDERFLOW', 'False_', 'Inf', 'Infinity', 'MAXDIMS', 'MAY_SHARE_BOUNDS', 'MAY_SHARE_EXACT', 'MachAr', 'ModuleDeprecationWarning', 'NAN', 'NINF', 'NZERO', 'NaN', 'PINF', 'PZERO', 'RAISE', 'RankWarning', 'SHIFT_DIVIDEBYZERO', 'SHIFT_INVALID', 'SHIFT_OVERFLOW', 'SHIFT_UNDERFLOW', 'ScalarType', 'Tester', 'TooHardError', 'True_', 'UFUNC_BUFSIZE_DEFAULT', 'UFUNC_PYVALS_NAME', 'VisibleDeprecationWarning', 'WRAP', '_NoValue', '_UFUNC_API', '__NUMPY_SETUP__', '__all__', '__builtins__', '__cached__', '__config__', '__doc__', '__file__', '__git_revision__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_add_newdoc_ufunc', '_distributor_init', '_globals', '_mat', '_pytesttester', 'abs', 'absolute', 'add', 'add_docstring', 'add_newdoc', 'add_newdoc_ufunc', 'alen', 'all', 'allclose', 'alltrue', 'amax', 'amin', 'angle', 'any', 'append', 'apply_along_axis', 'apply_over_axes', 'arange', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctan2', 'arctanh', 'argmax', 'argmin', 'argpartition', 'argsort', 'argwhere', 'around', 'array', 'array2string', 'array_equal', 'array_equiv', 'array_repr', 'array_split', 'array_str', 'asanyarray', 'asarray', 'asarray_chkfinite', 'ascontiguousarray', 'asfarray', 'asfortranarray', 'asmatrix', 'asscalar', 'atleast_1d', 'atleast_2d', 'atleast_3d', 'average', 'bartlett', 'base_repr', 'binary_repr', 'bincount', 'bitwise_and', 'bitwise_not', 'bitwise_or', 'bitwise_xor', 'blackman', 'block', 'bmat', 'bool', 'bool8', 'bool_', 'broadcast', 'broadcast_arrays', 'broadcast_to', 'busday_count', 'busday_offset', 'busdaycalendar', 'byte', 'byte_bounds', 'bytes0', 'bytes_', 'c_', 'can_cast', 'cast', 'cbrt', 'cdouble', 'ceil', 'cfloat', 'char', 'character', 'chararray', 
'choose', 'clip', 'clongdouble', 'clongfloat', 'column_stack', 'common_type', 'compare_chararrays', 'compat', 'complex', 'complex128', 'complex64', 'complex_', 'complexfloating', 'compress', 'concatenate', 'conj', 'conjugate', 'convolve', 'copy', 'copysign', 'copyto', 'core', 'corrcoef', 'correlate', 'cos', 'cosh', 'count_nonzero', 'cov', 'cross', 'csingle', 'ctypeslib', 'cumprod', 'cumproduct', 'cumsum', 'datetime64', 'datetime_as_string', 'datetime_data', 'deg2rad', 'degrees', 'delete', 'deprecate', 'deprecate_with_doc', 'diag', 'diag_indices', 'diag_indices_from', 'diagflat', 'diagonal', 'diff', 'digitize', 'disp', 'divide', 'divmod', 'dot', 'double', 'dsplit', 'dstack', 'dtype', 'e', 'ediff1d', 'einsum', 'einsum_path', 'emath', 'empty', 'empty_like', 'equal', 'errstate', 'euler_gamma', 'exp', 'exp2', 'expand_dims', 'expm1', 'extract', 'eye', 'fabs', 'fastCopyAndTranspose', 'fft', 'fill_diagonal', 'find_common_type', 'finfo', 'fix', 'flatiter', 'flatnonzero', 'flexible', 'flip', 'fliplr', 'flipud', 'float', 'float16', 'float32', 'float64', 'float_', 'float_power', 'floating', 'floor', 'floor_divide', 'fmax', 'fmin', 'fmod', 'format_float_positional', 'format_float_scientific', 'format_parser', 'frexp', 'frombuffer', 'fromfile', 'fromfunction', 'fromiter', 'frompyfunc', 'fromregex', 'fromstring', 'full', 'full_like', 'fv', 'gcd', 'generic', 'genfromtxt', 'geomspace', 'get_array_wrap', 'get_include', 'get_printoptions', 'getbufsize', 'geterr', 'geterrcall', 'geterrobj', 'gradient', 'greater', 'greater_equal', 'half', 'hamming', 'hanning', 'heaviside', 'histogram', 'histogram2d', 'histogram_bin_edges', 'histogramdd', 'hsplit', 'hstack', 'hypot', 'i0', 'identity', 'iinfo', 'imag', 'in1d', 'index_exp', 'indices', 'inexact', 'inf', 'info', 'infty', 'inner', 'insert', 'int', 'int0', 'int16', 'int32', 'int64', 'int8', 'int_', 'intc', 'integer', 'interp', 'intersect1d', 'intp', 'invert', 'ipmt', 'irr', 'is_busday', 'isclose', 'iscomplex', 'iscomplexobj', 'isfinite', 
'isfortran', 'isin', 'isinf', 'isnan', 'isnat', 'isneginf', 'isposinf', 'isreal', 'isrealobj', 'isscalar', 'issctype', 'issubclass_', 'issubdtype', 'issubsctype', 'iterable', 'ix_', 'kaiser', 'kron', 'lcm', 'ldexp', 'left_shift', 'less', 'less_equal', 'lexsort', 'lib', 'linalg', 'linspace', 'little_endian', 'load', 'loads', 'loadtxt', 'log', 'log10', 'log1p', 'log2', 'logaddexp', 'logaddexp2', 'logical_and', 'logical_not', 'logical_or', 'logical_xor', 'logspace', 'long', 'longcomplex', 'longdouble', 'longfloat', 'longlong', 'lookfor', 'ma', 'mafromtxt', 'mask_indices', 'mat', 'math', 'matmul', 'matrix', 'matrixlib', 'max', 'maximum', 'maximum_sctype', 'may_share_memory', 'mean', 'median', 'memmap', 'meshgrid', 'mgrid', 'min', 'min_scalar_type', 'minimum', 'mintypecode', 'mirr', 'mod', 'modf', 'moveaxis', 'msort', 'multiply', 'nan', 'nan_to_num', 'nanargmax', 'nanargmin', 'nancumprod', 'nancumsum', 'nanmax', 'nanmean', 'nanmedian', 'nanmin', 'nanpercentile', 'nanprod', 'nanquantile', 'nanstd', 'nansum', 'nanvar', 'nbytes', 'ndarray', 'ndenumerate', 'ndfromtxt', 'ndim', 'ndindex', 'nditer', 'negative', 'nested_iters', 'newaxis', 'nextafter', 'nonzero', 'not_equal', 'nper', 'npv', 'numarray', 'number', 'obj2sctype', 'object', 'object0', 'object_', 'ogrid', 'oldnumeric', 'ones', 'ones_like', 'os', 'outer', 'packbits', 'pad', 'partition', 'percentile', 'pi', 'piecewise', 'place', 'pmt', 'poly', 'poly1d', 'polyadd', 'polyder', 'polydiv', 'polyfit', 'polyint', 'polymul', 'polynomial', 'polysub', 'polyval', 'positive', 'power', 'ppmt', 'printoptions', 'prod', 'product', 'promote_types', 'ptp', 'put', 'put_along_axis', 'putmask', 'pv', 'quantile', 'r_', 'rad2deg', 'radians', 'random', 'rate', 'ravel', 'ravel_multi_index', 'real', 'real_if_close', 'rec', 'recarray', 'recfromcsv', 'recfromtxt', 'reciprocal', 'record', 'remainder', 'repeat', 'require', 'reshape', 'resize', 'result_type', 'right_shift', 'rint', 'roll', 'rollaxis', 'roots', 'rot90', 'round', 'round_', 
'row_stack', 's_', 'safe_eval', 'save', 'savetxt', 'savez', 'savez_compressed', 'sctype2char', 'sctypeDict', 'sctypeNA', 'sctypes', 'searchsorted', 'select', 'set_numeric_ops', 'set_printoptions', 'set_string_function', 'setbufsize', 'setdiff1d', 'seterr', 'seterrcall', 'seterrobj', 'setxor1d', 'shape', 'shares_memory', 'short', 'show_config', 'sign', 'signbit', 'signedinteger', 'sin', 'sinc', 'single', 'singlecomplex', 'sinh', 'size', 'sometrue', 'sort', 'sort_complex', 'source', 'spacing', 'split', 'sqrt', 'square', 'squeeze', 'stack', 'std', 'str', 'str0', 'str_', 'string_', 'subtract', 'sum', 'swapaxes', 'sys', 'take', 'take_along_axis', 'tan', 'tanh', 'tensordot', 'test', 'testing', 'tile', 'timedelta64', 'trace', 'tracemalloc_domain', 'transpose', 'trapz', 'tri', 'tril', 'tril_indices', 'tril_indices_from', 'trim_zeros', 'triu', 'triu_indices', 'triu_indices_from', 'true_divide', 'trunc', 'typeDict', 'typeNA', 'typecodes', 'typename', 'ubyte', 'ufunc', 'uint', 'uint0', 'uint16', 'uint32', 'uint64', 'uint8', 'uintc', 'uintp', 'ulonglong', 'unicode', 'unicode_', 'union1d', 'unique', 'unpackbits', 'unravel_index', 'unsignedinteger', 'unwrap', 'use_hugepage', 'ushort', 'vander', 'var', 'vdot', 'vectorize', 'version', 'void', 'void0', 'vsplit', 'vstack', 'warnings', 'where', 'who', 'zeros', 'zeros_like']
len(dir(numpy)) # how many are there?
## 616

Let’s import another library, the pandas library. Its top-level namespace exposes 141 names.

import pandas
len(dir(pandas))
## 141

Specify Functions within Modules

Sometimes functions from different modules share the same name. For example, test appears in both the numpy and pandas libraries. How do we tell Python which test we are referring to?

To avoid name conflicts, when calling a function in Python we specify both the module name and the function name, like this: LIBRARY_NAME.FUNCTION_NAME

help(numpy.test)
## Help on PytestTester in module numpy._pytesttester object:
## 
## class PytestTester(builtins.object)
##  |  Pytest test runner.
##  |  
##  |  A test function is typically added to a package's __init__.py like so::
##  |  
##  |    from numpy._pytesttester import PytestTester
##  |    test = PytestTester(__name__).test
##  |    del PytestTester
##  |  
##  |  Calling this test function finds and runs all tests associated with the
##  |  module and all its sub-modules.
##  |  
##  |  Attributes
##  |  ----------
##  |  module_name : str
##  |      Full path to the package to test.
##  |  
##  |  Parameters
##  |  ----------
##  |  module_name : module name
##  |      The name of the module to test.
##  |  
##  |  Notes
##  |  -----
##  |  Unlike the previous ``nose``-based implementation, this class is not
##  |  publicly exposed as it performs some ``numpy``-specific warning
##  |  suppression.
##  |  
##  |  Methods defined here:
##  |  
##  |  __call__(self, label='fast', verbose=1, extra_argv=None, doctests=False, coverage=False, durations=-1, tests=None)
##  |      Run tests for module using pytest.
##  |      
##  |      Parameters
##  |      ----------
##  |      label : {'fast', 'full'}, optional
##  |          Identifies the tests to run. When set to 'fast', tests decorated
##  |          with `pytest.mark.slow` are skipped, when 'full', the slow marker
##  |          is ignored.
##  |      verbose : int, optional
##  |          Verbosity value for test outputs, in the range 1-3. Default is 1.
##  |      extra_argv : list, optional
##  |          List with any extra arguments to pass to pytests.
##  |      doctests : bool, optional
##  |          .. note:: Not supported
##  |      coverage : bool, optional
##  |          If True, report coverage of NumPy code. Default is False.
##  |          Requires installation of (pip) pytest-cov.
##  |      durations : int, optional
##  |          If < 0, do nothing, If 0, report time of all tests, if > 0,
##  |          report the time of the slowest `timer` tests. Default is -1.
##  |      tests : test or list of tests
##  |          Tests to be executed with pytest '--pyargs'
##  |      
##  |      Returns
##  |      -------
##  |      result : bool
##  |          Return True on success, false otherwise.
##  |      
##  |      Notes
##  |      -----
##  |      Each NumPy module exposes `test` in its namespace to run all tests for
##  |      it. For example, to run all tests for numpy.lib:
##  |      
##  |      >>> np.lib.test() #doctest: +SKIP
##  |      
##  |      Examples
##  |      --------
##  |      >>> result = np.lib.test() #doctest: +SKIP
##  |      ...
##  |      1023 passed, 2 skipped, 6 deselected, 1 xfailed in 10.39 seconds
##  |      >>> result
##  |      True
##  |  
##  |  __init__(self, module_name)
##  |      Initialize self.  See help(type(self)) for accurate signature.
##  |  
##  |  ----------------------------------------------------------------------
##  |  Data descriptors defined here:
##  |  
##  |  __dict__
##  |      dictionary for instance variables (if defined)
##  |  
##  |  __weakref__
##  |      list of weak references to the object (if defined)
help(pandas.test)
## Help on function test in module pandas.util._tester:
## 
## test(extra_args=None)

Why Alias in Python?

Note that our code may become quite lengthy if we have to type out the full name of a library every time we use one of its functions. This is where aliases come in handy.

import numpy as np
import pandas as pd
help(pd.test)
## Help on function test in module pandas.util._tester:
## 
## test(extra_args=None)

You could also import everything from a library at once, so that its functions can be called without the library name, as follows. However, as discussed above, that might cause name conflicts and is therefore strongly discouraged.

from pandas import *

Handling name conflicts in R

In R, a module/package can be loaded like this:

library(dplyr)

Notice there is no alias involved. If I suspect a potential name conflict for a specific function and would like to specify its package, the full name of the package needs to be used, like this: PACKAGE_NAME::FUNCTION_NAME.

mpg |> 
  # use filter without specifying library
  filter(year == max(year)) |> 
  # use select from the dplyr library
  dplyr::select(-hwy) 
## # A tibble: 117 x 10
##    manufacturer model         displ  year   cyl trans   drv     cty fl    class 
##    <chr>        <chr>         <dbl> <int> <int> <chr>   <chr> <int> <chr> <chr> 
##  1 audi         a4              2    2008     4 manual~ f        20 p     compa~
##  2 audi         a4              2    2008     4 auto(a~ f        21 p     compa~
##  3 audi         a4              3.1  2008     6 auto(a~ f        18 p     compa~
##  4 audi         a4 quattro      2    2008     4 manual~ 4        20 p     compa~
##  5 audi         a4 quattro      2    2008     4 auto(s~ 4        19 p     compa~
##  6 audi         a4 quattro      3.1  2008     6 auto(s~ 4        17 p     compa~
##  7 audi         a4 quattro      3.1  2008     6 manual~ 4        15 p     compa~
##  8 audi         a6 quattro      3.1  2008     6 auto(s~ 4        17 p     midsi~
##  9 audi         a6 quattro      4.2  2008     8 auto(s~ 4        16 p     midsi~
## 10 chevrolet    c1500 suburb~   5.3  2008     8 auto(l~ r        14 r     suv   
## # ... with 107 more rows

If, in a project, the function from one package is always the preferred one, we can use the conflicted package to declare that concisely. For example, in the following code, the select() function from the dplyr package is declared as the preferred one. More on the conflicted package.

conflicted::conflict_prefer("select","dplyr") # use select() in the dplyr package. 

Import functions only

Instead of importing the whole library, we can import only a specific function in Python.

from pandas import test

To do this in R

In R, the PACKAGE_NAME::FUNCTION_NAME syntax shown above can be used to call a function from a package without attaching the package with library() in advance (the package still needs to be installed).

mpg |> 
  # calling the vis_miss() function from the visdat library
  visdat::vis_miss() +
  labs(title = "Missing in the mpg dataset")