Calling Python from R

Importing Modules

  • Functions and other data within Python modules and classes can be accessed via the $ operator (analogous to the way you would interact with an R list, environment, or reference class).
# Load reticulate, the R interface to Python
library(reticulate)
# import() returns a handle to a Python module; its members are reached with `$`
os <- import("os")
# pandas is imported the same way (the handle is not used further in this snippet)
pd <- import("pandas")
# Call Python's os.getcwd() from R; the printed result is an R character value
os$getcwd()
## [1] "/Users/chenzhenghui/Dropbox/NTU_System_P/Data_Science_R_Python"
# import_builtins() gives access to Python's built-in functions such as max()
builtins <- import_builtins()
# The R numeric vector is handed to Python's max(); the printed result is 5
builtins$max(c(1, 2, 3, 4, 5))
## [1] 5

Type Conversions

  • When calling into Python, R data types are automatically converted to their equivalent Python types.

  • When values are returned from Python to R they are converted back to R types.

| R | Python | Examples |
|---|---|---|
| Single-element vector | Scalar | `1`, `1L`, `TRUE`, `"foo"` |
| Multi-element vector | List | `c(1.0, 2.0, 3.0)`, `c(1L, 2L, 3L)` |
| List of multiple types | Tuple | `list(1L, TRUE, "foo")` |
| Named list | Dict | `list(a = 1L, b = 2.0)`, `dict(x = x_data)` |
| Matrix/Array | NumPy ndarray | `matrix(c(1,2,3,4), nrow = 2, ncol = 2)` |
| Data Frame | Pandas DataFrame | `data.frame(x = c(1,2,3), y = c("a", "b", "c"))` |
| Function | Python function | `function(x) x + 1` |
| Raw | Python bytearray | `as.raw(c(1:10))` |
| NULL, TRUE, FALSE | None, True, False | `NULL`, `TRUE`, `FALSE` |
# Import NumPy with automatic Python-to-R conversion disabled, so the results
# of NumPy calls stay Python objects until they are converted explicitly
np <- import("numpy", convert = FALSE)

# do some array manipulations with NumPy
a <- np$array(c(1:4))
class(a)
## [1] "numpy.ndarray"         "python.builtin.object"
# Bind the cumulative sum to `s` rather than `sum`, so that the base R
# function sum() is not masked by a variable of the same name
s <- a$cumsum()

# convert to R explicitly at the end
py_to_r(s)
## [1]  1  3  6 10

Sourcing Python scripts

## source.py ##
import pandas as pd
import matplotlib.pyplot as plt

def read_file(file):
  """Load a CSV file into a pandas DataFrame.

  The first column is used as the index, and pandas is asked to parse that
  index as dates.

  Args:
    file: Path or file-like object accepted by ``pd.read_csv``.

  Returns:
    The ``pandas.DataFrame`` produced by ``pd.read_csv``.
  """
  return pd.read_csv(file, index_col=0, parse_dates=True)


def add(x, y):
  """Return ``x + y`` for any two operands that support the ``+`` operator."""
  total = x + y
  return total


# Module-level value (a two-decimal approximation of pi); the R side of this
# tutorial reads it back after sourcing or importing this script.
PI = 3.14
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Run source.py; objects it defines (e.g. read_file, PI) are used directly from R below
source_python('source.py')
# read_file() is the Python function defined in source.py; its result is
# wrapped as a tibble on the R side
df <- as_tibble(read_file("air_quality_no2.csv"))
class(df)
## [1] "tbl_df"     "tbl"        "data.frame"
df
## # A tibble: 1,035 × 3
##    station_antwerp station_paris station_london
##              <dbl>         <dbl>          <dbl>
##  1           NaN           NaN               23
##  2            50.5          25               19
##  3            45            27.7             19
##  4           NaN            50.4             16
##  5           NaN            61.9            NaN
##  6           NaN            72.4             26
##  7           NaN            77.7             32
##  8           NaN            67.9             32
##  9           NaN            56               28
## 10           NaN            34.5             21
## # … with 1,025 more rows
# PI was defined in source.py and is available as an R object after source_python()
PI
## [1] 3.14
# Run a line of Python code; per the reticulate documentation this executes in
# the Python main module, so the variable is reachable from R as py$py_x
py_run_string("py_x = 100")
# Import source.py as a Python module. The handle is named `src` rather than
# `source` so that the base R function source() is not masked.
src <- import("source")
src$PI
## [1] 3.14
# `py` gives access to objects living in the Python main module
py$PI
## [1] 3.14

Object Conversion

  • By default when Python objects are returned to R they are converted to their equivalent R types.

  • However, if you’d rather make conversion from Python to R explicit and deal in native Python objects by default you can pass convert = FALSE to the import function.

# Load NumPy with convert = FALSE: values returned by NumPy stay native Python
# objects instead of being converted to R types automatically
np <- import("numpy", convert = FALSE)

# Build a NumPy array from the integers 1..4 and take its running total
a <- np$array(seq_len(4L))
s <- a$cumsum()

# Only now turn the Python result into an R vector
py_to_r(s)
## [1]  1  3  6 10
# Build a 2 x 2 x 2 NumPy array holding 1..8. The bounds 1 and 9 are R doubles
# (the printout below shows floats); the dimensions are passed as integers
# (L suffix) -- presumably because NumPy expects integer dimensions.
x <- np$arange(1, 9)$reshape(2L, 2L, 2L)
class(x)
## [1] "numpy.ndarray"         "python.builtin.object"
x
## [[[1. 2.]
##   [3. 4.]]
## 
##  [[5. 6.]
##   [7. 8.]]]
# Explicit conversion to an R array. Judging by the printouts, each element
# keeps its index position (NumPy x[0, 0, 1] == 2 appears as y[1, 1, 2]); the
# display differs because R prints one slice of the last dimension at a time.
y <- py_to_r(x)
class(y)
## [1] "array"
y
## , , 1
## 
##      [,1] [,2]
## [1,]    1    3
## [2,]    5    7
## 
## , , 2
## 
##      [,1] [,2]
## [1,]    2    4
## [2,]    6    8

Calling R from Python

import rpy2
print(rpy2.__version__)

from rpy2.robjects.packages import importr
# import R's "base" package
base = importr('base')

# import R's "utils" package
utils = importr('utils')

# import R's "tidyverse" package
tidyverse = importr('tidyverse')

Installing packages

# R package names
packnames = ('ggplot2', 'hexbin')

# R vector of strings
from rpy2.robjects.vectors import StrVector
import rpy2.robjects.packages as rpackages


# Selectively install only the packages that are not installed yet.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if names_to_install:
    # `utils` is the handle to R's "utils" package created earlier with
    # importr('utils'). Pick a CRAN mirror (the first in R's list) before
    # installing, as the rpy2 introduction does, so that install.packages()
    # has a repository to download from in a non-interactive session.
    utils.chooseCRANmirror(ind=1)
    utils.install_packages(StrVector(names_to_install))

The r instance

import rpy2.robjects as robjects

# Look up the R object named `pi` through the `r` instance; index 0 reads its value
pi = robjects.r['pi']
pi[0]

# Evaluate a string of R code: it defines the R function `f` and calls it once
robjects.r('''
        # create a function `f`
        f <- function(r, verbose=FALSE) {
            if (verbose) {
                cat("I am calling f().\n")
            }
            2 * pi * r
        }
        # call the function `f` with argument value 3
        f(3)
        ''')

# Fetch `f` from R's global environment ...
r_f = robjects.globalenv['f']
# ... and print its R representation
print(r_f.r_repr())
# The same function can also be looked up by name through the `r` instance
r_f = robjects.r['f']
# Call the R function from Python with r = 3 (2 * pi * 3 on the R side)
res = r_f(3)

R vectors

# R's `pi` is a vector rather than a bare scalar, so it has a length
len(robjects.r['pi'])
# Its value is read with Python's 0-based indexing
robjects.r['pi'][0]

Creating rpy2 vectors

# Character vector built from a Python list of strings; r_repr() shows how R
# would write the object (per the rpy2 introduction: c("abc", "def"))
res = robjects.StrVector(['abc', 'def'])
print(res.r_repr())

# Integer vector (per the rpy2 introduction, printed as 1:3)
res = robjects.IntVector([1, 2, 3])
print(res.r_repr())

# Double-precision vector (per the rpy2 introduction: c(1.1, 2.2, 3.3))
res = robjects.FloatVector([1.1, 2.2, 3.3])
print(res.r_repr())

Calling R functions

# Fetch R's sum() as a Python callable
rsum = robjects.r['sum']
# Call it on an R integer vector; [0] extracts the value from the returned R vector
rsum(robjects.IntVector([1,2,3]))[0]

Examples

import rpy2.robjects as robjects

# Shorthand for the embedded R instance
r = robjects.r

# R integer vector holding 0..9
x = robjects.IntVector(range(10))
# Ten random draws obtained by calling R's rnorm()
y = r.rnorm(10)

# Bare names: these display the vectors when run interactively
x
y

from rpy2.robjects import FloatVector
from rpy2.robjects.packages import importr
stats = importr('stats')
base = importr('base')

# Response values for the control (ctl) and treatment (trt) groups, 10 each
ctl = FloatVector([4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14])
trt = FloatVector([4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69])
# R's gl(): a factor of length 20 with two levels labelled 'Ctl' and 'Trt'
group = base.gl(2, 10, 20, labels = ['Ctl','Trt'])
# NOTE: per the rpy2 documentation, `+` on rpy2 vectors concatenates (like R's
# c()), so `weight` holds all 20 values; it is not element-wise addition.
weight = ctl + trt

# Publish both objects in R's global environment so that the formula strings
# passed to lm() below can refer to them by name
robjects.globalenv['weight'] = weight
robjects.globalenv['group'] = group
# Fit the linear model and print its ANOVA table
lm_D9 = stats.lm('weight ~ group')
print(stats.anova(lm_D9))

# omitting the intercept
lm_D90 = stats.lm('weight ~ group - 1')
print(base.summary(lm_D90))
  • Suppose there is an R script (preprocess.R):
## preprocess.R

# Small table of income (Ingresos), bonuses (Bonos) and debt (Deuda),
# with one row per person
dataR <- data.frame(
  Ingresos = c(23, 45, 24, 23, 54),
  Bonos = c(23, 45, 12, 67, 54),
  Deuda = c(23, 4, 1, 6, 3),
  row.names = c("Nathy", "Tomas", "Joe", "Emily", "Javi")
)

# Mean income over all rows
promedio_ingresos <- mean(dataR$Ingresos)

# Incomes strictly above the mean, in ascending order
Max_Ing <- with(dataR, sort(Ingresos[Ingresos > promedio_ingresos]))
import rpy2
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects
r = robjects.r
# Run the script through R's source(). Per the R documentation, source()
# returns a list holding the value of the last evaluated expression and its
# visibility flag.
output = r.source("preprocess.R")
# presumably the `value` element, i.e. the last expression of preprocess.R (Max_Ing)
output[0]
# presumably the `visible` element
output[1]