library(reticulate) # to use python in RStudio
library(tidyverse) # data wrangling and plotting with R
In this note, R code chunks are in light pink, while python in light blue. I keep it mainly as a study note, but hopefully it might be of interest to fellow R users learning Python, and Python users learning R.
py$object_namer.object_nameThere is still a lot more we can discuss regarding data wrangling in R vs Python, so this is only part I for pandas. More to come in the future.
Panda examples and code in this note are mainly based on pandas.pydata.org
import pandas as pd
df = pd.DataFrame(
{
"Name": [
"Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth",
],
"Age": [22, 35, 58],
"Sex": ["male", "male", "female"],
}
)
Worth noting that we also have tibbles in R, which is similar to data frames, but with some outdated features removed. For more details please see here.
df <- data.frame(
Name = c("Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth"),
Age = c(11,35,58),
Sex = c("male", "male", "female")
)
library(tidyverse)
tb <- tibble(
Name = c("Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth"),
Age = c(11,35,58),
Sex = c("male", "male", "female")
)
df.describe()
## Age
## count 3.000000
## mean 38.333333
## std 18.230012
## min 22.000000
## 25% 28.500000
## 50% 35.000000
## 75% 46.500000
## max 58.000000
summary(df)
## Name Age Sex
## Length:3 Min. :11.00 Length:3
## Class :character 1st Qu.:23.00 Class :character
## Mode :character Median :35.00 Mode :character
## Mean :34.67
## 3rd Qu.:46.50
## Max. :58.00
pcovid = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv")
# read_csv is much fater than read.csv in R
rcovid <- read_csv(file = 'https://covid.ourworldindata.org/data/owid-covid-data.csv')
Let me sneak in some code to subset the data (more on data manipulation later). This also gives us an opportunity to check out how we can utilize R object in Python and python object in R, please see the next code chunk.
df <-rcovid |>
# select entity, continent and cases
select(entity=location, #you can change the column name at the same time
date,
continent,
new_deaths_smoothed_per_million
)
# it is simple to call object created in R, just use r.
df = r.df
df.head(10)
## entity date continent new_deaths_smoothed_per_million
## 0 Afghanistan 2020-02-24 Asia NaN
## 1 Afghanistan 2020-02-25 Asia NaN
## 2 Afghanistan 2020-02-26 Asia NaN
## 3 Afghanistan 2020-02-27 Asia NaN
## 4 Afghanistan 2020-02-28 Asia NaN
## 5 Afghanistan 2020-02-29 Asia NaN
## 6 Afghanistan 2020-03-01 Asia NaN
## 7 Afghanistan 2020-03-02 Asia NaN
## 8 Afghanistan 2020-03-03 Asia NaN
## 9 Afghanistan 2020-03-04 Asia NaN
test = df.sort_values(by="new_deaths_smoothed_per_million", ascending=False).head(5) #create an object in python so we can try calling it later in R
test
## entity date continent new_deaths_smoothed_per_million
## 61591 Gibraltar 2021-01-21 Europe 144.167
## 61590 Gibraltar 2021-01-20 Europe 139.927
## 61589 Gibraltar 2021-01-19 Europe 127.207
## 61592 Gibraltar 2021-01-22 Europe 122.966
## 61588 Gibraltar 2021-01-18 Europe 122.966
# simply use py$ to call an object created in python
test_py_2_r <- py$test
test_py_2_r
## entity date continent
## 61591 Gibraltar <environment: 0x000000003830eef0> Europe
## 61590 Gibraltar <environment: 0x0000000038316cc0> Europe
## 61589 Gibraltar <environment: 0x00000000383228d0> Europe
## 61592 Gibraltar <environment: 0x000000003832c5c0> Europe
## 61588 Gibraltar <environment: 0x0000000038334390> Europe
## new_deaths_smoothed_per_million
## 61591 144.167
## 61590 139.927
## 61589 127.207
## 61592 122.966
## 61588 122.966
df.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 173714 entries, 0 to 173713
## Data columns (total 4 columns):
## entity 173714 non-null object
## date 173714 non-null object
## continent 173714 non-null object
## new_deaths_smoothed_per_million 148691 non-null float64
## dtypes: float64(1), object(3)
## memory usage: 5.3+ MB
df.describe()
## new_deaths_smoothed_per_million
## count 148691.000000
## mean 1.669912
## std 3.593401
## min 0.000000
## 25% 0.016000
## 50% 0.290000
## 75% 1.763000
## max 144.167000
There are many good functions to choose from, here are some of my favorites.
str(df)
## tibble [173,714 x 4] (S3: tbl_df/tbl/data.frame)
## $ entity : chr [1:173714] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ date : Date[1:173714], format: "2020-02-24" "2020-02-25" ...
## $ continent : chr [1:173714] "Asia" "Asia" "Asia" "Asia" ...
## $ new_deaths_smoothed_per_million: num [1:173714] NA NA NA NA NA NA NA NA NA NA ...
summary(df)
## entity date continent
## Length:173714 Min. :2020-01-01 Length:173714
## Class :character 1st Qu.:2020-09-16 Class :character
## Mode :character Median :2021-03-29 Mode :character
## Mean :2021-03-23
## 3rd Qu.:2021-09-29
## Max. :2022-04-02
##
## new_deaths_smoothed_per_million
## Min. : 0.000
## 1st Qu.: 0.016
## Median : 0.290
## Mean : 1.670
## 3rd Qu.: 1.763
## Max. :144.167
## NA's :25023
glimpse(df)
## Rows: 173,714
## Columns: 4
## $ entity <chr> "Afghanistan", "Afghanistan", "Afghani~
## $ date <date> 2020-02-24, 2020-02-25, 2020-02-26, 2~
## $ continent <chr> "Asia", "Asia", "Asia", "Asia", "Asia"~
## $ new_deaths_smoothed_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
df |> summarytools::descr() # from summarytools package
## Descriptive Statistics
## df$new_deaths_smoothed_per_million
## N: 173714
##
## new_deaths_smoothed_per_million
## ----------------- ---------------------------------
## Mean 1.67
## Std.Dev 3.59
## Min 0.00
## Q1 0.02
## Median 0.29
## Q3 1.76
## Max 144.17
## MAD 0.43
## IQR 1.75
## CV 2.15
## Skewness 7.48
## SE.Skewness 0.01
## Kurtosis 138.26
## N.Valid 148691.00
## Pct.Valid 85.60
skimr::skim(df) # from skimr package
| Name | df |
| Number of rows | 173714 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| entity | 0 | 1.00 | 4 | 32 | 0 | 239 | 0 |
| continent | 10320 | 0.94 | 4 | 13 | 0 | 6 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2020-01-01 | 2022-04-02 | 2021-03-29 | 823 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| new_deaths_smoothed_per_million | 25023 | 0.86 | 1.67 | 3.59 | 0 | 0.02 | 0.29 | 1.76 | 144.17 | ▇▁▁▁▁ |
# our world in data created collectives of entities based on income, let's take a look
income<-df |>
# select three variables
select(entity,date, new_deaths_smoothed_per_million) |>
# filter only observations among these collective entities regarding income
filter(grepl("income",entity))
income |>
count(entity)
## # A tibble: 4 x 2
## entity n
## <chr> <int>
## 1 High income 802
## 2 Low income 770
## 3 Lower middle income 802
## 4 Upper middle income 802
# select two variables
income |>
select(date,new_deaths_smoothed_per_million)
## # A tibble: 3,176 x 2
## date new_deaths_smoothed_per_million
## <date> <dbl>
## 1 2020-01-22 NA
## 2 2020-01-23 NA
## 3 2020-01-24 NA
## 4 2020-01-25 NA
## 5 2020-01-26 NA
## 6 2020-01-27 NA
## 7 2020-01-28 0
## 8 2020-01-29 0
## 9 2020-01-30 0
## 10 2020-01-31 0
## # ... with 3,166 more rows
income = r.income # call the object "income" created in R
# select two variables
income[["date","new_deaths_smoothed_per_million"]]
## date new_deaths_smoothed_per_million
## 0 2020-01-22 NaN
## 1 2020-01-23 NaN
## 2 2020-01-24 NaN
## 3 2020-01-25 NaN
## 4 2020-01-26 NaN
## ... ... ...
## 3171 2022-03-29 0.449
## 3172 2022-03-30 0.450
## 3173 2022-03-31 0.432
## 3174 2022-04-01 0.407
## 3175 2022-04-02 0.395
##
## [3176 rows x 2 columns]
income[["date","new_deaths_smoothed_per_million"]].shape # shows (rows, columns)
## (3176, 2)
# filter observations based on values of two variables
income |>
filter(new_deaths_smoothed_per_million < 1,
entity == "High income")
## # A tibble: 193 x 3
## entity date new_deaths_smoothed_per_million
## <chr> <date> <dbl>
## 1 High income 2020-01-28 0
## 2 High income 2020-01-29 0
## 3 High income 2020-01-30 0
## 4 High income 2020-01-31 0
## 5 High income 2020-02-01 0
## 6 High income 2020-02-02 0
## 7 High income 2020-02-03 0
## 8 High income 2020-02-04 0
## 9 High income 2020-02-05 0
## 10 High income 2020-02-06 0
## # ... with 183 more rows
# filter observations based on values of two variables
income[(income["new_deaths_smoothed_per_million"] < 1) &
(income["entity"] == "Low income")]
## entity date new_deaths_smoothed_per_million
## 808 Low income 2020-02-29 0.000
## 809 Low income 2020-03-01 0.000
## 810 Low income 2020-03-02 0.000
## 811 Low income 2020-03-03 0.000
## 812 Low income 2020-03-04 0.000
## ... ... ... ...
## 1567 Low income 2022-03-29 0.009
## 1568 Low income 2022-03-30 0.009
## 1569 Low income 2022-03-31 0.011
## 1570 Low income 2022-04-01 0.011
## 1571 Low income 2022-04-02 0.012
##
## [764 rows x 3 columns]
income |>
filter(entity=="High income" | entity=="Low income") |>
select(date,new_deaths_smoothed_per_million)
## # A tibble: 1,572 x 2
## date new_deaths_smoothed_per_million
## <date> <dbl>
## 1 2020-01-22 NA
## 2 2020-01-23 NA
## 3 2020-01-24 NA
## 4 2020-01-25 NA
## 5 2020-01-26 NA
## 6 2020-01-27 NA
## 7 2020-01-28 0
## 8 2020-01-29 0
## 9 2020-01-30 0
## 10 2020-01-31 0
## # ... with 1,562 more rows
# use the .loc operator before the selection bracket []
income.loc[(income["entity"]=="High income")|(income["entity"]=="Low income"), # filter criteria
("date","new_deaths_smoothed_per_million")] # columns/variables we want
## date new_deaths_smoothed_per_million
## 0 2020-01-22 NaN
## 1 2020-01-23 NaN
## 2 2020-01-24 NaN
## 3 2020-01-25 NaN
## 4 2020-01-26 NaN
## ... ... ...
## 1567 2022-03-29 0.009
## 1568 2022-03-30 0.009
## 1569 2022-03-31 0.011
## 1570 2022-04-01 0.011
## 1571 2022-04-02 0.012
##
## [1572 rows x 2 columns]
income |>
mutate(new_deaths_round = round(new_deaths_smoothed_per_million))
## # A tibble: 3,176 x 4
## entity date new_deaths_smoothed_per_million new_deaths_round
## <chr> <date> <dbl> <dbl>
## 1 High income 2020-01-22 NA NA
## 2 High income 2020-01-23 NA NA
## 3 High income 2020-01-24 NA NA
## 4 High income 2020-01-25 NA NA
## 5 High income 2020-01-26 NA NA
## 6 High income 2020-01-27 NA NA
## 7 High income 2020-01-28 0 0
## 8 High income 2020-01-29 0 0
## 9 High income 2020-01-30 0 0
## 10 High income 2020-01-31 0 0
## # ... with 3,166 more rows
income["new_deaths_round"] = round(income["new_deaths_smoothed_per_million"])
income.head()
## entity date new_deaths_smoothed_per_million new_deaths_round
## 0 High income 2020-01-22 NaN NaN
## 1 High income 2020-01-23 NaN NaN
## 2 High income 2020-01-24 NaN NaN
## 3 High income 2020-01-25 NaN NaN
## 4 High income 2020-01-26 NaN NaN
income.drop(columns="new_deaths_round", inplace=True) #drop the column: new_deaths_round
test <- income |>
# new names = old names
rename(country=entity,
new_deaths_pm = new_deaths_smoothed_per_million)
head(test,5)
## # A tibble: 5 x 3
## country date new_deaths_pm
## <chr> <date> <dbl>
## 1 High income 2020-01-22 NA
## 2 High income 2020-01-23 NA
## 3 High income 2020-01-24 NA
## 4 High income 2020-01-25 NA
## 5 High income 2020-01-26 NA
test = income.rename(
# old names : new names
columns={
"entity" : "country",
"new_deaths_smoothed_per_million" : "new_deaths_pm"
}
)
test.head()
## country date new_deaths_pm
## 0 High income 2020-01-22 NaN
## 1 High income 2020-01-23 NaN
## 2 High income 2020-01-24 NaN
## 3 High income 2020-01-25 NaN
## 4 High income 2020-01-26 NaN
# a list of preload data available in r
data()
# choose a preload dataset cars in R. It has two variables: speed and distance needed to stop
head(cars)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
If only the above summary statistics are needed, summary() function in R is handy. A more thourough list is available too using descr() in the summarytools package introduced above. However, if we want more tailor-made info, the following python code is nice and easy. The advantage of being multilingual, is that we can pick whichever language that suits the task at hand the most. :) Click here for a comprehensive list of summary statistics that we can summon through the following code.
cars=r.cars
cars.agg(
{
'speed' : ['max','mean','std','skew','kurt'],
'dist' : ['min','median','skew','kurt'],
}
)
## speed dist
## kurt -0.508994 0.405053
## max 25.000000 NaN
## mean 15.400000 NaN
## median NaN 36.000000
## min NaN 2.000000
## skew -0.117510 0.806895
## std 5.287644 NaN
income |>
group_by(entity) |>
summarise(mean_new_deaths=mean(new_deaths_smoothed_per_million,na.rm=TRUE),
median_new_deaths=median(new_deaths_smoothed_per_million,na.rm=TRUE),
sd_new_deaths=sd(new_deaths_smoothed_per_million,na.rm=TRUE)) |>
arrange(desc(mean_new_deaths)) # arrange in descending order
## # A tibble: 4 x 4
## entity mean_new_deaths median_new_deaths sd_new_deaths
## <chr> <dbl> <dbl> <dbl>
## 1 High income 2.38 2.11 1.65
## 2 Upper middle income 1.23 1.25 0.607
## 3 Lower middle income 0.488 0.427 0.376
## 4 Low income 0.0846 0.073 0.0612
# alternatively, use "summarise_" functions to do it
# summarize all numeric variables:
income |>
group_by(entity) |>
summarise_if(is.numeric,list(mean_deaths=mean,median_deaths=median,sd_deaths=sd),na.rm=TRUE)
## # A tibble: 4 x 4
## entity mean_deaths median_deaths sd_deaths
## <chr> <dbl> <dbl> <dbl>
## 1 High income 2.38 2.11 1.65
## 2 Low income 0.0846 0.073 0.0612
## 3 Lower middle income 0.488 0.427 0.376
## 4 Upper middle income 1.23 1.25 0.607
# summarize all variables:
cars |>
summarise_all(list(mean=mean,median=median), na.rm=TRUE) #get mean and median for all variables
## speed_mean dist_mean speed_median dist_median
## 1 15.4 42.98 15 36
When applying series of operations on one object, we can do piping in Python too as follows. Pretty cool, isn’t it? :P But the piping isn’t as convenient as it is in R (or maybe I haven’t found the best way?). For example, I couldn’t find a quick and easy way to continue the piping by sorting the entities by mean as I did in R in the above code chunk.
income\
.rename(columns = {'new_deaths_smoothed_per_million':'new_deaths_pm'})\
.groupby("entity")\
.agg({
'new_deaths_pm': ['mean','median','std']
})
## new_deaths_pm
## mean median std
## entity
## High income 2.376943 2.1125 1.654180
## Low income 0.084636 0.0730 0.061153
## Lower middle income 0.488049 0.4270 0.375865
## Upper middle income 1.233544 1.2500 0.606711
income |>
count(entity)
## # A tibble: 4 x 2
## entity n
## <chr> <int>
## 1 High income 802
## 2 Low income 770
## 3 Lower middle income 802
## 4 Upper middle income 802
income['entity'].value_counts()
## Upper middle income 802
## High income 802
## Lower middle income 802
## Low income 770
## Name: entity, dtype: int64
Here is an example of piping in R
rcovid |>
select(entity=location,date,new_deaths_smoothed_per_million) |>
filter(grepl("income",entity), #select relevant observations
date >= as.Date("2020-10-01")) |>
mutate(#change units to enhance axis readability
new_deaths_per_billion=new_deaths_smoothed_per_million * 1000,
# set the factor order, so the facets are ordered by income in the graph
entity=fct_relevel(as.factor(entity), "High income",
"Upper middle income","Lower middle income","Low income")) |>
select(-new_deaths_smoothed_per_million) |>
ggplot(aes(x=date,y=new_deaths_per_billion,fill=entity))+
geom_area(alpha=0.5)+
scale_fill_brewer(palette="Dark2")+
# customize x axis labels
scale_x_date(date_labels = "%b %y",
date_breaks = "4 months")+
facet_grid(rows=vars(entity),scales="free_y")+
theme_minimal()+
labs(title="Daily New Deaths per Billion in the World by Income",
subtitle=glue::glue("Data since Oct 2020, last updated {Sys.Date()}"),
caption="By: @mena_wang Data: ourworldindata")+
theme(legend.position="none",
axis.title=element_blank(),
plot.caption = element_text(color="grey50"),
plot.subtitle = element_text(color="grey50",face="italic"))
# save this image on local disk
ggsave("images/daily_new_deaths_by_income.png")