Data Types, Subsets, and Merging Data

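In this tutorial, we will mostly use base R functions to manipulate our data; the few helpers that come from other packages (readstata13, psych, dplyr) are called explicitly with the `package::function()` notation.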

# ---- Loading the data ----
# Folder that holds the raw files. The read calls below build their paths from
# it with file.path(), so they do not depend on the current working directory.
data_dir <- path.expand("~/Dropbox/PIBIC2021/data")
setwd(data_dir) # Setting Working Directory (kept for interactive work)
list.files(data_dir) # Listing files in data folder
list.files("~/Dropbox/PIBIC2021/codebook") # Listing files in codebook folder

# CSV
lapopBR19 <- read.csv(file.path(data_dir, "LAPOP2019_work.csv"),
                      header = TRUE, sep = ",")
# DTA (Stata file, read with the readstata13 package)
databr <- readstata13::read.dta13(
  file.path(data_dir, "Brazil_LAPOP_AmericasBarometer2019.dta"),
  generate.factors = TRUE,
  nonint.factors = TRUE
)

# ---- After you load it, get to know your data ----
names(lapopBR19) # Column names
head(lapopBR19) # first rows of the data
tail(lapopBR19) # last rows of the data
str(lapopBR19) # Structure
nrow(lapopBR19) # Number of rows
ncol(lapopBR19) # Number of columns

# Spreadsheet-style viewer; only meaningful in an interactive session
if (interactive()) {
  View(lapopBR19)
}

# ---- Referencing variables ----
# A column is reached through its data set with `$` ...
table(lapopBR19$ideology)

# ... or the expression is evaluated inside the data set with with().
# with() is used instead of attach()/detach() because it leaves the search
# path untouched and cannot mask objects of the same name.
with(lapopBR19, table(ideology))

# Copy the column q1 under the name `gender`
lapopBR19$gender <- lapopBR19$q1
table(lapopBR19$gender)

# Two-way table: ideology (rows) by gender (columns)
table(lapopBR19$ideology, lapopBR19$gender)

# Descriptive statistics from the psych package
psych::describe(lapopBR19$ideology)

Types of Data in R

# Scalar (in R a scalar is simply a vector of length one)
scalar <- 1
# Vectors
vector <- 1:22
# List
list <- list(1:10, LETTERS[1:10])
# NOTE: the objects `vector` and `list` share their names with base functions.
# Calls such as list(...) keep working because R looks for a function when a
# name is followed by "(".

# Matrix
# The three example matrices are created here so that the calls below run;
# the dimensions of the first two follow the sizes stated in the comments.
matrix.1 <- matrix(1:12, nrow = 4, ncol = 3)
matrix.2 <- matrix(1:6, nrow = 3, ncol = 2)
matrix.3 <- matrix(letters[1:4], nrow = 2, ncol = 2) # character matrix
matrix.1
matrix.2
matrix.3
dim(matrix.1) # 4 x 3
dim(matrix.2) # 3 x 2

# ***** Identifying the Structure -- the 'str()' function
# different data structures
# identify the structure of each object
str(scalar)
str(vector)
str(list)
str(matrix.1)
str(matrix.3)

# length() counts elements; dim(), nrow() and ncol() describe a matrix
length(scalar)
length(vector)
dim(matrix.1)
nrow(matrix.3)
ncol(matrix.2)

# Find out the class of a vector or object
class(list) # Class
class(matrix.3)

Managing Vectors

# Can create a vector with 'c()' or as.vector()
v <- c(8:17); v
v <- as.vector(8:17); v
# All elements of a vector must be of the same type; otherwise they are
# automatically coerced to the most flexible type.

# ***** Adding a value to a vector
v1 <- 1:3; v1
v1 <- c(v1, 4:5); v1

# * We can create attributes for the vector
# assigning names to a pre-existing vector
names(v1) <- letters[seq_along(v1)]; v1
names(v1) <- c("Concordo Muito", "Concordo", "Não sei", "Discordo", "Discordo Muito"); v1

# adding names when creating vectors
v2 <- c(Esquerda = -1, Centro = 0, Direita = +1); v2

### Subsetting
v1
v1[2]
# Subsetting with negative integers will omit the elements at the specified positions:
v1[-3]
v1[-c(1:3)]
# * Subsetting with logical values
v1[c(FALSE, TRUE, FALSE, TRUE, FALSE)]
v1[v1 < 3]
v1[v1 < 2 | v1 > 4] # OR

# * Subsetting with names
# Subsetting with names will return the elements with the matching names specified.
# The name must match one of the labels assigned above exactly; a name that
# does not exist returns NA.
v1["Não sei"]

# * Simplifying vs. Preserving
# It's also important to understand the difference between simplifying and
# preserving when subsetting.

# Simplifying subsets returns the simplest possible data structure that can
# represent the output.

# Preserving subsets keeps the structure of the output the same
# as the input.

# For vectors, subsetting with single brackets [ ] preserves while subsetting with
# double brackets [[ ]] simplifies. The change you will notice when simplifying vectors
# is the removal of names.
v1
v1[1]
v1[[1]]

# bind several vectors together
# (v1 and v2 are overwritten here with plain, unnamed integer vectors)
v1 <- 1:4
v2 <- 5:8

cbind(v1, v2) # Column Bind
rbind(v1, v2) # Row Bind

v3 <- 9:12
cbind(v1, v2, v3)
rbind(v1, v2, v3)

Managing Lists

# You can create a list with the 'list()' function.
# Unlike a vector, each component may have a different type and length.
l <- list(
  1:3, "a", c(TRUE, FALSE, TRUE), c(2.5, 4.2)
          )
str(l)
l

# a list whose three components are integer vectors
l <- list( c(1:3), c(4:6), c(7:9))
str(l)
l

# ***** Adding on to a list
# To add additional list components to a list we can leverage the list() and
# append() functions. We can illustrate with the following list.
l1 <- list(1:3, c("a", "b", "c"), c(TRUE, FALSE, TRUE)); l1
str(l1)

# list() nests: l2 has two components, the whole of l1 and the new vector
l2 <- list(l1, c(2.5, 4.2))
str(l2)
# append() extends: l3 has the three components of l1 plus the new vector
l3 <- append(l1, list(c(2.5, 4.2)))
str(l3)

# * Adding names
# Like vectors, we can add names to a list using the 'names()' function, or when we create
# the list
l1
# adding names to a pre-existing list
names(l1) <- c("numbers", "letters", "truefalse")
l1

# ***** Subsetting
# See for more info
# http://adv-r.had.co.nz/Subsetting.html#subsetting-operators

# To subset lists we can utilize the single bracket [ ], double brackets [[ ]],
# and dollar sign $ operators.
# [ ] returns a (shorter) list; [[ ]] and $ return the component itself.

# extract first list item
l1[1]
l1[[1]]
l1["numbers"]
l1[["numbers"]]
l1$numbers

# extract multiple list items
l1[c(1,3)]
# same as above but using the items' names
l1[c("numbers", "truefalse")]

# * Subset list to get elements out of a list
# To extract individual elements out of a specific list item combine
# the [[ (or $) operator with the [ operator:
# extract third element from the second list item
l1[[2]][3]
# same as above but using the item's name
l1[["letters"]][3]
# same as above but using the `$` operator
l1$letters[3]

Managing Matrices

# Every element of a matrix shares one type/mode.

# ***** Creating matrices
# A numeric matrix filled column by column
m1 <- matrix(seq_len(6), nrow = 2, ncol = 3)
m1
dim(m1) # 2 x 3

# cbind() (column-bind) and rbind() (row-bind) also build matrices. The
# vectors being bound become the columns (or rows) of the result, so they
# must be of equal length and mode.
# ***** Adding on to a matrix
m1 <- cbind(v1, v2)
m1
# one more column
cbind(m1, v3)
# one more row
rbind(m1, c(4.1, 8.1))

# Column binding with 'cbind'
a <- c(1, 2, 3)
b <- c(4, 5, 6)
g <- cbind(a, b)
g

# Row binding with 'rbind'
w <- rbind(a, b)
w

# Row and column names are attributes that can be set after creation
m2 <- matrix(seq_len(12), nrow = 4, ncol = 3)
rownames(m2) <- c("row1", "row2", "row3", "row4")
m2
colnames(m2) <- c("col1", "col2", "col3")
m2

# ***** Subsetting
# The general form is m2[rows, columns]; leaving one side empty keeps
# everything along that dimension.
# The [ operator simplifies its result to the lowest possible dimension.

# a single cell
m2[2, 3]
# rows 1 and 2, all columns
m2[1:2, ]
# columns 1 and 3, all rows
m2[, c(1, 3)]
# rows and columns at the same time
m2[1:2, c(1, 3)]

# index vectors stored in objects work the same way
v <- c(1, 2, 4)
f <- c(1, 3)
m2[v, f]
# rows picked by name
m2[c("row1", "row3"), ]

# One column reduced to a plain vector (names dropped)
as.vector(m2[, 2])

Managing Data Frames

# Under the hood, a data frame is a list of equal-length vectors.
# Each element of the list can be thought of as a column and
# the length of each element of the list is the number of rows.
# As a result, data frames can store different classes of objects
# in each column (i.e., numeric, character, factor).

# We can create data.frames by reading in data or we can create them manually
df <- data.frame(col1 = 1:5,
                 col2 = c("M", "F", "F", "M", "F"),
                 col3 = c(TRUE, FALSE, TRUE, TRUE, FALSE),
                 col4 = c(2.5, 4.2, pi, 3.1, 1.7))
df

# Turn the character column into a factor and compare column classes
df$col2 <- as.factor(df$col2)
class(df$col2)
class(df$col4)

# convert a matrix to a data frame using as.data.frame()
m1 <- matrix(1:12, nrow = 4, ncol = 3); m1
class(m1)
m2 <- as.data.frame(m1)
class(m2); m2

# We can merge on columns to a data.frame using cbind()
df
v4 <- c("A", "B", "C", "D", "E")
cbind(df, v4)

# Note that cbind() returns a data frame as long as at least one of its
# arguments is a data frame. A new column can also be added by assignment:
df$col5 <- c("A", "B", "C", "D", "E")
df

# We can also use the rbind() function to merge data frame rows together.
# To do that, they must have the same number of columns, and most importantly,
# in the same order, so cell values match their correct column.
# The new row is supplied as a one-row data frame: c() would coerce all of its
# values to character and turn the integer, logical and numeric columns of the
# result into character.
df
new_row <- data.frame(col1 = 4L, col2 = "F", col3 = TRUE, col4 = 1.1,
                      col5 = "5")
df2 <- rbind(df, new_row)
df2
str(df2) # column types are unchanged

# ***** Subsetting
# Selecting parts of a dataframe
# Dataframe[rows, columns]

# subsetting by row numbers
df[2:3, ]
# subsetting columns like a list
df[c("col2", "col4")]
# subsetting columns like a matrix
df[ , c("col2", "col4")]
# subset for both rows and columns
df
df[1:2, c(1, 3)]
# selecting a single column matrix-style simplifies the result to a vector
# (here a factor); use df[ , 2, drop = FALSE] to keep a data frame
df[ , 2]

Reordering Columns

# Columns can be rearranged by giving their positions in the desired order
my_data2 <- df[, c(5L, 4L, 1L, 2L, 3L)]
my_data2

# ... or by listing their names in the desired order
# (columns left out of the vector are dropped from the result)
col_order <- c("col4", "col3", "col2", "col1")
my_data3 <- df[col_order]
my_data3

Subsetting datasets based on specific conditions

# Load the saved workspace file; the code below uses the `datalapop` data frame
load("~/Dropbox/PIBIC2021/data/datalapop.RData")

table(datalapop$country) # countries
table(datalapop$lapopwave) # years
table(datalapop$country, datalapop$lapopwave) # crosstab

# Country-wave identifier, e.g. "Brazil2010"
datalapop$countrycode2 <- paste0(datalapop$country, datalapop$lapopwave)
table(datalapop$countrycode2)

# Column selection
dataideo <- subset(datalapop,
                   select = c("country", "lapopwave", "countrycode2", "ideo_lapop"))
str(dataideo)
# Spreadsheet-style viewer; only meaningful in an interactive session
if (interactive()) {
  View(dataideo)
}

# Central America Countries
datacentralamerica<-datalapop[datalapop$country=="Costa Rica" | datalapop$country=="El Salvador" |
                                datalapop$country=="Guatemala" | datalapop$country=="Honduras" |
                                datalapop$country=="Nicaragua", ] # Matrix notation

datacentral<-subset(datalapop, country=="Costa Rica" | country=="El Salvador" |
                      country=="Guatemala" | country=="Honduras" |
                      country=="Nicaragua") # subset(), == equals | 'OR'

# Every country that is NOT in Central America
datasul<-subset(datalapop, country!="Costa Rica" & country!="El Salvador" &
                  country!="Guatemala" & country!="Honduras" &
                  country!="Nicaragua") #  subset(), != different & 'AND'

# Selecting by Year
table(datalapop$lapopwave)
data2018 <- subset(datalapop, lapopwave == 2018)
table(data2018$lapopwave)

# Waves inside a timespan (after 2008 and before 2014)
data2010 <- subset(datalapop, lapopwave > 2008 & lapopwave < 2014)
table(data2010$lapopwave)

# Subset by country and wave: two equivalent ways
dataBR10 <- subset(datalapop, country == "Brazil" & lapopwave == "2010")
# Uses the identifier created above (countrycode2).
# NOTE(review): the original referenced `countrycode`; confirm whether that
# column also exists in datalapop.
dataBR2010 <- subset(datalapop, countrycode2 == "Brazil2010")

# Columns subset
dataBR10_ideo <- subset(dataBR2010, select = c("countrycode2", "ideo_lapop"))

# 2018 wave, only respondents with ideo_lapop below 5
# Bracket subsetting with a logical condition keeps a row of NAs for every
# respondent whose ideo_lapop is missing: the number of rows changes, but not
# the number of columns.
data2018left_NA <- data2018[data2018$ideo_lapop < 5, ]

# %in% never returns NA, so respondents with a missing value are dropped
data2018left2 <- data2018[data2018$ideo_lapop %in% c(1:4), ] # values between 1 and 4
# subset() also drops rows where the condition is NA
data2018left3 <- subset(data2018, ideo_lapop < 5)

# Checking the number of NAs
table(is.na(data2018left_NA$ideo_lapop)) # 5703 in the original notes
table(is.na(data2018left2$ideo_lapop)) # 0
nrow(data2018left_NA) - nrow(data2018left2) # rows that exist only because of NAs

# mean() returns NA when the vector contains NAs unless na.rm = TRUE
mean(data2018left_NA$ideo_lapop)
mean(data2018left2$ideo_lapop)
mean(data2018left_NA$ideo_lapop, na.rm = TRUE)

Merging Datasets

# Second data frame; it shares only the key column `col1` with df
df
dfZ <- data.frame(
  col1 = 1:5,
  gender = c("M", "F", "F", "M", "F"),
  votedPT = c("Yes", "No", "Yes", "No", "Yes"),
  econ_percep = c(.21, .5, .35, .7, .11),
  feelthermoPT = c(25, 42, 0, 74, 90)
)
dfZ

# General form: merge(x, y, by = "col")
help("merge")

# Only keys present in both data frames
dfmerged <- merge(df, dfZ, by = "col1")
dfmerged
# All keys from both data frames
dfmerged2 <- merge(df, dfZ, by = "col1", all = TRUE)
dfmerged2
# All keys from the first data frame (df)
dfmerged3 <- merge(df, dfZ, by = "col1", all.x = TRUE)
dfmerged3
# All keys from the second data frame (dfZ)
dfmerged4 <- merge(df, dfZ, by = "col1", all.y = TRUE)
dfmerged4

# Stacking data frames whose rows and columns differ (dplyr package)
dplyr::bind_rows(df, dfZ)


# A new column filled with NA
dfmerged[["col6"]] <- NA
dfmerged

Data Types, Subsets, and Merging Data

Robert Vidigal, PhD

Types of Data in R

Managing Vectors

Managing Lists

Managing Matrices

Managing Data Frames

Reordering Columns

Subsetting datasets based on specific conditions

Merging Datasets

Further Reading