# In this tutorial, we will only use base R functions to manipulate our data.
# ---- Loading the data ----
# Set the working directory so the relative file names used below resolve.
setwd("~/Dropbox/PIBIC2021/data")

# List what is available in the data and codebook folders.
list.files("~/Dropbox/PIBIC2021/data")
list.files("~/Dropbox/PIBIC2021/codebook")

# CSV file: the first row holds the column names, fields are comma-separated.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
lapopBR19 <- read.csv("LAPOP2019_work.csv", header = TRUE, sep = ",")

# Stata (.dta) file, read with the readstata13 package.
databr <- readstata13::read.dta13(
  "Brazil_LAPOP_AmericasBarometer2019.dta",
  generate.factors = TRUE,
  nonint.factors = TRUE
)
# ---- First look at the data ----
# Column names
names(lapopBR19)
# First and last rows
head(lapopBR19)
tail(lapopBR19)
# Structure: type and first values of every column
str(lapopBR19)
# Dimensions: number of rows, then number of columns
nrow(lapopBR19)
ncol(lapopBR19)
# Open the data in the spreadsheet-style viewer (interactive sessions only)
View(lapopBR19)

# Variables live inside the data frame: either prefix them with the data set
# name every time ...
table(lapopBR19$ideology)
# ... or attach the data frame so its columns can be referenced directly.
# Always detach afterwards so the columns do not linger on the search path.
attach(lapopBR19)
table(ideology)
detach(lapopBR19)

# Copy column q1 into a new column called gender
lapopBR19$gender <- lapopBR19$q1
# One-way and two-way frequency tables
table(lapopBR19$gender)
table(lapopBR19$ideology, lapopBR19$gender)
# Descriptive statistics from the psych package
psych::describe(lapopBR19$ideology)
# ---- Basic object types ----
# NOTE: the object names `vector` and `list` shadow base R functions. R still
# finds the functions when they are *called*, so the names are kept here, but
# avoid names like these in your own code.

# Scalar (in R, simply a vector of length one)
scalar <- 1
# Vector
vector <- seq_len(22)
# List: elements may have different types
list <- list(seq_len(10), LETTERS[1:10])
# Matrix: the objects must be created before they can be printed or inspected.
matrix.1 <- matrix(1:12, nrow = 4, ncol = 3)
matrix.2 <- matrix(1:6, nrow = 3, ncol = 2)
# matrix.3 is built as the matrix product of the two, which gives it a
# well-defined shape (4 x 2).
matrix.3 <- matrix.1 %*% matrix.2
matrix.1
matrix.2
matrix.3
dim(matrix.1) # 4 x 3
dim(matrix.2) # 3 x 2

# ***** Identifying the structure -- the 'str()' function
# Different data structures: identify the structure of each object.
str(scalar)
str(vector)
str(list)
# Inspect a matrix *object*; `str(matrix)` would describe the base function.
str(matrix.1)

# Size of each object
length(scalar)
length(vector)
dim(matrix.1)
nrow(matrix.3)
ncol(matrix.2)

# Find out the class of a vector or object
class(list)
class(matrix.3)
# ---- Vectors: creation ----
# A vector can be created with 'c()' or with 'as.vector()'
v <- c(8:17)
v
v <- as.vector(8:17)
v
# All elements of a vector must be of the same type; otherwise they are
# automatically coerced to the most flexible type.

# ***** Adding values to a vector
v1 <- 1:3
v1
v1 <- c(v1, 4:5)
v1

# * We can create attributes for the vector
# Assigning names to a pre-existing vector. seq_along() is the safe way to
# index every element (1:length(x) misbehaves when x is empty).
names(v1) <- letters[seq_along(v1)]
v1
names(v1) <- c("Concordo Muito", "Concordo", "Não sei", "Discordo", "Discordo Muito")
v1
# Adding names while creating the vector
v2 <- c(Esquerda = -1, Centro = 0, Direita = +1)
v2
### Subsetting
# * Subsetting with positive integers returns the elements at those positions
v1
v1[2]
# * Subsetting with negative integers omits the elements at those positions
v1[-3]
v1[-c(1:3)]
# * Subsetting with logical values keeps the positions marked TRUE
v1[c(FALSE, TRUE, FALSE, TRUE, FALSE)]
v1[v1 < 3]
v1[v1 < 2 | v1 > 4] # | means OR
# * Subsetting with names
# Returns the elements whose names match. The name must match exactly one of
# the names assigned to v1 above; a name that does not exist returns NA.
v1["Não sei"]
# * Simplifying vs. preserving
# It's also important to understand the difference between simplifying and
# preserving when subsetting.
# Simplifying subsets return the simplest possible data structure that can
# represent the output.
# Preserving subsets keep the structure of the output the same as the input.
# For vectors, single brackets [ ] preserve while double brackets [[ ]]
# simplify. The change you will notice when simplifying a vector is that the
# names are removed.
v1
v1[1]
v1[[1]]
# ---- Binding vectors together ----
v1 <- 1:4
v2 <- 5:8
# Column bind: every vector becomes a column
cbind(v1, v2)
# Row bind: every vector becomes a row
rbind(v1, v2)

# The same works with more than two vectors
v3 <- 9:12
cbind(v1, v2, v3)
rbind(v1, v2, v3)
# ---- Lists ----
# 'list()' builds a list; its elements may differ in type and length
l <- list(1:3, "a", c(TRUE, FALSE, TRUE), c(2.5, 4.2))
str(l)
l

# A list whose three elements are all integer vectors
l <- list(1:3, 4:6, 7:9)
str(l)
l

# ***** Building a list to work with
# The list below is the example used in the rest of this section.
l1 <- list(1:3, letters[1:3], c(TRUE, FALSE, TRUE))
l1
str(l1)

# * Adding names
# As with vectors, names can be given when the list is created or added
# afterwards to an existing list.
l1
l1 <- setNames(l1, c("numbers", "letters", "truefalse"))
l1

# ***** Subsetting
# For more detail see
# http://adv-r.had.co.nz/Subsetting.html#subsetting-operators
# Lists can be subset with single brackets [ ], double brackets [[ ]] and the
# dollar sign $.

# First list item: [ ] returns a list, [[ ]] and $ return the item itself
l1[1]
l1[[1]]
l1["numbers"]
l1[["numbers"]]
l1$numbers

# Several list items at once, by position ...
l1[c(1, 3)]
# ... or by name
l1[c("numbers", "truefalse")]

# * Getting elements out of a list item
# Combine [[ ]] (or $) with [ ] to reach inside one item.
# Third element of the second list item, by position
l1[[2]][3]
# The same, using the item's name
l1[["letters"]][3]
# The same, using the `$` operator
l1$letters[3]
# ---- Matrices ----
# Every element of a matrix must have the same type/mode.

# ***** Creating matrices
# Numeric matrix, filled column by column
m1 <- matrix(1:6, nrow = 2, ncol = 3)
m1
dim(m1) # 2 x 3

# Matrices can also be built with the column-bind cbind() and row-bind rbind()
# functions. Keep in mind that the vectors being bound become the columns (or
# rows) of a single matrix, so they should be of equal length and mode.

# ***** Adding on to a matrix
# Uses the vectors v1, v2 and v3 created in the vector-binding section.
m1 <- cbind(v1, v2)
m1
# Add a new column
cbind(m1, v3)
# Or add a new row
rbind(m1, c(4.1, 8.1))

# Bind columns using 'cbind'
a <- c(1, 2, 3)
b <- c(4, 5, 6)
g <- cbind(a, b)
g
# Bind rows using 'rbind'
w <- rbind(a, b)
w
# ---- Matrices: names and subsetting ----
# Row names are stored as an attribute of the matrix
m2 <- matrix(1:12, nrow = 4, ncol = 3)
rownames(m2) <- paste0("row", 1:4)
m2
# Column names work the same way
colnames(m2) <- paste0("col", 1:3)
m2

# ***** Subsetting
# m2[rows, columns]: different values in the two positions select different
# parts of the matrix. Note that [ simplifies the result to the lowest
# possible dimension.

# A single cell
m2[2, 3]
# Rows 1 and 2, all columns
m2[1:2, ]
# Columns 1 and 3, all rows
m2[, c(1, 3)]
# Rows and columns at the same time
m2[1:2, c(1, 3)]
# Index vectors can be stored first and used afterwards
v <- c(1, 2, 4)
f <- c(1, 3)
m2[v, f]
# Names can be used instead of positions
m2[c("row1", "row3"), ]
# Turn one column into a plain vector
as.vector(m2[, 2])
# ---- Data frames ----
# Under the hood, a data frame is a list of equal-length vectors.
# Each element of the list can be thought of as a column, and the length of
# each element is the number of rows. As a result, data frames can store a
# different class of object in each column (numeric, character, factor, ...).

# Data frames are created by reading in data or by building them manually
df <- data.frame(
  col1 = 1:5,
  col2 = c("M", "F", "F", "M", "F"),
  col3 = c(TRUE, FALSE, TRUE, TRUE, FALSE),
  col4 = c(2.5, 4.2, pi, 3.1, 1.7)
)
df
# Turn the character column into a factor
df$col2 <- as.factor(df$col2)
class(df$col2)
class(df$col4)

# Convert a matrix to a data frame using as.data.frame()
m1 <- matrix(1:12, nrow = 4, ncol = 3)
m1
class(m1)
m2 <- as.data.frame(m1)
class(m2)
m2

# Columns can be added to a data frame with cbind()
df
v4 <- c("A", "B", "C", "D", "E")
cbind(df, v4)
# Note: because one of the arguments is a data frame, cbind() returns a data
# frame. A matrix is returned only when none of the arguments is a data frame.
# A column can also be added by assigning to a new name:
df$col5 <- c("A", "B", "C", "D", "E")
df

# rbind() adds rows. The new row must have the same columns as the data frame.
# Supply it as a one-row data frame so every value keeps its own type: a
# vector built with c(4, "F", TRUE, 1.1, 5) would be coerced to character and
# would turn the columns of the result into character as well.
df
df2 <- rbind(
  df,
  data.frame(
    col1 = 4L, col2 = "F", col3 = TRUE, col4 = 1.1, col5 = "5",
    stringsAsFactors = FALSE
  )
)
df2
# ***** Subsetting
# Selecting parts of a data frame: dataframe[rows, columns]

# Rows by position
df[2:3, ]
# Columns selected the way list items are selected (no comma)
df[c("col2", "col4")]
# Columns selected the way matrix columns are selected (with a comma)
df[, c("col2", "col4")]
# Rows and columns at the same time
df
df[1:2, c(1, 3)]
# A single column selected matrix-style is simplified: the result is the
# column itself rather than a one-column data frame
df[, 2]

# Columns can be reordered by position ...
my_data2 <- df[, c(5, 4, 1, 2, 3)]
my_data2
# ... or by name (only the columns listed are kept)
col_order <- c("col4", "col3", "col2", "col1")
my_data3 <- df[, col_order]
my_data3
# ---- Working with the pooled LAPOP data ----
# Load the saved workspace; the code below expects it to provide `datalapop`.
load("~/Dropbox/PIBIC2021/data/datalapop.RData")

table(datalapop$country) # countries
table(datalapop$lapopwave) # years
table(datalapop$country, datalapop$lapopwave) # crosstab

# Build a country-year identifier by gluing the two columns together.
# paste0() concatenates without a separator (same as paste(..., sep = "")).
datalapop$countrycode2 <- paste0(datalapop$country, datalapop$lapopwave)
table(datalapop$countrycode2)

# Column selection: keep only the listed columns
dataideo <- subset(
  datalapop,
  select = c("country", "lapopwave", "countrycode2", "ideo_lapop")
)
str(dataideo)
# Open the result in the data viewer (interactive sessions only)
View(dataideo)
# ---- Selecting rows by country ----
# Central American countries, using bracket (matrix) notation:
# a logical condition in the row position, all columns kept
datacentralamerica <- datalapop[
  datalapop$country == "Costa Rica" |
    datalapop$country == "El Salvador" |
    datalapop$country == "Guatemala" |
    datalapop$country == "Honduras" |
    datalapop$country == "Nicaragua",
]

# The same selection with subset(): columns are referenced by bare name.
# `==` tests equality and `|` means OR.
datacentral <- subset(
  datalapop,
  country == "Costa Rica" |
    country == "El Salvador" |
    country == "Guatemala" |
    country == "Honduras" |
    country == "Nicaragua"
)

# Everything except those five countries.
# `!=` tests inequality and `&` means AND.
datasul <- subset(
  datalapop,
  country != "Costa Rica" &
    country != "El Salvador" &
    country != "Guatemala" &
    country != "Honduras" &
    country != "Nicaragua"
)
# ---- Selecting rows by year ----
table(datalapop$lapopwave)
# A single survey wave
data2018 <- subset(datalapop, lapopwave == 2018)
table(data2018$lapopwave)

# Waves inside a time span (after 2008 and before 2014)
data2010 <- subset(datalapop, lapopwave > 2008 & lapopwave < 2014)
table(data2010$lapopwave)

# One country in one wave: combine two conditions ...
dataBR10 <- subset(datalapop, country == "Brazil" & lapopwave == "2010")
# ... or filter on a country-year identifier.
# NOTE(review): this uses a column named `countrycode`, while the column
# built earlier in this script is `countrycode2` -- confirm that
# `countrycode` exists in the loaded data.
dataBR2010 <- subset(datalapop, countrycode == "Brazil2010")

# Keep only some of the columns
dataBR10_ideo <- subset(dataBR2010, select = c("countrycode", "ideo_lapop"))
# ---- Missing values when subsetting ----
# 2018 wave, left-wing respondents only (ideo_lapop below 5).
# Bracket notation with `<`: rows where ideo_lapop is NA come back as rows
# filled with NA. The number of rows changes, the number of columns does not.
data2018left_NA <- data2018[data2018$ideo_lapop < 5, ]
# %in% never returns NA, so only rows with a value from 1 to 4 are kept.
data2018left2 <- data2018[data2018$ideo_lapop %in% 1:4, ]
# subset() drops the rows where the condition is NA.
data2018left3 <- subset(data2018, data2018$ideo_lapop < 5)

# Checking the number of NAs (the counts are those recorded by the author)
table(is.na(data2018left_NA$ideo_lapop)) # 5703
table(is.na(data2018left2$ideo_lapop)) # 0
# Difference in row counts between the two bracket-based subsets
nrow(data2018left_NA) - nrow(data2018left2)

# mean() returns NA when the data contain NAs ...
mean(data2018left_NA$ideo_lapop)
mean(data2018left2$ideo_lapop)
# ... unless they are removed explicitly
mean(data2018left_NA$ideo_lapop, na.rm = TRUE)
# ---- Merging data frames ----
# `df` is the data frame built in the data-frame section; `dfZ` shares its
# key column `col1`.
df
dfZ <- data.frame(
  col1 = 1:5,
  gender = c("M", "F", "F", "M", "F"),
  votedPT = c("Yes", "No", "Yes", "No", "Yes"),
  econ_percep = c(.21, .5, .35, .7, .11),
  feelthermoPT = c(25, 42, 0, 74, 90)
)
dfZ

# merge(x, y, by = "col") joins two data frames on a key column.
# Open the help page (interactive sessions only):
?merge
# Only keys present in both data frames
dfmerged <- merge(df, dfZ, by = "col1")
dfmerged
# All keys from both data frames
dfmerged2 <- merge(df, dfZ, by = "col1", all = TRUE)
dfmerged2
# All keys from the first data frame (x)
dfmerged3 <- merge(df, dfZ, by = "col1", all.x = TRUE)
dfmerged3
# All keys from the second data frame (y)
dfmerged4 <- merge(df, dfZ, by = "col1", all.y = TRUE)
dfmerged4

# When rows and columns differ, stack the data frames with dplyr::bind_rows();
# columns missing from one of them are filled with NA.
dplyr::bind_rows(df, dfZ)

# Creating an empty column (every value is NA)
dfmerged$col6 <- NA
dfmerged