2023/01/31 (updated: 2024-08-19)
…
5 + 7
## [1] 12
45/9
## [1] 5
4*(12-7)
## [1] 20
sqrt(25)
## [1] 5
# A script lets you save your work.
# This one is just an example.
5 + 7      # R can add
3 * 8      # it can multiply
4*(12-7)   # it knows the properties of addition and multiplication
sqrt(25)   # it can even find the square root of a number
# In general you will work with scripts.
Pacotes são instalados com install.packages() e carregados com library().
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ── ## ✔ dplyr 1.1.4 ✔ readr 2.1.4 ## ✔ forcats 1.0.0 ✔ stringr 1.5.1 ## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1 ## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0 ## ✔ purrr 1.0.2 ## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── ## ✖ dplyr::filter() masks stats::filter() ## ✖ dplyr::lag() masks stats::lag() ## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
soma <- 5 + 3 print(soma)
## [1] 8
soma <- 5 + 3 soma
## [1] 8
soma <- 5+3 class(soma)
## [1] "numeric"
curso <- "Economia" class(curso)
## [1] "character"
logic <- 5 == 7 class(logic)
## [1] "logical"
x <- 5 y <- 3 x+y
## [1] 8
z <- x+y z
## [1] 8
nome <- "Roberto" sobrenome <- "Ellery" paste(nome,sobrenome)
## [1] "Roberto Ellery"
nome.completo <- paste(nome,sobrenome) nome.completo
## [1] "Roberto Ellery"
c() é usada para inserir vetores no R.world.pop <- c(2525779,3026003,3691173,4449049,5320817,6127700,6916183) world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
pop.first <- c(2525779,3026003,3691173) pop.second <- c(4449049,5320817,6127700,6916183) pop.all <- c(pop.first, pop.second) pop.all
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
world.pop[2]
## [1] 3026003
world.pop[c(1,5)]
## [1] 2525779 5320817
world.pop[-3]
## [1] 2525779 3026003 4449049 5320817 6127700 6916183
world.pop[1:3]
## [1] 2525779 3026003 3691173
pop.million <- world.pop/1000 pop.million
## [1] 2525.779 3026.003 3691.173 4449.049 5320.817 6127.700 6916.183
pop.rate <- world.pop/world.pop[1] pop.rate
## [1] 1.000000 1.198047 1.461400 1.761456 2.106604 2.426063 2.738238
world.pop
## [1] 2525779 3026003 3691173 4449049 5320817 6127700 6916183
pop.final <- world.pop[-1] pop.final
## [1] 3026003 3691173 4449049 5320817 6127700 6916183
pop.inicial <- world.pop[-7] pop.inicial
## [1] 2525779 3026003 3691173 4449049 5320817 6127700
pop.increase <- pop.final - pop.inicial percent.increase <- (pop.increase/pop.inicial) * 100 percent.increase
## [1] 19.80474 21.98180 20.53212 19.59448 15.16464 12.86752
percent.increase
## [1] 19.80474 21.98180 20.53212 19.59448 15.16464 12.86752
percent.increase[c(1,2)] <- c(20,22) percent.increase
## [1] 20.00000 22.00000 20.53212 19.59448 15.16464 12.86752
length(world.pop)
## [1] 7
min(world.pop)
## [1] 2525779
mean(world.pop)
## [1] 4579529
sum(world.pop)/length(world.pop)
## [1] 4579529
year <- seq(from = 1950, to = 2010, by = 10) year
## [1] 1950 1960 1970 1980 1990 2000 2010
seq(to = 2010, from = 1950, by = 10)
## [1] 1950 1960 1970 1980 1990 2000 2010
seq(from = 2010, to = 1950, by = -10)
## [1] 2010 2000 1990 1980 1970 1960 1950
2005:2010
## [1] 2005 2006 2007 2008 2009 2010
2010:2005
## [1] 2010 2009 2008 2007 2006 2005
names(world.pop)
## NULL
names(world.pop) <- year names(world.pop)
## [1] "1950" "1960" "1970" "1980" "1990" "2000" "2010"
world.pop
## 1950 1960 1970 1980 1990 2000 2010 ## 2525779 3026003 3691173 4449049 5320817 6127700 6916183
# function() creates new functions. Example of a definition using function():

#' Summarise a vector by its sum, its length and its mean.
#'
#' @param x A numeric vector (the single input of the function).
#' @return A named numeric vector with elements "sum", "length" and "mean",
#'   where "mean" is computed as sum / length.
my.summary <- function(x) {
  s.out <- sum(x)
  i.out <- length(x)
  m.out <- s.out / i.out
  # Build the output and attach its labels in one step; being the last
  # expression, it is the value the function returns.
  c(sum = s.out, length = i.out, mean = m.out)
}
z <- 1:10 my.summary(z)
## sum length mean ## 55.0 10.0 5.5
my.summary(world.pop)
## sum length mean ## 32056704 7 4579529
nomes <- c("Paulo", "José", "Maria", "Ana", "Pedro")
altura <- c(1.80, 1.75, 1.70, 1.60, 1.90)
genero <- factor(c("masc", "masc", "fem", "fem", "masc"))
my.df <- data.frame(nomes, altura, genero)
str(my.df)
## 'data.frame': 5 obs. of 3 variables: ## $ nomes : chr "Paulo" "José" "Maria" "Ana" ... ## $ altura: num 1.8 1.75 1.7 1.6 1.9 ## $ genero: Factor w/ 2 levels "fem","masc": 2 2 1 1 2
summary(my.df)
## nomes altura genero ## Length:5 Min. :1.60 fem :2 ## Class :character 1st Qu.:1.70 masc:3 ## Mode :character Median :1.75 ## Mean :1.75 ## 3rd Qu.:1.80 ## Max. :1.90
my.df$nomes
## [1] "Paulo" "José" "Maria" "Ana" "Pedro"
my.df$altura
## [1] 1.80 1.75 1.70 1.60 1.90
my.df$genero
## [1] masc masc fem fem masc ## Levels: fem masc
mean(my.df$altura)
## [1] 1.75
read.table(), mas funções como read.csv() e read.csv2() podem ser úteis;…
dados <- read.table("map1.txt", header=TRUE, sep=",")
str(dados)
## 'data.frame': 5 obs. of 3 variables: ## $ VarA: num 1.5 3.2 5.7 7.6 9.2 ## $ VarB: num 2.3 4.1 6.7 8.2 10.1 ## $ VarC: int 3 7 11 15 19
dados <- read.table("map1.txt", sep=",")
str(dados)
## 'data.frame': 6 obs. of 3 variables: ## $ V1: chr "VarA" "1.5" "3.2" "5.7" ... ## $ V2: chr " VarB" " 2.3" " 4.1" " 6.7" ... ## $ V3: chr " VarC" " 3" " 7" " 11" ...
dados <- read.table("map1.txt", sep=";")
str(dados)
## 'data.frame': 6 obs. of 1 variable: ## $ V1: chr "VarA, VarB, VarC" "1.5, 2.3, 3" "3.2, 4.1, 7" "5.7, 6.7, 11" ...
dados <- read.table("map1.txt", header=TRUE,
sep = ",", dec=",")
str(dados)
## 'data.frame': 5 obs. of 3 variables: ## $ VarA: chr "1.5" "3.2" "5.7" "7.6" ... ## $ VarB: chr " 2.3" " 4.1" " 6.7" " 8.2" ... ## $ VarC: int 3 7 11 15 19
…
dados <- read.table("map2.txt", header=TRUE, sep=";",
dec=",", skip=4)
str(dados)
## 'data.frame': 7 obs. of 3 variables: ## $ Nome : chr "Paulo" "Francisco" "Maria" "Natalia" ... ## $ Altura: num 1.82 1.7 1.75 1.6 1.6 1.75 1.78 ## $ Genero: chr " Masculino" " Masculino" " Feminino" " Feminino" ...
str(dados)
## 'data.frame': 7 obs. of 3 variables: ## $ Nome : chr "Paulo" "Francisco" "Maria" "Natalia" ... ## $ Altura: num 1.82 1.7 1.75 1.6 1.6 1.75 1.78 ## $ Genero: chr " Masculino" " Masculino" " Feminino" " Feminino" ...
summary(dados)
## Nome Altura Genero ## Length:7 Min. :1.600 Length:7 ## Class :character 1st Qu.:1.650 Class :character ## Mode :character Median :1.750 Mode :character ## Mean :1.714 ## 3rd Qu.:1.765 ## Max. :1.820
dados$Genero <- as.factor(dados$Genero) summary(dados)
## Nome Altura Genero ## Length:7 Min. :1.600 Feminino :3 ## Class :character 1st Qu.:1.650 Masculino:4 ## Mode :character Median :1.750 ## Mean :1.714 ## 3rd Qu.:1.765 ## Max. :1.820
…
url <- "https://people.sc.fsu.edu/~jburkardt/data/csv/deniro.csv" deniro <- read.table(url, header = TRUE, sep=",")
head(deniro, n=10)
## Year Score Title ## 1 1968 86 Greetings ## 2 1970 17 Bloody Mama ## 3 1970 73 Hi, Mom! ## 4 1971 40 Born to Win ## 5 1973 98 Mean Streets ## 6 1973 88 Bang the Drum Slowly ## 7 1974 97 The Godfather, Part II ## 8 1976 41 The Last Tycoon ## 9 1976 99 Taxi Driver ## 10 1977 47 1900
str(deniro)
## 'data.frame': 87 obs. of 3 variables: ## $ Year : int 1968 1970 1970 1971 1973 1973 1974 1976 1976 1977 ... ## $ Score: int 86 17 73 40 98 88 97 41 99 47 ... ## $ Title: chr " Greetings" " Bloody Mama" " Hi, Mom!" " Born to Win" ...
summary(deniro)
## Year Score Title ## Min. :1968 Min. : 4.0 Length:87 ## 1st Qu.:1988 1st Qu.: 38.0 Class :character ## Median :1997 Median : 65.0 Mode :character ## Mean :1996 Mean : 58.2 ## 3rd Qu.:2007 3rd Qu.: 80.0 ## Max. :2016 Max. :100.0
load("PWT100a.Rda")
select() e para filtrar a função filter().select() e filter() “aninhadas” (nested), ou seja, uma como argumento da outra.filter(select(pwt, countrycode, rgdpo, year), countrycode == "BRA")
## # A tibble: 70 × 3 ## countrycode rgdpo year ## <fct> <dbl> <dbl> ## 1 BRA 88443. 1950 ## 2 BRA 91100. 1951 ## 3 BRA 100770. 1952 ## 4 BRA 104141. 1953 ## 5 BRA 113028. 1954 ## 6 BRA 121660. 1955 ## 7 BRA 126157. 1956 ## 8 BRA 137198. 1957 ## 9 BRA 145166. 1958 ## 10 BRA 151863. 1959 ## # ℹ 60 more rows
%>%pwt %>% select(countrycode, year, rgdpo) %>% filter(countrycode == "BRA") %>% head()
## # A tibble: 6 × 3 ## countrycode year rgdpo ## <fct> <dbl> <dbl> ## 1 BRA 1950 88443. ## 2 BRA 1951 91100. ## 3 BRA 1952 100770. ## 4 BRA 1953 104141. ## 5 BRA 1954 113028. ## 6 BRA 1955 121660.
pib_BR <- pwt %>% select(code = countrycode, ano = year, pib = rgdpo) %>% filter(code == "BRA")
pwt %>%
select(code = countrycode, pais = country, ano = year,
starts_with("rgdp")) %>%
filter(code %in% c("ARG", "BRA"), ano == 2019)
## # A tibble: 2 × 6 ## code pais ano rgdpe rgdpo rgdpna ## <fct> <fct> <dbl> <dbl> <dbl> <dbl> ## 1 ARG Argentina 2019 991646. 977421. 975569 ## 2 BRA Brazil 2019 3089274. 3080048. 3042119
mutate() faz esse trabalho.pwt %>% select(code = countrycode, ano = year, pib = rgdpna, pop) %>% filter(code == "BRA", ano >= 2011) %>% mutate(pib.pc = pib/pop) %>% select(-code, -pib, - pop) %>% tail(n=5)
## # A tibble: 5 × 2 ## ano pib.pc ## <dbl> <dbl> ## 1 2015 14815. ## 2 2016 14212. ## 3 2017 14285. ## 4 2018 14360. ## 5 2019 14414.
group_by() e summarize() permitem agrupar os dados de acordo com variáveis de interesse e obter estatísticas relativas aos grupos.
paises <- c("BRA", "ARG", "COL", "PER", "BOL", "MEX")
pwt %>%
select(code = countrycode, pais = country, ano = year,
pib = rgdpna, pop) %>%
filter(code %in% paises, ano >= 2015) %>%
mutate(pib.pc = pib/pop) %>%
group_by(code) %>%
summarise(m_pib.pc = mean(pib.pc)) %>%
ungroup()
## # A tibble: 6 × 2 ## code m_pib.pc ## <fct> <dbl> ## 1 ARG 22808. ## 2 BOL 8152. ## 3 BRA 14417. ## 4 COL 13521. ## 5 MEX 18830. ## 6 PER 11979.
df.exemplo <- pwt %>%
select(code = countrycode, pais = country, ano = year,
pib = rgdpna, pop) %>%
filter(code %in% paises, ano >= 2015) %>%
mutate(pib.pc = pib/pop) %>%
group_by(code, pais) %>%
summarise(media = mean(pib.pc),
desv.pad = sd(pib.pc),
max = max(pib.pc),
min = min(pib.pc),
mediana = median(pib.pc)) %>%
ungroup()
## `summarise()` has grouped output by 'code'. You can override using the ## `.groups` argument.
# Vector of country codes whose `pop` value in 1960 is greater than 5.
# NOTE(review): `pop` is presumably measured in millions (so this keeps
# countries with more than 5 million people) — confirm against the PWT docs.
pop5_1960 <- pwt %>%
select(code = countrycode, pop, year) %>%
filter(year == 1960, pop > 5) %>%
# pull() extracts the `code` column as a plain vector instead of a tibble
pull(code)
# Per-capita and per-worker output (levels and logs) for the countries listed
# in `pop5_1960`, restricted to the years 1960, 1990 and 2019.
df_pib.pc <- pwt %>%
select(code = countrycode, pais = country, ano = year,
pib = rgdpo, pop, emp) %>%
filter(code %in% pop5_1960) %>%
filter(ano %in% c(1960, 1990, 2019)) %>%
# pib is divided by 1000 * pop (and 1000 * emp), i.e. the ratios are expressed
# in thousands of the original unit.
# NOTE(review): the underlying units of rgdpo / pop / emp are not shown here.
mutate(pib.pc = pib/(1000*pop),
l_pib.pc = log(pib.pc),
pib.emp = pib/(1000*emp),
l_pib.emp = log(pib.emp))
df_pib.pc %>% filter(ano == 1960) %>% select(pais, pib.pc) %>% slice_max(order_by = pib.pc, n = 5)
## # A tibble: 5 × 2 ## pais pib.pc ## <fct> <dbl> ## 1 Switzerland 23.2 ## 2 United States 19.1 ## 3 Australia 15.8 ## 4 Canada 15.4 ## 5 Sweden 13.5
df_pib.pc %>% filter(ano == 2019) %>% select(pais, pib.pc) %>% slice_max(order_by = pib.pc, n = 5)
## # A tibble: 5 × 2 ## pais pib.pc ## <fct> <dbl> ## 1 Switzerland 75.3 ## 2 United States 62.6 ## 3 Netherlands 55.6 ## 4 Australia 54.1 ## 5 Austria 53.3
df_pib.pc %>% filter(ano == 2019) %>% select(pais, pib.pc) %>% slice_max(order_by = pib.pc, prop = 0.1)
## # A tibble: 5 × 2 ## pais pib.pc ## <fct> <dbl> ## 1 Switzerland 75.3 ## 2 United States 62.6 ## 3 Netherlands 55.6 ## 4 Australia 54.1 ## 5 Austria 53.3
df_pib.pc %>% filter(ano == 2019) %>% select(pais, pib.pc) %>% slice_min(order_by = pib.pc, prop = 0.1)
## # A tibble: 5 × 2 ## pais pib.pc ## <fct> <dbl> ## 1 Venezuela (Bolivarian Republic of) 0.251 ## 2 D.R. of the Congo 1.02 ## 3 Mozambique 1.23 ## 4 Madagascar 1.54 ## 5 Uganda 2.09
pwt %>% select(code = countrycode, ano = year, tx_i = csh_i) %>% filter(code %in% pop5_1960, ano >= 1980) %>% group_by(ano) %>% summarise(m_tx_i = mean(tx_i)) %>% ungroup() %>% slice_max(order_by = m_tx_i, n=5)
## # A tibble: 5 × 2 ## ano m_tx_i ## <dbl> <dbl> ## 1 2008 0.255 ## 2 2011 0.250 ## 3 1980 0.250 ## 4 2012 0.249 ## 5 2007 0.246
relig_income %>%
pivot_longer(-religion, names_to = "income",
values_to = "frequency") %>%
head(n=5)
## # A tibble: 5 × 3 ## religion income frequency ## <chr> <chr> <dbl> ## 1 Agnostic <$10k 27 ## 2 Agnostic $10-20k 34 ## 3 Agnostic $20-30k 60 ## 4 Agnostic $30-40k 81 ## 5 Agnostic $40-50k 76
billboard2 <- billboard %>%
pivot_longer(wk1:wk76,
names_to = "week",
values_to = "rank",
values_drop_na = TRUE)
billboard3 <- billboard2 %>%
mutate(week = as.integer(gsub("wk","",week)),
date = as.Date(date.entered) + 7 * (week-1),
date.entered = NULL)
read_excel() do pacote readxllibrary(readxl)
dados <- read_excel("WEO_data.xlsx", na = c("","n/a", "--"))
dados1 <- dados %>%
select(-Units, -Scale,
-`Country/Series-specific Notes`, -`Estimates Start After`)
dados2 <- dados1 %>%
filter(`Subject Descriptor` %in%
c("Gross domestic product, constant prices",
"General government gross debt"))
dados3 <- dados2 %>%
pivot_longer(names_to = "ano", values_to = "valores",
`2011`:`2021`)
pivot_wider() faz esse trabalhodados4 <- dados3 %>%
pivot_wider(names_from = `Subject Descriptor`,
values_from = "valores") %>%
rename(pais = Country,
cresc = `Gross domestic product, constant prices`,
div_pib = `General government gross debt`) %>%
filter(ano != 2021)
dados5 <- dados4 %>%
group_by(pais) %>%
summarise(cresc = mean(cresc, na.rm = TRUE),
div_pib = mean(div_pib, na.rm = TRUE)) %>%
ungroup()
cor.test(dados5$cresc, dados5$div_pib)
## ## Pearson's product-moment correlation ## ## data: dados5$cresc and dados5$div_pib ## t = -3.9696, df = 190, p-value = 0.000102 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## -0.4025674 -0.1406379 ## sample estimates: ## cor ## -0.2767349
dados.final <- dados %>%
select(-Units, -Scale, -`Country/Series-specific Notes`, -`Estimates Start After`) %>%
filter(`Subject Descriptor` %in% c("Gross domestic product, constant prices",
"General government gross debt")) %>%
pivot_longer(names_to = "ano", values_to = "valores", `2011`:`2021`) %>%
pivot_wider(names_from = `Subject Descriptor`, values_from = "valores") %>%
rename(pais = Country, cresc = `Gross domestic product, constant prices`,
div_pib = `General government gross debt`) %>%
filter(ano != 2021) %>%
group_by(pais) %>%
summarise(cresc = mean(cresc, na.rm = TRUE),
div_pib = mean(div_pib, na.rm = TRUE)) %>%
ungroup()
cor.test(dados.final$cresc, dados.final$div_pib)
## ## Pearson's product-moment correlation ## ## data: dados.final$cresc and dados.final$div_pib ## t = -3.9696, df = 190, p-value = 0.000102 ## alternative hypothesis: true correlation is not equal to 0 ## 95 percent confidence interval: ## -0.4025674 -0.1406379 ## sample estimates: ## cor ## -0.2767349
left_join(): inclui todas as linhas do \(df_1\)
right_join(): inclui todas as linhas do \(df_2\)
full_join(): inclui todas as linhas em \(df_1\) ou \(df_2\)
inner_join(): inclui apenas as linhas comuns em \(df_1\) e \(df_2\)
head(df1)
## time pontos ## 1 Atlético Mineiro 84 ## 2 Flamengo 71 ## 3 Palmeiras 66 ## 4 Fortaleza 58 ## 5 Corinthians 57
head(df2)
## time pontos ## 1 Palmeiras 81 ## 2 Internacional 73 ## 3 Fluminense 70 ## 4 Corinthians 65 ## 5 Flamengo 62
left_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
## time pontos.21 pontos.22 ## 1 Atlético Mineiro 84 NA ## 2 Flamengo 71 62 ## 3 Palmeiras 66 81 ## 4 Fortaleza 58 NA ## 5 Corinthians 57 65
right_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
## time pontos.21 pontos.22 ## 1 Flamengo 71 62 ## 2 Palmeiras 66 81 ## 3 Corinthians 57 65 ## 4 Internacional NA 73 ## 5 Fluminense NA 70
inner_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
## time pontos.21 pontos.22 ## 1 Flamengo 71 62 ## 2 Palmeiras 66 81 ## 3 Corinthians 57 65
full_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
## time pontos.21 pontos.22 ## 1 Atlético Mineiro 84 NA ## 2 Flamengo 71 62 ## 3 Palmeiras 66 81 ## 4 Fortaleza 58 NA ## 5 Corinthians 57 65 ## 6 Internacional NA 73 ## 7 Fluminense NA 70
# One row per country (restricted to `pop5_1960`) with the log of 1960
# per-capita output and the average annual growth rate between 1960 and 2019.
df_cresc <- pwt %>%
select(code = countrycode, pais = country, ano = year, pib = rgdpna,
pop, grupo = group) %>%
mutate(pib.pc = pib/pop) %>%
filter(ano %in% c(1960, 2019), code %in% pop5_1960) %>%
# pib and pop must be dropped before widening; otherwise their year-specific
# values would keep the 1960 and 2019 rows of a country from collapsing into one
select(-pib, -pop) %>%
# one column per year: `1960` and `2019` hold pib.pc for that year
pivot_wider(names_from = ano, values_from = pib.pc) %>%
# compound annual growth rate in percent over the 59 years from 1960 to 2019
mutate(cresc = 100*((`2019`/`1960`)^(1/59) - 1),
l_pib.pc60 = log(`1960`)) %>%
select(code, pais, grupo, l_pib.pc60, cresc)
save(df_cresc, file="df_cresc.Rda")
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc)) + geom_point()
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc)) + geom_point() + geom_text(aes(label = code))
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc)) + geom_text(aes(label = code))
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc)) +
geom_text(aes(label = code)) +
labs(title = "Taxa de crescimento e PIB per capita inicial",
subtitle = "1960 a 2019",
x = "log do PIB per capita em 1960",
y = "taxa de crescimento",
caption = "Fonte: PWT 10.0")
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc, color = grupo)) +
geom_text(aes(label = code)) +
labs(title = "Taxa de crescimento e PIB per capita inicial",
subtitle = "1960 a 2019",
x = "log do PIB per capita em 1960",
y = "taxa de crescimento",
caption = "Fonte: PWT 10.0")
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc, color = grupo)) +
geom_text(aes(label = code)) +
labs(title = "Taxa de crescimento e PIB per capita inicial",
subtitle = "1960 a 2019",
x = "log do PIB per capita em 1960",
y = "taxa de crescimento",
caption = "Fonte: PWT 10.0") +
scale_color_brewer(name = "", palette = "Set1", labels = c("Países Avançados",
"Emergentes da Ásia",
"Emergentes da Europa",
"América Latina e Caribe",
"Oriente Médio",
"África Subsaariana"))
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc, color = grupo)) +
geom_text(aes(label = code)) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Taxa de crescimento e PIB per capita inicial",
subtitle = "1960 a 2019",
x = "log do PIB per capita em 1960",
y = "taxa de crescimento",
caption = "Fonte: PWT 10.0") +
scale_color_brewer(name = "", palette = "Set1", labels = c("Países Avançados",
"Emergentes da Ásia",
"Emergentes da Europa",
"América Latina e Caribe",
"Oriente Médio",
"África Subsaariana"))
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc)) +
geom_text(aes(label = code, color = grupo)) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Taxa de crescimento e PIB per capita inicial",
subtitle = "1960 a 2019",
x = "log do PIB per capita em 1960",
y = "taxa de crescimento",
caption = "Fonte: PWT 10.0") +
scale_color_brewer(name = "", palette = "Set1", labels = c("Países Avançados",
"Emergentes da Ásia",
"Emergentes da Europa",
"América Latina e Caribe",
"Oriente Médio",
"África Subsaariana"))
library(ggthemes)
ggplot(df_cresc, aes(x=l_pib.pc60, y=cresc)) +
geom_text(aes(label = code, color = grupo)) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Taxa de crescimento e PIB per capita inicial",
subtitle = "1960 a 2019",
x = "log do PIB per capita em 1960",
y = "taxa de crescimento",
caption = "Fonte: PWT 10.0") +
scale_color_brewer(name = "", palette = "Set1", labels = c("Países Avançados",
"Emergentes da Ásia",
"Emergentes da Europa",
"América Latina e Caribe",
"Oriente Médio",
"África Subsaariana")) +
theme_hc()
theme_stata()theme_economist()theme_fivethirtyeight()theme_excel()df_cresc %>% group_by(grupo) %>% summarise(m_cresc = mean(cresc)) %>% ggplot(aes(grupo, m_cresc)) + geom_col()
df_cresc %>% group_by(grupo) %>% summarise(m_cresc = mean(cresc)) %>% ggplot(aes(reorder(grupo, m_cresc), m_cresc, fill = grupo)) + geom_col() + scale_x_discrete(labels = NULL)
df_cresc %>% group_by(grupo) %>% summarise(m_cresc = mean(cresc)) %>% ggplot(aes(reorder(grupo, m_cresc), m_cresc, fill = grupo)) + geom_col() + scale_x_discrete(labels = NULL) + coord_flip()
df_cresc %>% group_by(grupo) %>% summarise(m_cresc = mean(cresc)) %>% ggplot(aes(reorder(grupo, m_cresc), m_cresc, fill = grupo)) + geom_col() + geom_text(aes(label = m_cresc)) + scale_x_discrete(labels = NULL)
library(scales)
df_cresc %>%
group_by(grupo) %>%
summarise(m_cresc = mean(cresc)) %>%
ggplot(aes(reorder(grupo, m_cresc), m_cresc, fill = grupo)) +
geom_col() +
geom_text(aes(label = percent(m_cresc, scale = 1)), vjust = 1.5,
color = "darkblue", size=6) +
scale_x_discrete(labels = NULL)
df_cresc %>%
group_by(grupo) %>%
summarise(m_cresc = mean(cresc)) %>%
ggplot(aes(reorder(grupo, m_cresc), m_cresc, fill = grupo)) +
geom_col() +
geom_text(aes(label = percent(m_cresc, scale = 1)), vjust = 1.5,
color = "darkblue", size=6) +
scale_x_discrete(labels = NULL) +
labs(title = "Média da taxa de crescimento dos países de cada grupo",
subtitle = "1960 a 2019",
x = NULL,
y = "taxa de crescimento, %",
caption = "Fonte: PWT 10.0")
df_cresc %>%
group_by(grupo) %>%
summarise(m_cresc = mean(cresc)) %>%
ggplot(aes(reorder(grupo, m_cresc), m_cresc, fill = grupo)) +
geom_col() +
geom_text(aes(label = percent(m_cresc, scale = 1)), vjust = 1.5,
color = "darkblue", size=6) +
scale_x_discrete(labels = NULL) +
labs(title = "Média da taxa de crescimento dos países de cada grupo",
subtitle = "1960 a 2019",
x = NULL,
y = "taxa de crescimento, %",
caption = "Fonte: PWT 10.0") +
scale_fill_brewer(name = "", palette = "Set1")
theme_hc().df_cresc %>%
group_by(grupo) %>% summarise(m_cresc = mean(cresc)) %>%
ggplot(aes(reorder(grupo, m_cresc), m_cresc, fill = grupo)) +
geom_col() +
geom_text(aes(label = percent(m_cresc, scale = 1)), vjust = 1.5,
color = "darkblue", size=6) +
scale_x_discrete(labels = NULL) +
labs(title = "Média da taxa de crescimento dos países de cada grupo",
subtitle = "1960 a 2019", x = NULL,
y = "taxa de crescimento, %",
caption = "Fonte: PWT 10.0") +
scale_fill_brewer(name = "", palette = "Set1", labels = c("Países Avançados",
"Emergentes da Ásia",
"Emergentes da Europa",
"América Latina e Caribe",
"Oriente Médio",
"África Subsaariana")) +
theme_hc()
# Line chart of Brazil's GDP per capita (rgdpna / pop) for every year
# available in `pwt`.
pwt %>%
  select(countrycode, year, rgdpna, pop) %>%
  filter(countrycode == "BRA") %>%
  mutate(pib.pc = rgdpna / pop) %>%
  ggplot(aes(year, pib.pc)) +
  geom_line(linewidth = 1.5, color = "darkblue") +
  labs(title = "PIB per capita - Brasil",
       # No year filter is applied, so the series starts at the first year
       # in the data (1950), not 1960.
       subtitle = "1950 a 2019",
       x = NULL,
       y = "PIB per capita",
       caption = "Fonte: PWT 10.0") +
  theme_hc()
# Line chart comparing GDP per capita (rgdpna / pop) of Argentina, Brazil and
# Mexico, one coloured line per country code, for every year available in `pwt`.
pwt %>%
  select(countrycode, year, rgdpna, pop) %>%
  filter(countrycode %in% c("ARG", "BRA", "MEX")) %>%
  mutate(pib.pc = rgdpna / pop) %>%
  ggplot(aes(year, pib.pc, color = countrycode)) +
  geom_line(linewidth = 1.5) +
  # The title names all three plotted countries rather than Brazil only.
  labs(title = "PIB per capita - Argentina, Brasil e México",
       # No year filter is applied, so the series starts in 1950, not 1960.
       subtitle = "1950 a 2019",
       x = NULL,
       y = "PIB per capita",
       caption = "Fonte: PWT 10.0") +
  theme_hc()
pwt %>%
select(countrycode, year, rgdpna, pop) %>%
filter(countrycode %in% c("ARG","BRA", "MEX", "USA")) %>%
mutate(pib.pc = rgdpna/pop) %>% select(-rgdpna, -pop) %>%
pivot_wider(names_from = countrycode, values_from = pib.pc) %>%
mutate(Brasil = 100*BRA/USA,
Argentina = 100*ARG/USA,
Mexico = 100*MEX/USA) %>%
select(-USA, - BRA, -MEX, - ARG) %>%
pivot_longer(names_to = "pais", values_to = "pib_us", -year) %>%
ggplot(aes(year, pib_us, shape = pais)) +
geom_line(linewidth = 1) + geom_point(size=3) +
labs(title = "PIB per capita como proporção do PIB per capita dos Estados Unidos",
subtitle = "1950 a 2019", x = NULL,
y = "% do PIB per capita dos EUA",
caption = "Fonte: PWT 10.0") +
scale_shape_discrete(name = "") +
theme_classic()
apply() que aplica uma determinada função a elementos de um vetor.
apply(), porém com uma estrutura compatível com o tidyverse.
Principais funções do purrr:
# map(.x, .f): the main function of the package, returns a list;
# map_df(.x, .f): returns a data frame;
# map_dbl(.x, .f): returns a numeric (double) vector;
# map_chr(.x, .f): returns a character vector;
# map_lgl(.x, .f): returns a logical vector.
# The map() functions are compatible with pipes, %>%.

#' Double a value.
#'
#' @param .x A numeric vector (any length).
#' @return `.x` multiplied by 2, element-wise.
dobro <- function(.x) {
  .x * 2
}
library(purrr) map(c(1,2,3), dobro)
## [[1]] ## [1] 2 ## ## [[2]] ## [1] 4 ## ## [[3]] ## [1] 6
map_dbl(c(1,2,3), dobro)
## [1] 2 4 6
map_chr(c(1,2,3), dobro)
## Warning: Automatic coercion from double to character was deprecated in purrr 1.0.0. ## ℹ Please use an explicit call to `as.character()` within `map_chr()` instead. ## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was ## generated.
## [1] "2.000000" "4.000000" "6.000000"
map(list(1,2,3), dobro)
## [[1]] ## [1] 2 ## ## [[2]] ## [1] 4 ## ## [[3]] ## [1] 6
map_dbl(list(1,2,3), dobro)
## [1] 2 4 6
map(data.frame(a=1,b=2,c=3), dobro)
## $a ## [1] 2 ## ## $b ## [1] 4 ## ## $c ## [1] 6
map_df(data.frame(a=1,b=2,c=3), dobro)
## # A tibble: 1 × 3 ## a b c ## <dbl> <dbl> <dbl> ## 1 2 4 6
map_df(c(1,2,3), function(.x){
return(data.frame(numero = .x,
dobro = dobro(.x)))
})
## numero dobro ## 1 1 2 ## 2 2 4 ## 3 3 6
map_df() é chamada de função anônima, repare que ela não tem nome, esse tipo de função é de uso temporário nas iterações.map_dbl(c(1,2,3), ~{2*.x})
## [1] 2 4 6
library(gapminder) str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame) ## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ... ## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ... ## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ... ## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ... ## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ... ## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
class() em cada coluna:map_chr(gapminder, class)
## country continent year lifeExp pop gdpPercap ## "factor" "factor" "integer" "numeric" "integer" "numeric"
gapminder %>% map_chr(class)
## country continent year lifeExp pop gdpPercap ## "factor" "factor" "integer" "numeric" "integer" "numeric"
gapminder %>% map_dbl(n_distinct)
## country continent year lifeExp pop gdpPercap ## 142 5 12 1626 1704 1704
gapminder %>% map_df(~(data.frame(n_distinct = n_distinct(.x),
class = class(.x))))
## n_distinct class ## 1 142 factor ## 2 5 factor ## 3 12 integer ## 4 1626 numeric ## 5 1704 integer ## 6 1704 numeric
map_df() aplica a função designada em cada coluna do data.frame, podemos usar o argumento .id para identificar a coluna utilizada em cada iteração.gapminder %>% map_df(~(data.frame(n_distinct = n_distinct(.x),
class = class(.x))),
.id = "variable")
## variable n_distinct class ## 1 country 142 factor ## 2 continent 5 factor ## 3 year 12 integer ## 4 lifeExp 1626 numeric ## 5 pop 1704 integer ## 6 gdpPercap 1704 numeric
map2() permite fazer iterações com dois objetos.map2(c(1,3,5),c(2,4,6),~{.x+.y})
## [[1]] ## [1] 3 ## ## [[2]] ## [1] 7 ## ## [[3]] ## [1] 11
map2_dbl(c(1,3,5),c(2,4,6),~{.x+.y})
## [1] 3 7 11
map2() facilita muito nossa vida.continent_year <- gapminder %>% distinct(continent, year)
continents <- continent_year %>% pull(continent) %>% as.character years <- continent_year %>% pull(year)
# Builds one scatter plot per (continent, year) pair: `continents` and `years`
# are walked in parallel by map2(), and the result is a list of ggplot objects
# in the same order as the inputs.
plot_list <- map2(.x = continents,
.y = years,
.f = ~{
# inside the formula, .x is the current continent and .y the current year
gapminder %>%
filter(continent == .x,
year == .y) %>%
ggplot() +
geom_point(aes(x = gdpPercap, y = lifeExp)) +
# the title identifies the panel, e.g. "<continent> <year>"
ggtitle(paste(.x, .y))
})
plot_list[[15]]
plot_list[[45]]
map() cria um grande número de possibilidades para realização de operações com dados no R.
nest(), do pacote tidyr, que é parte do tidyverse, cria coluna de listas em uma tibble.
gapminder_nested <- gapminder %>% group_by(continent) %>% nest()
gapminder_nested$data[[5]]
pluck(), do purrr, faz o mesmo serviço.
gapminder_nested %>% pluck("data", 5)
## # A tibble: 24 × 5 ## country year lifeExp pop gdpPercap ## <fct> <int> <dbl> <int> <dbl> ## 1 Australia 1952 69.1 8691212 10040. ## 2 Australia 1957 70.3 9712569 10950. ## 3 Australia 1962 70.9 10794968 12217. ## 4 Australia 1967 71.1 11872264 14526. ## 5 Australia 1972 71.9 13177000 16789. ## 6 Australia 1977 73.5 14074100 18334. ## 7 Australia 1982 74.7 15184200 19477. ## 8 Australia 1987 76.3 16257249 21889. ## 9 Australia 1992 77.6 17481977 23425. ## 10 Australia 1997 78.8 18565243 26998. ## # ℹ 14 more rows
mutate() com os dados que estão na lista.mutate() trabalha com colunas de um data.frame que são do tipo vetor, não pode ser aplicada diretamente em listas.mutate() faz o serviço:tibble(vec_col = 1:10) %>% mutate(vec_sum = sum(vec_col))
## # A tibble: 10 × 2 ## vec_col vec_sum ## <int> <int> ## 1 1 55 ## 2 2 55 ## 3 3 55 ## 4 4 55 ## 5 5 55 ## 6 6 55 ## 7 7 55 ## 8 8 55 ## 9 9 55 ## 10 10 55
tibble(list_col = list(c(1, 5, 7),
5,
c(10, 10, 11))) %>%
mutate(list_sum = sum(list_col))
Error in sum(x) : invalid 'type' (list) of argumentmap():tibble(list_col = list(c(1, 5, 7),
5,
c(10, 10, 11))) %>%
mutate(list_sum = map(list_col, sum))
## # A tibble: 3 × 2 ## list_col list_sum ## <list> <list> ## 1 <dbl [3]> <dbl [1]> ## 2 <dbl [1]> <dbl [1]> ## 3 <dbl [3]> <dbl [1]>
map_dbl():tibble(list_col = list(c(1, 5, 7),
5,
c(10, 10, 11))) %>%
mutate(list_sum = map_dbl(list_col, sum))
## # A tibble: 3 × 2 ## list_col list_sum ## <list> <dbl> ## 1 <dbl [3]> 13 ## 2 <dbl [1]> 5 ## 3 <dbl [3]> 31
mutate() e uma função do tipo map() para obter a média da expectativa de vida em cada continente:gapminder_nested %>%
mutate(avg_lifeExp = map_dbl(data, ~{mean(.x$lifeExp)}))
## # A tibble: 5 × 3 ## # Groups: continent [5] ## continent data avg_lifeExp ## <fct> <list> <dbl> ## 1 Asia <tibble [396 × 5]> 60.1 ## 2 Europe <tibble [360 × 5]> 71.9 ## 3 Africa <tibble [624 × 5]> 48.9 ## 4 Americas <tibble [300 × 5]> 64.7 ## 5 Oceania <tibble [24 × 5]> 74.3
group_by()gapminder %>% group_by(continent) %>% summarise(avg_lifeExp = mean(lifeExp))
## # A tibble: 5 × 2 ## continent avg_lifeExp ## <fct> <dbl> ## 1 Africa 48.9 ## 2 Americas 64.7 ## 3 Asia 60.1 ## 4 Europe 71.9 ## 5 Oceania 74.3
gapminder_nested %>% mutate(cor = map_dbl(data, ~cor(.x$lifeExp, .x$gdpPercap)))
## # A tibble: 5 × 3 ## # Groups: continent [5] ## continent data cor ## <fct> <list> <dbl> ## 1 Asia <tibble [396 × 5]> 0.382 ## 2 Europe <tibble [360 × 5]> 0.781 ## 3 Africa <tibble [624 × 5]> 0.426 ## 4 Americas <tibble [300 × 5]> 0.558 ## 5 Oceania <tibble [24 × 5]> 0.956
lm().reg.US <- lm(pib_pc ~ year, data=dados.reg.US)
summary(reg.US)
## ## Call: ## lm(formula = pib_pc ~ year, data = dados.reg.US) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.09562 -0.04831 0.02185 0.04120 0.07516 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -30.270695 0.609239 -49.69 <2e-16 *** ## year 0.020510 0.000307 66.81 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.0519 on 68 degrees of freedom ## Multiple R-squared: 0.985, Adjusted R-squared: 0.9848 ## F-statistic: 4464 on 1 and 68 DF, p-value: < 2.2e-16
glance() do pacote broom cria um data.frame com as estatísticas da regressão.library(broom) glance(reg.US)
## # A tibble: 1 × 12 ## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 0.985 0.985 0.0519 4464. 9.55e-64 1 109. -212. -205. ## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
## # A tibble: 1 × 12 ## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 0.874 0.872 0.187 471. 2.81e-32 1 19.2 -32.4 -25.7 ## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
nest(), mutate(), map(), glance() e unnest().reg_purr <- pwt %>% select(code = countrycode, ano = year, pib = rgdpna, pop) %>% filter(code %in% pop5_1960, ano >= 1960) %>% mutate(l_pib.pc = log(pib/pop)) %>% select(-pib, -pop)
nest():reg_purr <- pwt %>% select(code = countrycode, ano = year, pib = rgdpna, pop) %>% filter(code %in% pop5_1960, ano >= 1960) %>% mutate(l_pib.pc = log(pib/pop)) %>% select(-pib, -pop) %>% nest(data = -code)
mutate(), map() e glance():reg_purr <- pwt %>%
select(code = countrycode, ano = year, pib = rgdpna, pop) %>%
filter(code %in% pop5_1960, ano >= 1960) %>%
mutate(l_pib.pc = log(pib/pop)) %>%
select(-pib, -pop) %>%
nest(data = -code) %>%
mutate(fit = map(data, ~ lm(l_pib.pc ~ ano, data = .x)),
glanced = map(fit, glance))
unnest():reg_purr <- pwt %>%
select(code = countrycode, ano = year, pib = rgdpna, pop) %>%
filter(code %in% pop5_1960, ano >= 1960) %>%
mutate(l_pib.pc = log(pib/pop)) %>%
select(-pib, -pop) %>%
nest(data = -code) %>%
mutate(fit = map(data, ~ lm(l_pib.pc ~ ano, data = .x)),
glanced = map(fit, glance)) %>%
unnest(glanced)
reg_purr <- pwt %>%
select(code = countrycode, ano = year, pib = rgdpna, pop) %>%
filter(code %in% pop5_1960, ano >= 1960) %>%
mutate(l_pib.pc = log(pib/pop)) %>%
select(-pib, -pop) %>%
nest(data = -code) %>%
mutate(fit = map(data, ~ lm(l_pib.pc ~ ano, data = .x)),
glanced = map(fit, glance)) %>%
unnest(glanced) %>%
select(code, r2 = r.squared)
# Lookup table with one row per country in `pop5_1960` (taken from the 2010
# rows of `pwt`) that adds `grupo2`, a coarser three-level grouping of `grupo`.
df_grupos <- pwt %>%
select(code = countrycode, pais = country, ano = year, grupo = group) %>%
# a single year is kept so that each country appears exactly once
filter(code %in% pop5_1960, ano == 2010) %>%
select(code, pais, grupo) %>%
# "Advanced" -> "Países Ricos"; the three emerging/Latin American groups ->
# "Emergentes"; every other group falls through to "Outros"
mutate(grupo2 = case_when(grupo == "Advanced" ~ "Países Ricos",
grupo %in% c("Emerging Asia", "Latin America and Caribbean",
"Emerging Europe") ~ "Emergentes",
TRUE ~ "Outros"),
grupo2 = as.factor(grupo2))
left_join(), voltaremos às funções do tipo _join() nas próximas unidades:
df_r2 <- left_join(df_grupos, reg_purr, by = "code")
df_r2 %>% group_by(grupo2) %>% summarise(media_r2 = mean(r2))
## # A tibble: 3 × 2 ## grupo2 media_r2 ## <fct> <dbl> ## 1 Emergentes 0.826 ## 2 Outros 0.625 ## 3 Países Ricos 0.934