2023/01/31 (updated: 2023-12-06)
afghan <- read.csv("afghan.csv")
str(afghan)
## 'data.frame': 2754 obs. of 11 variables: ## $ province : chr "Logar" "Logar" "Logar" "Logar" ... ## $ district : chr "Baraki Barak" "Baraki Barak" "Baraki Barak" "Baraki Barak" ... ## $ village.id : int 80 80 80 80 80 80 80 80 80 35 ... ## $ age : int 26 49 60 34 21 18 42 39 20 18 ... ## $ educ.years : int 10 3 0 14 12 10 6 12 5 10 ... ## $ employed : int 0 1 1 1 1 1 1 1 1 0 ... ## $ income : chr "2,001-10,000" "2,001-10,000" "2,001-10,000" "2,001-10,000" ... ## $ violent.exp.ISAF : int 0 0 1 0 0 0 0 0 0 0 ... ## $ violent.exp.taliban: int 0 0 0 0 0 0 0 1 0 0 ... ## $ list.group : chr "control" "control" "control" "ISAF" ... ## $ list.response : int 0 1 1 3 3 2 1 3 1 2 ...
head(afghan)
## province district village.id age educ.years employed income ## 1 Logar Baraki Barak 80 26 10 0 2,001-10,000 ## 2 Logar Baraki Barak 80 49 3 1 2,001-10,000 ## 3 Logar Baraki Barak 80 60 0 1 2,001-10,000 ## 4 Logar Baraki Barak 80 34 14 1 2,001-10,000 ## 5 Logar Baraki Barak 80 21 12 1 2,001-10,000 ## 6 Logar Baraki Barak 80 18 10 1 <NA> ## violent.exp.ISAF violent.exp.taliban list.group list.response ## 1 0 0 control 0 ## 2 0 0 control 1 ## 3 1 0 control 1 ## 4 0 0 ISAF 3 ## 5 0 0 ISAF 3 ## 6 0 0 ISAF 2
library(tidyverse) afghan %>% select(age, educ.years, employed, income) %>% summary()
## age educ.years employed income ## Min. :15.00 Min. : 0.000 Min. :0.0000 Length:2754 ## 1st Qu.:22.00 1st Qu.: 0.000 1st Qu.:0.0000 Class :character ## Median :30.00 Median : 1.000 Median :1.0000 Mode :character ## Mean :32.39 Mean : 4.002 Mean :0.5828 ## 3rd Qu.:40.00 3rd Qu.: 8.000 3rd Qu.:1.0000 ## Max. :80.00 Max. :18.000 Max. :1.0000
afghan %>% select(age, educ.years, employed, income) %>% mutate(income = as.factor(income)) %>% summary()
## age educ.years employed income ## Min. :15.00 Min. : 0.000 Min. :0.0000 10,001-20,000 : 616 ## 1st Qu.:22.00 1st Qu.: 0.000 1st Qu.:0.0000 2,001-10,000 :1420 ## Median :30.00 Median : 1.000 Median :1.0000 20,001-30,000 : 93 ## Mean :32.39 Mean : 4.002 Mean :0.5828 less than 2,000: 457 ## 3rd Qu.:40.00 3rd Qu.: 8.000 3rd Qu.:1.0000 over 30,000 : 14 ## Max. :80.00 Max. :18.000 Max. :1.0000 NA's : 154
count(afghan, income)
## income n ## 1 10,001-20,000 616 ## 2 2,001-10,000 1420 ## 3 20,001-30,000 93 ## 4 less than 2,000 457 ## 5 over 30,000 14 ## 6 <NA> 154
unique(afghan$income)
## [1] "2,001-10,000" NA "10,001-20,000" "less than 2,000" ## [5] "20,001-30,000" "over 30,000"
harm_props <- afghan %>% group_by(violent.exp.ISAF, violent.exp.taliban) %>% count() %>% ungroup() %>% mutate(prop = n/sum(n))
## # A tibble: 9 × 4 ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 0 1330 0.483 ## 2 0 1 354 0.129 ## 3 0 NA 22 0.00799 ## 4 1 0 475 0.172 ## 5 1 1 526 0.191 ## 6 1 NA 22 0.00799 ## 7 NA 0 7 0.00254 ## 8 NA 1 8 0.00290 ## 9 NA NA 10 0.00363
afghan %>% group_by(violent.exp.ISAF, violent.exp.taliban) %>% count() %>% # ungroup() %>% mutate(prop = n/sum(n))
## # A tibble: 9 × 4 ## # Groups: violent.exp.ISAF, violent.exp.taliban [9] ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 0 1330 1 ## 2 0 1 354 1 ## 3 0 NA 22 1 ## 4 1 0 475 1 ## 5 1 1 526 1 ## 6 1 NA 22 1 ## 7 NA 0 7 1 ## 8 NA 1 8 1 ## 9 NA NA 10 1
ISAF_harm_prop <- harm_props %>% filter(violent.exp.ISAF == 1) %>% summarize(harm_prop = sum(prop)) %>% pull() ISAF_harm_prop
## [1] 0.3714597
talib_harm_prop <- harm_props %>% filter(violent.exp.taliban == 1) %>% summarize(harm_prop = sum(prop)) %>% pull() talib_harm_prop
## [1] 0.3224401
both_harm_prop <- harm_props %>% filter(violent.exp.taliban == 1 & violent.exp.ISAF == 1) %>% summarize(harm_prop = sum(prop)) %>% pull() both_harm_prop
## [1] 0.1909949
afghan %>% select(income) %>% slice(1:10)
## income ## 1 2,001-10,000 ## 2 2,001-10,000 ## 3 2,001-10,000 ## 4 2,001-10,000 ## 5 2,001-10,000 ## 6 <NA> ## 7 10,001-20,000 ## 8 2,001-10,000 ## 9 2,001-10,000 ## 10 <NA>
is.na() identifica os NAs presentes nos dados.afghan %>% select(income) %>% slice(1:10) %>% is.na()
## income ## [1,] FALSE ## [2,] FALSE ## [3,] FALSE ## [4,] FALSE ## [5,] FALSE ## [6,] TRUE ## [7,] FALSE ## [8,] FALSE ## [9,] FALSE ## [10,] TRUE
is.na() é o número de NAs e a média é a proporção de NAs.afghan %>%
summarize(n_missing = sum(is.na(income)),
p_missing = mean(is.na(income)))
## n_missing p_missing ## 1 154 0.05591866
5 + NA
## [1] NA
5*NA
## [1] NA
x <- c(1,2,3,4,NA) sum(x)
## [1] NA
sum(x, na.rm = TRUE)
## [1] 10
x <- c(1,2,3,4,NA) mean(x)
## [1] NA
mean(x, na.rm = TRUE)
## [1] 2.5
filter() combinada com a função is.na() oeferece uma maneira rápida de tirar NAs de uma análise.afghan %>% filter(!is.na(violent.exp.ISAF), !is.na(violent.exp.taliban)) %>% group_by(violent.exp.ISAF, violent.exp.taliban) %>% count() %>% ungroup() %>% mutate(prop = n/sum(n)) %>% arrange(prop)
## # A tibble: 4 × 4 ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 1 354 0.132 ## 2 1 0 475 0.177 ## 3 1 1 526 0.196 ## 4 0 0 1330 0.495
arrange() organizou os dados da menor para maior proporção, podemos organizar em ordem decrescente usando a função desc().afghan %>% filter(!is.na(violent.exp.ISAF), !is.na(violent.exp.taliban)) %>% group_by(violent.exp.ISAF, violent.exp.taliban) %>% count() %>% ungroup() %>% mutate(prop = n/sum(n)) %>% arrange(desc(prop))
## # A tibble: 4 × 4 ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 0 1330 0.495 ## 2 1 1 526 0.196 ## 3 1 0 475 0.177 ## 4 0 1 354 0.132
filter() e is.na() também poder ser usadas para identificar NAs em mais de uma variável, por exemplo, podemos encontrar quantos NAs para quem sofreu violência com ISAF ou taliban.missing_prop <- harm_props %>% filter(is.na(violent.exp.ISAF) | is.na(violent.exp.taliban)) %>% summarize(missing_prop = sum(prop)) %>% pull missing_prop
## [1] 0.02505447
na.omit(), do R básico, e drop_na(), do tidyverse excluem as observações onde aparece NA em pelo menos uma variável. Repare que as funções excluem toda a linha mesmo que esteja faltando dados para apenas uma variável.nrow(afghan)
## [1] 2754
afghan.sub <- na.omit(afghan) nrow(afghan.sub)
## [1] 2554
nrow(afghan)
## [1] 2754
afghan.sub2 <- drop_na(afghan) nrow(afghan.sub2)
## [1] 2554
afghan %>% drop_na(income) %>% nrow()
## [1] 2600
geom_col() para fazer gráficos de barras, aqui vamos usar a função geom_bar(). A função geom_col() é uma versão simplificada da geom_bar().ggplot(data = afghan, aes(x=as.factor(violent.exp.ISAF))) +
geom_bar(aes(y= after_stat(prop))) +
scale_x_discrete(labels = c("No harm", "Harm", "Nonresponse")) +
ylab("Proportion of respondents") +
xlab("Response category") +
ggtitle("Civilian Victimization by the ISAF")
aes() no arumento da função geom_bar() é chamada da seguinte forma: geom_bar(aes(y=..prop.., group=1)) o argumento ..prop.. foi substituido por after_stat(prop) para ficar compatível com as versões mais modernas do ggplot2. O argumento group = 1 é um “truque” para fazer o agrupamento pelos níveis de violent.exp.ISAF. Por fim, note que os autores transformaram a variável violent.exp.ISAF em fator.prop na função after_stat() diz para usar proporções. Para ilustrar a função after_stat() vou refazer o gráfico usando o argumento count.ggplot(data = afghan, aes(x=as.factor(violent.exp.ISAF))) +
geom_bar(aes(y= after_stat(count))) +
scale_x_discrete(labels = c("No harm", "Harm", "Nonresponse")) +
ylab("Proportion of respondents") +
xlab("Response category") +
ggtitle("Civilian Victimization by the ISAF")
ggplot(data = afghan, aes(x=as.factor(violent.exp.taliban))) +
geom_bar(aes(y= after_stat(prop))) +
scale_x_discrete(labels = c("No harm", "Harm", "Nonresponse")) +
ylab("Proportion of respondents") +
xlab("Response category") +
ggtitle("Civilian Victimization by the ISAF")
afghan %>%
pivot_longer(violent.exp.ISAF:violent.exp.taliban,
names_to ="harming group", values_to="harm") %>%
ggplot(aes(x=as.factor(harm))) +
geom_bar(aes(y=after_stat(prop), fill=harming_group, group=harming_group),
position = "dodge") +
scale_x_discrete(labels=c("No harm", "Harm", "Nonresponse")) +
scale_fill_discrete(name="Harming group", labels=c("ISAF", "Taliban")) +
labs(title = "Civilian victimization",
x = "Response category",
y = "Proportion of respondents")
ggplot()afghan %>%
pivot_longer(violent.exp.ISAF:violent.exp.taliban,
names_to ="harming group", values_to="harm")
afghan %>%
pivot_longer(violent.exp.ISAF:violent.exp.taliban,
names_to ="harming_group", values_to="harm") %>%
ggplot(aes(x=as.factor(harm))) +
geom_bar(aes(y=after_stat(prop), fill=harming_group, group=harming_group),
position = "dodge") +
scale_x_discrete(labels=c("No harm", "Harm", "Nonresponse")) +
scale_fill_brewer(name="Harming group", labels=c("ISAF", "Taliban"),
palette = "Blues") +
labs(title = "Civilian victimization",
x = "Response category",
y = "Proportion of respondents") +
theme_classic()
geom_histogram() do ggplot2 faz o histograma.afghan %>%
ggplot(aes(x = age)) +
geom_histogram(aes(y= after_stat(density)), binwidth=5, boundary=0) +
scale_x_continuous(breaks = seq(20,80,by=10)) +
labs(title = "Distribution of respondent's age",
y = "Density",
x = "Age") +
theme_classic()
afghan %>%
ggplot(aes(x = age)) +
geom_histogram(aes(y= after_stat(count)), binwidth=5, boundary=0) +
scale_x_continuous(breaks = seq(20,80,by=10)) +
labs(title = "Distribuição das idades",
y = "Número de observações",
x = "Idade") +
theme_classic()
geom_vline() coloca linhas verticais em um gráfico (para linhas horizontais use a função geom_hline()) e a função annotate() faz anotações no gráfico.afghan %>%
ggplot(aes(x = educ.years, y= after_stat(density))) +
geom_histogram(binwidth=1, center=0) +
geom_vline(xintercept = median(afghan$educ.years)) +
annotate(geom="text", x = median(afghan$educ.years),
y=0.4, label="Median", hjust=-0.1)
labs(title = "Distribution of respondent's education",
y = "denssity",
x = "Years of education") +
theme_classic()
afghan %>% ggplot(aes(y=age)) + geom_boxplot() + labs(y="Age", x=NULL, title = "Distribution of age")
afghan %>% ggplot(aes(y=educ.years, x=province)) + geom_boxplot() + labs(y="Years of education", x = "Province", title = "Education by province")
afghan %>%
group_by(province) %>%
summarize(violence.exp.taliban = mean(violent.exp.taliban, na.rm=TRUE),
violence.exp.ISAF = mean(violent.exp.ISAF, na.rm=TRUE))
## # A tibble: 5 × 3 ## province violence.exp.taliban violence.exp.ISAF ## <chr> <dbl> <dbl> ## 1 Helmand 0.504 0.541 ## 2 Khost 0.233 0.242 ## 3 Kunar 0.303 0.399 ## 4 Logar 0.0802 0.144 ## 5 Uruzgan 0.455 0.496
ggsave(). Deixo por conta de vocês olhar o help desta função.library(gridExtra)
age_hist <- afghan %>%
ggplot(aes(x=age)) +
geom_histogram(aes(y= after_stat(density)), binwidth=5, boundary=0) +
scale_x_continuous(breaks = seq(20,80,by=10)) +
labs(title = "Distribution of respondent's age",
y = "Density",
x = "Age") +
theme_classic()
educ_hist <- afghan %>%
ggplot(aes(x = educ.years, y= after_stat(density))) +
geom_histogram(binwidth=1, center=0) +
geom_vline(xintercept = median(afghan$educ.years)) +
annotate(geom="text", x = median(afghan$educ.years),
y=0.4, label="Median", hjust=-0.1) +
labs(title = "Distribution of respondent's education",
y = "denssity",
x = "Years of education") +
theme_classic()
grid.arrange(age_hist, educ_hist, ncol=2)
afghan.village <- read.csv("afghan-village.csv")
head(afghan.village)
## altitude population village.surveyed ## 1 1959.08 197 1 ## 2 2425.88 744 0 ## 3 2236.60 179 1 ## 4 1691.76 225 0 ## 5 1928.04 379 0 ## 6 1194.56 617 0
afghan.village %>%
ggplot(aes(x=altitude)) +
geom_histogram(aes(y=after_stat(density))) +
labs(x = "Altitude (metros)",
y= "densidade")
afghan.village %>%
ggplot(aes(x=population)) +
geom_histogram(aes(y=after_stat(density))) +
labs(x = "População",
y= "densidade")
afghan.village %>%
ggplot(aes(x=log(population))) +
geom_histogram(aes(y=after_stat(density))) +
labs(x = "População (log)",
y= "densidade")
afghan.village %>%
ggplot(aes(x = as.factor(x=village.surveyed),
y = altitude)) +
geom_boxplot() +
scale_x_discrete(labels = c("Fora da amostra", "Na amostra")) +
labs(y = "Altitude (metros)", x=NULL) +
theme_classic()
afghan.village %>%
ggplot(aes(x = as.factor(x=village.surveyed),
y = log(population))) +
geom_boxplot() +
scale_x_discrete(labels = c("Fora da amostra", "Na amostra")) +
labs(y = "População (log)", x=NULL) +
theme_classic()
…
congress <- read.csv("congress.csv")
head(congress)
## congress district state party name dwnom1 dwnom2 ## 1 80 0 USA Democrat TRUMAN -0.276 0.016 ## 2 80 1 ALABAMA Democrat BOYKIN F. -0.026 0.796 ## 3 80 2 ALABAMA Democrat GRANT G. -0.042 0.999 ## 4 80 3 ALABAMA Democrat ANDREWS G. -0.008 1.005 ## 5 80 4 ALABAMA Democrat HOBBS S. -0.082 1.066 ## 6 80 5 ALABAMA Democrat RAINS A. -0.170 0.870
plot_80 <- congress %>%
filter(congress == 80) %>%
ggplot(aes(x=dwnom1, y=dwnom2)) +
geom_point(aes(shape=party, color = party), show.legend = FALSE) +
scale_color_manual(values = c(Democrat = "blue",
Republican = "red",
Other = "green")) +
scale_shape_manual(values = c(Democrat = "square",
Republican = "triangle",
Other = "circle")) +
annotate("text",x=-1, y=1, label="Democrats", color = "blue") +
annotate("text",x=0.5, y=-1, label="Republicans", color = "red") +
scale_y_continuous("Racial liberalism/conservatism", limits = c(-1.5, 1.5)) +
scale_x_continuous("Economic liberalism/conservatism", limits = c(-1.5, 1.5)) +
ggtitle("80th Congress") +
coord_fixed() +
theme_classic()
plot_112 <- congress %>%
filter(congress == 112) %>%
ggplot(aes(x=dwnom1, y=dwnom2)) +
geom_point(aes(shape=party, color = party), show.legend = FALSE) +
scale_color_manual(values = c(Democrat = "blue",
Republican = "red",
Other = "green")) +
scale_shape_manual(values = c(Democrat = "square",
Republican = "triangle",
Other = "circle")) +
annotate("text",x=-1, y=1, label="Democrats", color = "blue") +
annotate("text",x=0.5, y=-1, label="Republicans", color = "red") +
scale_y_continuous("Racial liberalism/conservatism", limits = c(-1.5, 1.5)) +
scale_x_continuous("Economic liberalism/conservatism", limits = c(-1.5, 1.5)) +
ggtitle("80th Congress") +
coord_fixed() +
theme_classic()
grid.arrange(plot_80, plot_112, ncol=2)
median_dw1 <- congress %>%
filter(party %in% c("Republican", "Democrat")) %>%
group_by(party, congress) %>%
summarize(median_dw1 = median(dwnom1)) %>%
ungroup()
## `summarise()` has grouped output by 'party'. You can override using the ## `.groups` argument.
median_dw1
## # A tibble: 66 × 3 ## party congress median_dw1 ## <chr> <int> <dbl> ## 1 Democrat 80 -0.126 ## 2 Democrat 81 -0.207 ## 3 Democrat 82 -0.179 ## 4 Democrat 83 -0.174 ## 5 Democrat 84 -0.223 ## 6 Democrat 85 -0.231 ## 7 Democrat 86 -0.259 ## 8 Democrat 87 -0.258 ## 9 Democrat 88 -0.285 ## 10 Democrat 89 -0.292 ## # ℹ 56 more rows
ggplot(data=median_dw1, aes(x=congress, y=median_dw1, color=party)) + geom_line() + labs(title = "DW-NOMINATE score (1st dimension)", x="Congress", y="Party") + theme_classic()
…
\[\mbox{coeficiente de Gini}=\frac{\mbox{área A}}{\mbox{área A + área B}}\]
polarization <- median_dw1 %>%
pivot_wider(names_from = party,
values_from = median_dw1) %>%
mutate(polarization = Republican - Democrat)
polarization %>%
ggplot(aes(congress, polarization)) +
geom_point() +
labs(title = "Politican polarization",
x="Congress",
y="Pepublican median -\n Democratic median") +
theme_classic()
USGini <- read.csv("USGini.csv")
head(USGini)
## X year gini ## 1 1 1947 0.376 ## 2 2 1948 0.371 ## 3 3 1949 0.378 ## 4 4 1950 0.379 ## 5 5 1951 0.363 ## 6 6 1952 0.368
USGini %>%
ggplot(aes(x=year, y=gini)) +
geom_point() +
labs(title = "Gini coefficent",
x= "Year",
y = "Income inequality") +
theme_classic
gini_2yr <- USGini %>% filter(row_number() %% 2 ==0) %>% select(gini) %>% pull() pol_annual <- polarization %>% select(polarization) %>% pull() cor(gini_2yr, pol_annual)
## [1] 0.9418128
facet_grid() do ggplot2 permite fazer gráficos do tipo. No exemplo vamos comparar as distribuições da segunda dimensão do DW-NOMINATE score entre republicanos e democratas da 112o legislatura.congress %>%
filter(congress == 112, party %in% c("Republican", "Democrat")) %>%
ggplot(aes(x=dwnom2, y=after_stat(density))) +
geom_histogram(binwidth = .2) +
facet_grid(party ~ .) +
labs(x="Racial liberalism/conservatism dimension",
y="Density") +
theme_classic()
geom_point() para criar o gráfico.quantile_probs <- seq(from=0, to=1, by=0.01)
quantile_names <- as.character(quantile_probs)
quantiles <- congress %>%
filter(congress == 112) %>%
group_by(party) %>%
summarize(dwnom_quantile = quantile(dwnom2, probs=quantile_probs),
quantile = quantile_names) %>%
pivot_wider(names_from=party, values_from=dwnom_quantile) %>%
ungroup()
quantiles %>%
ggplot(aes(x=Democrat, y=Republican)) +
geom_point(shape=1) +
ylim(-1.5, 1.5) +
xlim(-1.5, 1.5) +
geom_abline(intercept=0, slope=1) +
ggtitle("Racial liberalism/conservadorism dimension") +
coord_fixed()
qqplot() do R básico oferece uma maneira mais simples de fazer um gráfico Q-Q, porém sem as comodidades do tidyversedem112 <- filter(congress, party == "Democrat", congress == 112)
rep112 <- filter(congress, party == "Republican", congress == 112)
qqplot(x=dem112$dwnom2,
y=rep112$dwnom2,
xlab = "Democrats",
ylab = "Republicans",
xlim = c(-1.5, 1.5), ylim = c(-1.5, 1.5),
main = "Racial liberalism/conservadorism dimension")
matrix(), para isso é necessário fornecer um vetor com os valores dos elementos e informar se o vetor deve ser lido por linhas ou por colunas bem como o número de linhas/colunas.rownames() e colnames().x <- matrix(1:12, nrow=3, ncol=4, byrow=TRUE) dim(x)
## [1] 3 4
x
## [,1] [,2] [,3] [,4] ## [1,] 1 2 3 4 ## [2,] 5 6 7 8 ## [3,] 9 10 11 12
x <- matrix(1:12, nrow=3, ncol=4, byrow=TRUE)
rownames(x) <- c("a", "b", "C")
colnames(x) <- c("d", "e", "f", "g")
x
## d e f g ## a 1 2 3 4 ## b 5 6 7 8 ## C 9 10 11 12
as.matrix() o R vai transformar a classe das variáveis de forma que todas fiquem com a mesma classe.y <- data.frame(y1 = as.factor(c("a", "b", "c")),
y2 = c(0.1,0.2,0.3))
str(y)
## 'data.frame': 3 obs. of 2 variables: ## $ y1: Factor w/ 3 levels "a","b","c": 1 2 3 ## $ y2: num 0.1 0.2 0.3
z <- as.matrix(y) z
## y1 y2 ## [1,] "a" "0.1" ## [2,] "b" "0.2" ## [3,] "c" "0.3"
colSums() e rowMeans().colSums(x)
## d e f g ## 15 18 21 24
rowMeans(x)
## a b C ## 2.5 6.5 10.5
x <- list(y1 = 1:10, y2 = c("hi", "hello", "hey"),
y3 = data.frame(z1=1:3, z2 = c("good", "bad", "ugly")))
x$y1
## [1] 1 2 3 4 5 6 7 8 9 10
x[[2]]
## [1] "hi" "hello" "hey"
x[["y3"]]
## z1 z2 ## 1 1 good ## 2 2 bad ## 3 3 ugly
names() se aplicada em uma lista retorna os nomes dos objetos na lista e a função length() retorna o número de objetos na lista.names(x)
## [1] "y1" "y2" "y3"
length(x)
## [1] 3
O algoritmo k-means é uma forma popular de agrupar dados. O objetico do algoritmo é distribuir as variáveis em k grupos semelhantes onde cada grupo é associado a um centro. O primeiro passo é definir o número de centros e “espalhar” os centros no espaço de dados, depois os dados são classficados pela distância desse centro. Após a classificação dos dados, novos centros são calculados e os dados são reclassficados de acordo com os novos centros. O algoritmo para quando nenhum dados mudar de classificação de uma rodada para outra.
No link https://www.tidymodels.org/learn/statistics/k-means/ você encontra uma animação que ilustra bem o algoritmo k-means.
kmeans() aplica o algoritmo, entre seus argumentos estão centers que determina o número de clusters, iter.max que é o número máximo de iterações e nstart o número de centros aleatórios que devem ser escolhidos.kmeans() retorna o melhor agrupamento.k80two.out <- congress %>% filter(congress == 80) %>% select(dwnom1, dwnom2) %>% kmeans(centers = 2, nstart = 5) k112two.out <- congress %>% filter(congress == 112) %>% select(dwnom1, dwnom2) %>% kmeans(centers = 2, nstart = 5)
kmeans(), entre as quais o número de iterações, iter, um vetor dizendo a que cluster pertence cada observação, cluster e uma matriz com os centros, centers.names(k80two.out)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" ## [6] "betweenss" "size" "iter" "ifault"
k80two.out$iter
## [1] 1
k112two.out$iter
## [1] 1
k80two.out$center
## dwnom1 dwnom2 ## 1 0.14681029 -0.3389293 ## 2 -0.04843704 0.7827259
k112two.out$center
## dwnom1 dwnom2 ## 1 0.6776736 0.09061157 ## 2 -0.3912687 0.03260696
tidy() do pacote tidymodels ou broom, para transformar os resultados em data.frames fáceis de manipular.library(tidymodels) k80two.clusters <- tidy(k80two.out) k80two.clusters
## # A tibble: 2 × 5 ## dwnom1 dwnom2 size withinss cluster ## <dbl> <dbl> <int> <dbl> <fct> ## 1 0.147 -0.339 311 54.9 1 ## 2 -0.0484 0.783 135 10.9 2
k112two.clusters <- tidy(k112two.out) k112two.clusters
## # A tibble: 2 × 5 ## dwnom1 dwnom2 size withinss cluster ## <dbl> <dbl> <int> <dbl> <fct> ## 1 0.678 0.0906 242 27.1 1 ## 2 -0.391 0.0326 201 38.8 2
congress %>% filter(congress == 80) %>% mutate(cluster2 = k80two.out$cluster) %>% group_by(party, cluster2) %>% count() %>% pivot_wider(names_from = cluster2, values_from = n)
## # A tibble: 3 × 3 ## # Groups: party [3] ## party `1` `2` ## <chr> <int> <int> ## 1 Democrat 62 132 ## 2 Other 2 NA ## 3 Republican 247 3
congress %>% filter(congress == 112) %>% mutate(cluster2 = k112two.out$cluster) %>% group_by(party, cluster2) %>% count() %>% pivot_wider(names_from = cluster2, values_from = n)
## # A tibble: 2 × 3 ## # Groups: party [2] ## party `2` `1` ## <chr> <int> <int> ## 1 Democrat 200 NA ## 2 Republican 1 242
k80four.out <- congress %>% filter(congress == 80) %>% select(dwnom1, dwnom2) %>% kmeans(centers = 4, nstart = 5) k112four.out <- congress %>% filter(congress == 112) %>% select(dwnom1, dwnom2) %>% kmeans(centers = 4, nstart = 5)
k80four.cluster <- tidy(k80four.out) k112four.cluster <- tidy(k112four.out) congress80 <- filter(congress, congress == 80) %>% mutate(cluster4 = factor(k80four.out$cluster)) congress112 <- filter(congress, congress == 112) %>% mutate(cluster4 = factor(k112four.out$cluster))
ggplot() + geom_point(data = congress80, aes(dwnom1, dwnom2, color=cluster4)) + geom_point(data = k80four.cluster, aes(dwnom1, dwnom2), size=3, shape=8) + ylim(-1.5,1.5) + xlim(-1.5,1.5) + coord_fixed() + theme(legend.position = "none")
ggplot() + geom_point(data = congress112, aes(dwnom1, dwnom2, color=cluster4)) + geom_point(data = k112four.cluster, aes(dwnom1, dwnom2), size=3, shape=8) + ylim(-1.5,1.5) + xlim(-1.5,1.5) + coord_fixed() + theme(legend.position = "none")