Following the directions on the Coursera assignment page, you will make four original visualizations. Note that the data for the CCES and CEL data are imported in code in the R Markdown file.

Put your name here: Macarena

Exercise 1

Explain what you are visualizing here:

Firstly, we are going to look at the CCES data. We are interested in looking at the educational background of the respondents.

First, we are going to look at the ratio of male to female respondents and at the ratio of racial backgrounds to gain a quick overview of the respondents' characteristics. Then, we are going to see the percentages of each educational background by gender. To go a bit deeper, we are going to look at the academic background by race and gender.

We will be showing our code to show the process behind the analysis.

To represent the ratios of the racial background, the treemap package has been added to the workflow to avoid the use of pie charts with multiple categories.

Put your figure here:

# DATA PROCESSING AND DECODING

# Replace numeric survey codes in `x` with their text labels.
# `labels` is a character vector named by code, e.g. c("1" = "Male").
# Returns a character vector. Values without a matching code are kept as text
# and NA stays NA, which is what element-wise `x[x == code] <- label`
# recoding produces as well.
decode_codes <- function(x, labels) {
  decoded <- as.character(x)
  hit <- match(decoded, names(labels))
  found <- !is.na(hit)
  decoded[found] <- unname(labels[hit[found]])
  decoded
}

cces$gender <- decode_codes(cces$gender,
                            c("1" = "Male",
                              "2" = "Female"))

cces$educ <- decode_codes(cces$educ,
                          c("1" = "Basic",
                            "2" = "High School",
                            "3" = "Some college",
                            "4" = "2-year",
                            "5" = "4-year",
                            "6" = "Post-grad"))

cces$race <- decode_codes(cces$race,
                          c("1" = "White",
                            "2" = "Black",
                            "3" = "Hispanic",
                            "4" = "Asian",
                            "5" = "Native American",
                            "6" = "Mixed",
                            "7" = "Other",
                            "8" = "Middle Eastern"))

cces$ideo5 <- decode_codes(cces$ideo5,
                           c("1" = "Very Liberal",
                             "2" = "Liberal",
                             "3" = "Moderate",
                             "4" = "Conservative",
                             "5" = "Very Conservative"))

cces$employ <- decode_codes(cces$employ,
                            c("1" = "Full-Time",
                              "2" = "Part-Time",
                              "3" = "Temporarily Laid Off",
                              "4" = "Unemployed",
                              "5" = "Retired",
                              "6" = "Permanently Disabled",
                              "7" = "Homemaker",
                              "8" = "Student",
                              "9" = "Other"))

# Rename by column name instead of by position (previously positions 6 and 11),
# so the rename cannot land on the wrong column if the column order changes.
names(cces)[names(cces) == "race"] <- "Race"
names(cces)[names(cces) == "ideo5"] <- "Ideology"
## Data for Gender Ratios

# Lookup tables from survey code to label. Labels are attached by code rather
# than by row position, so a code that is missing from the sample cannot shift
# the labels or make the assignment fail on a length mismatch.
# NOTE(review): assumes cces1 still holds the numeric codes - confirm where
# cces1 is created.
gender_labels <- c("1" = "Male",
                   "2" = "Female")
race_labels <- c("1" = "White",
                 "2" = "Black",
                 "3" = "Hispanic",
                 "4" = "Asian",
                 "5" = "Native American",
                 "6" = "Mixed",
                 "7" = "Other",
                 "8" = "Middle Eastern")

Gender <- as.data.frame(table(cces1$gender) / length(cces1$gender) * 100) # Calculates Percentage
Gender$Gender <- unname(gender_labels[as.character(Gender$Var1)]) # Decode the gender codes
Gender <- Gender[, c("Gender", "Freq")] # Drop Var1 and put the label first
names(Gender)[names(Gender) == "Freq"] <- "Percentage" # Rename the column

# text_y is the midpoint of each slice on the cumulative percentage axis and
# is used to position the labels of the pie chart.
GenderPlot <- data.frame(Gender = Gender$Gender,
                         Percentage = round(Gender$Percentage, 2)) %>%
  mutate(text_y = cumsum(Percentage) - Percentage / 2,
         label = paste(Gender,
                       " ",
                       Percentage,
                       "%"))

## Data For Racial Ratios

racial <- as.data.frame(table(cces1$race) / length(cces1$race) * 100) # Calculates Percentage
racial$Race <- unname(race_labels[as.character(racial$Var1)]) # Decode the race codes
racial <- racial[, c("Race", "Freq")] # Drop Var1 and put the label first
names(racial)[names(racial) == "Freq"] <- "Percentage" # Rename the column
racial$Plot <- paste(racial$Race,
                     " ",
                     round(racial$Percentage, 2),
                     "%") # Create a column for plotting showing %

## Data for Mean Educational Background by gender

# The summary column is named directly instead of being renamed by position.
meaneduc <- cces1 %>%
  group_by(gender) %>%
  summarise(Avg_Education = mean(educ))
meaneduc$gender[meaneduc$gender == 1] <- "Male"
meaneduc$gender[meaneduc$gender == 2] <- "Female"
names(meaneduc)[names(meaneduc) == "gender"] <- "Gender"

# Plot 1: pie chart of the gender split

ggplot(GenderPlot, aes(x = "", y = Percentage, fill = Gender)) +
  # One stacked bar built from the precomputed percentages ...
  geom_col(width = 1, color = "black") +
  # ... which coord_polar() bends into a pie
  coord_polar(theta = "y", start = 0) +
  # Slice labels are placed at the text_y positions of GenderPlot
  geom_label_repel(aes(label = label, y = text_y),
                   nudge_x = 0.6,
                   nudge_y = 0.6,
                   size = 5,
                   show.legend = FALSE) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Gender Distribution of Respondents") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

# Plot 2: treemap of the racial distribution, one tile per row of `racial`

treemap(racial,
        # Data: tiles are labelled with the Plot column and sized by Percentage
        type = "index",
        index = "Plot",
        vSize = "Percentage",
        vColor = "Race",
        # Main
        palette = "Pastel1",
        title = "Racial Distribution of Respondants",
        # Borders
        border.lwds = 0.5,
        border.col = "grey",
        # Labels
        align.labels = c("left", "top"),
        fontsize.labels = 12,
        fontface.labels = 1,
        fontcolor.labels = "black",
        lowerbound.cex.labels = 0,
        overlap.labels = 0.5)

# Plot 3 --> Racial distribution of educational background according to gender

# Education levels from lowest to highest. educ holds text labels, so without
# explicit limits the x axis is sorted alphabetically ("2-year", "4-year",
# "Basic", ...) instead of by level. Values outside these limits are not drawn.
educ_levels <- c("Basic",
                 "High School",
                 "Some college",
                 "2-year",
                 "4-year",
                 "Post-grad")

ggplot(cces) +
  geom_bar(aes(x = educ,
               fill = Race)) +
  facet_wrap(~gender) +
  labs(title = "Respondent Demographical Analysis",
       subtitle = "Educational and Racial Background by Gender",
       x = "Max Educational Degree Achieved") +
  scale_x_discrete(limits = educ_levels) +
  scale_y_continuous(expand = c(0, 0)) +
  # Zooms the y axis to 0-150 without removing any data
  coord_cartesian(ylim = c(0, 150)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        panel.grid = element_blank())

Exercise 2

Explain what you are visualizing here:

Now that we know the demographic basics of the Respondents, we are going to see if there is a correlation between these characteristics and the political views.

As we can see, there are no strong correlations between any of the demographics. However, the following correlations are significant:

Taking into account how the data is codified (see above), we can say that:

We are also looking at the distribution of income by gender, after testing the significance of the difference in income between the genders.

We can see that males belong to a significantly higher tax bracket than females.

Put your figure here:

# Demographic columns of cces2 that go into the correlation matrix.
# NOTE(review): presumably cces2 still holds the numeric codes - confirm where
# cces2 is created.
Correlation1 <- select(cces2,
                       Gender = gender,
                       Race = race,
                       Education = educ,
                       Ideology = ideo5,
                       Employment = employ,
                       Marital_Status = marstat)

# corr.test() returns the correlation coefficients ($r) and their p-values ($p)
CorrelationPlot <- corr.test(Correlation1)
CorrelationP <- as.matrix(CorrelationPlot$p)
CorrelationR <- as.matrix(CorrelationPlot$r)

# Upper triangle only; coefficients with p above sig.level are left blank
ggcorrplot(CorrelationR,
           type = "upper",
           hc.order = TRUE,
           lab = TRUE,
           p.mat = CorrelationP,
           sig.level = 0.05,
           insig = "blank") +
  labs(title = "Correlation", subtitle = "Demographical Data")

# Gender and faminc_new (income bracket) of each respondent in cces
Income <- select(cces, Gender = gender, Income = faminc_new)

# Mean income bracket of each gender
Income_Mean <- summarise(group_by(Income, Gender), Mean = mean(Income))

Income_Corr <- cces2 %>%
  select(Gender = gender,
         Income = faminc_new)

# Compare the income brackets of the two gender groups. The formula interface
# splits Income by Gender. Passing the two columns as `x` and `y` would instead
# treat the gender codes themselves as a second sample of values.
wilcox.test(Income ~ Gender,
            data = Income_Corr)
## (Output not shown: re-knit the document to record the result of this test.)
corr.test(Income_Corr)
## Call:corr.test(x = Income_Corr)
## Correlation matrix 
##        Gender Income
## Gender   1.00  -0.17
## Income  -0.17   1.00
## Sample Size 
## [1] 869
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##        Gender Income
## Gender      0      0
## Income      0      0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# Density of the income bracket per gender, with a dashed line at each group mean
ggplot(Income) +
  geom_density(aes(x = Income, fill = Gender), alpha = 0.3) +
  geom_vline(data = Income_Mean,
             aes(xintercept = Mean, color = Gender),
             linetype = "dashed") +
  labs(title = "Salary by Gender", subtitle = "Mean Comparison") +
  scale_x_continuous(name = "Income Bracket",
                     breaks = seq(0, 16, by = 1),
                     expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 0.12), expand = c(0, 0)) +
  theme_bw() +
  theme(panel.grid = element_blank())

Exercise 3

Explain what you are visualizing here: In this viz we are looking at the relationship between different demographics.

In the correlation table we can see that there is no significant correlation between any of them. This can be further observed in the scatter plot. However, two female members of Congress were significantly more successful than their female peers. These members are labelled in the scatter plot.

Put your figure here:

Congress <- cel %>%
  select(Gender = female,
         Latino = latino,
         Ideology = dwnom1,
         Effective = les)

# Gender and Latino are 0/1 indicators, so ideology is compared between the
# two groups with the formula interface. Passing the indicator and Ideology as
# `x` and `y` would instead treat the 0/1 codes as a second sample of values.
wilcox.test(Ideology ~ Gender, data = Congress)
wilcox.test(Ideology ~ Latino, data = Congress)
# Effective and Ideology are two measurements of the same member on different
# scales, so their association is tested with a rank correlation instead of a
# two-sample location test. exact = FALSE because ties are expected.
cor.test(Congress$Effective,
         Congress$Ideology,
         method = "spearman",
         exact = FALSE)
## (Output not shown: re-knit the document to record the results of these tests.)
Congress_Corr <- corr.test(Congress)
Congress_Corr
## Call:corr.test(x = Congress)
## Correlation matrix 
##           Gender Latino Ideology Effective
## Gender      1.00   0.07    -0.14     -0.05
## Latino      0.07   1.00    -0.11     -0.01
## Ideology   -0.14  -0.11     1.00     -0.02
## Effective  -0.05  -0.01    -0.02      1.00
## Sample Size 
## [1] 9845
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           Gender Latino Ideology Effective
## Gender         0    0.0     0.00      0.00
## Latino         0    0.0     0.00      0.30
## Ideology       0    0.0     0.00      0.07
## Effective      0    0.3     0.04      0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# Attach the member names and turn the 0/1 indicators into readable labels
Congress$Name <- cel$thomas_name

Congress$Latino[Congress$Latino == 0] <- "No"
Congress$Latino[Congress$Latino == 1] <- "Yes"

Congress$Gender[Congress$Gender == 0] <- "Male"
Congress$Gender[Congress$Gender == 1] <- "Female"

# Female members sorted from highest to lowest effectiveness score
Congress_Label <- filter(Congress, Gender == "Female")
Congress_Label <- arrange(Congress_Label, desc(Effective))
# The two top rows are the only points that get a name label in the plot.
# Note the lower-case name: this is a separate object from Congress_Label.
congress_Label <- head(Congress_Label, 2)

# Ideology against effectiveness, one panel per gender
ggplot(Congress, aes(x = Ideology, y = Effective, color = Gender)) +
  geom_point() +
  ggrepel::geom_text_repel(data = congress_Label, aes(label = Name)) +
  facet_grid(~Gender) +
  scale_y_continuous(limits = c(0, 20), expand = c(0, 0)) +
  labs(title = "Relationship between Ideology and Effectiveness",
       subtitle = "By Gender",
       x = "Ideology",
       y = "Effectiveness") +
  theme_bw() +
  theme(panel.grid = element_blank())

Exercise 4

Explain what you are visualizing here:

In the previous visualization we can see that the data includes duplicated names, as the same members of Congress served in different Congresses. Because of this we are going to look at a timeline to see the composition of Congress over time and which members have taken part in Congress the most times.

# Number of rows of cel per year and gender. ungroup() drops the Year grouping
# that tally() leaves behind, so later steps work on a plain table.
Timeline <- cel %>%
  select(Year = year,
         Gender = female) %>%
  group_by(Year, Gender) %>%
  tally() %>%
  ungroup() %>%
  rename(Total = n)
Timeline$Gender[Timeline$Gender == 0] <- "Male"
Timeline$Gender[Timeline$Gender == 1] <- "Female"

# Number of rows of cel per year
Timeline1 <- cel %>%
  select(Year = year) %>%
  group_by(Year) %>%
  tally() %>%
  rename(Total = n)
  
## Total members of Congress

ggplot(Timeline1,
       aes(x = Year,
           y = Total)) +
  geom_line(color = "purple") +
  labs(title = "Number of Congressmen per Year",
       x = "Congress Year",
       y = "Total") +
  # A single x scale carries both the limits and the breaks. Adding xlim() and
  # scale_x_continuous() separately makes the second replace the first, which
  # silently discards the limits.
  scale_x_continuous(limits = c(1972, 2017),
                     breaks = seq(1972, 2017, 2)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(400, 450)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5),
        panel.grid = element_blank())

## Total Members of Congress by Gender

ggplot(Timeline,
       aes(x = Year,
           y = Total,
           color = Gender)) +
  geom_line() +
  labs(title = "Number of Congressmen per Year",
       subtitle = "By Gender",
       x = "Congress Year",
       y = "Total") +
  # One scale per axis. A second scale for the same axis replaces the first:
  # the x limits were discarded by the later scale_x_continuous(), and ylim()
  # discarded the expand = c(0, 0) setting of scale_y_continuous().
  scale_x_continuous(limits = c(1972, 2017),
                     breaks = seq(1970, 2018, 2)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 450)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5),
        panel.grid = element_blank())

# Number of rows of cel per Name/Gender pair, sorted so that the members with
# the most entries come first. Sorting is required before head(): without it
# the "top 10" are simply the first ten names in group order.
# NOTE(review): members are identified by thomas_name only; presumably
# different members sharing a name are counted together - confirm whether cel
# has a unique member id.
Member_Terms <- cel %>%
  select(Name = thomas_name,
         Gender = female) %>%
  group_by(Name, Gender) %>%
  tally() %>%
  ungroup() %>%
  rename(Total = n) %>%
  arrange(desc(Total))

# Overall top 10. Name becomes a factor in order of appearance so the bars are
# drawn in rank order instead of alphabetically.
Repeats <- head(Member_Terms, 10)
Repeats$Gender[Repeats$Gender == 0] <- "Male"
Repeats$Gender[Repeats$Gender == 1] <- "Female"
Repeats$Name <- as_factor(Repeats$Name)

# Top 10 per gender (female is coded 1, male 0)
Top10_Females <- Member_Terms %>%
  filter(Gender == 1) %>%
  head(10) %>%
  mutate(Name = as_factor(Name))

Top10_Males <- Member_Terms %>%
  filter(Gender == 0) %>%
  head(10) %>%
  mutate(Name = as_factor(Name))

Top10 <- rbind(Top10_Females, Top10_Males)
Top10$Gender[Top10$Gender == 0] <- "Male"
Top10$Gender[Top10$Gender == 1] <- "Female"

ggplot(Repeats) +
  geom_col(aes(x = Name,
               y = Total,
               fill = Gender)) +
  # Upper limit left open: a bar taller than a fixed limit is not drawn
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, NA)) +
  labs(title = "Top 10 Congresmen",
       subtitle = "By Number of Membership to Congress ") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,
                                   vjust = 0.5),
        panel.grid = element_blank())

ggplot(Top10) +
  geom_col(aes(x = Name,
               y = Total,
               fill = Gender)) +
  labs(title = "Top 10 Congresmen",
       subtitle = "By Number of Membership to Congress ") +
  facet_wrap(~Gender,
             scales = "free") +
  # Single y scale (a second one would replace the first); upper limit open
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, NA)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,
                                   vjust = 0.5),
        legend.position = "none",
        panel.spacing = unit(0.5, "lines"),
        panel.grid = element_blank())