library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("gt")
library("gapminder")
library("srvyr")
## 
## Attaching package: 'srvyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
library("srvyrexploR")
library("fst")
library("ggridges")
print(gapminder)
## # A tibble: 1,704 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ℹ 1,694 more rows
life_exp_summary <- gapminder %>%
  filter(year %in% c(1987,2007)) %>%
  group_by(continent) %>%
  summarize(start_life = first(lifeExp), 
            end_life = last(lifeExp), 
            change = end_life - start_life, .groups = "drop") %>%
  arrange(desc(change))
  
life_exp_summary #code from your email.
## # A tibble: 5 × 4
##   continent start_life end_life change
##   <fct>          <dbl>    <dbl>  <dbl>
## 1 Asia            40.8     62.7  21.9 
## 2 Europe          72       79.4   7.42
## 3 Oceania         76.3     80.2   3.88
## 4 Americas        70.8     73.7   2.97
## 5 Africa          65.8     43.5 -22.3
life_exp_summary %>%
  gt(
    rowname_col = "row", 
    groupname_col = "group") %>%
  cols_label(
    continent = md("**Continent**"), #using md and ** to make the column names bold!
    start_life = md("**1987**"), 
    end_life = md("**2007**"), 
    change = md("**Change**")) %>%
  tab_header(
    title = "Life Expectancy Changes by Continent", #choosing title and subtitle names
    subtitle = "Average life expectancy in years") %>%
 fmt_number(
   columns = start_life, decimals = 1) %>% #setting all decimal places to one
  fmt_number(
    columns = end_life, decimals = 1) %>%
  fmt_number(
    columns = change, decimals = 1) %>%
  tab_source_note(
    source_note = "Data: Gapminder") #adding source note.
Life Expectancy Changes by Continent
Average life expectancy in years
Continent 1987 2007 Change
Asia 40.8 62.7 21.9
Europe 72.0 79.4 7.4
Oceania 76.3 80.2 3.9
Americas 70.8 73.7 3.0
Africa 65.8 43.5 −22.3
Data: Gapminder
life_exp_country <- gapminder %>%
  filter(year %in% c(1987,2007)) %>%
  group_by(continent, country) %>%
  summarize(
    start_life = first(lifeExp), 
    end_life = last(lifeExp), 
    change = end_life - start_life, avg_life = mean(lifeExp), .groups = "drop") %>%
  arrange(avg_life) %>%
  arrange(desc(change))
life_exp_country #code from your email, thank you professor!
## # A tibble: 142 × 6
##    continent country    start_life end_life change avg_life
##    <fct>     <fct>           <dbl>    <dbl>  <dbl>    <dbl>
##  1 Africa    Niger            44.6     56.9   12.3     50.7
##  2 Africa    Eritrea          46.5     58.0   11.6     52.2
##  3 Africa    Egypt            59.8     71.3   11.5     65.6
##  4 Asia      Vietnam          62.8     74.2   11.4     68.5
##  5 Asia      Nepal            52.5     63.8   11.2     58.2
##  6 Asia      Bangladesh       52.8     64.1   11.2     58.4
##  7 Americas  Nicaragua        62.0     72.9   10.9     67.5
##  8 Asia      Indonesia        60.1     70.6   10.5     65.4
##  9 Africa    Guinea           45.6     56.0   10.5     50.8
## 10 Africa    Comoros          54.9     65.2   10.2     60.0
## # ℹ 132 more rows
key_cases <- life_exp_country %>%
  filter(country %in% c("Niger", "Bangladesh", "El Salvador", "Iraq", "Zimbabwe"))
print(key_cases) #used this code from your email.
## # A tibble: 5 × 6
##   continent country     start_life end_life change avg_life
##   <fct>     <fct>            <dbl>    <dbl>  <dbl>    <dbl>
## 1 Africa    Niger             44.6     56.9  12.3      50.7
## 2 Asia      Bangladesh        52.8     64.1  11.2      58.4
## 3 Americas  El Salvador       63.2     71.9   8.72     67.5
## 4 Asia      Iraq              65.0     59.5  -5.50     62.3
## 5 Africa    Zimbabwe          62.4     43.5 -18.9      52.9
ggplot(key_cases, mapping =  aes(y = start_life, x = country)) +
  geom_point(
    aes(y = start_life, x = country, colour = "1987"), size = 2.5) + #assigning 1987 to colour so we have a value for the legend
  geom_point(
    aes(y = end_life, x = country, colour = "2007"), size = 2.5) + #same thing here, assigning 2007 to colour so it appears in the legend
  labs(
    title = "Life Expectancy Trajectories (1987-2007)", 
    subtitle = "in Selected Countries", 
    x = "Life Expectancy (Years)", 
    y = "Years") + #adding titles
  theme_minimal() + 
  theme(
    legend.position = "bottom", #putting the legend at the bottom
    legend.title = element_text(size = 0), #I thought of this to get rid of the legend title, if I just made the text 0 it would not show! Probably not a very reliable way to do it but it worked this time.
    legend.text = element_text(size = 10), 
    plot.title = element_text(face = "bold", #making the title of the plot bold
                              size = 14), 
    plot.subtitle = element_text(size = 12), 
    axis.line = element_line(color = "black", 
                             linewidth = 1.5)) #sizes

INTERPRETATION: The biggest change that stood out to me was between Asia and Africa, they basically switched places. I wonder what factors contributed to Asia’s life expectancy increasing by 20 years, and what contributed to Africa’s life expectancy decreasing by 20 years. Off the top of my head, I immediately think about access to health care and basic necessities. Asia is definitely more developed overall than Africa is. Of course, there needs to be other factors to account for such a big change. The Americas and Oceania were the most consistent out of the continents, with Europe having a still big increase of 7.4 years. Looking at the 1987 values and 2007 values, they look very similar for the most part, with a small overall increase on the planet.

Regarding the countries plot, the two African countries have the biggest change, positive or negative. However the average of the two is still negative, which leads towards the data in the continents analysis. We can do a similar comparison with the two Asian countries in the chart, which have the second biggest changes. The average of the two Asian countries is postive, which again supports the continent data.


QUESTION 2


table(anes_2020$TrustPeople)
## 
##              Always    Most of the time About half the time    Some of the time 
##                  48                3511                2020                1597 
##               Never 
##                 264
table(anes_2020$AgeGroup) #table for both variables
## 
##       18-29       30-39       40-49       50-59       60-69 70 or older 
##         871        1241        1081        1200        1436        1330
total_valid <- anes_2020 %>%
  filter(!is.na(TrustPeople), !is.na(AgeGroup)) %>%
  nrow()
total_valid #calculating the total number of respondants
## [1] 7153
people_check <- c("Always", "Most of the time", "About half the time", "Some of the time", "Never") #created a variable to calculate valid_pct

trust_props <- anes_2020 %>%
  filter(!is.na(TrustPeople), !is.na(AgeGroup)) %>% #removing empty values
  group_by(AgeGroup) %>%
  count(TrustPeople) %>%
   mutate(
    prop = n/sum(n),
    pct = round(100 * prop, 1),
    valid_pct = ifelse(TrustPeople %in% people_check, n/sum(n) * 100, NA)
  )
trust_props
## # A tibble: 30 × 6
## # Groups:   AgeGroup [6]
##    AgeGroup TrustPeople             n    prop   pct valid_pct
##    <fct>    <fct>               <int>   <dbl> <dbl>     <dbl>
##  1 18-29    Always                  7 0.00804   0.8     0.804
##  2 18-29    Most of the time      268 0.308    30.8    30.8  
##  3 18-29    About half the time   278 0.319    31.9    31.9  
##  4 18-29    Some of the time      246 0.282    28.2    28.2  
##  5 18-29    Never                  72 0.0827    8.3     8.27 
##  6 30-39    Always                 10 0.00807   0.8     0.807
##  7 30-39    Most of the time      502 0.405    40.5    40.5  
##  8 30-39    About half the time   378 0.305    30.5    30.5  
##  9 30-39    Some of the time      281 0.227    22.7    22.7  
## 10 30-39    Never                  68 0.0549    5.5     5.49 
## # ℹ 20 more rows
#I was not sure if valid_pct is what you showed us how to do in the video tutorial on 2.a, so I calculated it again just to make sure that I had it. 
trust_props_table <- trust_props %>%
  gt(
    rowname_col = "row", groupname_col = "group") %>%
  cols_label(
    AgeGroup = md("**Age Group**"), #md and ** makes column header bold
    TrustPeople = ("Trust People"), 
    n = ("Respondents"), 
    prop = "prop", 
    pct = ("Percent"), 
    valid_pct = ("Percent")) %>% #changing column names
  tab_header(
    title = md("**Interpersonal Trust by Age Group**"), #bold title
    subtitle = "Distribution of responses (percentages)") %>%
  fmt_number(
    columns = prop, decimals = 1) %>% #making percentages to 1 decimal place.
  fmt_number(
    columns = valid_pct, decimals = 1) %>%
  tab_source_note(
    source_note = md("Data: ANES 2020 (*sample size value*)")) %>% #I learned that md and * makes selected text italic! Added the source note here
  cols_hide(prop) %>% #I decided to remove prop and percent from the table and leave in valid percent because it felt very redundant, and doing this would make it clearer.
  cols_hide(pct)
trust_props_table
Interpersonal Trust by Age Group
Distribution of responses (percentages)
Age Group Trust People Respondents Percent
18-29 Always 7 0.8
18-29 Most of the time 268 30.8
18-29 About half the time 278 31.9
18-29 Some of the time 246 28.2
18-29 Never 72 8.3
30-39 Always 10 0.8
30-39 Most of the time 502 40.5
30-39 About half the time 378 30.5
30-39 Some of the time 281 22.7
30-39 Never 68 5.5
40-49 Always 8 0.7
40-49 Most of the time 476 44.1
40-49 About half the time 314 29.1
40-49 Some of the time 247 22.9
40-49 Never 35 3.2
50-59 Always 2 0.2
50-59 Most of the time 586 48.9
50-59 About half the time 325 27.1
50-59 Some of the time 249 20.8
50-59 Never 37 3.1
60-69 Always 10 0.7
60-69 Most of the time 752 52.4
60-69 About half the time 362 25.2
60-69 Some of the time 284 19.8
60-69 Never 27 1.9
70 or older Always 8 0.6
70 or older Most of the time 787 59.2
70 or older About half the time 287 21.6
70 or older Some of the time 230 17.3
70 or older Never 17 1.3
Data: ANES 2020 (sample size value)
ggplot(data = trust_props %>% 
           filter(!is.na(TrustPeople), !is.na(AgeGroup)), #removing empty values for both categories (which should already be done but just in case)
    mapping = aes(x = AgeGroup, y = prop, fill = TrustPeople) #used prop for y instead of pct because when I switched the y scale to percent it multiplied the pct value by 100, which gave me 10,000%.
) +
    geom_bar(position = "stack", stat = "identity", #google helped me with this, my plot was showing all response levels (always, never, etc) equal across age groups at 20% each. This fixed it!
        color = "white",
        alpha = 0.9
    ) +
    scale_fill_viridis_d( #the requested color palette
        option = "mako",
        direction = -1
    ) +
    scale_y_continuous(
        labels = scales::percent, #changing the scale to percent, as mentionned above.
    ) +
    labs(
        title = "Interpersonal Trust Distribution by Age Group", #changing all the names, and the name of the legend.
        x = "Age Group",
        y = "Percentage by Age Group",
        fill = "Level of Trust"
    ) +
    theme_minimal() +
    theme(
        legend.position = "right",
        legend.title = element_text(face = "bold"), #making the legend title bold, and to the right of the plot
        plot.title = element_text(face = "bold", size = 14), #bolding the title
        axis.text = element_text(size = 11), #changing the axis text size.
    ) 

INTERPRETATION: The first thing that stood out to me in regards to age and trust patterns is that the “Always” category was very small, almost negligible in each age group. It is barely visible at the very top of the bars. However in moving towards “Most of the time”, you can see that this category increases as we move down the x axis. This could be from a couple things. Generally speaking, young adults are becoming more critical or less trusting of many things. In parallel, we can see that the “Never” response is highest in the 18-29 age category, and gets lower as it moves down the x axis. The distribution changes significantly across the age groups, the closest two age groups being 40-49 and 50-59. Over all; the younger the respondents are, the less trusting they are, and, the older the respondents are, the more trusting they are.


QUESTION 3


library(fst)
denmark_data <- read_fst("denmark_data.fst")
italy_data <- read_fst("italy_data.fst")

gt table data manipulation

ita_sample <- italy_data %>%
  filter(!is.na(sofrdst)) %>%
  nrow()
ita_sample
## [1] 2745
#used this to see the sample size for the Italy data
italy_clean <- italy_data %>%
  mutate(
    society_fair_ita = case_when(
      sofrdst == 1 ~ "Agree strongly",
      sofrdst == 2 ~ "Agree",
      sofrdst == 3 ~ "Neither agree nor disagree",
      sofrdst == 4 ~ "Disagree",
      sofrdst == 5 ~ "Disagree strongly",
      TRUE ~ NA_character_
    ), society_fair_ita = factor(society_fair_ita, levels = c("Agree strongly", "Agree", "Neither agree nor disagree", "Disagree", "Disagree strongly")))

italy_clean <- italy_clean %>%
  count(
    society_fair_ita) %>%
  filter(
    !is.na(society_fair_ita)) %>%
  mutate(
    ItalyPercent = n/sum(n) * 100)
italy_clean
##             society_fair_ita    n ItalyPercent
## 1             Agree strongly  692    25.715347
## 2                      Agree 1346    50.018580
## 3 Neither agree nor disagree  448    16.648086
## 4                   Disagree  174     6.465998
## 5          Disagree strongly   31     1.151988
#here I cleaned the data by removing empty values, and assigning factor levels to the variable. Also calculating percent and count. I did this for Italy first.
den_sample <- denmark_data %>%
  filter(!is.na(sofrdst)) %>%
  nrow()
#sample size for Denmark data
denmark_clean <- denmark_data %>%
  mutate(
    society_fair_den = case_when(
      sofrdst == 1 ~ "Agree strongly",
      sofrdst == 2 ~ "Agree",
      sofrdst == 3 ~ "Neither agree nor disagree",
      sofrdst == 4 ~ "Disagree",
      sofrdst == 5 ~ "Disagree strongly",
      TRUE ~ NA_character_
    ), society_fair_den = factor(society_fair_den, levels = c("Agree strongly", "Agree", "Neither agree nor disagree", "Disagree", "Disagree strongly"))) 

denmark_clean <- denmark_clean %>%
  count(
    society_fair_den) %>%
  filter(
    !is.na(society_fair_den)) %>%
  mutate(
    DenmarkPercent = n/sum(n) * 100) 
denmark_clean
##             society_fair_den   n DenmarkPercent
## 1             Agree strongly  79       5.103359
## 2                      Agree 268      17.312661
## 3 Neither agree nor disagree 325      20.994832
## 4                   Disagree 674      43.540052
## 5          Disagree strongly 202      13.049096
#did the same thing for Denmark as I did for Italy previously. Removing empty values, assigning factor levels, calculating percent and count.
all_data <- cbind(denmark_clean, italy_clean) 
colnames(all_data) <- c("Society Opinion", "Denmark Respondants", "Denmark Percent", "Society Opinions", "Italy Respondants", "Italy Percent") 

all_data
##              Society Opinion Denmark Respondants Denmark Percent
## 1             Agree strongly                  79        5.103359
## 2                      Agree                 268       17.312661
## 3 Neither agree nor disagree                 325       20.994832
## 4                   Disagree                 674       43.540052
## 5          Disagree strongly                 202       13.049096
##             Society Opinions Italy Respondants Italy Percent
## 1             Agree strongly               692     25.715347
## 2                      Agree              1346     50.018580
## 3 Neither agree nor disagree               448     16.648086
## 4                   Disagree               174      6.465998
## 5          Disagree strongly                31      1.151988
#Here I combined the columns using cbind to set up the data for a table creation. As we discussed today in your office, it would have been best to do a rbind before manipulating the data. However, I do like how my table turned out so I am just going to leave it how it is.
all_data %>%
  gt(rowname_col = "row", groupname_col = "group") %>%
  cols_hide(
    `Society Opinions`) %>%
  tab_header(
    title = md("**Views on Fair Income Distribution**"), 
    subtitle = "Response distribution by country (%)") %>%
  fmt_number(
    columns = `Denmark Percent`, decimals = 1) %>%
  fmt_number(
    columns = `Italy Percent`, decimals = 1) %>%
  tab_spanner(#using tab spanner to create column names over the columns.
    label = md("**Denmark**"),
    columns = c(
      `Denmark Respondants`, `Denmark Percent`)) %>%
  tab_spanner(
    label = md("**Italy**"), columns = c(
      `Italy Respondants`, `Italy Percent`)) %>%
  tab_spanner(
    label = md("**Country**"), columns = c(
      `Denmark Respondants`, `Denmark Percent`, `Italy Respondants`, `Italy Percent`)) %>%
  cols_label(
    `Denmark Respondants` = "Response Count", #here I just changed the column titles to fit better.
    `Denmark Percent` = "Percent (%)", 
    `Italy Respondants` = "Response Count", 
    `Italy Percent` = "Percent (%)", 
    `Society Opinion` = md("**Respondent's Opinions**")) %>%
  tab_source_note(source_note = c("Denmark Sample Size:", den_sample, "Italy Sample Size:", ita_sample)) #added a source note. Now, I know it is not pretty, I could not get them to line up side by side, like to say 'Denmark Sample Size: 1572'. Would I have to combine these values to achieve this?
Views on Fair Income Distribution
Response distribution by country (%)
Country
Respondent’s Opinions
Denmark
Italy
Response Count Percent (%) Response Count Percent (%)
Agree strongly 79 5.1 692 25.7
Agree 268 17.3 1346 50.0
Neither agree nor disagree 325 21.0 448 16.6
Disagree 674 43.5 174 6.5
Disagree strongly 202 13.0 31 1.2
Denmark Sample Size:
1572
Italy Sample Size:
2745

Visualization data manipulation

combined_data <- bind_rows(
  denmark_data %>% mutate(cntry = "Denmark"),
  italy_data  %>% mutate(cntry = "Italy"),
)
density_plot <- combined_data %>%
  filter(
    !is.na(sofrdst),
    sofrdst <= 5
  )  %>%
ggplot(aes(
        x = (sofrdst),
        y = cntry,
        fill = cntry
    )
) +
    geom_density_ridges(
        alpha = 0.7
    ) +
    scale_fill_brewer(palette = "Set1") +
    scale_x_continuous(
        breaks = 1:5,
        labels = c("Agree\nstrongly", "Agree", "Neither agree\nnor disagree", "Disagree", "Disagree\nstrongly")
    ) +
    labs(
        title = "Distribution of Views on Income Equality",
        subtitle = "Comparison between Italy and Denmark",
        x = "Resopondent's Opinion",
        y = NULL 
    ) +
    theme_minimal() +
    theme(legend.position = "none")
#density plot that you built for me in your office :)
density_plot
## Picking joint bandwidth of 0.146

edu_data <- combined_data %>%
  mutate(
    education = case_when(
      sofrdst %in% c(1:2) ~ "Lower Education",   #comment on this section below
      sofrdst %in% c(3:5) ~ "Higher Education",       
      TRUE ~ NA_character_
    ),
    education = factor(education)
  )
edu_data <- edu_data %>%
  filter(!is.na(sofrdst), !is.na(education)) %>%
  group_by(education, cntry) %>%
  count(sofrdst) %>%
  mutate(
    total = sum(n),
    percent = round(100 * n/total, 1)
  )

print(edu_data)
## # A tibble: 10 × 6
## # Groups:   education, cntry [4]
##    education        cntry   sofrdst     n total percent
##    <fct>            <chr>     <dbl> <int> <int>   <dbl>
##  1 Higher Education Denmark       3   325  1201    27.1
##  2 Higher Education Denmark       4   674  1201    56.1
##  3 Higher Education Denmark       5   202  1201    16.8
##  4 Higher Education Italy         3   448   653    68.6
##  5 Higher Education Italy         4   174   653    26.6
##  6 Higher Education Italy         5    31   653     4.7
##  7 Lower Education  Denmark       1    79   347    22.8
##  8 Lower Education  Denmark       2   268   347    77.2
##  9 Lower Education  Italy         1   692  2038    34  
## 10 Lower Education  Italy         2  1346  2038    66
edu_plot <- edu_data %>%
  filter(
    !is.na(sofrdst)
  )  %>%
ggplot(aes(
        x = (sofrdst),
        y = education,
        fill = education
    )
) +
  facet_wrap(~cntry) +
  geom_density_ridges(
        alpha = 0.7
    ) +
    scale_fill_brewer(palette = "Set1") +
    scale_x_continuous(
        breaks = 1:5,
        labels = c("Agree\nstrongly", "Agree", "Neither agree\nnor disagree", "Disagree", "Disagree\nstrongly")
    ) +
    labs(
        title = "Views on Income Distribution by Education Level",
        subtitle = "Comparing Italy and Denmark",
        x = "Resopondent's Opinion",
        y = NULL 
    ) +
    theme_minimal() +
    theme(legend.position = "none") + 
  labs()
edu_plot 
## Picking joint bandwidth of 0.416
## Picking joint bandwidth of 0.416

#I don't really understand this, I understood it in your office but lost it when I got home. There should be two spikes for each color, because not all agreements are in Italy. I cannot figure this out. the edu_data makes sense, but it is not plotting correctly. I know it said to facet, which I was able to do but it is not showing for the 1:2 range, which is interesting. I tried to follow the third tutorial steps, but to not much success. 

INTERPRETATION: It seems like in the first visualization, Italy agrees more and Denmark disagrees more. This is very interesting, the data almost looks like opposites for the countries. I cannot comment on the educational patterns in the countries, as I cannot get the plot to work for me, and I am unwilling to make guesses without the work being completed.

There was a lot to take from this assignment. I have learned a new level of patience, but more importantly, I learned how to approach coding problems by steps to get to the solution. It is important to keep your mind open, but critical. One thing I would say is that if there were no delays on the assignement, I would not have gotten it done in time. Would we be able to receive the assignements a little earlier? Thank you for all your help on this, I definitely feel more comfortable trying things and making mistakes in R.