Skill Building 1

library("tidyverse")

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library("gt")
library("gapminder")
library("srvyr")

## 
## Attaching package: 'srvyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter

library("srvyrexploR")
library("fst")
library("ggridges")

print(gapminder)

## # A tibble: 1,704 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ℹ 1,694 more rows

life_exp_summary <- gapminder %>%
  filter(year %in% c(1987,2007)) %>%
  group_by(continent) %>%
  summarize(start_life = first(lifeExp), 
            end_life = last(lifeExp), 
            change = end_life - start_life, .groups = "drop") %>%
  arrange(desc(change))
  
life_exp_summary #code from your email.

## # A tibble: 5 × 4
##   continent start_life end_life change
##   <fct>          <dbl>    <dbl>  <dbl>
## 1 Asia            40.8     62.7  21.9 
## 2 Europe          72       79.4   7.42
## 3 Oceania         76.3     80.2   3.88
## 4 Americas        70.8     73.7   2.97
## 5 Africa          65.8     43.5 -22.3

life_exp_summary %>%
  gt(
    rowname_col = "row", 
    groupname_col = "group") %>%
  cols_label(
    continent = md("**Continent**"), #using md and ** to make the column names bold!
    start_life = md("**1987**"), 
    end_life = md("**2007**"), 
    change = md("**Change**")) %>%
  tab_header(
    title = "Life Expectancy Changes by Continent", #choosing title and subtitle names
    subtitle = "Average life expectancy in years") %>%
 fmt_number(
   columns = start_life, decimals = 1) %>% #setting all decimal places to one
  fmt_number(
    columns = end_life, decimals = 1) %>%
  fmt_number(
    columns = change, decimals = 1) %>%
  tab_source_note(
    source_note = "Data: Gapminder") #adding source note.

Continent	1987	2007	Change
Life Expectancy Changes by Continent
Average life expectancy in years
Asia	40.8	62.7	21.9
Europe	72.0	79.4	7.4
Oceania	76.3	80.2	3.9
Americas	70.8	73.7	3.0
Africa	65.8	43.5	−22.3
Data: Gapminder

life_exp_country <- gapminder %>%
  filter(year %in% c(1987,2007)) %>%
  group_by(continent, country) %>%
  summarize(
    start_life = first(lifeExp), 
    end_life = last(lifeExp), 
    change = end_life - start_life, avg_life = mean(lifeExp), .groups = "drop") %>%
  arrange(avg_life) %>%
  arrange(desc(change))
life_exp_country #code from your email, thank you professor!

## # A tibble: 142 × 6
##    continent country    start_life end_life change avg_life
##    <fct>     <fct>           <dbl>    <dbl>  <dbl>    <dbl>
##  1 Africa    Niger            44.6     56.9   12.3     50.7
##  2 Africa    Eritrea          46.5     58.0   11.6     52.2
##  3 Africa    Egypt            59.8     71.3   11.5     65.6
##  4 Asia      Vietnam          62.8     74.2   11.4     68.5
##  5 Asia      Nepal            52.5     63.8   11.2     58.2
##  6 Asia      Bangladesh       52.8     64.1   11.2     58.4
##  7 Americas  Nicaragua        62.0     72.9   10.9     67.5
##  8 Asia      Indonesia        60.1     70.6   10.5     65.4
##  9 Africa    Guinea           45.6     56.0   10.5     50.8
## 10 Africa    Comoros          54.9     65.2   10.2     60.0
## # ℹ 132 more rows

key_cases <- life_exp_country %>%
  filter(country %in% c("Niger", "Bangladesh", "El Salvador", "Iraq", "Zimbabwe"))
print(key_cases) #used this code from your email.

## # A tibble: 5 × 6
##   continent country     start_life end_life change avg_life
##   <fct>     <fct>            <dbl>    <dbl>  <dbl>    <dbl>
## 1 Africa    Niger             44.6     56.9  12.3      50.7
## 2 Asia      Bangladesh        52.8     64.1  11.2      58.4
## 3 Americas  El Salvador       63.2     71.9   8.72     67.5
## 4 Asia      Iraq              65.0     59.5  -5.50     62.3
## 5 Africa    Zimbabwe          62.4     43.5 -18.9      52.9

ggplot(key_cases, mapping =  aes(y = start_life, x = country)) +
  geom_point(
    aes(y = start_life, x = country, colour = "1987"), size = 2.5) + #assigning 1987 to colour so we have a value for the legend
  geom_point(
    aes(y = end_life, x = country, colour = "2007"), size = 2.5) + #same thing here, assigning 2007 to colour so it appears in the legend
  labs(
    title = "Life Expectancy Trajectories (1987-2007)", 
    subtitle = "in Selected Countries", 
    x = "Life Expectancy (Years)", 
    y = "Years") + #adding titles
  theme_minimal() + 
  theme(
    legend.position = "bottom", #putting the legend at the bottom
    legend.title = element_text(size = 0), #I thought of this to get rid of the legend title, if I just made the text 0 it would not show! Probably not a very reliable way to do it but it worked this time.
    legend.text = element_text(size = 10), 
    plot.title = element_text(face = "bold", #making the title of the plot bold
                              size = 14), 
    plot.subtitle = element_text(size = 12), 
    axis.line = element_line(color = "black", 
                             linewidth = 1.5)) #sizes

INTERPRETATION: The biggest change that stood out to me was between Asia and Africa, they basically switched places. I wonder what factors contributed to Asia’s life expectancy increasing by 20 years, and what contributed to Africa’s life expectancy decreasing by 20 years. Off the top of my head, I immediately think about access to health care and basic necessities. Asia is definitely more developed overall than Africa is. Of course, there needs to be other factors to account for such a big change. The Americas and Oceania were the most consistent out of the continents, with Europe having a still big increase of 7.4 years. Looking at the 1987 values and 2007 values, they look very similar for the most part, with a small overall increase on the planet.

Regarding the countries plot, the two African countries have the biggest change, positive or negative. However the average of the two is still negative, which leads towards the data in the continents analysis. We can do a similar comparison with the two Asian countries in the chart, which have the second biggest changes. The average of the two Asian countries is postive, which again supports the continent data.

QUESTION 2

table(anes_2020$TrustPeople)

## 
##              Always    Most of the time About half the time    Some of the time 
##                  48                3511                2020                1597 
##               Never 
##                 264

table(anes_2020$AgeGroup) #table for both variables

## 
##       18-29       30-39       40-49       50-59       60-69 70 or older 
##         871        1241        1081        1200        1436        1330

total_valid <- anes_2020 %>%
  filter(!is.na(TrustPeople), !is.na(AgeGroup)) %>%
  nrow()
total_valid #calculating the total number of respondants

## [1] 7153

people_check <- c("Always", "Most of the time", "About half the time", "Some of the time", "Never") #created a variable to calculate valid_pct

trust_props <- anes_2020 %>%
  filter(!is.na(TrustPeople), !is.na(AgeGroup)) %>% #removing empty values
  group_by(AgeGroup) %>%
  count(TrustPeople) %>%
   mutate(
    prop = n/sum(n),
    pct = round(100 * prop, 1),
    valid_pct = ifelse(TrustPeople %in% people_check, n/sum(n) * 100, NA)
  )
trust_props

## # A tibble: 30 × 6
## # Groups:   AgeGroup [6]
##    AgeGroup TrustPeople             n    prop   pct valid_pct
##    <fct>    <fct>               <int>   <dbl> <dbl>     <dbl>
##  1 18-29    Always                  7 0.00804   0.8     0.804
##  2 18-29    Most of the time      268 0.308    30.8    30.8  
##  3 18-29    About half the time   278 0.319    31.9    31.9  
##  4 18-29    Some of the time      246 0.282    28.2    28.2  
##  5 18-29    Never                  72 0.0827    8.3     8.27 
##  6 30-39    Always                 10 0.00807   0.8     0.807
##  7 30-39    Most of the time      502 0.405    40.5    40.5  
##  8 30-39    About half the time   378 0.305    30.5    30.5  
##  9 30-39    Some of the time      281 0.227    22.7    22.7  
## 10 30-39    Never                  68 0.0549    5.5     5.49 
## # ℹ 20 more rows

#I was not sure if valid_pct is what you showed us how to do in the video tutorial on 2.a, so I calculated it again just to make sure that I had it.

trust_props_table <- trust_props %>%
  gt(
    rowname_col = "row", groupname_col = "group") %>%
  cols_label(
    AgeGroup = md("**Age Group**"), #md and ** makes column header bold
    TrustPeople = ("Trust People"), 
    n = ("Respondents"), 
    prop = "prop", 
    pct = ("Percent"), 
    valid_pct = ("Percent")) %>% #changing column names
  tab_header(
    title = md("**Interpersonal Trust by Age Group**"), #bold title
    subtitle = "Distribution of responses (percentages)") %>%
  fmt_number(
    columns = prop, decimals = 1) %>% #making percentages to 1 decimal place.
  fmt_number(
    columns = valid_pct, decimals = 1) %>%
  tab_source_note(
    source_note = md("Data: ANES 2020 (*sample size value*)")) %>% #I learned that md and * makes selected text italic! Added the source note here
  cols_hide(prop) %>% #I decided to remove prop and percent from the table and leave in valid percent because it felt very redundant, and doing this would make it clearer.
  cols_hide(pct)
trust_props_table

Age Group	Trust People	Respondents	Percent
Interpersonal Trust by Age Group
Distribution of responses (percentages)
18-29	Always	7	0.8
18-29	Most of the time	268	30.8
18-29	About half the time	278	31.9
18-29	Some of the time	246	28.2
18-29	Never	72	8.3
30-39	Always	10	0.8
30-39	Most of the time	502	40.5
30-39	About half the time	378	30.5
30-39	Some of the time	281	22.7
30-39	Never	68	5.5
40-49	Always	8	0.7
40-49	Most of the time	476	44.1
40-49	About half the time	314	29.1
40-49	Some of the time	247	22.9
40-49	Never	35	3.2
50-59	Always	2	0.2
50-59	Most of the time	586	48.9
50-59	About half the time	325	27.1
50-59	Some of the time	249	20.8
50-59	Never	37	3.1
60-69	Always	10	0.7
60-69	Most of the time	752	52.4
60-69	About half the time	362	25.2
60-69	Some of the time	284	19.8
60-69	Never	27	1.9
70 or older	Always	8	0.6
70 or older	Most of the time	787	59.2
70 or older	About half the time	287	21.6
70 or older	Some of the time	230	17.3
70 or older	Never	17	1.3
Data: ANES 2020 (sample size value)

ggplot(data = trust_props %>% 
           filter(!is.na(TrustPeople), !is.na(AgeGroup)), #removing empty values for both categories (which should already be done but just in case)
    mapping = aes(x = AgeGroup, y = prop, fill = TrustPeople) #used prop for y instead of pct because when I switched the y scale to percent it multiplied the pct value by 100, which gave me 10,000%.
) +
    geom_bar(position = "stack", stat = "identity", #google helped me with this, my plot was showing all response levels (always, never, etc) equal across age groups at 20% each. This fixed it!
        color = "white",
        alpha = 0.9
    ) +
    scale_fill_viridis_d( #the requested color palette
        option = "mako",
        direction = -1
    ) +
    scale_y_continuous(
        labels = scales::percent, #changing the scale to percent, as mentionned above.
    ) +
    labs(
        title = "Interpersonal Trust Distribution by Age Group", #changing all the names, and the name of the legend.
        x = "Age Group",
        y = "Percentage by Age Group",
        fill = "Level of Trust"
    ) +
    theme_minimal() +
    theme(
        legend.position = "right",
        legend.title = element_text(face = "bold"), #making the legend title bold, and to the right of the plot
        plot.title = element_text(face = "bold", size = 14), #bolding the title
        axis.text = element_text(size = 11), #changing the axis text size.
    )

INTERPRETATION: The first thing that stood out to me in regards to age and trust patterns is that the “Always” category was very small, almost negligible in each age group. It is barely visible at the very top of the bars. However in moving towards “Most of the time”, you can see that this category increases as we move down the x axis. This could be from a couple things. Generally speaking, young adults are becoming more critical or less trusting of many things. In parallel, we can see that the “Never” response is highest in the 18-29 age category, and gets lower as it moves down the x axis. The distribution changes significantly across the age groups, the closest two age groups being 40-49 and 50-59. Over all; the younger the respondents are, the less trusting they are, and, the older the respondents are, the more trusting they are.

QUESTION 3

library(fst)
denmark_data <- read_fst("denmark_data.fst")

italy_data <- read_fst("italy_data.fst")

gt table data manipulation

ita_sample <- italy_data %>%
  filter(!is.na(sofrdst)) %>%
  nrow()
ita_sample

## [1] 2745

#used this to see the sample size for the Italy data

italy_clean <- italy_data %>%
  mutate(
    society_fair_ita = case_when(
      sofrdst == 1 ~ "Agree strongly",
      sofrdst == 2 ~ "Agree",
      sofrdst == 3 ~ "Neither agree nor disagree",
      sofrdst == 4 ~ "Disagree",
      sofrdst == 5 ~ "Disagree strongly",
      TRUE ~ NA_character_
    ), society_fair_ita = factor(society_fair_ita, levels = c("Agree strongly", "Agree", "Neither agree nor disagree", "Disagree", "Disagree strongly")))

italy_clean <- italy_clean %>%
  count(
    society_fair_ita) %>%
  filter(
    !is.na(society_fair_ita)) %>%
  mutate(
    ItalyPercent = n/sum(n) * 100)
italy_clean

##             society_fair_ita    n ItalyPercent
## 1             Agree strongly  692    25.715347
## 2                      Agree 1346    50.018580
## 3 Neither agree nor disagree  448    16.648086
## 4                   Disagree  174     6.465998
## 5          Disagree strongly   31     1.151988

#here I cleaned the data by removing empty values, and assigning factor levels to the variable. Also calculating percent and count. I did this for Italy first.

den_sample <- denmark_data %>%
  filter(!is.na(sofrdst)) %>%
  nrow()
#sample size for Denmark data

denmark_clean <- denmark_data %>%
  mutate(
    society_fair_den = case_when(
      sofrdst == 1 ~ "Agree strongly",
      sofrdst == 2 ~ "Agree",
      sofrdst == 3 ~ "Neither agree nor disagree",
      sofrdst == 4 ~ "Disagree",
      sofrdst == 5 ~ "Disagree strongly",
      TRUE ~ NA_character_
    ), society_fair_den = factor(society_fair_den, levels = c("Agree strongly", "Agree", "Neither agree nor disagree", "Disagree", "Disagree strongly"))) 

denmark_clean <- denmark_clean %>%
  count(
    society_fair_den) %>%
  filter(
    !is.na(society_fair_den)) %>%
  mutate(
    DenmarkPercent = n/sum(n) * 100) 
denmark_clean

##             society_fair_den   n DenmarkPercent
## 1             Agree strongly  79       5.103359
## 2                      Agree 268      17.312661
## 3 Neither agree nor disagree 325      20.994832
## 4                   Disagree 674      43.540052
## 5          Disagree strongly 202      13.049096

#did the same thing for Denmark as I did for Italy previously. Removing empty values, assigning factor levels, calculating percent and count.

all_data <- cbind(denmark_clean, italy_clean) 
colnames(all_data) <- c("Society Opinion", "Denmark Respondants", "Denmark Percent", "Society Opinions", "Italy Respondants", "Italy Percent") 

all_data

##              Society Opinion Denmark Respondants Denmark Percent
## 1             Agree strongly                  79        5.103359
## 2                      Agree                 268       17.312661
## 3 Neither agree nor disagree                 325       20.994832
## 4                   Disagree                 674       43.540052
## 5          Disagree strongly                 202       13.049096
##             Society Opinions Italy Respondants Italy Percent
## 1             Agree strongly               692     25.715347
## 2                      Agree              1346     50.018580
## 3 Neither agree nor disagree               448     16.648086
## 4                   Disagree               174      6.465998
## 5          Disagree strongly                31      1.151988

#Here I combined the columns using cbind to set up the data for a table creation. As we discussed today in your office, it would have been best to do a rbind before manipulating the data. However, I do like how my table turned out so I am just going to leave it how it is.

all_data %>%
  gt(rowname_col = "row", groupname_col = "group") %>%
  cols_hide(
    `Society Opinions`) %>%
  tab_header(
    title = md("**Views on Fair Income Distribution**"), 
    subtitle = "Response distribution by country (%)") %>%
  fmt_number(
    columns = `Denmark Percent`, decimals = 1) %>%
  fmt_number(
    columns = `Italy Percent`, decimals = 1) %>%
  tab_spanner(#using tab spanner to create column names over the columns.
    label = md("**Denmark**"),
    columns = c(
      `Denmark Respondants`, `Denmark Percent`)) %>%
  tab_spanner(
    label = md("**Italy**"), columns = c(
      `Italy Respondants`, `Italy Percent`)) %>%
  tab_spanner(
    label = md("**Country**"), columns = c(
      `Denmark Respondants`, `Denmark Percent`, `Italy Respondants`, `Italy Percent`)) %>%
  cols_label(
    `Denmark Respondants` = "Response Count", #here I just changed the column titles to fit better.
    `Denmark Percent` = "Percent (%)", 
    `Italy Respondants` = "Response Count", 
    `Italy Percent` = "Percent (%)", 
    `Society Opinion` = md("**Respondent's Opinions**")) %>%
  tab_source_note(source_note = c("Denmark Sample Size:", den_sample, "Italy Sample Size:", ita_sample)) #added a source note. Now, I know it is not pretty, I could not get them to line up side by side, like to say 'Denmark Sample Size: 1572'. Would I have to combine these values to achieve this?

	Country
Views on Fair Income Distribution
Response distribution by country (%)
Respondent’s Opinions	Denmark		Italy
Respondent’s Opinions	Response Count	Percent (%)	Response Count	Percent (%)
Agree strongly	79	5.1	692	25.7
Agree	268	17.3	1346	50.0
Neither agree nor disagree	325	21.0	448	16.6
Disagree	674	43.5	174	6.5
Disagree strongly	202	13.0	31	1.2
Denmark Sample Size:
1572
Italy Sample Size:
2745

Visualization data manipulation

combined_data <- bind_rows(
  denmark_data %>% mutate(cntry = "Denmark"),
  italy_data  %>% mutate(cntry = "Italy"),
)

density_plot <- combined_data %>%
  filter(
    !is.na(sofrdst),
    sofrdst <= 5
  )  %>%
ggplot(aes(
        x = (sofrdst),
        y = cntry,
        fill = cntry
    )
) +
    geom_density_ridges(
        alpha = 0.7
    ) +
    scale_fill_brewer(palette = "Set1") +
    scale_x_continuous(
        breaks = 1:5,
        labels = c("Agree\nstrongly", "Agree", "Neither agree\nnor disagree", "Disagree", "Disagree\nstrongly")
    ) +
    labs(
        title = "Distribution of Views on Income Equality",
        subtitle = "Comparison between Italy and Denmark",
        x = "Resopondent's Opinion",
        y = NULL 
    ) +
    theme_minimal() +
    theme(legend.position = "none")
#density plot that you built for me in your office :)
density_plot

## Picking joint bandwidth of 0.146

edu_data <- combined_data %>%
  mutate(
    education = case_when(
      sofrdst %in% c(1:2) ~ "Lower Education",   #comment on this section below
      sofrdst %in% c(3:5) ~ "Higher Education",       
      TRUE ~ NA_character_
    ),
    education = factor(education)
  )
edu_data <- edu_data %>%
  filter(!is.na(sofrdst), !is.na(education)) %>%
  group_by(education, cntry) %>%
  count(sofrdst) %>%
  mutate(
    total = sum(n),
    percent = round(100 * n/total, 1)
  )

print(edu_data)

## # A tibble: 10 × 6
## # Groups:   education, cntry [4]
##    education        cntry   sofrdst     n total percent
##    <fct>            <chr>     <dbl> <int> <int>   <dbl>
##  1 Higher Education Denmark       3   325  1201    27.1
##  2 Higher Education Denmark       4   674  1201    56.1
##  3 Higher Education Denmark       5   202  1201    16.8
##  4 Higher Education Italy         3   448   653    68.6
##  5 Higher Education Italy         4   174   653    26.6
##  6 Higher Education Italy         5    31   653     4.7
##  7 Lower Education  Denmark       1    79   347    22.8
##  8 Lower Education  Denmark       2   268   347    77.2
##  9 Lower Education  Italy         1   692  2038    34  
## 10 Lower Education  Italy         2  1346  2038    66

edu_plot <- edu_data %>%
  filter(
    !is.na(sofrdst)
  )  %>%
ggplot(aes(
        x = (sofrdst),
        y = education,
        fill = education
    )
) +
  facet_wrap(~cntry) +
  geom_density_ridges(
        alpha = 0.7
    ) +
    scale_fill_brewer(palette = "Set1") +
    scale_x_continuous(
        breaks = 1:5,
        labels = c("Agree\nstrongly", "Agree", "Neither agree\nnor disagree", "Disagree", "Disagree\nstrongly")
    ) +
    labs(
        title = "Views on Income Distribution by Education Level",
        subtitle = "Comparing Italy and Denmark",
        x = "Resopondent's Opinion",
        y = NULL 
    ) +
    theme_minimal() +
    theme(legend.position = "none") + 
  labs()
edu_plot

## Picking joint bandwidth of 0.416
## Picking joint bandwidth of 0.416

#I don't really understand this, I understood it in your office but lost it when I got home. There should be two spikes for each color, because not all agreements are in Italy. I cannot figure this out. the edu_data makes sense, but it is not plotting correctly. I know it said to facet, which I was able to do but it is not showing for the 1:2 range, which is interesting. I tried to follow the third tutorial steps, but to not much success.

INTERPRETATION: It seems like in the first visualization, Italy agrees more and Denmark disagrees more. This is very interesting, the data almost looks like opposites for the countries. I cannot comment on the educational patterns in the countries, as I cannot get the plot to work for me, and I am unwilling to make guesses without the work being completed.

There was a lot to take from this assignment. I have learned a new level of patience, but more importantly, I learned how to approach coding problems by steps to get to the solution. It is important to keep your mind open, but critical. One thing I would say is that if there were no delays on the assignement, I would not have gotten it done in time. Would we be able to receive the assignements a little earlier? Thank you for all your help on this, I definitely feel more comfortable trying things and making mistakes in R.

Skill Building 1

2025-01-28