Q1 a load dataset

# Read the tab-separated Berkeley admissions table from the course site,
# then display it.
ucb_data <- read.delim(
  file = "https://ritsokiguess.site/datafiles/berkeley.txt"
)
ucb_data

##   Dept Males.admitted Males.rejected Females.admitted Females.rejected
## 1   A             512            313               89               19
## 2   B             353            207               17                8
## 3   C             120            205              202              391
## 4   D             139            279              131              244
## 5   E              53            138               94              299
## 6   F              22            351               24              317

b

The dataset is not tidy because it contains multiple variables in its column names (e.g., “Males.admitted” and “Females.rejected” combine gender and admission status, instead of having separate “Gender” and “Admission Status” columns). As a result, it does not follow the tidy structure in which each row represents a single observation.

#install.packages("tidyverse","dplyr")

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)


# Reshape to long format: every column except Dept is stacked into Count,
# and its name (e.g. "Males.admitted") is split at the dot into Gender and
# Admission_Status.
tidy_ucb_data <- pivot_longer(
  ucb_data,
  cols = !Dept,
  names_to = c("Gender", "Admission_Status"),
  names_sep = "\\.",
  values_to = "Count"
)

# Display the reshaped table.
tidy_ucb_data

## # A tibble: 24 × 4
##    Dept  Gender  Admission_Status Count
##    <chr> <chr>   <chr>            <dbl>
##  1 "A "  Males   admitted           512
##  2 "A "  Males   rejected           313
##  3 "A "  Females admitted            89
##  4 "A "  Females rejected            19
##  5 "B "  Males   admitted           353
##  6 "B "  Males   rejected           207
##  7 "B "  Females admitted            17
##  8 "B "  Females rejected             8
##  9 "C "  Males   admitted           120
## 10 "C "  Males   rejected           205
## # ℹ 14 more rows

c

# Plot the total number of admitted and rejected applicants for each gender.
# tidy_ucb_data holds one row per department, so the counts are summed over
# departments first: with position = "dodge", geom_col() would otherwise draw
# the department bars of each gender/status pair on top of one another, and
# the visible bar would only be as tall as the largest department.
tidy_ucb_data %>%
  group_by(Gender, Admission_Status) %>%
  summarise(Count = sum(Count), .groups = "drop") %>%
  ggplot(aes(x = Admission_Status, y = Count, fill = Gender)) +
  geom_col(position = "dodge") +
  labs(title = "Number of Individuals Admitted and Rejected by Gender",
       x = "Admission Status", y = "Count") +
  theme_minimal()

This bar chart shows the number of individuals admitted and rejected at UC Berkeley, separated by gender.

Higher Male Admissions and Higher Female Rejection Rate: More males were admitted compared to females. While the number of rejected males and females appears similar, fewer females were admitted, indicating that a larger proportion of female applicants were rejected.

Possible Gender Disparity: The difference in admissions could indicate bias in the admission process or differences in application numbers across genders and departments. Further analysis, such as looking at acceptance rates (proportions) instead of raw counts, would give a clearer picture.

d

# Compute, within each department and gender, the share of applicants in each
# admission status (the admitted and rejected shares of a group sum to 1).
tidy_ucb_data <- tidy_ucb_data %>%
  group_by(Dept, Gender) %>%
  mutate(Prop = Count / sum(Count)) %>%
  # Drop the grouping again so it does not leak into later steps.
  ungroup()

# View updated data
head(tidy_ucb_data)

## # A tibble: 6 × 5
## # Groups:   Dept, Gender [3]
##   Dept  Gender  Admission_Status Count  Prop
##   <chr> <chr>   <chr>            <dbl> <dbl>
## 1 "A "  Males   admitted           512 0.621
## 2 "A "  Males   rejected           313 0.379
## 3 "A "  Females admitted            89 0.824
## 4 "A "  Females rejected            19 0.176
## 5 "B "  Males   admitted           353 0.630
## 6 "B "  Males   rejected           207 0.370

e

# Plot admission/rejection proportions by gender, one panel per department.
# Prop holds one value per department, gender and status, so faceting by Dept
# gives every bar exactly one underlying row; without the facets the dodged
# bars of the different departments are overplotted at the same position.
ggplot(tidy_ucb_data, aes(x = Admission_Status, y = Prop, fill = Gender)) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Dept)) +
  labs(
    title = "Admission Proportions by Gender",
    subtitle = "Proportions account for differences in application numbers",
    x = "Admission Status",
    y = "Proportion"
  ) +
  scale_y_continuous(labels = scales::percent) +  # Format as %
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()

Why Proportions Are Better: (1) They control for different application numbers: males and females may apply in different quantities. (2) They show actual admission rates: what percentage of each group was admitted. (3) They avoid being misled by Simpson’s Paradox: the famous UC Berkeley case showed that while women had lower overall admission rates, they often had higher rates within individual departments. Proportions are better when analyzing gender disparity because they account for different application pool sizes. Counts alone can be misleading—for example, if more males applied overall, they might have more admissions, but that does not mean they had a higher acceptance rate.

Proportion allows for a fairer comparison.

f

# Undo the reshaping. First drop the derived Prop column; ungroup() removes
# any grouping left on tidy_ucb_data so the rebuilt table is a plain tibble
# rather than one still grouped by Dept.
tidy_ucb_counts <- tidy_ucb_data %>%
  ungroup() %>%
  select(-Prop)

# Spread each Gender/Admission_Status pair back into its own count column,
# joining the two parts with a dot (e.g. "Males.admitted").
original_ucb_data <- tidy_ucb_counts %>%
  pivot_wider(
    names_from = c(Gender, Admission_Status),
    values_from = Count,
    names_sep = "."
  )

print(original_ucb_data)

## # A tibble: 6 × 5
## # Groups:   Dept [6]
##   Dept  Males.admitted Males.rejected Females.admitted Females.rejected
##   <chr>          <dbl>          <dbl>            <dbl>            <dbl>
## 1 "A "             512            313               89               19
## 2 "B "             353            207               17                8
## 3 "C "             120            205              202              391
## 4 "D "             139            279              131              244
## 5 "E "              53            138               94              299
## 6 "F "              22            351               24              317

q2 a

# Import the cognition scores from the course site with readr::read_table().
Cognition <- read_table(
  file = "https://ritsokiguess.site/datafiles/cognition.txt"
)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   id = col_double(),
##   c1 = col_double(),
##   c2 = col_double(),
##   c3 = col_double(),
##   c4 = col_double(),
##   c5 = col_double(),
##   c6 = col_double(),
##   c7 = col_double(),
##   c8 = col_double(),
##   c9 = col_double(),
##   c10 = col_double(),
##   c11 = col_double(),
##   c12 = col_double(),
##   c13 = col_double(),
##   c14 = col_double(),
##   c15 = col_double(),
##   c16 = col_double(),
##   c17 = col_double(),
##   c18 = col_double()
## )

## Warning: 1 parsing failure.
## row col   expected     actual                                                file
## 101  -- 19 columns 20 columns 'https://ritsokiguess.site/datafiles/cognition.txt'

# Display the imported tibble to check its dimensions and column types.
Cognition

## # A tibble: 201 × 19
##       id    c1    c2    c3    c4    c5    c6    c7    c8    c9   c10   c11   c12
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     1     1     3     4     4     3     4     3     4     4     3     4     3
##  2     2     2     4     1     1     2     1     4     2     2     4     4     3
##  3     3     4     4     1     1     1     3     1     3     2     4     5     1
##  4     4     3     1     4     5     4     3     3     3     5     1     1     2
##  5     5     4     4     2     2     1     4     2     2     1     4     5     1
##  6     6     5     5     1     1     1     5     1     3     1     5     5     1
##  7     7     3     3     1     2     2     1     4     3     2     4     4     2
##  8     8     3     4     1     1     1     3     2     3     3     4     5     1
##  9     9     4     4     1     2     2     4     4     3     1     5     5     2
## 10    10     4     4     2     2     1     3     5     2     2     4     5     1
## # ℹ 191 more rows
## # ℹ 6 more variables: c13 <dbl>, c14 <dbl>, c15 <dbl>, c16 <dbl>, c17 <dbl>,
## #   c18 <dbl>

b

# Stack the score columns into long format: one row per id and test.
# The leading "c" is stripped from the column names to form the test label,
# and rows whose score is missing are dropped.
pivot_longer(
  Cognition,
  cols = !id,
  names_to = "test",
  names_prefix = "c",
  values_to = "perform",
  values_drop_na = TRUE
)

## # A tibble: 3,604 × 3
##       id test  perform
##    <dbl> <chr>   <dbl>
##  1     1 1           1
##  2     1 2           3
##  3     1 3           4
##  4     1 4           4
##  5     1 5           3
##  6     1 6           4
##  7     1 7           3
##  8     1 8           4
##  9     1 9           4
## 10     1 10          3
## # ℹ 3,594 more rows

stac33_a5

Jinminli-1008264361

2025-03-27

Q1 a load dataset

b

c

e

f

q2 a

b