StatsII-Lab1

a <- sample(1:100, size = 10, replace = T)

getmode <- function(v) {
  as.numeric(names(which.max(table(v))))
}

getmode(a)

## [1] 4

median(a)

## [1] 25

mean(a)

## [1] 38.6

range(a)

## [1]  4 98

max(a)-min(a)

## [1] 94

IQR(a)

## [1] 58.75

var(a)

## [1] 1287.822

sd(a)

## [1] 35.88624

# I loaded the data above 

wages <- wage1

View(wages)
# The columbs represent average hourly earnings and a number of demogaphic variables. These include education, experience, race, marriage status, place of residence, and occupational indsutry

nrow(wages)

## [1] 526

ncol(wages)

## [1] 24

# the n is 526 (the number of rows)
# entry 35 reported 12 years of education
wages$educ[35]

## [1] 12

# confirms what I found manually

str(wages)

## tibble [526 × 24] (S3: tbl_df/tbl/data.frame)
##  $ wage    : num [1:526] 3.1 3.24 3 6 5.3 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ educ    : num [1:526] 11 12 11 8 12 16 18 12 12 17 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ exper   : num [1:526] 2 22 2 44 7 9 15 5 26 22 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ tenure  : num [1:526] 0 2 0 28 2 8 7 3 4 21 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ nonwhite: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ female  : num [1:526] 1 1 0 0 0 0 0 1 1 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ married : num [1:526] 0 1 0 1 1 1 0 0 0 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ numdep  : num [1:526] 2 3 2 0 1 0 0 0 2 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ smsa    : num [1:526] 1 1 0 1 0 1 1 1 1 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ northcen: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ south   : num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ west    : num [1:526] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ construc: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ ndurman : num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ trcommpu: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ trade   : num [1:526] 0 0 1 0 0 0 1 0 1 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ services: num [1:526] 0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ profserv: num [1:526] 0 0 0 0 0 1 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ profocc : num [1:526] 0 0 0 0 0 1 1 1 1 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ clerocc : num [1:526] 0 0 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ servocc : num [1:526] 0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ lwage   : num [1:526] 1.13 1.18 1.1 1.79 1.67 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ expersq : num [1:526] 4 484 4 1936 49 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ tenursq : num [1:526] 0 4 0 784 4 64 49 9 16 441 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"

# there are no factor variables

colSums(is.na(wages))

##     wage     educ    exper   tenure nonwhite   female  married   numdep 
##        0        0        0        0        0        0        0        0 
##     smsa northcen    south     west construc  ndurman trcommpu    trade 
##        0        0        0        0        0        0        0        0 
## services profserv  profocc  clerocc  servocc    lwage  expersq  tenursq 
##        0        0        0        0        0        0        0        0

# I think I'm doing this right but I don't see any missing variables in the dataset

head(wages)

## # A tibble: 6 × 24
##    wage  educ exper tenure nonwhite female married numdep  smsa northcen south
##   <dbl> <dbl> <dbl>  <dbl>    <dbl>  <dbl>   <dbl>  <dbl> <dbl>    <dbl> <dbl>
## 1  3.10    11     2      0        0      1       0      2     1        0     0
## 2  3.24    12    22      2        0      1       1      3     1        0     0
## 3  3       11     2      0        0      0       0      2     0        0     0
## 4  6        8    44     28        0      0       1      0     1        0     0
## 5  5.30    12     7      2        0      0       1      1     0        0     0
## 6  8.75    16     9      8        0      0       1      0     1        0     0
## # ℹ 13 more variables: west <dbl>, construc <dbl>, ndurman <dbl>,
## #   trcommpu <dbl>, trade <dbl>, services <dbl>, profserv <dbl>, profocc <dbl>,
## #   clerocc <dbl>, servocc <dbl>, lwage <dbl>, expersq <dbl>, tenursq <dbl>

wages <- wages %>%
  mutate(gender = factor(ifelse(female == 1, "Female", "Male"), 
                         levels = c("Male", "Female")))


wages <- wages %>%
  mutate(educ_level = factor(case_when(
    educ < 12 ~ "Less than High School",
    educ == 12 ~ "High School Graduate",
    educ > 12 & educ < 16 ~ "Some College",
    educ >= 16 ~ "College Graduate and above"
  ), levels = c("Less than High School", "High School Graduate", "Some College", "College Graduate and above")))

wages <- wages %>%
  mutate(region = case_when(
    northcen == 1 ~ "North Central",
    south == 1 ~ "South",
    west == 1 ~ "West",
    TRUE ~ "Other"  # default case
  )) %>%
  mutate(region = factor(region))


wages %>%
  filter(northcen ==1, south == 1, west ==1) %>%
  nrow()

## [1] 0

# used this to check for any rows that have more than 1 region and it didn't return any

wage2 <- wages[c("wage","educ_level","gender","exper")]
View(wage2)

mean(wages$wage, na.rm = TRUE)

## [1] 5.896103

median(wages$wage, na.rm = TRUE)

## [1] 4.65

sd(wages$wage, na.rm = TRUE)

## [1] 3.693086

mean(wages$exper, na.rm = TRUE)

## [1] 17.01711

median(wages$exper, na.rm = TRUE)

## [1] 13.5

sd(wages$exper, na.rm = TRUE)

## [1] 13.57216

mean(wages$tenure, na.rm = TRUE)

## [1] 5.104563

median(wages$tenure, na.rm = TRUE)

## [1] 2

sd(wages$tenure, na.rm = TRUE)

## [1] 7.224462

educ_freq <- table(wages$educ_level)
educ_percent <- prop.table(educ_freq) * 100
educ_table <- cbind(Count = educ_freq, Percentage = round(educ_percent,2))
educ_table

##                            Count Percentage
## Less than High School        116      22.05
## High School Graduate         198      37.64
## Some College                 113      21.48
## College Graduate and above    99      18.82

gender_freq <- table(wages$gender)
gender_percent <- prop.table(gender_freq) * 100
gender_table <- cbind(Count = gender_freq, Percentage = round(gender_percent, 2))
gender_table

##        Count Percentage
## Male     274      52.09
## Female   252      47.91

region_freq <- table(wages$region)
region_percent <- prop.table(region_freq) * 100
region_table <- cbind(Count = region_freq, Percentage = round(region_percent, 2))
region_table

##               Count Percentage
## North Central   132      25.10
## Other           118      22.43
## South           187      35.55
## West             89      16.92

combined_table <- list( 
  Education = educ_table, 
  Gender = gender_table, 
  Region = region_table 
  ) 
combined_table

## $Education
##                            Count Percentage
## Less than High School        116      22.05
## High School Graduate         198      37.64
## Some College                 113      21.48
## College Graduate and above    99      18.82
## 
## $Gender
##        Count Percentage
## Male     274      52.09
## Female   252      47.91
## 
## $Region
##               Count Percentage
## North Central   132      25.10
## Other           118      22.43
## South           187      35.55
## West             89      16.92

datasummary_skim(wage2)

	Unique	Missing Pct.	Mean	SD	Min	Median	Max
wage	241	0	5.9	3.7	0.5	4.7	25.0
exper	51	0	17.0	13.6	1.0	13.5	51.0
		N	%
educ_level	Less than High School	116	22.1
	High School Graduate	198	37.6
	Some College	113	21.5
	College Graduate and above	99	18.8
gender	Male	274	52.1
	Female	252	47.9

# Most respondents have only a high school education
mean(wage2$exper)

## [1] 17.01711

# The average years of experience amongst the respondents is 17



wage2 %>%
  group_by(educ_level) %>%
  summarise(
    Mean = mean(wage, na.rm = TRUE)
  )

## # A tibble: 4 × 2
##   educ_level                  Mean
##   <fct>                      <dbl>
## 1 Less than High School       4.06
## 2 High School Graduate        5.37
## 3 Some College                6.03
## 4 College Graduate and above  8.95

# College graduates have the highest average hourly wage

wage2 %>%
  group_by(educ_level) %>%
  summarise(
    Mean = mean(wage, na.rm = TRUE),
    SD = sd(wage,na.rm = TRUE)
  )

## # A tibble: 4 × 3
##   educ_level                  Mean    SD
##   <fct>                      <dbl> <dbl>
## 1 Less than High School       4.06  1.99
## 2 High School Graduate        5.37  3.09
## 3 Some College                6.03  3.32
## 4 College Graduate and above  8.95  4.75

# Standard deviation of the education level with the second highest earnings is 3.319543

# Based on these descriptive statistics, I would like to perform an OLS regression on education and wages to understand the magnitude of how the former informs the latter. This would also be useful to understand the statistical significance of this relationship.

ggplot(wages, aes(x = educ_level, y = wage)) +
  geom_boxplot() +
  labs(title = "Figure 1. Average Hourly Earnings by Education Level",
       x = "Education Level",
       y = "Hourly Earnings",
       caption = "Notes: n=BLANK; Wooldridge Data Set") +
  theme_minimal()

mean_wages <- wages %>%
  group_by(educ_level) %>%
  summarise(Mean = mean(wage, na.rm = TRUE))

ggplot(wages, aes(x = wage, fill = educ_level)) +
  geom_density(alpha = 0.7) +
  geom_vline(data = mean_wages, aes(xintercept = Mean, color = educ_level), linetype = "dashed") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Figure 2. Average Hourly Earnings by Education Level",
       x = "Hourly Earnings",
       y = "Density",
       caption = "Notes: n=BLANK; Wooldridge Data Set") +
  theme_minimal()

# I think I did this right? I'm not entirely confident in this part.

StatsII-Lab1

Matthew Reyes

2026-01-28