a <- sample(1:100, size = 10, replace = T)

getmode <- function(v) {
  as.numeric(names(which.max(table(v))))
}

getmode(a)
## [1] 4
median(a)
## [1] 25
mean(a)
## [1] 38.6
range(a)
## [1]  4 98
max(a)-min(a)
## [1] 94
IQR(a)
## [1] 58.75
var(a)
## [1] 1287.822
sd(a)
## [1] 35.88624
# I loaded the data above 

wages <- wage1

View(wages)
# The columbs represent average hourly earnings and a number of demogaphic variables. These include education, experience, race, marriage status, place of residence, and occupational indsutry

nrow(wages)
## [1] 526
ncol(wages)
## [1] 24
# the n is 526 (the number of rows)
# entry 35 reported 12 years of education
wages$educ[35]
## [1] 12
# confirms what I found manually

str(wages)
## tibble [526 × 24] (S3: tbl_df/tbl/data.frame)
##  $ wage    : num [1:526] 3.1 3.24 3 6 5.3 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ educ    : num [1:526] 11 12 11 8 12 16 18 12 12 17 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ exper   : num [1:526] 2 22 2 44 7 9 15 5 26 22 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ tenure  : num [1:526] 0 2 0 28 2 8 7 3 4 21 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ nonwhite: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ female  : num [1:526] 1 1 0 0 0 0 0 1 1 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ married : num [1:526] 0 1 0 1 1 1 0 0 0 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ numdep  : num [1:526] 2 3 2 0 1 0 0 0 2 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ smsa    : num [1:526] 1 1 0 1 0 1 1 1 1 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ northcen: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ south   : num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ west    : num [1:526] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ construc: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ ndurman : num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ trcommpu: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ trade   : num [1:526] 0 0 1 0 0 0 1 0 1 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ services: num [1:526] 0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ profserv: num [1:526] 0 0 0 0 0 1 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ profocc : num [1:526] 0 0 0 0 0 1 1 1 1 1 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ clerocc : num [1:526] 0 0 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ servocc : num [1:526] 0 1 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ lwage   : num [1:526] 1.13 1.18 1.1 1.79 1.67 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ expersq : num [1:526] 4 484 4 1936 49 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ tenursq : num [1:526] 0 4 0 784 4 64 49 9 16 441 ...
##   ..- attr(*, "format.stata")= chr "%9.0g"
# there are no factor variables

colSums(is.na(wages))
##     wage     educ    exper   tenure nonwhite   female  married   numdep 
##        0        0        0        0        0        0        0        0 
##     smsa northcen    south     west construc  ndurman trcommpu    trade 
##        0        0        0        0        0        0        0        0 
## services profserv  profocc  clerocc  servocc    lwage  expersq  tenursq 
##        0        0        0        0        0        0        0        0
# I think I'm doing this right but I don't see any missing variables in the dataset
head(wages)
## # A tibble: 6 × 24
##    wage  educ exper tenure nonwhite female married numdep  smsa northcen south
##   <dbl> <dbl> <dbl>  <dbl>    <dbl>  <dbl>   <dbl>  <dbl> <dbl>    <dbl> <dbl>
## 1  3.10    11     2      0        0      1       0      2     1        0     0
## 2  3.24    12    22      2        0      1       1      3     1        0     0
## 3  3       11     2      0        0      0       0      2     0        0     0
## 4  6        8    44     28        0      0       1      0     1        0     0
## 5  5.30    12     7      2        0      0       1      1     0        0     0
## 6  8.75    16     9      8        0      0       1      0     1        0     0
## # ℹ 13 more variables: west <dbl>, construc <dbl>, ndurman <dbl>,
## #   trcommpu <dbl>, trade <dbl>, services <dbl>, profserv <dbl>, profocc <dbl>,
## #   clerocc <dbl>, servocc <dbl>, lwage <dbl>, expersq <dbl>, tenursq <dbl>
wages <- wages %>%
  mutate(gender = factor(ifelse(female == 1, "Female", "Male"), 
                         levels = c("Male", "Female")))


wages <- wages %>%
  mutate(educ_level = factor(case_when(
    educ < 12 ~ "Less than High School",
    educ == 12 ~ "High School Graduate",
    educ > 12 & educ < 16 ~ "Some College",
    educ >= 16 ~ "College Graduate and above"
  ), levels = c("Less than High School", "High School Graduate", "Some College", "College Graduate and above")))

wages <- wages %>%
  mutate(region = case_when(
    northcen == 1 ~ "North Central",
    south == 1 ~ "South",
    west == 1 ~ "West",
    TRUE ~ "Other"  # default case
  )) %>%
  mutate(region = factor(region))


wages %>%
  filter(northcen ==1, south == 1, west ==1) %>%
  nrow()
## [1] 0
# used this to check for any rows that have more than 1 region and it didn't return any

wage2 <- wages[c("wage","educ_level","gender","exper")]
View(wage2)
mean(wages$wage, na.rm = TRUE)
## [1] 5.896103
median(wages$wage, na.rm = TRUE)
## [1] 4.65
sd(wages$wage, na.rm = TRUE)
## [1] 3.693086
mean(wages$exper, na.rm = TRUE)
## [1] 17.01711
median(wages$exper, na.rm = TRUE)
## [1] 13.5
sd(wages$exper, na.rm = TRUE)
## [1] 13.57216
mean(wages$tenure, na.rm = TRUE)
## [1] 5.104563
median(wages$tenure, na.rm = TRUE)
## [1] 2
sd(wages$tenure, na.rm = TRUE)
## [1] 7.224462
educ_freq <- table(wages$educ_level)
educ_percent <- prop.table(educ_freq) * 100
educ_table <- cbind(Count = educ_freq, Percentage = round(educ_percent,2))
educ_table
##                            Count Percentage
## Less than High School        116      22.05
## High School Graduate         198      37.64
## Some College                 113      21.48
## College Graduate and above    99      18.82
gender_freq <- table(wages$gender)
gender_percent <- prop.table(gender_freq) * 100
gender_table <- cbind(Count = gender_freq, Percentage = round(gender_percent, 2))
gender_table
##        Count Percentage
## Male     274      52.09
## Female   252      47.91
region_freq <- table(wages$region)
region_percent <- prop.table(region_freq) * 100
region_table <- cbind(Count = region_freq, Percentage = round(region_percent, 2))
region_table
##               Count Percentage
## North Central   132      25.10
## Other           118      22.43
## South           187      35.55
## West             89      16.92
combined_table <- list( 
  Education = educ_table, 
  Gender = gender_table, 
  Region = region_table 
  ) 
combined_table
## $Education
##                            Count Percentage
## Less than High School        116      22.05
## High School Graduate         198      37.64
## Some College                 113      21.48
## College Graduate and above    99      18.82
## 
## $Gender
##        Count Percentage
## Male     274      52.09
## Female   252      47.91
## 
## $Region
##               Count Percentage
## North Central   132      25.10
## Other           118      22.43
## South           187      35.55
## West             89      16.92
datasummary_skim(wage2)
Unique Missing Pct. Mean SD Min Median Max Histogram
wage 241 0 5.9 3.7 0.5 4.7 25.0
exper 51 0 17.0 13.6 1.0 13.5 51.0
N %
educ_level Less than High School 116 22.1
High School Graduate 198 37.6
Some College 113 21.5
College Graduate and above 99 18.8
gender Male 274 52.1
Female 252 47.9
# Most respondents have only a high school education
mean(wage2$exper)
## [1] 17.01711
# The average years of experience amongst the respondents is 17



wage2 %>%
  group_by(educ_level) %>%
  summarise(
    Mean = mean(wage, na.rm = TRUE)
  )
## # A tibble: 4 × 2
##   educ_level                  Mean
##   <fct>                      <dbl>
## 1 Less than High School       4.06
## 2 High School Graduate        5.37
## 3 Some College                6.03
## 4 College Graduate and above  8.95
# College graduates have the highest average hourly wage

wage2 %>%
  group_by(educ_level) %>%
  summarise(
    Mean = mean(wage, na.rm = TRUE),
    SD = sd(wage,na.rm = TRUE)
  )
## # A tibble: 4 × 3
##   educ_level                  Mean    SD
##   <fct>                      <dbl> <dbl>
## 1 Less than High School       4.06  1.99
## 2 High School Graduate        5.37  3.09
## 3 Some College                6.03  3.32
## 4 College Graduate and above  8.95  4.75
# Standard deviation of the education level with the second highest earnings is 3.319543

# Based on these descriptive statistics, I would like to perform an OLS regression on education and wages to understand the magnitude of how the former informs the latter. This would also be useful to understand the statistical significance of this relationship.
ggplot(wages, aes(x = educ_level, y = wage)) +
  geom_boxplot() +
  labs(title = "Figure 1. Average Hourly Earnings by Education Level",
       x = "Education Level",
       y = "Hourly Earnings",
       caption = "Notes: n=BLANK; Wooldridge Data Set") +
  theme_minimal()

mean_wages <- wages %>%
  group_by(educ_level) %>%
  summarise(Mean = mean(wage, na.rm = TRUE))

ggplot(wages, aes(x = wage, fill = educ_level)) +
  geom_density(alpha = 0.7) +
  geom_vline(data = mean_wages, aes(xintercept = Mean, color = educ_level), linetype = "dashed") +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Figure 2. Average Hourly Earnings by Education Level",
       x = "Hourly Earnings",
       y = "Density",
       caption = "Notes: n=BLANK; Wooldridge Data Set") +
  theme_minimal()

# I think I did this right? I'm not entirely confident in this part.