a <- sample(1:100, size = 10, replace = T)
getmode <- function(v) {
as.numeric(names(which.max(table(v))))
}
getmode(a)
## [1] 4
median(a)
## [1] 25
mean(a)
## [1] 38.6
range(a)
## [1] 4 98
max(a)-min(a)
## [1] 94
IQR(a)
## [1] 58.75
var(a)
## [1] 1287.822
sd(a)
## [1] 35.88624
# I loaded the data above
wages <- wage1
View(wages)
# The columbs represent average hourly earnings and a number of demogaphic variables. These include education, experience, race, marriage status, place of residence, and occupational indsutry
nrow(wages)
## [1] 526
ncol(wages)
## [1] 24
# the n is 526 (the number of rows)
# entry 35 reported 12 years of education
wages$educ[35]
## [1] 12
# confirms what I found manually
str(wages)
## tibble [526 × 24] (S3: tbl_df/tbl/data.frame)
## $ wage : num [1:526] 3.1 3.24 3 6 5.3 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ educ : num [1:526] 11 12 11 8 12 16 18 12 12 17 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ exper : num [1:526] 2 22 2 44 7 9 15 5 26 22 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ tenure : num [1:526] 0 2 0 28 2 8 7 3 4 21 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ nonwhite: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ female : num [1:526] 1 1 0 0 0 0 0 1 1 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ married : num [1:526] 0 1 0 1 1 1 0 0 0 1 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ numdep : num [1:526] 2 3 2 0 1 0 0 0 2 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ smsa : num [1:526] 1 1 0 1 0 1 1 1 1 1 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ northcen: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ south : num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ west : num [1:526] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ construc: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ ndurman : num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ trcommpu: num [1:526] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ trade : num [1:526] 0 0 1 0 0 0 1 0 1 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ services: num [1:526] 0 1 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ profserv: num [1:526] 0 0 0 0 0 1 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ profocc : num [1:526] 0 0 0 0 0 1 1 1 1 1 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ clerocc : num [1:526] 0 0 0 1 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ servocc : num [1:526] 0 1 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ lwage : num [1:526] 1.13 1.18 1.1 1.79 1.67 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ expersq : num [1:526] 4 484 4 1936 49 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ tenursq : num [1:526] 0 4 0 784 4 64 49 9 16 441 ...
## ..- attr(*, "format.stata")= chr "%9.0g"
# there are no factor variables
colSums(is.na(wages))
## wage educ exper tenure nonwhite female married numdep
## 0 0 0 0 0 0 0 0
## smsa northcen south west construc ndurman trcommpu trade
## 0 0 0 0 0 0 0 0
## services profserv profocc clerocc servocc lwage expersq tenursq
## 0 0 0 0 0 0 0 0
# I think I'm doing this right but I don't see any missing variables in the dataset
head(wages)
## # A tibble: 6 × 24
## wage educ exper tenure nonwhite female married numdep smsa northcen south
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.10 11 2 0 0 1 0 2 1 0 0
## 2 3.24 12 22 2 0 1 1 3 1 0 0
## 3 3 11 2 0 0 0 0 2 0 0 0
## 4 6 8 44 28 0 0 1 0 1 0 0
## 5 5.30 12 7 2 0 0 1 1 0 0 0
## 6 8.75 16 9 8 0 0 1 0 1 0 0
## # ℹ 13 more variables: west <dbl>, construc <dbl>, ndurman <dbl>,
## # trcommpu <dbl>, trade <dbl>, services <dbl>, profserv <dbl>, profocc <dbl>,
## # clerocc <dbl>, servocc <dbl>, lwage <dbl>, expersq <dbl>, tenursq <dbl>
wages <- wages %>%
mutate(gender = factor(ifelse(female == 1, "Female", "Male"),
levels = c("Male", "Female")))
wages <- wages %>%
mutate(educ_level = factor(case_when(
educ < 12 ~ "Less than High School",
educ == 12 ~ "High School Graduate",
educ > 12 & educ < 16 ~ "Some College",
educ >= 16 ~ "College Graduate and above"
), levels = c("Less than High School", "High School Graduate", "Some College", "College Graduate and above")))
wages <- wages %>%
mutate(region = case_when(
northcen == 1 ~ "North Central",
south == 1 ~ "South",
west == 1 ~ "West",
TRUE ~ "Other" # default case
)) %>%
mutate(region = factor(region))
wages %>%
filter(northcen ==1, south == 1, west ==1) %>%
nrow()
## [1] 0
# used this to check for any rows that have more than 1 region and it didn't return any
wage2 <- wages[c("wage","educ_level","gender","exper")]
View(wage2)
mean(wages$wage, na.rm = TRUE)
## [1] 5.896103
median(wages$wage, na.rm = TRUE)
## [1] 4.65
sd(wages$wage, na.rm = TRUE)
## [1] 3.693086
mean(wages$exper, na.rm = TRUE)
## [1] 17.01711
median(wages$exper, na.rm = TRUE)
## [1] 13.5
sd(wages$exper, na.rm = TRUE)
## [1] 13.57216
mean(wages$tenure, na.rm = TRUE)
## [1] 5.104563
median(wages$tenure, na.rm = TRUE)
## [1] 2
sd(wages$tenure, na.rm = TRUE)
## [1] 7.224462
educ_freq <- table(wages$educ_level)
educ_percent <- prop.table(educ_freq) * 100
educ_table <- cbind(Count = educ_freq, Percentage = round(educ_percent,2))
educ_table
## Count Percentage
## Less than High School 116 22.05
## High School Graduate 198 37.64
## Some College 113 21.48
## College Graduate and above 99 18.82
gender_freq <- table(wages$gender)
gender_percent <- prop.table(gender_freq) * 100
gender_table <- cbind(Count = gender_freq, Percentage = round(gender_percent, 2))
gender_table
## Count Percentage
## Male 274 52.09
## Female 252 47.91
region_freq <- table(wages$region)
region_percent <- prop.table(region_freq) * 100
region_table <- cbind(Count = region_freq, Percentage = round(region_percent, 2))
region_table
## Count Percentage
## North Central 132 25.10
## Other 118 22.43
## South 187 35.55
## West 89 16.92
combined_table <- list(
Education = educ_table,
Gender = gender_table,
Region = region_table
)
combined_table
## $Education
## Count Percentage
## Less than High School 116 22.05
## High School Graduate 198 37.64
## Some College 113 21.48
## College Graduate and above 99 18.82
##
## $Gender
## Count Percentage
## Male 274 52.09
## Female 252 47.91
##
## $Region
## Count Percentage
## North Central 132 25.10
## Other 118 22.43
## South 187 35.55
## West 89 16.92
datasummary_skim(wage2)
| |
Unique |
Missing Pct. |
Mean |
SD |
Min |
Median |
Max |
Histogram |
| wage |
241 |
0 |
5.9 |
3.7 |
0.5 |
4.7 |
25.0 |
 |
| exper |
51 |
0 |
17.0 |
13.6 |
1.0 |
13.5 |
51.0 |
 |
|
|
N |
% |
|
|
|
|
|
| educ_level |
Less than High School |
116 |
22.1 |
|
|
|
|
|
|
High School Graduate |
198 |
37.6 |
|
|
|
|
|
|
Some College |
113 |
21.5 |
|
|
|
|
|
|
College Graduate and above |
99 |
18.8 |
|
|
|
|
|
| gender |
Male |
274 |
52.1 |
|
|
|
|
|
|
Female |
252 |
47.9 |
|
|
|
|
|
# Most respondents have only a high school education
mean(wage2$exper)
## [1] 17.01711
# The average years of experience amongst the respondents is 17
wage2 %>%
group_by(educ_level) %>%
summarise(
Mean = mean(wage, na.rm = TRUE)
)
## # A tibble: 4 × 2
## educ_level Mean
## <fct> <dbl>
## 1 Less than High School 4.06
## 2 High School Graduate 5.37
## 3 Some College 6.03
## 4 College Graduate and above 8.95
# College graduates have the highest average hourly wage
wage2 %>%
group_by(educ_level) %>%
summarise(
Mean = mean(wage, na.rm = TRUE),
SD = sd(wage,na.rm = TRUE)
)
## # A tibble: 4 × 3
## educ_level Mean SD
## <fct> <dbl> <dbl>
## 1 Less than High School 4.06 1.99
## 2 High School Graduate 5.37 3.09
## 3 Some College 6.03 3.32
## 4 College Graduate and above 8.95 4.75
# Standard deviation of the education level with the second highest earnings is 3.319543
# Based on these descriptive statistics, I would like to perform an OLS regression on education and wages to understand the magnitude of how the former informs the latter. This would also be useful to understand the statistical significance of this relationship.
ggplot(wages, aes(x = educ_level, y = wage)) +
geom_boxplot() +
labs(title = "Figure 1. Average Hourly Earnings by Education Level",
x = "Education Level",
y = "Hourly Earnings",
caption = "Notes: n=BLANK; Wooldridge Data Set") +
theme_minimal()

mean_wages <- wages %>%
group_by(educ_level) %>%
summarise(Mean = mean(wage, na.rm = TRUE))
ggplot(wages, aes(x = wage, fill = educ_level)) +
geom_density(alpha = 0.7) +
geom_vline(data = mean_wages, aes(xintercept = Mean, color = educ_level), linetype = "dashed") +
scale_fill_brewer(palette = "Set1") +
labs(title = "Figure 2. Average Hourly Earnings by Education Level",
x = "Hourly Earnings",
y = "Density",
caption = "Notes: n=BLANK; Wooldridge Data Set") +
theme_minimal()

# I think I did this right? I'm not entirely confident in this part.