Employee_Salaries_-_2023.csv, this dataset was provided by Crystal Quezada. The data was filtered based on the ‘Department of Police,’ and I would like to compare the salaries based on gender. To better display the data, I will create a subcategory for salary ranges:
employee_salaries <- read_csv("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/Employee_Salaries_-_2023.csv",show_col_types = FALSE)
employee_salaries_dep_pol <- employee_salaries |>
filter(Department_Name == "Department of Police") |>
select(Division, Gender, Base_Salary) |>
mutate(Gender = if_else(Gender == "F", "Female",
if_else(Gender == "M", "Male", "Unknown")),
Salary_Range = cut(Base_Salary,
# Define salary ranges
breaks = c(0, 30000, 60000, 100000, Inf),
labels = c("Low", "Medium", "High", "Very High"),
# Range labels
right = FALSE)) # Left-closed intervals
datatable(employee_salaries_dep_pol)
summary_table <- employee_salaries_dep_pol |>
group_by(Gender, Salary_Range) |>
summarise(
mean_salary = round(mean(Base_Salary), 2),
median_salary = median(Base_Salary),
sd_salary = round(sd(Base_Salary), 2),
min_salary = min(Base_Salary),
max_salary = max(Base_Salary),
n = n(),
upper = mean_salary + sd_salary,
lower = mean_salary - sd_salary
)
## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.
# Display the summary table using kable
summary_table |>
kable(caption = "Statistical Summary of Salaries in the Department of Police ") |>
kable_styling(full_width = F, position = "center")
Gender | Salary_Range | mean_salary | median_salary | sd_salary | min_salary | max_salary | n | upper | lower |
---|---|---|---|---|---|---|---|---|---|
Female | Low | 23495.96 | 23022.58 | 2820.71 | 18257.50 | 29481.63 | 93 | 26316.67 | 20675.25 |
Female | Medium | 37808.51 | 30620.00 | 10446.20 | 30296.39 | 59793.32 | 93 | 48254.71 | 27362.31 |
Female | High | 80458.94 | 78947.00 | 10924.31 | 60061.68 | 99622.92 | 314 | 91383.25 | 69534.63 |
Female | Very High | 116179.25 | 108084.00 | 18261.14 | 100000.00 | 190185.00 | 182 | 134440.39 | 97918.11 |
Male | Low | 23138.16 | 22346.70 | 2693.16 | 18257.50 | 28570.49 | 31 | 25831.32 | 20445.00 |
Male | Medium | 48862.81 | 54169.10 | 10539.93 | 30296.39 | 59849.01 | 20 | 59402.74 | 38322.88 |
Male | High | 81384.57 | 82078.00 | 11689.11 | 60265.00 | 98896.00 | 411 | 93073.68 | 69695.46 |
Male | Very High | 117119.24 | 108084.00 | 19641.75 | 100897.00 | 258000.00 | 650 | 136760.99 | 97477.49 |
In conclusion, the bar plot shows that, on average, females earn higher salaries in the range of $0 to $30,000, while males earn higher salaries in the medium, high, and very high categories.
ggplot(summary_table, aes(x = Salary_Range, y = mean_salary, fill = Gender)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = dollar(mean_salary)), # Use dollar format for the labels
position = position_dodge(width = 0.9), # Adjust position to align with bars
vjust = -0.5, # Adjust vertical position of the text
size = 3) + # Text size
labs(title = "Average Salary by Gender",
x = "Salary Range",
y = "Salary $") +
theme_minimal()+
scale_y_continuous(labels = dollar_format()) # we had to add library scales
The highest salary by gender for the low-income category belongs to females. For the medium-income category, it belongs to males, while for the high-income category, it again belongs to females. However, in the very high-income category, males have a significantly higher salary.
ggplot(summary_table, aes(x = Salary_Range, y = max_salary, fill = Gender)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = dollar(max_salary)), # Use dollar format for the labels
position = position_dodge(width = 0.9), # Adjust position to align with bars
vjust = -0.5, # Adjust vertical position of the text
size = 3) + # Text size
labs(title = "The Highest Salary by Gender",
x = "Salary Range",
y = "Salary $") +
theme_minimal()+
scale_y_continuous(labels = dollar_format()) # we had to add library scales
“2024 FE Guide for DOE-release dates before 9-17-2024-no-sales -9-17-2024public.xlsx”, this dataset was provided by Benjamin Wolin.
cars_df <- read_csv("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/2024%20FE%20Guide%20for%20DOE-release%20dates%20before%209-17-2024-no-sales%20-9-17-2024public.csv",show_col_types = FALSE)
#We will fix the column names to remove spaces and make them lower case
colnames(cars_df) <- tolower(gsub("[ ()#]", "_", colnames(cars_df)))
colnames(cars_df) <- sub("^__|_$|_-_", "", colnames(cars_df))
#Now I will create a new data frame with the rows that I need for the Search
cars_df_current_year <- cars_df|>
filter(model_year == as.numeric(format(Sys.Date(), "%Y"))) |>
select (model_year, mfr_name, carline, eng_displ, cyl, transmission, city_fe__guide_conventional_fuel, hwy_fe__guide_conventional_fuel)
datatable(cars_df_current_year)
We will now conduct an analysis based on the City fuel efficiency and the number of cylinders, using the new data frame cars_df_current_year. First, we will generate summary statistics to calculate the average fuel efficiency by the number of cylinders in the city.
summary_table <- cars_df_current_year |>
group_by(cyl) |>
summarise(
mean_city_fe= round(mean(city_fe__guide_conventional_fuel), 2),
median_city_fe = median(city_fe__guide_conventional_fuel),
sd_city_fe = round(sd(city_fe__guide_conventional_fuel), 2),
min_city_fe= min(city_fe__guide_conventional_fuel),
max_city_fe = max(city_fe__guide_conventional_fuel),
total_count = n(),
upper = mean_city_fe + sd_city_fe,
lower = mean_city_fe- sd_city_fe
)
summary_table |>
kable(caption = "Average City Fuel Efficiency (MPG)") |>
kable_styling(full_width = F, position = "center")
cyl | mean_city_fe | median_city_fe | sd_city_fe | min_city_fe | max_city_fe | total_count | upper | lower |
---|---|---|---|---|---|---|---|---|
3 | 27.92 | 28 | 3.09 | 21 | 36 | 25 | 31.01 | 24.83 |
4 | 25.32 | 23 | 7.41 | 16 | 57 | 448 | 32.73 | 17.91 |
5 | 19.00 | 19 | NA | 19 | 19 | 1 | NA | NA |
6 | 18.44 | 18 | 2.40 | 13 | 26 | 326 | 20.84 | 16.04 |
8 | 14.73 | 15 | 1.44 | 10 | 18 | 166 | 16.17 | 13.29 |
10 | 13.00 | 13 | 0.00 | 13 | 13 | 3 | 13.00 | 13.00 |
12 | 11.94 | 12 | 0.25 | 11 | 12 | 16 | 12.19 | 11.69 |
16 | 8.00 | 8 | NA | 8 | 8 | 1 | NA | NA |
The bar plot shows that cars with fewer cylinders tend to have higher MPG (miles per gallon) in city driving.
ggplot(summary_table, aes(x = factor(cyl), y = mean_city_fe, fill = factor(cyl))) +
geom_bar(stat = "identity", position = "dodge", color ="darkblue") +
geom_text(aes(label = paste(mean_city_fe,'MPG')),
position = position_dodge(width = 1),
vjust = 0.5, angle = 90, hjust =1, color="white") +
labs(y = "City Fuel Efficiency (MPG)", x = "Number of Cylinders", fill = "Cylinder",title = "Average City Fuel Efficiency (MPG)",
subtitle = "Comparison between Number of Cylinders")+
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12))+
scale_fill_brewer(palette = "Blues", direction = -1)
We will generate summary statistics to calculate the average fuel efficiency by the number of cylinders on the highway. On average, cars with 3 cylinders provide better fuel efficiency.
summary_table2 <- cars_df_current_year |>
group_by(cyl) |>
summarise(
mean_hwy_fe= round(mean(hwy_fe__guide_conventional_fuel), 2),
median_hwy_fe = median(hwy_fe__guide_conventional_fuel),
sd_hwy_fe = round(sd(hwy_fe__guide_conventional_fuel), 2),
min_hwy_fe= min(hwy_fe__guide_conventional_fuel),
max_hwy_fe = max(hwy_fe__guide_conventional_fuel),
total_count = n(),
upper = mean_hwy_fe + sd_hwy_fe,
lower = mean_hwy_fe - sd_hwy_fe
)
summary_table2 |>
kable(caption = "Average City Fuel Highway (MPG)") |>
kable_styling(full_width = F, position = "center")
cyl | mean_hwy_fe | median_hwy_fe | sd_hwy_fe | min_hwy_fe | max_hwy_fe | total_count | upper | lower |
---|---|---|---|---|---|---|---|---|
3 | 34.24 | 34 | 4.11 | 28 | 43 | 25 | 38.35 | 30.13 |
4 | 31.01 | 31 | 6.38 | 16 | 58 | 448 | 37.39 | 24.63 |
5 | 29.00 | 29 | NA | 29 | 29 | 1 | NA | NA |
6 | 24.68 | 24 | 3.38 | 14 | 33 | 326 | 28.06 | 21.30 |
8 | 20.43 | 20 | 2.48 | 12 | 27 | 166 | 22.91 | 17.95 |
10 | 18.00 | 18 | 0.00 | 18 | 18 | 3 | 18.00 | 18.00 |
12 | 18.12 | 19 | 1.54 | 15 | 20 | 16 | 19.66 | 16.58 |
16 | 11.00 | 11 | NA | 11 | 11 | 1 | NA | NA |
The bar plot shows that cars with fewer cylinders tend to have higher Highway Fuel Efficiency (MPG). This trend is consistent with City Fuel Efficiency as well.
ggplot(summary_table2, aes(x = factor(cyl), y = mean_hwy_fe, fill = factor(cyl))) +
geom_bar(stat = "identity", position = "dodge", color ="darkblue") +
geom_text(aes(label = paste(mean_hwy_fe,'MPG')),
position = position_dodge(width = 1),
vjust = 0.5, angle = 90, hjust =1, color="white") +
labs(y = "Highway Fuel Efficiency (MPG)", x = "Number of Cylinders", fill = "Cylinder",title = "Average Highway Fuel Efficiency (MPG)",
subtitle = "Comparison between Number of Cylinders")+
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12))+
scale_fill_brewer(palette = "Blues", direction = -1)
A new category is created in order to display a graphic with both summaries.
#Merging for summary data frame and using one column for the average
combined_summary <- bind_rows(
summary_table |>
select(cyl, mean_city_fe) |>
rename(mean_fe = mean_city_fe) |>
mutate(fuel_type = "City"),
summary_table2 |>
select(cyl, mean_hwy_fe) |>
rename(mean_fe = mean_hwy_fe) |>
mutate(fuel_type = "Highway")
)
ggplot(combined_summary, aes(x = factor(cyl), y = mean_fe, fill = fuel_type)) +
geom_bar(stat = "identity", position = "dodge", color = "darkblue") +
geom_text(aes(label = paste(mean_fe, 'MPG')),
position = position_dodge(width = 0.9),
vjust = -0.5, color = "black", size = 2.5) +
labs(y = "Fuel Efficiency (MPG)",
x = "Number of Cylinders",
fill = "Fuel Type",
title = "City vs. Highway Fuel Efficiency (MPG)",
subtitle = "Comparison by Number of Cylinders") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12)) +
scale_fill_brewer(palette = "Blues")