Project2

DataSet I : Employee Salaries 2023

Employee_Salaries_-_2023.csv, this dataset was provided by Crystal Quezada. The data was filtered based on the ‘Department of Police,’ and I would like to compare the salaries based on gender. To better display the data, I will create a subcategory for salary ranges:

Low: 0 - 30,000
Medium: 30,001 - 60,000
High: 60,001 - 100,000
Very High: 100,001 and above.

employee_salaries <- read_csv("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/Employee_Salaries_-_2023.csv",show_col_types = FALSE)

employee_salaries_dep_pol <- employee_salaries |>
  filter(Department_Name == "Department of Police") |>
  select(Division, Gender, Base_Salary) |>
  mutate(Gender = if_else(Gender == "F", "Female", 
                          if_else(Gender == "M", "Male", "Unknown")),
         Salary_Range = cut(Base_Salary, 
                            # Define salary ranges
                            breaks = c(0, 30000, 60000, 100000, Inf),  
                            labels = c("Low", "Medium", "High", "Very High"),  
                            # Range labels
                            right = FALSE))  # Left-closed intervals

datatable(employee_salaries_dep_pol)

summary_table <- employee_salaries_dep_pol |>
  group_by(Gender, Salary_Range) |>
  summarise(
    mean_salary = round(mean(Base_Salary), 2),
    median_salary = median(Base_Salary),
    sd_salary = round(sd(Base_Salary), 2),
    min_salary = min(Base_Salary),
    max_salary = max(Base_Salary),
    n = n(),
    upper = mean_salary + sd_salary,
    lower = mean_salary - sd_salary
  )

## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.

# Display the summary table using kable
summary_table  |>
  kable(caption = "Statistical Summary of Salaries in the Department of Police ")  |>
  kable_styling(full_width = F, position = "center")

Statistical Summary of Salaries in the Department of Police
Gender	Salary_Range	mean_salary	median_salary	sd_salary	min_salary	max_salary	n	upper	lower
Female	Low	23495.96	23022.58	2820.71	18257.50	29481.63	93	26316.67	20675.25
Female	Medium	37808.51	30620.00	10446.20	30296.39	59793.32	93	48254.71	27362.31
Female	High	80458.94	78947.00	10924.31	60061.68	99622.92	314	91383.25	69534.63
Female	Very High	116179.25	108084.00	18261.14	100000.00	190185.00	182	134440.39	97918.11
Male	Low	23138.16	22346.70	2693.16	18257.50	28570.49	31	25831.32	20445.00
Male	Medium	48862.81	54169.10	10539.93	30296.39	59849.01	20	59402.74	38322.88
Male	High	81384.57	82078.00	11689.11	60265.00	98896.00	411	93073.68	69695.46
Male	Very High	117119.24	108084.00	19641.75	100897.00	258000.00	650	136760.99	97477.49

In conclusion, the bar plot shows that, on average, females earn higher salaries in the range of $0 to $30,000, while males earn higher salaries in the medium, high, and very high categories.

ggplot(summary_table, aes(x = Salary_Range, y = mean_salary, fill = Gender)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = dollar(mean_salary)),  # Use dollar format for the labels
            position = position_dodge(width = 0.9),  # Adjust position to align with bars
            vjust = -0.5,  # Adjust vertical position of the text
            size = 3) +  # Text size
  labs(title = "Average Salary by Gender",
       x = "Salary Range",
       y = "Salary $") +
  theme_minimal()+
  scale_y_continuous(labels = dollar_format()) # we had to add library scales

The highest salary by gender for the low-income category belongs to females. For the medium-income category, it belongs to males, while for the high-income category, it again belongs to females. However, in the very high-income category, males have a significantly higher salary.

ggplot(summary_table, aes(x = Salary_Range, y = max_salary, fill = Gender)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = dollar(max_salary)),  # Use dollar format for the labels
            position = position_dodge(width = 0.9),  # Adjust position to align with bars
            vjust = -0.5,  # Adjust vertical position of the text
            size = 3) +  # Text size
  labs(title = "The Highest Salary by Gender",
       x = "Salary Range",
       y = "Salary $") +
  theme_minimal()+
  scale_y_continuous(labels = dollar_format()) # we had to add library scales

DataSet II : Car Comparisons

“2024 FE Guide for DOE-release dates before 9-17-2024-no-sales -9-17-2024public.xlsx”, this dataset was provided by Benjamin Wolin.

cars_df <- read_csv("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/2024%20FE%20Guide%20for%20DOE-release%20dates%20before%209-17-2024-no-sales%20-9-17-2024public.csv",show_col_types = FALSE)

#We will fix the column names to remove spaces and make them lower case
colnames(cars_df) <- tolower(gsub("[ ()#]", "_", colnames(cars_df)))
colnames(cars_df) <- sub("^__|_$|_-_", "", colnames(cars_df))


#Now I will create a new data frame with the rows that I need for the Search
cars_df_current_year <- cars_df|>
                        filter(model_year == as.numeric(format(Sys.Date(), "%Y"))) |>
                        select (model_year, mfr_name, carline, eng_displ, cyl, transmission, city_fe__guide_conventional_fuel, hwy_fe__guide_conventional_fuel)

datatable(cars_df_current_year)

We will now conduct an analysis based on the City fuel efficiency and the number of cylinders, using the new data frame cars_df_current_year. First, we will generate summary statistics to calculate the average fuel efficiency by the number of cylinders in the city.

summary_table <- cars_df_current_year |>
  group_by(cyl) |>
  summarise(
    mean_city_fe= round(mean(city_fe__guide_conventional_fuel), 2),
    median_city_fe = median(city_fe__guide_conventional_fuel),
    sd_city_fe = round(sd(city_fe__guide_conventional_fuel), 2),
    min_city_fe= min(city_fe__guide_conventional_fuel),
    max_city_fe = max(city_fe__guide_conventional_fuel),
    total_count = n(),
    upper = mean_city_fe + sd_city_fe,
    lower = mean_city_fe- sd_city_fe
  )
summary_table  |>
  kable(caption = "Average City Fuel Efficiency (MPG)")  |>
  kable_styling(full_width = F, position = "center")

Average City Fuel Efficiency (MPG)
cyl	mean_city_fe	median_city_fe	sd_city_fe	min_city_fe	max_city_fe	total_count	upper	lower
3	27.92	28	3.09	21	36	25	31.01	24.83
4	25.32	23	7.41	16	57	448	32.73	17.91
5	19.00	19	NA	19	19	1	NA	NA
6	18.44	18	2.40	13	26	326	20.84	16.04
8	14.73	15	1.44	10	18	166	16.17	13.29
10	13.00	13	0.00	13	13	3	13.00	13.00
12	11.94	12	0.25	11	12	16	12.19	11.69
16	8.00	8	NA	8	8	1	NA	NA

The bar plot shows that cars with fewer cylinders tend to have higher MPG (miles per gallon) in city driving.

ggplot(summary_table, aes(x = factor(cyl), y = mean_city_fe, fill = factor(cyl))) +
  geom_bar(stat = "identity", position = "dodge", color ="darkblue") +
  geom_text(aes(label = paste(mean_city_fe,'MPG')), 
            position = position_dodge(width = 1), 
            vjust = 0.5, angle = 90, hjust =1, color="white") +
  labs(y = "City Fuel Efficiency (MPG)", x = "Number of Cylinders", fill = "Cylinder",title = "Average City Fuel Efficiency (MPG)",
       subtitle = "Comparison between Number of Cylinders")+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12))+
  scale_fill_brewer(palette = "Blues", direction = -1)

We will generate summary statistics to calculate the average fuel efficiency by the number of cylinders on the highway. On average, cars with 3 cylinders provide better fuel efficiency.

summary_table2 <- cars_df_current_year |>
  group_by(cyl) |>
  summarise(
    mean_hwy_fe= round(mean(hwy_fe__guide_conventional_fuel), 2),
    median_hwy_fe = median(hwy_fe__guide_conventional_fuel),
    sd_hwy_fe = round(sd(hwy_fe__guide_conventional_fuel), 2),
    min_hwy_fe= min(hwy_fe__guide_conventional_fuel),
    max_hwy_fe = max(hwy_fe__guide_conventional_fuel),
    total_count = n(),
    upper = mean_hwy_fe + sd_hwy_fe,
    lower = mean_hwy_fe - sd_hwy_fe
  )
summary_table2  |>
  kable(caption = "Average City Fuel Highway (MPG)")  |>
  kable_styling(full_width = F, position = "center")

Average City Fuel Highway (MPG)
cyl	mean_hwy_fe	median_hwy_fe	sd_hwy_fe	min_hwy_fe	max_hwy_fe	total_count	upper	lower
3	34.24	34	4.11	28	43	25	38.35	30.13
4	31.01	31	6.38	16	58	448	37.39	24.63
5	29.00	29	NA	29	29	1	NA	NA
6	24.68	24	3.38	14	33	326	28.06	21.30
8	20.43	20	2.48	12	27	166	22.91	17.95
10	18.00	18	0.00	18	18	3	18.00	18.00
12	18.12	19	1.54	15	20	16	19.66	16.58
16	11.00	11	NA	11	11	1	NA	NA

The bar plot shows that cars with fewer cylinders tend to have higher Highway Fuel Efficiency (MPG). This trend is consistent with City Fuel Efficiency as well.

ggplot(summary_table2, aes(x = factor(cyl), y = mean_hwy_fe, fill = factor(cyl))) +
  geom_bar(stat = "identity", position = "dodge", color ="darkblue") +
  geom_text(aes(label = paste(mean_hwy_fe,'MPG')), 
            position = position_dodge(width = 1), 
            vjust = 0.5, angle = 90, hjust =1, color="white") +
  labs(y = "Highway Fuel Efficiency (MPG)", x = "Number of Cylinders", fill = "Cylinder",title = "Average Highway Fuel Efficiency (MPG)",
       subtitle = "Comparison between Number of Cylinders")+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12))+
  scale_fill_brewer(palette = "Blues", direction = -1)

A new category is created in order to display a graphic with both summaries.

#Merging for summary data frame and using one column for the average
combined_summary <- bind_rows(
  summary_table |> 
    select(cyl, mean_city_fe) |> 
    rename(mean_fe = mean_city_fe) |>
    mutate(fuel_type = "City"),
  summary_table2 |>
    select(cyl, mean_hwy_fe) |>
    rename(mean_fe = mean_hwy_fe) |>
    mutate(fuel_type = "Highway")
)


ggplot(combined_summary, aes(x = factor(cyl), y = mean_fe, fill = fuel_type)) +
  geom_bar(stat = "identity", position = "dodge", color = "darkblue") +
  geom_text(aes(label = paste(mean_fe, 'MPG')), 
            position = position_dodge(width = 0.9), 
            vjust = -0.5, color = "black", size = 2.5) +
  labs(y = "Fuel Efficiency (MPG)", 
       x = "Number of Cylinders", 
       fill = "Fuel Type", 
       title = "City vs. Highway Fuel Efficiency (MPG)", 
       subtitle = "Comparison by Number of Cylinders") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12)) +
  scale_fill_brewer(palette = "Blues")

Project2

Alinzon Simon

2024-10-06

DataSet I : Employee Salaries 2023

DataSet II : Car Comparisons

DataSet III : I resubmitted Assigment week 5