# Load the nurses dataset into `data`; every later step reads from or adds
# columns to this data frame.
# NOTE(review): the path is absolute and user-specific, so the script will only
# run on the original author's machine unless the path is adjusted.
data <- read.csv("/Users/ramyaamudapakula/Desktop/Sem1/Statistics/lasya/nurses.csv")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

#Grouping by State

# Aggregate the nurse data to one row per state: total RN employment (summed
# over every row of that state), mean of the average annual salary and median
# of the median hourly wage. Missing values are ignored in each aggregate.
grouped_state <- data %>%
  group_by(State) %>%
  summarise(
    Total_Employed = sum(Total_Employed_RN, na.rm = TRUE),
    Avg_Annual_Salary = mean(Annual_Salary_Avg, na.rm = TRUE),
    Median_Hourly_Wage = median(Hourly_Wage_Median, na.rm = TRUE)
  )

# Probability = the state's share of the employment summed over all states.
grouped_state$Probability <- grouped_state$Total_Employed /
  sum(grouped_state$Total_Employed, na.rm = TRUE)

# Find the index(es) of the minimum probability. Comparing against the minimum
# (instead of which.min) keeps every state when several tie for the lowest
# share; which() drops NA comparisons so no all-NA rows are selected.
min_prob_index <- which(
  grouped_state$Probability == min(grouped_state$Probability, na.rm = TRUE)
)

# Extract the state(s) with the minimum probability
lowest_prob_state <- grouped_state[min_prob_index, ]
lowest_prob_state_tag <- "Lowest Probability Group"

# Create a new dataframe with grouping results
grouping_results_state <- grouped_state[, c("State", "Probability")]

# Merge the new dataframe back to the original data
data <- left_join(data, grouping_results_state, by = "State")

# Assign the Group_Tag column: rows belonging to the lowest-probability
# state(s) get the tag, all other rows get an empty string. The tag is keyed on
# the state name; testing is.na(Probability) would never match because every
# state present in `data` receives a probability from the join.
data$Group_Tag <- ifelse(
  data$State %in% lowest_prob_state$State,
  lowest_prob_state_tag,
  ""
)

#Grouping by Hourly_wage_percentile

# Assuming your data frame is named 'data'
library(dplyr)

# Create percentiles for hourly wage: bin the median hourly wage at its
# quartile boundaries. include.lowest = TRUE keeps the row holding the minimum
# wage inside the first bin; cut() otherwise builds (a, b] intervals and that
# row would silently become NA.
data$Hourly_Wage_Percentile <- cut(
  data$Hourly_Wage_Median,
  breaks = quantile(data$Hourly_Wage_Median, probs = seq(0, 1, 0.25), na.rm = TRUE),
  include.lowest = TRUE
)

# Summarise every wage bin. na.rm = TRUE stops a single missing value from
# turning a whole bin's summary into NA.
grouped_hourly_percentile <- data %>%
  group_by(Hourly_Wage_Percentile) %>%
  summarise(
    Avg_Annual_Salary = mean(Annual_Salary_Avg, na.rm = TRUE),
    Median_Hourly_Wage = median(Hourly_Wage_Median, na.rm = TRUE),
    Total_Employed = sum(Total_Employed_RN, na.rm = TRUE)
  ) %>%
  ungroup()

# Probability = the bin's share of the employment summed over all bins
# (same definition as in the state and location-quotient groupings).
grouped_hourly_percentile$Probability <- grouped_hourly_percentile$Total_Employed /
  sum(grouped_hourly_percentile$Total_Employed, na.rm = TRUE)

# Find the bin with the lowest probability. Rows without a wage fall into an
# NA bin, which is not a real percentile band and is therefore skipped here.
lowest_prob_hourly <- grouped_hourly_percentile %>%
  filter(!is.na(Hourly_Wage_Percentile)) %>%
  slice(which.min(Probability))

lowest_prob_hourly_tag <- "Lowest Probability Group"

# Tag the rows of the lowest-probability bin. Matching on the bin itself
# (rather than on equal employment totals) cannot accidentally tag a second bin
# that happens to share the same total. %in% never yields NA, so rows in the NA
# bin simply get the empty tag. Lowest_Probability carries the bin's
# probability for tagged rows and NA for all others.
# as_tibble() keeps `data` a tibble for the steps that follow.
data <- data %>%
  as_tibble() %>%
  mutate(
    Group_Tag_Hourly = if_else(
      Hourly_Wage_Percentile %in% lowest_prob_hourly$Hourly_Wage_Percentile,
      lowest_prob_hourly_tag,
      ""
    ),
    Lowest_Probability = if_else(
      Group_Tag_Hourly == lowest_prob_hourly_tag,
      lowest_prob_hourly$Probability,
      NA_real_
    )
  )

#Grouping by location quotient

library(dplyr)

# Aggregate the data to one row per distinct location quotient: mean of the
# average annual salary, median of the median hourly wage and total RN
# employment, ignoring missing values in each aggregate.
grouped_location_quotient <- data %>%
  group_by(Location_Quotient) %>%
  summarise(
    Avg_Annual_Salary = mean(Annual_Salary_Avg, na.rm = TRUE),
    Median_Hourly_Wage = median(Hourly_Wage_Median, na.rm = TRUE),
    Total_Employed = sum(Total_Employed_RN, na.rm = TRUE)
  )

# Probability = the group's share of the employment summed over all groups.
grouped_location_quotient$Probability <- grouped_location_quotient$Total_Employed /
  sum(grouped_location_quotient$Total_Employed, na.rm = TRUE)

# Group(s) whose probability equals the minimum; which() drops NA comparisons
# so no all-NA rows are selected.
lowest_prob_location <- grouped_location_quotient[
  which(
    grouped_location_quotient$Probability ==
      min(grouped_location_quotient$Probability, na.rm = TRUE)
  ),
]
lowest_prob_location_tag <- "Lowest Probability Group"

# Create a new dataframe with grouping results
grouping_results_location <- grouped_location_quotient[, c("Location_Quotient", "Probability")]

# Merge the new dataframe back to the original data. The probability is joined
# under its own name so it cannot collide with a Probability column already
# present in `data` (a collision would produce Probability.x / Probability.y).
data <- left_join(
  data,
  rename(grouping_results_location, Probability_Location = Probability),
  by = "Location_Quotient"
)

# Assign the Group_Tag_Location column: rows whose location quotient belongs to
# the lowest-probability group get the tag, all other rows get an empty string.
# Testing is.na(Location_Quotient) would instead tag the rows with a missing
# quotient, which is unrelated to the lowest-probability group.
data$Group_Tag_Location <- ifelse(
  data$Location_Quotient %in% lowest_prob_location$Location_Quotient,
  lowest_prob_location_tag,
  ""
)
# Show the per-state summary row selected as having the minimum probability.
print(lowest_prob_state)
## # A tibble: 1 × 5
##   State          Total_Employed Avg_Annual_Salary Median_Hourly_Wage Probability
##   <chr>                   <int>             <dbl>              <dbl>       <dbl>
## 1 Virgin Islands           7480             49932               22.9    0.000127
# Show the summary row of the hourly-wage percentile bin selected as lowest.
print(lowest_prob_hourly)
## # A tibble: 1 × 4
##   Hourly_Wage_Percentile Avg_Annual_Salary Median_Hourly_Wage Total_Employed
##   <fct>                              <dbl>              <dbl>          <int>
## 1 (8.64,23.1]                       42536.               20.2       10096030
# Show the location-quotient group(s) whose probability equals the minimum.
print(lowest_prob_location)
## # A tibble: 1 × 5
##   Location_Quotient Avg_Annual_Salary Median_Hourly_Wage Total_Employed
##               <dbl>             <dbl>              <dbl>          <int>
## 1              0.34             47650               23.5            250
## # ℹ 1 more variable: Probability <dbl>
# Preview the first rows of `data`, including the columns added by the
# grouping steps.
head(data)
## # A tibble: 6 × 29
##   State       Year Total_Employed_RN Employed_Standard_Error Hourly_Wage_Avg
##   <chr>      <int>             <int>                   <dbl>           <dbl>
## 1 Alabama     2020             48850                     2.9            29.0
## 2 Alaska      2020              6240                    13              45.8
## 3 Arizona     2020             55520                     3.7            38.6
## 4 Arkansas    2020             25300                     4.2            30.6
## 5 California  2020            307060                     2              58.0
## 6 Colorado    2020             52330                     2.8            37.4
## # ℹ 24 more variables: Hourly_Wage_Median <dbl>, Annual_Salary_Avg <int>,
## #   Annual_Salary_Median <int>, Wage_standard_error <dbl>,
## #   Hourly_10th_Percentile <dbl>, Hourly_25th_Percentile <dbl>,
## #   Hourly_75th_Percentile <dbl>, Hourly_90th_Percentile <dbl>,
## #   Annual_10th_Percentile <int>, Annual_25th_Percentile <int>,
## #   Annual_75th_Percentile <int>, Annual_90th_Percentile <int>,
## #   Location_Quotient <dbl>, Total_Employed_National_Aggregate <int>, …

#Visualisation for Grouping by State

library(ggplot2)

# Bar chart of RN employment (Total_Employed_RN) per state, with bars filled by
# Group_Tag. geom_col() is the direct form of geom_bar(stat = "identity").
# The x axis carries one label per state, so the labels are rotated to stay
# legible instead of overlapping.
ggplot(data, aes(x = State, y = Total_Employed_RN, fill = Group_Tag)) +
  geom_col(position = "dodge") +
  labs(title = "State-wise Analysis",
       x = "State",
       y = "Total_Employed_RN") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
## Warning: Removed 5 rows containing missing values (`geom_bar()`).

Conclusion:

In summary, we analysed the data state by state and found differences in the overall number of employed registered nurses (RNs). The “Lowest Probability Group” marks the state(s) with the smallest share of total RN employment, i.e. the lowest probability of being chosen.

Testable Hypothesis: States with lower overall employment of registered nurses might face unique difficulties or circumstances that influence the distribution of the healthcare workforce. To further understand the differences in RN employment between states, more research could look into factors such as population health needs, healthcare policies, or educational opportunities.

#Visualisation for Grouping by Hourly_Wage_Percentile

library(ggplot2)

# Box plot of the average annual salary within each hourly-wage percentile
# bin; boxes are filled according to Group_Tag_Hourly.
data %>%
  ggplot(
    mapping = aes(
      x = Hourly_Wage_Percentile,
      y = Annual_Salary_Avg,
      fill = Group_Tag_Hourly
    )
  ) +
  geom_boxplot() +
  ggtitle("Hourly Wage Percentile Analysis") +
  xlab("Hourly Wage Percentile") +
  ylab("Average Annual Salary") +
  theme_minimal()
## Warning: Removed 6 rows containing non-finite values (`stat_boxplot()`).

Conclusion:

In summary, for the hourly wage percentile study the data was grouped into percentile bands of the median hourly wage. Based on the overall employment of RNs, the “Lowest Probability Group” identifies the percentile band with the lowest chance of being chosen.

Testable Hypothesis:

The hypothesis is that the places with the lowest probability group in hourly wage percentiles are those where there is a shortage of registered nurses, which drives up wages. Subsequent research endeavours may delve into the geographical elements that impact the dynamics of supply and demand within the healthcare labour market.

#Visualisation for Grouping by Location Quotient

library(ggplot2)

# Scatter plot of average annual salary against location quotient.
# Group_Tag_Location is mapped to `colour`: the default point shape has no
# fill, so a `fill` mapping would leave every point the same colour.
ggplot(data, aes(x = Location_Quotient, y = Annual_Salary_Avg, colour = Group_Tag_Location)) +
  geom_point() +
  labs(title = "Location Quotient Analysis",
       x = "Location Quotient",
       y = "Average Annual Salary") +
  theme_minimal()
## Warning: Removed 650 rows containing missing values (`geom_point()`).

Conclusion:

In summary, the location quotient analysis grouped the data according to the location quotient, which revealed information about the concentration of RN employment in various regions. Areas with the lowest likelihood of being chosen are indicated by the “Lowest Probability Group”.

Testable Hypothesis: One possible hypothesis is that areas with a lower location quotient would have trouble attracting or keeping registered nurses, possibly as a result of things like healthcare infrastructure, lifestyle issues, or discrepancies between rural and urban areas. Additional research and surveys may be able to pinpoint the precise elements affecting the distribution of registered nurses in various areas.

Conclusions and Testable Hypothesis:

State wise Analysis

1. Lowest Probability Group: The state with the lowest likelihood of hiring registered nurses (RNs) is known as the Lowest Probability Group.

Conclusion: The state that belongs to the “Lowest Probability Group” might have a lower demand for registered nurses than other states.

Testable Hypothesis: The need for RNs may be reduced in this state due to a smaller population, a better-functioning healthcare system, or other causes. Regional health patterns, healthcare infrastructure, and population demographics could all be the subject of more research.

Hourly Wage Percentile Analysis

2. Conclusion: Compared to other RNs, those in the lowest probability group of the wage percentile analysis may have a lower chance of employment.

Hypothesis: This would suggest that there is less of a need for RNs in positions where the median hourly pay falls into this particular range. This pattern might be influenced by variables such as the kinds of healthcare facilities, the duties of the jobs, or the state of the local economy.

Location Quotient Analysis

3. Conclusion: Compared to other places, the location with the lowest probability may have a reduced demand for RNs.

Hypothesis: The demand for registered nurses in this location may be lower than the national average if the location quotient is low. A more balanced healthcare workforce, a less densely populated area, or particular local healthcare dynamics that impact RN career chances are examples of potential contributing factors.

#Finding missing combinations, most and least combinations and visualisation

# Check for missing combinations: pair every observed location quotient with
# every observed average hourly wage and keep the pairs that never occur
# together in a row of `data`.
# NA is dropped from both value sets because a missing value is not an observed
# level. The observed pairs are removed with anti_join(), which compares the
# numeric columns directly instead of going through pasted strings.
missing_combinations <- expand.grid(
  Location_Quotient = unique(data$Location_Quotient[!is.na(data$Location_Quotient)]),
  Hourly_Wage_Avg = unique(data$Hourly_Wage_Avg[!is.na(data$Hourly_Wage_Avg)])
) %>%
  anti_join(
    distinct(data, Location_Quotient, Hourly_Wage_Avg),
    by = c("Location_Quotient", "Hourly_Wage_Avg")
  )

# Print a heading line for the sample of missing combinations; the sample and
# the total count are printed by the statements that follow.
cat("Missing Combinations (Sample):\n")
## Missing Combinations (Sample):
# Print only the first 5 rows of missing_combinations as a sample.
print(head(missing_combinations, 5))
##   Location_Quotient Hourly_Wage_Avg
## 1              0.98           28.96
## 2              0.91           28.96
## 3              1.00           28.96
## 4              0.87           28.96
## 5              0.95           28.96
# Report the number of rows in missing_combinations, i.e. how many
# combinations are missing in total.
cat("Total Missing Combinations: ", nrow(missing_combinations), "\n")
## Total Missing Combinations:  124760

Some combinations may be missing because a location could be a relevant factor yet have no registered nurses recorded for it, or because the data set was collected according to specific criteria (such as time periods or occupational categories), so records that did not meet those criteria were excluded.

# Most Common Combinations
# Count how often each (Location_Quotient, Hourly_Wage_Avg) pair occurs and
# keep the pair(s) with the highest count.
# - Rows with a missing value in either column are skipped: NA is not a value
#   combination.
# - .groups = "drop" returns an ungrouped result, so the maximum is taken over
#   all pairs rather than once per Location_Quotient group.
# - slice_max() keeps ties, so every pair sharing the top count is returned.
most_common_combinations <- data %>%
  filter(!is.na(Location_Quotient), !is.na(Hourly_Wage_Avg)) %>%
  group_by(Location_Quotient, Hourly_Wage_Avg) %>%
  summarise(count = n(), .groups = "drop") %>%
  slice_max(count, n = 1)
## `summarise()` has grouped output by 'Location_Quotient'. You can override using
## the `.groups` argument.
# Print a heading, preceded by a blank line, for the most common combinations.
cat("\nMost Common Combinations:\n")
## 
## Most Common Combinations:
# Print the most_common_combinations table.
print(most_common_combinations)
## # A tibble: 131 × 3
## # Groups:   Location_Quotient [131]
##    Location_Quotient Hourly_Wage_Avg count
##                <dbl>           <dbl> <int>
##  1             0.32             25.4     1
##  2             0.34             22.9     1
##  3             0.35             26.8     1
##  4             0.36             25.1     1
##  5             0.368            24.4     1
##  6             0.38             21.6     1
##  7             0.4              24.5     1
##  8             0.408            25.3     1
##  9             0.41             25.4     1
## 10             0.42             25.4     1
## # ℹ 121 more rows

The most common combination of the two variables could arise from a high need for nurses in a specific region, driven by population density, educational levels, or other workforce characteristics that influence the existence of such combinations.

# Least Common Combinations
# Count how often each (Location_Quotient, Hourly_Wage_Avg) pair occurs and
# keep the pair(s) with the lowest count.
# - Rows with a missing value in either column are skipped: NA is not a value
#   combination.
# - .groups = "drop" returns an ungrouped result, so the minimum is taken over
#   all pairs rather than once per Location_Quotient group.
# - slice_min() keeps ties, so every pair sharing the lowest count is returned.
least_common_combinations <- data %>%
  filter(!is.na(Location_Quotient), !is.na(Hourly_Wage_Avg)) %>%
  group_by(Location_Quotient, Hourly_Wage_Avg) %>%
  summarise(count = n(), .groups = "drop") %>%
  slice_min(count, n = 1)
## `summarise()` has grouped output by 'Location_Quotient'. You can override using
## the `.groups` argument.
# Print a heading, preceded by a blank line, for the least common combinations.
cat("\nLeast Common Combinations:\n")
## 
## Least Common Combinations:
# Print the least_common_combinations table.
print(least_common_combinations)
## # A tibble: 131 × 3
## # Groups:   Location_Quotient [131]
##    Location_Quotient Hourly_Wage_Avg count
##                <dbl>           <dbl> <int>
##  1             0.32             25.4     1
##  2             0.34             22.9     1
##  3             0.35             26.8     1
##  4             0.36             25.1     1
##  5             0.368            24.4     1
##  6             0.38             21.6     1
##  7             0.4              24.5     1
##  8             0.408            25.3     1
##  9             0.41             25.4     1
## 10             0.42             25.4     1
## # ℹ 121 more rows

The least common combination of the two variables could arise from a low demand for nurses in a specific region, driven by lower population density, lower educational levels, or other workforce characteristics that influence the absence of such combinations.

library(ggplot2)
# Scatter plot of the rows in most_common_combinations: location quotient on
# the x axis, average hourly wage on the y axis, point size scaled by count.
most_common_combinations %>%
  ggplot(
    mapping = aes(
      x = Location_Quotient,
      y = Hourly_Wage_Avg,
      size = count
    )
  ) +
  geom_point() +
  ggtitle("Most Common Combinations") +
  xlab("Location Quotient") +
  ylab("Hourly Wage Average") +
  labs(size = "Count")
## Warning: Removed 2 rows containing missing values (`geom_point()`).