knitr::opts_chunk$set(echo = TRUE)
# Install zoo if not already installed
if (!requireNamespace("zoo", quietly = TRUE)) {install.packages("zoo")}
if (!requireNamespace("tidyverse", quietly = TRUE)) {install.packages("tidyverse")}
if (!requireNamespace("skimr", quietly = TRUE)) install.packages("skimr")
if (!requireNamespace("corrplot", quietly = TRUE)) install.packages("corrplot")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(skimr) # For quick summary
library(corrplot) # For correlation heatmap
## corrplot 0.95 loaded
# stringr, readr, dplyr, and ggplot2 are already attached via the tidyverse,
# and readxl is attached above, so only tinytex remains to load
library(tinytex)
Data_1: Airline_Delay_Cause. The data come from https://www.kaggle.com/datasets/sriharshaeedala/airline-delay and cover the years 2015 through 2023 (see the summary below). We will analyze the data and try to find the reasons for poor performance. Lastly, we will save the data.
file <- 'https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/Airline_Delay_Cause%20-%20Cleaned.csv'
Airline_Delay_Cause <- read.csv(file)
head(Airline_Delay_Cause)
## year month carrier_name airport
## 1 2023 8 Endeavor Air Inc. ABE
## 2 2023 8 Endeavor Air Inc. ABY
## 3 2023 8 Endeavor Air Inc. AEX
## 4 2023 8 Endeavor Air Inc. AGS
## 5 2023 8 Endeavor Air Inc. ALB
## 6 2023 8 Endeavor Air Inc. ATL
## airport_name arr_flights
## 1 Allentown/Bethlehem/Easton, PA: Lehigh Valley International 89
## 2 Albany, GA: Southwest Georgia Regional 62
## 3 Alexandria, LA: Alexandria International 62
## 4 Augusta, GA: Augusta Regional at Bush Field 66
## 5 Albany, NY: Albany International 92
## 6 Atlanta, GA: Hartsfield-Jackson Atlanta International 1636
## arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct
## 1 13 2.25 1.60 3.16 0 5.99
## 2 10 1.97 0.04 0.57 0 7.42
## 3 10 2.73 1.18 1.80 0 4.28
## 4 12 3.69 2.27 4.47 0 1.57
## 5 22 7.76 0.00 2.96 0 11.28
## 6 256 55.98 27.81 63.64 0 108.57
## arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay
## 1 2 1 1375 71 761 118
## 2 0 1 799 218 1 62
## 3 1 0 766 56 188 78
## 4 1 1 1397 471 320 388
## 5 2 0 1530 628 0 134
## 6 32 11 29768 9339 4557 4676
## security_delay late_aircraft_delay
## 1 0 425
## 2 0 518
## 3 0 444
## 4 0 218
## 5 0 768
## 6 0 11196
# Check the structure of the dataset to see data types and sample values
str(Airline_Delay_Cause)
## 'data.frame': 150953 obs. of 20 variables:
## $ year : int 2023 2023 2023 2023 2023 2023 2023 2023 2023 2023 ...
## $ month : int 8 8 8 8 8 8 8 8 8 8 ...
## $ carrier_name : chr "Endeavor Air Inc." "Endeavor Air Inc." "Endeavor Air Inc." "Endeavor Air Inc." ...
## $ airport : chr "ABE" "ABY" "AEX" "AGS" ...
## $ airport_name : chr "Allentown/Bethlehem/Easton, PA: Lehigh Valley International" "Albany, GA: Southwest Georgia Regional" "Alexandria, LA: Alexandria International" "Augusta, GA: Augusta Regional at Bush Field" ...
## $ arr_flights : int 89 62 62 66 92 1636 75 59 62 30 ...
## $ arr_del15 : int 13 10 10 12 22 256 12 7 13 4 ...
## $ carrier_ct : num 2.25 1.97 2.73 3.69 7.76 ...
## $ weather_ct : num 1.6 0.04 1.18 2.27 0 ...
## $ nas_ct : num 3.16 0.57 1.8 4.47 2.96 ...
## $ security_ct : num 0 0 0 0 0 0 0 0 0 0 ...
## $ late_aircraft_ct : num 5.99 7.42 4.28 1.57 11.28 ...
## $ arr_cancelled : int 2 0 1 1 2 32 0 2 0 1 ...
## $ arr_diverted : int 1 1 0 1 0 11 0 0 0 0 ...
## $ arr_delay : int 1375 799 766 1397 1530 29768 843 324 707 1421 ...
## $ carrier_delay : int 71 218 56 471 628 9339 535 117 470 0 ...
## $ weather_delay : int 761 1 188 320 0 4557 170 0 77 532 ...
## $ nas_delay : int 118 62 78 388 134 4676 111 25 87 0 ...
## $ security_delay : int 0 0 0 0 0 0 0 0 0 0 ...
## $ late_aircraft_delay: int 425 518 444 218 768 11196 27 182 73 889 ...
# Check for missing (null) values in each column
null_counts <- sapply(Airline_Delay_Cause, function(x) sum(is.na(x)))
print(null_counts)
## year month carrier_name airport
## 0 0 0 0
## airport_name arr_flights arr_del15 carrier_ct
## 0 209 410 209
## weather_ct nas_ct security_ct late_aircraft_ct
## 209 209 209 209
## arr_cancelled arr_diverted arr_delay carrier_delay
## 209 209 209 209
## weather_delay nas_delay security_delay late_aircraft_delay
## 209 209 209 209
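Note that arr_del15 has 410 missing values while the other metrics have 209. Before imputing anything, a quick look at a few affected rows can hint at why; a minimal sketch:
# Inspect rows where arr_flights is missing; these often lack the delay metrics too
Airline_Delay_Cause %>%
  filter(is.na(arr_flights)) %>%
  select(year, month, carrier_name, airport, arr_flights, arr_del15) %>%
  head()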
# Get a summary of the data for a quick look at distributions and potential issues
summary(Airline_Delay_Cause)
## year month carrier_name airport
## Min. :2015 Min. : 1.000 Length:150953 Length:150953
## 1st Qu.:2017 1st Qu.: 3.000 Class :character Class :character
## Median :2019 Median : 6.000 Mode :character Mode :character
## Mean :2019 Mean : 6.336
## 3rd Qu.:2021 3rd Qu.: 9.000
## Max. :2023 Max. :12.000
##
## airport_name arr_flights arr_del15 carrier_ct
## Length:150953 Min. : 1.0 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 48.0 1st Qu.: 6.00 1st Qu.: 2.00
## Mode :character Median : 95.0 Median : 16.00 Median : 6.03
## Mean : 356.2 Mean : 64.04 Mean : 20.33
## 3rd Qu.: 243.0 3rd Qu.: 45.00 3rd Qu.: 16.50
## Max. :21977.0 Max. :4176.00 Max. :1293.91
## NA's :209 NA's :410 NA's :209
## weather_ct nas_ct security_ct late_aircraft_ct
## Min. : 0.000 Min. : 0.00 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.: 0.0000 1st Qu.: 1.02
## Median : 0.380 Median : 3.62 Median : 0.0000 Median : 4.67
## Mean : 2.244 Mean : 18.65 Mean : 0.1611 Mean : 22.58
## 3rd Qu.: 1.850 3rd Qu.: 11.00 3rd Qu.: 0.0000 3rd Qu.: 14.43
## Max. :266.420 Max. :1884.42 Max. :58.6900 Max. :2069.07
## NA's :209 NA's :209 NA's :209 NA's :209
## arr_cancelled arr_diverted arr_delay carrier_delay
## Min. : 0.000 Min. : 0.0000 Min. : 0 Min. : 0
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 316 1st Qu.: 103
## Median : 1.000 Median : 0.0000 Median : 980 Median : 361
## Mean : 7.502 Mean : 0.8519 Mean : 4182 Mean : 1439
## 3rd Qu.: 4.000 3rd Qu.: 1.0000 3rd Qu.: 2804 3rd Qu.: 1089
## Max. :4951.000 Max. :160.0000 Max. :438783 Max. :196944
## NA's :209 NA's :209 NA's :209 NA's :209
## weather_delay nas_delay security_delay late_aircraft_delay
## Min. : 0 Min. : 0 Min. : 0.000 Min. : 0
## 1st Qu.: 0 1st Qu.: 30 1st Qu.: 0.000 1st Qu.: 59
## Median : 17 Median : 136 Median : 0.000 Median : 302
## Mean : 227 Mean : 898 Mean : 7.619 Mean : 1609
## 3rd Qu.: 147 3rd Qu.: 451 3rd Qu.: 0.000 3rd Qu.: 1038
## Max. :31960 Max. :112018 Max. :3760.000 Max. :227959
## NA's :209 NA's :209 NA's :209 NA's :209
# Trim whitespace in character columns if necessary
Airline_Delay_Cause <- Airline_Delay_Cause %>%
dplyr::mutate(across(where(is.character), ~ trimws(.)))
Check the data:
head(Airline_Delay_Cause)
## year month carrier_name airport
## 1 2023 8 Endeavor Air Inc. ABE
## 2 2023 8 Endeavor Air Inc. ABY
## 3 2023 8 Endeavor Air Inc. AEX
## 4 2023 8 Endeavor Air Inc. AGS
## 5 2023 8 Endeavor Air Inc. ALB
## 6 2023 8 Endeavor Air Inc. ATL
## airport_name arr_flights
## 1 Allentown/Bethlehem/Easton, PA: Lehigh Valley International 89
## 2 Albany, GA: Southwest Georgia Regional 62
## 3 Alexandria, LA: Alexandria International 62
## 4 Augusta, GA: Augusta Regional at Bush Field 66
## 5 Albany, NY: Albany International 92
## 6 Atlanta, GA: Hartsfield-Jackson Atlanta International 1636
## arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct
## 1 13 2.25 1.60 3.16 0 5.99
## 2 10 1.97 0.04 0.57 0 7.42
## 3 10 2.73 1.18 1.80 0 4.28
## 4 12 3.69 2.27 4.47 0 1.57
## 5 22 7.76 0.00 2.96 0 11.28
## 6 256 55.98 27.81 63.64 0 108.57
## arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay
## 1 2 1 1375 71 761 118
## 2 0 1 799 218 1 62
## 3 1 0 766 56 188 78
## 4 1 1 1397 471 320 388
## 5 2 0 1530 628 0 134
## 6 32 11 29768 9339 4557 4676
## security_delay late_aircraft_delay
## 1 0 425
## 2 0 518
## 3 0 444
## 4 0 218
## 5 0 768
## 6 0 11196
# Create a date column from the "year" and "month" columns (day set to "01")
# sprintf() zero-pads the month to two digits (e.g., "01" for January)
Airline_Delay_Cause$date <- as.Date(with(Airline_Delay_Cause, paste(year, sprintf("%02d", month), "01", sep = "-")), format = "%Y-%m-%d")
# Arrange the data by date (optional, but helps with time series)
df <- Airline_Delay_Cause %>% arrange(date)
# 2. Fill missing values in 'arr_flights' with the median value
median_arr_flights <- median(df$arr_flights, na.rm = TRUE)
df$arr_flights[is.na(df$arr_flights)] <- median_arr_flights
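As an aside, lubridate (attached with the tidyverse above) provides make_date() for the same construction, and across() generalizes the imputation to any set of columns. A minimal sketch; df_alt is just an illustrative name:
# Same steps with make_date() and across(); df_alt mirrors df (sketch)
df_alt <- Airline_Delay_Cause %>%
  mutate(date = make_date(year, month, 1)) %>%
  arrange(date) %>%
  mutate(across(arr_flights, \(x) ifelse(is.na(x), median(x, na.rm = TRUE), x)))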
# 3. Plot the trend of 'arr_flights' over time using ggplot2
# (kept commented; the monthly aggregate below gives a clearer view)
#ggplot(df, aes(x = date, y = arr_flights)) +
#  geom_line(color = "blue", linewidth = 1) +
#  geom_point(color = "green", size = 0.5) +
#  geom_smooth(method = "loess", se = FALSE, color = "red") +
#  labs(title = "Trend of Arrival Flights Over Time",
#       x = "Time",
#       y = "Number of Arrival Flights") +
#  theme_minimal()
## Insight: arrival flight volumes climb steadily after 2018, with a sharp dip
## around 2020, likely due to the pandemic's effect on flight operations.
# Aggregate by month: sum 'arr_flights' for each month
monthly_df <- df %>%
  group_by(year = year(date), month = month(date)) %>%
  # total arriving flights per month; .groups = "drop" returns an ungrouped result
  summarise(arr_flights = sum(arr_flights, na.rm = TRUE), .groups = "drop") %>%
  # convert the grouping back into a proper date column
  mutate(date = as.Date(paste(year, month, "01", sep = "-")))
# Plot the aggregated monthly sum of arrival flights over time
ggplot(monthly_df, aes(x = date, y = arr_flights)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 1) +
  labs(title = "Monthly Sum of Arrival Flights Over Time",
       x = "Time",
       y = "Number of Arrival Flights") +
  theme_minimal()
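zoo was attached at the top but has not been used yet; a 12-month rolling mean smooths the seasonal swings in this series. A minimal sketch:
# Overlay a centered 12-month rolling mean on the monthly series (sketch)
monthly_df %>%
  mutate(roll12 = zoo::rollmean(arr_flights, k = 12, fill = NA, align = "center")) %>%
  ggplot(aes(x = date)) +
  geom_line(aes(y = arr_flights), color = "grey60") +
  geom_line(aes(y = roll12), color = "blue", linewidth = 1, na.rm = TRUE) +
  labs(title = "Monthly Arrival Flights with 12-Month Rolling Mean",
       x = "Time", y = "Number of Arrival Flights") +
  theme_minimal()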
# Calculate the average arrival delay per carrier
average_delay_per_carrier <- df %>%
group_by(carrier_name) %>%
summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(avg_arr_delay) # Sort in ascending order
# Print the result
print(average_delay_per_carrier)
## # A tibble: 21 × 2
## carrier_name avg_arr_delay
## <chr> <dbl>
## 1 Horizon Air 1257.
## 2 Allegiant Air 1309.
## 3 ExpressJet Airlines LLC 1746.
## 4 Envoy Air 1838.
## 5 Alaska Airlines Inc. 1928.
## 6 Hawaiian Airlines Inc. 1933.
## 7 Endeavor Air Inc. 2011.
## 8 Mesa Airlines Inc. 2138.
## 9 Frontier Airlines Inc. 2320.
## 10 PSA Airlines Inc. 2589.
## # ℹ 11 more rows
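One caveat: mean arr_delay per row rewards carriers that fly few or small routes. Normalizing total delay minutes by total arriving flights gives a fairer per-flight figure; a minimal sketch:
# Average delay minutes per arriving flight, by carrier (sketch)
df %>%
  group_by(carrier_name) %>%
  summarise(total_delay = sum(arr_delay, na.rm = TRUE),
            total_flights = sum(arr_flights, na.rm = TRUE),
            delay_per_flight = total_delay / total_flights,
            .groups = "drop") %>%
  arrange(desc(delay_per_flight)) %>%
  head(5)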
library(dplyr)
library(ggplot2)
# Filter data for the year 2023 and airport ABE
df_2023_ABE <- df %>%
  filter(year == 2023, airport == "ABE")
# Summarize data by carrier_name
df_summary_2023_ABE <- df_2023_ABE %>%
group_by(carrier_name) %>%
summarise(total_delays = sum(arr_del15, na.rm = TRUE)) %>%
ungroup()
# Create a bar chart
ggplot(df_summary_2023_ABE, aes(x = carrier_name, y = total_delays, fill = carrier_name)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Total Delays by Carrier at ABE Airport in 2023",
x = "Carrier Name",
y = "Total Delays",
fill = "Carrier Name")
# Insight: Allegiant Air has the most delays at ABE airport.
# Calculate total delays and percentage of each delay type
library(ggplot2)
library(dplyr)
library(tidyr)
# Calculate total delays, percentage of each delay type, and frequency
df_delay_analysis <- df %>%
summarise(
total_delay = sum(arr_delay, na.rm = TRUE),
freq_carrier_delay = sum(carrier_ct, na.rm = TRUE),
freq_weather_delay = sum(weather_ct, na.rm = TRUE),
freq_nas_delay = sum(nas_ct, na.rm = TRUE),
freq_security_delay = sum(security_ct, na.rm = TRUE),
freq_late_aircraft_delay = sum(late_aircraft_ct, na.rm = TRUE),
percent_carrier_delay = sum(carrier_delay, na.rm = TRUE) / total_delay * 100,
percent_weather_delay = sum(weather_delay, na.rm = TRUE) / total_delay * 100,
percent_nas_delay = sum(nas_delay, na.rm = TRUE) / total_delay * 100,
percent_security_delay = sum(security_delay, na.rm = TRUE) / total_delay * 100,
percent_late_aircraft_delay = sum(late_aircraft_delay, na.rm = TRUE) / total_delay * 100
)
# Reshape the data for plotting
df_delay_analysis_long <- df_delay_analysis %>%
pivot_longer(cols = -total_delay, names_to = "metric", values_to = "value") %>%
separate(metric, into = c("type", "delay_type"), sep = "_", extra = "merge") %>%
pivot_wider(names_from = type, values_from = value)
# Plot total delay, percentage delays, and frequency with connected dots
ggplot(df_delay_analysis_long, aes(x = delay_type)) +
  geom_bar(aes(y = percent, fill = delay_type), stat = "identity", position = "dodge") +
  geom_point(aes(y = freq / max(freq) * 100), color = "red", size = 3) +
  geom_line(aes(y = freq / max(freq) * 100, group = 1), color = "red") +
  labs(title = paste("Total Delay:", df_delay_analysis$total_delay, "minutes"),
       x = "Delay Type", y = "Percentage of Total Delay / Frequency (scaled)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
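Since arr_delay is the sum of the five component delays, the five percentages should add up to roughly 100; a quick sanity check (sketch):
# The component percentages should total about 100
with(df_delay_analysis,
     percent_carrier_delay + percent_weather_delay + percent_nas_delay +
       percent_security_delay + percent_late_aircraft_delay)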
Insight: 1) Carrier and late-aircraft delays are the most significant contributors. 2) Weather delays show substantial variability, driven by seasonal conditions rather than flight-volume trends. 3) NAS delays are of moderate significance. 4) Security delays are generally negligible.
Long data: reshape the delay columns into long format.
library(dplyr)
library(ggplot2)
library(tidyr) # already attached via the tidyverse
# Transform the dataset into a long format for numeric delay columns
data_long <- df %>%
pivot_longer(cols = c(carrier_delay, weather_delay, nas_delay, security_delay, late_aircraft_delay),
names_to = "delay_type",
values_to = "delay_value")
# Print a small part of the transformed dataset
print(head(data_long))
## # A tibble: 6 × 18
## year month carrier_name airport airport_name arr_flights arr_del15 carrier_ct
## <int> <int> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 2015 1 American Ai… JFK New York, N… 1369 322 73.3
## 2 2015 1 American Ai… JFK New York, N… 1369 322 73.3
## 3 2015 1 American Ai… JFK New York, N… 1369 322 73.3
## 4 2015 1 American Ai… JFK New York, N… 1369 322 73.3
## 5 2015 1 American Ai… JFK New York, N… 1369 322 73.3
## 6 2015 1 American Ai… LAX Los Angeles… 2633 445 157.
## # ℹ 10 more variables: weather_ct <dbl>, nas_ct <dbl>, security_ct <dbl>,
## # late_aircraft_ct <dbl>, arr_cancelled <int>, arr_diverted <int>,
## # arr_delay <int>, date <date>, delay_type <chr>, delay_value <int>
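With the delays in long format, their distributions by cause are easy to compare; a minimal sketch (the log scale tames the heavy right tail, and zero-minute rows are dropped):
# Distribution of delay minutes by cause, log scale (sketch)
data_long %>%
  filter(delay_value > 0) %>%
  ggplot(aes(x = delay_type, y = delay_value, fill = delay_type)) +
  geom_boxplot() +
  scale_y_log10() +
  labs(title = "Delay Minutes by Cause", x = "Delay Type", y = "Delay (minutes, log scale)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")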
library(ggplot2)
if(!require('DataExplorer')) {
install.packages('DataExplorer')
library('DataExplorer')
}
## Loading required package: DataExplorer
plot_bar(data_long)
## 3 columns ignored with more than 50 categories.
## airport: 389 categories
## airport_name: 413 categories
## date: 104 categories
Insight: SkyWest Airlines has the most delay minutes (a numeric check follows below). To find the reasons behind SkyWest's poor performance, let's isolate its data, and separately the data for year 2023.
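A quick numeric check of that reading (sketch):
# Total arrival-delay minutes per carrier, top 5 (sketch)
df %>%
  group_by(carrier_name) %>%
  summarise(total_delay = sum(arr_delay, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_delay)) %>%
  head(5)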
# Filter the data frame to keep only SkyWest Airlines Inc. (all years)
df_2023_skywest <- df[df$carrier_name == "SkyWest Airlines Inc.", ]
# Display the first few rows of the filtered data frame
head(df_2023_skywest)
## year month carrier_name airport
## 744 2015 1 SkyWest Airlines Inc. RNO
## 745 2015 1 SkyWest Airlines Inc. LAX
## 746 2015 1 SkyWest Airlines Inc. SAN
## 747 2015 1 SkyWest Airlines Inc. CLD
## 748 2015 1 SkyWest Airlines Inc. LAR
## 749 2015 1 SkyWest Airlines Inc. SFO
## airport_name arr_flights arr_del15
## 744 Reno, NV: Reno/Tahoe International 257 45
## 745 Los Angeles, CA: Los Angeles International 4188 819
## 746 San Diego, CA: San Diego International 716 126
## 747 Carlsbad, CA: McClellan-Palomar 204 27
## 748 Laramie, WY: Laramie Regional 62 13
## 749 San Francisco, CA: San Francisco International 3567 977
## carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled
## 744 14.37 0.95 7.80 0.81 21.06 3
## 745 128.50 13.46 242.28 0.82 433.93 50
## 746 42.19 1.00 19.47 0.00 63.34 5
## 747 8.15 0.00 7.14 0.00 11.70 4
## 748 5.63 0.45 3.56 0.00 3.36 1
## 749 91.52 6.22 384.21 0.00 495.04 132
## arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay
## 744 0 3084 1107 21 295 35
## 745 2 46448 10896 1100 7322 18
## 746 0 6387 2052 86 624 0
## 747 1 1016 424 0 189 0
## 748 1 779 387 22 179 0
## 749 21 68807 8496 363 24530 0
## late_aircraft_delay date
## 744 1626 2015-01-01
## 745 27112 2015-01-01
## 746 3625 2015-01-01
## 747 403 2015-01-01
## 748 191 2015-01-01
## 749 35418 2015-01-01
# Filter the data frame to keep only the rows for the year 2023
df_2023 <- df[df$year == 2023, ]
# Display the first few rows of the filtered data frame
head(df_2023)
## year month carrier_name airport
## 138581 2023 1 Endeavor Air Inc. ABE
## 138582 2023 1 Endeavor Air Inc. ABY
## 138583 2023 1 Endeavor Air Inc. AEX
## 138584 2023 1 Endeavor Air Inc. AGS
## 138585 2023 1 Endeavor Air Inc. ALB
## 138586 2023 1 Endeavor Air Inc. ATL
## airport_name arr_flights
## 138581 Allentown/Bethlehem/Easton, PA: Lehigh Valley International 14
## 138582 Albany, GA: Southwest Georgia Regional 82
## 138583 Alexandria, LA: Alexandria International 60
## 138584 Augusta, GA: Augusta Regional at Bush Field 26
## 138585 Albany, NY: Albany International 109
## 138586 Atlanta, GA: Hartsfield-Jackson Atlanta International 2095
## arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct
## 138581 1 0.00 1.00 0.00 0 0.00
## 138582 10 6.06 1.72 0.47 0 1.74
## 138583 8 3.54 1.59 2.71 0 0.16
## 138584 6 3.40 1.20 0.64 0 0.76
## 138585 30 8.08 0.61 9.80 0 11.52
## 138586 367 68.04 24.22 99.71 0 175.03
## arr_cancelled arr_diverted arr_delay carrier_delay weather_delay
## 138581 0 0 648 0 647
## 138582 0 0 1800 1393 272
## 138583 2 4 484 176 184
## 138584 0 0 395 119 167
## 138585 3 0 1591 402 22
## 138586 13 3 35193 9465 3036
## nas_delay security_delay late_aircraft_delay date
## 138581 1 0 0 2023-01-01
## 138582 56 0 79 2023-01-01
## 138583 113 0 11 2023-01-01
## 138584 30 0 79 2023-01-01
## 138585 333 0 834 2023-01-01
## 138586 4307 0 18385 2023-01-01
# Filter the data frame to exclude SkyWest Airlines Inc.
df_2023_no_skywest <- df[df$carrier_name != "SkyWest Airlines Inc.", ]
# Display the first few rows of the filtered data frame
head(df_2023_no_skywest)
## year month carrier_name airport
## 1 2015 1 American Airlines Inc. JFK
## 2 2015 1 American Airlines Inc. LAX
## 3 2015 1 American Airlines Inc. DFW
## 4 2015 1 American Airlines Inc. OGG
## 5 2015 1 American Airlines Inc. HNL
## 6 2015 1 American Airlines Inc. SFO
## airport_name arr_flights arr_del15
## 1 New York, NY: John F. Kennedy International 1369 322
## 2 Los Angeles, CA: Los Angeles International 2633 445
## 3 Dallas/Fort Worth, TX: Dallas/Fort Worth International 12466 2463
## 4 Kahului, HI: Kahului Airport 100 22
## 5 Honolulu, HI: Daniel K Inouye International 169 50
## 6 San Francisco, CA: San Francisco International 876 200
## carrier_ct weather_ct nas_ct security_ct late_aircraft_ct arr_cancelled
## 1 73.31 8.44 136.78 0.00 103.47 86
## 2 157.17 25.21 107.82 1.36 153.43 41
## 3 645.29 64.66 765.92 4.14 982.99 203
## 4 11.53 0.00 6.48 0.00 4.00 3
## 5 28.69 0.00 15.04 0.00 6.27 0
## 6 59.98 10.44 73.98 0.14 55.46 15
## arr_diverted arr_delay carrier_delay weather_delay nas_delay security_delay
## 1 3 20055 5273 999 6358 0
## 2 4 25261 10914 1460 3293 42
## 3 6 167313 66714 5055 24137 123
## 4 0 1776 1207 0 188 0
## 5 4 4175 2602 0 523 0
## 6 0 11993 3108 761 3394 21
## late_aircraft_delay date
## 1 7425 2015-01-01
## 2 9552 2015-01-01
## 3 71284 2015-01-01
## 4 381 2015-01-01
## 5 1050 2015-01-01
## 6 4709 2015-01-01
# Calculate medians for df_2023_no_skywest
medians_no_skywest <- df_2023_no_skywest %>%
  summarise(across(c(arr_flights, arr_del15, carrier_ct, weather_ct,
                     nas_ct, security_ct, late_aircraft_ct, arr_cancelled,
                     arr_diverted, arr_delay, carrier_delay, weather_delay,
                     nas_delay, security_delay, late_aircraft_delay),
                   \(x) median(x, na.rm = TRUE)))
# Calculate medians for df_2023_skywest
medians_skywest <- df_2023_skywest %>%
  summarise(across(c(arr_flights, arr_del15, carrier_ct, weather_ct,
                     nas_ct, security_ct, late_aircraft_ct, arr_cancelled,
                     arr_diverted, arr_delay, carrier_delay, weather_delay,
                     nas_delay, security_delay, late_aircraft_delay),
                   \(x) median(x, na.rm = TRUE)))
# Combine medians into a single data frame for plotting
medians_combined <- bind_rows(
mutate(medians_no_skywest, Carrier = "No SkyWest"),
mutate(medians_skywest, Carrier = "SkyWest")
)
# Convert to long format for ggplot
medians_long <- pivot_longer(medians_combined, -Carrier, names_to = "Variable", values_to = "Median")
# Plot the medians using ggplot2
ggplot(medians_long, aes(x = Variable, y = Median, fill = Carrier)) +
geom_bar(stat = "identity", position = "dodge") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Comparison of Medians: No SkyWest vs SkyWest (2023)", y = "Median Value")
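The medians above compare raw counts, which partly reflect how much each group flies. Normalizing delay minutes by arriving flights puts both groups on the same footing; a minimal sketch:
# Delay minutes per arriving flight: SkyWest vs all other carriers (sketch)
df %>%
  mutate(Carrier = ifelse(carrier_name == "SkyWest Airlines Inc.", "SkyWest", "No SkyWest")) %>%
  group_by(Carrier) %>%
  summarise(delay_per_flight = sum(arr_delay, na.rm = TRUE) / sum(arr_flights, na.rm = TRUE),
            .groups = "drop")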
Conclusion: We compared SkyWest's data against the medians of all other airlines. The graph shows that arrival delay and carrier delay are the main drivers of SkyWest's poor performance; on the remaining measures the airline is broadly similar to its peers.
# Save the long airline data (machine-specific path, kept commented)
#path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_airline_delay.csv"
#write.csv(data_long, path1)
Data_2: Cheese data. The cheese data come from Discussion 05. We will analyze the data and try to find correlations. Lastly we will save the data.
file <- 'https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/cheeses.csv'
data <- read.csv(file)
cheese_df <- data.frame(data)
head(cheese_df)
## cheese url milk
## 1 Aarewasser https://www.cheese.com/aarewasser/ cow
## 2 Abbaye de Belloc https://www.cheese.com/abbaye-de-belloc/ sheep
## 3 Abbaye de Belval https://www.cheese.com/abbaye-de-belval/ cow
## 4 Abbaye de Citeaux https://www.cheese.com/abbaye-de-citeaux/ cow
## 5 Abbaye de Tamié https://www.cheese.com/tamie/ cow
## 6 Abbaye de Timadeuc https://www.cheese.com/abbaye-de-timadeuc/ cow
## country region family type
## 1 Switzerland <NA> <NA> semi-soft
## 2 France Pays Basque <NA> semi-hard, artisan
## 3 France <NA> <NA> semi-hard
## 4 France Burgundy <NA> semi-soft, artisan, brined
## 5 France Savoie <NA> soft, artisan
## 6 France province of Brittany <NA> semi-hard
## fat_content calcium_content texture rind color
## 1 <NA> <NA> buttery washed yellow
## 2 <NA> <NA> creamy, dense, firm natural yellow
## 3 40-46% <NA> elastic washed ivory
## 4 <NA> <NA> creamy, dense, smooth washed white
## 5 <NA> <NA> creamy, open, smooth washed white
## 6 <NA> <NA> soft washed pale yellow
## flavor aroma vegetarian vegan
## 1 sweet buttery FALSE FALSE
## 2 burnt caramel lanoline TRUE FALSE
## 3 <NA> aromatic FALSE FALSE
## 4 acidic, milky, smooth barnyardy, earthy FALSE FALSE
## 5 fruity, nutty perfumed, pungent FALSE FALSE
## 6 salty, smooth nutty FALSE FALSE
## synonyms alt_spellings
## 1 <NA> <NA>
## 2 Abbaye Notre-Dame de Belloc <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> Tamié, Trappiste de Tamie, Abbey of Tamie
## 6 <NA> <NA>
## producers
## 1 Jumi
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 Abbaye Cistercienne NOTRE-DAME DE TIMADEUC
# Check column names and structure
str(cheese_df)
## 'data.frame': 1187 obs. of 19 variables:
## $ cheese : chr "Aarewasser" "Abbaye de Belloc" "Abbaye de Belval" "Abbaye de Citeaux" ...
## $ url : chr "https://www.cheese.com/aarewasser/" "https://www.cheese.com/abbaye-de-belloc/" "https://www.cheese.com/abbaye-de-belval/" "https://www.cheese.com/abbaye-de-citeaux/" ...
## $ milk : chr "cow" "sheep" "cow" "cow" ...
## $ country : chr "Switzerland" "France" "France" "France" ...
## $ region : chr NA "Pays Basque" NA "Burgundy" ...
## $ family : chr NA NA NA NA ...
## $ type : chr "semi-soft" "semi-hard, artisan" "semi-hard" "semi-soft, artisan, brined" ...
## $ fat_content : chr NA NA "40-46%" NA ...
## $ calcium_content: chr NA NA NA NA ...
## $ texture : chr "buttery" "creamy, dense, firm" "elastic" "creamy, dense, smooth" ...
## $ rind : chr "washed" "natural" "washed" "washed" ...
## $ color : chr "yellow" "yellow" "ivory" "white" ...
## $ flavor : chr "sweet" "burnt caramel" NA "acidic, milky, smooth" ...
## $ aroma : chr "buttery" "lanoline" "aromatic" "barnyardy, earthy" ...
## $ vegetarian : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ vegan : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ synonyms : chr NA "Abbaye Notre-Dame de Belloc" NA NA ...
## $ alt_spellings : chr NA NA NA NA ...
## $ producers : chr "Jumi" NA NA NA ...
# Summary of all columns (most are character, so summaries show class and mode)
summary(cheese_df)
## cheese url milk country
## Length:1187 Length:1187 Length:1187 Length:1187
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
## region family type fat_content
## Length:1187 Length:1187 Length:1187 Length:1187
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
## calcium_content texture rind color
## Length:1187 Length:1187 Length:1187 Length:1187
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
## flavor aroma vegetarian vegan
## Length:1187 Length:1187 Mode :logical Mode :logical
## Class :character Class :character FALSE:386 FALSE:742
## Mode :character Mode :character TRUE :362 TRUE :6
## NA's :439 NA's :439
## synonyms alt_spellings producers
## Length:1187 Length:1187 Length:1187
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
# Check for missing values
colSums(is.na(cheese_df))
## cheese url milk country region
## 0 0 0 0 332
## family type fat_content calcium_content texture
## 698 13 939 1162 0
## rind color flavor aroma vegetarian
## 242 142 98 258 439
## vegan synonyms alt_spellings producers
## 439 893 1078 400
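DataExplorer (attached in the airline section above) can draw the same missingness profile; a one-line sketch:
# Visual profile of missing values per column (sketch)
DataExplorer::plot_missing(cheese_df)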
# Convert categorical columns to factors
cheese_df <- cheese_df %>%
mutate(across(where(is.character), as.factor))
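skimr was loaded at the top for exactly this kind of overview; after the factor conversion, skim() also reports the number of levels per factor. A minimal sketch:
# Compact per-column summary, including factor level counts (sketch)
skimr::skim(cheese_df)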
library(ggplot2)
library(dplyr)
# Clean and prepare the data: strip "%" and " g/100g" units, coerce to numeric
df_clean <- data %>%
mutate(fat_content = as.numeric(gsub(" g/100g|%", "", fat_content))) %>%
filter(!is.na(fat_content))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `fat_content = as.numeric(gsub(" g/100g|%", "", fat_content))`.
## Caused by warning:
## ! NAs introduced by coercion
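The coercion warning above means some fat_content strings match neither pattern; range values such as "40-46%" (seen in the head() output earlier) become NA and are then filtered out. A sketch listing the offenders:
# Show distinct fat_content values that fail to parse to a single number (sketch)
suppressWarnings(
  data %>%
    filter(!is.na(fat_content) &
             is.na(as.numeric(gsub(" g/100g|%", "", fat_content)))) %>%
    distinct(fat_content) %>%
    head()
)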
# Plotting milk type vs fat content
ggplot(df_clean, aes(x = milk, y = fat_content, fill = milk)) +
geom_boxplot() +
labs(title = "Fat Content by Milk Type", x = "Milk Type", y = "Fat Content (%)") +
theme_minimal() +
theme(legend.position = "none")
library(ggplot2)
library(dplyr)
# Group by color and milk, filter out NA values, and summarize the count of cheeses
df_color_milk_summary <- df_clean %>%
  filter(!is.na(color) & !is.na(milk)) %>%
  group_by(color, milk) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))
# Plotting the count of cheeses by color and milk type
ggplot(df_color_milk_summary, aes(x = reorder(color, -count), y = count, fill = milk)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Count of Cheeses by Color and Milk Type", x = "Color", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(dplyr)
# Filter for rows where milk includes 'cow', 'sheep', or 'goat', group by milk, and count cheeses
df_selected_milk_summary <- df_clean %>%
filter(grepl("cow|sheep|goat", milk, ignore.case = TRUE) & !is.na(milk)) %>%
group_by(milk) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Plotting the count of cheeses by milk type with numbers and connecting lines
ggplot(df_selected_milk_summary, aes(x = reorder(milk, -count), y = count, group = 1)) +
geom_bar(stat = "identity", aes(fill = milk), position = "dodge") +
geom_text(aes(label = count), vjust = -0.5) +
geom_line() +
geom_point() +
labs(title = "Count of Cheeses by Milk Type (Cow, Sheep, Goat)", x = "Milk Type", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
head(df_clean)
## cheese url milk
## 1 Abbaye du Mont des Cats https://www.cheese.com/abbaye-du-mont-des-cats/ cow
## 2 Abertam https://www.cheese.com/abertam/ sheep
## 3 Acorn https://www.cheese.com/acorn/ sheep
## 4 Adelost https://www.cheese.com/adelost/ cow
## 5 ADL Brick Cheese https://www.cheese.com/adl-brick-cheese/ cow
## 6 ADL Mild Cheddar https://www.cheese.com/adl-mild-cheddar/ cow
## country region family type
## 1 France Nord-Pas-de-Calais <NA> semi-soft, artisan, brined
## 2 Czech Republic Karlovy Vary <NA> hard, artisan
## 3 United Kingdom Bethania <NA> hard, artisan
## 4 Sweden <NA> Blue semi-soft, blue-veined
## 5 Canada Prince Edward Island Cheddar semi-soft
## 6 Canada Prince Edward Island Cheddar semi-hard
## fat_content calcium_content texture rind color
## 1 50 <NA> smooth, supple washed pale yellow
## 2 45 <NA> firm natural pale yellow
## 3 52 <NA> crumbly, firm <NA> <NA>
## 4 50 <NA> creamy natural blue
## 5 12 <NA> elastic, firm, open, soft rindless ivory
## 6 14 <NA> firm, springy rindless yellow
## flavor aroma vegetarian vegan synonyms
## 1 milky, salty floral FALSE FALSE <NA>
## 2 acidic, strong, tangy <NA> FALSE FALSE <NA>
## 3 burnt caramel, citrusy, herbaceous fruity TRUE FALSE <NA>
## 4 salty, sharp, tangy strong NA NA <NA>
## 5 buttery, mild, milky, subtle buttery, sweet NA NA <NA>
## 6 acidic, buttery, milky, subtle <NA> NA NA <NA>
## alt_spellings producers
## 1 <NA> Abbaye du Mont des Cats
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> ADL - Amalgamated Dairies Limited
## 6 <NA> ADL - Amalgamated Dairies Limited
# fat_content was already converted to numeric during cleaning; this re-coercion is a no-op
df_clean$fat_content <- as.numeric(gsub("%", "", df_clean$fat_content))
# Separate data for United States
usa_df <- df_clean[df_clean$country == "United States", ]
head(usa_df)
## cheese url milk country
## 36 Baby Swiss https://www.cheese.com/baby-swiss/ cow United States
## 53 Bijou https://www.cheese.com/bijou/ goat United States
## 62 Bonne Bouche https://www.cheese.com/bonne-bouche/ goat United States
## 68 Breakfast Cheese https://www.cheese.com/breakfast-cheese/ cow United States
## 75 Bucheret https://www.cheese.com/bucheret/ goat United States
## 78 Buttermilk Blue https://www.cheese.com/buttermilk-blue/ cow United States
## region family type fat_content
## 36 Charm, Ohio Swiss Cheese semi-soft, processed 43
## 53 Websterville, VT <NA> semi-soft, artisan 11
## 62 Vermont <NA> soft 21
## 68 California <NA> fresh firm, soft-ripened 7
## 75 California Brie soft, artisan, soft-ripened 10
## 78 Wisconsin <NA> semi-hard, blue-veined 8
## calcium_content texture rind
## 36 <NA> creamy, open, smooth rindless
## 53 <NA> creamy, smooth mold ripened
## 62 <NA> creamy, fluffy, smooth mold ripened
## 68 90 mg/100g dense, firm rindless
## 75 <NA> buttery, chalky, dense, smooth, soft-ripened bloomy
## 78 <NA> creamy, crumbly natural
## color flavor aroma vegetarian vegan
## 36 pale yellow nutty, sharp, sweet <NA> FALSE FALSE
## 53 <NA> sharp, sweet, tangy, yeasty fresh, yeasty NA NA
## 62 ivory citrusy, grassy yeasty TRUE FALSE
## 68 white citrusy, tangy fresh TRUE FALSE
## 75 white buttery, mushroomy, nutty, tangy rich TRUE FALSE
## 78 pale yellow piquant, tangy fresh NA NA
## synonyms alt_spellings producers
## 36 Lacy cheese <NA> Guggisberg Cheese
## 53 <NA> <NA> Vermont Creamery
## 62 <NA> <NA> Vermont Creamery
## 68 Petite Breakfast <NA> Marin French Cheeese Co.
## 75 <NA> <NA> Redwood Hill Farm & Creamery
## 78 <NA> <NA> Emmi Roth USA
# Remove United States data from the original data frame
df_filtered <- df_clean[df_clean$country != "United States", ]
head(df_filtered)
## cheese url milk
## 1 Abbaye du Mont des Cats https://www.cheese.com/abbaye-du-mont-des-cats/ cow
## 2 Abertam https://www.cheese.com/abertam/ sheep
## 3 Acorn https://www.cheese.com/acorn/ sheep
## 4 Adelost https://www.cheese.com/adelost/ cow
## 5 ADL Brick Cheese https://www.cheese.com/adl-brick-cheese/ cow
## 6 ADL Mild Cheddar https://www.cheese.com/adl-mild-cheddar/ cow
## country region family type
## 1 France Nord-Pas-de-Calais <NA> semi-soft, artisan, brined
## 2 Czech Republic Karlovy Vary <NA> hard, artisan
## 3 United Kingdom Bethania <NA> hard, artisan
## 4 Sweden <NA> Blue semi-soft, blue-veined
## 5 Canada Prince Edward Island Cheddar semi-soft
## 6 Canada Prince Edward Island Cheddar semi-hard
## fat_content calcium_content texture rind color
## 1 50 <NA> smooth, supple washed pale yellow
## 2 45 <NA> firm natural pale yellow
## 3 52 <NA> crumbly, firm <NA> <NA>
## 4 50 <NA> creamy natural blue
## 5 12 <NA> elastic, firm, open, soft rindless ivory
## 6 14 <NA> firm, springy rindless yellow
## flavor aroma vegetarian vegan synonyms
## 1 milky, salty floral FALSE FALSE <NA>
## 2 acidic, strong, tangy <NA> FALSE FALSE <NA>
## 3 burnt caramel, citrusy, herbaceous fruity TRUE FALSE <NA>
## 4 salty, sharp, tangy strong NA NA <NA>
## 5 buttery, mild, milky, subtle buttery, sweet NA NA <NA>
## 6 acidic, buttery, milky, subtle <NA> NA NA <NA>
## alt_spellings producers
## 1 <NA> Abbaye du Mont des Cats
## 2 <NA> <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> ADL - Amalgamated Dairies Limited
## 6 <NA> ADL - Amalgamated Dairies Limited
# fat_content is already numeric here; kept for safety, this line changes nothing
df_clean$fat_content <- as.numeric(gsub("%", "", df_clean$fat_content))
# Calculate the mean fat content for USA and other countries
mean_fat_content_consumption <- data.frame(
Country = c("USA", "Other"),
MeanFatContent = c(
mean(df_clean[df_clean$country == "United States", ]$fat_content, na.rm = TRUE),
mean(df_clean[df_clean$country != "United States", ]$fat_content, na.rm = TRUE)
)
)
# Display the mean fat content consumption
print(mean_fat_content_consumption)
## Country MeanFatContent
## 1 USA 18.25333
## 2 Other 38.78750
# Plot the mean fat content consumption comparison with numbers in the plot
library(ggplot2)
ggplot(mean_fat_content_consumption, aes(x = Country, y = MeanFatContent, fill = Country)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(MeanFatContent, 2)), vjust = -0.5) +
labs(title = "Mean Fat Content Consumption: USA vs Other Countries", x = "Country", y = "Mean Fat Content") +
theme_minimal()
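The difference in means looks large; a Welch two-sample t-test is one way to gauge whether it is more than noise. A minimal sketch on the cleaned values:
# Welch t-test: fat content of USA cheeses vs all other countries (sketch)
usa_fat <- df_clean$fat_content[df_clean$country == "United States"]
other_fat <- df_clean$fat_content[df_clean$country != "United States"]
t.test(usa_fat, other_fat)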
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Analyze the most consumed cheese by country and select top 5 countries
top_countries_cheese <- df_clean %>%
group_by(country) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
slice_head(n = 5)
# Plot the most consumed cheese by top 5 countries using ggplot2 with numbers on bars
ggplot(top_countries_cheese, aes(x = reorder(country, -count), y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5) +
coord_flip() +
labs(title = "Most Consumed Cheese by Top 5 Countries", x = "Country", y = "Count") +
theme_minimal()
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Count the 15 most frequently listed cheeses for the USA
top_usa_cheese <- df_clean %>%
filter(country == "United States") %>%
group_by(cheese) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
slice_head(n = 15)
# Plot the top 15 cheese consumption in the USA using ggplot2
ggplot(top_usa_cheese, aes(x = reorder(cheese, -count), y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5) +
coord_flip() +
labs(title = "Top 15 Cheese Consumption in the USA", x = "Cheese", y = "Count") +
theme_minimal()
library(dplyr)
library(ggplot2)
# Filter for cheeses from the USA with non-missing fat content
usa_cheese <- df_clean %>%
filter(country == "United States" & !is.na(fat_content))
# Convert fat_content to numeric if it's not already
usa_cheese$fat_content <- as.numeric(sub("%", "", usa_cheese$fat_content))
# Summarize the data to get the number of occurrences of each cheese
cheese_summary <- usa_cheese %>%
group_by(producers) %>%
summarise(count = n(), avg_fat_content = mean(fat_content, na.rm = TRUE)) %>%
arrange(desc(count))
# Select the top 15 most consumed cheeses
top_15_cheeses <- cheese_summary %>%
top_n(15, count)
# Plot the data
ggplot(top_15_cheeses, aes(x = reorder(producers, -count), y = avg_fat_content)) +
geom_bar(stat = "identity") +
labs(title = "Top 15 Most Consumed Cheeses in the USA with Fat Content",
x = "producers",
y = "Average Fat Content (%)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Since cheeses from Guggisberg Cheese, Egg Farm Dairy, and Shelburne Farms have much higher fat content (well above the USA mean of 18.25%) than those of other producers, it may be that Americans are less fond of these producers' cheeses. Let's see what these producers' cheeses have in common.
# Filter for cheeses produced by Guggisberg Cheese, Egg Farm Dairy, or Shelburne Farms
common_data <- df_clean %>%
filter(producers %in% c("Guggisberg Cheese", "Egg Farm Dairy", "Shelburne Farms"))
# Display the common data
print(common_data)
## cheese url milk
## 1 Baby Swiss https://www.cheese.com/baby-swiss/ cow
## 2 Peekskill Pyramid https://www.cheese.com/peekskill-pyramid/ cow
## 3 Shelburne Cheddar https://www.cheese.com/shelburne-cheddar/ cow
## country region family type fat_content
## 1 United States Charm, Ohio Swiss Cheese semi-soft, processed 43
## 2 United States Peekskill Brie soft, artisan 50
## 3 United States Shelburne Farms Cheddar hard, artisan 51
## calcium_content texture rind color
## 1 <NA> creamy, open, smooth rindless pale yellow
## 2 <NA> creamy rindless pale yellow
## 3 <NA> firm rindless pale yellow
## flavor aroma vegetarian vegan synonyms alt_spellings
## 1 nutty, sharp, sweet <NA> FALSE FALSE Lacy cheese <NA>
## 2 buttery, sour, sweet rich FALSE FALSE <NA> <NA>
## 3 strong rich TRUE FALSE <NA> <NA>
## producers
## 1 Guggisberg Cheese
## 2 Egg Farm Dairy
## 3 Shelburne Farms
Conclusions: Americans appear not to favor high-fat, rindless, richly aromatic cheeses, which are mostly produced from cow's milk. This can be useful information for anyone looking to introduce a new product or grow their sales.
# Save the cleaned cheese data (machine-specific path, kept commented)
#path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_cheese_data.csv"
#write.csv(df_clean, path1)
# Load the Data Science jobs dataset (note: this reuses the name `df`)
file <- 'https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/Uncleaned_DS_jobs.csv'
df <- read.csv(file)
# Display first few rows
head(df)
## index Job.Title Salary.Estimate
## 1 0 Sr Data Scientist $137K-$171K (Glassdoor est.)
## 2 1 Data Scientist $137K-$171K (Glassdoor est.)
## 3 2 Data Scientist $137K-$171K (Glassdoor est.)
## 4 3 Data Scientist $137K-$171K (Glassdoor est.)
## 5 4 Data Scientist $137K-$171K (Glassdoor est.)
## 6 5 Data Scientist $137K-$171K (Glassdoor est.)
## Job.Description
## 1 Description\n\nThe Senior Data Scientist is responsible for defining, building, and improving statistical models to improve business processes and outcomes in one or more healthcare domains such as Clinical, Enrollment, Claims, and Finance. As part of the broader analytics team, Data Scientist will gather and analyze data to solve and address complex business problems and evaluate scenarios to make predictions on future outcomes and work with the business to communicate and support decision-making. This position requires strong analytical skills and experience in analytic methods including multivariate regressions, hierarchical linear models, regression trees, clustering methods and other complex statistical techniques.\n\nDuties & Responsibilities:\n\n• Develops advanced statistical models to predict, quantify or forecast various operational and performance metrics in multiple healthcare domains\n• Investigates, recommends, and initiates acquisition of new data resources from internal and external sources\n• Works with multiple teams to support data collection, integration, and retention requirements based on business needs\n• Identifies critical and emerging technologies that will support and extend quantitative analytic capabilities\n• Collaborates with business subject matter experts to select relevant sources of information\n• Develops expertise with multiple machine learning algorithms and data science techniques, such as exploratory data analysis and predictive modeling, graph theory, recommender systems, text analytics and validation\n• Develops expertise with Healthfirst datasets, data repositories, and data movement processes\n• Assists on projects/requests and may lead specific tasks within the project scope\n• Prepares and manipulates data for use in development of statistical models\n• Other duties as assigned\n\nMinimum Qualifications:\n\n-Bachelor's Degree\n\nPreferred Qualifications:\n\n- Master’s degree in Computer Science or Statistics\nFamiliarity with major cloud platforms such as AWS and Azure\nHealthcare Industry Experience\n\nMinimum Qualifications:\n\n-Bachelor's Degree\n\nPreferred Qualifications:\n\n- Master’s degree in Computer Science or Statistics\nFamiliarity with major cloud platforms such as AWS and Azure\nHealthcare Industry Experience\n\nWE ARE AN EQUAL OPPORTUNITY EMPLOYER. Applicants and employees are considered for positions and are evaluated without regard to mental or physical disability, race, color, religion, gender, national origin, age, genetic information, military or veteran status, sexual orientation, marital status or any other protected Federal, State/Province or Local status unrelated to the performance of the work involved.\n\nIf you have a disability under the Americans with Disability Act or a similar law, and want a reasonable accommodation to assist with your job search or application for employment, please contact us by sending an email to careers@Healthfirst.org or calling 212-519-1798 . In your email please include a description of the accommodation you are requesting and a description of the position for which you are applying. Only reasonable accommodation requests related to applying for a position within Healthfirst Management Services will be reviewed at the e-mail address and phone number supplied. Thank you for considering a career with Healthfirst Management Services.\nEEO Law Poster and Supplement\n\n]]>
## 2 Secure our Nation, Ignite your Future\n\nJoin the top Information Technology and Analytic professionals in the industry to make invaluable contributions to our national security on a daily basis. In this innovative, self-contained, Big Data environment, the ManTech team is responsible for everything from infrastructure, to application development, to data science, to advanced analytics and beyond. The team is diverse, the questions are thought-provoking, and the opportunities for growth and advancement are numerous\n\nThe successful candidate will possess a diverse range of data-focused skills and experience, both technical and analytical. They will have a strong desire and capability for problem solving, data analysis and troubleshooting, analytical thinking, and experimentation.\n\nDuties, Tasks & Responsibilities\nWorking with large, complex, and disparate data sets\nDesigning and implementing innovative ways to analyze and exploit the Sponsors data holdings\nResearching and reporting on a wide variety of Sponsor inquiries\nRaising proactive inquiries to the Sponsor based on observations and proposed data analysis/exploitation\nSolving difficult, non-routine problems by applying advanced analytical methodologies, and improving analytic methodologies\nDeveloping custom searches\nCommunicating and coordinating with internal and external partners as needed\nRequired Experience, Skills, & Technologies\n\nThorough knowledge of appropriate analytic tools and methodologies in one or more of the following:\nApplied mathematics (e.g. probability and statistics, formal modeling, computational social sciences)\nComputer programming (e.g. programming languages, math/statistics packages, computer science, machine learning, scientific computing)\nAbility to code or script in one or more general programming language\nExperience with and theoretical understanding of algorithms for classification, regression, clustering, and anomaly detection\nKnowledge of relational databases, including SQL and large-scale distributed systems (e.g. Hadoop)\nExpertise with statistical data analysis (e.g. linear models, multivariate analysis, stochastic models, sampling methods)\nDemonstrated effectiveness in collecting information and accurately representing/visualizing it to non-technical third parties\nTS/SCI with Polygraph\nBachelor of Science or equivalent and 12-15 years related experience, but will consider all levels of experience.\nDesired Experience, Skills & Technologies\nPrevious investigative experience using a combination of technical and analytic skills\n#LI-DU1\n\nManTech International Corporation, as well as its subsidiaries proactively fulfills its role as an equal opportunity employer. We do not discriminate against any employee or applicant for employment because of race, color, sex, religion, age, sexual orientation, gender identity and expression, national origin, marital status, physical or mental disability, status as a Disabled Veteran, Recently Separated Veteran, Active Duty Wartime or Campaign Badge Veteran, Armed Forces Services Medal, or any other characteristic protected by law.\n\nIf you require a reasonable accommodation to apply for a position with ManTech through its online applicant system, please contact ManTech's Corporate EEO Department at (703) 218-6000. ManTech is an affirmative action/equal opportunity employer - minorities, females, disabled and protected veterans are urged to apply. 
ManTech's utilization of any external recruitment or job placement agency is predicated upon its full compliance with our equal opportunity/affirmative action policies. ManTech does not accept resumes from unsolicited recruiting firms. We pay no fees for unsolicited services.\n\nIf you are a qualified individual with a disability or a disabled veteran, you have the right to request an accommodation if you are unable or limited in your ability to use or access http://www.mantech.com/careers/Pages/careers.aspx as a result of your disability. To request an accommodation please click careers@mantech.com and provide your name and contact information.
## 3 Overview\n\n\nAnalysis Group is one of the largest international economics consulting firms, with more than 1,000 professionals across 14 offices in North America, Europe, and Asia. Since 1981, we have provided expertise in economics, finance, health care analytics, and strategy to top law firms, Fortune Global 500 companies, and government agencies worldwide. Our internal experts, together with our network of affiliated experts from academia, industry, and government, offer our clients exceptional breadth and depth of expertise.\n\nWe are currently seeking a Data Scientist to join our team. The ideal candidate should be passionate about working on cutting edge research and analytical services for Fortune 500 companies, global pharma/biotech firms and leaders in industries such as finance, energy and life sciences. The Data Scientist will be a contributing member to client engagements and have the opportunity to work with our network of world-class experts and thought leaders.\n\nJob Functions and Responsibilities\n\nThe candidate Data Scientist will help develop, maintain and teach new tools and methodologies related to data science and high performance computing. This position will also help Analysis Group in maintaining our leadership position in terms of advancing methodology and data analytics. The Data Scientist will be responsible for staying abreast of new developments in technology relating to data science, to share more broadly with Analysis Group.\n\nKey responsibilities for this position will include:\nWorking with project teams to address data science/computing challenges\nIdentifying opportunities for technology to enhance service offerings\nActing as a resource and participating in client engagements and research as part of the project team\nMaintaining up-to-date knowledge of computing tools, providing technical training and helping to grow the in-house knowledge base, specifically in a Linux environment\nPresenting research at selected conferences\nExamples of activities for the Data Scientist will include:\nDeveloping data engineering and machine learning production systems for full stack data science projects\nUsing natural language processing methodologies to work with EMR data, social media data and other unstructured data\nOptimizing procedures for managing and accessing large databases (e.g., insurance claims, electronic health records, financial transactions)\nCreating interactive analytics portals and data visualizations (e.g., using R/Shiny, Python/Flask, D3)\nBuilding and maintaining high performance computing (HPC) tools on grid and cloud computing environments\nDeveloping and reviewing software and packages in R, Python and other Object Oriented Languages\nEstablishing optimized procedures for repetitive or computationally intensive tasks (C, C++, Cuda-C)\nQualifications\nStrong credentials and experience in database management and data visualization\nSignificant experience working within a Linux environment required\nBackground in Statistics/Econometrics or Biostatistics\nIdeally PhD in Computer Science, Mathematics, Statistics, Economics or other relevant scientific degree with relevant experience. 
Other candidates with at least one year of experience in the field may also be considered\nExcellent written and verbal communication skills\nProject experience with R and/or Python\nFamiliar with online/cloud computing/storage (e.g., AWS)\nDemonstrated experience working on project teams and collaborating with others\nSCIENTIFIQUE DES DONNÉES\n\n*L’utilisation du genre masculin sert uniquement à alléger le texte et est utilisé ici en tant que genre neutre\n\nSurvol\n\nGroupe d’analyse ltée est l’une des plus grandes firmes de services-conseils en économie, comptant plus de 950 professionnels répartis dans 14 bureaux en Amérique du Nord, en Europe et en Asie. Depuis 1981, nous offrons notre expertise en matière de stratégie, d’économie, de finance et d’analyse dans le domaine des soins de santé aux grands cabinets d’avocats, aux sociétés Fortune Global 500 et aux agences gouvernementales du monde entier. Nos professionnels en poste conjugués à notre réseau de spécialistes affiliés issus d’universités, d’industries spécifiques et d’organismes gouvernementaux procurent à notre clientèle un savoir-faire d’une portée et d’une profondeur exceptionnelles.\n\nNous sommes présentement à la recherche d'un Scientifique des données (« Data Scientist ») pour se joindre à notre équipe. Le candidat idéal devrait être passionné par la recherche de pointe et les services analytiques pour les entreprises Fortune 500, les entreprises pharmaceutiques et biotechnologiques mondiales et les chefs de file dans des secteurs de la finance, l'énergie et les sciences de la vie. Le Scientifique des données sera un membre contributeur aux mandats des clients et aura l'occasion de travailler avec notre réseau d'experts et de leaders d'opinion de classe mondiale.\n\nDescription du poste et des responsabilités\n\nLe scientifique des données aidera à développer, maintenir et enseigner de nouveaux outils et méthodologies liés à la science des données (« Data Science ») et au HPC. Ce poste aidera également le Groupe d'analyse à maintenir sa position de chef de file en ce qui a trait à l'avancement de la méthodologie et de l'analyse des données. Le scientifique des données sera chargé de se tenir au courant des nouveaux développements technologiques liés à la science des données, afin de les partager plus largement avec le Groupe d'analyse.\n\nLes principales responsabilités de ce poste comprendront:\n\n- Collaborer avec les consultants pour relever les défis de la science des données et de sciences informatiques\n\n- Agir à titre de ressource et participer aux mandats et à la recherche en tant que membre de l'équipe de projet\n\n- Maintenir à jour les connaissances sur les outils informatiques, fournir une formation technique et aider à développer la base de connaissances interne, notamment dans un environnement Linux\n\n- Présenter la recherche à des conférences choisies\n\nExemples de tâches du scientifique des données :\n\n- Développement de systèmes de production en ingénierie des données ainsi qu’en apprentissage machine pour des projets de science des données full stack\n\n- Utiliser des méthodologies NLP pour travailler avec les données médicales électroniques, les données des médias sociaux et d'autres données non structures\n\n- Optimiser les procédures de gestion et d'accès aux grandes bases de données (ex. 
réclamations d'assurance, dossiers de santé électroniques, transactions financières)\n\n- Création de portails d'analyse interactifs et de visualisations de données (par exemple, en utilisant R/Shiny, Python/Flask, D3)\n\n- Construire et maintenir des outils de calcul de haute performance (HPC).\n\n- Développement et révision de codes en R, Python et autres langages\n\n- Mise en place de procédures optimisées pour les tâches répétitives ou intensives en calcul (C, C++, Cuda-C)\n\nQualifications requises\n\n- Solides références et expérience dans la gestion de bases de données et de la visualisation de données\n\n- Expérience de travail significative dans un environnement Linux requise\n\n- Expérience antérieure en statistique/économétrie ou bio-statistique\n\n- Idéalement, être titulaire d'un doctorat en sciences informatiques, en mathématiques, en statistique, en économie ou d'un autre diplôme scientifique pertinent et posséder une expérience pertinente. Les candidats ayant au moins un an d'expérience dans le domaine peuvent également être considérés.\n\n- Excellentes aptitudes de communication écrite et verbale\n\n- Expérience de projet avec R et/ou Python\n\n- Familiarité avec l'informatique en ligne/info nuagique et le stockage (AWS)\n\n- Expérience de travail démontrée au sein d'équipes de projet et de collaboration avec d'autres personnes\n\n\nEqual Opportunity Employer/Protected Veterans/Individuals with Disabilities.\nPlease view Equal Employment Opportunity Posters provided by OFCCP here.\nThe contractor will not discharge or in any other manner discriminate against employees or applicants because they have inquired about, discussed, or disclosed their own pay or the pay of another employee or applicant. However, employees who have access to the compensation information of other employees or applicants as a part of their essential job functions cannot disclose the pay of other employees or applicants to individuals who do not otherwise have access to compensation information, unless the disclosure is (a) in response to a formal complaint or charge, (b) in furtherance of an investigation, proceeding, hearing, or action, including an investigation conducted by the employer, or (c) consistent with the contractor's legal duty to furnish information. 41 CFR 60-1.35(c)
## 4 JOB DESCRIPTION:\n\nDo you have a passion for Data and Machine Learning? Do you dream of working with customers on their most forward-looking AI initiatives? Does the challenge of developing modern machine learning solutions to solve real-world manufacturing problems exciting to you?\n\nWe develop software for monitoring semiconductor manufacturing process and are looking to leverage the latest technologies to address our customer's needs. You will be part of a team that investigates and builds solutions based all the data available in factories, ranging from time series data, to post manufacturing data, to production logs. You will be working side by side with application developers and customers on real world problems with actual manufacturing data.\n\nJOB FUNCTION:\n\nBasic and applied research in statistical machine learning, deep learning, and data science as well as signal and information processing to advance the state of the art in time series analysis of semiconductor manufacturing data.\n\nResponsibilities:\nPerform data analysis, data pre-processing, and feature engineering in support of advanced machine learning algorithm development. Incorporate physical and operational insights/constraints into statistical models to achieve a high degree of robustness.\nPrototype algorithms for proof of concept, validation, and software implementation.\nSupport performance evaluations and the transition of algorithms into existing fault detection and classification systems.\nConvey the results of scientific research to sponsors and the scientific community through briefings, conferences and peer-reviewed publications.\nOther related functions as assigned.\n\nREQUIRED QUALIFICATIONS:\nBachelor's degree in computer science or chemical engineering or related technical field.\nDemonstrated ability in machine learning/artificial intelligence (ML/AI) development and/or scientific modelling and data analysis.\nDemonstrated ability with python/MATLAB or similar abstract language. Experience with both traditional ML and modern deep learning approaches.\nExperience with agile development practices and Git version control.\nExperience with one or more of the DNN frameworks like TensorFlow, PyTorch, Chainer.\nExperience with SQL, Graph stores, or NoSQL stores.\nApplicant must have a dynamic skill set, be willing to work with new technologies, be highly organized and capable of planning and coordinating multiple tasks. The position will require attention to detail, effective problem solving skills and excellent judgment. Ability to work independently with sensitive and confidential information, maintain a professional demeanor, work as a team member without daily supervision.\n\nCOMPENSATION & BENEFITS:\n\nCompensation will be commensurate with experience including a competitive base salary, bonus opportunity, competitive benefits package, and relocation assistance.\n\nINFICON, is committed to ensuring that our online application process provides an equal opportunity to all job seekers that apply without regard to race, religion, ethnicity, national origin, citizenship, gender, age, protected veteran status, disability status, genetic information, sexual orientation, or any other protected characteristic. A notice describing Federal equal employment opportunity laws is available here and here to reaffirm this commitment.\n\nPI120660357
## 5 Data Scientist\nAffinity Solutions / Marketing Cloud seeks smart, curious, technically savvy candidates to join our cutting-edge data science team. We hire the best and brightest and give them the opportunity to work on industry-leading technologies.\nThe data sciences team at AFS/Marketing Cloud build models, machine learning algorithms that power all our ad-tech/mar-tech products at scale, develop methodology and tools to precisely and effectively measure market campaign effects, and research in-house and public data sources for consumer spend behavior insights. In this role, you'll have the opportunity to come up with new ideas and solutions that will lead to improvement of our ability to target the right audience, derive insights and provide better measurement methodology for marketing campaigns. You'll access our core data asset and machine learning infrastructure to power your ideas.\nDuties and Responsibilities\n· Support all clients model building needs, including maintaining and improving current modeling/scoring methodology and processes,\n· Provide innovative solutions to customized modeling/scoring/targeting with appropriate ML/statistical tools,\n· Provide analytical/statistical support such as marketing test design, projection, campaign measurement, market insights to clients and stakeholders.\n· Mine large consumer datasets in the cloud environment to support ad hoc business and statistical analysis,\n· Develop and Improve automation capabilities to enable customized delivery of the analytical products to clients,\n· Communicate the methodologies and the results to the management, clients and none technical stakeholders.\nBasic Qualifications\n· Advanced degree in Statistics/Mathematics/Computer Science/Economics or other fields that requires advanced training in data analytics.\n· Being able to apply basic statistical/ML concepts and reasoning to address and solve business problems such as targeting, test design, KPI projection and performance measurement.\n· Entrepreneurial, highly self-motivated, collaborative, keen attention to detail, willingness and capable learn quickly, and ability to effectively prioritize and execute tasks in a high pressure environment.\n· Being flexible to accept different task assignments and able to work on a tight time schedule.\n· Excellent command of one or more programming languages; preferably Python, SAS or R\n· Familiar with one of the database technologies such as PostgreSQL, MySQL, can write basic SQL queries\n· Great communication skills (verbal, written and presentation)\nPreferred Qualifications\n· Experience or exposure to large consumer and/or demographic data sets.\n· Familiarity with data manipulation and cleaning routines and techniques.
## 6 About Us:\n\nHeadquartered in beautiful Santa Barbara, HG Insights is the global leader in technology intelligence. HG Insights uses advanced data science methodologies to help the world's largest technology firms and the fastest growing companies accelerate their sales, marketing, and strategy efforts.\n\nWe offer a casual yet professional environment. Get your sweat on at one of our fitness classes or go for a run along the beach which is two blocks away. You can find employees riding bikes to lunch in the funk zone or hanging out in one of our collaboration spaces. We are passionate about our jobs with a get-it-done attitude, yet we don't take ourselves too seriously.\n\nWhat You'll Do:\n\nWe are looking for a data scientist with software development or data engineering background to join our research team which reports directly to the CTO. We are a rapidly growing company with small focused engineering teams that deliver innovative features to a fast growing market. We build big-data systems utilizing cutting edge technologies and solutions that allow for our engineers to continuously learn and develop while shipping amazing products.\n\nQualities/ Experience:\nSelf-learner, hacker, technology advocate who can work on anything\nAmazing engineering skills, you're on your way to being the one of the best engineers you know\nYou can architect, design, code, test, and mentor others\nExperience working with interesting and successful projects\nThrive in a fast growing environment\nExcellent written and spoken English communication\nAn interest in Machine Learning and Natural Language Processing\nWhat You'll Be Responsible For:\nBuild solutions for text classification, entity linking, and entity extraction and other related projects\nScaling machine learning and NLP projects to run against large datasets in virtualized environments.\nYou will collaborate with Product Development Teams to build the most effective solutions\nYou will develop features in our databases, backend apps, front end UI, and Data as a Service (DAAS) product\nYou will help architect and design large scale enterprise big-data systems\nYou will work on ideas from different team members as well as your own\nFix bugs rapidly\nAttend daily stand-up meetings, planning sessions, encourage others, and collaborate at a rapid pace\nWhat You'll Need:\nBS, MS, or Ph.D. in Computer Science or related technical discipline\nExperience Natural Language Processing, preferably in a commercial setting.\nExperience building Logistic Regression Models\nProficient in Python and Jupyter as well as related data science libraries (such as Scikit-learn, NLTK, SpaCy, Tensorflow)\nProficient in Java or Scala (3+ years experience recommended)\nExperience with MySQL, ElasticSearch, ESB, Hadoop, Spark, or other related data processing/database technologies\nExperience with Amazon Web Services (EC2, S3, RDS, EMR, ELB, etc.)\nExperience with web services using REST in Java\nActual coding experience in large distributed environments with multiple endpoints and complex interactions\nComfortable in an agile development environment\nUnderstanding and have real world experience using design patterns\nComfortable programming on a Mac with Intellij and other tools\nHG Insights Company is an Equal Opportunity Employer\n\nPlease note that HG Insights does not accept unsolicited resumes from recruiters or employment agencies. 
In the event of a recruiter or agency submitting a resume or candidate without a signed agreement being in place, we explicitly reserve the right to pursue and hire such candidates without any financial obligation to the recruiter or agency. Any unsolicited resumes, including those submitted directly to hiring managers, are deemed to be the property of HG Insights
## Rating Company.Name Location Headquarters
## 1 3.1 Healthfirst\n3.1 New York, NY New York, NY
## 2 4.2 ManTech\n4.2 Chantilly, VA Herndon, VA
## 3 3.8 Analysis Group\n3.8 Boston, MA Boston, MA
## 4 3.5 INFICON\n3.5 Newton, MA Bad Ragaz, Switzerland
## 5 2.9 Affinity Solutions\n2.9 New York, NY New York, NY
## 6 4.2 HG Insights\n4.2 Santa Barbara, CA Santa Barbara, CA
## Size Founded Type.of.ownership
## 1 1001 to 5000 employees 1993 Nonprofit Organization
## 2 5001 to 10000 employees 1968 Company - Public
## 3 1001 to 5000 employees 1981 Private Practice / Firm
## 4 501 to 1000 employees 2000 Company - Public
## 5 51 to 200 employees 1998 Company - Private
## 6 51 to 200 employees 2010 Company - Private
## Industry Sector
## 1 Insurance Carriers Insurance
## 2 Research & Development Business Services
## 3 Consulting Business Services
## 4 Electrical & Electronic Manufacturing Manufacturing
## 5 Advertising & Marketing Business Services
## 6 Computer Hardware & Software Information Technology
## Revenue
## 1 Unknown / Non-Applicable
## 2 $1 to $2 billion (USD)
## 3 $100 to $500 million (USD)
## 4 $100 to $500 million (USD)
## 5 Unknown / Non-Applicable
## 6 Unknown / Non-Applicable
## Competitors
## 1 EmblemHealth, UnitedHealth Group, Aetna
## 2 -1
## 3 -1
## 4 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 5 Commerce Signals, Cardlytics, Yodlee
## 6 -1
# Rename "Salary Estimate" to "Salary_Estimate"
df <- df %>%
rename(Salary_Estimate = `Salary.Estimate`,
Job_Title =`Job.Title`,
Job_Description = `Job.Description`,
Company = `Company.Name`,
Ownership_Type= `Type.of.ownership`
)
# Verify the column name change
colnames(df)
## [1] "index" "Job_Title" "Salary_Estimate" "Job_Description"
## [5] "Rating" "Company" "Location" "Headquarters"
## [9] "Size" "Founded" "Ownership_Type" "Industry"
## [13] "Sector" "Revenue" "Competitors"
# View column names and structure
str(df)
## 'data.frame': 672 obs. of 15 variables:
## $ index : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Job_Title : chr "Sr Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
## $ Salary_Estimate: chr "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" ...
## $ Job_Description: chr "Description\n\nThe Senior Data Scientist is responsible for defining, building, and improving statistical model"| __truncated__ "Secure our Nation, Ignite your Future\n\nJoin the top Information Technology and Analytic professionals in the "| __truncated__ "Overview\n\n\nAnalysis Group is one of the largest international economics consulting firms, with more than 1,0"| __truncated__ "JOB DESCRIPTION:\n\nDo you have a passion for Data and Machine Learning? Do you dream of working with customers"| __truncated__ ...
## $ Rating : num 3.1 4.2 3.8 3.5 2.9 4.2 3.9 3.5 4.4 3.6 ...
## $ Company : chr "Healthfirst\n3.1" "ManTech\n4.2" "Analysis Group\n3.8" "INFICON\n3.5" ...
## $ Location : chr "New York, NY" "Chantilly, VA" "Boston, MA" "Newton, MA" ...
## $ Headquarters : chr "New York, NY" "Herndon, VA" "Boston, MA" "Bad Ragaz, Switzerland" ...
## $ Size : chr "1001 to 5000 employees" "5001 to 10000 employees" "1001 to 5000 employees" "501 to 1000 employees" ...
## $ Founded : int 1993 1968 1981 2000 1998 2010 1996 1990 1983 2014 ...
## $ Ownership_Type : chr "Nonprofit Organization" "Company - Public" "Private Practice / Firm" "Company - Public" ...
## $ Industry : chr "Insurance Carriers" "Research & Development" "Consulting" "Electrical & Electronic Manufacturing" ...
## $ Sector : chr "Insurance" "Business Services" "Business Services" "Manufacturing" ...
## $ Revenue : chr "Unknown / Non-Applicable" "$1 to $2 billion (USD)" "$100 to $500 million (USD)" "$100 to $500 million (USD)" ...
## $ Competitors : chr "EmblemHealth, UnitedHealth Group, Aetna" "-1" "-1" "MKS Instruments, Pfeiffer Vacuum, Agilent Technologies" ...
# Summary statistics of numerical columns
summary(df)
## index Job_Title Salary_Estimate Job_Description
## Min. : 0.0 Length:672 Length:672 Length:672
## 1st Qu.:167.8 Class :character Class :character Class :character
## Median :335.5 Mode :character Mode :character Mode :character
## Mean :335.5
## 3rd Qu.:503.2
## Max. :671.0
## Rating Company Location Headquarters
## Min. :-1.000 Length:672 Length:672 Length:672
## 1st Qu.: 3.300 Class :character Class :character Class :character
## Median : 3.800 Mode :character Mode :character Mode :character
## Mean : 3.519
## 3rd Qu.: 4.300
## Max. : 5.000
## Size Founded Ownership_Type Industry
## Length:672 Min. : -1 Length:672 Length:672
## Class :character 1st Qu.:1918 Class :character Class :character
## Mode :character Median :1995 Mode :character Mode :character
## Mean :1636
## 3rd Qu.:2009
## Max. :2019
## Sector Revenue Competitors
## Length:672 Length:672 Length:672
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Get an overview of the dataset
skim(df)
| Name | df |
| Number of rows | 672 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 12 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Job_Title | 0 | 1 | 12 | 96 | 0 | 172 | 0 |
| Salary_Estimate | 0 | 1 | 26 | 28 | 0 | 30 | 0 |
| Job_Description | 0 | 1 | 71 | 10524 | 0 | 489 | 0 |
| Company | 0 | 1 | 4 | 55 | 0 | 432 | 0 |
| Location | 0 | 1 | 4 | 26 | 0 | 207 | 0 |
| Headquarters | 0 | 1 | 2 | 25 | 0 | 229 | 0 |
| Size | 0 | 1 | 2 | 23 | 0 | 9 | 0 |
| Ownership_Type | 0 | 1 | 2 | 30 | 0 | 13 | 0 |
| Industry | 0 | 1 | 2 | 40 | 0 | 58 | 0 |
| Sector | 0 | 1 | 2 | 34 | 0 | 23 | 0 |
| Revenue | 0 | 1 | 2 | 32 | 0 | 14 | 0 |
| Competitors | 0 | 1 | 2 | 92 | 0 | 108 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| index | 0 | 1 | 335.50 | 194.13 | 0 | 167.75 | 335.5 | 503.25 | 671 | ▇▇▇▇▇ |
| Rating | 0 | 1 | 3.52 | 1.41 | -1 | 3.30 | 3.8 | 4.30 | 5 | ▁▁▁▇▇ |
| Founded | 0 | 1 | 1635.53 | 756.75 | -1 | 1917.75 | 1995.0 | 2009.00 | 2019 | ▂▁▁▁▇ |
# Check for missing values
colSums(is.na(df))
## index Job_Title Salary_Estimate Job_Description Rating
## 0 0 0 0 0
## Company Location Headquarters Size Founded
## 0 0 0 0 0
## Ownership_Type Industry Sector Revenue Competitors
## 0 0 0 0 0
Salary Column
# Extract min and max salary values from Salary_Estimate column
df <- df %>%
mutate(
salary_usd_min_K = as.numeric(str_extract(Salary_Estimate, "\\d+")), # Extract first numeric value
salary_usd_max_K = as.numeric(str_extract_all(Salary_Estimate, "\\d+") %>% sapply(`[`, 2)) # Extract second numeric value
)
# Compute average salary in USD (thousands)
df <- df %>%
mutate(
av_salary_usd_K = (salary_usd_min_K + salary_usd_max_K ) / 2 # Calculate mean of min and max salary
)
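Both bounds can also be captured in one pass with str_match(), which returns the capture groups as matrix columns; a minimal sketch, assuming every Salary_Estimate follows the "$<min>K-$<max>K" pattern (non-matching rows would become NA):
# One-pass alternative: capture min and max together from "$137K-$171K (Glassdoor est.)"
bounds <- str_match(df$Salary_Estimate, "\\$(\\d+)K-\\$(\\d+)K")
min_k <- as.numeric(bounds[, 2]) # column 1 is the full match; 2 and 3 are the captures
max_k <- as.numeric(bounds[, 3])
# (min_k + max_k) / 2 should reproduce av_salary_usd_K computed above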
# Display first few rows to verify
head(df)
## index Job_Title Salary_Estimate
## 1 0 Sr Data Scientist $137K-$171K (Glassdoor est.)
## 2 1 Data Scientist $137K-$171K (Glassdoor est.)
## 3 2 Data Scientist $137K-$171K (Glassdoor est.)
## 4 3 Data Scientist $137K-$171K (Glassdoor est.)
## 5 4 Data Scientist $137K-$171K (Glassdoor est.)
## 6 5 Data Scientist $137K-$171K (Glassdoor est.)
## Job_Description
## (long Job_Description text omitted; values are unchanged from the earlier head(df) output)
## Rating Company Location Headquarters
## 1 3.1 Healthfirst\n3.1 New York, NY New York, NY
## 2 4.2 ManTech\n4.2 Chantilly, VA Herndon, VA
## 3 3.8 Analysis Group\n3.8 Boston, MA Boston, MA
## 4 3.5 INFICON\n3.5 Newton, MA Bad Ragaz, Switzerland
## 5 2.9 Affinity Solutions\n2.9 New York, NY New York, NY
## 6 4.2 HG Insights\n4.2 Santa Barbara, CA Santa Barbara, CA
## Size Founded Ownership_Type
## 1 1001 to 5000 employees 1993 Nonprofit Organization
## 2 5001 to 10000 employees 1968 Company - Public
## 3 1001 to 5000 employees 1981 Private Practice / Firm
## 4 501 to 1000 employees 2000 Company - Public
## 5 51 to 200 employees 1998 Company - Private
## 6 51 to 200 employees 2010 Company - Private
## Industry Sector
## 1 Insurance Carriers Insurance
## 2 Research & Development Business Services
## 3 Consulting Business Services
## 4 Electrical & Electronic Manufacturing Manufacturing
## 5 Advertising & Marketing Business Services
## 6 Computer Hardware & Software Information Technology
## Revenue
## 1 Unknown / Non-Applicable
## 2 $1 to $2 billion (USD)
## 3 $100 to $500 million (USD)
## 4 $100 to $500 million (USD)
## 5 Unknown / Non-Applicable
## 6 Unknown / Non-Applicable
## Competitors salary_usd_min_K
## 1 EmblemHealth, UnitedHealth Group, Aetna 137
## 2 -1 137
## 3 -1 137
## 4 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies 137
## 5 Commerce Signals, Cardlytics, Yodlee 137
## 6 -1 137
## salary_usd_max_K av_salary_usd_K
## 1 171 154
## 2 171 154
## 3 171 154
## 4 171 154
## 5 171 154
## 6 171 154
# Remove duplicate rows
df <- df %>% distinct()
# Boxplot to check outliers
ggplot(df, aes(y = av_salary_usd_K)) +
geom_boxplot(fill = "orange") +
labs(title = "Boxplot of Salaries") +
theme_minimal()
# Convert categorical columns to factors
df <- df %>%
mutate(across(where(is.character), as.factor))
Data Cleaning
# Function to determine experience level based on job title
get_experience_level <- function(title) {
title <- tolower(title) # Convert to lowercase for case-insensitive matching
if (str_detect(title, "senior") | str_detect(title, "lead") |str_detect(title, "sr")) {
return("Senior") # If "senior" or "lead" is in title → Senior
} else if (str_detect(title, "junior")) {
return("Junior") # If "junior" is in title → Junior
} else {
return("Mid-level") # Otherwise, classify as Mid-level
}
}
# Apply function to the Job Title column and create new Experience_Level column
df <- df %>%
mutate(Experience_Level = sapply(Job_Title, get_experience_level))
# Show the first few rows of Job_Title and Experience_Level columns
head(df[, c("Job_Title", "Experience_Level")])
## Job_Title Experience_Level
## 1 Sr Data Scientist Senior
## 2 Data Scientist Mid-level
## 3 Data Scientist Mid-level
## 4 Data Scientist Mid-level
## 5 Data Scientist Mid-level
## 6 Data Scientist Mid-level
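Because str_detect() and case_when() are vectorized, the sapply() loop can be avoided entirely; a sketch using an illustrative column name (the \\bsr\\b boundary also keeps "sr" from matching inside longer words, which the bare pattern above can do):
# Vectorized alternative; Experience_Level2 is an illustrative name
df <- df %>%
  mutate(Experience_Level2 = case_when(
    str_detect(tolower(Job_Title), "senior|lead|\\bsr\\b") ~ "Senior",
    str_detect(tolower(Job_Title), "junior") ~ "Junior",
    TRUE ~ "Mid-level"
  ))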
# Count plot for Experience_Level
ggplot(df, aes(x = Experience_Level)) +
geom_bar(fill = "steelblue") +
coord_flip() +
labs(title = "Count of Data Science Job Titles", x = "Job Title", y = "Count") +
theme_minimal()
Job_Title cleaning
# Function to simplify job titles
title_simplifier <- function(title) {
title <- tolower(title) # Convert to lowercase for consistent matching
case_when(
str_detect(title, "data scientist") ~ "data scientist",
str_detect(title, "data engineer") ~ "data engineer",
str_detect(title, "analyst") ~ "analyst",
str_detect(title, "machine learning") ~ "mle", # Maps 'machine learning' to 'mle'
str_detect(title, "manager") ~ "manager",
str_detect(title, "director") ~ "director",
TRUE ~ "Other" # Default case for unmatched titles
)
}
# Apply function to 'Job_Title' column and create a new column 'JobTitle'
df <- df %>%
mutate(JobTitle = sapply(Job_Title, title_simplifier))
# Display first few rows to verify
head(df[, c("Job_Title", "JobTitle")])
## Job_Title JobTitle
## 1 Sr Data Scientist data scientist
## 2 Data Scientist data scientist
## 3 Data Scientist data scientist
## 4 Data Scientist data scientist
## 5 Data Scientist data scientist
## 6 Data Scientist data scientist
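Since case_when() is already vectorized, the sapply() wrapper is not strictly needed here; applying the function to the whole column gives the same result:
# Equivalent without sapply(): title_simplifier() handles a full vector directly
df <- df %>% mutate(JobTitle = title_simplifier(Job_Title))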
# Function to extract skills from job description
extract_skills <- function(description) {
skills <- c("Python", "R", "SQL", "Java", "C++", "Scala", "Julia")
found_skills <- skills[str_detect(tolower(description), tolower(skills))]
return(ifelse(length(found_skills) > 0, paste(found_skills, collapse = ", "), NA))
}
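One caveat with plain substring matching: the pattern "r" fires on any description containing the letter r, and "c++" is parsed as a regular expression (the plus signs act as quantifiers) rather than a literal string. A minimal sketch of a stricter matcher, with boundaries on each side and escaped metacharacters (extract_skills_strict is an illustrative name, not used by the rest of the pipeline):
# Stricter variant: match skills as standalone tokens and escape "+" in "C++"
extract_skills_strict <- function(description) {
  skills <- c("Python", "R", "SQL", "Java", "C++", "Scala", "Julia")
  patterns <- paste0("(?<![\\w+])", str_replace_all(tolower(skills), fixed("+"), "\\\\+"), "(?![\\w+])")
  found <- skills[str_detect(tolower(description), patterns)]
  if (length(found) > 0) paste(found, collapse = ", ") else NA_character_
}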
# Function to extract education levels from job description
extract_education <- function(description) {
education_levels <- c("Bachelor", "Master", "PhD", "Doctorate")
found_education <- education_levels[str_detect(tolower(description), tolower(education_levels))]
return(ifelse(length(found_education) > 0, paste(found_education, collapse = ", "), NA))
}
# Function to extract years-of-experience phrases from the job description
extract_experience <- function(description) {
experience <- str_extract(description, "(\\d+)\\+?\\s*(?:year|yr)s?")
return(ifelse(is.na(experience), NA, as.numeric(experience))) # Ensure NA remains NA
}
# Apply the functions to the 'Job_Description' column and create new columns
df <- df %>%
mutate(
Skills = sapply(Job_Description, extract_skills), # Extract skills
Education = sapply(Job_Description, extract_education), # Extract education levels
Years_Experience = sapply(Job_Description, extract_experience) # Extract experience, avoid coercion warning
)
## Warning: There were 438 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Years_Experience = sapply(Job_Description,
## extract_experience)`.
## Caused by warning in `ifelse()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 437 remaining warnings.
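The coercion warnings come from the as.numeric() call above: it receives full phrases such as "5+ years", which cannot be parsed as numbers, so Years_Experience ends up NA even when a phrase was found. A minimal corrected sketch (extract_experience_fixed is an illustrative name) that converts only the digits:
# Corrected variant: strip the phrase down to its digits before converting
extract_experience_fixed <- function(description) {
  phrase <- str_extract(description, "(\\d+)\\+?\\s*(?:year|yr)s?")
  as.numeric(str_extract(phrase, "\\d+")) # NA phrases stay NA, digits convert cleanly
}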
# Clean 'Company Name' column by keeping only the first part before "\n"
df <- df %>%
mutate(Company_Name = str_split_fixed(Company, "\n", 2)[, 1])
# Display first few rows to verify
head(df$Company_Name)
## [1] "Healthfirst" "ManTech" "Analysis Group"
## [4] "INFICON" "Affinity Solutions" "HG Insights"
# Load necessary libraries
library(stringr)
# Split 'Location' column into 'City' and 'State'
df <- df %>%
mutate(
State = str_split_fixed(Location, ", ", 2)[, 2], # Extract last part (State)
City = str_split_fixed(Location, ", ", 2)[, 1] # Extract first part (City)
)
# Display first few rows to verify
head(df[, c("Location", "City", "State")])
## Location City State
## 1 New York, NY New York NY
## 2 Chantilly, VA Chantilly VA
## 3 Boston, MA Boston MA
## 4 Newton, MA Newton MA
## 5 New York, NY New York NY
## 6 Santa Barbara, CA Santa Barbara CA
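One edge case: a Location without a "City, ST" structure (e.g. "United States") leaves State as an empty string after the split. If empty strings should instead read as missing, a one-line guard such as df <- df %>% mutate(State = na_if(State, "")) would handle it; it is left unapplied here so the printed output later in the document matches what was generated.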
# Function to standardize the 'Size' column
standardize_size <- function(size) {
# Handle missing values or invalid input ('-1')
if (is.na(size) || size == "-1") {
return(NA) # Return NA for missing or invalid sizes
}
# Handle sizes with '+' indicating a range like '10000+ employees'
if (str_detect(size, "\\+")) {
return(str_extract(size, "\\d+") %>% paste0("+")) # Convert to '10000+'
}
# Handle size ranges formatted as '51 to 200 employees'
if (str_detect(size, "to")) {
size_values <- str_extract_all(size, "\\d+")[[1]] # Extract numeric values
return(paste(size_values[1], size_values[2], sep = "-")) # Convert to '51-200'
}
# Return the size as is if no special formatting is needed
return(size)
}
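For reference, illustrative calls covering the three branches (not part of the cleaning pipeline):
# Illustrative calls: one per branch of standardize_size()
standardize_size("10000+ employees")    # "10000+"
standardize_size("51 to 200 employees") # "51-200"
standardize_size("-1")                  # NA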
# Apply the standardize_size function to the 'Size' column
df <- df %>%
mutate(Size = sapply(Size, standardize_size))
# Display first few rows to verify
head(df$Size)
## [1] "1001-5000" "5001-10000" "1001-5000" "501-1000" "51-200"
## [6] "51-200"
# Define the current year
current_year <- 2025
# Calculate company age
df <- df %>%
mutate(Age = current_year - Founded)
# Replace ages derived from the -1 sentinel in Founded (which yields Age == current_year + 1) with NA
df <- df %>%
mutate(Age = ifelse(Age == (current_year + 1), NA, Age))
# Display first few rows to verify
head(df[, c("Founded", "Age")])
## Founded Age
## 1 1993 32
## 2 1968 57
## 3 1981 44
## 4 2000 25
## 5 1998 27
## 6 2010 15
# Remove the prefix "Company - " from 'Type of ownership' column
df <- df %>%
mutate(Type_of_ownership = str_replace(Ownership_Type, "^Company - ", ""))
# Display first few rows to verify
head(df$Type_of_ownership)
## [1] "Nonprofit Organization" "Public"
## [3] "Private Practice / Firm" "Public"
## [5] "Private" "Private"
# Standardize 'Industry' and 'Sector' columns to title case
df <- df %>%
mutate(
Industry = str_to_title(Industry),
Sector = str_to_title(Sector)
)
# Display first few rows to verify
head(df[, c("Industry", "Sector")])
## Industry Sector
## 1 Insurance Carriers Insurance
## 2 Research & Development Business Services
## 3 Consulting Business Services
## 4 Electrical & Electronic Manufacturing Manufacturing
## 5 Advertising & Marketing Business Services
## 6 Computer Hardware & Software Information Technology
# Function to clean and standardize revenue values
clean_revenue <- function(revenue) {
# Handle missing values or invalid revenue indicators
if (is.na(revenue) || revenue == "-1") {
return(NA) # Return NA for missing or invalid values
}
# Extract all numerical values from the revenue string
numbers <- as.numeric(str_extract_all(revenue, "\\d+")[[1]])
# Check if 'million' is mentioned in the revenue string
if (str_detect(tolower(revenue), "million")) {
bounds <- numbers * 1e6 # Convert to millions
}
# Check if 'billion' is mentioned in the revenue string
else if (str_detect(tolower(revenue), "billion")) {
bounds <- numbers * 1e9 # Convert to billions
}
else {
return(NA) # Return NA if neither 'million' nor 'billion' is found
}
# If there are two numbers (indicating a revenue range), calculate the average
if (length(bounds) == 2) {
return(mean(bounds)) # Return the average of the two bounds
}
else if (length(bounds) == 1) {
return(bounds[1]) # If only one number is found, return it as revenue
}
return(NA) # Return NA if no valid numbers are found
}
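Illustrative calls showing the range, single-bound, and missing cases (not part of the pipeline):
# Illustrative calls: midpoint of a range, a single bound, and a missing value
clean_revenue("$100 to $500 million (USD)") # 3e+08
clean_revenue("$10+ billion (USD)")         # 1e+10
clean_revenue("Unknown / Non-Applicable")   # NA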
# Apply the clean_revenue function to the 'Revenue' column and create a new column 'AverageRevenue'
df <- df %>%
mutate(AverageRevenue = sapply(Revenue, clean_revenue))
# Display first few rows to verify
head(df[, c("Revenue", "AverageRevenue")])
## Revenue AverageRevenue
## 1 Unknown / Non-Applicable NA
## 2 $1 to $2 billion (USD) 1.5e+09
## 3 $100 to $500 million (USD) 3.0e+08
## 4 $100 to $500 million (USD) 3.0e+08
## 5 Unknown / Non-Applicable NA
## 6 Unknown / Non-Applicable NA
# Replace literal 'Unknown' values with NA in the remaining character columns
# (na_if() matches whole strings, so 'Unknown / Non-Applicable' is left as-is)
df <- df %>% mutate(across(where(is.character), ~ na_if(., 'Unknown')))
# Convert 'Rating' column to numeric for numerical analysis
df$Rating <- as.numeric(df$Rating)
# Convert 'Founded' column to integer to represent the founding year
df$Founded <- as.integer(df$Founded)
# List of categorical columns to convert
categorical_columns <- c('Job_Title', 'Company_Name', 'Location', 'Type_of_ownership', 'Industry', 'Sector')
# Loop through each categorical column and convert its data type to factor (category)
for (col in categorical_columns) {
df[[col]] <- as.factor(df[[col]]) # Convert to factor for memory efficiency and analysis
}
colnames(df)
## [1] "index" "Job_Title" "Salary_Estimate"
## [4] "Job_Description" "Rating" "Company"
## [7] "Location" "Headquarters" "Size"
## [10] "Founded" "Ownership_Type" "Industry"
## [13] "Sector" "Revenue" "Competitors"
## [16] "salary_usd_min_K" "salary_usd_max_K" "av_salary_usd_K"
## [19] "Experience_Level" "JobTitle" "Skills"
## [22] "Education" "Years_Experience" "Company_Name"
## [25] "State" "City" "Age"
## [28] "Type_of_ownership" "AverageRevenue"
# Remove specified columns from the dataframe
df <- df[, !(names(df) %in% c('Job_Title','Job_Description', 'Location', 'Headquarters', 'Revenue','Competitors','Salary_Estimate', 'index','Ownership_Type'))]
colnames(df)
## [1] "Rating" "Company" "Size"
## [4] "Founded" "Industry" "Sector"
## [7] "salary_usd_min_K" "salary_usd_max_K" "av_salary_usd_K"
## [10] "Experience_Level" "JobTitle" "Skills"
## [13] "Education" "Years_Experience" "Company_Name"
## [16] "State" "City" "Age"
## [19] "Type_of_ownership" "AverageRevenue"
# Define bins and corresponding labels
labels <- c('Low', 'Medium', 'High', 'Very High') # 4 labels
# Create bins for the 'avg_salary' column and assign labels
df$SalaryCategory <- cut(df$av_salary_usd_K,
breaks = 4, # Create 4 intervals
labels = labels, # Assign labels
right = FALSE) # Include the left endpoint, excluding the right one
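Note that breaks = 4 splits the observed salary range into four equal-width intervals, so category sizes can be very unbalanced. If roughly equal-sized groups are preferred, a quantile-based sketch (SalaryCategoryQ is an illustrative name, and the four break points are assumed distinct):
# Quantile-based alternative: roughly equal numbers of postings per category
df$SalaryCategoryQ <- cut(df$av_salary_usd_K,
                          breaks = quantile(df$av_salary_usd_K, probs = seq(0, 1, 0.25), na.rm = TRUE),
                          labels = labels,
                          include.lowest = TRUE)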
# Ensure 'Education' column is a list-column before unnesting
df <- df %>%
mutate(Education = strsplit(as.character(Education), ", ")) # Convert to list
# Exploding (unnesting) the 'Education' column
df_exploded <- df %>%
unnest(Education) %>%
filter(!is.na(Education) & Education != "") # Remove NA or empty values
# Convert 'Education' and 'Experience_Level' to factors for proper ordering
df_exploded <- df_exploded %>%
mutate(
Education = factor(Education, levels = c("Bachelor", "Master", "PhD", "Doctorate")),
Experience_Level = factor(Experience_Level, levels = c("Junior", "Mid-level", "Senior"))
)
# Create the grouped bar chart
ggplot(df_exploded, aes(x = Education, y = av_salary_usd_K, fill = Experience_Level)) +
geom_bar(stat = "identity", position = "dodge") + # Create grouped bars
labs(x = "Education Level",
y = "Average Salary (USD K)",
title = "Average Salary by Education and Experience Level",
fill = "Experience Level") + # Labels and title
theme_minimal() + # Apply minimal theme for cleaner look
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels
plot.title = element_text(hjust = 0.5) # Center the title
)
#Create the box plot for salary ranges by job title
ggplot(df_exploded, aes(x = JobTitle, y = av_salary_usd_K)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) + # Boxplot with transparency
labs(title = "Salary Ranges by Job Title",
x = "Job Title",
y = "Average Salary (K USD)") +
theme_minimal() + # Clean theme
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
# Find the top 5 most common job titles
top_jobs <- df_exploded %>%
count(JobTitle, sort = TRUE) %>% # Count occurrences
top_n(5, n) # Select top 5 job titles
# Calculate the median salary for each top job title
median_salaries <- df_exploded %>%
group_by(JobTitle) %>%
summarise(Median_Salary = median(av_salary_usd_K, na.rm = TRUE)) %>%
filter(JobTitle %in% top_jobs$JobTitle) # Filter only top job titles
# Merge top_jobs and median_salaries
top_jobs_salary <- inner_join(top_jobs, median_salaries, by = "JobTitle")
# Print the table of most common job titles and their median salaries
print(top_jobs_salary)
## # A tibble: 5 × 3
## JobTitle n Median_Salary
## <chr> <int> <dbl>
## 1 data scientist 411 114
## 2 Other 73 133
## 3 analyst 36 114.
## 4 data engineer 27 116.
## 5 mle 26 103
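top_n() still works but is superseded in current dplyr; the same selection with the newer idiom:
# Equivalent selection using slice_max() (top_n() is superseded)
top_jobs <- df_exploded %>%
  count(JobTitle, sort = TRUE) %>%
  slice_max(n, n = 5)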
# Visualization: Bar chart of most common job titles and their median salaries
ggplot(top_jobs_salary, aes(x = reorder(JobTitle, n), y = Median_Salary, fill = JobTitle)) +
geom_col(show.legend = FALSE) + # Bar plot
geom_text(aes(label = round(Median_Salary, 1)), vjust = -0.5, size = 4) + # Add labels
labs(title = "Most Common Job Titles and Their Median Salaries",
x = "Job Title",
y = "Median Salary (K USD)") +
theme_minimal() + # Clean theme
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
# plot_histogram() comes from the DataExplorer package, which must be attached first
library(DataExplorer)
plot_histogram(df)
library(dplyr)
library(ggplot2)
# Keep the 10 postings with the highest maximum salary estimate
# (rows are individual postings, so the same company can appear more than once)
top_companies <- df %>%
arrange(desc(salary_usd_max_K)) %>%
slice(1:10)
# Display the result as a data table
top_companies
## Rating Company Size Founded
## 1 4.1 Roche\n4.1 10000+ 1896
## 2 4.0 AstraZeneca\n4.0 10000+ 1913
## 3 3.6 Creative Circle\n3.6 201-500 2002
## 4 5.0 Blue Horizon Tek Solutions\n5.0 1-50 1987
## 5 3.5 Maxar Technologies\n3.5 5001-10000 -1
## 6 4.7 Sharpedge Solutions Inc\n4.7 9 -1
## 7 3.5 Maxar Technologies\n3.5 5001-10000 -1
## 8 3.6 Alaka`ina Foundation Family of Companies\n3.6 501-1000 -1
## 9 3.9 Southwest Research Institute\n3.9 1001-5000 1947
## 10 2.7 Hexagon US Federal\n2.7 51-200 2010
## Industry Sector salary_usd_min_K
## 1 Biotech & Pharmaceuticals Biotech & Pharmaceuticals 212
## 2 Biotech & Pharmaceuticals Biotech & Pharmaceuticals 212
## 3 Staffing & Outsourcing Business Services 212
## 4 Staffing & Outsourcing Business Services 212
## 5 Aerospace & Defense Aerospace & Defense 212
## 6 Publishing Media 212
## 7 Aerospace & Defense Aerospace & Defense 212
## 8 -1 -1 212
## 9 Research & Development Business Services 212
## 10 Aerospace & Defense Aerospace & Defense 212
## salary_usd_max_K av_salary_usd_K Experience_Level JobTitle
## 1 331 271.5 Senior data scientist
## 2 331 271.5 Mid-level data scientist
## 3 331 271.5 Mid-level data scientist
## 4 331 271.5 Mid-level data scientist
## 5 331 271.5 Senior data scientist
## 6 331 271.5 Mid-level data scientist
## 7 331 271.5 Mid-level data scientist
## 8 331 271.5 Mid-level data scientist
## 9 331 271.5 Mid-level Other
## 10 331 271.5 Mid-level data scientist
## Skills Education Years_Experience
## 1 Python, R, SQL, C++ Master, PhD NA
## 2 Python, R, SQL, C++ PhD NA
## 3 Python, R, SQL, C++ NA NA
## 4 Python, R, Java, C++ Bachelor, Master NA
## 5 Python, R, SQL, Java, C++ NA NA
## 6 Python, R, SQL, C++ Master NA
## 7 Python, R, SQL, Java, C++ Bachelor NA
## 8 Python, R, SQL, C++ NA NA
## 9 Python, R, Java, C++ NA NA
## 10 R, C++ Bachelor NA
## Company_Name State City Age
## 1 Roche CA Pleasanton 129
## 2 AstraZeneca DE Wilmington 112
## 3 Creative Circle United States 23
## 4 Blue Horizon Tek Solutions NY New York 38
## 5 Maxar Technologies VA Herndon NA
## 6 Sharpedge Solutions Inc WA Seattle NA
## 7 Maxar Technologies VA Herndon NA
## 8 Alaka`ina Foundation Family of Companies TX Fort Sam Houston NA
## 9 Southwest Research Institute OK Oklahoma City 78
## 10 Hexagon US Federal MD Lexington Park 15
## Type_of_ownership AverageRevenue SalaryCategory
## 1 Public 1.000e+10 Very High
## 2 Public 1.000e+10 Very High
## 3 Public NA Very High
## 4 Private NA Very High
## 5 Public 3.500e+09 Very High
## 6 Private 1.000e+06 Very High
## 7 Public 3.500e+09 Very High
## 8 Government NA Very High
## 9 Nonprofit Organization 2.505e+08 Very High
## 10 Public NA Very High
summary(top_companies)
## Rating Company
## Min. :2.700 Maxar Technologies\n3.5 :2
## 1st Qu.:3.525 Alaka`ina Foundation Family of Companies\n3.6:1
## Median :3.750 AstraZeneca\n4.0 :1
## Mean :3.860 Blue Horizon Tek Solutions\n5.0 :1
## 3rd Qu.:4.075 Creative Circle\n3.6 :1
## Max. :5.000 Hexagon US Federal\n2.7 :1
## (Other) :3
##
##
##
## Size Founded Industry
## Length:10 Min. : -1 Aerospace & Defense :3
## Class :character 1st Qu.: -1 Biotech & Pharmaceuticals:2
## Mode :character Median :1904 Staffing & Outsourcing :2
## Mean :1175 -1 :1
## 3rd Qu.:1977 Publishing :1
## Max. :2010 Research & Development :1
## (Other) :0
##
##
##
## Sector salary_usd_min_K salary_usd_max_K
## Aerospace & Defense :3 Min. :212 Min. :331
## Business Services :3 1st Qu.:212 1st Qu.:331
## Biotech & Pharmaceuticals:2 Median :212 Median :331
## -1 :1 Mean :212 Mean :331
## Media :1 3rd Qu.:212 3rd Qu.:331
## Accounting & Legal :0 Max. :212 Max. :331
## (Other) :0
##
##
##
## av_salary_usd_K Experience_Level JobTitle Skills
## Min. :271.5 Length:10 Length:10 Length:10
## 1st Qu.:271.5 Class :character Class :character Class :character
## Median :271.5 Mode :character Mode :character Mode :character
## Mean :271.5
## 3rd Qu.:271.5
## Max. :271.5
##
##
##
##
## Education.Length Education.Class Education.Mode Years_Experience
## 2 -none- character Min. : NA
## 1 -none- character 1st Qu.: NA
## 1 -none- character Median : NA
## 2 -none- character Mean :NaN
## 1 -none- character 3rd Qu.: NA
## 1 -none- character Max. : NA
## 1 -none- character NA's :10
## 1 -none- character
## 1 -none- character
## 1 -none- character
## Company_Name State
## Maxar Technologies :2 Length:10
## Alaka`ina Foundation Family of Companies:1 Class :character
## AstraZeneca :1 Mode :character
## Blue Horizon Tek Solutions :1
## Creative Circle :1
## Hexagon US Federal :1
## (Other) :3
##
##
##
## City Age Type_of_ownership
## Length:10 Min. : 15.00 Public :6
## Class :character 1st Qu.: 26.75 Private :2
## Mode :character Median : 58.00 Government :1
## Mean : 65.83 Nonprofit Organization:1
## 3rd Qu.:103.50 -1 :0
## Max. :129.00 College / University :0
## NA's :4 (Other) :0
##
##
##
## AverageRevenue SalaryCategory
## Min. :1.000e+06 Low : 0
## 1st Qu.:1.063e+09 Medium : 0
## Median :3.500e+09 High : 0
## Mean :4.542e+09 Very High:10
## 3rd Qu.:8.375e+09
## Max. :1.000e+10
## NA's :4
##
##
##
head(top_companies)
## Rating Company Size Founded
## 1 4.1 Roche\n4.1 10000+ 1896
## 2 4.0 AstraZeneca\n4.0 10000+ 1913
## 3 3.6 Creative Circle\n3.6 201-500 2002
## 4 5.0 Blue Horizon Tek Solutions\n5.0 1-50 1987
## 5 3.5 Maxar Technologies\n3.5 5001-10000 -1
## 6 4.7 Sharpedge Solutions Inc\n4.7 9 -1
## Industry Sector salary_usd_min_K
## 1 Biotech & Pharmaceuticals Biotech & Pharmaceuticals 212
## 2 Biotech & Pharmaceuticals Biotech & Pharmaceuticals 212
## 3 Staffing & Outsourcing Business Services 212
## 4 Staffing & Outsourcing Business Services 212
## 5 Aerospace & Defense Aerospace & Defense 212
## 6 Publishing Media 212
## salary_usd_max_K av_salary_usd_K Experience_Level JobTitle
## 1 331 271.5 Senior data scientist
## 2 331 271.5 Mid-level data scientist
## 3 331 271.5 Mid-level data scientist
## 4 331 271.5 Mid-level data scientist
## 5 331 271.5 Senior data scientist
## 6 331 271.5 Mid-level data scientist
## Skills Education Years_Experience
## 1 Python, R, SQL, C++ Master, PhD NA
## 2 Python, R, SQL, C++ PhD NA
## 3 Python, R, SQL, C++ NA NA
## 4 Python, R, Java, C++ Bachelor, Master NA
## 5 Python, R, SQL, Java, C++ NA NA
## 6 Python, R, SQL, C++ Master NA
## Company_Name State City Age Type_of_ownership
## 1 Roche CA Pleasanton 129 Public
## 2 AstraZeneca DE Wilmington 112 Public
## 3 Creative Circle United States 23 Public
## 4 Blue Horizon Tek Solutions NY New York 38 Private
## 5 Maxar Technologies VA Herndon NA Public
## 6 Sharpedge Solutions Inc WA Seattle NA Private
## AverageRevenue SalaryCategory
## 1 1.0e+10 Very High
## 2 1.0e+10 Very High
## 3 NA Very High
## 4 NA Very High
## 5 3.5e+09 Very High
## 6 1.0e+06 Very High
We see that the data scientist role is common to all of these companies. Suppose Mr. X wants to join one of them. His criteria are: company size above 1,000 employees, founding year before 2000, and average revenue above 3.5e+09. We apply these filters below.
# Apply Mr. X's criteria: size above 1,000 employees, founded before 2000
# (-1 marks a missing founding year), and average revenue above 3.5e+09
df_filtered <- top_companies %>%
  filter(Size %in% c("1001-5000", "5001-10000", "10000+"),
         Founded != -1, Founded < 2000,
         AverageRevenue > 3.5e9)
# View the filtered data
print(df_filtered)
## Rating Company Size Founded Industry
## 1 4.1 Roche\n4.1 10000+ 1896 Biotech & Pharmaceuticals
## 2 4.0 AstraZeneca\n4.0 10000+ 1913 Biotech & Pharmaceuticals
## Sector salary_usd_min_K salary_usd_max_K av_salary_usd_K
## 1 Biotech & Pharmaceuticals 212 331 271.5
## 2 Biotech & Pharmaceuticals 212 331 271.5
## Experience_Level JobTitle Skills Education
## 1 Senior data scientist Python, R, SQL, C++ Master, PhD
## 2 Mid-level data scientist Python, R, SQL, C++ PhD
## Years_Experience Company_Name State City Age Type_of_ownership
## 1 NA Roche CA Pleasanton 129 Public
## 2 NA AstraZeneca DE Wilmington 112 Public
## AverageRevenue SalaryCategory
## 1 1e+10 Very High
## 2 1e+10 Very High
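As a last quick check (a minimal sketch, assuming df_filtered as above), the two finalists can be ranked on the variables that still differ between them:
df_filtered %>%
  select(Company_Name, Rating, State, Experience_Level) %>%
  arrange(desc(Rating)) # highest-rated company first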
# Optional: save the final data to a local path (commented out; the path is
# specific to the author's machine)
#path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_cheese_data.csv"
#write.csv(df_clean, path1)
Conclusion: Mr. X can join either of the two remaining companies, Roche (rating 4.1) or AstraZeneca (rating 4.0). Apart from Experience_Level and State, all other variables are identical between the two.