knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(readr)
library(dplyr)
library(tinytex)
## Warning: package 'tinytex' was built under R version 4.4.3
## Airline Delay Data

The airline delay data comes from https://www.kaggle.com/datasets/sriharshaeedala/airline-delay and covers monthly arrival delays by carrier and airport for the years 2015 through 2023. We will explore the data, look for correlations among the delay causes, and save the tidied result at the end.
file <- 'https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/Airline_Delay_Cause%20-%20Cleaned.csv'
data <- read.csv(file)
head(data)
## year month carrier_name airport
## 1 2023 8 Endeavor Air Inc. ABE
## 2 2023 8 Endeavor Air Inc. ABY
## 3 2023 8 Endeavor Air Inc. AEX
## 4 2023 8 Endeavor Air Inc. AGS
## 5 2023 8 Endeavor Air Inc. ALB
## 6 2023 8 Endeavor Air Inc. ATL
## airport_name arr_flights
## 1 Allentown/Bethlehem/Easton, PA: Lehigh Valley International 89
## 2 Albany, GA: Southwest Georgia Regional 62
## 3 Alexandria, LA: Alexandria International 62
## 4 Augusta, GA: Augusta Regional at Bush Field 66
## 5 Albany, NY: Albany International 92
## 6 Atlanta, GA: Hartsfield-Jackson Atlanta International 1636
## arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct
## 1 13 2.25 1.60 3.16 0 5.99
## 2 10 1.97 0.04 0.57 0 7.42
## 3 10 2.73 1.18 1.80 0 4.28
## 4 12 3.69 2.27 4.47 0 1.57
## 5 22 7.76 0.00 2.96 0 11.28
## 6 256 55.98 27.81 63.64 0 108.57
## arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay
## 1 2 1 1375 71 761 118
## 2 0 1 799 218 1 62
## 3 1 0 766 56 188 78
## 4 1 1 1397 471 320 388
## 5 2 0 1530 628 0 134
## 6 32 11 29768 9339 4557 4676
## security_delay late_aircraft_delay
## 1 0 425
## 2 0 518
## 3 0 444
## 4 0 218
## 5 0 768
## 6 0 11196
summary(data)
## year month carrier_name airport
## Min. :2015 Min. : 1.000 Length:150953 Length:150953
## 1st Qu.:2017 1st Qu.: 3.000 Class :character Class :character
## Median :2019 Median : 6.000 Mode :character Mode :character
## Mean :2019 Mean : 6.336
## 3rd Qu.:2021 3rd Qu.: 9.000
## Max. :2023 Max. :12.000
##
## airport_name arr_flights arr_del15 carrier_ct
## Length:150953 Min. : 1.0 Min. : 0.00 Min. : 0.00
## Class :character 1st Qu.: 48.0 1st Qu.: 6.00 1st Qu.: 2.00
## Mode :character Median : 95.0 Median : 16.00 Median : 6.03
## Mean : 356.2 Mean : 64.04 Mean : 20.33
## 3rd Qu.: 243.0 3rd Qu.: 45.00 3rd Qu.: 16.50
## Max. :21977.0 Max. :4176.00 Max. :1293.91
## NA's :209 NA's :410 NA's :209
## weather_ct nas_ct security_ct late_aircraft_ct
## Min. : 0.000 Min. : 0.00 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.: 0.0000 1st Qu.: 1.02
## Median : 0.380 Median : 3.62 Median : 0.0000 Median : 4.67
## Mean : 2.244 Mean : 18.65 Mean : 0.1611 Mean : 22.58
## 3rd Qu.: 1.850 3rd Qu.: 11.00 3rd Qu.: 0.0000 3rd Qu.: 14.43
## Max. :266.420 Max. :1884.42 Max. :58.6900 Max. :2069.07
## NA's :209 NA's :209 NA's :209 NA's :209
## arr_cancelled arr_diverted arr_delay carrier_delay
## Min. : 0.000 Min. : 0.0000 Min. : 0 Min. : 0
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 316 1st Qu.: 103
## Median : 1.000 Median : 0.0000 Median : 980 Median : 361
## Mean : 7.502 Mean : 0.8519 Mean : 4182 Mean : 1439
## 3rd Qu.: 4.000 3rd Qu.: 1.0000 3rd Qu.: 2804 3rd Qu.: 1089
## Max. :4951.000 Max. :160.0000 Max. :438783 Max. :196944
## NA's :209 NA's :209 NA's :209 NA's :209
## weather_delay nas_delay security_delay late_aircraft_delay
## Min. : 0 Min. : 0 Min. : 0.000 Min. : 0
## 1st Qu.: 0 1st Qu.: 30 1st Qu.: 0.000 1st Qu.: 59
## Median : 17 Median : 136 Median : 0.000 Median : 302
## Mean : 227 Mean : 898 Mean : 7.619 Mean : 1609
## 3rd Qu.: 147 3rd Qu.: 451 3rd Qu.: 0.000 3rd Qu.: 1038
## Max. :31960 Max. :112018 Max. :3760.000 Max. :227959
## NA's :209 NA's :209 NA's :209 NA's :209
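Since the stated goal is to look for correlations, a quick Pearson correlation matrix of the numeric delay columns is a reasonable first pass. This is a minimal sketch using the column names shown in summary(data) above; delay_cols and cor_matrix are illustrative names.

# Correlation matrix of the delay-minute columns; rows with missing values are dropped
delay_cols <- c("arr_delay", "carrier_delay", "weather_delay",
  "nas_delay", "security_delay", "late_aircraft_delay")
cor_matrix <- data %>%
  select(all_of(delay_cols)) %>%
  cor(use = "complete.obs")
round(cor_matrix, 2)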
library(dplyr)
library(ggplot2)
# Summarize data by year, carrier_name, and airport
df_summary <- data %>%
group_by(year, carrier_name, airport) %>%
summarise(total_flights = sum(arr_flights, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'year', 'carrier_name'. You can override
## using the `.groups` argument.
# Create a bar chart
ggplot(df_summary, aes(x = airport, y = total_flights, fill = carrier_name)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ year) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Total Flights by Year, Carrier, and Airport",
x = "Airport",
y = "Total Flights",
fill = "Carrier Name")
library(dplyr)
library(ggplot2)
# Filter data for the year 2023 and airport ABE
df_2023_ABE <- data %>%
filter(year == 2023, airport == "ABE")
# Summarize data by carrier_name
df_summary_2023_ABE <- df_2023_ABE %>%
group_by(carrier_name) %>%
summarise(total_delays = sum(arr_del15, na.rm = TRUE)) %>%
ungroup()
# Create a bar chart
ggplot(df_summary_2023_ABE, aes(x = carrier_name, y = total_delays, fill = carrier_name)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Total Delays by Carrier at ABE Airport in 2023",
x = "Carrier Name",
y = "Total Delays",
fill = "Carrier Name")
library(ggplot2)
library(dplyr)
library(tidyr)
# Calculate total delays, percentage of each delay type, and frequency
df_delay_analysis <- data %>%
summarise(
total_delay = sum(arr_delay, na.rm = TRUE),
freq_carrier_delay = sum(carrier_ct, na.rm = TRUE),
freq_weather_delay = sum(weather_ct, na.rm = TRUE),
freq_nas_delay = sum(nas_ct, na.rm = TRUE),
freq_security_delay = sum(security_ct, na.rm = TRUE),
freq_late_aircraft_delay = sum(late_aircraft_ct, na.rm = TRUE),
percent_carrier_delay = sum(carrier_delay, na.rm = TRUE) / total_delay * 100,
percent_weather_delay = sum(weather_delay, na.rm = TRUE) / total_delay * 100,
percent_nas_delay = sum(nas_delay, na.rm = TRUE) / total_delay * 100,
percent_security_delay = sum(security_delay, na.rm = TRUE) / total_delay * 100,
percent_late_aircraft_delay = sum(late_aircraft_delay, na.rm = TRUE) / total_delay * 100
)
# Reshape the data for plotting
df_delay_analysis_long <- df_delay_analysis %>%
pivot_longer(cols = -total_delay, names_to = "metric", values_to = "value") %>%
separate(metric, into = c("type", "delay_type"), sep = "_", extra = "merge") %>%
pivot_wider(names_from = type, values_from = value)
# Plot total delay, percentage delays, and frequency with connected dots
ggplot(df_delay_analysis_long, aes(x = delay_type)) +
geom_bar(aes(y = percent, fill = delay_type), stat = "identity", position = "dodge") +
geom_point(aes(y = freq / max(freq) * 100), color = "red", size = 3) +
geom_line(aes(y = freq / max(freq) * 100, group = 1), color = "red") +
labs(title = paste("Total Delay:", df_delay_analysis$total_delay, "minutes"),
x = "Delay Type", y = "Percentage of Total Delay / Frequency (scaled)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
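Because the frequencies are rescaled onto the percentage axis, the red points have no axis of their own. If a second scale is wanted, ggplot2's sec_axis() can undo the rescaling for labeling; a sketch reusing df_delay_analysis_long (max_freq is an illustrative helper):

# Same plot, with the frequency scale shown on a secondary y-axis
max_freq <- max(df_delay_analysis_long$freq)
ggplot(df_delay_analysis_long, aes(x = delay_type)) +
  geom_col(aes(y = percent, fill = delay_type)) +
  geom_point(aes(y = freq / max_freq * 100), color = "red", size = 3) +
  geom_line(aes(y = freq / max_freq * 100, group = 1), color = "red") +
  scale_y_continuous(name = "Percentage of Total Delay",
                     sec.axis = sec_axis(~ . * max_freq / 100, name = "Number of Delays (count)")) +
  labs(x = "Delay Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))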
library(dplyr)
library(ggplot2)
library(tidyr)
# Transform the dataset into a long format for numeric delay columns
data_long <- data %>%
pivot_longer(cols = c(carrier_delay, weather_delay, nas_delay, security_delay, late_aircraft_delay),
names_to = "delay_type",
values_to = "delay_value")
# Print a small part of the transformed dataset
print(head(data_long))
## # A tibble: 6 × 17
## year month carrier_name airport airport_name arr_flights arr_del15 carrier_ct
## <int> <int> <chr> <chr> <chr> <int> <int> <dbl>
## 1 2023 8 Endeavor Ai… ABE Allentown/B… 89 13 2.25
## 2 2023 8 Endeavor Ai… ABE Allentown/B… 89 13 2.25
## 3 2023 8 Endeavor Ai… ABE Allentown/B… 89 13 2.25
## 4 2023 8 Endeavor Ai… ABE Allentown/B… 89 13 2.25
## 5 2023 8 Endeavor Ai… ABE Allentown/B… 89 13 2.25
## 6 2023 8 Endeavor Ai… ABY Albany, GA:… 62 10 1.97
## # ℹ 9 more variables: weather_ct <dbl>, nas_ct <dbl>, security_ct <dbl>,
## # late_aircraft_ct <dbl>, arr_cancelled <int>, arr_diverted <int>,
## # arr_delay <int>, delay_type <chr>, delay_value <int>
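As a quick sanity check on the long format, the delay minutes can now be totaled by cause with a single group_by(); a short sketch:

# Total delay minutes by cause, computed from the reshaped data
data_long %>%
  group_by(delay_type) %>%
  summarise(total_minutes = sum(delay_value, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total_minutes))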
library(ggplot2)
if(!require('DataExplorer')) {
install.packages('DataExplorer')
library('DataExplorer')
}
## Loading required package: DataExplorer
## Warning: package 'DataExplorer' was built under R version 4.4.3
plot_bar(data_long)
## 2 columns ignored with more than 50 categories.
## airport: 389 categories
## airport_name: 413 categories
path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_airline_delay.csv"
write.csv(data_long, path1)
## Cheese Data

The cheese data comes from Discussion 05. We will explore it, look for relationships among the variables, and save the result at the end.
file <- 'https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/cheeses.csv'
data <- read.csv(file)
head(data)
## cheese url milk
## 1 Aarewasser https://www.cheese.com/aarewasser/ cow
## 2 Abbaye de Belloc https://www.cheese.com/abbaye-de-belloc/ sheep
## 3 Abbaye de Belval https://www.cheese.com/abbaye-de-belval/ cow
## 4 Abbaye de Citeaux https://www.cheese.com/abbaye-de-citeaux/ cow
## 5 Abbaye de Tamié https://www.cheese.com/tamie/ cow
## 6 Abbaye de Timadeuc https://www.cheese.com/abbaye-de-timadeuc/ cow
## country region family type
## 1 Switzerland <NA> <NA> semi-soft
## 2 France Pays Basque <NA> semi-hard, artisan
## 3 France <NA> <NA> semi-hard
## 4 France Burgundy <NA> semi-soft, artisan, brined
## 5 France Savoie <NA> soft, artisan
## 6 France province of Brittany <NA> semi-hard
## fat_content calcium_content texture rind color
## 1 <NA> <NA> buttery washed yellow
## 2 <NA> <NA> creamy, dense, firm natural yellow
## 3 40-46% <NA> elastic washed ivory
## 4 <NA> <NA> creamy, dense, smooth washed white
## 5 <NA> <NA> creamy, open, smooth washed white
## 6 <NA> <NA> soft washed pale yellow
## flavor aroma vegetarian vegan
## 1 sweet buttery FALSE FALSE
## 2 burnt caramel lanoline TRUE FALSE
## 3 <NA> aromatic FALSE FALSE
## 4 acidic, milky, smooth barnyardy, earthy FALSE FALSE
## 5 fruity, nutty perfumed, pungent FALSE FALSE
## 6 salty, smooth nutty FALSE FALSE
## synonyms alt_spellings
## 1 <NA> <NA>
## 2 Abbaye Notre-Dame de Belloc <NA>
## 3 <NA> <NA>
## 4 <NA> <NA>
## 5 <NA> Tamié, Trappiste de Tamie, Abbey of Tamie
## 6 <NA> <NA>
## producers
## 1 Jumi
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 Abbaye Cistercienne NOTRE-DAME DE TIMADEUC
library(ggplot2)
library(dplyr)
# Clean and prepare the cheese data: strip "%" and " g/100g" from fat_content and convert to numeric
df_clean <- data %>%
mutate(fat_content = as.numeric(gsub(" g/100g|%", "", fat_content))) %>%
filter(!is.na(fat_content))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `fat_content = as.numeric(gsub(" g/100g|%", "", fat_content))`.
## Caused by warning:
## ! NAs introduced by coercion
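The coercion warning comes from entries such as "40-46%" that are still not plain numbers after the gsub(). If those ranges should be kept rather than dropped, one option, sketched here with the already-loaded stringr package (df_clean_alt is an illustrative name, and taking the first number of a range is an assumption), is:

# Alternative cleaning: extract the first number from values like "40-46%" or "25 g/100g"
df_clean_alt <- data %>%
  mutate(fat_content = as.numeric(str_extract(fat_content, "\\d+\\.?\\d*"))) %>%
  filter(!is.na(fat_content))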
# Plotting milk type vs fat content
ggplot(df_clean, aes(x = milk, y = fat_content, fill = milk)) +
geom_boxplot() +
labs(title = "Fat Content by Milk Type", x = "Milk Type", y = "Fat Content (%)") +
theme_minimal() +
theme(legend.position = "none")
library(ggplot2)
library(dplyr)
# Group the cheese data by color and milk, drop missing values, and count the cheeses in each group
df_color_milk_summary <- data %>%
filter(!is.na(color) & !is.na(milk)) %>%
group_by(color, milk) %>%
summarise(count = n()) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'color'. You can override using the
## `.groups` argument.
# Plotting the count of cheeses by color and milk type
ggplot(df_color_milk_summary, aes(x = reorder(color, -count), y = count, fill = milk)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Count of Cheeses by Color and Milk Type", x = "Color", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust =1))
library(ggplot2)
library(dplyr)
# Keep rows whose milk includes 'cow', 'sheep', or 'goat', group by milk, and count the cheeses
df_selected_milk_summary <- data %>%
filter(grepl("cow|sheep|goat", milk, ignore.case = TRUE) & !is.na(milk)) %>%
group_by(milk) %>%
summarise(count = n()) %>%
arrange(desc(count))
# Plotting the count of cheeses by milk type with numbers and connecting lines
ggplot(df_selected_milk_summary, aes(x = reorder(milk, -count), y = count, group = 1)) +
geom_bar(stat = "identity", aes(fill = milk), position = "dodge") +
geom_text(aes(label = count), vjust = -0.5) +
geom_line() +
geom_point() +
labs(title = "Count of Cheeses by Milk Type (Cow, Sheep, Goat)", x = "Milk Type", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
plot_bar(data)
## 12 columns ignored with more than 50 categories.
## cheese: 1187 categories
## url: 1187 categories
## country: 83 categories
## region: 350 categories
## type: 85 categories
## fat_content: 86 categories
## texture: 310 categories
## flavor: 627 categories
## aroma: 331 categories
## synonyms: 293 categories
## alt_spellings: 110 categories
## producers: 319 categories
path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_cheese_data.csv"
write.csv(data, path1)
## Data Science Jobs Data

# Uncleaned data science job postings (Uncleaned_DS_jobs.csv) from the project repository
file <- 'https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/Uncleaned_DS_jobs.csv'
data1 <- read.csv(file)
str(data1)
## 'data.frame': 672 obs. of 15 variables:
## $ index : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Job.Title : chr "Sr Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
## $ Salary.Estimate : chr "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" ...
## $ Job.Description : chr "Description\n\nThe Senior Data Scientist is responsible for defining, building, and improving statistical model"| __truncated__ "Secure our Nation, Ignite your Future\n\nJoin the top Information Technology and Analytic professionals in the "| __truncated__ "Overview\n\n\nAnalysis Group is one of the largest international economics consulting firms, with more than 1,0"| __truncated__ "JOB DESCRIPTION:\n\nDo you have a passion for Data and Machine Learning? Do you dream of working with customers"| __truncated__ ...
## $ Rating : num 3.1 4.2 3.8 3.5 2.9 4.2 3.9 3.5 4.4 3.6 ...
## $ Company.Name : chr "Healthfirst\n3.1" "ManTech\n4.2" "Analysis Group\n3.8" "INFICON\n3.5" ...
## $ Location : chr "New York, NY" "Chantilly, VA" "Boston, MA" "Newton, MA" ...
## $ Headquarters : chr "New York, NY" "Herndon, VA" "Boston, MA" "Bad Ragaz, Switzerland" ...
## $ Size : chr "1001 to 5000 employees" "5001 to 10000 employees" "1001 to 5000 employees" "501 to 1000 employees" ...
## $ Founded : int 1993 1968 1981 2000 1998 2010 1996 1990 1983 2014 ...
## $ Type.of.ownership: chr "Nonprofit Organization" "Company - Public" "Private Practice / Firm" "Company - Public" ...
## $ Industry : chr "Insurance Carriers" "Research & Development" "Consulting" "Electrical & Electronic Manufacturing" ...
## $ Sector : chr "Insurance" "Business Services" "Business Services" "Manufacturing" ...
## $ Revenue : chr "Unknown / Non-Applicable" "$1 to $2 billion (USD)" "$100 to $500 million (USD)" "$100 to $500 million (USD)" ...
## $ Competitors : chr "EmblemHealth, UnitedHealth Group, Aetna" "-1" "-1" "MKS Instruments, Pfeiffer Vacuum, Agilent Technologies" ...
# Load necessary library
library(dplyr)
# Clean every character column: drop non-ASCII characters and trim leading/trailing whitespace
data1[] <- lapply(data1, function(x) {
if (is.character(x)) {
x <- iconv(x, "latin1", "ASCII", sub = "")
x <- gsub("^\\s+|\\s+$", "", x)
}
return(x)
})
# Display the structure of the cleaned data frame
str(data1)
## 'data.frame': 672 obs. of 15 variables:
## $ index : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Job.Title : chr "Sr Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
## $ Salary.Estimate : chr "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" ...
## $ Job.Description : chr "Description\n\nThe Senior Data Scientist is responsible for defining, building, and improving statistical model"| __truncated__ "Secure our Nation, Ignite your Future\n\nJoin the top Information Technology and Analytic professionals in the "| __truncated__ "Overview\n\n\nAnalysis Group is one of the largest international economics consulting firms, with more than 1,0"| __truncated__ "JOB DESCRIPTION:\n\nDo you have a passion for Data and Machine Learning? Do you dream of working with customers"| __truncated__ ...
## $ Rating : num 3.1 4.2 3.8 3.5 2.9 4.2 3.9 3.5 4.4 3.6 ...
## $ Company.Name : chr "Healthfirst\n3.1" "ManTech\n4.2" "Analysis Group\n3.8" "INFICON\n3.5" ...
## $ Location : chr "New York, NY" "Chantilly, VA" "Boston, MA" "Newton, MA" ...
## $ Headquarters : chr "New York, NY" "Herndon, VA" "Boston, MA" "Bad Ragaz, Switzerland" ...
## $ Size : chr "1001 to 5000 employees" "5001 to 10000 employees" "1001 to 5000 employees" "501 to 1000 employees" ...
## $ Founded : int 1993 1968 1981 2000 1998 2010 1996 1990 1983 2014 ...
## $ Type.of.ownership: chr "Nonprofit Organization" "Company - Public" "Private Practice / Firm" "Company - Public" ...
## $ Industry : chr "Insurance Carriers" "Research & Development" "Consulting" "Electrical & Electronic Manufacturing" ...
## $ Sector : chr "Insurance" "Business Services" "Business Services" "Manufacturing" ...
## $ Revenue : chr "Unknown / Non-Applicable" "$1 to $2 billion (USD)" "$100 to $500 million (USD)" "$100 to $500 million (USD)" ...
## $ Competitors : chr "EmblemHealth, UnitedHealth Group, Aetna" "-1" "-1" "MKS Instruments, Pfeiffer Vacuum, Agilent Technologies" ...
# Load necessary library
library(dplyr)
# Remove the lengthy Job.Description column from data1
df_cleaned <- data1 %>%
dplyr::select(-Job.Description)
# Display the cleaned data frame
head(df_cleaned)
## index Job.Title Salary.Estimate Rating
## 1 0 Sr Data Scientist $137K-$171K (Glassdoor est.) 3.1
## 2 1 Data Scientist $137K-$171K (Glassdoor est.) 4.2
## 3 2 Data Scientist $137K-$171K (Glassdoor est.) 3.8
## 4 3 Data Scientist $137K-$171K (Glassdoor est.) 3.5
## 5 4 Data Scientist $137K-$171K (Glassdoor est.) 2.9
## 6 5 Data Scientist $137K-$171K (Glassdoor est.) 4.2
## Company.Name Location Headquarters
## 1 Healthfirst\n3.1 New York, NY New York, NY
## 2 ManTech\n4.2 Chantilly, VA Herndon, VA
## 3 Analysis Group\n3.8 Boston, MA Boston, MA
## 4 INFICON\n3.5 Newton, MA Bad Ragaz, Switzerland
## 5 Affinity Solutions\n2.9 New York, NY New York, NY
## 6 HG Insights\n4.2 Santa Barbara, CA Santa Barbara, CA
## Size Founded Type.of.ownership
## 1 1001 to 5000 employees 1993 Nonprofit Organization
## 2 5001 to 10000 employees 1968 Company - Public
## 3 1001 to 5000 employees 1981 Private Practice / Firm
## 4 501 to 1000 employees 2000 Company - Public
## 5 51 to 200 employees 1998 Company - Private
## 6 51 to 200 employees 2010 Company - Private
## Industry Sector
## 1 Insurance Carriers Insurance
## 2 Research & Development Business Services
## 3 Consulting Business Services
## 4 Electrical & Electronic Manufacturing Manufacturing
## 5 Advertising & Marketing Business Services
## 6 Computer Hardware & Software Information Technology
## Revenue
## 1 Unknown / Non-Applicable
## 2 $1 to $2 billion (USD)
## 3 $100 to $500 million (USD)
## 4 $100 to $500 million (USD)
## 5 Unknown / Non-Applicable
## 6 Unknown / Non-Applicable
## Competitors
## 1 EmblemHealth, UnitedHealth Group, Aetna
## 2 -1
## 3 -1
## 4 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 5 Commerce Signals, Cardlytics, Yodlee
## 6 -1
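Note that Company.Name still carries the company rating after an embedded newline (for example "Healthfirst\n3.1"), because the earlier trimming only removed leading and trailing whitespace. A sketch for splitting that off, stored in a separate object (df_cleaned_names is an illustrative name) so the steps below are unchanged:

# Drop the rating that trails the company name after a newline, e.g. "Healthfirst\n3.1" -> "Healthfirst"
df_cleaned_names <- df_cleaned %>%
  mutate(Company.Name = sub("\n.*$", "", Company.Name))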
df3 <- data.frame(df_cleaned)
head(df3)
## index Job.Title Salary.Estimate Rating
## 1 0 Sr Data Scientist $137K-$171K (Glassdoor est.) 3.1
## 2 1 Data Scientist $137K-$171K (Glassdoor est.) 4.2
## 3 2 Data Scientist $137K-$171K (Glassdoor est.) 3.8
## 4 3 Data Scientist $137K-$171K (Glassdoor est.) 3.5
## 5 4 Data Scientist $137K-$171K (Glassdoor est.) 2.9
## 6 5 Data Scientist $137K-$171K (Glassdoor est.) 4.2
## Company.Name Location Headquarters
## 1 Healthfirst\n3.1 New York, NY New York, NY
## 2 ManTech\n4.2 Chantilly, VA Herndon, VA
## 3 Analysis Group\n3.8 Boston, MA Boston, MA
## 4 INFICON\n3.5 Newton, MA Bad Ragaz, Switzerland
## 5 Affinity Solutions\n2.9 New York, NY New York, NY
## 6 HG Insights\n4.2 Santa Barbara, CA Santa Barbara, CA
## Size Founded Type.of.ownership
## 1 1001 to 5000 employees 1993 Nonprofit Organization
## 2 5001 to 10000 employees 1968 Company - Public
## 3 1001 to 5000 employees 1981 Private Practice / Firm
## 4 501 to 1000 employees 2000 Company - Public
## 5 51 to 200 employees 1998 Company - Private
## 6 51 to 200 employees 2010 Company - Private
## Industry Sector
## 1 Insurance Carriers Insurance
## 2 Research & Development Business Services
## 3 Consulting Business Services
## 4 Electrical & Electronic Manufacturing Manufacturing
## 5 Advertising & Marketing Business Services
## 6 Computer Hardware & Software Information Technology
## Revenue
## 1 Unknown / Non-Applicable
## 2 $1 to $2 billion (USD)
## 3 $100 to $500 million (USD)
## 4 $100 to $500 million (USD)
## 5 Unknown / Non-Applicable
## 6 Unknown / Non-Applicable
## Competitors
## 1 EmblemHealth, UnitedHealth Group, Aetna
## 2 -1
## 3 -1
## 4 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 5 Commerce Signals, Cardlytics, Yodlee
## 6 -1
top_5_job_titles <- df3 %>%
count(Job.Title) %>%
arrange(desc(n)) %>%
slice_head(n = 5) %>%
pull(Job.Title)
df_top_5 <- df3 %>%
filter(Job.Title %in% top_5_job_titles)
df4 <- data.frame(df_top_5)
head(df4)
## index Job.Title Salary.Estimate Rating
## 1 1 Data Scientist $137K-$171K (Glassdoor est.) 4.2
## 2 2 Data Scientist $137K-$171K (Glassdoor est.) 3.8
## 3 3 Data Scientist $137K-$171K (Glassdoor est.) 3.5
## 4 4 Data Scientist $137K-$171K (Glassdoor est.) 2.9
## 5 5 Data Scientist $137K-$171K (Glassdoor est.) 4.2
## 6 7 Data Scientist $137K-$171K (Glassdoor est.) 3.5
## Company.Name Location Headquarters
## 1 ManTech\n4.2 Chantilly, VA Herndon, VA
## 2 Analysis Group\n3.8 Boston, MA Boston, MA
## 3 INFICON\n3.5 Newton, MA Bad Ragaz, Switzerland
## 4 Affinity Solutions\n2.9 New York, NY New York, NY
## 5 HG Insights\n4.2 Santa Barbara, CA Santa Barbara, CA
## 6 iRobot\n3.5 Bedford, MA Bedford, MA
## Size Founded Type.of.ownership
## 1 5001 to 10000 employees 1968 Company - Public
## 2 1001 to 5000 employees 1981 Private Practice / Firm
## 3 501 to 1000 employees 2000 Company - Public
## 4 51 to 200 employees 1998 Company - Private
## 5 51 to 200 employees 2010 Company - Private
## 6 1001 to 5000 employees 1990 Company - Public
## Industry Sector
## 1 Research & Development Business Services
## 2 Consulting Business Services
## 3 Electrical & Electronic Manufacturing Manufacturing
## 4 Advertising & Marketing Business Services
## 5 Computer Hardware & Software Information Technology
## 6 Consumer Electronics & Appliances Stores Retail
## Revenue
## 1 $1 to $2 billion (USD)
## 2 $100 to $500 million (USD)
## 3 $100 to $500 million (USD)
## 4 Unknown / Non-Applicable
## 5 Unknown / Non-Applicable
## 6 $1 to $2 billion (USD)
## Competitors
## 1 -1
## 2 -1
## 3 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 4 Commerce Signals, Cardlytics, Yodlee
## 5 -1
## 6 -1
# Plotting job title against company revenue band (Salary.Estimate is parsed separately below)
ggplot(df4, aes(y = Job.Title, x = Revenue)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Job Title vs Company Revenue", x = "Company Revenue", y = "Job Title")
library(ggplot2)
# Plotting job title vs sector
ggplot(df4, aes(x = Job.Title, fill = Sector)) +
geom_bar(position = "dodge") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Job Title vs Sector", x = "Job Title", y = "Sector", fill = "Sector")
library(ggplot2)
plot_bar(df4)
## 5 columns ignored with more than 50 categories.
## Company.Name: 301 categories
## Location: 143 categories
## Headquarters: 170 categories
## Industry: 52 categories
## Competitors: 71 categories
path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_DS_jobs.csv"
write.csv(df4, path1)