Data 607 Project 02

# Set the knitr chunk option `echo = TRUE` as the default for every chunk.
knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(stringr) 
library(readxl)

## Warning: package 'readxl' was built under R version 4.4.3

library(readr)
library(dplyr)
library(tinytex)

## Warning: package 'tinytex' was built under R version 4.4.3

Data_1

## Data has been taken from: https://www.kaggle.com/datasets/sriharshaeedala/airline-delay. # The data covers the years 2015 through 2023. # We will analyze the data and try to find correlations. ## Lastly, we will save the data.

# Cleaned airline-delay CSV hosted in the project's GitHub repository.
file <- "https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/Airline_Delay_Cause%20-%20Cleaned.csv"

# Read the CSV into a data frame and preview its first rows.
data <- read.csv(file = file)
head(x = data)

##   year month      carrier_name airport
## 1 2023     8 Endeavor Air Inc.     ABE
## 2 2023     8 Endeavor Air Inc.     ABY
## 3 2023     8 Endeavor Air Inc.     AEX
## 4 2023     8 Endeavor Air Inc.     AGS
## 5 2023     8 Endeavor Air Inc.     ALB
## 6 2023     8 Endeavor Air Inc.     ATL
##                                                  airport_name arr_flights
## 1 Allentown/Bethlehem/Easton, PA: Lehigh Valley International          89
## 2                      Albany, GA: Southwest Georgia Regional          62
## 3                    Alexandria, LA: Alexandria International          62
## 4                 Augusta, GA: Augusta Regional at Bush Field          66
## 5                            Albany, NY: Albany International          92
## 6       Atlanta, GA: Hartsfield-Jackson Atlanta International        1636
##   arr_del15 carrier_ct weather_ct nas_ct security_ct late_aircraft_ct
## 1        13       2.25       1.60   3.16           0             5.99
## 2        10       1.97       0.04   0.57           0             7.42
## 3        10       2.73       1.18   1.80           0             4.28
## 4        12       3.69       2.27   4.47           0             1.57
## 5        22       7.76       0.00   2.96           0            11.28
## 6       256      55.98      27.81  63.64           0           108.57
##   arr_cancelled arr_diverted arr_delay carrier_delay weather_delay nas_delay
## 1             2            1      1375            71           761       118
## 2             0            1       799           218             1        62
## 3             1            0       766            56           188        78
## 4             1            1      1397           471           320       388
## 5             2            0      1530           628             0       134
## 6            32           11     29768          9339          4557      4676
##   security_delay late_aircraft_delay
## 1              0                 425
## 2              0                 518
## 3              0                 444
## 4              0                 218
## 5              0                 768
## 6              0               11196

# Per-column summary of the airline data (quartiles, means, NA counts).
summary(data)

##       year          month        carrier_name         airport         
##  Min.   :2015   Min.   : 1.000   Length:150953      Length:150953     
##  1st Qu.:2017   1st Qu.: 3.000   Class :character   Class :character  
##  Median :2019   Median : 6.000   Mode  :character   Mode  :character  
##  Mean   :2019   Mean   : 6.336                                        
##  3rd Qu.:2021   3rd Qu.: 9.000                                        
##  Max.   :2023   Max.   :12.000                                        
##                                                                       
##  airport_name        arr_flights        arr_del15         carrier_ct     
##  Length:150953      Min.   :    1.0   Min.   :   0.00   Min.   :   0.00  
##  Class :character   1st Qu.:   48.0   1st Qu.:   6.00   1st Qu.:   2.00  
##  Mode  :character   Median :   95.0   Median :  16.00   Median :   6.03  
##                     Mean   :  356.2   Mean   :  64.04   Mean   :  20.33  
##                     3rd Qu.:  243.0   3rd Qu.:  45.00   3rd Qu.:  16.50  
##                     Max.   :21977.0   Max.   :4176.00   Max.   :1293.91  
##                     NA's   :209       NA's   :410       NA's   :209      
##    weather_ct          nas_ct         security_ct      late_aircraft_ct 
##  Min.   :  0.000   Min.   :   0.00   Min.   : 0.0000   Min.   :   0.00  
##  1st Qu.:  0.000   1st Qu.:   1.00   1st Qu.: 0.0000   1st Qu.:   1.02  
##  Median :  0.380   Median :   3.62   Median : 0.0000   Median :   4.67  
##  Mean   :  2.244   Mean   :  18.65   Mean   : 0.1611   Mean   :  22.58  
##  3rd Qu.:  1.850   3rd Qu.:  11.00   3rd Qu.: 0.0000   3rd Qu.:  14.43  
##  Max.   :266.420   Max.   :1884.42   Max.   :58.6900   Max.   :2069.07  
##  NA's   :209       NA's   :209       NA's   :209       NA's   :209      
##  arr_cancelled       arr_diverted        arr_delay      carrier_delay   
##  Min.   :   0.000   Min.   :  0.0000   Min.   :     0   Min.   :     0  
##  1st Qu.:   0.000   1st Qu.:  0.0000   1st Qu.:   316   1st Qu.:   103  
##  Median :   1.000   Median :  0.0000   Median :   980   Median :   361  
##  Mean   :   7.502   Mean   :  0.8519   Mean   :  4182   Mean   :  1439  
##  3rd Qu.:   4.000   3rd Qu.:  1.0000   3rd Qu.:  2804   3rd Qu.:  1089  
##  Max.   :4951.000   Max.   :160.0000   Max.   :438783   Max.   :196944  
##  NA's   :209        NA's   :209        NA's   :209      NA's   :209     
##  weather_delay     nas_delay      security_delay     late_aircraft_delay
##  Min.   :    0   Min.   :     0   Min.   :   0.000   Min.   :     0     
##  1st Qu.:    0   1st Qu.:    30   1st Qu.:   0.000   1st Qu.:    59     
##  Median :   17   Median :   136   Median :   0.000   Median :   302     
##  Mean   :  227   Mean   :   898   Mean   :   7.619   Mean   :  1609     
##  3rd Qu.:  147   3rd Qu.:   451   3rd Qu.:   0.000   3rd Qu.:  1038     
##  Max.   :31960   Max.   :112018   Max.   :3760.000   Max.   :227959     
##  NA's   :209     NA's   :209      NA's   :209        NA's   :209

library(dplyr)
library(ggplot2)

# Total arriving flights for every year / carrier / airport combination.
# `.groups = "drop"` returns an ungrouped result directly, which silences the
# "summarise() has grouped output by ..." message and replaces the separate
# ungroup() step.
df_summary <- data %>%
  group_by(year, carrier_name, airport) %>%
  summarise(
    total_flights = sum(arr_flights, na.rm = TRUE),
    .groups = "drop"
  )

# Dodged bars of total flights per airport, filled by carrier, with one facet
# per year. geom_col() is the idiomatic form of geom_bar(stat = "identity").
ggplot(df_summary, aes(x = airport, y = total_flights, fill = carrier_name)) +
  geom_col(position = "dodge") +
  facet_wrap(~ year) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Total Flights by Year, Carrier, and Airport",
       x = "Airport",
       y = "Total Flights",
       fill = "Carrier Name")

library(dplyr)
library(ggplot2)

# Keep only the 2023 records for airport ABE.
df_2023_ABE <- filter(data, year == 2023, airport == "ABE")

# Sum arr_del15 per carrier for that subset.
df_summary_2023_ABE <- ungroup(
  summarise(
    group_by(df_2023_ABE, carrier_name),
    total_delays = sum(arr_del15, na.rm = TRUE)
  )
)

# One bar per carrier, filled by carrier, with vertical x-axis labels.
ggplot(
  data = df_summary_2023_ABE,
  mapping = aes(x = carrier_name, y = total_delays, fill = carrier_name)
) +
  geom_col() +
  labs(
    title = "Total Delays by Carrier at ABE Airport in 2023",
    x = "Carrier Name",
    y = "Total Delays",
    fill = "Carrier Name"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Calculate total delays and percentage of each delay type
library(ggplot2)
library(dplyr)
library(tidyr)

# One-row summary over the whole data set:
#   total_delay - sum of arr_delay
#   freq_*      - sum of each *_ct column
#   percent_*   - each *_delay column's sum as a percentage of total_delay
df_delay_analysis <- data %>%
  summarise(
    total_delay = sum(arr_delay, na.rm = TRUE),
    freq_carrier_delay = sum(carrier_ct, na.rm = TRUE),
    freq_weather_delay = sum(weather_ct, na.rm = TRUE),
    freq_nas_delay = sum(nas_ct, na.rm = TRUE),
    freq_security_delay = sum(security_ct, na.rm = TRUE),
    freq_late_aircraft_delay = sum(late_aircraft_ct, na.rm = TRUE),
    percent_carrier_delay = sum(carrier_delay, na.rm = TRUE) / total_delay * 100,
    percent_weather_delay = sum(weather_delay, na.rm = TRUE) / total_delay * 100,
    percent_nas_delay = sum(nas_delay, na.rm = TRUE) / total_delay * 100,
    percent_security_delay = sum(security_delay, na.rm = TRUE) / total_delay * 100,
    percent_late_aircraft_delay = sum(late_aircraft_delay, na.rm = TRUE) / total_delay * 100
  )

# Reshape to one row per delay type with `freq` and `percent` columns.
# The metric name is split at its first underscore: "freq_carrier_delay"
# becomes type = "freq", delay_type = "carrier_delay".
df_delay_analysis_long <- df_delay_analysis %>%
  pivot_longer(cols = -total_delay, names_to = "metric", values_to = "value") %>%
  separate(metric, into = c("type", "delay_type"), sep = "_", extra = "merge") %>%
  pivot_wider(names_from = type, values_from = value)

# Bars show each delay type's percentage of total delay. The red points and
# line show frequency rescaled so the most frequent type sits at 100.
# max(freq) is evaluated inside aes() against the plot data, which avoids the
# discouraged `df_delay_analysis_long$freq` reference and its warning.
ggplot(df_delay_analysis_long, aes(x = delay_type)) +
  geom_col(aes(y = percent, fill = delay_type), position = "dodge") +
  geom_point(aes(y = freq / max(freq) * 100), color = "red", size = 3) +
  geom_line(aes(y = freq / max(freq) * 100, group = 1), color = "red") +
  labs(title = paste("Total Delay:", df_delay_analysis$total_delay, "minutes"),
       x = "Delay Type", y = "Percentage of Total Delay / Frequency (scaled)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Use of `df_delay_analysis_long$freq` is discouraged.
## ℹ Use `freq` instead.
## Use of `df_delay_analysis_long$freq` is discouraged.
## ℹ Use `freq` instead.

Long Data

library(dplyr)
library(ggplot2)

# Load tidyr with library() rather than pacman::p_load(): pacman is not used
# anywhere else in this file and may not be installed.
library(tidyr)

# Transform the dataset into a long format: the five *_delay columns are
# stacked into a `delay_type` / `delay_value` pair, giving five rows per
# original record.
data_long <- data %>%
  pivot_longer(cols = c(carrier_delay, weather_delay, nas_delay, security_delay, late_aircraft_delay),
               names_to = "delay_type",
               values_to = "delay_value")

# Print a small part of the transformed dataset
print(head(data_long))

## # A tibble: 6 × 17
##    year month carrier_name airport airport_name arr_flights arr_del15 carrier_ct
##   <int> <int> <chr>        <chr>   <chr>              <int>     <int>      <dbl>
## 1  2023     8 Endeavor Ai… ABE     Allentown/B…          89        13       2.25
## 2  2023     8 Endeavor Ai… ABE     Allentown/B…          89        13       2.25
## 3  2023     8 Endeavor Ai… ABE     Allentown/B…          89        13       2.25
## 4  2023     8 Endeavor Ai… ABE     Allentown/B…          89        13       2.25
## 5  2023     8 Endeavor Ai… ABE     Allentown/B…          89        13       2.25
## 6  2023     8 Endeavor Ai… ABY     Albany, GA:…          62        10       1.97
## # ℹ 9 more variables: weather_ct <dbl>, nas_ct <dbl>, security_ct <dbl>,
## #   late_aircraft_ct <dbl>, arr_cancelled <int>, arr_diverted <int>,
## #   arr_delay <int>, delay_type <chr>, delay_value <int>

library(ggplot2)

# Install DataExplorer only when it is missing, then attach it with library().
# requireNamespace() checks availability without attaching, and library()
# raises an error if the install failed instead of silently returning FALSE
# the way require() does.
if (!requireNamespace("DataExplorer", quietly = TRUE)) {
  install.packages("DataExplorer")
}
library(DataExplorer)

## Loading required package: DataExplorer

## Warning: package 'DataExplorer' was built under R version 4.4.3

# DataExplorer bar charts for the long-format airline data.
# NOTE(review): data_long repeats each original record once per delay type,
# so the plotted counts are presumably inflated relative to the source table;
# confirm this is intended.
plot_bar(data_long)

## 2 columns ignored with more than 50 categories.
## airport: 389 categories
## airport_name: 413 categories

# Save the long-format airline data as CSV.
# NOTE(review): path1 is an absolute path on the author's machine; change it
# before running this script elsewhere.
path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_airline_delay.csv"
# row.names = FALSE stops write.csv() from adding an unnamed row-number column.
write.csv(data_long, path1, row.names = FALSE)

Data_2

## Cheese data has been taken from: Discussion 05. # We will analyze the data and try to find correlations. ## Lastly, we will save the data.

# Cheese CSV hosted in the project's GitHub repository.
file <- "https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/cheeses.csv"

# Read the CSV into `data` (replacing the airline data) and preview it.
data <- read.csv(file = file)
head(x = data)

##               cheese                                        url  milk
## 1         Aarewasser         https://www.cheese.com/aarewasser/   cow
## 2   Abbaye de Belloc   https://www.cheese.com/abbaye-de-belloc/ sheep
## 3   Abbaye de Belval   https://www.cheese.com/abbaye-de-belval/   cow
## 4  Abbaye de Citeaux  https://www.cheese.com/abbaye-de-citeaux/   cow
## 5    Abbaye de Tamié              https://www.cheese.com/tamie/   cow
## 6 Abbaye de Timadeuc https://www.cheese.com/abbaye-de-timadeuc/   cow
##       country               region family                       type
## 1 Switzerland                 <NA>   <NA>                  semi-soft
## 2      France          Pays Basque   <NA>         semi-hard, artisan
## 3      France                 <NA>   <NA>                  semi-hard
## 4      France             Burgundy   <NA> semi-soft, artisan, brined
## 5      France               Savoie   <NA>              soft, artisan
## 6      France province of Brittany   <NA>                  semi-hard
##   fat_content calcium_content               texture    rind       color
## 1        <NA>            <NA>               buttery  washed      yellow
## 2        <NA>            <NA>   creamy, dense, firm natural      yellow
## 3      40-46%            <NA>               elastic  washed       ivory
## 4        <NA>            <NA> creamy, dense, smooth  washed       white
## 5        <NA>            <NA>  creamy, open, smooth  washed       white
## 6        <NA>            <NA>                  soft  washed pale yellow
##                  flavor             aroma vegetarian vegan
## 1                 sweet           buttery      FALSE FALSE
## 2         burnt caramel          lanoline       TRUE FALSE
## 3                  <NA>          aromatic      FALSE FALSE
## 4 acidic, milky, smooth barnyardy, earthy      FALSE FALSE
## 5         fruity, nutty perfumed, pungent      FALSE FALSE
## 6         salty, smooth             nutty      FALSE FALSE
##                      synonyms                             alt_spellings
## 1                        <NA>                                      <NA>
## 2 Abbaye Notre-Dame de Belloc                                      <NA>
## 3                        <NA>                                      <NA>
## 4                        <NA>                                      <NA>
## 5                        <NA> Tamié, Trappiste de Tamie, Abbey of Tamie
## 6                        <NA>                                      <NA>
##                                    producers
## 1                                       Jumi
## 2                                       <NA>
## 3                                       <NA>
## 4                                       <NA>
## 5                                       <NA>
## 6 Abbaye Cistercienne NOTRE-DAME DE TIMADEUC

library(ggplot2)
library(dplyr)

# Uses the cheese data frame `data` loaded above.

# Convert fat-content strings to numeric values.
#
# The unit markers " g/100g" and "%" are stripped. A range such as "40-46%"
# becomes the midpoint of its bounds (43) rather than being coerced to NA and
# dropped. Any entry that is still not numeric yields NA.
#
# x: character vector of raw fat-content values (may contain NA).
# Returns a numeric vector the same length as x.
parse_fat_content <- function(x) {
  stripped <- gsub(" g/100g|%", "", x)
  bounds <- strsplit(stripped, "-", fixed = TRUE)
  vapply(
    bounds,
    function(parts) {
      # Non-numeric pieces are meant to become NA, so the coercion warning
      # is suppressed for this single call only.
      values <- suppressWarnings(as.numeric(trimws(parts)))
      if (length(values) == 0L || anyNA(values)) NA_real_ else mean(values)
    },
    numeric(1)
  )
}

# Clean and prepare the data: numeric fat content, rows without it removed.
df_clean <- data %>%
  mutate(fat_content = parse_fat_content(fat_content)) %>%
  filter(!is.na(fat_content))

# Plotting milk type vs fat content
ggplot(df_clean, aes(x = milk, y = fat_content, fill = milk)) +
  geom_boxplot() +
  labs(title = "Fat Content by Milk Type", x = "Milk Type", y = "Fat Content (%)") +
  theme_minimal() +
  theme(legend.position = "none")

library(ggplot2)
library(dplyr)

# Uses the cheese data frame `data` loaded above.

# Count cheeses per color / milk combination, skipping rows where either is
# missing, and list the largest combinations first.
# `.groups = "drop"` silences the "summarise() has grouped output by ..."
# message and keeps the result from staying grouped by color.
df_color_milk_summary <- data %>%
  filter(!is.na(color) & !is.na(milk)) %>%
  group_by(color, milk) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))

# Plotting the count of cheeses by color and milk type
ggplot(df_color_milk_summary, aes(x = reorder(color, -count), y = count, fill = milk)) +
  geom_col(position = "dodge") +
  labs(title = "Count of Cheeses by Color and Milk Type", x = "Color", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)
library(dplyr)

# Count cheeses per distinct `milk` value, keeping only rows whose milk field
# is present and mentions cow, sheep, or goat (case-insensitive). The result
# is ordered from most to least common.
df_selected_milk_summary <- data %>%
  filter(
    !is.na(milk),
    grepl("cow|sheep|goat", milk, ignore.case = TRUE)
  ) %>%
  group_by(milk) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Bars ordered by descending count, each labelled with its count, with a
# line and points connecting the bar heights.
ggplot(
  data = df_selected_milk_summary,
  mapping = aes(x = reorder(milk, -count), y = count, group = 1)
) +
  geom_col(aes(fill = milk), position = "dodge") +
  geom_text(aes(label = count), vjust = -0.5) +
  geom_line() +
  geom_point() +
  labs(
    title = "Count of Cheeses by Milk Type (Cow, Sheep, Goat)",
    x = "Milk Type",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# DataExplorer bar charts for the cheese data.
plot_bar(data)

## 12 columns ignored with more than 50 categories.
## cheese: 1187 categories
## url: 1187 categories
## country: 83 categories
## region: 350 categories
## type: 85 categories
## fat_content: 86 categories
## texture: 310 categories
## flavor: 627 categories
## aroma: 331 categories
## synonyms: 293 categories
## alt_spellings: 110 categories
## producers: 319 categories

# Save the cheese data as CSV.
# NOTE(review): path1 is an absolute path on the author's machine; change it
# before running this script elsewhere.
path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_cheese_data.csv"
# row.names = FALSE stops write.csv() from adding an unnamed row-number column.
write.csv(data, path1, row.names = FALSE)

# Uncleaned data-science job postings CSV from the project's GitHub repository.
file <- "https://raw.githubusercontent.com/tanzil64/Data-607-Project-02/refs/heads/main/Uncleaned_DS_jobs.csv"

# Read the CSV into data1 and show its column types.
data1 <- read.csv(file = file)
str(object = data1)

## 'data.frame':    672 obs. of  15 variables:
##  $ index            : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Job.Title        : chr  "Sr Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
##  $ Salary.Estimate  : chr  "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" ...
##  $ Job.Description  : chr  "Description\n\nThe Senior Data Scientist is responsible for defining, building, and improving statistical model"| __truncated__ "Secure our Nation, Ignite your Future\n\nJoin the top Information Technology and Analytic professionals in the "| __truncated__ "Overview\n\n\nAnalysis Group is one of the largest international economics consulting firms, with more than 1,0"| __truncated__ "JOB DESCRIPTION:\n\nDo you have a passion for Data and Machine Learning? Do you dream of working with customers"| __truncated__ ...
##  $ Rating           : num  3.1 4.2 3.8 3.5 2.9 4.2 3.9 3.5 4.4 3.6 ...
##  $ Company.Name     : chr  "Healthfirst\n3.1" "ManTech\n4.2" "Analysis Group\n3.8" "INFICON\n3.5" ...
##  $ Location         : chr  "New York, NY" "Chantilly, VA" "Boston, MA" "Newton, MA" ...
##  $ Headquarters     : chr  "New York, NY" "Herndon, VA" "Boston, MA" "Bad Ragaz, Switzerland" ...
##  $ Size             : chr  "1001 to 5000 employees" "5001 to 10000 employees" "1001 to 5000 employees" "501 to 1000 employees" ...
##  $ Founded          : int  1993 1968 1981 2000 1998 2010 1996 1990 1983 2014 ...
##  $ Type.of.ownership: chr  "Nonprofit Organization" "Company - Public" "Private Practice / Firm" "Company - Public" ...
##  $ Industry         : chr  "Insurance Carriers" "Research & Development" "Consulting" "Electrical & Electronic Manufacturing" ...
##  $ Sector           : chr  "Insurance" "Business Services" "Business Services" "Manufacturing" ...
##  $ Revenue          : chr  "Unknown / Non-Applicable" "$1 to $2 billion (USD)" "$100 to $500 million (USD)" "$100 to $500 million (USD)" ...
##  $ Competitors      : chr  "EmblemHealth, UnitedHealth Group, Aetna" "-1" "-1" "MKS Instruments, Pfeiffer Vacuum, Agilent Technologies" ...

# Load necessary library
library(dplyr)

# Clean every character column of data1 in place: convert latin1 to ASCII,
# dropping characters that cannot be represented, then trim leading and
# trailing whitespace. Non-character columns pass through untouched.
data1[] <- lapply(data1, function(column) {
  if (!is.character(column)) {
    return(column)
  }
  ascii_only <- iconv(column, "latin1", "ASCII", sub = "")
  gsub("^\\s+|\\s+$", "", ascii_only)
})

# Show the structure of the cleaned data frame
str(data1)

## 'data.frame':    672 obs. of  15 variables:
##  $ index            : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Job.Title        : chr  "Sr Data Scientist" "Data Scientist" "Data Scientist" "Data Scientist" ...
##  $ Salary.Estimate  : chr  "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" "$137K-$171K (Glassdoor est.)" ...
##  $ Job.Description  : chr  "Description\n\nThe Senior Data Scientist is responsible for defining, building, and improving statistical model"| __truncated__ "Secure our Nation, Ignite your Future\n\nJoin the top Information Technology and Analytic professionals in the "| __truncated__ "Overview\n\n\nAnalysis Group is one of the largest international economics consulting firms, with more than 1,0"| __truncated__ "JOB DESCRIPTION:\n\nDo you have a passion for Data and Machine Learning? Do you dream of working with customers"| __truncated__ ...
##  $ Rating           : num  3.1 4.2 3.8 3.5 2.9 4.2 3.9 3.5 4.4 3.6 ...
##  $ Company.Name     : chr  "Healthfirst\n3.1" "ManTech\n4.2" "Analysis Group\n3.8" "INFICON\n3.5" ...
##  $ Location         : chr  "New York, NY" "Chantilly, VA" "Boston, MA" "Newton, MA" ...
##  $ Headquarters     : chr  "New York, NY" "Herndon, VA" "Boston, MA" "Bad Ragaz, Switzerland" ...
##  $ Size             : chr  "1001 to 5000 employees" "5001 to 10000 employees" "1001 to 5000 employees" "501 to 1000 employees" ...
##  $ Founded          : int  1993 1968 1981 2000 1998 2010 1996 1990 1983 2014 ...
##  $ Type.of.ownership: chr  "Nonprofit Organization" "Company - Public" "Private Practice / Firm" "Company - Public" ...
##  $ Industry         : chr  "Insurance Carriers" "Research & Development" "Consulting" "Electrical & Electronic Manufacturing" ...
##  $ Sector           : chr  "Insurance" "Business Services" "Business Services" "Manufacturing" ...
##  $ Revenue          : chr  "Unknown / Non-Applicable" "$1 to $2 billion (USD)" "$100 to $500 million (USD)" "$100 to $500 million (USD)" ...
##  $ Competitors      : chr  "EmblemHealth, UnitedHealth Group, Aetna" "-1" "-1" "MKS Instruments, Pfeiffer Vacuum, Agilent Technologies" ...

# Load necessary library
library(dplyr)

# Drop the Job.Description column from data1; every other column is kept.
df_cleaned <- dplyr::select(data1, -Job.Description)

# Preview the first rows of the trimmed data frame.
head(df_cleaned)

##   index         Job.Title              Salary.Estimate Rating
## 1     0 Sr Data Scientist $137K-$171K (Glassdoor est.)    3.1
## 2     1    Data Scientist $137K-$171K (Glassdoor est.)    4.2
## 3     2    Data Scientist $137K-$171K (Glassdoor est.)    3.8
## 4     3    Data Scientist $137K-$171K (Glassdoor est.)    3.5
## 5     4    Data Scientist $137K-$171K (Glassdoor est.)    2.9
## 6     5    Data Scientist $137K-$171K (Glassdoor est.)    4.2
##              Company.Name          Location           Headquarters
## 1        Healthfirst\n3.1      New York, NY           New York, NY
## 2            ManTech\n4.2     Chantilly, VA            Herndon, VA
## 3     Analysis Group\n3.8        Boston, MA             Boston, MA
## 4            INFICON\n3.5        Newton, MA Bad Ragaz, Switzerland
## 5 Affinity Solutions\n2.9      New York, NY           New York, NY
## 6        HG Insights\n4.2 Santa Barbara, CA      Santa Barbara, CA
##                      Size Founded       Type.of.ownership
## 1  1001 to 5000 employees    1993  Nonprofit Organization
## 2 5001 to 10000 employees    1968        Company - Public
## 3  1001 to 5000 employees    1981 Private Practice / Firm
## 4   501 to 1000 employees    2000        Company - Public
## 5     51 to 200 employees    1998       Company - Private
## 6     51 to 200 employees    2010       Company - Private
##                                Industry                 Sector
## 1                    Insurance Carriers              Insurance
## 2                Research & Development      Business Services
## 3                            Consulting      Business Services
## 4 Electrical & Electronic Manufacturing          Manufacturing
## 5               Advertising & Marketing      Business Services
## 6          Computer Hardware & Software Information Technology
##                      Revenue
## 1   Unknown / Non-Applicable
## 2     $1 to $2 billion (USD)
## 3 $100 to $500 million (USD)
## 4 $100 to $500 million (USD)
## 5   Unknown / Non-Applicable
## 6   Unknown / Non-Applicable
##                                              Competitors
## 1                EmblemHealth, UnitedHealth Group, Aetna
## 2                                                     -1
## 3                                                     -1
## 4 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 5                   Commerce Signals, Cardlytics, Yodlee
## 6                                                     -1

# Copy df_cleaned into df3 via data.frame() and preview its first rows.
df3 <-data.frame(df_cleaned)
head(df3)

##   index         Job.Title              Salary.Estimate Rating
## 1     0 Sr Data Scientist $137K-$171K (Glassdoor est.)    3.1
## 2     1    Data Scientist $137K-$171K (Glassdoor est.)    4.2
## 3     2    Data Scientist $137K-$171K (Glassdoor est.)    3.8
## 4     3    Data Scientist $137K-$171K (Glassdoor est.)    3.5
## 5     4    Data Scientist $137K-$171K (Glassdoor est.)    2.9
## 6     5    Data Scientist $137K-$171K (Glassdoor est.)    4.2
##              Company.Name          Location           Headquarters
## 1        Healthfirst\n3.1      New York, NY           New York, NY
## 2            ManTech\n4.2     Chantilly, VA            Herndon, VA
## 3     Analysis Group\n3.8        Boston, MA             Boston, MA
## 4            INFICON\n3.5        Newton, MA Bad Ragaz, Switzerland
## 5 Affinity Solutions\n2.9      New York, NY           New York, NY
## 6        HG Insights\n4.2 Santa Barbara, CA      Santa Barbara, CA
##                      Size Founded       Type.of.ownership
## 1  1001 to 5000 employees    1993  Nonprofit Organization
## 2 5001 to 10000 employees    1968        Company - Public
## 3  1001 to 5000 employees    1981 Private Practice / Firm
## 4   501 to 1000 employees    2000        Company - Public
## 5     51 to 200 employees    1998       Company - Private
## 6     51 to 200 employees    2010       Company - Private
##                                Industry                 Sector
## 1                    Insurance Carriers              Insurance
## 2                Research & Development      Business Services
## 3                            Consulting      Business Services
## 4 Electrical & Electronic Manufacturing          Manufacturing
## 5               Advertising & Marketing      Business Services
## 6          Computer Hardware & Software Information Technology
##                      Revenue
## 1   Unknown / Non-Applicable
## 2     $1 to $2 billion (USD)
## 3 $100 to $500 million (USD)
## 4 $100 to $500 million (USD)
## 5   Unknown / Non-Applicable
## 6   Unknown / Non-Applicable
##                                              Competitors
## 1                EmblemHealth, UnitedHealth Group, Aetna
## 2                                                     -1
## 3                                                     -1
## 4 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 5                   Commerce Signals, Cardlytics, Yodlee
## 6                                                     -1

# The five most frequent job titles in df3.
top_5_job_titles <- df3 %>%
  count(Job.Title, sort = TRUE) %>%
  slice_head(n = 5) %>%
  pull(Job.Title)

# Keep only the postings whose title is one of those five.
df_top_5 <- filter(df3, Job.Title %in% top_5_job_titles)

# Copy into df4 via data.frame() and preview it.
df4 <- data.frame(df_top_5)

head(df4)

##   index      Job.Title              Salary.Estimate Rating
## 1     1 Data Scientist $137K-$171K (Glassdoor est.)    4.2
## 2     2 Data Scientist $137K-$171K (Glassdoor est.)    3.8
## 3     3 Data Scientist $137K-$171K (Glassdoor est.)    3.5
## 4     4 Data Scientist $137K-$171K (Glassdoor est.)    2.9
## 5     5 Data Scientist $137K-$171K (Glassdoor est.)    4.2
## 6     7 Data Scientist $137K-$171K (Glassdoor est.)    3.5
##              Company.Name          Location           Headquarters
## 1            ManTech\n4.2     Chantilly, VA            Herndon, VA
## 2     Analysis Group\n3.8        Boston, MA             Boston, MA
## 3            INFICON\n3.5        Newton, MA Bad Ragaz, Switzerland
## 4 Affinity Solutions\n2.9      New York, NY           New York, NY
## 5        HG Insights\n4.2 Santa Barbara, CA      Santa Barbara, CA
## 6             iRobot\n3.5       Bedford, MA            Bedford, MA
##                      Size Founded       Type.of.ownership
## 1 5001 to 10000 employees    1968        Company - Public
## 2  1001 to 5000 employees    1981 Private Practice / Firm
## 3   501 to 1000 employees    2000        Company - Public
## 4     51 to 200 employees    1998       Company - Private
## 5     51 to 200 employees    2010       Company - Private
## 6  1001 to 5000 employees    1990        Company - Public
##                                   Industry                 Sector
## 1                   Research & Development      Business Services
## 2                               Consulting      Business Services
## 3    Electrical & Electronic Manufacturing          Manufacturing
## 4                  Advertising & Marketing      Business Services
## 5             Computer Hardware & Software Information Technology
## 6 Consumer Electronics & Appliances Stores                 Retail
##                      Revenue
## 1     $1 to $2 billion (USD)
## 2 $100 to $500 million (USD)
## 3 $100 to $500 million (USD)
## 4   Unknown / Non-Applicable
## 5   Unknown / Non-Applicable
## 6     $1 to $2 billion (USD)
##                                              Competitors
## 1                                                     -1
## 2                                                     -1
## 3 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 4                   Commerce Signals, Cardlytics, Yodlee
## 5                                                     -1
## 6                                                     -1

# Plotting job title vs average salary
#
# The original chart mapped Revenue (a categorical company-revenue band) with
# swapped axis labels, so it never showed salary. Salary.Estimate strings look
# like "$137K-$171K (Glassdoor est.)"; the numbers are extracted and averaged
# to get one salary estimate per posting, in thousands of USD.
salary_bounds <- regmatches(
  df4$Salary.Estimate,
  gregexpr("[0-9]+", df4$Salary.Estimate)
)
avg_salary_k <- vapply(
  salary_bounds,
  function(bounds) {
    # Entries with no digits have no usable estimate.
    if (length(bounds) == 0L) NA_real_ else mean(as.numeric(bounds))
  },
  numeric(1)
)

# The salary column is added to a plotting copy so df4 itself stays unchanged.
ggplot(
  data.frame(df4, avg_salary_k = avg_salary_k),
  aes(x = Job.Title, y = avg_salary_k)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Job Title vs Salary Estimate", x = "Job Title",
       y = "Average Salary Estimate (thousand USD)")

library(ggplot2)

# Plotting job title vs sector: dodged bars counting the postings for each
# job title, one bar per sector. The y-axis shows a count, so it is labelled
# as such rather than "Sector".
ggplot(df4, aes(x = Job.Title, fill = Sector)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Job Title vs Sector", x = "Job Title", y = "Number of Postings", fill = "Sector")

# Preview the first rows of df4 (postings with the five most common titles).
head(df4)

##   index      Job.Title              Salary.Estimate Rating
## 1     1 Data Scientist $137K-$171K (Glassdoor est.)    4.2
## 2     2 Data Scientist $137K-$171K (Glassdoor est.)    3.8
## 3     3 Data Scientist $137K-$171K (Glassdoor est.)    3.5
## 4     4 Data Scientist $137K-$171K (Glassdoor est.)    2.9
## 5     5 Data Scientist $137K-$171K (Glassdoor est.)    4.2
## 6     7 Data Scientist $137K-$171K (Glassdoor est.)    3.5
##              Company.Name          Location           Headquarters
## 1            ManTech\n4.2     Chantilly, VA            Herndon, VA
## 2     Analysis Group\n3.8        Boston, MA             Boston, MA
## 3            INFICON\n3.5        Newton, MA Bad Ragaz, Switzerland
## 4 Affinity Solutions\n2.9      New York, NY           New York, NY
## 5        HG Insights\n4.2 Santa Barbara, CA      Santa Barbara, CA
## 6             iRobot\n3.5       Bedford, MA            Bedford, MA
##                      Size Founded       Type.of.ownership
## 1 5001 to 10000 employees    1968        Company - Public
## 2  1001 to 5000 employees    1981 Private Practice / Firm
## 3   501 to 1000 employees    2000        Company - Public
## 4     51 to 200 employees    1998       Company - Private
## 5     51 to 200 employees    2010       Company - Private
## 6  1001 to 5000 employees    1990        Company - Public
##                                   Industry                 Sector
## 1                   Research & Development      Business Services
## 2                               Consulting      Business Services
## 3    Electrical & Electronic Manufacturing          Manufacturing
## 4                  Advertising & Marketing      Business Services
## 5             Computer Hardware & Software Information Technology
## 6 Consumer Electronics & Appliances Stores                 Retail
##                      Revenue
## 1     $1 to $2 billion (USD)
## 2 $100 to $500 million (USD)
## 3 $100 to $500 million (USD)
## 4   Unknown / Non-Applicable
## 5   Unknown / Non-Applicable
## 6     $1 to $2 billion (USD)
##                                              Competitors
## 1                                                     -1
## 2                                                     -1
## 3 MKS Instruments, Pfeiffer Vacuum, Agilent Technologies
## 4                   Commerce Signals, Cardlytics, Yodlee
## 5                                                     -1
## 6                                                     -1

library(ggplot2)
# DataExplorer bar charts for the filtered job-postings data.
plot_bar(df4)

## 5 columns ignored with more than 50 categories.
## Company.Name: 301 categories
## Location: 143 categories
## Headquarters: 170 categories
## Industry: 52 categories
## Competitors: 71 categories

# Save the filtered job-postings data as CSV.
# NOTE(review): path1 is an absolute path on the author's machine; change it
# before running this script elsewhere.
path1 <- "C:/Users/tanzi/OneDrive/DATA/607/week6/tanzil_DS_jobs.csv"
# row.names = FALSE stops write.csv() from adding an unnamed row-number column.
write.csv(df4, path1, row.names = FALSE)

Data 607 Project 02

Md. Tanzil Ehsan

2025-03-10

Data_1

Long Data

Data_2